Subversion Repositories: KolibriOS

Compare Revisions: Rev 6146 → Rev 6147

/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/Makefile
0,0 → 1,30
include $(SUBDIR)../config.mak
 
NAME = swscale
FFLIBS = avutil
 
HEADERS = swscale.h \
version.h \
 
OBJS = alphablend.o \
hscale_fast_bilinear.o \
input.o \
options.o \
output.o \
rgb2rgb.o \
swscale.o \
swscale_unscaled.o \
utils.o \
yuv2rgb.o \
slice.o \
hscale.o \
vscale.o \
gamma.o \
 
OBJS-$(CONFIG_SHARED) += log2_tab.o
 
# Windows resource file
SLIBOBJS-$(HAVE_GNU_WINDRES) += swscaleres.o
 
TESTPROGS = colorspace \
swscale \
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/alphablend.c
0,0 → 1,169
/*
* Copyright (C) 2015 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "swscale_internal.h"
 
int ff_sws_alphablendaway(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
int nb_components = desc->nb_components;
int plane, x, y;
int plane_count = isGray(c->srcFormat) ? 1 : 3;
int sixteen_bits = desc->comp[0].depth_minus1 >= 8;
unsigned off = 1<<desc->comp[0].depth_minus1;
unsigned shift = desc->comp[0].depth_minus1 + 1;
unsigned max = (1<<shift) - 1;
int target_table[2][3];
 
for (plane = 0; plane < plane_count; plane++) {
int a = 0, b = 0;
if (c->alphablend == SWS_ALPHA_BLEND_CHECKERBOARD) {
a = (1<<desc->comp[0].depth_minus1)/2;
b = 3*(1<<desc->comp[0].depth_minus1)/2;
}
target_table[0][plane] = plane && !(desc->flags & AV_PIX_FMT_FLAG_RGB) ? 1<<desc->comp[0].depth_minus1 : a;
target_table[1][plane] = plane && !(desc->flags & AV_PIX_FMT_FLAG_RGB) ? 1<<desc->comp[0].depth_minus1 : b;
}
 
av_assert0(plane_count == nb_components - 1);
if (desc->flags & AV_PIX_FMT_FLAG_PLANAR) {
for (plane = 0; plane < plane_count; plane++) {
int w = plane ? c->chrSrcW : c->srcW;
int x_subsample = plane ? desc->log2_chroma_w: 0;
int y_subsample = plane ? desc->log2_chroma_h: 0;
for (y = srcSliceY >> y_subsample; y < FF_CEIL_RSHIFT(srcSliceH, y_subsample); y++) {
if (x_subsample || y_subsample) {
int alpha;
unsigned u;
if (sixteen_bits) {
ptrdiff_t alpha_step = srcStride[plane_count] >> 1;
const uint16_t *s = (const uint16_t *)(src[plane ] + srcStride[plane ] * y);
const uint16_t *a = (const uint16_t *)(src[plane_count] + (srcStride[plane_count] * y << y_subsample));
uint16_t *d = ( uint16_t *)(dst[plane ] + dstStride[plane ] * y);
if ((!isBE(c->srcFormat)) == !HAVE_BIGENDIAN) {
for (x = 0; x < w; x++) {
if (y_subsample) {
alpha = (a[2*x] + a[2*x + 1] + 2 +
a[2*x + alpha_step] + a[2*x + alpha_step + 1]) >> 2;
} else
alpha = (a[2*x] + a[2*x + 1]) >> 1;
u = s[x]*alpha + target_table[((x^y)>>5)&1][plane]*(max-alpha) + off;
d[x] = av_clip((u + (u >> shift)) >> shift, 0, max);
}
} else {
for (x = 0; x < w; x++) {
if (y_subsample) {
alpha = (av_bswap16(a[2*x]) + av_bswap16(a[2*x + 1]) + 2 +
av_bswap16(a[2*x + alpha_step]) + av_bswap16(a[2*x + alpha_step + 1])) >> 2;
} else
alpha = (av_bswap16(a[2*x]) + av_bswap16(a[2*x + 1])) >> 1;
u = av_bswap16(s[x])*alpha + target_table[((x^y)>>5)&1][plane]*(max-alpha) + off;
d[x] = av_clip((u + (u >> shift)) >> shift, 0, max);
}
}
} else {
ptrdiff_t alpha_step = srcStride[plane_count];
const uint8_t *s = src[plane ] + srcStride[plane] * y;
const uint8_t *a = src[plane_count] + (srcStride[plane_count] * y << y_subsample);
uint8_t *d = dst[plane ] + dstStride[plane] * y;
for (x = 0; x < w; x++) {
if (y_subsample) {
alpha = (a[2*x] + a[2*x + 1] + 2 +
a[2*x + alpha_step] + a[2*x + alpha_step + 1]) >> 2;
} else
alpha = (a[2*x] + a[2*x + 1]) >> 1;
u = s[x]*alpha + target_table[((x^y)>>5)&1][plane]*(255-alpha) + 128;
d[x] = (257*u) >> 16;
}
}
} else {
if (sixteen_bits) {
const uint16_t *s = (const uint16_t *)(src[plane ] + srcStride[plane ] * y);
const uint16_t *a = (const uint16_t *)(src[plane_count] + srcStride[plane_count] * y);
uint16_t *d = ( uint16_t *)(dst[plane ] + dstStride[plane ] * y);
if ((!isBE(c->srcFormat)) == !HAVE_BIGENDIAN) {
for (x = 0; x < w; x++) {
unsigned u = s[x]*a[x] + target_table[((x^y)>>5)&1][plane]*(max-a[x]) + off;
d[x] = av_clip((u + (u >> shift)) >> shift, 0, max);
}
} else {
for (x = 0; x < w; x++) {
unsigned aswap = av_bswap16(a[x]);
unsigned u = av_bswap16(s[x])*aswap + target_table[((x^y)>>5)&1][plane]*(max-aswap) + off;
d[x] = av_clip((u + (u >> shift)) >> shift, 0, max);
}
}
} else {
const uint8_t *s = src[plane ] + srcStride[plane] * y;
const uint8_t *a = src[plane_count] + srcStride[plane_count] * y;
uint8_t *d = dst[plane ] + dstStride[plane] * y;
for (x = 0; x < w; x++) {
unsigned u = s[x]*a[x] + target_table[((x^y)>>5)&1][plane]*(255-a[x]) + 128;
d[x] = (257*u) >> 16;
}
}
}
}
}
} else {
int alpha_pos = desc->comp[plane_count].offset_plus1 - 1;
int w = c->srcW;
for (y = srcSliceY; y < srcSliceH; y++) {
if (sixteen_bits) {
const uint16_t *s = (const uint16_t *)(src[0] + srcStride[0] * y + 2*!alpha_pos);
const uint16_t *a = (const uint16_t *)(src[0] + srcStride[0] * y + alpha_pos);
uint16_t *d = ( uint16_t *)(dst[0] + dstStride[0] * y);
if ((!isBE(c->srcFormat)) == !HAVE_BIGENDIAN) {
for (x = 0; x < w; x++) {
for (plane = 0; plane < plane_count; plane++) {
int x_index = (plane_count + 1) * x;
unsigned u = s[x_index + plane]*a[x_index] + target_table[((x^y)>>5)&1][plane]*(max-a[x_index]) + off;
d[plane_count*x + plane] = av_clip((u + (u >> shift)) >> shift, 0, max);
}
}
} else {
for (x = 0; x < w; x++) {
for (plane = 0; plane < plane_count; plane++) {
int x_index = (plane_count + 1) * x;
unsigned aswap = av_bswap16(a[x_index]);
unsigned u = av_bswap16(s[x_index + plane])*aswap + target_table[((x^y)>>5)&1][plane]*(max-aswap) + off;
d[plane_count*x + plane] = av_clip((u + (u >> shift)) >> shift, 0, max);
}
}
}
} else {
const uint8_t *s = src[0] + srcStride[0] * y + !alpha_pos;
const uint8_t *a = src[0] + srcStride[0] * y + alpha_pos;
uint8_t *d = dst[0] + dstStride[0] * y;
for (x = 0; x < w; x++) {
for (plane = 0; plane < plane_count; plane++) {
int x_index = (plane_count + 1) * x;
unsigned u = s[x_index + plane]*a[x_index] + target_table[((x^y)>>5)&1][plane]*(255-a[x_index]) + 128;
d[plane_count*x + plane] = (257*u) >> 16;
}
}
}
}
}
 
return 0;
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/arm/Makefile
0,0 → 1,4
# OBJS += arm/swscale_unscaled.o
 
# NEON-OBJS += arm/rgb2yuv_neon_32.o
# NEON-OBJS += arm/rgb2yuv_neon_16.o
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/arm/rgb2yuv_neon_16.S
0,0 → 1,80
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "rgb2yuv_neon_common.S"
 
/* downsampled R16G16B16 x8 */
alias_qw r16x8, q7
alias_qw g16x8, q8
alias_qw b16x8, q9
 
alias n16x16_l, q11
alias n16x16_h, q12
 
alias y16x16_l, q13
alias y16x16_h, q14
 
alias_qw y8x16, q15
 
.macro init src
vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
vrshrn.i32 CO_R, q13, #7
vrshrn.i32 CO_G, q14, #7
vrshrn.i32 CO_B, q15, #7
 
vmov.u8 BIAS_Y, #16
vmov.u8 BIAS_U, #128
.endm
 
 
.macro compute_y_16x1_step action, s8x16, coeff
vmovl.u8 n16x16_l, \s8x16\()_l
vmovl.u8 n16x16_h, \s8x16\()_h
 
\action y16x16_l, n16x16_l, \coeff
\action y16x16_h, n16x16_h, \coeff
.endm
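
/* By analogy with the comment in rgb2yuv_neon_32.S:
* in: r8x16, g8x16, b8x16
* out: y8x16
* clobber: q11-q15
*/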
 
.macro compute_y_16x1
compute_y_16x1_step vmul, r8x16, CO_RY
compute_y_16x1_step vmla, g8x16, CO_GY
compute_y_16x1_step vmla, b8x16, CO_BY
 
vrshrn.i16 y8x16_l, y16x16_l, #8
vrshrn.i16 y8x16_h, y16x16_h, #8
 
vadd.u8 y8x16, y8x16, BIAS_Y
.endm
 
alias c16x8, q15
alias_qw c8x8x2, q10
 
 
.macro compute_chroma_8x1 c, C
vmul c16x8, r16x8, CO_R\C
vmla c16x8, g16x8, CO_G\C
vmla c16x8, b16x8, CO_B\C
 
vrshrn.i16 \c\()8x8, c16x8, #8
vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
.endm
 
loop_420sp rgbx, nv12, init, kernel_420_16x2, 16
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/arm/rgb2yuv_neon_32.S
0,0 → 1,119
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "rgb2yuv_neon_common.S"
 
/* downsampled R16G16B16 x8 */
alias_qw r16x8, q7
alias_qw g16x8, q8
alias_qw b16x8, q9
 
alias n16x16_o, q11
alias n16x16_ol, q11_l
alias n16x16_oh, q11_h
 
alias y32x16_el, q12
alias y32x16_eh, q13
alias y32x16_ol, q14
alias y32x16_oh, q15
 
alias y16x16_e, q12
alias y16x16_el, q12_l
alias y16x16_eh, q12_h
alias y16x16_o, q13
alias y16x16_ol, q13_l
alias y16x16_oh, q13_h
 
 
alias y8x16, y16x16_e
 
 
.macro init src
// load s32x3x3, narrow to s16x3x3
vld3.i32 {q13_l, q14_l, q15_l}, [\src]!
vld3.i32 {q13_h[0], q14_h[0], q15_h[0]}, [\src]
 
vmovn.i32 CO_R, q13
vmovn.i32 CO_G, q14
vmovn.i32 CO_B, q15
 
vmov.u8 BIAS_Y, #16
vmov.u8 BIAS_U, #128
.endm
 
 
.macro compute_y_16x1_step action, s8x16, coeff
vmov.u8 n16x16_o, #0
vtrn.u8 \s8x16, n16x16_o
 
\action y32x16_el, \s8x16\()_l, \coeff
\action y32x16_eh, \s8x16\()_h, \coeff
\action y32x16_ol, n16x16_ol, \coeff
\action y32x16_oh, n16x16_oh, \coeff
.endm
 
/*
* in: r8x16, g8x16, b8x16
* out: y8x16
* clobber: q11-q15, r8x16, g8x16, b8x16
*/
.macro compute_y_16x1
compute_y_16x1_step vmull, r8x16, CO_RY
compute_y_16x1_step vmlal, g8x16, CO_GY
compute_y_16x1_step vmlal, b8x16, CO_BY
 
vrshrn.i32 y16x16_el, y32x16_el, #15
vrshrn.i32 y16x16_eh, y32x16_eh, #15
vrshrn.i32 y16x16_ol, y32x16_ol, #15
vrshrn.i32 y16x16_oh, y32x16_oh, #15
 
vtrn.8 y16x16_e, y16x16_o
vadd.u8 y8x16, y8x16, BIAS_Y
.endm
 
alias c32x8_l, q14
alias c32x8_h, q15
 
alias_qw c16x8, q13
alias_qw c8x8x2, q10
 
.macro compute_chroma_8x1_step action, s16x8, coeff
\action c32x8_l, \s16x8\()_l, \coeff
\action c32x8_h, \s16x8\()_h, \coeff
.endm
 
/*
* in: r16x8, g16x8, b16x8
* out: c8x8
* clobber: q14-q15
*/
.macro compute_chroma_8x1 c, C
compute_chroma_8x1_step vmull, r16x8, CO_R\C
compute_chroma_8x1_step vmlal, g16x8, CO_G\C
compute_chroma_8x1_step vmlal, b16x8, CO_B\C
 
vrshrn.i32 c16x8_l, c32x8_l, #15
vrshrn.i32 c16x8_h, c32x8_h, #15
vmovn.i16 \c\()8x8, c16x8
vadd.u8 \c\()8x8, \c\()8x8, BIAS_\C
.endm
 
 
loop_420sp rgbx, nv12, init, kernel_420_16x2, 32
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/arm/rgb2yuv_neon_common.S
0,0 → 1,291
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/arm/asm.S"
 
.macro alias name, tgt, set=1
.if \set != 0
\name .req \tgt
.else
.unreq \name
.endif
.endm
 
.altmacro
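
/* Recursively define qN_l / qN_h aliases for the low and high d-register
* halves of every q register: qN_l = d(2N), qN_h = d(2N+1), for N = 0..15. */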
 
.macro alias_dw_all qw, dw_l, dw_h
alias q\qw\()_l, d\dw_l
alias q\qw\()_h, d\dw_h
.if \qw < 15
alias_dw_all %(\qw + 1), %(\dw_l + 2), %(\dw_h + 2)
.endif
.endm
 
alias_dw_all 0, 0, 1
 
.noaltmacro
 
.macro alias_qw name, qw, set=1
alias \name\(), \qw, \set
alias \name\()_l, \qw\()_l, \set
alias \name\()_h, \qw\()_h, \set
.endm
 
.macro prologue
push {r4-r12, lr}
vpush {q4-q7}
.endm
 
.macro epilogue
vpop {q4-q7}
pop {r4-r12, pc}
.endm
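
/* The prologue pushes 10 GPRs (r4-r12, lr) and four q registers (q4-q7),
* i.e. 10*4 + 4*16 bytes, so the ix-th call argument (ix >= 4) sits at
* sp + that frame size + (ix - 4)*4. */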
 
.macro load_arg reg, ix
ldr \reg, [sp, #((10 * 4 + 4 * 16) + (\ix - 4) * 4)]
.endm
 
 
/* ()_to_()_neon(const uint8_t *src, uint8_t *y, uint8_t *chroma,
* int width, int height,
* int y_stride, int c_stride, int src_stride,
* int32_t coeff_table[9]);
*/
.macro alias_loop_420sp set=1
alias src, r0, \set
alias src0, src, \set
alias y, r1, \set
alias y0, y, \set
alias chroma, r2, \set
alias width, r3, \set
alias header, width, \set
 
alias height, r4, \set
alias y_stride, r5, \set
alias c_stride, r6, \set
alias c_padding, c_stride, \set
alias src_stride, r7, \set
 
alias y0_end, r8, \set
 
alias src_padding,r9, \set
alias y_padding, r10, \set
 
alias src1, r11, \set
alias y1, r12, \set
 
alias coeff_table,r12, \set
.endm
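
/* The main loop below consumes two source/destination rows per iteration
* (4:2:0 chroma is shared between row pairs); the "header" block first
* handles the width % 16 remainder, then the kernel runs in 16-pixel steps. */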
 
 
.macro loop_420sp s_fmt, d_fmt, init, kernel, precision
 
function \s_fmt\()_to_\d_fmt\()_neon_\precision, export=1
prologue
 
alias_loop_420sp
 
load_arg height, 4
load_arg y_stride, 5
load_arg c_stride, 6
load_arg src_stride, 7
load_arg coeff_table, 8
 
\init coeff_table
 
sub y_padding, y_stride, width
sub c_padding, c_stride, width
sub src_padding, src_stride, width, LSL #2
 
add y0_end, y0, width
and header, width, #15
 
add y1, y0, y_stride
add src1, src0, src_stride
 
0:
cmp header, #0
beq 1f
 
\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma, header
 
1:
\kernel \s_fmt, \d_fmt, src0, src1, y0, y1, chroma
 
cmp y0, y0_end
blt 1b
2:
add y0, y1, y_padding
add y0_end, y1, y_stride
add chroma, chroma, c_padding
add src0, src1, src_padding
 
add y1, y0, y_stride
add src1, src0, src_stride
 
subs height, height, #2
 
bgt 0b
 
epilogue
 
alias_loop_420sp 0
 
endfunc
.endm
 
.macro downsample
vpaddl.u8 r16x8, r8x16
vpaddl.u8 g16x8, g8x16
vpaddl.u8 b16x8, b8x16
.endm
 
 
/* accumulate and right shift by 2 */
.macro downsample_ars2
vpadal.u8 r16x8, r8x16
vpadal.u8 g16x8, g8x16
vpadal.u8 b16x8, b8x16
 
vrshr.u16 r16x8, r16x8, #2
vrshr.u16 g16x8, g16x8, #2
vrshr.u16 b16x8, b16x8, #2
.endm
 
.macro store_y8_16x1 dst, count
.ifc "\count",""
vstmia \dst!, {y8x16}
.else
vstmia \dst, {y8x16}
add \dst, \dst, \count
.endif
.endm
 
.macro store_chroma_nv12_8x1 dst, count
.ifc "\count",""
vst2.i8 {u8x8, v8x8}, [\dst]!
.else
vst2.i8 {u8x8, v8x8}, [\dst], \count
.endif
.endm
 
.macro store_chroma_nv21_8x1 dst, count
.ifc "\count",""
vst2.i8 {v8x8, u8x8}, [\dst]!
.else
vst2.i8 {v8x8, u8x8}, [\dst], \count
.endif
.endm
 
.macro load_8888_16x1 a, b, c, d, src, count
.ifc "\count",""
vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]!
.else
vld4.8 {\a\()8x16_l, \b\()8x16_l, \c\()8x16_l, \d\()8x16_l}, [\src]!
vld4.8 {\a\()8x16_h, \b\()8x16_h, \c\()8x16_h, \d\()8x16_h}, [\src]
sub \src, \src, #32
add \src, \src, \count, LSL #2
.endif
.endm
 
.macro load_rgbx_16x1 src, count
load_8888_16x1 r, g, b, x, \src, \count
.endm
 
.macro load_bgrx_16x1 src, count
load_8888_16x1 b, g, r, x, \src, \count
.endm
 
.macro alias_src_rgbx set=1
alias_src_8888 r, g, b, x, \set
.endm
 
.macro alias_src_bgrx set=1
alias_src_8888 b, g, r, x, \set
.endm
 
.macro alias_dst_nv12 set=1
alias u8x8, c8x8x2_l, \set
alias v8x8, c8x8x2_h, \set
.endm
 
.macro alias_dst_nv21 set=1
alias v8x8, c8x8x2_l, \set
alias u8x8, c8x8x2_h, \set
.endm
 
 
// common aliases
 
alias CO_R, d0
CO_RY .dn d0.s16[0]
CO_RU .dn d0.s16[1]
CO_RV .dn d0.s16[2]
 
alias CO_G, d1
CO_GY .dn d1.s16[0]
CO_GU .dn d1.s16[1]
CO_GV .dn d1.s16[2]
 
alias CO_B, d2
CO_BY .dn d2.s16[0]
CO_BU .dn d2.s16[1]
CO_BV .dn d2.s16[2]
 
alias BIAS_U, d3
alias BIAS_V, BIAS_U
 
alias BIAS_Y, q2
 
 
/* q3-q6 R8G8B8X8 x16 */
 
.macro alias_src_8888 a, b, c, d, set
alias_qw \a\()8x16, q3, \set
alias_qw \b\()8x16, q4, \set
alias_qw \c\()8x16, q5, \set
alias_qw \d\()8x16, q6, \set
.endm
 
.macro kernel_420_16x2 rgb_fmt, yuv_fmt, rgb0, rgb1, y0, y1, chroma, count
alias_src_\rgb_fmt
alias_dst_\yuv_fmt
 
load_\rgb_fmt\()_16x1 \rgb0, \count
 
downsample
compute_y_16x1
store_y8_16x1 \y0, \count
 
 
load_\rgb_fmt\()_16x1 \rgb1, \count
downsample_ars2
compute_y_16x1
store_y8_16x1 \y1, \count
 
compute_chroma_8x1 u, U
compute_chroma_8x1 v, V
 
store_chroma_\yuv_fmt\()_8x1 \chroma, \count
 
alias_dst_\yuv_fmt 0
alias_src_\rgb_fmt 0
.endm
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/arm/swscale_unscaled.c
0,0 → 1,79
/*
* Copyright (C) 2013 Xiaolei Yu <dreifachstein@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/arm/cpu.h"
 
extern void rgbx_to_nv12_neon_32(const uint8_t *src, uint8_t *y, uint8_t *chroma,
int width, int height,
int y_stride, int c_stride, int src_stride,
int32_t coeff_tbl[9]);
 
extern void rgbx_to_nv12_neon_16(const uint8_t *src, uint8_t *y, uint8_t *chroma,
int width, int height,
int y_stride, int c_stride, int src_stride,
int32_t coeff_tbl[9]);
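
/* The _32 variant widens to 32-bit accumulators and keeps full coefficient
* precision; the _16 variant works in 16 bits with coefficients rounded to
* fewer bits, trading accuracy for speed (see the two .S files above). */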
 
static int rgbx_to_nv12_neon_32_wrapper(SwsContext *context, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[]) {
 
rgbx_to_nv12_neon_32(src[0] + srcSliceY * srcStride[0],
dst[0] + srcSliceY * dstStride[0],
dst[1] + (srcSliceY / 2) * dstStride[1],
context->srcW, srcSliceH,
dstStride[0], dstStride[1], srcStride[0],
context->input_rgb2yuv_table);
 
return 0;
}
 
static int rgbx_to_nv12_neon_16_wrapper(SwsContext *context, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[]) {
 
rgbx_to_nv12_neon_16(src[0] + srcSliceY * srcStride[0],
dst[0] + srcSliceY * dstStride[0],
dst[1] + (srcSliceY / 2) * dstStride[1],
context->srcW, srcSliceH,
dstStride[0], dstStride[1], srcStride[0],
context->input_rgb2yuv_table);
 
return 0;
}
 
static void get_unscaled_swscale_neon(SwsContext *c) {
int accurate_rnd = c->flags & SWS_ACCURATE_RND;
if (c->srcFormat == AV_PIX_FMT_RGBA
&& c->dstFormat == AV_PIX_FMT_NV12
&& (c->srcW >= 16)) {
c->swscale = accurate_rnd ? rgbx_to_nv12_neon_32_wrapper
: rgbx_to_nv12_neon_16_wrapper;
}
}
 
void ff_get_unscaled_swscale_arm(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
if (have_neon(cpu_flags))
get_unscaled_swscale_neon(c);
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/bayer_template.c
0,0 → 1,236
/*
* Bayer-to-RGB/YV12 template
* Copyright (c) 2011-2014 Peter Ross <pross@xvid.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#if defined(BAYER_BGGR) || defined(BAYER_GBRG)
#define BAYER_R 0
#define BAYER_G 1
#define BAYER_B 2
#endif
#if defined(BAYER_RGGB) || defined(BAYER_GRBG)
#define BAYER_R 2
#define BAYER_G 1
#define BAYER_B 0
#endif
 
#if defined(BAYER_8)
#define BAYER_READ(x) (x)
#define BAYER_SIZEOF 1
#define BAYER_SHIFT 0
#endif
#if defined(BAYER_16LE)
#define BAYER_READ(x) AV_RL16(&(x))
#define BAYER_SIZEOF 2
#define BAYER_SHIFT 8
#endif
#if defined(BAYER_16BE)
#define BAYER_READ(x) AV_RB16(&(x))
#define BAYER_SIZEOF 2
#define BAYER_SHIFT 8
#endif
 
#define S(y, x) BAYER_READ(src[(y)*src_stride + BAYER_SIZEOF*(x)])
#define T(y, x) (unsigned int)S(y, x)
#define R(y, x) dst[(y)*dst_stride + (x)*3 + BAYER_R]
#define G(y, x) dst[(y)*dst_stride + (x)*3 + BAYER_G]
#define B(y, x) dst[(y)*dst_stride + (x)*3 + BAYER_B]
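
/* S() reads one Bayer sample (8- or 16-bit, via BAYER_READ), T() widens it
* to unsigned for summing, and R()/G()/B() address the RGB24 output pixel at
* (y, x). */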
 
#if defined(BAYER_BGGR) || defined(BAYER_RGGB)
#define BAYER_TO_RGB24_COPY \
R(0, 0) = \
R(0, 1) = \
R(1, 1) = \
R(1, 0) = S(1, 1) >> BAYER_SHIFT; \
\
G(0, 1) = S(0, 1) >> BAYER_SHIFT; \
G(0, 0) = \
G(1, 1) = (T(0, 1) + T(1, 0)) >> (1 + BAYER_SHIFT); \
G(1, 0) = S(1, 0) >> BAYER_SHIFT; \
\
B(1, 1) = \
B(0, 0) = \
B(0, 1) = \
B(1, 0) = S(0, 0) >> BAYER_SHIFT;
#define BAYER_TO_RGB24_INTERPOLATE \
R(0, 0) = (T(-1, -1) + T(-1, 1) + T(1, -1) + T(1, 1)) >> (2 + BAYER_SHIFT); \
G(0, 0) = (T(-1, 0) + T( 0, -1) + T(0, 1) + T(1, 0)) >> (2 + BAYER_SHIFT); \
B(0, 0) = S(0, 0) >> BAYER_SHIFT; \
\
R(0, 1) = (T(-1, 1) + T(1, 1)) >> (1 + BAYER_SHIFT); \
G(0, 1) = S(0, 1) >> BAYER_SHIFT; \
B(0, 1) = (T(0, 0) + T(0, 2)) >> (1 + BAYER_SHIFT); \
\
R(1, 0) = (T(1, -1) + T(1, 1)) >> (1 + BAYER_SHIFT); \
G(1, 0) = S(1, 0) >> BAYER_SHIFT; \
B(1, 0) = (T(0, 0) + T(2, 0)) >> (1 + BAYER_SHIFT); \
\
R(1, 1) = S(1, 1) >> BAYER_SHIFT; \
G(1, 1) = (T(0, 1) + T(1, 0) + T(1, 2) + T(2, 1)) >> (2 + BAYER_SHIFT); \
B(1, 1) = (T(0, 0) + T(0, 2) + T(2, 0) + T(2, 2)) >> (2 + BAYER_SHIFT);
#else
#define BAYER_TO_RGB24_COPY \
R(0, 0) = \
R(0, 1) = \
R(1, 1) = \
R(1, 0) = S(1, 0) >> BAYER_SHIFT; \
\
G(0, 0) = S(0, 0) >> BAYER_SHIFT; \
G(1, 1) = S(1, 1) >> BAYER_SHIFT; \
G(0, 1) = \
G(1, 0) = (T(0, 0) + T(1, 1)) >> (1 + BAYER_SHIFT); \
\
B(1, 1) = \
B(0, 0) = \
B(0, 1) = \
B(1, 0) = S(0, 1) >> BAYER_SHIFT;
#define BAYER_TO_RGB24_INTERPOLATE \
R(0, 0) = (T(-1, 0) + T(1, 0)) >> (1 + BAYER_SHIFT); \
G(0, 0) = S(0, 0) >> BAYER_SHIFT; \
B(0, 0) = (T(0, -1) + T(0, 1)) >> (1 + BAYER_SHIFT); \
\
R(0, 1) = (T(-1, 0) + T(-1, 2) + T(1, 0) + T(1, 2)) >> (2 + BAYER_SHIFT); \
G(0, 1) = (T(-1, 1) + T(0, 0) + T(0, 2) + T(1, 1)) >> (2 + BAYER_SHIFT); \
B(0, 1) = S(0, 1) >> BAYER_SHIFT; \
\
R(1, 0) = S(1, 0) >> BAYER_SHIFT; \
G(1, 0) = (T(0, 0) + T(1, -1) + T(1, 1) + T(2, 0)) >> (2 + BAYER_SHIFT); \
B(1, 0) = (T(0, -1) + T(0, 1) + T(2, -1) + T(2, 1)) >> (2 + BAYER_SHIFT); \
\
R(1, 1) = (T(1, 0) + T(1, 2)) >> (1 + BAYER_SHIFT); \
G(1, 1) = S(1, 1) >> BAYER_SHIFT; \
B(1, 1) = (T(0, 1) + T(2, 1)) >> (1 + BAYER_SHIFT);
#endif
 
/**
* invoke ff_rgb24toyv12 for 2x2 pixels
*/
#define rgb24toyv12_2x2(src, dstY, dstU, dstV, luma_stride, src_stride, rgb2yuv) \
ff_rgb24toyv12(src, dstY, dstV, dstU, 2, 2, luma_stride, 0, src_stride, rgb2yuv)
 
static void BAYER_RENAME(rgb24_copy)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int width)
{
int i;
for (i = 0 ; i < width; i+= 2) {
BAYER_TO_RGB24_COPY
src += 2 * BAYER_SIZEOF;
dst += 6;
}
}
 
static void BAYER_RENAME(rgb24_interpolate)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int width)
{
int i;
 
BAYER_TO_RGB24_COPY
src += 2 * BAYER_SIZEOF;
dst += 6;
 
for (i = 2 ; i < width - 2; i+= 2) {
BAYER_TO_RGB24_INTERPOLATE
src += 2 * BAYER_SIZEOF;
dst += 6;
}
 
if (width > 2) {
BAYER_TO_RGB24_COPY
}
}
 
static void BAYER_RENAME(yv12_copy)(const uint8_t *src, int src_stride, uint8_t *dstY, uint8_t *dstU, uint8_t *dstV, int luma_stride, int width, int32_t *rgb2yuv)
{
uint8_t dst[12];
const int dst_stride = 6;
int i;
for (i = 0 ; i < width; i+= 2) {
BAYER_TO_RGB24_COPY
rgb24toyv12_2x2(dst, dstY, dstU, dstV, luma_stride, dst_stride, rgb2yuv);
src += 2 * BAYER_SIZEOF;
dstY += 2;
dstU++;
dstV++;
}
}
 
static void BAYER_RENAME(yv12_interpolate)(const uint8_t *src, int src_stride, uint8_t *dstY, uint8_t *dstU, uint8_t *dstV, int luma_stride, int width, int32_t *rgb2yuv)
{
uint8_t dst[12];
const int dst_stride = 6;
int i;
 
BAYER_TO_RGB24_COPY
rgb24toyv12_2x2(dst, dstY, dstU, dstV, luma_stride, dst_stride, rgb2yuv);
src += 2 * BAYER_SIZEOF;
dstY += 2;
dstU++;
dstV++;
 
for (i = 2 ; i < width - 2; i+= 2) {
BAYER_TO_RGB24_INTERPOLATE
rgb24toyv12_2x2(dst, dstY, dstU, dstV, luma_stride, dst_stride, rgb2yuv);
src += 2 * BAYER_SIZEOF;
dstY += 2;
dstU++;
dstV++;
}
 
if (width > 2) {
BAYER_TO_RGB24_COPY
rgb24toyv12_2x2(dst, dstY, dstU, dstV, luma_stride, dst_stride, rgb2yuv);
}
}
 
#undef S
#undef T
#undef R
#undef G
#undef B
#undef BAYER_TO_RGB24_COPY
#undef BAYER_TO_RGB24_INTERPOLATE
 
#undef BAYER_RENAME
 
#undef BAYER_R
#undef BAYER_G
#undef BAYER_B
#undef BAYER_READ
#undef BAYER_SIZEOF
#undef BAYER_SHIFT
 
#if defined(BAYER_BGGR)
#undef BAYER_BGGR
#endif
#if defined(BAYER_RGGB)
#undef BAYER_RGGB
#endif
#if defined(BAYER_GBRG)
#undef BAYER_GBRG
#endif
#if defined(BAYER_GRBG)
#undef BAYER_GRBG
#endif
#if defined(BAYER_8)
#undef BAYER_8
#endif
#if defined(BAYER_16LE)
#undef BAYER_16LE
#endif
#if defined(BAYER_16BE)
#undef BAYER_16BE
#endif
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/colorspace-test.c
0,0 → 1,170
/*
* Copyright (C) 2002 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <stdio.h>
#include <string.h> /* for memset() */
#include <stdlib.h>
#include <inttypes.h>
 
#include "swscale.h"
#include "rgb2rgb.h"
#include "libavutil/mem.h"
 
#define SIZE 1000
#define srcByte 0x55
#define dstByte 0xBB
 
#define FUNC(s, d, n) { s, d, #n, n }
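
/* Strategy: fill src with 0x55 and dst with 0xBB, run each converter at many
* widths and buffer offsets, then verify that src is untouched and that no
* dst byte outside the written region changed. */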
 
int main(int argc, char **argv)
{
int i, funcNum;
uint8_t *srcBuffer = av_malloc(SIZE);
uint8_t *dstBuffer = av_malloc(SIZE);
int failedNum = 0;
int passedNum = 0;
 
if (!srcBuffer || !dstBuffer)
return -1;
 
av_log(NULL, AV_LOG_INFO, "memory corruption test ...\n");
sws_rgb2rgb_init();
 
for (funcNum = 0; ; funcNum++) {
struct func_info_s {
int src_bpp;
int dst_bpp;
const char *name;
void (*func)(const uint8_t *src, uint8_t *dst, int src_size);
} func_info[] = {
FUNC(2, 2, rgb12to15),
FUNC(2, 2, rgb15to16),
FUNC(2, 3, rgb15to24),
FUNC(2, 4, rgb15to32),
FUNC(2, 3, rgb16to24),
FUNC(2, 4, rgb16to32),
FUNC(3, 2, rgb24to15),
FUNC(3, 2, rgb24to16),
FUNC(3, 4, rgb24to32),
FUNC(4, 2, rgb32to15),
FUNC(4, 2, rgb32to16),
FUNC(4, 3, rgb32to24),
FUNC(2, 2, rgb16to15),
FUNC(2, 2, rgb12tobgr12),
FUNC(2, 2, rgb15tobgr15),
FUNC(2, 2, rgb15tobgr16),
FUNC(2, 3, rgb15tobgr24),
FUNC(2, 4, rgb15tobgr32),
FUNC(2, 2, rgb16tobgr15),
FUNC(2, 2, rgb16tobgr16),
FUNC(2, 3, rgb16tobgr24),
FUNC(2, 4, rgb16tobgr32),
FUNC(3, 2, rgb24tobgr15),
FUNC(3, 2, rgb24tobgr16),
FUNC(3, 3, rgb24tobgr24),
FUNC(3, 4, rgb24tobgr32),
FUNC(4, 2, rgb32tobgr15),
FUNC(4, 2, rgb32tobgr16),
FUNC(4, 3, rgb32tobgr24),
FUNC(4, 4, shuffle_bytes_2103), /* rgb32tobgr32 */
FUNC(6, 6, rgb48tobgr48_nobswap),
FUNC(6, 6, rgb48tobgr48_bswap),
FUNC(8, 6, rgb64to48_nobswap),
FUNC(8, 6, rgb64to48_bswap),
FUNC(8, 6, rgb64tobgr48_nobswap),
FUNC(8, 6, rgb64tobgr48_bswap),
FUNC(0, 0, NULL)
};
int width;
int failed = 0;
int srcBpp = 0;
int dstBpp = 0;
 
if (!func_info[funcNum].func)
break;
 
av_log(NULL, AV_LOG_INFO, ".");
memset(srcBuffer, srcByte, SIZE);
 
for (width = 63; width > 0; width--) {
int dstOffset;
for (dstOffset = 128; dstOffset < 196; dstOffset += 4) {
int srcOffset;
memset(dstBuffer, dstByte, SIZE);
 
for (srcOffset = 128; srcOffset < 196; srcOffset += 4) {
uint8_t *src = srcBuffer + srcOffset;
uint8_t *dst = dstBuffer + dstOffset;
const char *name = NULL;
 
// don't fill the screen with shit ...
if (failed)
break;
 
srcBpp = func_info[funcNum].src_bpp;
dstBpp = func_info[funcNum].dst_bpp;
name = func_info[funcNum].name;
 
func_info[funcNum].func(src, dst, width * srcBpp);
 
if (!srcBpp)
break;
 
for (i = 0; i < SIZE; i++) {
if (srcBuffer[i] != srcByte) {
av_log(NULL, AV_LOG_INFO,
"src damaged at %d w:%d src:%d dst:%d %s\n",
i, width, srcOffset, dstOffset, name);
failed = 1;
break;
}
}
for (i = 0; i < dstOffset; i++) {
if (dstBuffer[i] != dstByte) {
av_log(NULL, AV_LOG_INFO,
"dst damaged at %d w:%d src:%d dst:%d %s\n",
i, width, srcOffset, dstOffset, name);
failed = 1;
break;
}
}
for (i = dstOffset + width * dstBpp; i < SIZE; i++) {
if (dstBuffer[i] != dstByte) {
av_log(NULL, AV_LOG_INFO,
"dst damaged at %d w:%d src:%d dst:%d %s\n",
i, width, srcOffset, dstOffset, name);
failed = 1;
break;
}
}
}
}
}
if (failed)
failedNum++;
else if (srcBpp)
passedNum++;
}
 
av_log(NULL, AV_LOG_INFO,
"\n%d converters passed, %d converters randomly overwrote memory\n",
passedNum, failedNum);
return failedNum;
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/gamma.c
0,0 → 1,72
/*
* Copyright (C) 2015 Pedro Arthur <bygrandao@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "swscale_internal.h"
 
typedef struct GammaContext
{
uint16_t *table;
} GammaContext;
 
// gamma_convert expects a 16-bit RGB format
// it writes directly into the src slice, which therefore must be modifiable (this is ensured through the cascade context)
static int gamma_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
GammaContext *instance = desc->instance;
uint16_t *table = instance->table;
int srcW = desc->src->width;
 
int i;
for (i = 0; i < sliceH; ++i) {
uint8_t ** src = desc->src->plane[0].line;
int src_pos = sliceY+i - desc->src->plane[0].sliceY;
 
uint16_t *src1 = (uint16_t*)*(src+src_pos);
int j;
for (j = 0; j < srcW; ++j) {
uint16_t r = AV_RL16(src1 + j*4 + 0);
uint16_t g = AV_RL16(src1 + j*4 + 1);
uint16_t b = AV_RL16(src1 + j*4 + 2);
 
AV_WL16(src1 + j*4 + 0, table[r]);
AV_WL16(src1 + j*4 + 1, table[g]);
AV_WL16(src1 + j*4 + 2, table[b]);
}
 
}
return sliceH;
}
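
/* The 2^16-entry LUT is built by the caller during context init (see
* alloc_gamma_tbl() in utils.c); a minimal sketch of such a builder,
* assuming exponent e:
*
*   for (i = 0; i < 65536; i++)
*       table[i] = lrint(pow(i / 65535.0, e) * 65535.0);
*/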
 
 
int ff_init_gamma_convert(SwsFilterDescriptor *desc, SwsSlice * src, uint16_t *table)
{
GammaContext *li = av_malloc(sizeof(GammaContext));
if (!li)
return AVERROR(ENOMEM);
li->table = table;
 
desc->instance = li;
desc->src = src;
desc->dst = NULL;
desc->process = &gamma_convert;
 
return 0;
}
 
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/hscale.c
0,0 → 1,274
/*
* Copyright (C) 2015 Pedro Arthur <bygrandao@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "swscale_internal.h"
 
static int lum_h_scale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
FilterContext *instance = desc->instance;
int srcW = desc->src->width;
int dstW = desc->dst->width;
int xInc = instance->xInc;
 
int i;
for (i = 0; i < sliceH; ++i) {
uint8_t ** src = desc->src->plane[0].line;
uint8_t ** dst = desc->dst->plane[0].line;
int src_pos = sliceY+i - desc->src->plane[0].sliceY;
int dst_pos = sliceY+i - desc->dst->plane[0].sliceY;
 
 
if (c->hyscale_fast) {
c->hyscale_fast(c, (int16_t*)dst[dst_pos], dstW, src[src_pos], srcW, xInc);
} else {
c->hyScale(c, (int16_t*)dst[dst_pos], dstW, (const uint8_t *)src[src_pos], instance->filter,
instance->filter_pos, instance->filter_size);
}
 
if (c->lumConvertRange)
c->lumConvertRange((int16_t*)dst[dst_pos], dstW);
 
desc->dst->plane[0].sliceH += 1;
 
if (desc->alpha) {
src = desc->src->plane[3].line;
dst = desc->dst->plane[3].line;
 
src_pos = sliceY+i - desc->src->plane[3].sliceY;
dst_pos = sliceY+i - desc->dst->plane[3].sliceY;
 
desc->dst->plane[3].sliceH += 1;
 
if (c->hyscale_fast) {
c->hyscale_fast(c, (int16_t*)dst[dst_pos], dstW, src[src_pos], srcW, xInc);
} else {
c->hyScale(c, (int16_t*)dst[dst_pos], dstW, (const uint8_t *)src[src_pos], instance->filter,
instance->filter_pos, instance->filter_size);
}
}
}
 
return sliceH;
}
 
static int lum_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
int srcW = desc->src->width;
ColorContext * instance = desc->instance;
uint32_t * pal = instance->pal;
int i;
 
desc->dst->plane[0].sliceY = sliceY;
desc->dst->plane[0].sliceH = sliceH;
desc->dst->plane[3].sliceY = sliceY;
desc->dst->plane[3].sliceH = sliceH;
 
for (i = 0; i < sliceH; ++i) {
int sp0 = sliceY+i - desc->src->plane[0].sliceY;
int sp1 = ((sliceY+i) >> desc->src->v_chr_sub_sample) - desc->src->plane[1].sliceY;
const uint8_t * src[4] = { desc->src->plane[0].line[sp0],
desc->src->plane[1].line[sp1],
desc->src->plane[2].line[sp1],
desc->src->plane[3].line[sp0]};
uint8_t * dst = desc->dst->plane[0].line[i];
 
if (c->lumToYV12) {
c->lumToYV12(dst, src[0], src[1], src[2], srcW, pal);
} else if (c->readLumPlanar) {
c->readLumPlanar(dst, src, srcW, c->input_rgb2yuv_table);
}
 
 
if (desc->alpha) {
dst = desc->dst->plane[3].line[i];
if (c->alpToYV12) {
c->alpToYV12(dst, src[3], src[1], src[2], srcW, pal);
} else if (c->readAlpPlanar) {
c->readAlpPlanar(dst, src, srcW, NULL);
}
}
}
 
return sliceH;
}
 
int ff_init_desc_fmt_convert(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst, uint32_t *pal)
{
ColorContext * li = av_malloc(sizeof(ColorContext));
if (!li)
return AVERROR(ENOMEM);
li->pal = pal;
desc->instance = li;
 
desc->alpha = isALPHA(src->fmt) && isALPHA(dst->fmt);
desc->src = src;
desc->dst = dst;
desc->process = &lum_convert;
 
return 0;
}
 
 
int ff_init_desc_hscale(SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst, uint16_t *filter, int * filter_pos, int filter_size, int xInc)
{
FilterContext *li = av_malloc(sizeof(FilterContext));
if (!li)
return AVERROR(ENOMEM);
 
li->filter = filter;
li->filter_pos = filter_pos;
li->filter_size = filter_size;
li->xInc = xInc;
 
desc->instance = li;
 
desc->alpha = isALPHA(src->fmt) && isALPHA(dst->fmt);
desc->src = src;
desc->dst = dst;
 
desc->process = &lum_h_scale;
 
return 0;
}
 
static int chr_h_scale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
FilterContext *instance = desc->instance;
int srcW = FF_CEIL_RSHIFT(desc->src->width, desc->src->h_chr_sub_sample);
int dstW = FF_CEIL_RSHIFT(desc->dst->width, desc->dst->h_chr_sub_sample);
int xInc = instance->xInc;
 
uint8_t ** src1 = desc->src->plane[1].line;
uint8_t ** dst1 = desc->dst->plane[1].line;
uint8_t ** src2 = desc->src->plane[2].line;
uint8_t ** dst2 = desc->dst->plane[2].line;
 
int src_pos1 = sliceY - desc->src->plane[1].sliceY;
int dst_pos1 = sliceY - desc->dst->plane[1].sliceY;
 
int src_pos2 = sliceY - desc->src->plane[2].sliceY;
int dst_pos2 = sliceY - desc->dst->plane[2].sliceY;
 
int i;
for (i = 0; i < sliceH; ++i) {
if (c->hcscale_fast) {
c->hcscale_fast(c, (uint16_t*)dst1[dst_pos1+i], (uint16_t*)dst2[dst_pos2+i], dstW, src1[src_pos1+i], src2[src_pos2+i], srcW, xInc);
} else {
c->hcScale(c, (uint16_t*)dst1[dst_pos1+i], dstW, src1[src_pos1+i], instance->filter, instance->filter_pos, instance->filter_size);
c->hcScale(c, (uint16_t*)dst2[dst_pos2+i], dstW, src2[src_pos2+i], instance->filter, instance->filter_pos, instance->filter_size);
}
 
if (c->chrConvertRange)
c->chrConvertRange((uint16_t*)dst1[dst_pos1+i], (uint16_t*)dst2[dst_pos2+i], dstW);
 
desc->dst->plane[1].sliceH += 1;
desc->dst->plane[2].sliceH += 1;
}
return sliceH;
}
 
static int chr_convert(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
int srcW = FF_CEIL_RSHIFT(desc->src->width, desc->src->h_chr_sub_sample);
ColorContext * instance = desc->instance;
uint32_t * pal = instance->pal;
 
int sp0 = (sliceY - (desc->src->plane[0].sliceY >> desc->src->v_chr_sub_sample)) << desc->src->v_chr_sub_sample;
int sp1 = sliceY - desc->src->plane[1].sliceY;
 
int i;
 
desc->dst->plane[1].sliceY = sliceY;
desc->dst->plane[1].sliceH = sliceH;
desc->dst->plane[2].sliceY = sliceY;
desc->dst->plane[2].sliceH = sliceH;
 
for (i = 0; i < sliceH; ++i) {
const uint8_t * src[4] = { desc->src->plane[0].line[sp0+i],
desc->src->plane[1].line[sp1+i],
desc->src->plane[2].line[sp1+i],
desc->src->plane[3].line[sp0+i]};
 
uint8_t * dst1 = desc->dst->plane[1].line[i];
uint8_t * dst2 = desc->dst->plane[2].line[i];
if (c->chrToYV12) {
c->chrToYV12(dst1, dst2, src[0], src[1], src[2], srcW, pal);
} else if (c->readChrPlanar) {
c->readChrPlanar(dst1, dst2, src, srcW, c->input_rgb2yuv_table);
}
}
return sliceH;
}
 
int ff_init_desc_cfmt_convert(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst, uint32_t *pal)
{
ColorContext * li = av_malloc(sizeof(ColorContext));
if (!li)
return AVERROR(ENOMEM);
li->pal = pal;
desc->instance = li;
 
desc->src = src;
desc->dst = dst;
desc->process = &chr_convert;
 
return 0;
}
 
int ff_init_desc_chscale(SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst, uint16_t *filter, int * filter_pos, int filter_size, int xInc)
{
FilterContext *li = av_malloc(sizeof(FilterContext));
if (!li)
return AVERROR(ENOMEM);
 
li->filter = filter;
li->filter_pos = filter_pos;
li->filter_size = filter_size;
li->xInc = xInc;
 
desc->instance = li;
 
desc->alpha = isALPHA(src->fmt) && isALPHA(dst->fmt);
desc->src = src;
desc->dst = dst;
 
desc->process = &chr_h_scale;
 
return 0;
}
 
static int no_chr_scale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
desc->dst->plane[1].sliceY = sliceY + sliceH - desc->dst->plane[1].available_lines;
desc->dst->plane[1].sliceH = desc->dst->plane[1].available_lines;
desc->dst->plane[2].sliceY = sliceY + sliceH - desc->dst->plane[2].available_lines;
desc->dst->plane[2].sliceH = desc->dst->plane[2].available_lines;
return 0;
}
 
int ff_init_desc_no_chr(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst)
{
desc->src = src;
desc->dst = dst;
desc->alpha = 0;
desc->instance = NULL;
desc->process = &no_chr_scale;
return 0;
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/hscale_fast_bilinear.c
0,0 → 1,55
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "swscale_internal.h"
 
void ff_hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
const uint8_t *src, int srcW, int xInc)
{
int i;
unsigned int xpos = 0;
for (i = 0; i < dstWidth; i++) {
register unsigned int xx = xpos >> 16;
register unsigned int xalpha = (xpos & 0xFFFF) >> 9;
dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha;
xpos += xInc;
}
for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
dst[i] = src[srcW-1]*128;
}
 
void ff_hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
int dstWidth, const uint8_t *src1,
const uint8_t *src2, int srcW, int xInc)
{
int i;
unsigned int xpos = 0;
for (i = 0; i < dstWidth; i++) {
register unsigned int xx = xpos >> 16;
register unsigned int xalpha = (xpos & 0xFFFF) >> 9;
dst1[i] = (src1[xx] * (xalpha ^ 127) + src1[xx + 1] * xalpha);
dst2[i] = (src2[xx] * (xalpha ^ 127) + src2[xx + 1] * xalpha);
xpos += xInc;
}
for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
dst1[i] = src1[srcW-1]*128;
dst2[i] = src2[srcW-1]*128;
}
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/input.c
0,0 → 1,1441
/*
* Copyright (C) 2001-2012 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
 
#include "libavutil/avutil.h"
#include "libavutil/bswap.h"
#include "libavutil/cpu.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/mathematics.h"
#include "libavutil/pixdesc.h"
#include "libavutil/avassert.h"
#include "config.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
 
#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
 
#define r ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) ? b_r : r_b)
#define b ((origin == AV_PIX_FMT_BGR48BE || origin == AV_PIX_FMT_BGR48LE || origin == AV_PIX_FMT_BGRA64BE || origin == AV_PIX_FMT_BGRA64LE) ? r_b : b_r)
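
/* r and b resolve to b_r or r_b depending on whether the origin format is
* BGR- or RGB-ordered, so a single template serves both component orders. */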
 
static av_always_inline void
rgb64ToY_c_template(uint16_t *dst, const uint16_t *src, int width,
enum AVPixelFormat origin, int32_t *rgb2yuv)
{
int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
int i;
for (i = 0; i < width; i++) {
unsigned int r_b = input_pixel(&src[i*4+0]);
unsigned int g = input_pixel(&src[i*4+1]);
unsigned int b_r = input_pixel(&src[i*4+2]);
 
dst[i] = (ry*r + gy*g + by*b + (0x2001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
}
}
 
static av_always_inline void
rgb64ToUV_c_template(uint16_t *dstU, uint16_t *dstV,
const uint16_t *src1, const uint16_t *src2,
int width, enum AVPixelFormat origin, int32_t *rgb2yuv)
{
int i;
int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
av_assert1(src1==src2);
for (i = 0; i < width; i++) {
int r_b = input_pixel(&src1[i*4+0]);
int g = input_pixel(&src1[i*4+1]);
int b_r = input_pixel(&src1[i*4+2]);
 
dstU[i] = (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
dstV[i] = (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
}
}
 
static av_always_inline void
rgb64ToUV_half_c_template(uint16_t *dstU, uint16_t *dstV,
const uint16_t *src1, const uint16_t *src2,
int width, enum AVPixelFormat origin, int32_t *rgb2yuv)
{
int i;
int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
av_assert1(src1==src2);
for (i = 0; i < width; i++) {
int r_b = (input_pixel(&src1[8 * i + 0]) + input_pixel(&src1[8 * i + 4]) + 1) >> 1;
int g = (input_pixel(&src1[8 * i + 1]) + input_pixel(&src1[8 * i + 5]) + 1) >> 1;
int b_r = (input_pixel(&src1[8 * i + 2]) + input_pixel(&src1[8 * i + 6]) + 1) >> 1;
 
dstU[i]= (ru*r + gu*g + bu*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
dstV[i]= (rv*r + gv*g + bv*b + (0x10001<<(RGB2YUV_SHIFT-1))) >> RGB2YUV_SHIFT;
}
}
 
#define rgb64funcs(pattern, BE_LE, origin) \
static void pattern ## 64 ## BE_LE ## ToY_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused0, const uint8_t *unused1,\
int width, uint32_t *rgb2yuv) \
{ \
const uint16_t *src = (const uint16_t *) _src; \
uint16_t *dst = (uint16_t *) _dst; \
rgb64ToY_c_template(dst, src, width, origin, rgb2yuv); \
} \
\
static void pattern ## 64 ## BE_LE ## ToUV_c(uint8_t *_dstU, uint8_t *_dstV, \
const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
int width, uint32_t *rgb2yuv) \
{ \
const uint16_t *src1 = (const uint16_t *) _src1, \
*src2 = (const uint16_t *) _src2; \
uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
rgb64ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
} \
\
static void pattern ## 64 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, \
const uint8_t *unused0, const uint8_t *_src1, const uint8_t *_src2, \
int width, uint32_t *rgb2yuv) \
{ \
const uint16_t *src1 = (const uint16_t *) _src1, \
*src2 = (const uint16_t *) _src2; \
uint16_t *dstU = (uint16_t *) _dstU, *dstV = (uint16_t *) _dstV; \
rgb64ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
}
 
rgb64funcs(rgb, LE, AV_PIX_FMT_RGBA64LE)
rgb64funcs(rgb, BE, AV_PIX_FMT_RGBA64BE)
rgb64funcs(bgr, LE, AV_PIX_FMT_BGRA64LE)
rgb64funcs(bgr, BE, AV_PIX_FMT_BGRA64BE)
 
static av_always_inline void rgb48ToY_c_template(uint16_t *dst,
const uint16_t *src, int width,
enum AVPixelFormat origin,
int32_t *rgb2yuv)
{
int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
int i;
for (i = 0; i < width; i++) {
unsigned int r_b = input_pixel(&src[i * 3 + 0]);
unsigned int g = input_pixel(&src[i * 3 + 1]);
unsigned int b_r = input_pixel(&src[i * 3 + 2]);
 
dst[i] = (ry*r + gy*g + by*b + (0x2001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
}
}
 
static av_always_inline void rgb48ToUV_c_template(uint16_t *dstU,
uint16_t *dstV,
const uint16_t *src1,
const uint16_t *src2,
int width,
enum AVPixelFormat origin,
int32_t *rgb2yuv)
{
int i;
int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
av_assert1(src1 == src2);
for (i = 0; i < width; i++) {
int r_b = input_pixel(&src1[i * 3 + 0]);
int g = input_pixel(&src1[i * 3 + 1]);
int b_r = input_pixel(&src1[i * 3 + 2]);
 
dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
}
}
 
static av_always_inline void rgb48ToUV_half_c_template(uint16_t *dstU,
uint16_t *dstV,
const uint16_t *src1,
const uint16_t *src2,
int width,
enum AVPixelFormat origin,
int32_t *rgb2yuv)
{
int i;
int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
av_assert1(src1 == src2);
for (i = 0; i < width; i++) {
int r_b = (input_pixel(&src1[6 * i + 0]) +
input_pixel(&src1[6 * i + 3]) + 1) >> 1;
int g = (input_pixel(&src1[6 * i + 1]) +
input_pixel(&src1[6 * i + 4]) + 1) >> 1;
int b_r = (input_pixel(&src1[6 * i + 2]) +
input_pixel(&src1[6 * i + 5]) + 1) >> 1;
 
dstU[i] = (ru*r + gu*g + bu*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
dstV[i] = (rv*r + gv*g + bv*b + (0x10001 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT;
}
}
 
#undef r
#undef b
#undef input_pixel
 
#define rgb48funcs(pattern, BE_LE, origin) \
static void pattern ## 48 ## BE_LE ## ToY_c(uint8_t *_dst, \
const uint8_t *_src, \
const uint8_t *unused0, const uint8_t *unused1,\
int width, \
uint32_t *rgb2yuv) \
{ \
const uint16_t *src = (const uint16_t *)_src; \
uint16_t *dst = (uint16_t *)_dst; \
rgb48ToY_c_template(dst, src, width, origin, rgb2yuv); \
} \
\
static void pattern ## 48 ## BE_LE ## ToUV_c(uint8_t *_dstU, \
uint8_t *_dstV, \
const uint8_t *unused0, \
const uint8_t *_src1, \
const uint8_t *_src2, \
int width, \
uint32_t *rgb2yuv) \
{ \
const uint16_t *src1 = (const uint16_t *)_src1, \
*src2 = (const uint16_t *)_src2; \
uint16_t *dstU = (uint16_t *)_dstU, \
*dstV = (uint16_t *)_dstV; \
rgb48ToUV_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
} \
\
static void pattern ## 48 ## BE_LE ## ToUV_half_c(uint8_t *_dstU, \
uint8_t *_dstV, \
const uint8_t *unused0, \
const uint8_t *_src1, \
const uint8_t *_src2, \
int width, \
uint32_t *rgb2yuv) \
{ \
const uint16_t *src1 = (const uint16_t *)_src1, \
*src2 = (const uint16_t *)_src2; \
uint16_t *dstU = (uint16_t *)_dstU, \
*dstV = (uint16_t *)_dstV; \
rgb48ToUV_half_c_template(dstU, dstV, src1, src2, width, origin, rgb2yuv); \
}
 
rgb48funcs(rgb, LE, AV_PIX_FMT_RGB48LE)
rgb48funcs(rgb, BE, AV_PIX_FMT_RGB48BE)
rgb48funcs(bgr, LE, AV_PIX_FMT_BGR48LE)
rgb48funcs(bgr, BE, AV_PIX_FMT_BGR48BE)
 
#define input_pixel(i) ((origin == AV_PIX_FMT_RGBA || \
origin == AV_PIX_FMT_BGRA || \
origin == AV_PIX_FMT_ARGB || \
origin == AV_PIX_FMT_ABGR) \
? AV_RN32A(&src[(i) * 4]) \
: (isBE(origin) ? AV_RB16(&src[(i) * 2]) \
: AV_RL16(&src[(i) * 2])))
 
static av_always_inline void rgb16_32ToY_c_template(int16_t *dst,
const uint8_t *src,
int width,
enum AVPixelFormat origin,
int shr, int shg,
int shb, int shp,
int maskr, int maskg,
int maskb, int rsh,
int gsh, int bsh, int S,
int32_t *rgb2yuv)
{
const int ry = rgb2yuv[RY_IDX]<<rsh, gy = rgb2yuv[GY_IDX]<<gsh, by = rgb2yuv[BY_IDX]<<bsh;
const unsigned rnd = (32<<((S)-1)) + (1<<(S-7));
int i;
 
for (i = 0; i < width; i++) {
int px = input_pixel(i) >> shp;
int b = (px & maskb) >> shb;
int g = (px & maskg) >> shg;
int r = (px & maskr) >> shr;
 
dst[i] = (ry * r + gy * g + by * b + rnd) >> ((S)-6);
}
}
 
static av_always_inline void rgb16_32ToUV_c_template(int16_t *dstU,
int16_t *dstV,
const uint8_t *src,
int width,
enum AVPixelFormat origin,
int shr, int shg,
int shb, int shp,
int maskr, int maskg,
int maskb, int rsh,
int gsh, int bsh, int S,
int32_t *rgb2yuv)
{
const int ru = rgb2yuv[RU_IDX] << rsh, gu = rgb2yuv[GU_IDX] << gsh, bu = rgb2yuv[BU_IDX] << bsh,
rv = rgb2yuv[RV_IDX] << rsh, gv = rgb2yuv[GV_IDX] << gsh, bv = rgb2yuv[BV_IDX] << bsh;
const unsigned rnd = (256u<<((S)-1)) + (1<<(S-7));
int i;
 
for (i = 0; i < width; i++) {
int px = input_pixel(i) >> shp;
int b = (px & maskb) >> shb;
int g = (px & maskg) >> shg;
int r = (px & maskr) >> shr;
 
dstU[i] = (ru * r + gu * g + bu * b + rnd) >> ((S)-6);
dstV[i] = (rv * r + gv * g + bv * b + rnd) >> ((S)-6);
}
}
 
static av_always_inline void rgb16_32ToUV_half_c_template(int16_t *dstU,
int16_t *dstV,
const uint8_t *src,
int width,
enum AVPixelFormat origin,
int shr, int shg,
int shb, int shp,
int maskr, int maskg,
int maskb, int rsh,
int gsh, int bsh, int S,
int32_t *rgb2yuv)
{
const int ru = rgb2yuv[RU_IDX] << rsh, gu = rgb2yuv[GU_IDX] << gsh, bu = rgb2yuv[BU_IDX] << bsh,
rv = rgb2yuv[RV_IDX] << rsh, gv = rgb2yuv[GV_IDX] << gsh, bv = rgb2yuv[BV_IDX] << bsh,
maskgx = ~(maskr | maskb);
const unsigned rnd = (256U<<(S)) + (1<<(S-6));
int i;
 
maskr |= maskr << 1;
maskb |= maskb << 1;
maskg |= maskg << 1;
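/* Two horizontally adjacent pixels are summed before conversion: maskgx
* extracts both green fields at once, rb = px0 + px1 - g keeps the summed
* red/blue fields, and the masks were widened above by one bit to hold sums. */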
for (i = 0; i < width; i++) {
unsigned px0 = input_pixel(2 * i + 0) >> shp;
unsigned px1 = input_pixel(2 * i + 1) >> shp;
int b, r, g = (px0 & maskgx) + (px1 & maskgx);
int rb = px0 + px1 - g;
 
b = (rb & maskb) >> shb;
if (shp ||
origin == AV_PIX_FMT_BGR565LE || origin == AV_PIX_FMT_BGR565BE ||
origin == AV_PIX_FMT_RGB565LE || origin == AV_PIX_FMT_RGB565BE) {
g >>= shg;
} else {
g = (g & maskg) >> shg;
}
r = (rb & maskr) >> shr;
 
dstU[i] = (ru * r + gu * g + bu * b + (unsigned)rnd) >> ((S)-6+1);
dstV[i] = (rv * r + gv * g + bv * b + (unsigned)rnd) >> ((S)-6+1);
}
}
 
#undef input_pixel
 
#define rgb16_32_wrapper(fmt, name, shr, shg, shb, shp, maskr, \
maskg, maskb, rsh, gsh, bsh, S) \
static void name ## ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, \
int width, uint32_t *tab) \
{ \
rgb16_32ToY_c_template((int16_t*)dst, src, width, fmt, shr, shg, shb, shp, \
maskr, maskg, maskb, rsh, gsh, bsh, S, tab); \
} \
\
static void name ## ToUV_c(uint8_t *dstU, uint8_t *dstV, \
const uint8_t *unused0, const uint8_t *src, const uint8_t *dummy, \
int width, uint32_t *tab) \
{ \
rgb16_32ToUV_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
shr, shg, shb, shp, \
maskr, maskg, maskb, rsh, gsh, bsh, S, tab);\
} \
\
static void name ## ToUV_half_c(uint8_t *dstU, uint8_t *dstV, \
const uint8_t *unused0, const uint8_t *src, \
const uint8_t *dummy, \
int width, uint32_t *tab) \
{ \
rgb16_32ToUV_half_c_template((int16_t*)dstU, (int16_t*)dstV, src, width, fmt, \
shr, shg, shb, shp, \
maskr, maskg, maskb, \
rsh, gsh, bsh, S, tab); \
}
 
rgb16_32_wrapper(AV_PIX_FMT_BGR32, bgr32, 16, 0, 0, 0, 0xFF0000, 0xFF00, 0x00FF, 8, 0, 8, RGB2YUV_SHIFT + 8)
rgb16_32_wrapper(AV_PIX_FMT_BGR32_1, bgr321, 16, 0, 0, 8, 0xFF0000, 0xFF00, 0x00FF, 8, 0, 8, RGB2YUV_SHIFT + 8)
rgb16_32_wrapper(AV_PIX_FMT_RGB32, rgb32, 0, 0, 16, 0, 0x00FF, 0xFF00, 0xFF0000, 8, 0, 8, RGB2YUV_SHIFT + 8)
rgb16_32_wrapper(AV_PIX_FMT_RGB32_1, rgb321, 0, 0, 16, 8, 0x00FF, 0xFF00, 0xFF0000, 8, 0, 8, RGB2YUV_SHIFT + 8)
rgb16_32_wrapper(AV_PIX_FMT_BGR565LE, bgr16le, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, 11, 5, 0, RGB2YUV_SHIFT + 8)
rgb16_32_wrapper(AV_PIX_FMT_BGR555LE, bgr15le, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, 10, 5, 0, RGB2YUV_SHIFT + 7)
rgb16_32_wrapper(AV_PIX_FMT_BGR444LE, bgr12le, 0, 0, 0, 0, 0x000F, 0x00F0, 0x0F00, 8, 4, 0, RGB2YUV_SHIFT + 4)
rgb16_32_wrapper(AV_PIX_FMT_RGB565LE, rgb16le, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, 0, 5, 11, RGB2YUV_SHIFT + 8)
rgb16_32_wrapper(AV_PIX_FMT_RGB555LE, rgb15le, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, 0, 5, 10, RGB2YUV_SHIFT + 7)
rgb16_32_wrapper(AV_PIX_FMT_RGB444LE, rgb12le, 0, 0, 0, 0, 0x0F00, 0x00F0, 0x000F, 0, 4, 8, RGB2YUV_SHIFT + 4)
rgb16_32_wrapper(AV_PIX_FMT_BGR565BE, bgr16be, 0, 0, 0, 0, 0x001F, 0x07E0, 0xF800, 11, 5, 0, RGB2YUV_SHIFT + 8)
rgb16_32_wrapper(AV_PIX_FMT_BGR555BE, bgr15be, 0, 0, 0, 0, 0x001F, 0x03E0, 0x7C00, 10, 5, 0, RGB2YUV_SHIFT + 7)
rgb16_32_wrapper(AV_PIX_FMT_BGR444BE, bgr12be, 0, 0, 0, 0, 0x000F, 0x00F0, 0x0F00, 8, 4, 0, RGB2YUV_SHIFT + 4)
rgb16_32_wrapper(AV_PIX_FMT_RGB565BE, rgb16be, 0, 0, 0, 0, 0xF800, 0x07E0, 0x001F, 0, 5, 11, RGB2YUV_SHIFT + 8)
rgb16_32_wrapper(AV_PIX_FMT_RGB555BE, rgb15be, 0, 0, 0, 0, 0x7C00, 0x03E0, 0x001F, 0, 5, 10, RGB2YUV_SHIFT + 7)
rgb16_32_wrapper(AV_PIX_FMT_RGB444BE, rgb12be, 0, 0, 0, 0, 0x0F00, 0x00F0, 0x000F, 0, 4, 8, RGB2YUV_SHIFT + 4)
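
/* Wrapper parameters: shp pre-shifts the loaded pixel (for the _1 alpha-first
* layouts), the mask/shift pairs isolate each component, and rsh/gsh/bsh scale
* the coefficients so every format reaches the same working precision S. */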
 
static void gbr24pToUV_half_c(uint8_t *_dstU, uint8_t *_dstV,
const uint8_t *gsrc, const uint8_t *bsrc, const uint8_t *rsrc,
int width, uint32_t *rgb2yuv)
{
uint16_t *dstU = (uint16_t *)_dstU;
uint16_t *dstV = (uint16_t *)_dstV;
int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
 
int i;
for (i = 0; i < width; i++) {
unsigned int g = gsrc[2*i] + gsrc[2*i+1];
unsigned int b = bsrc[2*i] + bsrc[2*i+1];
unsigned int r = rsrc[2*i] + rsrc[2*i+1];
 
dstU[i] = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
dstV[i] = (rv*r + gv*g + bv*b + (0x4001<<(RGB2YUV_SHIFT-6))) >> (RGB2YUV_SHIFT-6+1);
}
}
 
static void rgba64leToA_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
const uint8_t *unused2, int width, uint32_t *unused)
{
int16_t *dst = (int16_t *)_dst;
const uint16_t *src = (const uint16_t *)_src;
int i;
for (i = 0; i < width; i++)
dst[i] = AV_RL16(src + 4 * i + 3);
}
 
static void rgba64beToA_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1,
const uint8_t *unused2, int width, uint32_t *unused)
{
int16_t *dst = (int16_t *)_dst;
const uint16_t *src = (const uint16_t *)_src;
int i;
for (i = 0; i < width; i++)
dst[i] = AV_RB16(src + 4 * i + 3);
}
 
static void abgrToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
{
int16_t *dst = (int16_t *)_dst;
int i;
for (i=0; i<width; i++) {
dst[i]= src[4*i]<<6;
}
}
 
static void rgbaToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
{
int16_t *dst = (int16_t *)_dst;
int i;
for (i=0; i<width; i++) {
dst[i]= src[4*i+3]<<6;
}
}
 
static void palToA_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
{
int16_t *dst = (int16_t *)_dst;
int i;
for (i=0; i<width; i++) {
int d= src[i];
 
dst[i]= (pal[d] >> 24)<<6;
}
}
 
static void palToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *pal)
{
int16_t *dst = (int16_t *)_dst;
int i;
for (i = 0; i < width; i++) {
int d = src[i];
 
dst[i] = (pal[d] & 0xFF)<<6;
}
}
 
static void palToUV_c(uint8_t *_dstU, uint8_t *_dstV,
const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
int width, uint32_t *pal)
{
    int16_t *dstU = (int16_t *)_dstU;
    int16_t *dstV = (int16_t *)_dstV;
int i;
av_assert1(src1 == src2);
for (i = 0; i < width; i++) {
int p = pal[src1[i]];
 
dstU[i] = (uint8_t)(p>> 8)<<6;
dstV[i] = (uint8_t)(p>>16)<<6;
}
}
 
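/* 1 bpp sources: every input byte carries 8 pixels, MSB first; each bit
 * expands to 0 or 16383 in the internal luma range.  MONOWHITE encodes
 * white as 0, hence the ~src inversion. */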
static void monowhite2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
{
    int16_t *dst = (int16_t *)_dst;
    int i, j;
    int bytes = width >> 3; /* whole input bytes */
    int rem   = width & 7;  /* pixels in a trailing partial byte */
    for (i = 0; i < bytes; i++) {
        int d = ~src[i];
        for (j = 0; j < 8; j++)
            dst[8*i+j] = ((d>>(7-j))&1) * 16383;
    }
    if (rem) {
        int d = ~src[i];
        for (j = 0; j < rem; j++)
            dst[8*i+j] = ((d>>(7-j))&1) * 16383;
    }
}
 
static void monoblack2Y_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width, uint32_t *unused)
{
    int16_t *dst = (int16_t *)_dst;
    int i, j;
    int bytes = width >> 3; /* whole input bytes */
    int rem   = width & 7;  /* pixels in a trailing partial byte */
    for (i = 0; i < bytes; i++) {
        int d = src[i];
        for (j = 0; j < 8; j++)
            dst[8*i+j] = ((d>>(7-j))&1) * 16383;
    }
    if (rem) {
        int d = src[i];
        for (j = 0; j < rem; j++)
            dst[8*i+j] = ((d>>(7-j))&1) * 16383;
    }
}
 
static void yuy2ToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
uint32_t *unused)
{
int i;
for (i = 0; i < width; i++)
dst[i] = src[2 * i];
}
 
static void yuy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
const uint8_t *src2, int width, uint32_t *unused)
{
int i;
for (i = 0; i < width; i++) {
dstU[i] = src1[4 * i + 1];
dstV[i] = src1[4 * i + 3];
}
av_assert1(src1 == src2);
}
 
static void yvy2ToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
const uint8_t *src2, int width, uint32_t *unused)
{
int i;
for (i = 0; i < width; i++) {
dstV[i] = src1[4 * i + 1];
dstU[i] = src1[4 * i + 3];
}
av_assert1(src1 == src2);
}
 
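/* Endianness fixup for >8-bit planar sources: when the stored byte order
 * differs from the host's, planes are byte-swapped up front so the rest
 * of the scaler can assume native-endian samples (see the HAVE_BIGENDIAN
 * switches in ff_sws_init_input_funcs() below). */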
static void bswap16Y_c(uint8_t *_dst, const uint8_t *_src, const uint8_t *unused1, const uint8_t *unused2, int width,
uint32_t *unused)
{
int i;
const uint16_t *src = (const uint16_t *)_src;
uint16_t *dst = (uint16_t *)_dst;
for (i = 0; i < width; i++)
dst[i] = av_bswap16(src[i]);
}
 
static void bswap16UV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *_src1,
const uint8_t *_src2, int width, uint32_t *unused)
{
int i;
const uint16_t *src1 = (const uint16_t *)_src1,
*src2 = (const uint16_t *)_src2;
uint16_t *dstU = (uint16_t *)_dstU, *dstV = (uint16_t *)_dstV;
for (i = 0; i < width; i++) {
dstU[i] = av_bswap16(src1[i]);
dstV[i] = av_bswap16(src2[i]);
}
}
 
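/* YA16: interleaved 16-bit gray and alpha, 4 bytes per pixel; the gray
 * readers take the first word of each pair, the alpha readers the
 * second. */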
static void read_ya16le_gray_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
uint32_t *unused)
{
int i;
for (i = 0; i < width; i++)
AV_WN16(dst + i * 2, AV_RL16(src + i * 4));
}
 
static void read_ya16le_alpha_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
uint32_t *unused)
{
int i;
for (i = 0; i < width; i++)
AV_WN16(dst + i * 2, AV_RL16(src + i * 4 + 2));
}
 
static void read_ya16be_gray_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
uint32_t *unused)
{
int i;
for (i = 0; i < width; i++)
AV_WN16(dst + i * 2, AV_RB16(src + i * 4));
}
 
static void read_ya16be_alpha_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
uint32_t *unused)
{
int i;
for (i = 0; i < width; i++)
AV_WN16(dst + i * 2, AV_RB16(src + i * 4 + 2));
}
 
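/* AYUV64LE: one pixel is four little-endian 16-bit words in A, Y, U, V
 * order (8 bytes per pixel), hence the byte offsets 0, 2, 4 and 6. */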
static void read_ayuv64le_Y_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused0, const uint8_t *unused1, int width,
uint32_t *unused2)
{
int i;
for (i = 0; i < width; i++)
AV_WN16(dst + i * 2, AV_RL16(src + i * 8 + 2));
}
 
 
static void read_ayuv64le_UV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src,
const uint8_t *unused1, int width, uint32_t *unused2)
{
int i;
for (i = 0; i < width; i++) {
AV_WN16(dstU + i * 2, AV_RL16(src + i * 8 + 4));
AV_WN16(dstV + i * 2, AV_RL16(src + i * 8 + 6));
}
}
 
static void read_ayuv64le_A_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused0, const uint8_t *unused1, int width,
uint32_t *unused2)
{
int i;
for (i = 0; i < width; i++)
AV_WN16(dst + i * 2, AV_RL16(src + i * 8));
}
 
/* This is almost identical to the previous, and exists only because
 * yuy2ToY/UV(dst, src + 1, ...) would have 100% unaligned accesses. */
static void uyvyToY_c(uint8_t *dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
uint32_t *unused)
{
int i;
for (i = 0; i < width; i++)
dst[i] = src[2 * i + 1];
}
 
static void uyvyToUV_c(uint8_t *dstU, uint8_t *dstV, const uint8_t *unused0, const uint8_t *src1,
const uint8_t *src2, int width, uint32_t *unused)
{
int i;
for (i = 0; i < width; i++) {
dstU[i] = src1[4 * i + 0];
dstV[i] = src1[4 * i + 2];
}
av_assert1(src1 == src2);
}
 
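/* NV12/NV21 store interleaved chroma in a single plane; the two wrappers
 * below differ only in which destination receives the first byte of each
 * pair. */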
static av_always_inline void nvXXtoUV_c(uint8_t *dst1, uint8_t *dst2,
const uint8_t *src, int width)
{
int i;
for (i = 0; i < width; i++) {
dst1[i] = src[2 * i + 0];
dst2[i] = src[2 * i + 1];
}
}
 
static void nv12ToUV_c(uint8_t *dstU, uint8_t *dstV,
const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
int width, uint32_t *unused)
{
nvXXtoUV_c(dstU, dstV, src1, width);
}
 
static void nv21ToUV_c(uint8_t *dstU, uint8_t *dstV,
const uint8_t *unused0, const uint8_t *src1, const uint8_t *src2,
int width, uint32_t *unused)
{
nvXXtoUV_c(dstV, dstU, src1, width);
}
 
#define input_pixel(pos) (isBE(origin) ? AV_RB16(pos) : AV_RL16(pos))
 
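/* Packed 24-bit RGB/BGR readers.  The additive constants fold the YUV
 * offsets into the <<6 internal scale -- 32<<(RGB2YUV_SHIFT-1) shifts
 * down to 16<<6 for luma, 256<<(RGB2YUV_SHIFT-1) to 128<<6 for chroma --
 * plus half a unit of rounding from the 1<<(RGB2YUV_SHIFT-7) term. */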
static void bgr24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2,
int width, uint32_t *rgb2yuv)
{
int16_t *dst = (int16_t *)_dst;
int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
int i;
for (i = 0; i < width; i++) {
int b = src[i * 3 + 0];
int g = src[i * 3 + 1];
int r = src[i * 3 + 2];
 
dst[i] = ((ry*r + gy*g + by*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
}
}
 
static void bgr24ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
const uint8_t *src2, int width, uint32_t *rgb2yuv)
{
int16_t *dstU = (int16_t *)_dstU;
int16_t *dstV = (int16_t *)_dstV;
int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
int i;
for (i = 0; i < width; i++) {
int b = src1[3 * i + 0];
int g = src1[3 * i + 1];
int r = src1[3 * i + 2];
 
dstU[i] = (ru*r + gu*g + bu*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
dstV[i] = (rv*r + gv*g + bv*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
}
av_assert1(src1 == src2);
}
 
static void bgr24ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
const uint8_t *src2, int width, uint32_t *rgb2yuv)
{
int16_t *dstU = (int16_t *)_dstU;
int16_t *dstV = (int16_t *)_dstV;
int i;
int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
for (i = 0; i < width; i++) {
int b = src1[6 * i + 0] + src1[6 * i + 3];
int g = src1[6 * i + 1] + src1[6 * i + 4];
int r = src1[6 * i + 2] + src1[6 * i + 5];
 
dstU[i] = (ru*r + gu*g + bu*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
dstV[i] = (rv*r + gv*g + bv*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
}
av_assert1(src1 == src2);
}
 
static void rgb24ToY_c(uint8_t *_dst, const uint8_t *src, const uint8_t *unused1, const uint8_t *unused2, int width,
uint32_t *rgb2yuv)
{
int16_t *dst = (int16_t *)_dst;
int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
int i;
for (i = 0; i < width; i++) {
int r = src[i * 3 + 0];
int g = src[i * 3 + 1];
int b = src[i * 3 + 2];
 
dst[i] = ((ry*r + gy*g + by*b + (32<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6));
}
}
 
static void rgb24ToUV_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
const uint8_t *src2, int width, uint32_t *rgb2yuv)
{
int16_t *dstU = (int16_t *)_dstU;
int16_t *dstV = (int16_t *)_dstV;
int i;
int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
av_assert1(src1 == src2);
for (i = 0; i < width; i++) {
int r = src1[3 * i + 0];
int g = src1[3 * i + 1];
int b = src1[3 * i + 2];
 
dstU[i] = (ru*r + gu*g + bu*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
dstV[i] = (rv*r + gv*g + bv*b + (256<<(RGB2YUV_SHIFT-1)) + (1<<(RGB2YUV_SHIFT-7)))>>(RGB2YUV_SHIFT-6);
}
}
 
static void rgb24ToUV_half_c(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *unused0, const uint8_t *src1,
const uint8_t *src2, int width, uint32_t *rgb2yuv)
{
int16_t *dstU = (int16_t *)_dstU;
int16_t *dstV = (int16_t *)_dstV;
int i;
int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
av_assert1(src1 == src2);
for (i = 0; i < width; i++) {
int r = src1[6 * i + 0] + src1[6 * i + 3];
int g = src1[6 * i + 1] + src1[6 * i + 4];
int b = src1[6 * i + 2] + src1[6 * i + 5];
 
dstU[i] = (ru*r + gu*g + bu*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
dstV[i] = (rv*r + gv*g + bv*b + (256<<RGB2YUV_SHIFT) + (1<<(RGB2YUV_SHIFT-6)))>>(RGB2YUV_SHIFT-5);
}
}
 
static void planar_rgb_to_y(uint8_t *_dst, const uint8_t *src[4], int width, int32_t *rgb2yuv)
{
uint16_t *dst = (uint16_t *)_dst;
int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
int i;
for (i = 0; i < width; i++) {
int g = src[0][i];
int b = src[1][i];
int r = src[2][i];
 
dst[i] = (ry*r + gy*g + by*b + (0x801<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
}
}
 
static void planar_rgb_to_a(uint8_t *_dst, const uint8_t *src[4], int width, int32_t *unused)
{
uint16_t *dst = (uint16_t *)_dst;
int i;
for (i = 0; i < width; i++)
dst[i] = src[3][i] << 6;
}
 
static void planar_rgb_to_uv(uint8_t *_dstU, uint8_t *_dstV, const uint8_t *src[4], int width, int32_t *rgb2yuv)
{
uint16_t *dstU = (uint16_t *)_dstU;
uint16_t *dstV = (uint16_t *)_dstV;
int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
int i;
for (i = 0; i < width; i++) {
int g = src[0][i];
int b = src[1][i];
int r = src[2][i];
 
dstU[i] = (ru*r + gu*g + bu*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
dstV[i] = (rv*r + gv*g + bv*b + (0x4001<<(RGB2YUV_SHIFT-7))) >> (RGB2YUV_SHIFT-6);
}
}
 
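/* Deep (9-16 bit) planar RGB readers: rdpx() loads one sample honouring
 * the source endianness, and the bpc-derived shift renormalizes every
 * supported depth into the scaler's internal range. */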
#define rdpx(src) \
is_be ? AV_RB16(src) : AV_RL16(src)
static av_always_inline void planar_rgb16_to_y(uint8_t *_dst, const uint8_t *_src[4],
int width, int bpc, int is_be, int32_t *rgb2yuv)
{
int i;
const uint16_t **src = (const uint16_t **)_src;
uint16_t *dst = (uint16_t *)_dst;
int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
int shift = bpc < 16 ? bpc : 14;
for (i = 0; i < width; i++) {
int g = rdpx(src[0] + i);
int b = rdpx(src[1] + i);
int r = rdpx(src[2] + i);
 
dst[i] = ((ry*r + gy*g + by*b + (33 << (RGB2YUV_SHIFT + bpc - 9))) >> (RGB2YUV_SHIFT + shift - 14));
}
}
 
static av_always_inline void planar_rgb16_to_uv(uint8_t *_dstU, uint8_t *_dstV,
const uint8_t *_src[4], int width,
int bpc, int is_be, int32_t *rgb2yuv)
{
int i;
const uint16_t **src = (const uint16_t **)_src;
uint16_t *dstU = (uint16_t *)_dstU;
uint16_t *dstV = (uint16_t *)_dstV;
int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
int shift = bpc < 16 ? bpc : 14;
for (i = 0; i < width; i++) {
int g = rdpx(src[0] + i);
int b = rdpx(src[1] + i);
int r = rdpx(src[2] + i);
 
dstU[i] = (ru*r + gu*g + bu*b + (257 << (RGB2YUV_SHIFT + bpc - 9))) >> (RGB2YUV_SHIFT + shift - 14);
dstV[i] = (rv*r + gv*g + bv*b + (257 << (RGB2YUV_SHIFT + bpc - 9))) >> (RGB2YUV_SHIFT + shift - 14);
}
}
#undef rdpx
 
#define rgb9plus_planar_funcs_endian(nbits, endian_name, endian) \
static void planar_rgb##nbits##endian_name##_to_y(uint8_t *dst, const uint8_t *src[4], \
int w, int32_t *rgb2yuv) \
{ \
planar_rgb16_to_y(dst, src, w, nbits, endian, rgb2yuv); \
} \
static void planar_rgb##nbits##endian_name##_to_uv(uint8_t *dstU, uint8_t *dstV, \
const uint8_t *src[4], int w, int32_t *rgb2yuv) \
{ \
planar_rgb16_to_uv(dstU, dstV, src, w, nbits, endian, rgb2yuv); \
} \
 
#define rgb9plus_planar_funcs(nbits) \
rgb9plus_planar_funcs_endian(nbits, le, 0) \
rgb9plus_planar_funcs_endian(nbits, be, 1)
 
rgb9plus_planar_funcs(9)
rgb9plus_planar_funcs(10)
rgb9plus_planar_funcs(12)
rgb9plus_planar_funcs(14)
rgb9plus_planar_funcs(16)
 
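/* Per-format input dispatch: chrToYV12/lumToYV12/alpToYV12 unpack packed
 * formats (or byte-swap non-native deep YUV), the readChrPlanar/
 * readLumPlanar/readAlpPlanar hooks handle planar RGB, and the
 * chrSrcHSubSample branch selects the *_half readers that average
 * horizontal pixel pairs when chroma is subsampled. */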
av_cold void ff_sws_init_input_funcs(SwsContext *c)
{
enum AVPixelFormat srcFormat = c->srcFormat;
 
c->chrToYV12 = NULL;
switch (srcFormat) {
case AV_PIX_FMT_YUYV422:
c->chrToYV12 = yuy2ToUV_c;
break;
case AV_PIX_FMT_YVYU422:
c->chrToYV12 = yvy2ToUV_c;
break;
case AV_PIX_FMT_UYVY422:
c->chrToYV12 = uyvyToUV_c;
break;
case AV_PIX_FMT_NV12:
c->chrToYV12 = nv12ToUV_c;
break;
case AV_PIX_FMT_NV21:
c->chrToYV12 = nv21ToUV_c;
break;
case AV_PIX_FMT_RGB8:
case AV_PIX_FMT_BGR8:
case AV_PIX_FMT_PAL8:
case AV_PIX_FMT_BGR4_BYTE:
case AV_PIX_FMT_RGB4_BYTE:
c->chrToYV12 = palToUV_c;
break;
case AV_PIX_FMT_GBRP9LE:
c->readChrPlanar = planar_rgb9le_to_uv;
break;
case AV_PIX_FMT_GBRP10LE:
c->readChrPlanar = planar_rgb10le_to_uv;
break;
case AV_PIX_FMT_GBRP12LE:
c->readChrPlanar = planar_rgb12le_to_uv;
break;
case AV_PIX_FMT_GBRP14LE:
c->readChrPlanar = planar_rgb14le_to_uv;
break;
case AV_PIX_FMT_GBRAP16LE:
case AV_PIX_FMT_GBRP16LE:
c->readChrPlanar = planar_rgb16le_to_uv;
break;
case AV_PIX_FMT_GBRP9BE:
c->readChrPlanar = planar_rgb9be_to_uv;
break;
case AV_PIX_FMT_GBRP10BE:
c->readChrPlanar = planar_rgb10be_to_uv;
break;
case AV_PIX_FMT_GBRP12BE:
c->readChrPlanar = planar_rgb12be_to_uv;
break;
case AV_PIX_FMT_GBRP14BE:
c->readChrPlanar = planar_rgb14be_to_uv;
break;
case AV_PIX_FMT_GBRAP16BE:
case AV_PIX_FMT_GBRP16BE:
c->readChrPlanar = planar_rgb16be_to_uv;
break;
case AV_PIX_FMT_GBRAP:
case AV_PIX_FMT_GBRP:
c->readChrPlanar = planar_rgb_to_uv;
break;
#if HAVE_BIGENDIAN
case AV_PIX_FMT_YUV444P9LE:
case AV_PIX_FMT_YUV422P9LE:
case AV_PIX_FMT_YUV420P9LE:
case AV_PIX_FMT_YUV422P10LE:
case AV_PIX_FMT_YUV440P10LE:
case AV_PIX_FMT_YUV444P10LE:
case AV_PIX_FMT_YUV420P10LE:
case AV_PIX_FMT_YUV422P12LE:
case AV_PIX_FMT_YUV440P12LE:
case AV_PIX_FMT_YUV444P12LE:
case AV_PIX_FMT_YUV420P12LE:
case AV_PIX_FMT_YUV422P14LE:
case AV_PIX_FMT_YUV444P14LE:
case AV_PIX_FMT_YUV420P14LE:
case AV_PIX_FMT_YUV420P16LE:
case AV_PIX_FMT_YUV422P16LE:
case AV_PIX_FMT_YUV444P16LE:
 
case AV_PIX_FMT_YUVA444P9LE:
case AV_PIX_FMT_YUVA422P9LE:
case AV_PIX_FMT_YUVA420P9LE:
case AV_PIX_FMT_YUVA444P10LE:
case AV_PIX_FMT_YUVA422P10LE:
case AV_PIX_FMT_YUVA420P10LE:
case AV_PIX_FMT_YUVA420P16LE:
case AV_PIX_FMT_YUVA422P16LE:
case AV_PIX_FMT_YUVA444P16LE:
c->chrToYV12 = bswap16UV_c;
break;
#else
case AV_PIX_FMT_YUV444P9BE:
case AV_PIX_FMT_YUV422P9BE:
case AV_PIX_FMT_YUV420P9BE:
case AV_PIX_FMT_YUV440P10BE:
case AV_PIX_FMT_YUV444P10BE:
case AV_PIX_FMT_YUV422P10BE:
case AV_PIX_FMT_YUV420P10BE:
case AV_PIX_FMT_YUV440P12BE:
case AV_PIX_FMT_YUV444P12BE:
case AV_PIX_FMT_YUV422P12BE:
case AV_PIX_FMT_YUV420P12BE:
case AV_PIX_FMT_YUV444P14BE:
case AV_PIX_FMT_YUV422P14BE:
case AV_PIX_FMT_YUV420P14BE:
case AV_PIX_FMT_YUV420P16BE:
case AV_PIX_FMT_YUV422P16BE:
case AV_PIX_FMT_YUV444P16BE:
 
case AV_PIX_FMT_YUVA444P9BE:
case AV_PIX_FMT_YUVA422P9BE:
case AV_PIX_FMT_YUVA420P9BE:
case AV_PIX_FMT_YUVA444P10BE:
case AV_PIX_FMT_YUVA422P10BE:
case AV_PIX_FMT_YUVA420P10BE:
case AV_PIX_FMT_YUVA420P16BE:
case AV_PIX_FMT_YUVA422P16BE:
case AV_PIX_FMT_YUVA444P16BE:
c->chrToYV12 = bswap16UV_c;
break;
#endif
case AV_PIX_FMT_AYUV64LE:
c->chrToYV12 = read_ayuv64le_UV_c;
break;
}
if (c->chrSrcHSubSample) {
switch (srcFormat) {
case AV_PIX_FMT_RGBA64BE:
c->chrToYV12 = rgb64BEToUV_half_c;
break;
case AV_PIX_FMT_RGBA64LE:
c->chrToYV12 = rgb64LEToUV_half_c;
break;
case AV_PIX_FMT_BGRA64BE:
c->chrToYV12 = bgr64BEToUV_half_c;
break;
case AV_PIX_FMT_BGRA64LE:
c->chrToYV12 = bgr64LEToUV_half_c;
break;
case AV_PIX_FMT_RGB48BE:
c->chrToYV12 = rgb48BEToUV_half_c;
break;
case AV_PIX_FMT_RGB48LE:
c->chrToYV12 = rgb48LEToUV_half_c;
break;
case AV_PIX_FMT_BGR48BE:
c->chrToYV12 = bgr48BEToUV_half_c;
break;
case AV_PIX_FMT_BGR48LE:
c->chrToYV12 = bgr48LEToUV_half_c;
break;
case AV_PIX_FMT_RGB32:
c->chrToYV12 = bgr32ToUV_half_c;
break;
case AV_PIX_FMT_RGB32_1:
c->chrToYV12 = bgr321ToUV_half_c;
break;
case AV_PIX_FMT_BGR24:
c->chrToYV12 = bgr24ToUV_half_c;
break;
case AV_PIX_FMT_BGR565LE:
c->chrToYV12 = bgr16leToUV_half_c;
break;
case AV_PIX_FMT_BGR565BE:
c->chrToYV12 = bgr16beToUV_half_c;
break;
case AV_PIX_FMT_BGR555LE:
c->chrToYV12 = bgr15leToUV_half_c;
break;
case AV_PIX_FMT_BGR555BE:
c->chrToYV12 = bgr15beToUV_half_c;
break;
case AV_PIX_FMT_GBRAP:
case AV_PIX_FMT_GBRP:
c->chrToYV12 = gbr24pToUV_half_c;
break;
case AV_PIX_FMT_BGR444LE:
c->chrToYV12 = bgr12leToUV_half_c;
break;
case AV_PIX_FMT_BGR444BE:
c->chrToYV12 = bgr12beToUV_half_c;
break;
case AV_PIX_FMT_BGR32:
c->chrToYV12 = rgb32ToUV_half_c;
break;
case AV_PIX_FMT_BGR32_1:
c->chrToYV12 = rgb321ToUV_half_c;
break;
case AV_PIX_FMT_RGB24:
c->chrToYV12 = rgb24ToUV_half_c;
break;
case AV_PIX_FMT_RGB565LE:
c->chrToYV12 = rgb16leToUV_half_c;
break;
case AV_PIX_FMT_RGB565BE:
c->chrToYV12 = rgb16beToUV_half_c;
break;
case AV_PIX_FMT_RGB555LE:
c->chrToYV12 = rgb15leToUV_half_c;
break;
case AV_PIX_FMT_RGB555BE:
c->chrToYV12 = rgb15beToUV_half_c;
break;
case AV_PIX_FMT_RGB444LE:
c->chrToYV12 = rgb12leToUV_half_c;
break;
case AV_PIX_FMT_RGB444BE:
c->chrToYV12 = rgb12beToUV_half_c;
break;
}
} else {
switch (srcFormat) {
case AV_PIX_FMT_RGBA64BE:
c->chrToYV12 = rgb64BEToUV_c;
break;
case AV_PIX_FMT_RGBA64LE:
c->chrToYV12 = rgb64LEToUV_c;
break;
case AV_PIX_FMT_BGRA64BE:
c->chrToYV12 = bgr64BEToUV_c;
break;
case AV_PIX_FMT_BGRA64LE:
c->chrToYV12 = bgr64LEToUV_c;
break;
case AV_PIX_FMT_RGB48BE:
c->chrToYV12 = rgb48BEToUV_c;
break;
case AV_PIX_FMT_RGB48LE:
c->chrToYV12 = rgb48LEToUV_c;
break;
case AV_PIX_FMT_BGR48BE:
c->chrToYV12 = bgr48BEToUV_c;
break;
case AV_PIX_FMT_BGR48LE:
c->chrToYV12 = bgr48LEToUV_c;
break;
case AV_PIX_FMT_RGB32:
c->chrToYV12 = bgr32ToUV_c;
break;
case AV_PIX_FMT_RGB32_1:
c->chrToYV12 = bgr321ToUV_c;
break;
case AV_PIX_FMT_BGR24:
c->chrToYV12 = bgr24ToUV_c;
break;
case AV_PIX_FMT_BGR565LE:
c->chrToYV12 = bgr16leToUV_c;
break;
case AV_PIX_FMT_BGR565BE:
c->chrToYV12 = bgr16beToUV_c;
break;
case AV_PIX_FMT_BGR555LE:
c->chrToYV12 = bgr15leToUV_c;
break;
case AV_PIX_FMT_BGR555BE:
c->chrToYV12 = bgr15beToUV_c;
break;
case AV_PIX_FMT_BGR444LE:
c->chrToYV12 = bgr12leToUV_c;
break;
case AV_PIX_FMT_BGR444BE:
c->chrToYV12 = bgr12beToUV_c;
break;
case AV_PIX_FMT_BGR32:
c->chrToYV12 = rgb32ToUV_c;
break;
case AV_PIX_FMT_BGR32_1:
c->chrToYV12 = rgb321ToUV_c;
break;
case AV_PIX_FMT_RGB24:
c->chrToYV12 = rgb24ToUV_c;
break;
case AV_PIX_FMT_RGB565LE:
c->chrToYV12 = rgb16leToUV_c;
break;
case AV_PIX_FMT_RGB565BE:
c->chrToYV12 = rgb16beToUV_c;
break;
case AV_PIX_FMT_RGB555LE:
c->chrToYV12 = rgb15leToUV_c;
break;
case AV_PIX_FMT_RGB555BE:
c->chrToYV12 = rgb15beToUV_c;
break;
case AV_PIX_FMT_RGB444LE:
c->chrToYV12 = rgb12leToUV_c;
break;
case AV_PIX_FMT_RGB444BE:
c->chrToYV12 = rgb12beToUV_c;
break;
}
}
 
c->lumToYV12 = NULL;
c->alpToYV12 = NULL;
switch (srcFormat) {
case AV_PIX_FMT_GBRP9LE:
c->readLumPlanar = planar_rgb9le_to_y;
break;
case AV_PIX_FMT_GBRP10LE:
c->readLumPlanar = planar_rgb10le_to_y;
break;
case AV_PIX_FMT_GBRP12LE:
c->readLumPlanar = planar_rgb12le_to_y;
break;
case AV_PIX_FMT_GBRP14LE:
c->readLumPlanar = planar_rgb14le_to_y;
break;
case AV_PIX_FMT_GBRAP16LE:
case AV_PIX_FMT_GBRP16LE:
c->readLumPlanar = planar_rgb16le_to_y;
break;
case AV_PIX_FMT_GBRP9BE:
c->readLumPlanar = planar_rgb9be_to_y;
break;
case AV_PIX_FMT_GBRP10BE:
c->readLumPlanar = planar_rgb10be_to_y;
break;
case AV_PIX_FMT_GBRP12BE:
c->readLumPlanar = planar_rgb12be_to_y;
break;
case AV_PIX_FMT_GBRP14BE:
c->readLumPlanar = planar_rgb14be_to_y;
break;
case AV_PIX_FMT_GBRAP16BE:
case AV_PIX_FMT_GBRP16BE:
c->readLumPlanar = planar_rgb16be_to_y;
break;
case AV_PIX_FMT_GBRAP:
c->readAlpPlanar = planar_rgb_to_a;
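    /* fall through: GBRAP also uses the GBRP luma reader */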
case AV_PIX_FMT_GBRP:
c->readLumPlanar = planar_rgb_to_y;
break;
#if HAVE_BIGENDIAN
case AV_PIX_FMT_YUV444P9LE:
case AV_PIX_FMT_YUV422P9LE:
case AV_PIX_FMT_YUV420P9LE:
case AV_PIX_FMT_YUV444P10LE:
case AV_PIX_FMT_YUV440P10LE:
case AV_PIX_FMT_YUV422P10LE:
case AV_PIX_FMT_YUV420P10LE:
case AV_PIX_FMT_YUV444P12LE:
case AV_PIX_FMT_YUV440P12LE:
case AV_PIX_FMT_YUV422P12LE:
case AV_PIX_FMT_YUV420P12LE:
case AV_PIX_FMT_YUV444P14LE:
case AV_PIX_FMT_YUV422P14LE:
case AV_PIX_FMT_YUV420P14LE:
case AV_PIX_FMT_YUV420P16LE:
case AV_PIX_FMT_YUV422P16LE:
case AV_PIX_FMT_YUV444P16LE:
 
case AV_PIX_FMT_GRAY16LE:
c->lumToYV12 = bswap16Y_c;
break;
case AV_PIX_FMT_YUVA444P9LE:
case AV_PIX_FMT_YUVA422P9LE:
case AV_PIX_FMT_YUVA420P9LE:
case AV_PIX_FMT_YUVA444P10LE:
case AV_PIX_FMT_YUVA422P10LE:
case AV_PIX_FMT_YUVA420P10LE:
case AV_PIX_FMT_YUVA420P16LE:
case AV_PIX_FMT_YUVA422P16LE:
case AV_PIX_FMT_YUVA444P16LE:
c->lumToYV12 = bswap16Y_c;
c->alpToYV12 = bswap16Y_c;
break;
#else
case AV_PIX_FMT_YUV444P9BE:
case AV_PIX_FMT_YUV422P9BE:
case AV_PIX_FMT_YUV420P9BE:
case AV_PIX_FMT_YUV444P10BE:
case AV_PIX_FMT_YUV440P10BE:
case AV_PIX_FMT_YUV422P10BE:
case AV_PIX_FMT_YUV420P10BE:
case AV_PIX_FMT_YUV444P12BE:
case AV_PIX_FMT_YUV440P12BE:
case AV_PIX_FMT_YUV422P12BE:
case AV_PIX_FMT_YUV420P12BE:
case AV_PIX_FMT_YUV444P14BE:
case AV_PIX_FMT_YUV422P14BE:
case AV_PIX_FMT_YUV420P14BE:
case AV_PIX_FMT_YUV420P16BE:
case AV_PIX_FMT_YUV422P16BE:
case AV_PIX_FMT_YUV444P16BE:
 
case AV_PIX_FMT_GRAY16BE:
c->lumToYV12 = bswap16Y_c;
break;
case AV_PIX_FMT_YUVA444P9BE:
case AV_PIX_FMT_YUVA422P9BE:
case AV_PIX_FMT_YUVA420P9BE:
case AV_PIX_FMT_YUVA444P10BE:
case AV_PIX_FMT_YUVA422P10BE:
case AV_PIX_FMT_YUVA420P10BE:
case AV_PIX_FMT_YUVA420P16BE:
case AV_PIX_FMT_YUVA422P16BE:
case AV_PIX_FMT_YUVA444P16BE:
c->lumToYV12 = bswap16Y_c;
c->alpToYV12 = bswap16Y_c;
break;
#endif
case AV_PIX_FMT_YA16LE:
c->lumToYV12 = read_ya16le_gray_c;
break;
case AV_PIX_FMT_YA16BE:
c->lumToYV12 = read_ya16be_gray_c;
break;
case AV_PIX_FMT_AYUV64LE:
c->lumToYV12 = read_ayuv64le_Y_c;
break;
case AV_PIX_FMT_YUYV422:
case AV_PIX_FMT_YVYU422:
case AV_PIX_FMT_YA8:
c->lumToYV12 = yuy2ToY_c;
break;
case AV_PIX_FMT_UYVY422:
c->lumToYV12 = uyvyToY_c;
break;
case AV_PIX_FMT_BGR24:
c->lumToYV12 = bgr24ToY_c;
break;
case AV_PIX_FMT_BGR565LE:
c->lumToYV12 = bgr16leToY_c;
break;
case AV_PIX_FMT_BGR565BE:
c->lumToYV12 = bgr16beToY_c;
break;
case AV_PIX_FMT_BGR555LE:
c->lumToYV12 = bgr15leToY_c;
break;
case AV_PIX_FMT_BGR555BE:
c->lumToYV12 = bgr15beToY_c;
break;
case AV_PIX_FMT_BGR444LE:
c->lumToYV12 = bgr12leToY_c;
break;
case AV_PIX_FMT_BGR444BE:
c->lumToYV12 = bgr12beToY_c;
break;
case AV_PIX_FMT_RGB24:
c->lumToYV12 = rgb24ToY_c;
break;
case AV_PIX_FMT_RGB565LE:
c->lumToYV12 = rgb16leToY_c;
break;
case AV_PIX_FMT_RGB565BE:
c->lumToYV12 = rgb16beToY_c;
break;
case AV_PIX_FMT_RGB555LE:
c->lumToYV12 = rgb15leToY_c;
break;
case AV_PIX_FMT_RGB555BE:
c->lumToYV12 = rgb15beToY_c;
break;
case AV_PIX_FMT_RGB444LE:
c->lumToYV12 = rgb12leToY_c;
break;
case AV_PIX_FMT_RGB444BE:
c->lumToYV12 = rgb12beToY_c;
break;
case AV_PIX_FMT_RGB8:
case AV_PIX_FMT_BGR8:
case AV_PIX_FMT_PAL8:
case AV_PIX_FMT_BGR4_BYTE:
case AV_PIX_FMT_RGB4_BYTE:
c->lumToYV12 = palToY_c;
break;
case AV_PIX_FMT_MONOBLACK:
c->lumToYV12 = monoblack2Y_c;
break;
case AV_PIX_FMT_MONOWHITE:
c->lumToYV12 = monowhite2Y_c;
break;
case AV_PIX_FMT_RGB32:
c->lumToYV12 = bgr32ToY_c;
break;
case AV_PIX_FMT_RGB32_1:
c->lumToYV12 = bgr321ToY_c;
break;
case AV_PIX_FMT_BGR32:
c->lumToYV12 = rgb32ToY_c;
break;
case AV_PIX_FMT_BGR32_1:
c->lumToYV12 = rgb321ToY_c;
break;
case AV_PIX_FMT_RGB48BE:
c->lumToYV12 = rgb48BEToY_c;
break;
case AV_PIX_FMT_RGB48LE:
c->lumToYV12 = rgb48LEToY_c;
break;
case AV_PIX_FMT_BGR48BE:
c->lumToYV12 = bgr48BEToY_c;
break;
case AV_PIX_FMT_BGR48LE:
c->lumToYV12 = bgr48LEToY_c;
break;
case AV_PIX_FMT_RGBA64BE:
c->lumToYV12 = rgb64BEToY_c;
break;
case AV_PIX_FMT_RGBA64LE:
c->lumToYV12 = rgb64LEToY_c;
break;
case AV_PIX_FMT_BGRA64BE:
c->lumToYV12 = bgr64BEToY_c;
break;
case AV_PIX_FMT_BGRA64LE:
c->lumToYV12 = bgr64LEToY_c;
}
if (c->alpPixBuf) {
if (is16BPS(srcFormat) || isNBPS(srcFormat)) {
if (HAVE_BIGENDIAN == !isBE(srcFormat))
c->alpToYV12 = bswap16Y_c;
}
switch (srcFormat) {
case AV_PIX_FMT_BGRA64LE:
case AV_PIX_FMT_RGBA64LE: c->alpToYV12 = rgba64leToA_c; break;
case AV_PIX_FMT_BGRA64BE:
case AV_PIX_FMT_RGBA64BE: c->alpToYV12 = rgba64beToA_c; break;
case AV_PIX_FMT_BGRA:
case AV_PIX_FMT_RGBA:
c->alpToYV12 = rgbaToA_c;
break;
case AV_PIX_FMT_ABGR:
case AV_PIX_FMT_ARGB:
c->alpToYV12 = abgrToA_c;
break;
case AV_PIX_FMT_YA8:
c->alpToYV12 = uyvyToY_c;
break;
case AV_PIX_FMT_YA16LE:
c->alpToYV12 = read_ya16le_alpha_c;
break;
case AV_PIX_FMT_YA16BE:
c->alpToYV12 = read_ya16be_alpha_c;
break;
case AV_PIX_FMT_AYUV64LE:
c->alpToYV12 = read_ayuv64le_A_c;
break;
    case AV_PIX_FMT_PAL8:
c->alpToYV12 = palToA_c;
break;
}
}
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/libswscale.pc
0,0 → 1,14
prefix=/usr/local
exec_prefix=${prefix}
libdir=${prefix}/lib
includedir=${prefix}/include
 
Name: libswscale
Description: FFmpeg image rescaling library
Version: 3.1.101
Requires:
Requires.private: libavutil >= 54.31.100
Conflicts:
Libs: -L${libdir} -lswscale
Libs.private: -lm
Cflags: -I${includedir}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/libswscale.v
0,0 → 1,4
LIBSWSCALE_$MAJOR {
global: swscale_*; sws_*;
local: *;
};
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/libswscale.ver
0,0 → 1,5
LIBSWSCALE_3 {
global: DllStartup;
swscale_*; sws_*;
local: *;
};
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/log2_tab.c
0,0 → 1,0
#include "libavutil/log2_tab.c"
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/options.c
0,0 → 1,100
/*
* Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/avutil.h"
#include "libavutil/opt.h"
#include "libavutil/pixfmt.h"
#include "swscale.h"
#include "swscale_internal.h"
 
static const char *sws_context_to_name(void *ptr)
{
return "swscaler";
}
 
#define OFFSET(x) offsetof(SwsContext, x)
#define DEFAULT 0
#define VE AV_OPT_FLAG_VIDEO_PARAM | AV_OPT_FLAG_ENCODING_PARAM
 
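/* AVOption table: entries whose last column names a unit (e.g.
 * "sws_flags", "sws_dither") are named constants selectable for the
 * option of the same unit, so the scaler names below map onto SWS_*
 * flag values. */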
static const AVOption swscale_options[] = {
{ "sws_flags", "scaler flags", OFFSET(flags), AV_OPT_TYPE_FLAGS, { .i64 = SWS_BICUBIC }, 0, UINT_MAX, VE, "sws_flags" },
{ "fast_bilinear", "fast bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FAST_BILINEAR }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bilinear", "bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BILINEAR }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bicubic", "bicubic", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BICUBIC }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "experimental", "experimental", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_X }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "neighbor", "nearest neighbor", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_POINT }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "area", "averaging area", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_AREA }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bicublin", "luma bicubic, chroma bilinear", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BICUBLIN }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "gauss", "gaussian", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_GAUSS }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "sinc", "sinc", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_SINC }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "lanczos", "lanczos", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_LANCZOS }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "spline", "natural bicubic spline", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_SPLINE }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "print_info", "print info", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_PRINT_INFO }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "accurate_rnd", "accurate rounding", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ACCURATE_RND }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "full_chroma_int", "full chroma interpolation", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FULL_CHR_H_INT }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "full_chroma_inp", "full chroma input", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_FULL_CHR_H_INP }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "bitexact", "", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_BITEXACT }, INT_MIN, INT_MAX, VE, "sws_flags" },
{ "error_diffusion", "error diffusion dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ERROR_DIFFUSION}, INT_MIN, INT_MAX, VE, "sws_flags" },
 
{ "srcw", "source width", OFFSET(srcW), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "srch", "source height", OFFSET(srcH), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "dstw", "destination width", OFFSET(dstW), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "dsth", "destination height", OFFSET(dstH), AV_OPT_TYPE_INT, { .i64 = 16 }, 1, INT_MAX, VE },
{ "src_format", "source format", OFFSET(srcFormat), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, AV_PIX_FMT_NB - 1, VE },
{ "dst_format", "destination format", OFFSET(dstFormat), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, AV_PIX_FMT_NB - 1, VE },
{ "src_range", "source range", OFFSET(srcRange), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, 1, VE },
{ "dst_range", "destination range", OFFSET(dstRange), AV_OPT_TYPE_INT, { .i64 = DEFAULT }, 0, 1, VE },
{ "param0", "scaler param 0", OFFSET(param[0]), AV_OPT_TYPE_DOUBLE, { .dbl = SWS_PARAM_DEFAULT }, INT_MIN, INT_MAX, VE },
{ "param1", "scaler param 1", OFFSET(param[1]), AV_OPT_TYPE_DOUBLE, { .dbl = SWS_PARAM_DEFAULT }, INT_MIN, INT_MAX, VE },
 
{ "src_v_chr_pos", "source vertical chroma position in luma grid/256" , OFFSET(src_v_chr_pos), AV_OPT_TYPE_INT, { .i64 = -513 }, -513, 512, VE },
{ "src_h_chr_pos", "source horizontal chroma position in luma grid/256", OFFSET(src_h_chr_pos), AV_OPT_TYPE_INT, { .i64 = -513 }, -513, 512, VE },
{ "dst_v_chr_pos", "destination vertical chroma position in luma grid/256" , OFFSET(dst_v_chr_pos), AV_OPT_TYPE_INT, { .i64 = -513 }, -513, 512, VE },
{ "dst_h_chr_pos", "destination horizontal chroma position in luma grid/256", OFFSET(dst_h_chr_pos), AV_OPT_TYPE_INT, { .i64 = -513 }, -513, 512, VE },
 
{ "sws_dither", "set dithering algorithm", OFFSET(dither), AV_OPT_TYPE_INT, { .i64 = SWS_DITHER_AUTO }, 0, NB_SWS_DITHER, VE, "sws_dither" },
{ "auto", "leave choice to sws", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_AUTO }, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "bayer", "bayer dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_BAYER }, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "ed", "error diffusion", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_ED }, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "a_dither", "arithmetic addition dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_A_DITHER}, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "x_dither", "arithmetic xor dither", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_DITHER_X_DITHER}, INT_MIN, INT_MAX, VE, "sws_dither" },
{ "gamma", "gamma correct scaling", OFFSET(gamma_flag), AV_OPT_TYPE_INT, { .i64 = 0 }, 0, INT_MAX, VE, "gamma" },
{ "true", "enable", 0, AV_OPT_TYPE_CONST, { .i64 = 1 }, INT_MIN, INT_MAX, VE, "gamma" },
{ "false", "disable", 0, AV_OPT_TYPE_CONST, { .i64 = 0 }, INT_MIN, INT_MAX, VE, "gamma" },
{ "alphablend", "mode for alpha -> non alpha", OFFSET(alphablend),AV_OPT_TYPE_INT, { .i64 = SWS_ALPHA_BLEND_NONE}, 0, SWS_ALPHA_BLEND_NB-1, VE, "alphablend" },
{ "none", "ignore alpha", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ALPHA_BLEND_NONE}, INT_MIN, INT_MAX, VE, "alphablend" },
{ "uniform_color", "blend onto a uniform color", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ALPHA_BLEND_UNIFORM},INT_MIN, INT_MAX, VE, "alphablend" },
{ "checkerboard", "blend onto a checkerboard", 0, AV_OPT_TYPE_CONST, { .i64 = SWS_ALPHA_BLEND_CHECKERBOARD},INT_MIN, INT_MAX, VE, "alphablend" },
 
{ NULL }
};
 
const AVClass sws_context_class = {
.class_name = "SWScaler",
.item_name = sws_context_to_name,
.option = swscale_options,
.category = AV_CLASS_CATEGORY_SWSCALER,
.version = LIBAVUTIL_VERSION_INT,
};
 
const AVClass *sws_get_class(void)
{
return &sws_context_class;
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/output.c
0,0 → 1,2513
/*
* Copyright (C) 2001-2012 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <math.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
 
#include "libavutil/attributes.h"
#include "libavutil/avutil.h"
#include "libavutil/avassert.h"
#include "libavutil/bswap.h"
#include "libavutil/cpu.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/mathematics.h"
#include "libavutil/pixdesc.h"
#include "config.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
 
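/* Ordered-dither threshold matrices.  Each pattern is repeated to 8
 * columns, and the first row is duplicated at the end, presumably so two
 * consecutive rows can be read without wrapping. */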
DECLARE_ALIGNED(8, const uint8_t, ff_dither_2x2_4)[][8] = {
{ 1, 3, 1, 3, 1, 3, 1, 3, },
{ 2, 0, 2, 0, 2, 0, 2, 0, },
{ 1, 3, 1, 3, 1, 3, 1, 3, },
};
 
DECLARE_ALIGNED(8, const uint8_t, ff_dither_2x2_8)[][8] = {
{ 6, 2, 6, 2, 6, 2, 6, 2, },
{ 0, 4, 0, 4, 0, 4, 0, 4, },
{ 6, 2, 6, 2, 6, 2, 6, 2, },
};
 
DECLARE_ALIGNED(8, const uint8_t, ff_dither_4x4_16)[][8] = {
{ 8, 4, 11, 7, 8, 4, 11, 7, },
{ 2, 14, 1, 13, 2, 14, 1, 13, },
{ 10, 6, 9, 5, 10, 6, 9, 5, },
{ 0, 12, 3, 15, 0, 12, 3, 15, },
{ 8, 4, 11, 7, 8, 4, 11, 7, },
};
 
DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_32)[][8] = {
{ 17, 9, 23, 15, 16, 8, 22, 14, },
{ 5, 29, 3, 27, 4, 28, 2, 26, },
{ 21, 13, 19, 11, 20, 12, 18, 10, },
{ 0, 24, 6, 30, 1, 25, 7, 31, },
{ 16, 8, 22, 14, 17, 9, 23, 15, },
{ 4, 28, 2, 26, 5, 29, 3, 27, },
{ 20, 12, 18, 10, 21, 13, 19, 11, },
{ 1, 25, 7, 31, 0, 24, 6, 30, },
{ 17, 9, 23, 15, 16, 8, 22, 14, },
};
 
DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_73)[][8] = {
{ 0, 55, 14, 68, 3, 58, 17, 72, },
{ 37, 18, 50, 32, 40, 22, 54, 35, },
{ 9, 64, 5, 59, 13, 67, 8, 63, },
{ 46, 27, 41, 23, 49, 31, 44, 26, },
{ 2, 57, 16, 71, 1, 56, 15, 70, },
{ 39, 21, 52, 34, 38, 19, 51, 33, },
{ 11, 66, 7, 62, 10, 65, 6, 60, },
{ 48, 30, 43, 25, 47, 29, 42, 24, },
{ 0, 55, 14, 68, 3, 58, 17, 72, },
};
 
#if 1
DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_220)[][8] = {
{117, 62, 158, 103, 113, 58, 155, 100, },
{ 34, 199, 21, 186, 31, 196, 17, 182, },
{144, 89, 131, 76, 141, 86, 127, 72, },
{ 0, 165, 41, 206, 10, 175, 52, 217, },
{110, 55, 151, 96, 120, 65, 162, 107, },
{ 28, 193, 14, 179, 38, 203, 24, 189, },
{138, 83, 124, 69, 148, 93, 134, 79, },
{ 7, 172, 48, 213, 3, 168, 45, 210, },
{117, 62, 158, 103, 113, 58, 155, 100, },
};
#elif 1
// tries to correct a gamma of 1.5
DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_220)[][8] = {
{ 0, 143, 18, 200, 2, 156, 25, 215, },
{ 78, 28, 125, 64, 89, 36, 138, 74, },
{ 10, 180, 3, 161, 16, 195, 8, 175, },
{109, 51, 93, 38, 121, 60, 105, 47, },
{ 1, 152, 23, 210, 0, 147, 20, 205, },
{ 85, 33, 134, 71, 81, 30, 130, 67, },
{ 14, 190, 6, 171, 12, 185, 5, 166, },
{117, 57, 101, 44, 113, 54, 97, 41, },
{ 0, 143, 18, 200, 2, 156, 25, 215, },
};
#elif 1
// tries to correct a gamma of 2.0
DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_220)[][8] = {
{ 0, 124, 8, 193, 0, 140, 12, 213, },
{ 55, 14, 104, 42, 66, 19, 119, 52, },
{ 3, 168, 1, 145, 6, 187, 3, 162, },
{ 86, 31, 70, 21, 99, 39, 82, 28, },
{ 0, 134, 11, 206, 0, 129, 9, 200, },
{ 62, 17, 114, 48, 58, 16, 109, 45, },
{ 5, 181, 2, 157, 4, 175, 1, 151, },
{ 95, 36, 78, 26, 90, 34, 74, 24, },
{ 0, 124, 8, 193, 0, 140, 12, 213, },
};
#else
// tries to correct a gamma of 2.5
DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_220)[][8] = {
{ 0, 107, 3, 187, 0, 125, 6, 212, },
{ 39, 7, 86, 28, 49, 11, 102, 36, },
{ 1, 158, 0, 131, 3, 180, 1, 151, },
{ 68, 19, 52, 12, 81, 25, 64, 17, },
{ 0, 119, 5, 203, 0, 113, 4, 195, },
{ 45, 9, 96, 33, 42, 8, 91, 30, },
{ 2, 172, 1, 144, 2, 165, 0, 137, },
{ 77, 23, 60, 15, 72, 21, 56, 14, },
{ 0, 107, 3, 187, 0, 125, 6, 212, },
};
#endif
 
#define output_pixel(pos, val, bias, signedness) \
if (big_endian) { \
AV_WB16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
} else { \
AV_WL16(pos, bias + av_clip_ ## signedness ## 16(val >> shift)); \
}
 
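/* High-depth planar output.  The *_16 templates below consume 32-bit
 * intermediates, while the *_10 templates cover the 9-14 bit depths from
 * 16-bit intermediates; output_bits selects the final shift. */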
static av_always_inline void
yuv2plane1_16_c_template(const int32_t *src, uint16_t *dest, int dstW,
int big_endian, int output_bits)
{
int i;
int shift = 3;
av_assert0(output_bits == 16);
 
for (i = 0; i < dstW; i++) {
int val = src[i] + (1 << (shift - 1));
output_pixel(&dest[i], val, 0, uint);
}
}
 
static av_always_inline void
yuv2planeX_16_c_template(const int16_t *filter, int filterSize,
const int32_t **src, uint16_t *dest, int dstW,
int big_endian, int output_bits)
{
int i;
int shift = 15;
av_assert0(output_bits == 16);
 
for (i = 0; i < dstW; i++) {
int val = 1 << (shift - 1);
int j;
 
        /* The range of val is [0,0x7FFFFFFF], i.e. 31 bits, but with
         * lanczos/spline filters (or anything with negative coeffs) the
         * range can be slightly wider in both directions. To account for
         * this overflow, we subtract a constant so it always fits in the
         * signed range (assuming a reasonable filterSize), and re-add it
         * at the end. */
val -= 0x40000000;
for (j = 0; j < filterSize; j++)
val += src[j][i] * (unsigned)filter[j];
 
output_pixel(&dest[i], val, 0x8000, int);
}
}
 
#undef output_pixel
 
#define output_pixel(pos, val) \
if (big_endian) { \
AV_WB16(pos, av_clip_uintp2(val >> shift, output_bits)); \
} else { \
AV_WL16(pos, av_clip_uintp2(val >> shift, output_bits)); \
}
 
static av_always_inline void
yuv2plane1_10_c_template(const int16_t *src, uint16_t *dest, int dstW,
int big_endian, int output_bits)
{
int i;
int shift = 15 - output_bits;
 
for (i = 0; i < dstW; i++) {
int val = src[i] + (1 << (shift - 1));
output_pixel(&dest[i], val);
}
}
 
static av_always_inline void
yuv2planeX_10_c_template(const int16_t *filter, int filterSize,
const int16_t **src, uint16_t *dest, int dstW,
int big_endian, int output_bits)
{
int i;
int shift = 11 + 16 - output_bits;
 
for (i = 0; i < dstW; i++) {
int val = 1 << (shift - 1);
int j;
 
for (j = 0; j < filterSize; j++)
val += src[j][i] * filter[j];
 
output_pixel(&dest[i], val);
}
}
 
#undef output_pixel
 
#define yuv2NBPS(bits, BE_LE, is_be, template_size, typeX_t) \
static void yuv2plane1_ ## bits ## BE_LE ## _c(const int16_t *src, \
uint8_t *dest, int dstW, \
const uint8_t *dither, int offset)\
{ \
yuv2plane1_ ## template_size ## _c_template((const typeX_t *) src, \
(uint16_t *) dest, dstW, is_be, bits); \
}\
static void yuv2planeX_ ## bits ## BE_LE ## _c(const int16_t *filter, int filterSize, \
const int16_t **src, uint8_t *dest, int dstW, \
const uint8_t *dither, int offset)\
{ \
yuv2planeX_## template_size ## _c_template(filter, \
filterSize, (const typeX_t **) src, \
(uint16_t *) dest, dstW, is_be, bits); \
}
yuv2NBPS( 9, BE, 1, 10, int16_t)
yuv2NBPS( 9, LE, 0, 10, int16_t)
yuv2NBPS(10, BE, 1, 10, int16_t)
yuv2NBPS(10, LE, 0, 10, int16_t)
yuv2NBPS(12, BE, 1, 10, int16_t)
yuv2NBPS(12, LE, 0, 10, int16_t)
yuv2NBPS(14, BE, 1, 10, int16_t)
yuv2NBPS(14, LE, 0, 10, int16_t)
yuv2NBPS(16, BE, 1, 16, int32_t)
yuv2NBPS(16, LE, 0, 16, int32_t)
 
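/* 8-bit planar output: the per-position dither byte is pre-shifted by 12
 * so that, relative to the final >>19, it acts as a fractional rounding
 * offset rather than plain truncation. */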
static void yuv2planeX_8_c(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
{
int i;
for (i=0; i<dstW; i++) {
int val = dither[(i + offset) & 7] << 12;
int j;
for (j=0; j<filterSize; j++)
val += src[j][i] * filter[j];
 
dest[i]= av_clip_uint8(val>>19);
}
}
 
static void yuv2plane1_8_c(const int16_t *src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
{
int i;
for (i=0; i<dstW; i++) {
int val = (src[i] + dither[(i + offset) & 7]) >> 7;
dest[i]= av_clip_uint8(val);
}
}
 
static void yuv2nv12cX_c(SwsContext *c, const int16_t *chrFilter, int chrFilterSize,
const int16_t **chrUSrc, const int16_t **chrVSrc,
uint8_t *dest, int chrDstW)
{
enum AVPixelFormat dstFormat = c->dstFormat;
const uint8_t *chrDither = c->chrDither8;
int i;
 
if (dstFormat == AV_PIX_FMT_NV12)
for (i=0; i<chrDstW; i++) {
int u = chrDither[i & 7] << 12;
int v = chrDither[(i + 3) & 7] << 12;
int j;
for (j=0; j<chrFilterSize; j++) {
u += chrUSrc[j][i] * chrFilter[j];
v += chrVSrc[j][i] * chrFilter[j];
}
 
dest[2*i]= av_clip_uint8(u>>19);
dest[2*i+1]= av_clip_uint8(v>>19);
}
else
for (i=0; i<chrDstW; i++) {
int u = chrDither[i & 7] << 12;
int v = chrDither[(i + 3) & 7] << 12;
int j;
for (j=0; j<chrFilterSize; j++) {
u += chrUSrc[j][i] * chrFilter[j];
v += chrVSrc[j][i] * chrFilter[j];
}
 
dest[2*i]= av_clip_uint8(v>>19);
dest[2*i+1]= av_clip_uint8(u>>19);
}
}
 
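/* Monochrome (1 bpp) output.  Bits are accumulated MSB-first and flushed
 * every 8th pixel.  SWS_DITHER_ED distributes the quantization error with
 * Floyd-Steinberg-like weights (7,1,5,3 over 16); all other modes use the
 * ordered ff_dither_8x8_220 matrix. */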
#define accumulate_bit(acc, val) \
acc <<= 1; \
acc |= (val) >= (128 + 110)
#define output_pixel(pos, acc) \
if (target == AV_PIX_FMT_MONOBLACK) { \
pos = acc; \
} else { \
pos = ~acc; \
}
 
static av_always_inline void
yuv2mono_X_c_template(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest, int dstW,
int y, enum AVPixelFormat target)
{
const uint8_t * const d128 = ff_dither_8x8_220[y&7];
int i;
unsigned acc = 0;
int err = 0;
 
for (i = 0; i < dstW; i += 2) {
int j;
int Y1 = 1 << 18;
int Y2 = 1 << 18;
 
for (j = 0; j < lumFilterSize; j++) {
Y1 += lumSrc[j][i] * lumFilter[j];
Y2 += lumSrc[j][i+1] * lumFilter[j];
}
Y1 >>= 19;
Y2 >>= 19;
if ((Y1 | Y2) & 0x100) {
Y1 = av_clip_uint8(Y1);
Y2 = av_clip_uint8(Y2);
}
if (c->dither == SWS_DITHER_ED) {
Y1 += (7*err + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2] + 8 - 256)>>4;
c->dither_error[0][i] = err;
acc = 2*acc + (Y1 >= 128);
Y1 -= 220*(acc&1);
 
err = Y2 + ((7*Y1 + 1*c->dither_error[0][i+1] + 5*c->dither_error[0][i+2] + 3*c->dither_error[0][i+3] + 8 - 256)>>4);
c->dither_error[0][i+1] = Y1;
acc = 2*acc + (err >= 128);
err -= 220*(acc&1);
} else {
accumulate_bit(acc, Y1 + d128[(i + 0) & 7]);
accumulate_bit(acc, Y2 + d128[(i + 1) & 7]);
}
if ((i & 7) == 6) {
output_pixel(*dest++, acc);
}
}
c->dither_error[0][i] = err;
 
if (i & 6) {
output_pixel(*dest, acc);
}
}
 
static av_always_inline void
yuv2mono_2_c_template(SwsContext *c, const int16_t *buf[2],
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf[2], uint8_t *dest, int dstW,
int yalpha, int uvalpha, int y,
enum AVPixelFormat target)
{
const int16_t *buf0 = buf[0], *buf1 = buf[1];
const uint8_t * const d128 = ff_dither_8x8_220[y & 7];
int yalpha1 = 4096 - yalpha;
int i;
 
if (c->dither == SWS_DITHER_ED) {
int err = 0;
int acc = 0;
for (i = 0; i < dstW; i +=2) {
int Y;
 
Y = (buf0[i + 0] * yalpha1 + buf1[i + 0] * yalpha) >> 19;
Y += (7*err + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2] + 8 - 256)>>4;
c->dither_error[0][i] = err;
acc = 2*acc + (Y >= 128);
Y -= 220*(acc&1);
 
err = (buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19;
err += (7*Y + 1*c->dither_error[0][i+1] + 5*c->dither_error[0][i+2] + 3*c->dither_error[0][i+3] + 8 - 256)>>4;
c->dither_error[0][i+1] = Y;
acc = 2*acc + (err >= 128);
err -= 220*(acc&1);
 
if ((i & 7) == 6)
output_pixel(*dest++, acc);
}
c->dither_error[0][i] = err;
} else {
for (i = 0; i < dstW; i += 8) {
int Y, acc = 0;
 
Y = (buf0[i + 0] * yalpha1 + buf1[i + 0] * yalpha) >> 19;
accumulate_bit(acc, Y + d128[0]);
Y = (buf0[i + 1] * yalpha1 + buf1[i + 1] * yalpha) >> 19;
accumulate_bit(acc, Y + d128[1]);
Y = (buf0[i + 2] * yalpha1 + buf1[i + 2] * yalpha) >> 19;
accumulate_bit(acc, Y + d128[2]);
Y = (buf0[i + 3] * yalpha1 + buf1[i + 3] * yalpha) >> 19;
accumulate_bit(acc, Y + d128[3]);
Y = (buf0[i + 4] * yalpha1 + buf1[i + 4] * yalpha) >> 19;
accumulate_bit(acc, Y + d128[4]);
Y = (buf0[i + 5] * yalpha1 + buf1[i + 5] * yalpha) >> 19;
accumulate_bit(acc, Y + d128[5]);
Y = (buf0[i + 6] * yalpha1 + buf1[i + 6] * yalpha) >> 19;
accumulate_bit(acc, Y + d128[6]);
Y = (buf0[i + 7] * yalpha1 + buf1[i + 7] * yalpha) >> 19;
accumulate_bit(acc, Y + d128[7]);
 
output_pixel(*dest++, acc);
}
}
}
 
static av_always_inline void
yuv2mono_1_c_template(SwsContext *c, const int16_t *buf0,
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf0, uint8_t *dest, int dstW,
int uvalpha, int y, enum AVPixelFormat target)
{
const uint8_t * const d128 = ff_dither_8x8_220[y & 7];
int i;
 
if (c->dither == SWS_DITHER_ED) {
int err = 0;
int acc = 0;
for (i = 0; i < dstW; i +=2) {
int Y;
 
Y = ((buf0[i + 0] + 64) >> 7);
Y += (7*err + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2] + 8 - 256)>>4;
c->dither_error[0][i] = err;
acc = 2*acc + (Y >= 128);
Y -= 220*(acc&1);
 
err = ((buf0[i + 1] + 64) >> 7);
err += (7*Y + 1*c->dither_error[0][i+1] + 5*c->dither_error[0][i+2] + 3*c->dither_error[0][i+3] + 8 - 256)>>4;
c->dither_error[0][i+1] = Y;
acc = 2*acc + (err >= 128);
err -= 220*(acc&1);
 
if ((i & 7) == 6)
output_pixel(*dest++, acc);
}
c->dither_error[0][i] = err;
} else {
for (i = 0; i < dstW; i += 8) {
int acc = 0;
accumulate_bit(acc, ((buf0[i + 0] + 64) >> 7) + d128[0]);
accumulate_bit(acc, ((buf0[i + 1] + 64) >> 7) + d128[1]);
accumulate_bit(acc, ((buf0[i + 2] + 64) >> 7) + d128[2]);
accumulate_bit(acc, ((buf0[i + 3] + 64) >> 7) + d128[3]);
accumulate_bit(acc, ((buf0[i + 4] + 64) >> 7) + d128[4]);
accumulate_bit(acc, ((buf0[i + 5] + 64) >> 7) + d128[5]);
accumulate_bit(acc, ((buf0[i + 6] + 64) >> 7) + d128[6]);
accumulate_bit(acc, ((buf0[i + 7] + 64) >> 7) + d128[7]);
 
output_pixel(*dest++, acc);
}
}
}
 
#undef output_pixel
#undef accumulate_bit
 
#define YUV2PACKEDWRAPPER(name, base, ext, fmt) \
static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
const int16_t **lumSrc, int lumFilterSize, \
const int16_t *chrFilter, const int16_t **chrUSrc, \
const int16_t **chrVSrc, int chrFilterSize, \
const int16_t **alpSrc, uint8_t *dest, int dstW, \
int y) \
{ \
name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
alpSrc, dest, dstW, y, fmt); \
} \
\
static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
const int16_t *ubuf[2], const int16_t *vbuf[2], \
const int16_t *abuf[2], uint8_t *dest, int dstW, \
int yalpha, int uvalpha, int y) \
{ \
name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
dest, dstW, yalpha, uvalpha, y, fmt); \
} \
\
static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
const int16_t *ubuf[2], const int16_t *vbuf[2], \
const int16_t *abuf0, uint8_t *dest, int dstW, \
int uvalpha, int y) \
{ \
name ## base ## _1_c_template(c, buf0, ubuf, vbuf, \
abuf0, dest, dstW, uvalpha, \
y, fmt); \
}
 
YUV2PACKEDWRAPPER(yuv2mono,, white, AV_PIX_FMT_MONOWHITE)
YUV2PACKEDWRAPPER(yuv2mono,, black, AV_PIX_FMT_MONOBLACK)
 
#define output_pixels(pos, Y1, U, Y2, V) \
if (target == AV_PIX_FMT_YUYV422) { \
dest[pos + 0] = Y1; \
dest[pos + 1] = U; \
dest[pos + 2] = Y2; \
dest[pos + 3] = V; \
} else if (target == AV_PIX_FMT_YVYU422) { \
dest[pos + 0] = Y1; \
dest[pos + 1] = V; \
dest[pos + 2] = Y2; \
dest[pos + 3] = U; \
} else { /* AV_PIX_FMT_UYVY422 */ \
dest[pos + 0] = U; \
dest[pos + 1] = Y1; \
dest[pos + 2] = V; \
dest[pos + 3] = Y2; \
}
 
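/* Packed 4:2:2 output: two luma samples share one U/V pair in every
 * 4-byte group; output_pixels() above arranges the bytes for YUYV, YVYU
 * and UYVY. */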
static av_always_inline void
yuv2422_X_c_template(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest, int dstW,
int y, enum AVPixelFormat target)
{
int i;
 
for (i = 0; i < ((dstW + 1) >> 1); i++) {
int j;
int Y1 = 1 << 18;
int Y2 = 1 << 18;
int U = 1 << 18;
int V = 1 << 18;
 
for (j = 0; j < lumFilterSize; j++) {
Y1 += lumSrc[j][i * 2] * lumFilter[j];
Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
}
for (j = 0; j < chrFilterSize; j++) {
U += chrUSrc[j][i] * chrFilter[j];
V += chrVSrc[j][i] * chrFilter[j];
}
Y1 >>= 19;
Y2 >>= 19;
U >>= 19;
V >>= 19;
if ((Y1 | Y2 | U | V) & 0x100) {
Y1 = av_clip_uint8(Y1);
Y2 = av_clip_uint8(Y2);
U = av_clip_uint8(U);
V = av_clip_uint8(V);
}
output_pixels(4*i, Y1, U, Y2, V);
}
}
 
static av_always_inline void
yuv2422_2_c_template(SwsContext *c, const int16_t *buf[2],
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf[2], uint8_t *dest, int dstW,
int yalpha, int uvalpha, int y,
enum AVPixelFormat target)
{
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
*vbuf0 = vbuf[0], *vbuf1 = vbuf[1];
int yalpha1 = 4096 - yalpha;
int uvalpha1 = 4096 - uvalpha;
int i;
 
for (i = 0; i < ((dstW + 1) >> 1); i++) {
int Y1 = (buf0[i * 2] * yalpha1 + buf1[i * 2] * yalpha) >> 19;
int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 19;
int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha) >> 19;
int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha) >> 19;
 
if ((Y1 | Y2 | U | V) & 0x100) {
Y1 = av_clip_uint8(Y1);
Y2 = av_clip_uint8(Y2);
U = av_clip_uint8(U);
V = av_clip_uint8(V);
}
 
output_pixels(i * 4, Y1, U, Y2, V);
}
}
 
static av_always_inline void
yuv2422_1_c_template(SwsContext *c, const int16_t *buf0,
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf0, uint8_t *dest, int dstW,
int uvalpha, int y, enum AVPixelFormat target)
{
const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
int i;
 
if (uvalpha < 2048) {
for (i = 0; i < ((dstW + 1) >> 1); i++) {
int Y1 = (buf0[i * 2 ]+64) >> 7;
int Y2 = (buf0[i * 2 + 1]+64) >> 7;
int U = (ubuf0[i] +64) >> 7;
int V = (vbuf0[i] +64) >> 7;
 
            if ((Y1 | Y2 | U | V) & 0x100) {
                Y1 = av_clip_uint8(Y1);
                Y2 = av_clip_uint8(Y2);
                U  = av_clip_uint8(U);
                V  = av_clip_uint8(V);
            }

output_pixels(i * 4, Y1, U, Y2, V);
}
} else {
const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
for (i = 0; i < ((dstW + 1) >> 1); i++) {
int Y1 = (buf0[i * 2 ] + 64) >> 7;
int Y2 = (buf0[i * 2 + 1] + 64) >> 7;
int U = (ubuf0[i] + ubuf1[i]+128) >> 8;
int V = (vbuf0[i] + vbuf1[i]+128) >> 8;
 
            if ((Y1 | Y2 | U | V) & 0x100) {
                Y1 = av_clip_uint8(Y1);
                Y2 = av_clip_uint8(Y2);
                U  = av_clip_uint8(U);
                V  = av_clip_uint8(V);
            }

output_pixels(i * 4, Y1, U, Y2, V);
}
}
}
 
#undef output_pixels
 
YUV2PACKEDWRAPPER(yuv2, 422, yuyv422, AV_PIX_FMT_YUYV422)
YUV2PACKEDWRAPPER(yuv2, 422, yvyu422, AV_PIX_FMT_YVYU422)
YUV2PACKEDWRAPPER(yuv2, 422, uyvy422, AV_PIX_FMT_UYVY422)
 
#define R_B ((target == AV_PIX_FMT_RGB48LE || target == AV_PIX_FMT_RGB48BE || target == AV_PIX_FMT_RGBA64LE || target == AV_PIX_FMT_RGBA64BE) ? R : B)
#define B_R ((target == AV_PIX_FMT_RGB48LE || target == AV_PIX_FMT_RGB48BE || target == AV_PIX_FMT_RGBA64LE || target == AV_PIX_FMT_RGBA64BE) ? B : R)
#define output_pixel(pos, val) \
if (isBE(target)) { \
AV_WB16(pos, val); \
} else { \
AV_WL16(pos, val); \
}
 
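/* 48/64-bit RGB output from 32-bit intermediates.  R_B/B_R pick the red
 * or blue tap depending on whether the target is RGB- or BGR-ordered,
 * and 'eightbytes' selects whether an alpha word is interleaved
 * (RGBA64/BGRA64) or not (RGB48/BGR48). */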
static av_always_inline void
yuv2rgba64_X_c_template(SwsContext *c, const int16_t *lumFilter,
const int32_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int32_t **chrUSrc,
const int32_t **chrVSrc, int chrFilterSize,
const int32_t **alpSrc, uint16_t *dest, int dstW,
int y, enum AVPixelFormat target, int hasAlpha, int eightbytes)
{
int i;
int A1 = 0xffff<<14, A2 = 0xffff<<14;
 
for (i = 0; i < ((dstW + 1) >> 1); i++) {
int j;
int Y1 = -0x40000000;
int Y2 = -0x40000000;
int U = -128 << 23; // 19
int V = -128 << 23;
int R, G, B;
 
for (j = 0; j < lumFilterSize; j++) {
Y1 += lumSrc[j][i * 2] * (unsigned)lumFilter[j];
Y2 += lumSrc[j][i * 2 + 1] * (unsigned)lumFilter[j];
}
        for (j = 0; j < chrFilterSize; j++) {
U += chrUSrc[j][i] * (unsigned)chrFilter[j];
V += chrVSrc[j][i] * (unsigned)chrFilter[j];
}
 
if (hasAlpha) {
A1 = -0x40000000;
A2 = -0x40000000;
for (j = 0; j < lumFilterSize; j++) {
A1 += alpSrc[j][i * 2] * (unsigned)lumFilter[j];
A2 += alpSrc[j][i * 2 + 1] * (unsigned)lumFilter[j];
}
A1 >>= 1;
A1 += 0x20002000;
A2 >>= 1;
A2 += 0x20002000;
}
 
// 8bit: 12+15=27; 16-bit: 12+19=31
Y1 >>= 14; // 10
Y1 += 0x10000;
Y2 >>= 14;
Y2 += 0x10000;
U >>= 14;
V >>= 14;
 
// 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
Y1 -= c->yuv2rgb_y_offset;
Y2 -= c->yuv2rgb_y_offset;
Y1 *= c->yuv2rgb_y_coeff;
Y2 *= c->yuv2rgb_y_coeff;
Y1 += 1 << 13; // 21
Y2 += 1 << 13;
// 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
 
R = V * c->yuv2rgb_v2r_coeff;
G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
B = U * c->yuv2rgb_u2b_coeff;
 
// 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
if (eightbytes) {
output_pixel(&dest[3], av_clip_uintp2(A1 , 30) >> 14);
output_pixel(&dest[4], av_clip_uintp2(R_B + Y2, 30) >> 14);
output_pixel(&dest[5], av_clip_uintp2( G + Y2, 30) >> 14);
output_pixel(&dest[6], av_clip_uintp2(B_R + Y2, 30) >> 14);
output_pixel(&dest[7], av_clip_uintp2(A2 , 30) >> 14);
dest += 8;
} else {
output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
dest += 6;
}
}
}
 
static av_always_inline void
yuv2rgba64_2_c_template(SwsContext *c, const int32_t *buf[2],
const int32_t *ubuf[2], const int32_t *vbuf[2],
const int32_t *abuf[2], uint16_t *dest, int dstW,
int yalpha, int uvalpha, int y,
enum AVPixelFormat target, int hasAlpha, int eightbytes)
{
const int32_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
*vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
*abuf0 = hasAlpha ? abuf[0] : NULL,
*abuf1 = hasAlpha ? abuf[1] : NULL;
int yalpha1 = 4096 - yalpha;
int uvalpha1 = 4096 - uvalpha;
int i;
int A1 = 0xffff<<14, A2 = 0xffff<<14;
 
for (i = 0; i < ((dstW + 1) >> 1); i++) {
int Y1 = (buf0[i * 2] * yalpha1 + buf1[i * 2] * yalpha) >> 14;
int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 14;
int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha + (-128 << 23)) >> 14;
int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha + (-128 << 23)) >> 14;
int R, G, B;
 
Y1 -= c->yuv2rgb_y_offset;
Y2 -= c->yuv2rgb_y_offset;
Y1 *= c->yuv2rgb_y_coeff;
Y2 *= c->yuv2rgb_y_coeff;
Y1 += 1 << 13;
Y2 += 1 << 13;
 
R = V * c->yuv2rgb_v2r_coeff;
G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
B = U * c->yuv2rgb_u2b_coeff;
 
if (hasAlpha) {
A1 = (abuf0[i * 2 ] * yalpha1 + abuf1[i * 2 ] * yalpha) >> 1;
A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 1;
 
A1 += 1 << 13;
A2 += 1 << 13;
}
 
output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
if (eightbytes) {
output_pixel(&dest[3], av_clip_uintp2(A1 , 30) >> 14);
output_pixel(&dest[4], av_clip_uintp2(R_B + Y2, 30) >> 14);
output_pixel(&dest[5], av_clip_uintp2( G + Y2, 30) >> 14);
output_pixel(&dest[6], av_clip_uintp2(B_R + Y2, 30) >> 14);
output_pixel(&dest[7], av_clip_uintp2(A2 , 30) >> 14);
dest += 8;
} else {
output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
dest += 6;
}
}
}
 
static av_always_inline void
yuv2rgba64_1_c_template(SwsContext *c, const int32_t *buf0,
const int32_t *ubuf[2], const int32_t *vbuf[2],
const int32_t *abuf0, uint16_t *dest, int dstW,
int uvalpha, int y, enum AVPixelFormat target, int hasAlpha, int eightbytes)
{
const int32_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
int i;
int A1 = 0xffff<<14, A2= 0xffff<<14;
 
if (uvalpha < 2048) {
for (i = 0; i < ((dstW + 1) >> 1); i++) {
int Y1 = (buf0[i * 2] ) >> 2;
int Y2 = (buf0[i * 2 + 1]) >> 2;
int U = (ubuf0[i] + (-128 << 11)) >> 2;
int V = (vbuf0[i] + (-128 << 11)) >> 2;
int R, G, B;
 
Y1 -= c->yuv2rgb_y_offset;
Y2 -= c->yuv2rgb_y_offset;
Y1 *= c->yuv2rgb_y_coeff;
Y2 *= c->yuv2rgb_y_coeff;
Y1 += 1 << 13;
Y2 += 1 << 13;
 
if (hasAlpha) {
A1 = abuf0[i * 2 ] << 11;
A2 = abuf0[i * 2 + 1] << 11;
 
A1 += 1 << 13;
A2 += 1 << 13;
}
 
R = V * c->yuv2rgb_v2r_coeff;
G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
B = U * c->yuv2rgb_u2b_coeff;
 
output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
if (eightbytes) {
output_pixel(&dest[3], av_clip_uintp2(A1 , 30) >> 14);
output_pixel(&dest[4], av_clip_uintp2(R_B + Y2, 30) >> 14);
output_pixel(&dest[5], av_clip_uintp2( G + Y2, 30) >> 14);
output_pixel(&dest[6], av_clip_uintp2(B_R + Y2, 30) >> 14);
output_pixel(&dest[7], av_clip_uintp2(A2 , 30) >> 14);
dest += 8;
} else {
output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
dest += 6;
}
}
} else {
const int32_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
int A1 = 0xffff<<14, A2 = 0xffff<<14;
for (i = 0; i < ((dstW + 1) >> 1); i++) {
int Y1 = (buf0[i * 2] ) >> 2;
int Y2 = (buf0[i * 2 + 1]) >> 2;
int U = (ubuf0[i] + ubuf1[i] + (-128 << 12)) >> 3;
int V = (vbuf0[i] + vbuf1[i] + (-128 << 12)) >> 3;
int R, G, B;
 
Y1 -= c->yuv2rgb_y_offset;
Y2 -= c->yuv2rgb_y_offset;
Y1 *= c->yuv2rgb_y_coeff;
Y2 *= c->yuv2rgb_y_coeff;
Y1 += 1 << 13;
Y2 += 1 << 13;
 
if (hasAlpha) {
A1 = abuf0[i * 2 ] << 11;
A2 = abuf0[i * 2 + 1] << 11;
 
A1 += 1 << 13;
A2 += 1 << 13;
}
 
R = V * c->yuv2rgb_v2r_coeff;
G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
B = U * c->yuv2rgb_u2b_coeff;
 
output_pixel(&dest[0], av_clip_uintp2(R_B + Y1, 30) >> 14);
output_pixel(&dest[1], av_clip_uintp2( G + Y1, 30) >> 14);
output_pixel(&dest[2], av_clip_uintp2(B_R + Y1, 30) >> 14);
if (eightbytes) {
output_pixel(&dest[3], av_clip_uintp2(A1 , 30) >> 14);
output_pixel(&dest[4], av_clip_uintp2(R_B + Y2, 30) >> 14);
output_pixel(&dest[5], av_clip_uintp2( G + Y2, 30) >> 14);
output_pixel(&dest[6], av_clip_uintp2(B_R + Y2, 30) >> 14);
output_pixel(&dest[7], av_clip_uintp2(A2 , 30) >> 14);
dest += 8;
} else {
output_pixel(&dest[3], av_clip_uintp2(R_B + Y2, 30) >> 14);
output_pixel(&dest[4], av_clip_uintp2( G + Y2, 30) >> 14);
output_pixel(&dest[5], av_clip_uintp2(B_R + Y2, 30) >> 14);
dest += 6;
}
}
}
}
 
static av_always_inline void
yuv2rgba64_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
const int32_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int32_t **chrUSrc,
const int32_t **chrVSrc, int chrFilterSize,
const int32_t **alpSrc, uint16_t *dest, int dstW,
int y, enum AVPixelFormat target, int hasAlpha, int eightbytes)
{
int i;
int A = 0xffff<<14;
 
for (i = 0; i < dstW; i++) {
int j;
int Y = -0x40000000;
int U = -128 << 23; // 19
int V = -128 << 23;
int R, G, B;
 
for (j = 0; j < lumFilterSize; j++) {
Y += lumSrc[j][i] * (unsigned)lumFilter[j];
}
for (j = 0; j < chrFilterSize; j++) {
U += chrUSrc[j][i] * (unsigned)chrFilter[j];
V += chrVSrc[j][i] * (unsigned)chrFilter[j];
}
 
if (hasAlpha) {
A = -0x40000000;
for (j = 0; j < lumFilterSize; j++) {
A += alpSrc[j][i] * (unsigned)lumFilter[j];
}
A >>= 1;
A += 0x20002000;
}
 
// 8bit: 12+15=27; 16-bit: 12+19=31
Y >>= 14; // 10
Y += 0x10000;
U >>= 14;
V >>= 14;
 
// 8bit: 27 -> 17bit, 16bit: 31 - 14 = 17bit
Y -= c->yuv2rgb_y_offset;
Y *= c->yuv2rgb_y_coeff;
Y += 1 << 13; // 21
// 8bit: 17 + 13bit = 30bit, 16bit: 17 + 13bit = 30bit
 
R = V * c->yuv2rgb_v2r_coeff;
G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
B = U * c->yuv2rgb_u2b_coeff;
 
// 8bit: 30 - 22 = 8bit, 16bit: 30bit - 14 = 16bit
output_pixel(&dest[0], av_clip_uintp2(R_B + Y, 30) >> 14);
output_pixel(&dest[1], av_clip_uintp2( G + Y, 30) >> 14);
output_pixel(&dest[2], av_clip_uintp2(B_R + Y, 30) >> 14);
if (eightbytes) {
output_pixel(&dest[3], av_clip_uintp2(A, 30) >> 14);
dest += 4;
} else {
dest += 3;
}
}
}
 
static av_always_inline void
yuv2rgba64_full_2_c_template(SwsContext *c, const int32_t *buf[2],
const int32_t *ubuf[2], const int32_t *vbuf[2],
const int32_t *abuf[2], uint16_t *dest, int dstW,
int yalpha, int uvalpha, int y,
enum AVPixelFormat target, int hasAlpha, int eightbytes)
{
const int32_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
*vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
*abuf0 = hasAlpha ? abuf[0] : NULL,
*abuf1 = hasAlpha ? abuf[1] : NULL;
int yalpha1 = 4096 - yalpha;
int uvalpha1 = 4096 - uvalpha;
int i;
int A = 0xffff<<14;
 
for (i = 0; i < dstW; i++) {
int Y = (buf0[i] * yalpha1 + buf1[i] * yalpha) >> 14;
int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha + (-128 << 23)) >> 14;
int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha + (-128 << 23)) >> 14;
int R, G, B;
 
Y -= c->yuv2rgb_y_offset;
Y *= c->yuv2rgb_y_coeff;
Y += 1 << 13;
 
R = V * c->yuv2rgb_v2r_coeff;
G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
B = U * c->yuv2rgb_u2b_coeff;
 
if (hasAlpha) {
A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha) >> 1;
 
A += 1 << 13;
}
 
output_pixel(&dest[0], av_clip_uintp2(R_B + Y, 30) >> 14);
output_pixel(&dest[1], av_clip_uintp2( G + Y, 30) >> 14);
output_pixel(&dest[2], av_clip_uintp2(B_R + Y, 30) >> 14);
if (eightbytes) {
output_pixel(&dest[3], av_clip_uintp2(A, 30) >> 14);
dest += 4;
} else {
dest += 3;
}
}
}
 
static av_always_inline void
yuv2rgba64_full_1_c_template(SwsContext *c, const int32_t *buf0,
const int32_t *ubuf[2], const int32_t *vbuf[2],
const int32_t *abuf0, uint16_t *dest, int dstW,
int uvalpha, int y, enum AVPixelFormat target, int hasAlpha, int eightbytes)
{
const int32_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
int i;
int A = 0xffff<<14;
 
if (uvalpha < 2048) {
for (i = 0; i < dstW; i++) {
int Y = (buf0[i]) >> 2;
int U = (ubuf0[i] + (-128 << 11)) >> 2;
int V = (vbuf0[i] + (-128 << 11)) >> 2;
int R, G, B;
 
Y -= c->yuv2rgb_y_offset;
Y *= c->yuv2rgb_y_coeff;
Y += 1 << 13;
 
if (hasAlpha) {
A = abuf0[i] << 11;
 
A += 1 << 13;
}
 
R = V * c->yuv2rgb_v2r_coeff;
G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
B = U * c->yuv2rgb_u2b_coeff;
 
output_pixel(&dest[0], av_clip_uintp2(R_B + Y, 30) >> 14);
output_pixel(&dest[1], av_clip_uintp2( G + Y, 30) >> 14);
output_pixel(&dest[2], av_clip_uintp2(B_R + Y, 30) >> 14);
if (eightbytes) {
output_pixel(&dest[3], av_clip_uintp2(A, 30) >> 14);
dest += 4;
} else {
dest += 3;
}
}
} else {
const int32_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
int A = 0xffff<<14;
for (i = 0; i < dstW; i++) {
int Y = (buf0[i] ) >> 2;
int U = (ubuf0[i] + ubuf1[i] + (-128 << 12)) >> 3;
int V = (vbuf0[i] + vbuf1[i] + (-128 << 12)) >> 3;
int R, G, B;
 
Y -= c->yuv2rgb_y_offset;
Y *= c->yuv2rgb_y_coeff;
Y += 1 << 13;
 
if (hasAlpha) {
A = abuf0[i] << 11;
 
A += 1 << 13;
}
 
R = V * c->yuv2rgb_v2r_coeff;
G = V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
B = U * c->yuv2rgb_u2b_coeff;
 
output_pixel(&dest[0], av_clip_uintp2(R_B + Y, 30) >> 14);
output_pixel(&dest[1], av_clip_uintp2( G + Y, 30) >> 14);
output_pixel(&dest[2], av_clip_uintp2(B_R + Y, 30) >> 14);
if (eightbytes) {
output_pixel(&dest[3], av_clip_uintp2(A, 30) >> 14);
dest += 4;
} else {
dest += 3;
}
}
}
}
 
#undef output_pixel
#undef R_B
#undef B_R
 
#define YUV2PACKED16WRAPPER(name, base, ext, fmt, hasAlpha, eightbytes) \
static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
const int16_t **_lumSrc, int lumFilterSize, \
const int16_t *chrFilter, const int16_t **_chrUSrc, \
const int16_t **_chrVSrc, int chrFilterSize, \
const int16_t **_alpSrc, uint8_t *_dest, int dstW, \
int y) \
{ \
const int32_t **lumSrc = (const int32_t **) _lumSrc, \
**chrUSrc = (const int32_t **) _chrUSrc, \
**chrVSrc = (const int32_t **) _chrVSrc, \
**alpSrc = (const int32_t **) _alpSrc; \
uint16_t *dest = (uint16_t *) _dest; \
name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
alpSrc, dest, dstW, y, fmt, hasAlpha, eightbytes); \
} \
\
static void name ## ext ## _2_c(SwsContext *c, const int16_t *_buf[2], \
const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
const int16_t *_abuf[2], uint8_t *_dest, int dstW, \
int yalpha, int uvalpha, int y) \
{ \
const int32_t **buf = (const int32_t **) _buf, \
**ubuf = (const int32_t **) _ubuf, \
**vbuf = (const int32_t **) _vbuf, \
**abuf = (const int32_t **) _abuf; \
uint16_t *dest = (uint16_t *) _dest; \
name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha, eightbytes); \
} \
\
static void name ## ext ## _1_c(SwsContext *c, const int16_t *_buf0, \
const int16_t *_ubuf[2], const int16_t *_vbuf[2], \
const int16_t *_abuf0, uint8_t *_dest, int dstW, \
int uvalpha, int y) \
{ \
const int32_t *buf0 = (const int32_t *) _buf0, \
**ubuf = (const int32_t **) _ubuf, \
**vbuf = (const int32_t **) _vbuf, \
*abuf0 = (const int32_t *) _abuf0; \
uint16_t *dest = (uint16_t *) _dest; \
name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
dstW, uvalpha, y, fmt, hasAlpha, eightbytes); \
}
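/* At more than 8 bits per component the scaler's intermediate buffers hold
 * int32_t samples behind the generic int16_t interface, so these wrappers
 * cast the pointers before instantiating the template. The rgbx/bgrx
 * variants still write 8 bytes per pixel but force the alpha word to
 * opaque (hasAlpha = 0, eightbytes = 1). */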
 
YUV2PACKED16WRAPPER(yuv2, rgba64, rgb48be, AV_PIX_FMT_RGB48BE, 0, 0)
YUV2PACKED16WRAPPER(yuv2, rgba64, rgb48le, AV_PIX_FMT_RGB48LE, 0, 0)
YUV2PACKED16WRAPPER(yuv2, rgba64, bgr48be, AV_PIX_FMT_BGR48BE, 0, 0)
YUV2PACKED16WRAPPER(yuv2, rgba64, bgr48le, AV_PIX_FMT_BGR48LE, 0, 0)
YUV2PACKED16WRAPPER(yuv2, rgba64, rgba64be, AV_PIX_FMT_RGBA64BE, 1, 1)
YUV2PACKED16WRAPPER(yuv2, rgba64, rgba64le, AV_PIX_FMT_RGBA64LE, 1, 1)
YUV2PACKED16WRAPPER(yuv2, rgba64, rgbx64be, AV_PIX_FMT_RGBA64BE, 0, 1)
YUV2PACKED16WRAPPER(yuv2, rgba64, rgbx64le, AV_PIX_FMT_RGBA64LE, 0, 1)
YUV2PACKED16WRAPPER(yuv2, rgba64, bgra64be, AV_PIX_FMT_BGRA64BE, 1, 1)
YUV2PACKED16WRAPPER(yuv2, rgba64, bgra64le, AV_PIX_FMT_BGRA64LE, 1, 1)
YUV2PACKED16WRAPPER(yuv2, rgba64, bgrx64be, AV_PIX_FMT_BGRA64BE, 0, 1)
YUV2PACKED16WRAPPER(yuv2, rgba64, bgrx64le, AV_PIX_FMT_BGRA64LE, 0, 1)
 
YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgb48be_full, AV_PIX_FMT_RGB48BE, 0, 0)
YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgb48le_full, AV_PIX_FMT_RGB48LE, 0, 0)
YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgr48be_full, AV_PIX_FMT_BGR48BE, 0, 0)
YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgr48le_full, AV_PIX_FMT_BGR48LE, 0, 0)
YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgba64be_full, AV_PIX_FMT_RGBA64BE, 1, 1)
YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgba64le_full, AV_PIX_FMT_RGBA64LE, 1, 1)
YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgbx64be_full, AV_PIX_FMT_RGBA64BE, 0, 1)
YUV2PACKED16WRAPPER(yuv2, rgba64_full, rgbx64le_full, AV_PIX_FMT_RGBA64LE, 0, 1)
YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgra64be_full, AV_PIX_FMT_BGRA64BE, 1, 1)
YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgra64le_full, AV_PIX_FMT_BGRA64LE, 1, 1)
YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgrx64be_full, AV_PIX_FMT_BGRA64BE, 0, 1)
YUV2PACKED16WRAPPER(yuv2, rgba64_full, bgrx64le_full, AV_PIX_FMT_BGRA64LE, 0, 1)
 
/*
* Write out 2 RGB pixels in the target pixel format. This function takes an
* R/G/B LUT as generated by ff_yuv2rgb_c_init_tables(), which takes care of
* things like endianness conversion and shifting. The caller takes care of
* setting the correct offset in these tables from the chroma (U/V) values.
* This function then uses the luminance (Y1/Y2) values to write out the
* correct RGB values into the destination buffer.
*/
static av_always_inline void
yuv2rgb_write(uint8_t *_dest, int i, int Y1, int Y2,
unsigned A1, unsigned A2,
const void *_r, const void *_g, const void *_b, int y,
enum AVPixelFormat target, int hasAlpha)
{
if (target == AV_PIX_FMT_ARGB || target == AV_PIX_FMT_RGBA ||
target == AV_PIX_FMT_ABGR || target == AV_PIX_FMT_BGRA) {
uint32_t *dest = (uint32_t *) _dest;
const uint32_t *r = (const uint32_t *) _r;
const uint32_t *g = (const uint32_t *) _g;
const uint32_t *b = (const uint32_t *) _b;
 
#if CONFIG_SMALL
int sh = hasAlpha ? ((target == AV_PIX_FMT_RGB32_1 || target == AV_PIX_FMT_BGR32_1) ? 0 : 24) : 0;
 
dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (hasAlpha ? A1 << sh : 0);
dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (hasAlpha ? A2 << sh : 0);
#else
if (hasAlpha) {
int sh = (target == AV_PIX_FMT_RGB32_1 || target == AV_PIX_FMT_BGR32_1) ? 0 : 24;
 
av_assert2((((r[Y1] + g[Y1] + b[Y1]) >> sh) & 0xFF) == 0);
dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1] + (A1 << sh);
dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2] + (A2 << sh);
} else {
#if defined(ASSERT_LEVEL) && ASSERT_LEVEL > 1
int sh = (target == AV_PIX_FMT_RGB32_1 || target == AV_PIX_FMT_BGR32_1) ? 0 : 24;
 
av_assert2((((r[Y1] + g[Y1] + b[Y1]) >> sh) & 0xFF) == 0xFF);
#endif
dest[i * 2 + 0] = r[Y1] + g[Y1] + b[Y1];
dest[i * 2 + 1] = r[Y2] + g[Y2] + b[Y2];
}
#endif
} else if (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) {
uint8_t *dest = (uint8_t *) _dest;
const uint8_t *r = (const uint8_t *) _r;
const uint8_t *g = (const uint8_t *) _g;
const uint8_t *b = (const uint8_t *) _b;
 
#define r_b ((target == AV_PIX_FMT_RGB24) ? r : b)
#define b_r ((target == AV_PIX_FMT_RGB24) ? b : r)
 
dest[i * 6 + 0] = r_b[Y1];
dest[i * 6 + 1] = g[Y1];
dest[i * 6 + 2] = b_r[Y1];
dest[i * 6 + 3] = r_b[Y2];
dest[i * 6 + 4] = g[Y2];
dest[i * 6 + 5] = b_r[Y2];
#undef r_b
#undef b_r
} else if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565 ||
target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555 ||
target == AV_PIX_FMT_RGB444 || target == AV_PIX_FMT_BGR444) {
uint16_t *dest = (uint16_t *) _dest;
const uint16_t *r = (const uint16_t *) _r;
const uint16_t *g = (const uint16_t *) _g;
const uint16_t *b = (const uint16_t *) _b;
int dr1, dg1, db1, dr2, dg2, db2;
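/* Ordered dithering: per-component offsets from 2x2 matrices (565/555)
 * or a 4x4 matrix (444) are added to the luma index before the LUT
 * lookup below. */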
 
if (target == AV_PIX_FMT_RGB565 || target == AV_PIX_FMT_BGR565) {
dr1 = ff_dither_2x2_8[ y & 1 ][0];
dg1 = ff_dither_2x2_4[ y & 1 ][0];
db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
dr2 = ff_dither_2x2_8[ y & 1 ][1];
dg2 = ff_dither_2x2_4[ y & 1 ][1];
db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
} else if (target == AV_PIX_FMT_RGB555 || target == AV_PIX_FMT_BGR555) {
dr1 = ff_dither_2x2_8[ y & 1 ][0];
dg1 = ff_dither_2x2_8[ y & 1 ][1];
db1 = ff_dither_2x2_8[(y & 1) ^ 1][0];
dr2 = ff_dither_2x2_8[ y & 1 ][1];
dg2 = ff_dither_2x2_8[ y & 1 ][0];
db2 = ff_dither_2x2_8[(y & 1) ^ 1][1];
} else {
dr1 = ff_dither_4x4_16[ y & 3 ][0];
dg1 = ff_dither_4x4_16[ y & 3 ][1];
db1 = ff_dither_4x4_16[(y & 3) ^ 3][0];
dr2 = ff_dither_4x4_16[ y & 3 ][1];
dg2 = ff_dither_4x4_16[ y & 3 ][0];
db2 = ff_dither_4x4_16[(y & 3) ^ 3][1];
}
 
dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
} else /* 8/4-bit */ {
uint8_t *dest = (uint8_t *) _dest;
const uint8_t *r = (const uint8_t *) _r;
const uint8_t *g = (const uint8_t *) _g;
const uint8_t *b = (const uint8_t *) _b;
int dr1, dg1, db1, dr2, dg2, db2;
 
if (target == AV_PIX_FMT_RGB8 || target == AV_PIX_FMT_BGR8) {
const uint8_t * const d64 = ff_dither_8x8_73[y & 7];
const uint8_t * const d32 = ff_dither_8x8_32[y & 7];
dr1 = dg1 = d32[(i * 2 + 0) & 7];
db1 = d64[(i * 2 + 0) & 7];
dr2 = dg2 = d32[(i * 2 + 1) & 7];
db2 = d64[(i * 2 + 1) & 7];
} else {
const uint8_t * const d64 = ff_dither_8x8_73 [y & 7];
const uint8_t * const d128 = ff_dither_8x8_220[y & 7];
dr1 = db1 = d128[(i * 2 + 0) & 7];
dg1 = d64[(i * 2 + 0) & 7];
dr2 = db2 = d128[(i * 2 + 1) & 7];
dg2 = d64[(i * 2 + 1) & 7];
}
 
if (target == AV_PIX_FMT_RGB4 || target == AV_PIX_FMT_BGR4) {
dest[i] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1] +
((r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2]) << 4);
} else {
dest[i * 2 + 0] = r[Y1 + dr1] + g[Y1 + dg1] + b[Y1 + db1];
dest[i * 2 + 1] = r[Y2 + dr2] + g[Y2 + dg2] + b[Y2 + db2];
}
}
}
 
static av_always_inline void
yuv2rgb_X_c_template(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest, int dstW,
int y, enum AVPixelFormat target, int hasAlpha)
{
int i;
 
for (i = 0; i < ((dstW + 1) >> 1); i++) {
int j, A1, A2;
int Y1 = 1 << 18;
int Y2 = 1 << 18;
int U = 1 << 18;
int V = 1 << 18;
const void *r, *g, *b;
 
for (j = 0; j < lumFilterSize; j++) {
Y1 += lumSrc[j][i * 2] * lumFilter[j];
Y2 += lumSrc[j][i * 2 + 1] * lumFilter[j];
}
for (j = 0; j < chrFilterSize; j++) {
U += chrUSrc[j][i] * chrFilter[j];
V += chrVSrc[j][i] * chrFilter[j];
}
Y1 >>= 19;
Y2 >>= 19;
U >>= 19;
V >>= 19;
if (hasAlpha) {
A1 = 1 << 18;
A2 = 1 << 18;
for (j = 0; j < lumFilterSize; j++) {
A1 += alpSrc[j][i * 2 ] * lumFilter[j];
A2 += alpSrc[j][i * 2 + 1] * lumFilter[j];
}
A1 >>= 19;
A2 >>= 19;
if ((A1 | A2) & 0x100) {
A1 = av_clip_uint8(A1);
A2 = av_clip_uint8(A2);
}
}
 
r = c->table_rV[V + YUVRGB_TABLE_HEADROOM];
g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]);
b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
 
yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
r, g, b, y, target, hasAlpha);
}
}
 
static av_always_inline void
yuv2rgb_2_c_template(SwsContext *c, const int16_t *buf[2],
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf[2], uint8_t *dest, int dstW,
int yalpha, int uvalpha, int y,
enum AVPixelFormat target, int hasAlpha)
{
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
*vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
*abuf0 = hasAlpha ? abuf[0] : NULL,
*abuf1 = hasAlpha ? abuf[1] : NULL;
int yalpha1 = 4096 - yalpha;
int uvalpha1 = 4096 - uvalpha;
int i;
 
for (i = 0; i < ((dstW + 1) >> 1); i++) {
int Y1 = (buf0[i * 2] * yalpha1 + buf1[i * 2] * yalpha) >> 19;
int Y2 = (buf0[i * 2 + 1] * yalpha1 + buf1[i * 2 + 1] * yalpha) >> 19;
int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha) >> 19;
int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha) >> 19;
int A1, A2;
const void *r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
*g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
*b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
 
if (hasAlpha) {
A1 = (abuf0[i * 2 ] * yalpha1 + abuf1[i * 2 ] * yalpha) >> 19;
A2 = (abuf0[i * 2 + 1] * yalpha1 + abuf1[i * 2 + 1] * yalpha) >> 19;
A1 = av_clip_uint8(A1);
A2 = av_clip_uint8(A2);
}
 
yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
r, g, b, y, target, hasAlpha);
}
}
 
static av_always_inline void
yuv2rgb_1_c_template(SwsContext *c, const int16_t *buf0,
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf0, uint8_t *dest, int dstW,
int uvalpha, int y, enum AVPixelFormat target,
int hasAlpha)
{
const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
int i;
 
if (uvalpha < 2048) {
for (i = 0; i < ((dstW + 1) >> 1); i++) {
int Y1 = (buf0[i * 2 ] + 64) >> 7;
int Y2 = (buf0[i * 2 + 1] + 64) >> 7;
int U = (ubuf0[i] + 64) >> 7;
int V = (vbuf0[i] + 64) >> 7;
int A1, A2;
const void *r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
*g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
*b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
 
if (hasAlpha) {
A1 = abuf0[i * 2 ] * 255 + 16384 >> 15;
A2 = abuf0[i * 2 + 1] * 255 + 16384 >> 15;
A1 = av_clip_uint8(A1);
A2 = av_clip_uint8(A2);
}
 
yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
r, g, b, y, target, hasAlpha);
}
} else {
const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
for (i = 0; i < ((dstW + 1) >> 1); i++) {
int Y1 = (buf0[i * 2 ] + 64) >> 7;
int Y2 = (buf0[i * 2 + 1] + 64) >> 7;
int U = (ubuf0[i] + ubuf1[i] + 128) >> 8;
int V = (vbuf0[i] + vbuf1[i] + 128) >> 8;
int A1, A2;
const void *r = c->table_rV[V + YUVRGB_TABLE_HEADROOM],
*g = (c->table_gU[U + YUVRGB_TABLE_HEADROOM] + c->table_gV[V + YUVRGB_TABLE_HEADROOM]),
*b = c->table_bU[U + YUVRGB_TABLE_HEADROOM];
 
if (hasAlpha) {
A1 = (abuf0[i * 2 ] + 64) >> 7;
A2 = (abuf0[i * 2 + 1] + 64) >> 7;
A1 = av_clip_uint8(A1);
A2 = av_clip_uint8(A2);
}
 
yuv2rgb_write(dest, i, Y1, Y2, hasAlpha ? A1 : 0, hasAlpha ? A2 : 0,
r, g, b, y, target, hasAlpha);
}
}
}
 
#define YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
static void name ## ext ## _X_c(SwsContext *c, const int16_t *lumFilter, \
const int16_t **lumSrc, int lumFilterSize, \
const int16_t *chrFilter, const int16_t **chrUSrc, \
const int16_t **chrVSrc, int chrFilterSize, \
const int16_t **alpSrc, uint8_t *dest, int dstW, \
int y) \
{ \
name ## base ## _X_c_template(c, lumFilter, lumSrc, lumFilterSize, \
chrFilter, chrUSrc, chrVSrc, chrFilterSize, \
alpSrc, dest, dstW, y, fmt, hasAlpha); \
}
 
#define YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \
YUV2RGBWRAPPERX(name, base, ext, fmt, hasAlpha) \
static void name ## ext ## _2_c(SwsContext *c, const int16_t *buf[2], \
const int16_t *ubuf[2], const int16_t *vbuf[2], \
const int16_t *abuf[2], uint8_t *dest, int dstW, \
int yalpha, int uvalpha, int y) \
{ \
name ## base ## _2_c_template(c, buf, ubuf, vbuf, abuf, \
dest, dstW, yalpha, uvalpha, y, fmt, hasAlpha); \
}
 
#define YUV2RGBWRAPPER(name, base, ext, fmt, hasAlpha) \
YUV2RGBWRAPPERX2(name, base, ext, fmt, hasAlpha) \
static void name ## ext ## _1_c(SwsContext *c, const int16_t *buf0, \
const int16_t *ubuf[2], const int16_t *vbuf[2], \
const int16_t *abuf0, uint8_t *dest, int dstW, \
int uvalpha, int y) \
{ \
name ## base ## _1_c_template(c, buf0, ubuf, vbuf, abuf0, dest, \
dstW, uvalpha, y, fmt, hasAlpha); \
}
 
#if CONFIG_SMALL
YUV2RGBWRAPPER(yuv2rgb,, 32_1, AV_PIX_FMT_RGB32_1, CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
YUV2RGBWRAPPER(yuv2rgb,, 32, AV_PIX_FMT_RGB32, CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
#else
#if CONFIG_SWSCALE_ALPHA
YUV2RGBWRAPPER(yuv2rgb,, a32_1, AV_PIX_FMT_RGB32_1, 1)
YUV2RGBWRAPPER(yuv2rgb,, a32, AV_PIX_FMT_RGB32, 1)
#endif
YUV2RGBWRAPPER(yuv2rgb,, x32_1, AV_PIX_FMT_RGB32_1, 0)
YUV2RGBWRAPPER(yuv2rgb,, x32, AV_PIX_FMT_RGB32, 0)
#endif
YUV2RGBWRAPPER(yuv2, rgb, rgb24, AV_PIX_FMT_RGB24, 0)
YUV2RGBWRAPPER(yuv2, rgb, bgr24, AV_PIX_FMT_BGR24, 0)
YUV2RGBWRAPPER(yuv2rgb,, 16, AV_PIX_FMT_RGB565, 0)
YUV2RGBWRAPPER(yuv2rgb,, 15, AV_PIX_FMT_RGB555, 0)
YUV2RGBWRAPPER(yuv2rgb,, 12, AV_PIX_FMT_RGB444, 0)
YUV2RGBWRAPPER(yuv2rgb,, 8, AV_PIX_FMT_RGB8, 0)
YUV2RGBWRAPPER(yuv2rgb,, 4, AV_PIX_FMT_RGB4, 0)
YUV2RGBWRAPPER(yuv2rgb,, 4b, AV_PIX_FMT_RGB4_BYTE, 0)
 
static av_always_inline void yuv2rgb_write_full(SwsContext *c,
uint8_t *dest, int i, int Y, int A, int U, int V,
int y, enum AVPixelFormat target, int hasAlpha, int err[4])
{
int R, G, B;
int isrgb8 = target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8;
 
Y -= c->yuv2rgb_y_offset;
Y *= c->yuv2rgb_y_coeff;
Y += 1 << 21;
R = Y + V*c->yuv2rgb_v2r_coeff;
G = Y + V*c->yuv2rgb_v2g_coeff + U*c->yuv2rgb_u2g_coeff;
B = Y + U*c->yuv2rgb_u2b_coeff;
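/* 0xC0000000 tests the two bits above the 30-bit intermediate range:
 * either set means underflow (negative) or overflow, so clip. */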
if ((R | G | B) & 0xC0000000) {
R = av_clip_uintp2(R, 30);
G = av_clip_uintp2(G, 30);
B = av_clip_uintp2(B, 30);
}
 
switch(target) {
case AV_PIX_FMT_ARGB:
dest[0] = hasAlpha ? A : 255;
dest[1] = R >> 22;
dest[2] = G >> 22;
dest[3] = B >> 22;
break;
case AV_PIX_FMT_RGB24:
dest[0] = R >> 22;
dest[1] = G >> 22;
dest[2] = B >> 22;
break;
case AV_PIX_FMT_RGBA:
dest[0] = R >> 22;
dest[1] = G >> 22;
dest[2] = B >> 22;
dest[3] = hasAlpha ? A : 255;
break;
case AV_PIX_FMT_ABGR:
dest[0] = hasAlpha ? A : 255;
dest[1] = B >> 22;
dest[2] = G >> 22;
dest[3] = R >> 22;
break;
case AV_PIX_FMT_BGR24:
dest[0] = B >> 22;
dest[1] = G >> 22;
dest[2] = R >> 22;
break;
case AV_PIX_FMT_BGRA:
dest[0] = B >> 22;
dest[1] = G >> 22;
dest[2] = R >> 22;
dest[3] = hasAlpha ? A : 255;
break;
case AV_PIX_FMT_BGR4_BYTE:
case AV_PIX_FMT_RGB4_BYTE:
case AV_PIX_FMT_BGR8:
case AV_PIX_FMT_RGB8:
{
int r,g,b;
 
switch (c->dither) {
default:
case SWS_DITHER_AUTO:
case SWS_DITHER_ED:
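/* Error diffusion with Floyd-Steinberg weights: 7/16 from the pixel to
 * the left (err[]) and 1/16, 5/16, 3/16 from the previous row
 * (c->dither_error[]). */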
R >>= 22;
G >>= 22;
B >>= 22;
R += (7*err[0] + 1*c->dither_error[0][i] + 5*c->dither_error[0][i+1] + 3*c->dither_error[0][i+2])>>4;
G += (7*err[1] + 1*c->dither_error[1][i] + 5*c->dither_error[1][i+1] + 3*c->dither_error[1][i+2])>>4;
B += (7*err[2] + 1*c->dither_error[2][i] + 5*c->dither_error[2][i+1] + 3*c->dither_error[2][i+2])>>4;
c->dither_error[0][i] = err[0];
c->dither_error[1][i] = err[1];
c->dither_error[2][i] = err[2];
r = R >> (isrgb8 ? 5 : 7);
g = G >> (isrgb8 ? 5 : 6);
b = B >> (isrgb8 ? 6 : 7);
r = av_clip(r, 0, isrgb8 ? 7 : 1);
g = av_clip(g, 0, isrgb8 ? 7 : 3);
b = av_clip(b, 0, isrgb8 ? 3 : 1);
err[0] = R - r*(isrgb8 ? 36 : 255);
err[1] = G - g*(isrgb8 ? 36 : 85);
err[2] = B - b*(isrgb8 ? 85 : 255);
break;
case SWS_DITHER_A_DITHER:
if (isrgb8) {
/* see http://pippin.gimp.org/a_dither/ for details/origin */
#define A_DITHER(u,v) (((((u)+((v)*236))*119)&0xff))
r = (((R >> 19) + A_DITHER(i,y) -96)>>8);
g = (((G >> 19) + A_DITHER(i + 17,y) - 96)>>8);
b = (((B >> 20) + A_DITHER(i + 17*2,y) -96)>>8);
r = av_clip_uintp2(r, 3);
g = av_clip_uintp2(g, 3);
b = av_clip_uintp2(b, 2);
} else {
r = (((R >> 21) + A_DITHER(i,y)-256)>>8);
g = (((G >> 19) + A_DITHER(i + 17,y)-256)>>8);
b = (((B >> 21) + A_DITHER(i + 17*2,y)-256)>>8);
r = av_clip_uintp2(r, 1);
g = av_clip_uintp2(g, 2);
b = av_clip_uintp2(b, 1);
}
break;
case SWS_DITHER_X_DITHER:
if (isrgb8) {
/* see http://pippin.gimp.org/a_dither/ for details/origin */
#define X_DITHER(u,v) (((((u)^((v)*237))*181)&0x1ff)/2)
r = (((R >> 19) + X_DITHER(i,y) - 96)>>8);
g = (((G >> 19) + X_DITHER(i + 17,y) - 96)>>8);
b = (((B >> 20) + X_DITHER(i + 17*2,y) - 96)>>8);
r = av_clip_uintp2(r, 3);
g = av_clip_uintp2(g, 3);
b = av_clip_uintp2(b, 2);
} else {
r = (((R >> 21) + X_DITHER(i,y)-256)>>8);
g = (((G >> 19) + X_DITHER(i + 17,y)-256)>>8);
b = (((B >> 21) + X_DITHER(i + 17*2,y)-256)>>8);
r = av_clip_uintp2(r, 1);
g = av_clip_uintp2(g, 2);
b = av_clip_uintp2(b, 1);
}
 
break;
}
 
if(target == AV_PIX_FMT_BGR4_BYTE) {
dest[0] = r + 2*g + 8*b;
} else if(target == AV_PIX_FMT_RGB4_BYTE) {
dest[0] = b + 2*g + 8*r;
} else if(target == AV_PIX_FMT_BGR8) {
dest[0] = r + 8*g + 64*b;
} else if(target == AV_PIX_FMT_RGB8) {
dest[0] = b + 4*g + 32*r;
} else
av_assert2(0);
break;}
}
}
 
static av_always_inline void
yuv2rgb_full_X_c_template(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest,
int dstW, int y, enum AVPixelFormat target, int hasAlpha)
{
int i;
int step = (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) ? 3 : 4;
int err[4] = {0};
int A = 0; //init to silence warning
 
if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
|| target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
step = 1;
 
for (i = 0; i < dstW; i++) {
int j;
int Y = 1<<9;
int U = (1<<9)-(128 << 19);
int V = (1<<9)-(128 << 19);
 
for (j = 0; j < lumFilterSize; j++) {
Y += lumSrc[j][i] * lumFilter[j];
}
for (j = 0; j < chrFilterSize; j++) {
U += chrUSrc[j][i] * chrFilter[j];
V += chrVSrc[j][i] * chrFilter[j];
}
Y >>= 10;
U >>= 10;
V >>= 10;
if (hasAlpha) {
A = 1 << 18;
for (j = 0; j < lumFilterSize; j++) {
A += alpSrc[j][i] * lumFilter[j];
}
A >>= 19;
if (A & 0x100)
A = av_clip_uint8(A);
}
yuv2rgb_write_full(c, dest, i, Y, A, U, V, y, target, hasAlpha, err);
dest += step;
}
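/* Save the last pixel's residual error so the next scanline's
 * error-diffusion pass can pick it up. */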
c->dither_error[0][i] = err[0];
c->dither_error[1][i] = err[1];
c->dither_error[2][i] = err[2];
}
 
static av_always_inline void
yuv2rgb_full_2_c_template(SwsContext *c, const int16_t *buf[2],
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf[2], uint8_t *dest, int dstW,
int yalpha, int uvalpha, int y,
enum AVPixelFormat target, int hasAlpha)
{
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1],
*vbuf0 = vbuf[0], *vbuf1 = vbuf[1],
*abuf0 = hasAlpha ? abuf[0] : NULL,
*abuf1 = hasAlpha ? abuf[1] : NULL;
int yalpha1 = 4096 - yalpha;
int uvalpha1 = 4096 - uvalpha;
int i;
int step = (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) ? 3 : 4;
int err[4] = {0};
int A = 0; // init to silence warning
 
if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
|| target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
step = 1;
 
for (i = 0; i < dstW; i++) {
int Y = ( buf0[i] * yalpha1 + buf1[i] * yalpha ) >> 10; //FIXME rounding
int U = (ubuf0[i] * uvalpha1 + ubuf1[i] * uvalpha-(128 << 19)) >> 10;
int V = (vbuf0[i] * uvalpha1 + vbuf1[i] * uvalpha-(128 << 19)) >> 10;
 
if (hasAlpha) {
A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha + (1<<18)) >> 19;
if (A & 0x100)
A = av_clip_uint8(A);
}
 
yuv2rgb_write_full(c, dest, i, Y, A, U, V, y, target, hasAlpha, err);
dest += step;
}
c->dither_error[0][i] = err[0];
c->dither_error[1][i] = err[1];
c->dither_error[2][i] = err[2];
}
 
static av_always_inline void
yuv2rgb_full_1_c_template(SwsContext *c, const int16_t *buf0,
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf0, uint8_t *dest, int dstW,
int uvalpha, int y, enum AVPixelFormat target,
int hasAlpha)
{
const int16_t *ubuf0 = ubuf[0], *vbuf0 = vbuf[0];
int i;
int step = (target == AV_PIX_FMT_RGB24 || target == AV_PIX_FMT_BGR24) ? 3 : 4;
int err[4] = {0};
 
if( target == AV_PIX_FMT_BGR4_BYTE || target == AV_PIX_FMT_RGB4_BYTE
|| target == AV_PIX_FMT_BGR8 || target == AV_PIX_FMT_RGB8)
step = 1;
 
if (uvalpha < 2048) {
int A = 0; //init to silence warning
for (i = 0; i < dstW; i++) {
int Y = buf0[i] << 2;
int U = (ubuf0[i] - (128<<7)) * 4;
int V = (vbuf0[i] - (128<<7)) * 4;
 
if (hasAlpha) {
A = (abuf0[i] + 64) >> 7;
if (A & 0x100)
A = av_clip_uint8(A);
}
 
yuv2rgb_write_full(c, dest, i, Y, A, U, V, y, target, hasAlpha, err);
dest += step;
}
} else {
const int16_t *ubuf1 = ubuf[1], *vbuf1 = vbuf[1];
int A = 0; //init to silence warning
for (i = 0; i < dstW; i++) {
int Y = buf0[i] << 2;
int U = (ubuf0[i] + ubuf1[i] - (128<<8)) << 1;
int V = (vbuf0[i] + vbuf1[i] - (128<<8)) << 1;
 
if (hasAlpha) {
A = (abuf0[i] + 64) >> 7;
if (A & 0x100)
A = av_clip_uint8(A);
}
 
yuv2rgb_write_full(c, dest, i, Y, A, U, V, y, target, hasAlpha, err);
dest += step;
}
}
 
c->dither_error[0][i] = err[0];
c->dither_error[1][i] = err[1];
c->dither_error[2][i] = err[2];
}
 
#if CONFIG_SMALL
YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA, CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR, CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA, CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB, CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
#else
#if CONFIG_SWSCALE_ALPHA
YUV2RGBWRAPPER(yuv2, rgb_full, bgra32_full, AV_PIX_FMT_BGRA, 1)
YUV2RGBWRAPPER(yuv2, rgb_full, abgr32_full, AV_PIX_FMT_ABGR, 1)
YUV2RGBWRAPPER(yuv2, rgb_full, rgba32_full, AV_PIX_FMT_RGBA, 1)
YUV2RGBWRAPPER(yuv2, rgb_full, argb32_full, AV_PIX_FMT_ARGB, 1)
#endif
YUV2RGBWRAPPER(yuv2, rgb_full, bgrx32_full, AV_PIX_FMT_BGRA, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, xbgr32_full, AV_PIX_FMT_ABGR, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgbx32_full, AV_PIX_FMT_RGBA, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, xrgb32_full, AV_PIX_FMT_ARGB, 0)
#endif
YUV2RGBWRAPPER(yuv2, rgb_full, bgr24_full, AV_PIX_FMT_BGR24, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb24_full, AV_PIX_FMT_RGB24, 0)
 
YUV2RGBWRAPPER(yuv2, rgb_full, bgr4_byte_full, AV_PIX_FMT_BGR4_BYTE, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb4_byte_full, AV_PIX_FMT_RGB4_BYTE, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, bgr8_full, AV_PIX_FMT_BGR8, 0)
YUV2RGBWRAPPER(yuv2, rgb_full, rgb8_full, AV_PIX_FMT_RGB8, 0)
 
static void
yuv2gbrp_full_X_c(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t **dest,
int dstW, int y)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->dstFormat);
int i;
int hasAlpha = (desc->flags & AV_PIX_FMT_FLAG_ALPHA) && alpSrc;
uint16_t **dest16 = (uint16_t**)dest;
int SH = 22 + 7 - desc->comp[0].depth_minus1;
int A = 0; // init to silence warning
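/* Planar GBR output: plane 0 is G, plane 1 is B, plane 2 is R (plus an
 * optional alpha plane), matching AV_PIX_FMT_GBRP. SH is chosen so the
 * 30-bit intermediates shift down to the destination bit depth; SH == 22
 * is the plain 8-bit case. */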
 
for (i = 0; i < dstW; i++) {
int j;
int Y = 1 << 9;
int U = (1 << 9) - (128 << 19);
int V = (1 << 9) - (128 << 19);
int R, G, B;
 
for (j = 0; j < lumFilterSize; j++)
Y += lumSrc[j][i] * lumFilter[j];
 
for (j = 0; j < chrFilterSize; j++) {
U += chrUSrc[j][i] * chrFilter[j];
V += chrVSrc[j][i] * chrFilter[j];
}
 
Y >>= 10;
U >>= 10;
V >>= 10;
 
if (hasAlpha) {
A = 1 << 18;
 
for (j = 0; j < lumFilterSize; j++)
A += alpSrc[j][i] * lumFilter[j];
 
A >>= 19;
 
if (A & 0x100)
A = av_clip_uint8(A);
}
 
Y -= c->yuv2rgb_y_offset;
Y *= c->yuv2rgb_y_coeff;
Y += 1 << 21;
R = Y + V * c->yuv2rgb_v2r_coeff;
G = Y + V * c->yuv2rgb_v2g_coeff + U * c->yuv2rgb_u2g_coeff;
B = Y + U * c->yuv2rgb_u2b_coeff;
 
if ((R | G | B) & 0xC0000000) {
R = av_clip_uintp2(R, 30);
G = av_clip_uintp2(G, 30);
B = av_clip_uintp2(B, 30);
}
 
if (SH != 22) {
dest16[0][i] = G >> SH;
dest16[1][i] = B >> SH;
dest16[2][i] = R >> SH;
if (hasAlpha)
dest16[3][i] = A;
} else {
dest[0][i] = G >> 22;
dest[1][i] = B >> 22;
dest[2][i] = R >> 22;
if (hasAlpha)
dest[3][i] = A;
}
}
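/* When the destination endianness differs from the host's, fix up the
 * 16-bit planes with a byte-swap pass. */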
if (SH != 22 && (!isBE(c->dstFormat)) != (!HAVE_BIGENDIAN)) {
for (i = 0; i < dstW; i++) {
dest16[0][i] = av_bswap16(dest16[0][i]);
dest16[1][i] = av_bswap16(dest16[1][i]);
dest16[2][i] = av_bswap16(dest16[2][i]);
if (hasAlpha)
dest16[3][i] = av_bswap16(dest16[3][i]);
}
}
}
 
static void
yuv2ya8_1_c(SwsContext *c, const int16_t *buf0,
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf0, uint8_t *dest, int dstW,
int uvalpha, int y)
{
int hasAlpha = !!abuf0;
int i;
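/* YA8 output: luma in the even bytes, alpha in the odd bytes (fully
 * opaque when no alpha plane is supplied). */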
 
for (i = 0; i < dstW; i++) {
int Y = (buf0[i] + 64) >> 7;
int A;
 
Y = av_clip_uint8(Y);
 
if (hasAlpha) {
A = (abuf0[i] + 64) >> 7;
if (A & 0x100)
A = av_clip_uint8(A);
}
 
dest[i * 2 ] = Y;
dest[i * 2 + 1] = hasAlpha ? A : 255;
}
}
 
static void
yuv2ya8_2_c(SwsContext *c, const int16_t *buf[2],
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf[2], uint8_t *dest, int dstW,
int yalpha, int uvalpha, int y)
{
int hasAlpha = abuf && abuf[0] && abuf[1];
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*abuf0 = hasAlpha ? abuf[0] : NULL,
*abuf1 = hasAlpha ? abuf[1] : NULL;
int yalpha1 = 4096 - yalpha;
int i;
 
for (i = 0; i < dstW; i++) {
int Y = (buf0[i] * yalpha1 + buf1[i] * yalpha) >> 19;
int A;
 
Y = av_clip_uint8(Y);
 
if (hasAlpha) {
A = (abuf0[i] * yalpha1 + abuf1[i] * yalpha) >> 19;
A = av_clip_uint8(A);
}
 
dest[i * 2 ] = Y;
dest[i * 2 + 1] = hasAlpha ? A : 255;
}
}
 
static void
yuv2ya8_X_c(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest, int dstW, int y)
{
int hasAlpha = !!alpSrc;
int i;
 
for (i = 0; i < dstW; i++) {
int j;
int Y = 1 << 18, A = 1 << 18;
 
for (j = 0; j < lumFilterSize; j++)
Y += lumSrc[j][i] * lumFilter[j];
 
Y >>= 19;
if (Y & 0x100)
Y = av_clip_uint8(Y);
 
if (hasAlpha) {
for (j = 0; j < lumFilterSize; j++)
A += alpSrc[j][i] * lumFilter[j];
 
A >>= 19;
 
if (A & 0x100)
A = av_clip_uint8(A);
}
 
dest[2 * i ] = Y;
dest[2 * i + 1] = hasAlpha ? A : 255;
}
}
 
static void
yuv2ayuv64le_X_c(SwsContext *c, const int16_t *lumFilter,
const int16_t **_lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **_chrUSrc,
const int16_t **_chrVSrc, int chrFilterSize,
const int16_t **_alpSrc, uint8_t *dest, int dstW, int y)
{
const int32_t **lumSrc = (const int32_t **) _lumSrc,
**chrUSrc = (const int32_t **) _chrUSrc,
**chrVSrc = (const int32_t **) _chrVSrc,
**alpSrc = (const int32_t **) _alpSrc;
int hasAlpha = !!alpSrc;
int i;
 
for (i = 0; i < dstW; i++) {
int Y = 1 << 14, U = 1 << 14;
int V = 1 << 14, A = 1 << 14;
int j;
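/* The accumulators start at (1 << 14) - 0x40000000: the 1 << 14 rounds
 * the final >> 15, while the bias becomes -0x8000 after the shift,
 * moving the sums into signed 16-bit range for av_clip_int16(); the
 * 0x8000 added below undoes it. */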
 
Y -= 0x40000000;
U -= 0x40000000;
V -= 0x40000000;
A -= 0x40000000;
 
for (j = 0; j < lumFilterSize; j++)
Y += lumSrc[j][i] * (unsigned)lumFilter[j];
 
for (j = 0; j < chrFilterSize; j++)
U += chrUSrc[j][i] * (unsigned)chrFilter[j];
 
for (j = 0; j < chrFilterSize; j++)
V += chrVSrc[j][i] * (unsigned)chrFilter[j];
 
if (hasAlpha)
for (j = 0; j < lumFilterSize; j++)
A += alpSrc[j][i] * (unsigned)lumFilter[j];
 
Y = 0x8000 + av_clip_int16(Y >> 15);
U = 0x8000 + av_clip_int16(U >> 15);
V = 0x8000 + av_clip_int16(V >> 15);
A = 0x8000 + av_clip_int16(A >> 15);
 
AV_WL16(dest + 8 * i, hasAlpha ? A : 65535);
AV_WL16(dest + 8 * i + 2, Y);
AV_WL16(dest + 8 * i + 4, U);
AV_WL16(dest + 8 * i + 6, V);
}
}
 
av_cold void ff_sws_init_output_funcs(SwsContext *c,
yuv2planar1_fn *yuv2plane1,
yuv2planarX_fn *yuv2planeX,
yuv2interleavedX_fn *yuv2nv12cX,
yuv2packed1_fn *yuv2packed1,
yuv2packed2_fn *yuv2packed2,
yuv2packedX_fn *yuv2packedX,
yuv2anyX_fn *yuv2anyX)
{
enum AVPixelFormat dstFormat = c->dstFormat;
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);
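/* Planar writers are picked by destination bit depth; packed and
 * any-plane writers are then picked per pixel format, with separate
 * tables for full chroma interpolation (SWS_FULL_CHR_H_INT) and the
 * default horizontally subsampled path. */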
 
if (is16BPS(dstFormat)) {
*yuv2planeX = isBE(dstFormat) ? yuv2planeX_16BE_c : yuv2planeX_16LE_c;
*yuv2plane1 = isBE(dstFormat) ? yuv2plane1_16BE_c : yuv2plane1_16LE_c;
} else if (is9_OR_10BPS(dstFormat)) {
if (desc->comp[0].depth_minus1 == 8) {
*yuv2planeX = isBE(dstFormat) ? yuv2planeX_9BE_c : yuv2planeX_9LE_c;
*yuv2plane1 = isBE(dstFormat) ? yuv2plane1_9BE_c : yuv2plane1_9LE_c;
} else if (desc->comp[0].depth_minus1 == 9) {
*yuv2planeX = isBE(dstFormat) ? yuv2planeX_10BE_c : yuv2planeX_10LE_c;
*yuv2plane1 = isBE(dstFormat) ? yuv2plane1_10BE_c : yuv2plane1_10LE_c;
} else if (desc->comp[0].depth_minus1 == 11) {
*yuv2planeX = isBE(dstFormat) ? yuv2planeX_12BE_c : yuv2planeX_12LE_c;
*yuv2plane1 = isBE(dstFormat) ? yuv2plane1_12BE_c : yuv2plane1_12LE_c;
} else if (desc->comp[0].depth_minus1 == 13) {
*yuv2planeX = isBE(dstFormat) ? yuv2planeX_14BE_c : yuv2planeX_14LE_c;
*yuv2plane1 = isBE(dstFormat) ? yuv2plane1_14BE_c : yuv2plane1_14LE_c;
} else
av_assert0(0);
} else {
*yuv2plane1 = yuv2plane1_8_c;
*yuv2planeX = yuv2planeX_8_c;
if (dstFormat == AV_PIX_FMT_NV12 || dstFormat == AV_PIX_FMT_NV21)
*yuv2nv12cX = yuv2nv12cX_c;
}
 
if(c->flags & SWS_FULL_CHR_H_INT) {
switch (dstFormat) {
case AV_PIX_FMT_RGBA:
#if CONFIG_SMALL
*yuv2packedX = yuv2rgba32_full_X_c;
*yuv2packed2 = yuv2rgba32_full_2_c;
*yuv2packed1 = yuv2rgba32_full_1_c;
#else
#if CONFIG_SWSCALE_ALPHA
if (c->alpPixBuf) {
*yuv2packedX = yuv2rgba32_full_X_c;
*yuv2packed2 = yuv2rgba32_full_2_c;
*yuv2packed1 = yuv2rgba32_full_1_c;
} else
#endif /* CONFIG_SWSCALE_ALPHA */
{
*yuv2packedX = yuv2rgbx32_full_X_c;
*yuv2packed2 = yuv2rgbx32_full_2_c;
*yuv2packed1 = yuv2rgbx32_full_1_c;
}
#endif /* !CONFIG_SMALL */
break;
case AV_PIX_FMT_ARGB:
#if CONFIG_SMALL
*yuv2packedX = yuv2argb32_full_X_c;
*yuv2packed2 = yuv2argb32_full_2_c;
*yuv2packed1 = yuv2argb32_full_1_c;
#else
#if CONFIG_SWSCALE_ALPHA
if (c->alpPixBuf) {
*yuv2packedX = yuv2argb32_full_X_c;
*yuv2packed2 = yuv2argb32_full_2_c;
*yuv2packed1 = yuv2argb32_full_1_c;
} else
#endif /* CONFIG_SWSCALE_ALPHA */
{
*yuv2packedX = yuv2xrgb32_full_X_c;
*yuv2packed2 = yuv2xrgb32_full_2_c;
*yuv2packed1 = yuv2xrgb32_full_1_c;
}
#endif /* !CONFIG_SMALL */
break;
case AV_PIX_FMT_BGRA:
#if CONFIG_SMALL
*yuv2packedX = yuv2bgra32_full_X_c;
*yuv2packed2 = yuv2bgra32_full_2_c;
*yuv2packed1 = yuv2bgra32_full_1_c;
#else
#if CONFIG_SWSCALE_ALPHA
if (c->alpPixBuf) {
*yuv2packedX = yuv2bgra32_full_X_c;
*yuv2packed2 = yuv2bgra32_full_2_c;
*yuv2packed1 = yuv2bgra32_full_1_c;
} else
#endif /* CONFIG_SWSCALE_ALPHA */
{
*yuv2packedX = yuv2bgrx32_full_X_c;
*yuv2packed2 = yuv2bgrx32_full_2_c;
*yuv2packed1 = yuv2bgrx32_full_1_c;
}
#endif /* !CONFIG_SMALL */
break;
case AV_PIX_FMT_ABGR:
#if CONFIG_SMALL
*yuv2packedX = yuv2abgr32_full_X_c;
*yuv2packed2 = yuv2abgr32_full_2_c;
*yuv2packed1 = yuv2abgr32_full_1_c;
#else
#if CONFIG_SWSCALE_ALPHA
if (c->alpPixBuf) {
*yuv2packedX = yuv2abgr32_full_X_c;
*yuv2packed2 = yuv2abgr32_full_2_c;
*yuv2packed1 = yuv2abgr32_full_1_c;
} else
#endif /* CONFIG_SWSCALE_ALPHA */
{
*yuv2packedX = yuv2xbgr32_full_X_c;
*yuv2packed2 = yuv2xbgr32_full_2_c;
*yuv2packed1 = yuv2xbgr32_full_1_c;
}
#endif /* !CONFIG_SMALL */
break;
case AV_PIX_FMT_RGBA64LE:
#if CONFIG_SWSCALE_ALPHA
if (c->alpPixBuf) {
*yuv2packedX = yuv2rgba64le_full_X_c;
*yuv2packed2 = yuv2rgba64le_full_2_c;
*yuv2packed1 = yuv2rgba64le_full_1_c;
} else
#endif /* CONFIG_SWSCALE_ALPHA */
{
*yuv2packedX = yuv2rgbx64le_full_X_c;
*yuv2packed2 = yuv2rgbx64le_full_2_c;
*yuv2packed1 = yuv2rgbx64le_full_1_c;
}
break;
case AV_PIX_FMT_RGBA64BE:
#if CONFIG_SWSCALE_ALPHA
if (c->alpPixBuf) {
*yuv2packedX = yuv2rgba64be_full_X_c;
*yuv2packed2 = yuv2rgba64be_full_2_c;
*yuv2packed1 = yuv2rgba64be_full_1_c;
} else
#endif /* CONFIG_SWSCALE_ALPHA */
{
*yuv2packedX = yuv2rgbx64be_full_X_c;
*yuv2packed2 = yuv2rgbx64be_full_2_c;
*yuv2packed1 = yuv2rgbx64be_full_1_c;
}
break;
case AV_PIX_FMT_BGRA64LE:
#if CONFIG_SWSCALE_ALPHA
if (c->alpPixBuf) {
*yuv2packedX = yuv2bgra64le_full_X_c;
*yuv2packed2 = yuv2bgra64le_full_2_c;
*yuv2packed1 = yuv2bgra64le_full_1_c;
} else
#endif /* CONFIG_SWSCALE_ALPHA */
{
*yuv2packedX = yuv2bgrx64le_full_X_c;
*yuv2packed2 = yuv2bgrx64le_full_2_c;
*yuv2packed1 = yuv2bgrx64le_full_1_c;
}
break;
case AV_PIX_FMT_BGRA64BE:
#if CONFIG_SWSCALE_ALPHA
if (c->alpPixBuf) {
*yuv2packedX = yuv2bgra64be_full_X_c;
*yuv2packed2 = yuv2bgra64be_full_2_c;
*yuv2packed1 = yuv2bgra64be_full_1_c;
} else
#endif /* CONFIG_SWSCALE_ALPHA */
{
*yuv2packedX = yuv2bgrx64be_full_X_c;
*yuv2packed2 = yuv2bgrx64be_full_2_c;
*yuv2packed1 = yuv2bgrx64be_full_1_c;
}
break;
 
case AV_PIX_FMT_RGB24:
*yuv2packedX = yuv2rgb24_full_X_c;
*yuv2packed2 = yuv2rgb24_full_2_c;
*yuv2packed1 = yuv2rgb24_full_1_c;
break;
case AV_PIX_FMT_BGR24:
*yuv2packedX = yuv2bgr24_full_X_c;
*yuv2packed2 = yuv2bgr24_full_2_c;
*yuv2packed1 = yuv2bgr24_full_1_c;
break;
case AV_PIX_FMT_RGB48LE:
*yuv2packedX = yuv2rgb48le_full_X_c;
*yuv2packed2 = yuv2rgb48le_full_2_c;
*yuv2packed1 = yuv2rgb48le_full_1_c;
break;
case AV_PIX_FMT_BGR48LE:
*yuv2packedX = yuv2bgr48le_full_X_c;
*yuv2packed2 = yuv2bgr48le_full_2_c;
*yuv2packed1 = yuv2bgr48le_full_1_c;
break;
case AV_PIX_FMT_RGB48BE:
*yuv2packedX = yuv2rgb48be_full_X_c;
*yuv2packed2 = yuv2rgb48be_full_2_c;
*yuv2packed1 = yuv2rgb48be_full_1_c;
break;
case AV_PIX_FMT_BGR48BE:
*yuv2packedX = yuv2bgr48be_full_X_c;
*yuv2packed2 = yuv2bgr48be_full_2_c;
*yuv2packed1 = yuv2bgr48be_full_1_c;
break;
case AV_PIX_FMT_BGR4_BYTE:
*yuv2packedX = yuv2bgr4_byte_full_X_c;
*yuv2packed2 = yuv2bgr4_byte_full_2_c;
*yuv2packed1 = yuv2bgr4_byte_full_1_c;
break;
case AV_PIX_FMT_RGB4_BYTE:
*yuv2packedX = yuv2rgb4_byte_full_X_c;
*yuv2packed2 = yuv2rgb4_byte_full_2_c;
*yuv2packed1 = yuv2rgb4_byte_full_1_c;
break;
case AV_PIX_FMT_BGR8:
*yuv2packedX = yuv2bgr8_full_X_c;
*yuv2packed2 = yuv2bgr8_full_2_c;
*yuv2packed1 = yuv2bgr8_full_1_c;
break;
case AV_PIX_FMT_RGB8:
*yuv2packedX = yuv2rgb8_full_X_c;
*yuv2packed2 = yuv2rgb8_full_2_c;
*yuv2packed1 = yuv2rgb8_full_1_c;
break;
case AV_PIX_FMT_GBRP:
case AV_PIX_FMT_GBRP9BE:
case AV_PIX_FMT_GBRP9LE:
case AV_PIX_FMT_GBRP10BE:
case AV_PIX_FMT_GBRP10LE:
case AV_PIX_FMT_GBRP12BE:
case AV_PIX_FMT_GBRP12LE:
case AV_PIX_FMT_GBRP14BE:
case AV_PIX_FMT_GBRP14LE:
case AV_PIX_FMT_GBRP16BE:
case AV_PIX_FMT_GBRP16LE:
case AV_PIX_FMT_GBRAP:
*yuv2anyX = yuv2gbrp_full_X_c;
break;
}
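/* Formats without a full-chroma writer fall through to the packed
 * table below. */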
if (!*yuv2packedX && !*yuv2anyX)
goto YUV_PACKED;
} else {
YUV_PACKED:
switch (dstFormat) {
case AV_PIX_FMT_RGBA64LE:
#if CONFIG_SWSCALE_ALPHA
if (c->alpPixBuf) {
*yuv2packed1 = yuv2rgba64le_1_c;
*yuv2packed2 = yuv2rgba64le_2_c;
*yuv2packedX = yuv2rgba64le_X_c;
} else
#endif /* CONFIG_SWSCALE_ALPHA */
{
*yuv2packed1 = yuv2rgbx64le_1_c;
*yuv2packed2 = yuv2rgbx64le_2_c;
*yuv2packedX = yuv2rgbx64le_X_c;
}
break;
case AV_PIX_FMT_RGBA64BE:
#if CONFIG_SWSCALE_ALPHA
if (c->alpPixBuf) {
*yuv2packed1 = yuv2rgba64be_1_c;
*yuv2packed2 = yuv2rgba64be_2_c;
*yuv2packedX = yuv2rgba64be_X_c;
} else
#endif /* CONFIG_SWSCALE_ALPHA */
{
*yuv2packed1 = yuv2rgbx64be_1_c;
*yuv2packed2 = yuv2rgbx64be_2_c;
*yuv2packedX = yuv2rgbx64be_X_c;
}
break;
case AV_PIX_FMT_BGRA64LE:
#if CONFIG_SWSCALE_ALPHA
if (c->alpPixBuf) {
*yuv2packed1 = yuv2bgra64le_1_c;
*yuv2packed2 = yuv2bgra64le_2_c;
*yuv2packedX = yuv2bgra64le_X_c;
} else
#endif /* CONFIG_SWSCALE_ALPHA */
{
*yuv2packed1 = yuv2bgrx64le_1_c;
*yuv2packed2 = yuv2bgrx64le_2_c;
*yuv2packedX = yuv2bgrx64le_X_c;
}
break;
case AV_PIX_FMT_BGRA64BE:
#if CONFIG_SWSCALE_ALPHA
if (c->alpPixBuf) {
*yuv2packed1 = yuv2bgra64be_1_c;
*yuv2packed2 = yuv2bgra64be_2_c;
*yuv2packedX = yuv2bgra64be_X_c;
} else
#endif /* CONFIG_SWSCALE_ALPHA */
{
*yuv2packed1 = yuv2bgrx64be_1_c;
*yuv2packed2 = yuv2bgrx64be_2_c;
*yuv2packedX = yuv2bgrx64be_X_c;
}
break;
case AV_PIX_FMT_RGB48LE:
*yuv2packed1 = yuv2rgb48le_1_c;
*yuv2packed2 = yuv2rgb48le_2_c;
*yuv2packedX = yuv2rgb48le_X_c;
break;
case AV_PIX_FMT_RGB48BE:
*yuv2packed1 = yuv2rgb48be_1_c;
*yuv2packed2 = yuv2rgb48be_2_c;
*yuv2packedX = yuv2rgb48be_X_c;
break;
case AV_PIX_FMT_BGR48LE:
*yuv2packed1 = yuv2bgr48le_1_c;
*yuv2packed2 = yuv2bgr48le_2_c;
*yuv2packedX = yuv2bgr48le_X_c;
break;
case AV_PIX_FMT_BGR48BE:
*yuv2packed1 = yuv2bgr48be_1_c;
*yuv2packed2 = yuv2bgr48be_2_c;
*yuv2packedX = yuv2bgr48be_X_c;
break;
case AV_PIX_FMT_RGB32:
case AV_PIX_FMT_BGR32:
#if CONFIG_SMALL
*yuv2packed1 = yuv2rgb32_1_c;
*yuv2packed2 = yuv2rgb32_2_c;
*yuv2packedX = yuv2rgb32_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
if (c->alpPixBuf) {
*yuv2packed1 = yuv2rgba32_1_c;
*yuv2packed2 = yuv2rgba32_2_c;
*yuv2packedX = yuv2rgba32_X_c;
} else
#endif /* CONFIG_SWSCALE_ALPHA */
{
*yuv2packed1 = yuv2rgbx32_1_c;
*yuv2packed2 = yuv2rgbx32_2_c;
*yuv2packedX = yuv2rgbx32_X_c;
}
#endif /* !CONFIG_SMALL */
break;
case AV_PIX_FMT_RGB32_1:
case AV_PIX_FMT_BGR32_1:
#if CONFIG_SMALL
*yuv2packed1 = yuv2rgb32_1_1_c;
*yuv2packed2 = yuv2rgb32_1_2_c;
*yuv2packedX = yuv2rgb32_1_X_c;
#else
#if CONFIG_SWSCALE_ALPHA
if (c->alpPixBuf) {
*yuv2packed1 = yuv2rgba32_1_1_c;
*yuv2packed2 = yuv2rgba32_1_2_c;
*yuv2packedX = yuv2rgba32_1_X_c;
} else
#endif /* CONFIG_SWSCALE_ALPHA */
{
*yuv2packed1 = yuv2rgbx32_1_1_c;
*yuv2packed2 = yuv2rgbx32_1_2_c;
*yuv2packedX = yuv2rgbx32_1_X_c;
}
#endif /* !CONFIG_SMALL */
break;
case AV_PIX_FMT_RGB24:
*yuv2packed1 = yuv2rgb24_1_c;
*yuv2packed2 = yuv2rgb24_2_c;
*yuv2packedX = yuv2rgb24_X_c;
break;
case AV_PIX_FMT_BGR24:
*yuv2packed1 = yuv2bgr24_1_c;
*yuv2packed2 = yuv2bgr24_2_c;
*yuv2packedX = yuv2bgr24_X_c;
break;
case AV_PIX_FMT_RGB565LE:
case AV_PIX_FMT_RGB565BE:
case AV_PIX_FMT_BGR565LE:
case AV_PIX_FMT_BGR565BE:
*yuv2packed1 = yuv2rgb16_1_c;
*yuv2packed2 = yuv2rgb16_2_c;
*yuv2packedX = yuv2rgb16_X_c;
break;
case AV_PIX_FMT_RGB555LE:
case AV_PIX_FMT_RGB555BE:
case AV_PIX_FMT_BGR555LE:
case AV_PIX_FMT_BGR555BE:
*yuv2packed1 = yuv2rgb15_1_c;
*yuv2packed2 = yuv2rgb15_2_c;
*yuv2packedX = yuv2rgb15_X_c;
break;
case AV_PIX_FMT_RGB444LE:
case AV_PIX_FMT_RGB444BE:
case AV_PIX_FMT_BGR444LE:
case AV_PIX_FMT_BGR444BE:
*yuv2packed1 = yuv2rgb12_1_c;
*yuv2packed2 = yuv2rgb12_2_c;
*yuv2packedX = yuv2rgb12_X_c;
break;
case AV_PIX_FMT_RGB8:
case AV_PIX_FMT_BGR8:
*yuv2packed1 = yuv2rgb8_1_c;
*yuv2packed2 = yuv2rgb8_2_c;
*yuv2packedX = yuv2rgb8_X_c;
break;
case AV_PIX_FMT_RGB4:
case AV_PIX_FMT_BGR4:
*yuv2packed1 = yuv2rgb4_1_c;
*yuv2packed2 = yuv2rgb4_2_c;
*yuv2packedX = yuv2rgb4_X_c;
break;
case AV_PIX_FMT_RGB4_BYTE:
case AV_PIX_FMT_BGR4_BYTE:
*yuv2packed1 = yuv2rgb4b_1_c;
*yuv2packed2 = yuv2rgb4b_2_c;
*yuv2packedX = yuv2rgb4b_X_c;
break;
}
}
switch (dstFormat) {
case AV_PIX_FMT_MONOWHITE:
*yuv2packed1 = yuv2monowhite_1_c;
*yuv2packed2 = yuv2monowhite_2_c;
*yuv2packedX = yuv2monowhite_X_c;
break;
case AV_PIX_FMT_MONOBLACK:
*yuv2packed1 = yuv2monoblack_1_c;
*yuv2packed2 = yuv2monoblack_2_c;
*yuv2packedX = yuv2monoblack_X_c;
break;
case AV_PIX_FMT_YUYV422:
*yuv2packed1 = yuv2yuyv422_1_c;
*yuv2packed2 = yuv2yuyv422_2_c;
*yuv2packedX = yuv2yuyv422_X_c;
break;
case AV_PIX_FMT_YVYU422:
*yuv2packed1 = yuv2yvyu422_1_c;
*yuv2packed2 = yuv2yvyu422_2_c;
*yuv2packedX = yuv2yvyu422_X_c;
break;
case AV_PIX_FMT_UYVY422:
*yuv2packed1 = yuv2uyvy422_1_c;
*yuv2packed2 = yuv2uyvy422_2_c;
*yuv2packedX = yuv2uyvy422_X_c;
break;
case AV_PIX_FMT_YA8:
*yuv2packed1 = yuv2ya8_1_c;
*yuv2packed2 = yuv2ya8_2_c;
*yuv2packedX = yuv2ya8_X_c;
break;
case AV_PIX_FMT_AYUV64LE:
*yuv2packedX = yuv2ayuv64le_X_c;
break;
}
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/ppc/Makefile
0,0 → 1,3
OBJS += ppc/swscale_altivec.o \
ppc/yuv2rgb_altivec.o \
ppc/yuv2yuv_altivec.o \
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/ppc/swscale_altivec.c
0,0 → 1,371
/*
* AltiVec-enhanced yuv2yuvX
*
* Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
* based on the equivalent C code in swscale.c
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <inttypes.h>
 
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "yuv2rgb_altivec.h"
#include "libavutil/ppc/util_altivec.h"
 
#if HAVE_ALTIVEC
#define vzero vec_splat_s32(0)
 
#if HAVE_BIGENDIAN
#define GET_LS(a,b,c,s) {\
vector signed short l2 = vec_ld(((b) << 1) + 16, s);\
ls = vec_perm(a, l2, c);\
a = l2;\
}
#else
#define GET_LS(a,b,c,s) {\
ls = a;\
a = vec_vsx_ld(((b) << 1) + 16, s);\
}
#endif
 
#define yuv2planeX_8(d1, d2, l1, src, x, perm, filter) do {\
vector signed short ls;\
GET_LS(l1, x, perm, src);\
vector signed int i1 = vec_mule(filter, ls);\
vector signed int i2 = vec_mulo(filter, ls);\
vector signed int vf1, vf2;\
vf1 = vec_mergeh(i1, i2);\
vf2 = vec_mergel(i1, i2);\
d1 = vec_add(d1, vf1);\
d2 = vec_add(d2, vf2);\
} while (0)
 
#if HAVE_BIGENDIAN
#define LOAD_FILTER(vf,f) {\
vector unsigned char perm0 = vec_lvsl(joffset, f);\
vf = vec_ld(joffset, f);\
vf = vec_perm(vf, vf, perm0);\
}
#define LOAD_L1(ll1,s,p){\
p = vec_lvsl(xoffset, s);\
ll1 = vec_ld(xoffset, s);\
}
#else
#define LOAD_FILTER(vf,f) {\
vf = vec_vsx_ld(joffset, f);\
}
#define LOAD_L1(ll1,s,p){\
ll1 = vec_vsx_ld(xoffset, s);\
}
#endif
 
static void yuv2planeX_16_altivec(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest,
const uint8_t *dither, int offset, int x)
{
register int i, j;
LOCAL_ALIGNED(16, int, val, [16]);
vector signed int vo1, vo2, vo3, vo4;
vector unsigned short vs1, vs2;
vector unsigned char vf;
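/* The vec_splat_u32() immediate is limited to -16..15, so the 19-bit
 * shift count has to be composed as 10 + 9. */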
vector unsigned int altivec_vectorShiftInt19 =
vec_add(vec_splat_u32(10), vec_splat_u32(9));
 
for (i = 0; i < 16; i++)
val[i] = dither[(x + i + offset) & 7] << 12;
 
vo1 = vec_ld(0, val);
vo2 = vec_ld(16, val);
vo3 = vec_ld(32, val);
vo4 = vec_ld(48, val);
 
for (j = 0; j < filterSize; j++) {
unsigned int joffset=j<<1;
unsigned int xoffset=x<<1;
vector unsigned char perm;
vector signed short l1,vLumFilter;
LOAD_FILTER(vLumFilter,filter);
vLumFilter = vec_splat(vLumFilter, 0);
LOAD_L1(l1,src[j],perm);
yuv2planeX_8(vo1, vo2, l1, src[j], x, perm, vLumFilter);
yuv2planeX_8(vo3, vo4, l1, src[j], x + 8, perm, vLumFilter);
}
 
vo1 = vec_sra(vo1, altivec_vectorShiftInt19);
vo2 = vec_sra(vo2, altivec_vectorShiftInt19);
vo3 = vec_sra(vo3, altivec_vectorShiftInt19);
vo4 = vec_sra(vo4, altivec_vectorShiftInt19);
vs1 = vec_packsu(vo1, vo2);
vs2 = vec_packsu(vo3, vo4);
vf = vec_packsu(vs1, vs2);
VEC_ST(vf, 0, dest);
}
 
 
static inline void yuv2planeX_u(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset, int x)
{
int i, j;
 
for (i = x; i < dstW; i++) {
int t = dither[(i + offset) & 7] << 12;
for (j = 0; j < filterSize; j++)
t += src[j][i] * filter[j];
dest[i] = av_clip_uint8(t >> 19);
}
}
 
static void yuv2planeX_altivec(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
{
int dst_u = -(uintptr_t)dest & 15;
int i;
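/* Process the unaligned head scalar-wise, the 16-byte-aligned middle
 * with AltiVec, and the remaining tail scalar-wise again. */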
 
yuv2planeX_u(filter, filterSize, src, dest, dst_u, dither, offset, 0);
 
for (i = dst_u; i < dstW - 15; i += 16)
yuv2planeX_16_altivec(filter, filterSize, src, dest + i, dither,
offset, i);
 
yuv2planeX_u(filter, filterSize, src, dest, dstW, dither, offset, i);
}
 
#if HAVE_BIGENDIAN
// The 3 in the << 3 below is 2 (filterSize == 4) + 1 (sizeof(short) == 2).
 
// The neat trick: We only care for half the elements,
// high or low depending on (i<<3)%16 (it's 0 or 8 here),
// and we're going to use vec_mule, so we choose
// carefully how to "unpack" the elements into the even slots.
#define GET_VF4(a, vf, f) {\
vf = vec_ld(a<< 3, f);\
if ((a << 3) % 16)\
vf = vec_mergel(vf, (vector signed short)vzero);\
else\
vf = vec_mergeh(vf, (vector signed short)vzero);\
}
#define FIRST_LOAD(sv, pos, s, per) {\
sv = vec_ld(pos, s);\
per = vec_lvsl(pos, s);\
}
#define UPDATE_PTR(s0, d0, s1, d1) {\
d0 = s0;\
d1 = s1;\
}
#define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
v1 = vec_ld(pos + a + 16, s);\
vf = vec_perm(v0, v1, per);\
}
#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) {\
if ((((uintptr_t)s + pos) % 16) > 8) {\
v1 = vec_ld(pos + a + 16, s);\
}\
vf = vec_perm(v0, v1, per);\
}
#define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
vf1 = vec_ld((a * 2 * filterSize) + (b * 2) + 16 + off, f);\
vf = vec_perm(vf0, vf1, per);\
}
#else /* else of #if HAVE_BIGENDIAN */
#define GET_VF4(a, vf, f) {\
vf = (vector signed short)vec_vsx_ld(a << 3, f);\
vf = vec_mergeh(vf, (vector signed short)vzero);\
}
#define FIRST_LOAD(sv, pos, s, per) {}
#define UPDATE_PTR(s0, d0, s1, d1) {}
#define LOAD_SRCV(pos, a, s, per, v0, v1, vf) {\
vf = vec_vsx_ld(pos + a, s);\
}
#define LOAD_SRCV8(pos, a, s, per, v0, v1, vf) LOAD_SRCV(pos, a, s, per, v0, v1, vf)
#define GET_VFD(a, b, f, vf0, vf1, per, vf, off) {\
vf = vec_vsx_ld((a * 2 * filterSize) + (b * 2) + off, f);\
}
#endif /* end of #if HAVE_BIGENDIAN */
 
static void hScale_altivec_real(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize)
{
register int i;
LOCAL_ALIGNED(16, int, tempo, [4]);
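/* Scalar fallback when filterSize is not a multiple of 4; dedicated
 * vector paths for 4-, 8- and 16-tap filters, and a generic strip-mined
 * path for the rest. */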
 
if (filterSize % 4) {
for (i = 0; i < dstW; i++) {
register int j;
register int srcPos = filterPos[i];
register int val = 0;
for (j = 0; j < filterSize; j++)
val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
}
} else
switch (filterSize) {
case 4:
for (i = 0; i < dstW; i++) {
register int srcPos = filterPos[i];
 
vector unsigned char src_vF = unaligned_load(srcPos, src);
vector signed short src_v, filter_v;
vector signed int val_vEven, val_s;
src_v = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
// now put our elements in the even slots
src_v = vec_mergeh(src_v, (vector signed short)vzero);
GET_VF4(i, filter_v, filter);
val_vEven = vec_mule(src_v, filter_v);
val_s = vec_sums(val_vEven, vzero);
vec_st(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
break;
case 8:
for (i = 0; i < dstW; i++) {
register int srcPos = filterPos[i];
vector unsigned char src_vF, src_v0, src_v1;
vector unsigned char permS;
vector signed short src_v, filter_v;
vector signed int val_v, val_s;
FIRST_LOAD(src_v0, srcPos, src, permS);
LOAD_SRCV8(srcPos, 0, src, permS, src_v0, src_v1, src_vF);
src_v = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
filter_v = vec_ld(i << 4, filter);
val_v = vec_msums(src_v, filter_v, (vector signed int)vzero);
val_s = vec_sums(val_v, vzero);
vec_st(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
break;
 
case 16:
for (i = 0; i < dstW; i++) {
register int srcPos = filterPos[i];
 
vector unsigned char src_vF = unaligned_load(srcPos, src);
vector signed short src_vA = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
vector signed short src_vB = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
vector signed short filter_v0 = vec_ld(i << 5, filter);
vector signed short filter_v1 = vec_ld((i << 5) + 16, filter);
 
vector signed int val_acc = vec_msums(src_vA, filter_v0, (vector signed int)vzero);
vector signed int val_v = vec_msums(src_vB, filter_v1, val_acc);
 
vector signed int val_s = vec_sums(val_v, vzero);
 
VEC_ST(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
break;
 
default:
for (i = 0; i < dstW; i++) {
register int j, offset = i * 2 * filterSize;
register int srcPos = filterPos[i];
 
vector signed int val_s, val_v = (vector signed int)vzero;
vector signed short filter_v0R;
vector unsigned char permF, src_v0, permS;
FIRST_LOAD(filter_v0R, offset, filter, permF);
FIRST_LOAD(src_v0, srcPos, src, permS);
 
for (j = 0; j < filterSize - 15; j += 16) {
vector unsigned char src_v1, src_vF;
vector signed short filter_v1R, filter_v2R, filter_v0, filter_v1;
LOAD_SRCV(srcPos, j, src, permS, src_v0, src_v1, src_vF);
vector signed short src_vA = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
vector signed short src_vB = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEL((vector unsigned char)vzero, src_vF));
GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v0, 0);
GET_VFD(i, j, filter, filter_v1R, filter_v2R, permF, filter_v1, 16);
 
vector signed int val_acc = vec_msums(src_vA, filter_v0, val_v);
val_v = vec_msums(src_vB, filter_v1, val_acc);
UPDATE_PTR(filter_v2R, filter_v0R, src_v1, src_v0);
}
 
if (j < filterSize - 7) {
// loading src_v0 is useless, it's already done above
vector unsigned char src_v1, src_vF;
vector signed short src_v, filter_v1R, filter_v;
LOAD_SRCV8(srcPos, j, src, permS, src_v0, src_v1, src_vF);
src_v = // vec_unpackh sign-extends...
(vector signed short)(VEC_MERGEH((vector unsigned char)vzero, src_vF));
GET_VFD(i, j, filter, filter_v0R, filter_v1R, permF, filter_v, 0);
val_v = vec_msums(src_v, filter_v, val_v);
}
val_s = vec_sums(val_v, vzero);
 
VEC_ST(val_s, 0, tempo);
dst[i] = FFMIN(tempo[3] >> 7, (1 << 15) - 1);
}
}
}
#endif /* HAVE_ALTIVEC */
 
av_cold void ff_sws_init_swscale_ppc(SwsContext *c)
{
#if HAVE_ALTIVEC
enum AVPixelFormat dstFormat = c->dstFormat;
 
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;
 
if (c->srcBpc == 8 && c->dstBpc <= 14) {
c->hyScale = c->hcScale = hScale_altivec_real;
}
if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) &&
dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21 &&
!c->alpPixBuf) {
c->yuv2planeX = yuv2planeX_altivec;
}
 
/* The following list of supported dstFormat values should
* match what's found in the body of ff_yuv2packedX_altivec() */
if (!(c->flags & (SWS_BITEXACT | SWS_FULL_CHR_H_INT)) && !c->alpPixBuf) {
switch (c->dstFormat) {
case AV_PIX_FMT_ABGR:
c->yuv2packedX = ff_yuv2abgr_X_altivec;
break;
case AV_PIX_FMT_BGRA:
c->yuv2packedX = ff_yuv2bgra_X_altivec;
break;
case AV_PIX_FMT_ARGB:
c->yuv2packedX = ff_yuv2argb_X_altivec;
break;
case AV_PIX_FMT_RGBA:
c->yuv2packedX = ff_yuv2rgba_X_altivec;
break;
case AV_PIX_FMT_BGR24:
c->yuv2packedX = ff_yuv2bgr24_X_altivec;
break;
case AV_PIX_FMT_RGB24:
c->yuv2packedX = ff_yuv2rgb24_X_altivec;
break;
}
}
#endif /* HAVE_ALTIVEC */
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/ppc/yuv2rgb_altivec.c
0,0 → 1,873
/*
* AltiVec acceleration for colorspace conversion
*
* copyright (C) 2004 Marc Hoffman <marc.hoffman@analog.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
/*
 * Convert I420/YV12 to RGB in various formats.
 * It rejects images that are not in a 4:2:0 format,
 * images whose width is not a multiple of 16,
 * and images whose height is not a multiple of 2.
 * Rejected images defer to the C simulation code.
*
* Lots of optimizations to be done here.
*
* 1. Need to fix saturation code. I just couldn't get it to fly with packs
* and adds, so we currently use max/min to clip.
*
* 2. The inefficient use of chroma loading needs a bit of brushing up.
*
* 3. Analysis of pipeline stalls needs to be done. Use shark to identify
* pipeline stalls.
*
*
* MODIFIED to calculate coeffs from currently selected color space.
* MODIFIED core to be a macro where you specify the output format.
 * ADDED UYVY conversion which is never called due to something in swscale.
 * CORRECTED algorithm selection to be strict on input formats.
* ADDED runtime detection of AltiVec.
*
 * ADDED altivec_yuv2packedX vertical scaler + RGB converter
*
* March 27,2004
* PERFORMANCE ANALYSIS
*
 * The C version uses 25% of the processor, or ~250 MIPS, for the D1
 * rawvideo used as the test.
 * The AltiVec version uses 10% of the processor, or ~100 MIPS, for the
 * same sequence.
 *
 * 720 * 480 * 30 ~ 10 Mpixels/s,
 *
 * so we have roughly 10 clocks per pixel. This is too high; something
 * has to be wrong.
*
* OPTIMIZED clip codes to utilize vec_max and vec_packs removing the
* need for vec_min.
*
 * OPTIMIZED DST OUTPUT cache/DMA controls. We are pretty much guaranteed
 * to have the input video frame in cache: it was just decompressed, so it
 * probably resides in the L1 caches. However, we are creating the output
 * video stream, which needs to use the DSTST instruction to optimize for
 * the cache. We couple this with the fact that we are not going to visit
 * the input buffer again, so we mark it Least Recently Used. This shaves
 * 25% off the processor cycles.
 *
 * Now memcpy is the largest MIPS consumer in the system, probably due
 * to the inefficient X11 stuff.
 *
 * GL libraries seem to be very slow on this machine, a 1.33 GHz PB running
 * Jaguar; this is not the case for my 1 GHz PB. I thought it might be
 * a versioning issue, however I have libGL.1.2.dylib for both
 * machines. (We need to figure this out now.)
*
* GL2 libraries work now with patch for RGB32.
*
* NOTE: quartz vo driver ARGB32_to_RGB24 consumes 30% of the processor.
*
* Integrated luma prescaling adjustment for saturation/contrast/brightness
* adjustment.
*/
 
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <assert.h>
 
#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
#include "yuv2rgb_altivec.h"
 
#if HAVE_ALTIVEC
 
#undef PROFILE_THE_BEAST
#undef INC_SCALING
 
typedef unsigned char ubyte;
typedef signed char sbyte;
 
/* RGB interleaver: 16 planar pels, 8-bit samples per channel, held in
 * homogeneous vector registers x0, x1, x2, are interleaved with the
 * following technique:
*
* o0 = vec_mergeh(x0, x1);
* o1 = vec_perm(o0, x2, perm_rgb_0);
* o2 = vec_perm(o0, x2, perm_rgb_1);
* o3 = vec_mergel(x0, x1);
* o4 = vec_perm(o3, o2, perm_rgb_2);
* o5 = vec_perm(o3, o2, perm_rgb_3);
*
* perm_rgb_0: o0(RG).h v1(B) --> o1*
* 0 1 2 3 4
* rgbr|gbrg|brgb|rgbr
* 0010 0100 1001 0010
* 0102 3145 2673 894A
*
* perm_rgb_1: o0(RG).h v1(B) --> o2
* 0 1 2 3 4
* gbrg|brgb|bbbb|bbbb
* 0100 1001 1111 1111
* B5CD 6EF7 89AB CDEF
*
* perm_rgb_2: o3(RG).l o2(rgbB.l) --> o4*
* 0 1 2 3 4
* gbrg|brgb|rgbr|gbrg
* 1111 1111 0010 0100
* 89AB CDEF 0182 3945
*
* perm_rgb_2: o3(RG).l o2(rgbB.l) ---> o5*
* 0 1 2 3 4
* brgb|rgbr|gbrg|brgb
* 1001 0010 0100 1001
* a67b 89cA BdCD eEFf
*
*/
static const vector unsigned char
perm_rgb_0 = { 0x00, 0x01, 0x10, 0x02, 0x03, 0x11, 0x04, 0x05,
0x12, 0x06, 0x07, 0x13, 0x08, 0x09, 0x14, 0x0a },
perm_rgb_1 = { 0x0b, 0x15, 0x0c, 0x0d, 0x16, 0x0e, 0x0f, 0x17,
0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f },
perm_rgb_2 = { 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,
0x00, 0x01, 0x18, 0x02, 0x03, 0x19, 0x04, 0x05 },
perm_rgb_3 = { 0x1a, 0x06, 0x07, 0x1b, 0x08, 0x09, 0x1c, 0x0a,
0x0b, 0x1d, 0x0c, 0x0d, 0x1e, 0x0e, 0x0f, 0x1f };
 
#define vec_merge3(x2, x1, x0, y0, y1, y2) \
do { \
__typeof__(x0) o0, o2, o3; \
o0 = vec_mergeh(x0, x1); \
y0 = vec_perm(o0, x2, perm_rgb_0); \
o2 = vec_perm(o0, x2, perm_rgb_1); \
o3 = vec_mergel(x0, x1); \
y1 = vec_perm(o3, o2, perm_rgb_2); \
y2 = vec_perm(o3, o2, perm_rgb_3); \
} while (0)
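 
/*
 * Net effect: with the diagram's x0, x1, x2 holding the three channel
 * planes, the 48 channel bytes come out as the packed sequence
 * x0[0] x1[0] x2[0] x0[1] x1[1] x2[1] ... x0[15] x1[15] x2[15],
 * i.e. 16 interleaved 3-byte pixels across the three output vectors.
 */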
 
#define vec_mstbgr24(x0, x1, x2, ptr) \
do { \
__typeof__(x0) _0, _1, _2; \
vec_merge3(x0, x1, x2, _0, _1, _2); \
vec_st(_0, 0, ptr++); \
vec_st(_1, 0, ptr++); \
vec_st(_2, 0, ptr++); \
} while (0)
 
#define vec_mstrgb24(x0, x1, x2, ptr) \
do { \
__typeof__(x0) _0, _1, _2; \
vec_merge3(x2, x1, x0, _0, _1, _2); \
vec_st(_0, 0, ptr++); \
vec_st(_1, 0, ptr++); \
vec_st(_2, 0, ptr++); \
} while (0)
 
/* pack the pixels in rgb0 format
* msb R
* lsb 0
*/
#define vec_mstrgb32(T, x0, x1, x2, x3, ptr) \
do { \
T _0, _1, _2, _3; \
_0 = vec_mergeh(x0, x1); \
_1 = vec_mergeh(x2, x3); \
_2 = (T) vec_mergeh((vector unsigned short) _0, \
(vector unsigned short) _1); \
_3 = (T) vec_mergel((vector unsigned short) _0, \
(vector unsigned short) _1); \
vec_st(_2, 0 * 16, (T *) ptr); \
vec_st(_3, 1 * 16, (T *) ptr); \
_0 = vec_mergel(x0, x1); \
_1 = vec_mergel(x2, x3); \
_2 = (T) vec_mergeh((vector unsigned short) _0, \
(vector unsigned short) _1); \
_3 = (T) vec_mergel((vector unsigned short) _0, \
(vector unsigned short) _1); \
vec_st(_2, 2 * 16, (T *) ptr); \
vec_st(_3, 3 * 16, (T *) ptr); \
ptr += 4; \
} while (0)
 
/*
* 1 0 1.4021 | | Y |
* 1 -0.3441 -0.7142 |x| Cb|
* 1 1.7718 0 | | Cr|
*
*
* Y: [-128 127]
* Cb/Cr : [-128 127]
*
 * Typical YUV conversion works on Y in 0-255; this version has been
 * optimized for JPEG decoding.
*/
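 
/*
 * A scalar sketch of the fixed-point math vectorized in cvtyuvtoRGB
 * below, where mradds(a, b, c) stands for vec_mradds, i.e. saturated
 * ((a * b + 0x4000) >> 15) + c:
 *
 *   Y' = mradds(Y, CY, OY);                 // luma scale + offset
 *   R  = mradds((V - 128) << CSHIFT, CRV, Y');
 *   B  = mradds((U - 128) << CSHIFT, CBU, Y');
 *   G  = mradds(V - 128, CGV, mradds(U - 128, CGU, Y'));
 */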
 
#if HAVE_BIGENDIAN
#define vec_unh(x) \
(vector signed short) \
vec_perm(x, (__typeof__(x)) { 0 }, \
((vector unsigned char) { \
0x10, 0x00, 0x10, 0x01, 0x10, 0x02, 0x10, 0x03, \
0x10, 0x04, 0x10, 0x05, 0x10, 0x06, 0x10, 0x07 }))
 
#define vec_unl(x) \
(vector signed short) \
vec_perm(x, (__typeof__(x)) { 0 }, \
((vector unsigned char) { \
0x10, 0x08, 0x10, 0x09, 0x10, 0x0A, 0x10, 0x0B, \
0x10, 0x0C, 0x10, 0x0D, 0x10, 0x0E, 0x10, 0x0F }))
#else
#define vec_unh(x)(vector signed short) vec_mergeh(x,(__typeof__(x)) { 0 })
#define vec_unl(x)(vector signed short) vec_mergel(x,(__typeof__(x)) { 0 })
#endif
 
#define vec_clip_s16(x) \
vec_max(vec_min(x, ((vector signed short) { \
235, 235, 235, 235, 235, 235, 235, 235 })), \
((vector signed short) { 16, 16, 16, 16, 16, 16, 16, 16 }))
 
#define vec_packclp(x, y) \
(vector unsigned char) \
vec_packs((vector unsigned short) \
vec_max(x, ((vector signed short) { 0 })), \
(vector unsigned short) \
vec_max(y, ((vector signed short) { 0 })))
 
static inline void cvtyuvtoRGB(SwsContext *c, vector signed short Y,
vector signed short U, vector signed short V,
vector signed short *R, vector signed short *G,
vector signed short *B)
{
vector signed short vx, ux, uvx;
 
Y = vec_mradds(Y, c->CY, c->OY);
U = vec_sub(U, (vector signed short)
vec_splat((vector signed short) { 128 }, 0));
V = vec_sub(V, (vector signed short)
vec_splat((vector signed short) { 128 }, 0));
 
// ux = (CBU * (u << c->CSHIFT) + 0x4000) >> 15;
ux = vec_sl(U, c->CSHIFT);
*B = vec_mradds(ux, c->CBU, Y);
 
// vx = (CRV * (v << c->CSHIFT) + 0x4000) >> 15;
vx = vec_sl(V, c->CSHIFT);
*R = vec_mradds(vx, c->CRV, Y);
 
// uvx = ((CGU * u) + (CGV * v)) >> 15;
uvx = vec_mradds(U, c->CGU, Y);
*G = vec_mradds(V, c->CGV, uvx);
}
 
/*
* ------------------------------------------------------------------------------
* CS converters
* ------------------------------------------------------------------------------
*/
 
#define DEFCSP420_CVT(name, out_pixels) \
static int altivec_ ## name(SwsContext *c, const unsigned char **in, \
int *instrides, int srcSliceY, int srcSliceH, \
unsigned char **oplanes, int *outstrides) \
{ \
int w = c->srcW; \
int h = srcSliceH; \
int i, j; \
int instrides_scl[3]; \
vector unsigned char y0, y1; \
\
vector signed char u, v; \
\
vector signed short Y0, Y1, Y2, Y3; \
vector signed short U, V; \
vector signed short vx, ux, uvx; \
vector signed short vx0, ux0, uvx0; \
vector signed short vx1, ux1, uvx1; \
vector signed short R0, G0, B0; \
vector signed short R1, G1, B1; \
vector unsigned char R, G, B; \
\
const vector unsigned char *y1ivP, *y2ivP, *uivP, *vivP; \
vector unsigned char align_perm; \
\
vector signed short lCY = c->CY; \
vector signed short lOY = c->OY; \
vector signed short lCRV = c->CRV; \
vector signed short lCBU = c->CBU; \
vector signed short lCGU = c->CGU; \
vector signed short lCGV = c->CGV; \
vector unsigned short lCSHIFT = c->CSHIFT; \
\
const ubyte *y1i = in[0]; \
const ubyte *y2i = in[0] + instrides[0]; \
const ubyte *ui = in[1]; \
const ubyte *vi = in[2]; \
\
vector unsigned char *oute, *outo; \
\
/* loop moves y{1, 2}i by w */ \
instrides_scl[0] = instrides[0] * 2 - w; \
/* loop moves ui by w / 2 */ \
instrides_scl[1] = instrides[1] - w / 2; \
/* loop moves vi by w / 2 */ \
instrides_scl[2] = instrides[2] - w / 2; \
\
for (i = 0; i < h / 2; i++) { \
oute = (vector unsigned char *)(oplanes[0] + outstrides[0] * \
(srcSliceY + i * 2)); \
outo = oute + (outstrides[0] >> 4); \
vec_dstst(outo, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 0); \
vec_dstst(oute, (0x02000002 | (((w * 3 + 32) / 32) << 16)), 1); \
\
for (j = 0; j < w / 16; j++) { \
y1ivP = (const vector unsigned char *) y1i; \
y2ivP = (const vector unsigned char *) y2i; \
uivP = (const vector unsigned char *) ui; \
vivP = (const vector unsigned char *) vi; \
\
align_perm = vec_lvsl(0, y1i); \
y0 = (vector unsigned char) \
vec_perm(y1ivP[0], y1ivP[1], align_perm); \
\
align_perm = vec_lvsl(0, y2i); \
y1 = (vector unsigned char) \
vec_perm(y2ivP[0], y2ivP[1], align_perm); \
\
align_perm = vec_lvsl(0, ui); \
u = (vector signed char) \
vec_perm(uivP[0], uivP[1], align_perm); \
\
align_perm = vec_lvsl(0, vi); \
v = (vector signed char) \
vec_perm(vivP[0], vivP[1], align_perm); \
\
u = (vector signed char) \
vec_sub(u, \
(vector signed char) \
vec_splat((vector signed char) { 128 }, 0)); \
v = (vector signed char) \
vec_sub(v, \
(vector signed char) \
vec_splat((vector signed char) { 128 }, 0)); \
\
U = vec_unpackh(u); \
V = vec_unpackh(v); \
\
Y0 = vec_unh(y0); \
Y1 = vec_unl(y0); \
Y2 = vec_unh(y1); \
Y3 = vec_unl(y1); \
\
Y0 = vec_mradds(Y0, lCY, lOY); \
Y1 = vec_mradds(Y1, lCY, lOY); \
Y2 = vec_mradds(Y2, lCY, lOY); \
Y3 = vec_mradds(Y3, lCY, lOY); \
\
/* ux = (CBU * (u << CSHIFT) + 0x4000) >> 15 */ \
ux = vec_sl(U, lCSHIFT); \
ux = vec_mradds(ux, lCBU, (vector signed short) { 0 }); \
ux0 = vec_mergeh(ux, ux); \
ux1 = vec_mergel(ux, ux); \
\
/* vx = (CRV * (v << CSHIFT) + 0x4000) >> 15; */ \
vx = vec_sl(V, lCSHIFT); \
vx = vec_mradds(vx, lCRV, (vector signed short) { 0 }); \
vx0 = vec_mergeh(vx, vx); \
vx1 = vec_mergel(vx, vx); \
\
/* uvx = ((CGU * u) + (CGV * v)) >> 15 */ \
uvx = vec_mradds(U, lCGU, (vector signed short) { 0 }); \
uvx = vec_mradds(V, lCGV, uvx); \
uvx0 = vec_mergeh(uvx, uvx); \
uvx1 = vec_mergel(uvx, uvx); \
\
R0 = vec_add(Y0, vx0); \
G0 = vec_add(Y0, uvx0); \
B0 = vec_add(Y0, ux0); \
R1 = vec_add(Y1, vx1); \
G1 = vec_add(Y1, uvx1); \
B1 = vec_add(Y1, ux1); \
\
R = vec_packclp(R0, R1); \
G = vec_packclp(G0, G1); \
B = vec_packclp(B0, B1); \
\
out_pixels(R, G, B, oute); \
\
R0 = vec_add(Y2, vx0); \
G0 = vec_add(Y2, uvx0); \
B0 = vec_add(Y2, ux0); \
R1 = vec_add(Y3, vx1); \
G1 = vec_add(Y3, uvx1); \
B1 = vec_add(Y3, ux1); \
R = vec_packclp(R0, R1); \
G = vec_packclp(G0, G1); \
B = vec_packclp(B0, B1); \
\
\
out_pixels(R, G, B, outo); \
\
y1i += 16; \
y2i += 16; \
ui += 8; \
vi += 8; \
} \
\
ui += instrides_scl[1]; \
vi += instrides_scl[2]; \
y1i += instrides_scl[0]; \
y2i += instrides_scl[0]; \
} \
return srcSliceH; \
}
 
#define out_abgr(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), ((__typeof__(a)) { 255 }), c, b, a, ptr)
#define out_bgra(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), c, b, a, ((__typeof__(a)) { 255 }), ptr)
#define out_rgba(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), a, b, c, ((__typeof__(a)) { 255 }), ptr)
#define out_argb(a, b, c, ptr) \
vec_mstrgb32(__typeof__(a), ((__typeof__(a)) { 255 }), a, b, c, ptr)
#define out_rgb24(a, b, c, ptr) vec_mstrgb24(a, b, c, ptr)
#define out_bgr24(a, b, c, ptr) vec_mstbgr24(a, b, c, ptr)
 
DEFCSP420_CVT(yuv2_abgr, out_abgr)
DEFCSP420_CVT(yuv2_bgra, out_bgra)
DEFCSP420_CVT(yuv2_rgba, out_rgba)
DEFCSP420_CVT(yuv2_argb, out_argb)
DEFCSP420_CVT(yuv2_rgb24, out_rgb24)
DEFCSP420_CVT(yuv2_bgr24, out_bgr24)
 
// uyvy|uyvy|uyvy|uyvy
// 0123 4567 89ab cdef
static const vector unsigned char
demux_u = { 0x10, 0x00, 0x10, 0x00,
0x10, 0x04, 0x10, 0x04,
0x10, 0x08, 0x10, 0x08,
0x10, 0x0c, 0x10, 0x0c },
demux_v = { 0x10, 0x02, 0x10, 0x02,
0x10, 0x06, 0x10, 0x06,
0x10, 0x0A, 0x10, 0x0A,
0x10, 0x0E, 0x10, 0x0E },
demux_y = { 0x10, 0x01, 0x10, 0x03,
0x10, 0x05, 0x10, 0x07,
0x10, 0x09, 0x10, 0x0B,
0x10, 0x0D, 0x10, 0x0F };
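 
/*
 * In a vec_perm pattern, indices 0x10-0x1f select bytes from the second
 * operand (the zero vector here), so each 16-bit result is a zero high
 * byte over the chosen U, V or Y sample byte: a branch-free unpack of
 * packed UYVY into three vectors of shorts.
 */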
 
/*
* this is so I can play live CCIR raw video
*/
static int altivec_uyvy_rgb32(SwsContext *c, const unsigned char **in,
int *instrides, int srcSliceY, int srcSliceH,
unsigned char **oplanes, int *outstrides)
{
int w = c->srcW;
int h = srcSliceH;
int i, j;
vector unsigned char uyvy;
vector signed short Y, U, V;
vector signed short R0, G0, B0, R1, G1, B1;
vector unsigned char R, G, B;
vector unsigned char *out;
const ubyte *img;
 
img = in[0];
out = (vector unsigned char *) (oplanes[0] + srcSliceY * outstrides[0]);
 
for (i = 0; i < h; i++)
for (j = 0; j < w / 16; j++) {
uyvy = vec_ld(0, img);
 
U = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
V = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
Y = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
 
cvtyuvtoRGB(c, Y, U, V, &R0, &G0, &B0);
 
uyvy = vec_ld(16, img);
 
U = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_u);
V = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_v);
Y = (vector signed short)
vec_perm(uyvy, (vector unsigned char) { 0 }, demux_y);
 
cvtyuvtoRGB(c, Y, U, V, &R1, &G1, &B1);
 
R = vec_packclp(R0, R1);
G = vec_packclp(G0, G1);
B = vec_packclp(B0, B1);
 
// vec_mstbgr24 (R,G,B, out);
out_rgba(R, G, B, out);
 
img += 32;
}
return srcSliceH;
}
 
#endif /* HAVE_ALTIVEC */
 
/* OK, currently the acceleration routine only supports
 * inputs whose width is a multiple of 16
 * and whose height is a multiple of 2.
 *
 * So we just fall back to the C code for everything else.
*/
av_cold SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c)
{
#if HAVE_ALTIVEC
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return NULL;
 
    /*
     * This seems not to matter too much: I tried a bunch of videos
     * with abnormal widths, and MPlayer crashed elsewhere, e.g.
     *   mplayer -vo x11 -rawvideo on:w=350:h=240 raw-350x240.eyuv
     * goes boom with an X11 bad match.
     */
if ((c->srcW & 0xf) != 0)
return NULL;
 
switch (c->srcFormat) {
case AV_PIX_FMT_YUV410P:
case AV_PIX_FMT_YUV420P:
/*case IMGFMT_CLPL: ??? */
case AV_PIX_FMT_GRAY8:
case AV_PIX_FMT_NV12:
case AV_PIX_FMT_NV21:
if ((c->srcH & 0x1) != 0)
return NULL;
 
switch (c->dstFormat) {
case AV_PIX_FMT_RGB24:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGB24\n");
return altivec_yuv2_rgb24;
case AV_PIX_FMT_BGR24:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGR24\n");
return altivec_yuv2_bgr24;
case AV_PIX_FMT_ARGB:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ARGB\n");
return altivec_yuv2_argb;
case AV_PIX_FMT_ABGR:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space ABGR\n");
return altivec_yuv2_abgr;
case AV_PIX_FMT_RGBA:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space RGBA\n");
return altivec_yuv2_rgba;
case AV_PIX_FMT_BGRA:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space BGRA\n");
return altivec_yuv2_bgra;
default: return NULL;
}
break;
 
case AV_PIX_FMT_UYVY422:
switch (c->dstFormat) {
case AV_PIX_FMT_BGR32:
av_log(c, AV_LOG_WARNING, "ALTIVEC: Color Space UYVY -> RGB32\n");
return altivec_uyvy_rgb32;
default: return NULL;
}
break;
}
#endif /* HAVE_ALTIVEC */
 
return NULL;
}
 
av_cold void ff_yuv2rgb_init_tables_ppc(SwsContext *c,
const int inv_table[4],
int brightness,
int contrast,
int saturation)
{
#if HAVE_ALTIVEC
union {
DECLARE_ALIGNED(16, signed short, tmp)[8];
vector signed short vec;
} buf;
 
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;
 
buf.tmp[0] = ((0xffffLL) * contrast >> 8) >> 9; // cy
buf.tmp[1] = -256 * brightness; // oy
buf.tmp[2] = (inv_table[0] >> 3) * (contrast >> 16) * (saturation >> 16); // crv
buf.tmp[3] = (inv_table[1] >> 3) * (contrast >> 16) * (saturation >> 16); // cbu
buf.tmp[4] = -((inv_table[2] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgu
buf.tmp[5] = -((inv_table[3] >> 1) * (contrast >> 16) * (saturation >> 16)); // cgv
 
c->CSHIFT = (vector unsigned short) vec_splat_u16(2);
c->CY = vec_splat((vector signed short) buf.vec, 0);
c->OY = vec_splat((vector signed short) buf.vec, 1);
c->CRV = vec_splat((vector signed short) buf.vec, 2);
c->CBU = vec_splat((vector signed short) buf.vec, 3);
c->CGU = vec_splat((vector signed short) buf.vec, 4);
c->CGV = vec_splat((vector signed short) buf.vec, 5);
return;
#endif /* HAVE_ALTIVEC */
}
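 
/*
 * A worked example for the defaults (brightness == 0, contrast ==
 * saturation == 1 << 16): cy becomes 0x7fff, i.e. ~1.0 in the Q15 form
 * vec_mradds consumes, and oy becomes 0. crv/cbu are the Q16 inv_table
 * entries reduced to Q13 (>> 3); the << CSHIFT (two bits) applied in
 * cvtyuvtoRGB brings them back to Q15. cgu/cgv are reduced only to Q15
 * (>> 1) and are used without the shift.
 */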
 
#if HAVE_ALTIVEC
 
static av_always_inline void yuv2packedX_altivec(SwsContext *c,
const int16_t *lumFilter,
const int16_t **lumSrc,
int lumFilterSize,
const int16_t *chrFilter,
const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize,
const int16_t **alpSrc,
uint8_t *dest,
int dstW, int dstY,
enum AVPixelFormat target)
{
int i, j;
vector signed short X, X0, X1, Y0, U0, V0, Y1, U1, V1, U, V;
vector signed short R0, G0, B0, R1, G1, B1;
 
vector unsigned char R, G, B;
vector unsigned char *out, *nout;
 
vector signed short RND = vec_splat_s16(1 << 3);
vector unsigned short SCL = vec_splat_u16(4);
DECLARE_ALIGNED(16, unsigned int, scratch)[16];
 
vector signed short *YCoeffs, *CCoeffs;
 
YCoeffs = c->vYCoeffsBank + dstY * lumFilterSize;
CCoeffs = c->vCCoeffsBank + dstY * chrFilterSize;
 
out = (vector unsigned char *) dest;
 
for (i = 0; i < dstW; i += 16) {
Y0 = RND;
Y1 = RND;
/* extract 16 coeffs from lumSrc */
for (j = 0; j < lumFilterSize; j++) {
X0 = vec_ld(0, &lumSrc[j][i]);
X1 = vec_ld(16, &lumSrc[j][i]);
Y0 = vec_mradds(X0, YCoeffs[j], Y0);
Y1 = vec_mradds(X1, YCoeffs[j], Y1);
}
 
U = RND;
V = RND;
/* extract 8 coeffs from U,V */
for (j = 0; j < chrFilterSize; j++) {
X = vec_ld(0, &chrUSrc[j][i / 2]);
U = vec_mradds(X, CCoeffs[j], U);
X = vec_ld(0, &chrVSrc[j][i / 2]);
V = vec_mradds(X, CCoeffs[j], V);
}
 
/* scale and clip signals */
Y0 = vec_sra(Y0, SCL);
Y1 = vec_sra(Y1, SCL);
U = vec_sra(U, SCL);
V = vec_sra(V, SCL);
 
Y0 = vec_clip_s16(Y0);
Y1 = vec_clip_s16(Y1);
U = vec_clip_s16(U);
V = vec_clip_s16(V);
 
/* now we have
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
*
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
* V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
*/
 
U0 = vec_mergeh(U, U);
V0 = vec_mergeh(V, V);
 
U1 = vec_mergel(U, U);
V1 = vec_mergel(V, V);
 
cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
 
R = vec_packclp(R0, R1);
G = vec_packclp(G0, G1);
B = vec_packclp(B0, B1);
 
switch (target) {
case AV_PIX_FMT_ABGR:
out_abgr(R, G, B, out);
break;
case AV_PIX_FMT_BGRA:
out_bgra(R, G, B, out);
break;
case AV_PIX_FMT_RGBA:
out_rgba(R, G, B, out);
break;
case AV_PIX_FMT_ARGB:
out_argb(R, G, B, out);
break;
case AV_PIX_FMT_RGB24:
out_rgb24(R, G, B, out);
break;
case AV_PIX_FMT_BGR24:
out_bgr24(R, G, B, out);
break;
default:
{
/* If this is reached, the caller should have called yuv2packedXinC
* instead. */
static int printed_error_message;
if (!printed_error_message) {
av_log(c, AV_LOG_ERROR,
"altivec_yuv2packedX doesn't support %s output\n",
av_get_pix_fmt_name(c->dstFormat));
printed_error_message = 1;
}
return;
}
}
}
 
if (i < dstW) {
i -= 16;
 
Y0 = RND;
Y1 = RND;
/* extract 16 coeffs from lumSrc */
for (j = 0; j < lumFilterSize; j++) {
X0 = vec_ld(0, &lumSrc[j][i]);
X1 = vec_ld(16, &lumSrc[j][i]);
Y0 = vec_mradds(X0, YCoeffs[j], Y0);
Y1 = vec_mradds(X1, YCoeffs[j], Y1);
}
 
U = RND;
V = RND;
/* extract 8 coeffs from U,V */
for (j = 0; j < chrFilterSize; j++) {
X = vec_ld(0, &chrUSrc[j][i / 2]);
U = vec_mradds(X, CCoeffs[j], U);
X = vec_ld(0, &chrVSrc[j][i / 2]);
V = vec_mradds(X, CCoeffs[j], V);
}
 
/* scale and clip signals */
Y0 = vec_sra(Y0, SCL);
Y1 = vec_sra(Y1, SCL);
U = vec_sra(U, SCL);
V = vec_sra(V, SCL);
 
Y0 = vec_clip_s16(Y0);
Y1 = vec_clip_s16(Y1);
U = vec_clip_s16(U);
V = vec_clip_s16(V);
 
/* now we have
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U = u0 u1 u2 u3 u4 u5 u6 u7 V = v0 v1 v2 v3 v4 v5 v6 v7
*
* Y0 = y0 y1 y2 y3 y4 y5 y6 y7 Y1 = y8 y9 y10 y11 y12 y13 y14 y15
* U0 = u0 u0 u1 u1 u2 u2 u3 u3 U1 = u4 u4 u5 u5 u6 u6 u7 u7
* V0 = v0 v0 v1 v1 v2 v2 v3 v3 V1 = v4 v4 v5 v5 v6 v6 v7 v7
*/
 
U0 = vec_mergeh(U, U);
V0 = vec_mergeh(V, V);
 
U1 = vec_mergel(U, U);
V1 = vec_mergel(V, V);
 
cvtyuvtoRGB(c, Y0, U0, V0, &R0, &G0, &B0);
cvtyuvtoRGB(c, Y1, U1, V1, &R1, &G1, &B1);
 
R = vec_packclp(R0, R1);
G = vec_packclp(G0, G1);
B = vec_packclp(B0, B1);
 
nout = (vector unsigned char *) scratch;
switch (target) {
case AV_PIX_FMT_ABGR:
out_abgr(R, G, B, nout);
break;
case AV_PIX_FMT_BGRA:
out_bgra(R, G, B, nout);
break;
case AV_PIX_FMT_RGBA:
out_rgba(R, G, B, nout);
break;
case AV_PIX_FMT_ARGB:
out_argb(R, G, B, nout);
break;
case AV_PIX_FMT_RGB24:
out_rgb24(R, G, B, nout);
break;
case AV_PIX_FMT_BGR24:
out_bgr24(R, G, B, nout);
break;
default:
/* Unreachable, I think. */
av_log(c, AV_LOG_ERROR,
"altivec_yuv2packedX doesn't support %s output\n",
av_get_pix_fmt_name(c->dstFormat));
return;
}
 
        /* Copy the remaining (dstW - i) pixels; 4 bytes each for the
         * 32 bpp targets (the 24 bpp targets would need byte addressing). */
        memcpy(&((uint32_t *) dest)[i], scratch, (dstW - i) * 4);
}
}
 
#define YUV2PACKEDX_WRAPPER(suffix, pixfmt) \
void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, \
const int16_t *lumFilter, \
const int16_t **lumSrc, \
int lumFilterSize, \
const int16_t *chrFilter, \
const int16_t **chrUSrc, \
const int16_t **chrVSrc, \
int chrFilterSize, \
const int16_t **alpSrc, \
uint8_t *dest, int dstW, int dstY) \
{ \
yuv2packedX_altivec(c, lumFilter, lumSrc, lumFilterSize, \
chrFilter, chrUSrc, chrVSrc, \
chrFilterSize, alpSrc, \
dest, dstW, dstY, pixfmt); \
}
 
YUV2PACKEDX_WRAPPER(abgr, AV_PIX_FMT_ABGR);
YUV2PACKEDX_WRAPPER(bgra, AV_PIX_FMT_BGRA);
YUV2PACKEDX_WRAPPER(argb, AV_PIX_FMT_ARGB);
YUV2PACKEDX_WRAPPER(rgba, AV_PIX_FMT_RGBA);
YUV2PACKEDX_WRAPPER(rgb24, AV_PIX_FMT_RGB24);
YUV2PACKEDX_WRAPPER(bgr24, AV_PIX_FMT_BGR24);
 
#endif /* HAVE_ALTIVEC */
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/ppc/yuv2rgb_altivec.h
0,0 → 1,51
/*
* AltiVec-enhanced yuv2yuvX
*
* Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
* based on the equivalent C code in swscale.c
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#ifndef SWSCALE_PPC_YUV2RGB_ALTIVEC_H
#define SWSCALE_PPC_YUV2RGB_ALTIVEC_H
 
#include <stdint.h>
 
#include "libswscale/swscale_internal.h"
 
#define YUV2PACKEDX_HEADER(suffix) \
void ff_yuv2 ## suffix ## _X_altivec(SwsContext *c, \
const int16_t *lumFilter, \
const int16_t **lumSrc, \
int lumFilterSize, \
const int16_t *chrFilter, \
const int16_t **chrUSrc, \
const int16_t **chrVSrc, \
int chrFilterSize, \
const int16_t **alpSrc, \
uint8_t *dest, \
int dstW, int dstY);
 
YUV2PACKEDX_HEADER(abgr);
YUV2PACKEDX_HEADER(bgra);
YUV2PACKEDX_HEADER(argb);
YUV2PACKEDX_HEADER(rgba);
YUV2PACKEDX_HEADER(rgb24);
YUV2PACKEDX_HEADER(bgr24);
 
#endif /* SWSCALE_PPC_YUV2RGB_ALTIVEC_H */
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/ppc/yuv2yuv_altivec.c
0,0 → 1,204
/*
* AltiVec-enhanced yuv-to-yuv conversion routines.
*
* Copyright (C) 2004 Romain Dolbeau <romain@dolbeau.org>
* based on the equivalent C code in swscale.c
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <inttypes.h>
 
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
 
#if HAVE_ALTIVEC
 
static int yv12toyuy2_unscaled_altivec(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY,
int srcSliceH, uint8_t *dstParam[],
int dstStride_a[])
{
uint8_t *dst = dstParam[0] + dstStride_a[0] * srcSliceY;
// yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH,
// srcStride[0], srcStride[1], dstStride[0]);
const uint8_t *ysrc = src[0];
const uint8_t *usrc = src[1];
const uint8_t *vsrc = src[2];
const int width = c->srcW;
const int height = srcSliceH;
const int lumStride = srcStride[0];
const int chromStride = srcStride[1];
const int dstStride = dstStride_a[0];
const vector unsigned char yperm = vec_lvsl(0, ysrc);
const int vertLumPerChroma = 2;
register unsigned int y;
 
/* This code assumes:
*
     * 1) dst is 16-byte aligned
     * 2) dstStride is a multiple of 16
     * 3) width is a multiple of 16
     * 4) luma & chroma strides are multiples of 8
*/
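 
    /* A caller-side sanity check for these assumptions might look like
     * this (illustrative only, not part of the conversion):
     *
     *   av_assert0(!((uintptr_t)dst & 15) && !(dstStride & 15) &&
     *              !(width & 15) && !(lumStride & 7) && !(chromStride & 7));
     */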
 
for (y = 0; y < height; y++) {
int i;
for (i = 0; i < width - 31; i += 32) {
const unsigned int j = i >> 1;
vector unsigned char v_yA = vec_ld(i, ysrc);
vector unsigned char v_yB = vec_ld(i + 16, ysrc);
vector unsigned char v_yC = vec_ld(i + 32, ysrc);
vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
vector unsigned char v_uA = vec_ld(j, usrc);
vector unsigned char v_uB = vec_ld(j + 16, usrc);
vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
vector unsigned char v_vA = vec_ld(j, vsrc);
vector unsigned char v_vB = vec_ld(j + 16, vsrc);
vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
vector unsigned char v_yuy2_2 = vec_mergeh(v_y2, v_uv_b);
vector unsigned char v_yuy2_3 = vec_mergel(v_y2, v_uv_b);
vec_st(v_yuy2_0, (i << 1), dst);
vec_st(v_yuy2_1, (i << 1) + 16, dst);
vec_st(v_yuy2_2, (i << 1) + 32, dst);
vec_st(v_yuy2_3, (i << 1) + 48, dst);
}
if (i < width) {
const unsigned int j = i >> 1;
vector unsigned char v_y1 = vec_ld(i, ysrc);
vector unsigned char v_u = vec_ld(j, usrc);
vector unsigned char v_v = vec_ld(j, vsrc);
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_yuy2_0 = vec_mergeh(v_y1, v_uv_a);
vector unsigned char v_yuy2_1 = vec_mergel(v_y1, v_uv_a);
vec_st(v_yuy2_0, (i << 1), dst);
vec_st(v_yuy2_1, (i << 1) + 16, dst);
}
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
 
return srcSliceH;
}
 
static int yv12touyvy_unscaled_altivec(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY,
int srcSliceH, uint8_t *dstParam[],
int dstStride_a[])
{
uint8_t *dst = dstParam[0] + dstStride_a[0] * srcSliceY;
// yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH,
// srcStride[0], srcStride[1], dstStride[0]);
const uint8_t *ysrc = src[0];
const uint8_t *usrc = src[1];
const uint8_t *vsrc = src[2];
const int width = c->srcW;
const int height = srcSliceH;
const int lumStride = srcStride[0];
const int chromStride = srcStride[1];
const int dstStride = dstStride_a[0];
const int vertLumPerChroma = 2;
const vector unsigned char yperm = vec_lvsl(0, ysrc);
register unsigned int y;
 
/* This code assumes:
*
     * 1) dst is 16-byte aligned
     * 2) dstStride is a multiple of 16
     * 3) width is a multiple of 16
     * 4) luma & chroma strides are multiples of 8
*/
 
for (y = 0; y < height; y++) {
int i;
for (i = 0; i < width - 31; i += 32) {
const unsigned int j = i >> 1;
vector unsigned char v_yA = vec_ld(i, ysrc);
vector unsigned char v_yB = vec_ld(i + 16, ysrc);
vector unsigned char v_yC = vec_ld(i + 32, ysrc);
vector unsigned char v_y1 = vec_perm(v_yA, v_yB, yperm);
vector unsigned char v_y2 = vec_perm(v_yB, v_yC, yperm);
vector unsigned char v_uA = vec_ld(j, usrc);
vector unsigned char v_uB = vec_ld(j + 16, usrc);
vector unsigned char v_u = vec_perm(v_uA, v_uB, vec_lvsl(j, usrc));
vector unsigned char v_vA = vec_ld(j, vsrc);
vector unsigned char v_vB = vec_ld(j + 16, vsrc);
vector unsigned char v_v = vec_perm(v_vA, v_vB, vec_lvsl(j, vsrc));
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_uv_b = vec_mergel(v_u, v_v);
vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
vector unsigned char v_uyvy_2 = vec_mergeh(v_uv_b, v_y2);
vector unsigned char v_uyvy_3 = vec_mergel(v_uv_b, v_y2);
vec_st(v_uyvy_0, (i << 1), dst);
vec_st(v_uyvy_1, (i << 1) + 16, dst);
vec_st(v_uyvy_2, (i << 1) + 32, dst);
vec_st(v_uyvy_3, (i << 1) + 48, dst);
}
if (i < width) {
const unsigned int j = i >> 1;
vector unsigned char v_y1 = vec_ld(i, ysrc);
vector unsigned char v_u = vec_ld(j, usrc);
vector unsigned char v_v = vec_ld(j, vsrc);
vector unsigned char v_uv_a = vec_mergeh(v_u, v_v);
vector unsigned char v_uyvy_0 = vec_mergeh(v_uv_a, v_y1);
vector unsigned char v_uyvy_1 = vec_mergel(v_uv_a, v_y1);
vec_st(v_uyvy_0, (i << 1), dst);
vec_st(v_uyvy_1, (i << 1) + 16, dst);
}
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
return srcSliceH;
}
 
#endif /* HAVE_ALTIVEC */
 
av_cold void ff_get_unscaled_swscale_ppc(SwsContext *c)
{
#if HAVE_ALTIVEC
if (!(av_get_cpu_flags() & AV_CPU_FLAG_ALTIVEC))
return;
 
if (!(c->srcW & 15) && !(c->flags & SWS_BITEXACT) &&
c->srcFormat == AV_PIX_FMT_YUV420P) {
enum AVPixelFormat dstFormat = c->dstFormat;
 
// unscaled YV12 -> packed YUV, we want speed
if (dstFormat == AV_PIX_FMT_YUYV422)
c->swscale = yv12toyuy2_unscaled_altivec;
else if (dstFormat == AV_PIX_FMT_UYVY422)
c->swscale = yv12touyvy_unscaled_altivec;
}
#endif /* HAVE_ALTIVEC */
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/rgb2rgb.c
0,0 → 1,393
/*
* software RGB to RGB converter
 * plus a software PAL8 to RGB converter
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <inttypes.h>
 
#include "libavutil/attributes.h"
#include "libavutil/bswap.h"
#include "config.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
 
void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
 
void (*rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size);
void (*rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size);
 
void (*shuffle_bytes_0321)(const uint8_t *src, uint8_t *dst, int src_size);
void (*shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size);
 
void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride);
void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride,
int32_t *rgb2yuv);
void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
int srcStride, int dstStride);
void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
int width, int height, int src1Stride,
int src2Stride, int dstStride);
void (*deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
int width, int height, int srcStride,
int dst1Stride, int dst2Stride);
void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
int width, int height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2);
void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2,
const uint8_t *src3, uint8_t *dst,
int width, int height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride);
void (*uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
void (*uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
void (*yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride);
 
#define BY ((int)( 0.098 * (1 << RGB2YUV_SHIFT) + 0.5))
#define BV ((int)(-0.071 * (1 << RGB2YUV_SHIFT) + 0.5))
#define BU ((int)( 0.439 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GY ((int)( 0.504 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GV ((int)(-0.368 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GU ((int)(-0.291 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RY ((int)( 0.257 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RV ((int)( 0.439 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RU ((int)(-0.148 * (1 << RGB2YUV_SHIFT) + 0.5))
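 
/*
 * These are the BT.601 studio-swing coefficients in RGB2YUV_SHIFT fixed
 * point. For example, luma is formed as
 *
 *   y = ((RY * r + GY * g + BY * b) >> RGB2YUV_SHIFT) + 16;
 *
 * and the chroma components analogously, with a +128 offset.
 */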
 
//plain C versions
#include "rgb2rgb_template.c"
 
/*
* RGB15->RGB16 original by Strepto/Astral
 * ported to gcc & bugfixed: A'rpi
 * MMXEXT, 3DNOW optimization by Nick Kurshev
 * 32-bit C version, and the and&add trick by Michael Niedermayer
*/
 
av_cold void sws_rgb2rgb_init(void)
{
rgb2rgb_init_c();
if (ARCH_X86)
rgb2rgb_init_x86();
}
 
void rgb32to24(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 2;
 
for (i = 0; i < num_pixels; i++) {
#if HAVE_BIGENDIAN
/* RGB32 (= A,B,G,R) -> BGR24 (= B,G,R) */
dst[3 * i + 0] = src[4 * i + 1];
dst[3 * i + 1] = src[4 * i + 2];
dst[3 * i + 2] = src[4 * i + 3];
#else
dst[3 * i + 0] = src[4 * i + 2];
dst[3 * i + 1] = src[4 * i + 1];
dst[3 * i + 2] = src[4 * i + 0];
#endif
}
}
 
void rgb24to32(const uint8_t *src, uint8_t *dst, int src_size)
{
int i;
 
for (i = 0; 3 * i < src_size; i++) {
#if HAVE_BIGENDIAN
/* RGB24 (= R, G, B) -> BGR32 (= A, R, G, B) */
dst[4 * i + 0] = 255;
dst[4 * i + 1] = src[3 * i + 0];
dst[4 * i + 2] = src[3 * i + 1];
dst[4 * i + 3] = src[3 * i + 2];
#else
dst[4 * i + 0] = src[3 * i + 2];
dst[4 * i + 1] = src[3 * i + 1];
dst[4 * i + 2] = src[3 * i + 0];
dst[4 * i + 3] = 255;
#endif
}
}
 
void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
 
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
#else
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = 255;
#endif
}
}
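 
/*
 * The ((x & M) << a) | ((x & M) >> b) pattern above widens a field by
 * replicating its top bits into the new low bits, so full scale maps to
 * full scale: e.g. the 5-bit maximum 0x1F becomes
 * (0x1F << 3) | (0x1F >> 2) == 0xFF rather than 0xF8.
 */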
 
void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t rgb, r, g, b;
uint16_t *d = (uint16_t *)dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
 
while (s < end) {
rgb = *s++;
r = rgb & 0xF00;
g = rgb & 0x0F0;
b = rgb & 0x00F;
r = (r << 3) | ((r & 0x800) >> 1);
g = (g << 2) | ((g & 0x080) >> 2);
b = (b << 1) | ( b >> 3);
*d++ = r | g | b;
}
}
 
void rgb16to24(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
 
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
}
}
 
void rgb16tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
 
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
((uint16_t *)dst)[i] = (rgb >> 11) | (rgb & 0x7E0) | (rgb << 11);
}
}
 
void rgb16tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
 
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
((uint16_t *)dst)[i] = (rgb >> 11) | ((rgb & 0x7C0) >> 1) | ((rgb & 0x1F) << 10);
}
}
 
void rgb15tobgr32(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
 
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
#else
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = 255;
#endif
}
}
 
void rgb15to24(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
 
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
}
}
 
void rgb15tobgr16(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
 
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
((uint16_t *)dst)[i] = ((rgb & 0x7C00) >> 10) | ((rgb & 0x3E0) << 1) | (rgb << 11);
}
}
 
void rgb15tobgr15(const uint8_t *src, uint8_t *dst, int src_size)
{
int i, num_pixels = src_size >> 1;
 
for (i = 0; i < num_pixels; i++) {
unsigned rgb = ((const uint16_t *)src)[i];
unsigned br = rgb & 0x7C1F;
((uint16_t *)dst)[i] = (br >> 10) | (rgb & 0x3E0) | (br << 10);
}
}
 
void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
uint16_t *s = (uint16_t *)src;
int i, num_pixels = src_size >> 1;
 
for (i = 0; i < num_pixels; i++) {
unsigned rgb = s[i];
        d[i] = ((rgb << 8) | (rgb & 0xF0) | (rgb >> 8)) & 0xFFF;
}
}
 
 
#define DEFINE_SHUFFLE_BYTES(a, b, c, d) \
void shuffle_bytes_ ## a ## b ## c ## d(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
int i; \
\
for (i = 0; i < src_size; i += 4) { \
dst[i + 0] = src[i + a]; \
dst[i + 1] = src[i + b]; \
dst[i + 2] = src[i + c]; \
dst[i + 3] = src[i + d]; \
} \
}
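 
/*
 * For example, DEFINE_SHUFFLE_BYTES(1, 2, 3, 0) expands to
 * shuffle_bytes_1230(), which maps dst[0..3] = src[1], src[2], src[3],
 * src[0] for every 4-byte group, e.g. turning ARGB into RGBA.
 */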
 
DEFINE_SHUFFLE_BYTES(1, 2, 3, 0)
DEFINE_SHUFFLE_BYTES(3, 0, 1, 2)
DEFINE_SHUFFLE_BYTES(3, 2, 1, 0)
 
#define DEFINE_RGB48TOBGR48(need_bswap, swap) \
void rgb48tobgr48_ ## need_bswap(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
uint16_t *d = (uint16_t *)dst; \
uint16_t *s = (uint16_t *)src; \
int i, num_pixels = src_size >> 1; \
\
for (i = 0; i < num_pixels; i += 3) { \
d[i ] = swap ? av_bswap16(s[i + 2]) : s[i + 2]; \
d[i + 1] = swap ? av_bswap16(s[i + 1]) : s[i + 1]; \
d[i + 2] = swap ? av_bswap16(s[i ]) : s[i ]; \
} \
}
 
DEFINE_RGB48TOBGR48(nobswap, 0)
DEFINE_RGB48TOBGR48(bswap, 1)
 
#define DEFINE_RGB64TOBGR48(need_bswap, swap) \
void rgb64tobgr48_ ## need_bswap(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
uint16_t *d = (uint16_t *)dst; \
uint16_t *s = (uint16_t *)src; \
int i, num_pixels = src_size >> 3; \
\
for (i = 0; i < num_pixels; i++) { \
d[3 * i ] = swap ? av_bswap16(s[4 * i + 2]) : s[4 * i + 2]; \
d[3 * i + 1] = swap ? av_bswap16(s[4 * i + 1]) : s[4 * i + 1]; \
d[3 * i + 2] = swap ? av_bswap16(s[4 * i ]) : s[4 * i ]; \
} \
}
 
DEFINE_RGB64TOBGR48(nobswap, 0)
DEFINE_RGB64TOBGR48(bswap, 1)
 
#define DEFINE_RGB64TO48(need_bswap, swap) \
void rgb64to48_ ## need_bswap(const uint8_t *src, \
uint8_t *dst, int src_size) \
{ \
uint16_t *d = (uint16_t *)dst; \
uint16_t *s = (uint16_t *)src; \
int i, num_pixels = src_size >> 3; \
\
for (i = 0; i < num_pixels; i++) { \
d[3 * i ] = swap ? av_bswap16(s[4 * i ]) : s[4 * i ]; \
d[3 * i + 1] = swap ? av_bswap16(s[4 * i + 1]) : s[4 * i + 1]; \
d[3 * i + 2] = swap ? av_bswap16(s[4 * i + 2]) : s[4 * i + 2]; \
} \
}
 
DEFINE_RGB64TO48(nobswap, 0)
DEFINE_RGB64TO48(bswap, 1)
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/rgb2rgb.h
0,0 → 1,171
/*
* software RGB to RGB converter
 * plus a software PAL8 to RGB converter
* Software YUV to YUV converter
* Software YUV to RGB converter
* Written by Nick Kurshev.
* YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#ifndef SWSCALE_RGB2RGB_H
#define SWSCALE_RGB2RGB_H
 
#include <inttypes.h>
 
#include "libavutil/avutil.h"
#include "swscale.h"
 
/* A full collection of RGB to RGB(BGR) converters */
extern void (*rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size);
 
extern void (*shuffle_bytes_0321)(const uint8_t *src, uint8_t *dst, int src_size);
extern void (*shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size);
 
void rgb64tobgr48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64tobgr48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb48tobgr48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb48tobgr48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64to48_nobswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb64to48_bswap(const uint8_t *src, uint8_t *dst, int src_size);
void rgb24to32(const uint8_t *src, uint8_t *dst, int src_size);
void rgb32to24(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16tobgr32(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16to24(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16tobgr16(const uint8_t *src, uint8_t *dst, int src_size);
void rgb16tobgr15(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15tobgr32(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15to24(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15tobgr16(const uint8_t *src, uint8_t *dst, int src_size);
void rgb15tobgr15(const uint8_t *src, uint8_t *dst, int src_size);
void rgb12tobgr12(const uint8_t *src, uint8_t *dst, int src_size);
void rgb12to15(const uint8_t *src, uint8_t *dst, int src_size);
 
void shuffle_bytes_1230(const uint8_t *src, uint8_t *dst, int src_size);
void shuffle_bytes_3012(const uint8_t *src, uint8_t *dst, int src_size);
void shuffle_bytes_3210(const uint8_t *src, uint8_t *dst, int src_size);
 
void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height, int lumStride,
int chromStride, int srcStride, int32_t *rgb2yuv);
 
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
extern void (*yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
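 
/* Illustrative use, with hypothetical caller-allocated buffers:
 *
 *   yv12toyuy2(y_plane, u_plane, v_plane, yuy2_out, w, h,
 *              y_stride, uv_stride, out_stride);
 *
 * with w a multiple of 16 and h a multiple of 2, as noted above. */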
 
/**
* Width should be a multiple of 16.
*/
extern void (*yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
 
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
extern void (*yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride);
 
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
extern void (*yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
 
/**
* Width should be a multiple of 16.
*/
extern void (*yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride);
 
/**
* Height should be a multiple of 2 and width should be a multiple of 2.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line, others are ignored.
* FIXME: Write high quality version.
*/
extern void (*ff_rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride,
int32_t *rgb2yuv);
extern void (*planar2x)(const uint8_t *src, uint8_t *dst, int width, int height,
int srcStride, int dstStride);
 
extern void (*interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dst,
int width, int height, int src1Stride,
int src2Stride, int dstStride);
 
extern void (*deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
int width, int height, int srcStride,
int dst1Stride, int dst2Stride);
 
extern void (*vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
int width, int height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2);
 
extern void (*yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
uint8_t *dst,
int width, int height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride);
 
extern void (*uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
extern void (*uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
extern void (*yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
extern void (*yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride);
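 
/*
 * The function pointers above are populated by sws_rgb2rgb_init(),
 * declared below; swscale calls it internally when a context is set up,
 * so call it yourself only if you use these pointers directly.
 */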
 
void sws_rgb2rgb_init(void);
 
void rgb2rgb_init_x86(void);
 
#endif /* SWSCALE_RGB2RGB_H */
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/rgb2rgb_template.c
0,0 → 1,971
/*
* software RGB to RGB converter
 * plus software PAL8 to RGB converter
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
* lot of big-endian byte order fixes by Alex Beregszaszi
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <stddef.h>
 
#include "libavutil/attributes.h"
 
static inline void rgb24tobgr32_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *dest = dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
 
while (s < end) {
#if HAVE_BIGENDIAN
/* RGB24 (= R, G, B) -> RGB32 (= A, B, G, R) */
*dest++ = 255;
*dest++ = s[2];
*dest++ = s[1];
*dest++ = s[0];
s += 3;
#else
*dest++ = *s++;
*dest++ = *s++;
*dest++ = *s++;
*dest++ = 255;
#endif
}
}
 
static inline void rgb32tobgr24_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *dest = dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
 
while (s < end) {
#if HAVE_BIGENDIAN
/* RGB32 (= A, B, G, R) -> RGB24 (= R, G, B) */
s++;
dest[2] = *s++;
dest[1] = *s++;
dest[0] = *s++;
dest += 3;
#else
*dest++ = *s++;
*dest++ = *s++;
*dest++ = *s++;
s++;
#endif
}
}
 
/*
* original by Strepto/Astral
* ported to gcc & bugfixed: A'rpi
* MMXEXT, 3DNOW optimization by Nick Kurshev
 * 32-bit C version, and the and&add trick by Michael Niedermayer
*/
static inline void rgb15to16_c(const uint8_t *src, uint8_t *dst, int src_size)
{
register uint8_t *d = dst;
register const uint8_t *s = src;
register const uint8_t *end = s + src_size;
const uint8_t *mm_end = end - 3;
 
while (s < mm_end) {
register unsigned x = *((const uint32_t *)s);
*((uint32_t *)d) = (x & 0x7FFF7FFF) + (x & 0x7FE07FE0);
d += 4;
s += 4;
}
if (s < end) {
register unsigned short x = *((const uint16_t *)s);
*((uint16_t *)d) = (x & 0x7FFF) + (x & 0x7FE0);
}
}
 
static inline void rgb16to15_c(const uint8_t *src, uint8_t *dst, int src_size)
{
register uint8_t *d = dst;
register const uint8_t *s = src;
register const uint8_t *end = s + src_size;
const uint8_t *mm_end = end - 3;
 
while (s < mm_end) {
register uint32_t x = *((const uint32_t *)s);
*((uint32_t *)d) = ((x >> 1) & 0x7FE07FE0) | (x & 0x001F001F);
s += 4;
d += 4;
}
if (s < end) {
register uint16_t x = *((const uint16_t *)s);
*((uint16_t *)d) = ((x >> 1) & 0x7FE0) | (x & 0x001F);
}
}
 
static inline void rgb32to16_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
 
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xFF) >> 3) +
((rgb & 0xFC00) >> 5) +
((rgb & 0xF80000) >> 8);
}
}
 
static inline void rgb32tobgr16_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
 
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xF8) << 8) +
((rgb & 0xFC00) >> 5) +
((rgb & 0xF80000) >> 19);
}
}
 
static inline void rgb32to15_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
 
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xFF) >> 3) +
((rgb & 0xF800) >> 6) +
((rgb & 0xF80000) >> 9);
}
}
 
static inline void rgb32tobgr15_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
 
while (s < end) {
register int rgb = *(const uint32_t *)s;
s += 4;
*d++ = ((rgb & 0xF8) << 7) +
((rgb & 0xF800) >> 6) +
((rgb & 0xF80000) >> 19);
}
}
 
static inline void rgb24tobgr16_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
 
while (s < end) {
const int b = *s++;
const int g = *s++;
const int r = *s++;
*d++ = (b >> 3) | ((g & 0xFC) << 3) | ((r & 0xF8) << 8);
}
}
 
static inline void rgb24to16_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
 
while (s < end) {
const int r = *s++;
const int g = *s++;
const int b = *s++;
*d++ = (b >> 3) | ((g & 0xFC) << 3) | ((r & 0xF8) << 8);
}
}
 
static inline void rgb24tobgr15_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
 
while (s < end) {
const int b = *s++;
const int g = *s++;
const int r = *s++;
*d++ = (b >> 3) | ((g & 0xF8) << 2) | ((r & 0xF8) << 7);
}
}
 
static inline void rgb24to15_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint16_t *d = (uint16_t *)dst;
const uint8_t *s = src;
const uint8_t *end = s + src_size;
 
while (s < end) {
const int r = *s++;
const int g = *s++;
const int b = *s++;
*d++ = (b >> 3) | ((g & 0xF8) << 2) | ((r & 0xF8) << 7);
}
}
 
static inline void rgb15tobgr24_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
 
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
}
}
 
static inline void rgb16tobgr24_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
uint8_t *d = (uint8_t *)dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
 
while (s < end) {
register uint16_t bgr = *s++;
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
}
}
 
static inline void rgb15to32_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
 
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
#else
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x03E0)>>2) | ((bgr&0x03E0)>> 7);
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = 255;
#endif
}
}
 
static inline void rgb16to32_c(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
const uint16_t *end = s + src_size / 2;
 
while (s < end) {
register uint16_t bgr = *s++;
#if HAVE_BIGENDIAN
*d++ = 255;
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
#else
*d++ = ((bgr&0x001F)<<3) | ((bgr&0x001F)>> 2);
*d++ = ((bgr&0x07E0)>>3) | ((bgr&0x07E0)>> 9);
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = 255;
#endif
}
}
 
static inline void shuffle_bytes_2103_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
int idx = 15 - src_size;
const uint8_t *s = src - idx;
uint8_t *d = dst - idx;
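
    /* Illustrative note: idx starts at 15 - src_size and increases to just
     * below 15, while s and d are pre-biased by the initial idx, so s[idx]
     * always addresses the next unread source byte. For src_size == 16 the
     * loop visits idx = -1, 3, 7, 11, covering all 16 bytes; when src_size
     * is not a multiple of 4, the final 4-byte store writes up to 3 bytes
     * past the end, consistent with the out-of-bounds slack that callers
     * allocate for. The same idiom appears in shuffle_bytes_0321_c below. */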
 
for (; idx < 15; idx += 4) {
register unsigned v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
v &= 0xff00ff;
*(uint32_t *)&d[idx] = (v >> 16) + g + (v << 16);
}
}
 
static inline void shuffle_bytes_0321_c(const uint8_t *src, uint8_t *dst,
int src_size)
{
int idx = 15 - src_size;
const uint8_t *s = src - idx;
uint8_t *d = dst - idx;
 
for (; idx < 15; idx += 4) {
register unsigned v = *(const uint32_t *)&s[idx], g = v & 0x00ff00ff;
v &= 0xff00ff00;
*(uint32_t *)&d[idx] = (v >> 16) + g + (v << 16);
}
}
 
static inline void rgb24tobgr24_c(const uint8_t *src, uint8_t *dst, int src_size)
{
unsigned i;
 
for (i = 0; i < src_size; i += 3) {
register uint8_t x = src[i + 2];
dst[i + 1] = src[i + 1];
dst[i + 2] = src[i + 0];
dst[i + 0] = x;
}
}
 
static inline void yuvPlanartoyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride,
int dstStride, int vertLumPerChroma)
{
int y, i;
const int chromWidth = width >> 1;
 
for (y = 0; y < height; y++) {
#if HAVE_FAST_64BIT
uint64_t *ldst = (uint64_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for (i = 0; i < chromWidth; i += 2) {
uint64_t k = yc[0] + (uc[0] << 8) +
(yc[1] << 16) + ((unsigned) vc[0] << 24);
uint64_t l = yc[2] + (uc[1] << 8) +
(yc[3] << 16) + ((unsigned) vc[1] << 24);
*ldst++ = k + (l << 32);
yc += 4;
uc += 2;
vc += 2;
}
 
#else
int32_t *idst = (int32_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
 
for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
*idst++ = (yc[0] << 24) + (uc[0] << 16) +
(yc[1] << 8) + (vc[0] << 0);
#else
*idst++ = yc[0] + (uc[0] << 8) +
(yc[1] << 16) + (vc[0] << 24);
#endif
yc += 2;
uc++;
vc++;
}
#endif
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
}
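
/*
 * For reference, one output line produced above has the packed YUY2 layout
 *
 *     Y0 U0 Y1 V0  Y2 U1 Y3 V1  ...
 *
 * so every two luma samples share one U/V pair; the loops therefore emit
 * chromWidth = width/2 four-byte groups per line and advance usrc/vsrc only
 * once every vertLumPerChroma luma lines.
 */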
 
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void yv12toyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
//FIXME interpolate chroma
yuvPlanartoyuy2_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 2);
}
 
static inline void yuvPlanartouyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride,
int dstStride, int vertLumPerChroma)
{
int y, i;
const int chromWidth = width >> 1;
 
for (y = 0; y < height; y++) {
#if HAVE_FAST_64BIT
uint64_t *ldst = (uint64_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
for (i = 0; i < chromWidth; i += 2) {
uint64_t k = uc[0] + (yc[0] << 8) +
(vc[0] << 16) + ((unsigned) yc[1] << 24);
uint64_t l = uc[1] + (yc[2] << 8) +
(vc[1] << 16) + ((unsigned) yc[3] << 24);
*ldst++ = k + (l << 32);
yc += 4;
uc += 2;
vc += 2;
}
 
#else
int32_t *idst = (int32_t *)dst;
const uint8_t *yc = ysrc, *uc = usrc, *vc = vsrc;
 
for (i = 0; i < chromWidth; i++) {
#if HAVE_BIGENDIAN
*idst++ = (uc[0] << 24) + (yc[0] << 16) +
(vc[0] << 8) + (yc[1] << 0);
#else
*idst++ = uc[0] + (yc[0] << 8) +
(vc[0] << 16) + (yc[1] << 24);
#endif
yc += 2;
uc++;
vc++;
}
#endif
if ((y & (vertLumPerChroma - 1)) == vertLumPerChroma - 1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
}
 
/**
 * Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void yv12touyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
//FIXME interpolate chroma
yuvPlanartouyvy_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 2);
}
 
/**
* Width should be a multiple of 16.
*/
static inline void yuv422ptouyvy_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
yuvPlanartouyvy_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 1);
}
 
/**
* Width should be a multiple of 16.
*/
static inline void yuv422ptoyuy2_c(const uint8_t *ysrc, const uint8_t *usrc,
const uint8_t *vsrc, uint8_t *dst,
int width, int height, int lumStride,
int chromStride, int dstStride)
{
yuvPlanartoyuy2_c(ysrc, usrc, vsrc, dst, width, height, lumStride,
chromStride, dstStride, 1);
}
 
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void yuy2toyv12_c(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height, int lumStride,
int chromStride, int srcStride)
{
int y;
const int chromWidth = width >> 1;
 
for (y = 0; y < height; y += 2) {
int i;
for (i = 0; i < chromWidth; i++) {
ydst[2 * i + 0] = src[4 * i + 0];
udst[i] = src[4 * i + 1];
ydst[2 * i + 1] = src[4 * i + 2];
vdst[i] = src[4 * i + 3];
}
ydst += lumStride;
src += srcStride;
 
for (i = 0; i < chromWidth; i++) {
ydst[2 * i + 0] = src[4 * i + 0];
ydst[2 * i + 1] = src[4 * i + 2];
}
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
}
 
static inline void planar2x_c(const uint8_t *src, uint8_t *dst, int srcWidth,
int srcHeight, int srcStride, int dstStride)
{
int x, y;
 
dst[0] = src[0];
 
// first line
for (x = 0; x < srcWidth - 1; x++) {
dst[2 * x + 1] = (3 * src[x] + src[x + 1]) >> 2;
dst[2 * x + 2] = (src[x] + 3 * src[x + 1]) >> 2;
}
dst[2 * srcWidth - 1] = src[srcWidth - 1];
 
dst += dstStride;
 
for (y = 1; y < srcHeight; y++) {
        const int mmxSize = 1; // leftover from the MMX version; the C path starts at x = 0
 
dst[0] = (src[0] * 3 + src[srcStride]) >> 2;
dst[dstStride] = (src[0] + 3 * src[srcStride]) >> 2;
 
for (x = mmxSize - 1; x < srcWidth - 1; x++) {
dst[2 * x + 1] = (src[x + 0] * 3 + src[x + srcStride + 1]) >> 2;
dst[2 * x + dstStride + 2] = (src[x + 0] + 3 * src[x + srcStride + 1]) >> 2;
dst[2 * x + dstStride + 1] = (src[x + 1] + 3 * src[x + srcStride]) >> 2;
dst[2 * x + 2] = (src[x + 1] * 3 + src[x + srcStride]) >> 2;
}
dst[srcWidth * 2 - 1] = (src[srcWidth - 1] * 3 + src[srcWidth - 1 + srcStride]) >> 2;
dst[srcWidth * 2 - 1 + dstStride] = (src[srcWidth - 1] + 3 * src[srcWidth - 1 + srcStride]) >> 2;
 
dst += dstStride * 2;
src += srcStride;
}
 
// last line
dst[0] = src[0];
 
for (x = 0; x < srcWidth - 1; x++) {
dst[2 * x + 1] = (src[x] * 3 + src[x + 1]) >> 2;
dst[2 * x + 2] = (src[x] + 3 * src[x + 1]) >> 2;
}
dst[2 * srcWidth - 1] = src[srcWidth - 1];
}
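
/*
 * Worked example for the bilinear weights above: an output pair between
 * horizontally adjacent source samples a and b uses 3:1 and 1:3 blends,
 * so a = 100, b = 20 yields (3*100 + 20) >> 2 = 80 and
 * (100 + 3*20) >> 2 = 40; the inner loop applies the same weighting
 * diagonally between two source lines.
 */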
 
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line, others are ignored.
* FIXME: Write HQ version.
*/
static inline void uyvytoyv12_c(const uint8_t *src, uint8_t *ydst,
uint8_t *udst, uint8_t *vdst,
int width, int height, int lumStride,
int chromStride, int srcStride)
{
int y;
const int chromWidth = width >> 1;
 
for (y = 0; y < height; y += 2) {
int i;
for (i = 0; i < chromWidth; i++) {
udst[i] = src[4 * i + 0];
ydst[2 * i + 0] = src[4 * i + 1];
vdst[i] = src[4 * i + 2];
ydst[2 * i + 1] = src[4 * i + 3];
}
ydst += lumStride;
src += srcStride;
 
for (i = 0; i < chromWidth; i++) {
ydst[2 * i + 0] = src[4 * i + 1];
ydst[2 * i + 1] = src[4 * i + 3];
}
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
}
 
/**
* Height should be a multiple of 2 and width should be a multiple of 2.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line,
* others are ignored in the C version.
* FIXME: Write HQ version.
*/
void ff_rgb24toyv12_c(const uint8_t *src, uint8_t *ydst, uint8_t *udst,
uint8_t *vdst, int width, int height, int lumStride,
int chromStride, int srcStride, int32_t *rgb2yuv)
{
int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
int32_t ru = rgb2yuv[RU_IDX], gu = rgb2yuv[GU_IDX], bu = rgb2yuv[BU_IDX];
int32_t rv = rgb2yuv[RV_IDX], gv = rgb2yuv[GV_IDX], bv = rgb2yuv[BV_IDX];
int y;
const int chromWidth = width >> 1;
 
for (y = 0; y < height; y += 2) {
int i;
for (i = 0; i < chromWidth; i++) {
unsigned int b = src[6 * i + 0];
unsigned int g = src[6 * i + 1];
unsigned int r = src[6 * i + 2];
 
unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
unsigned int V = ((rv * r + gv * g + bv * b) >> RGB2YUV_SHIFT) + 128;
unsigned int U = ((ru * r + gu * g + bu * b) >> RGB2YUV_SHIFT) + 128;
 
udst[i] = U;
vdst[i] = V;
ydst[2 * i] = Y;
 
b = src[6 * i + 3];
g = src[6 * i + 4];
r = src[6 * i + 5];
 
Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
ydst[2 * i + 1] = Y;
}
ydst += lumStride;
src += srcStride;
 
if (y+1 == height)
break;
 
for (i = 0; i < chromWidth; i++) {
unsigned int b = src[6 * i + 0];
unsigned int g = src[6 * i + 1];
unsigned int r = src[6 * i + 2];
 
unsigned int Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
 
ydst[2 * i] = Y;
 
b = src[6 * i + 3];
g = src[6 * i + 4];
r = src[6 * i + 5];
 
Y = ((ry * r + gy * g + by * b) >> RGB2YUV_SHIFT) + 16;
ydst[2 * i + 1] = Y;
}
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
}
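
/*
 * Worked example for the luma expression above, assuming a BT.601-style
 * table in which ry, gy and by approximate 0.257, 0.504 and 0.098 scaled
 * by 1 << RGB2YUV_SHIFT (the actual values come from the caller-supplied
 * rgb2yuv table): for white input, r = g = b = 255,
 *
 *     Y = (0.257 + 0.504 + 0.098) * 255 + 16 ~= 235
 *
 * i.e. full-scale RGB lands at the top of the limited 16..235 luma range.
 */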
 
static void interleaveBytes_c(const uint8_t *src1, const uint8_t *src2,
uint8_t *dest, int width, int height,
int src1Stride, int src2Stride, int dstStride)
{
int h;
 
for (h = 0; h < height; h++) {
int w;
for (w = 0; w < width; w++) {
dest[2 * w + 0] = src1[w];
dest[2 * w + 1] = src2[w];
}
dest += dstStride;
src1 += src1Stride;
src2 += src2Stride;
}
}
 
static void deinterleaveBytes_c(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
int width, int height, int srcStride,
int dst1Stride, int dst2Stride)
{
int h;
 
for (h = 0; h < height; h++) {
int w;
for (w = 0; w < width; w++) {
dst1[w] = src[2 * w + 0];
dst2[w] = src[2 * w + 1];
}
src += srcStride;
dst1 += dst1Stride;
dst2 += dst2Stride;
}
}
 
static inline void vu9_to_vu12_c(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
int width, int height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2)
{
int x, y;
int w = width / 2;
int h = height / 2;
 
for (y = 0; y < h; y++) {
const uint8_t *s1 = src1 + srcStride1 * (y >> 1);
uint8_t *d = dst1 + dstStride1 * y;
for (x = 0; x < w; x++)
d[2 * x] = d[2 * x + 1] = s1[x];
}
for (y = 0; y < h; y++) {
const uint8_t *s2 = src2 + srcStride2 * (y >> 1);
uint8_t *d = dst2 + dstStride2 * y;
for (x = 0; x < w; x++)
d[2 * x] = d[2 * x + 1] = s2[x];
}
}
 
static inline void yvu9_to_yuy2_c(const uint8_t *src1, const uint8_t *src2,
const uint8_t *src3, uint8_t *dst,
int width, int height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride)
{
int x, y;
int w = width / 2;
int h = height;
 
for (y = 0; y < h; y++) {
const uint8_t *yp = src1 + srcStride1 * y;
const uint8_t *up = src2 + srcStride2 * (y >> 2);
const uint8_t *vp = src3 + srcStride3 * (y >> 2);
uint8_t *d = dst + dstStride * y;
for (x = 0; x < w; x++) {
const int x2 = x << 2;
d[8 * x + 0] = yp[x2];
d[8 * x + 1] = up[x];
d[8 * x + 2] = yp[x2 + 1];
d[8 * x + 3] = vp[x];
d[8 * x + 4] = yp[x2 + 2];
d[8 * x + 5] = up[x];
d[8 * x + 6] = yp[x2 + 3];
d[8 * x + 7] = vp[x];
}
}
}
 
static void extract_even_c(const uint8_t *src, uint8_t *dst, int count)
{
dst += count;
src += count * 2;
count = -count;
while (count < 0) {
dst[count] = src[2 * count];
count++;
}
}
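
/*
 * The negative-count idiom above (shared by the extract_* helpers below)
 * biases the pointers past the end of the data and counts up from -count
 * to 0, so one variable serves as both index and loop condition. For
 * count = 4, the biased src[2 * count] reads src[-8], src[-6], src[-4],
 * src[-2], i.e. the even bytes of the original 8-byte input.
 */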
 
static void extract_even2_c(const uint8_t *src, uint8_t *dst0, uint8_t *dst1,
int count)
{
dst0 += count;
dst1 += count;
src += count * 4;
count = -count;
while (count < 0) {
dst0[count] = src[4 * count + 0];
dst1[count] = src[4 * count + 2];
count++;
}
}
 
static void extract_even2avg_c(const uint8_t *src0, const uint8_t *src1,
uint8_t *dst0, uint8_t *dst1, int count)
{
dst0 += count;
dst1 += count;
src0 += count * 4;
src1 += count * 4;
count = -count;
while (count < 0) {
dst0[count] = (src0[4 * count + 0] + src1[4 * count + 0]) >> 1;
dst1[count] = (src0[4 * count + 2] + src1[4 * count + 2]) >> 1;
count++;
}
}
 
static void extract_odd2_c(const uint8_t *src, uint8_t *dst0, uint8_t *dst1,
int count)
{
dst0 += count;
dst1 += count;
src += count * 4;
count = -count;
src++;
while (count < 0) {
dst0[count] = src[4 * count + 0];
dst1[count] = src[4 * count + 2];
count++;
}
}
 
static void extract_odd2avg_c(const uint8_t *src0, const uint8_t *src1,
uint8_t *dst0, uint8_t *dst1, int count)
{
dst0 += count;
dst1 += count;
src0 += count * 4;
src1 += count * 4;
count = -count;
src0++;
src1++;
while (count < 0) {
dst0[count] = (src0[4 * count + 0] + src1[4 * count + 0]) >> 1;
dst1[count] = (src0[4 * count + 2] + src1[4 * count + 2]) >> 1;
count++;
}
}
 
static void yuyvtoyuv420_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
 
for (y = 0; y < height; y++) {
extract_even_c(src, ydst, width);
if (y & 1) {
extract_odd2avg_c(src - srcStride, src, udst, vdst, chromWidth);
udst += chromStride;
vdst += chromStride;
}
 
src += srcStride;
ydst += lumStride;
}
}
 
static void yuyvtoyuv422_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
 
for (y = 0; y < height; y++) {
extract_even_c(src, ydst, width);
extract_odd2_c(src, udst, vdst, chromWidth);
 
src += srcStride;
ydst += lumStride;
udst += chromStride;
vdst += chromStride;
}
}
 
static void uyvytoyuv420_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
 
for (y = 0; y < height; y++) {
extract_even_c(src + 1, ydst, width);
if (y & 1) {
extract_even2avg_c(src - srcStride, src, udst, vdst, chromWidth);
udst += chromStride;
vdst += chromStride;
}
 
src += srcStride;
ydst += lumStride;
}
}
 
static void uyvytoyuv422_c(uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
const uint8_t *src, int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
 
for (y = 0; y < height; y++) {
extract_even_c(src + 1, ydst, width);
extract_even2_c(src, udst, vdst, chromWidth);
 
src += srcStride;
ydst += lumStride;
udst += chromStride;
vdst += chromStride;
}
}
 
static av_cold void rgb2rgb_init_c(void)
{
rgb15to16 = rgb15to16_c;
rgb15tobgr24 = rgb15tobgr24_c;
rgb15to32 = rgb15to32_c;
rgb16tobgr24 = rgb16tobgr24_c;
rgb16to32 = rgb16to32_c;
rgb16to15 = rgb16to15_c;
rgb24tobgr16 = rgb24tobgr16_c;
rgb24tobgr15 = rgb24tobgr15_c;
rgb24tobgr32 = rgb24tobgr32_c;
rgb32to16 = rgb32to16_c;
rgb32to15 = rgb32to15_c;
rgb32tobgr24 = rgb32tobgr24_c;
rgb24to15 = rgb24to15_c;
rgb24to16 = rgb24to16_c;
rgb24tobgr24 = rgb24tobgr24_c;
#if HAVE_BIGENDIAN
shuffle_bytes_0321 = shuffle_bytes_2103_c;
shuffle_bytes_2103 = shuffle_bytes_0321_c;
#else
shuffle_bytes_0321 = shuffle_bytes_0321_c;
shuffle_bytes_2103 = shuffle_bytes_2103_c;
#endif
rgb32tobgr16 = rgb32tobgr16_c;
rgb32tobgr15 = rgb32tobgr15_c;
yv12toyuy2 = yv12toyuy2_c;
yv12touyvy = yv12touyvy_c;
yuv422ptoyuy2 = yuv422ptoyuy2_c;
yuv422ptouyvy = yuv422ptouyvy_c;
yuy2toyv12 = yuy2toyv12_c;
planar2x = planar2x_c;
ff_rgb24toyv12 = ff_rgb24toyv12_c;
interleaveBytes = interleaveBytes_c;
deinterleaveBytes = deinterleaveBytes_c;
vu9_to_vu12 = vu9_to_vu12_c;
yvu9_to_yuy2 = yvu9_to_yuy2_c;
 
uyvytoyuv420 = uyvytoyuv420_c;
uyvytoyuv422 = uyvytoyuv422_c;
yuyvtoyuv420 = yuyvtoyuv420_c;
yuyvtoyuv422 = yuyvtoyuv422_c;
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/slice.c
0,0 → 1,349
/*
* Copyright (C) 2015 Pedro Arthur <bygrandao@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "swscale_internal.h"
 
static void free_lines(SwsSlice *s)
{
int i;
for (i = 0; i < 2; ++i) {
int n = s->plane[i].available_lines;
int j;
for (j = 0; j < n; ++j) {
av_freep(&s->plane[i].line[j]);
if (s->is_ring)
s->plane[i].line[j+n] = NULL;
}
}
 
for (i = 0; i < 4; ++i)
memset(s->plane[i].line, 0, sizeof(uint8_t*) * s->plane[i].available_lines * (s->is_ring ? 3 : 1));
s->should_free_lines = 0;
}
 
/*
slice lines contain extra bytes for vectorized code, thus @size
is the allocated memory size and @width is the number of pixels
*/
static int alloc_lines(SwsSlice *s, int size, int width)
{
int i;
int idx[2] = {3, 2};
 
s->should_free_lines = 1;
s->width = width;
 
for (i = 0; i < 2; ++i) {
int n = s->plane[i].available_lines;
int j;
int ii = idx[i];
 
av_assert0(n == s->plane[ii].available_lines);
for (j = 0; j < n; ++j) {
// chroma plane line U and V are expected to be contiguous in memory
// by mmx vertical scaler code
s->plane[i].line[j] = av_malloc(size * 2 + 32);
if (!s->plane[i].line[j]) {
free_lines(s);
return AVERROR(ENOMEM);
}
s->plane[ii].line[j] = s->plane[i].line[j] + size + 16;
if (s->is_ring) {
s->plane[i].line[j+n] = s->plane[i].line[j];
s->plane[ii].line[j+n] = s->plane[ii].line[j];
}
}
}
 
return 0;
}
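
/*
 * Resulting layout of one allocated line j (the 16-byte gaps give
 * vectorized readers slack to overread):
 *
 *     plane[0].line[j]: [ luma, size bytes ][16][ alpha = plane[3].line[j] ][16]
 *     plane[1].line[j]: [ U,    size bytes ][16][ V     = plane[2].line[j] ][16]
 *
 * With is_ring set, line[j + n] aliases line[j], letting vertical-filter
 * code index past the end of the ring without wrapping explicitly.
 */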
 
static int alloc_slice(SwsSlice *s, enum AVPixelFormat fmt, int lumLines, int chrLines, int h_sub_sample, int v_sub_sample, int ring)
{
int i;
int size[4] = { lumLines,
chrLines,
chrLines,
lumLines };
 
s->h_chr_sub_sample = h_sub_sample;
s->v_chr_sub_sample = v_sub_sample;
s->fmt = fmt;
s->is_ring = ring;
s->should_free_lines = 0;
 
for (i = 0; i < 4; ++i) {
int n = size[i] * ( ring == 0 ? 1 : 3);
s->plane[i].line = av_mallocz_array(sizeof(uint8_t*), n);
if (!s->plane[i].line)
return AVERROR(ENOMEM);
 
s->plane[i].tmp = ring ? s->plane[i].line + size[i] * 2 : NULL;
s->plane[i].available_lines = size[i];
s->plane[i].sliceY = 0;
s->plane[i].sliceH = 0;
}
return 0;
}
 
static void free_slice(SwsSlice *s)
{
int i;
if (s) {
if (s->should_free_lines)
free_lines(s);
for (i = 0; i < 4; ++i) {
av_freep(&s->plane[i].line);
s->plane[i].tmp = NULL;
}
}
}
 
int ff_rotate_slice(SwsSlice *s, int lum, int chr)
{
int i;
if (lum) {
for (i = 0; i < 4; i+=3) {
int n = s->plane[i].available_lines;
int l = lum - s->plane[i].sliceY;
 
if (l >= n * 2) {
s->plane[i].sliceY += n;
s->plane[i].sliceH -= n;
}
}
}
if (chr) {
for (i = 1; i < 3; ++i) {
int n = s->plane[i].available_lines;
int l = chr - s->plane[i].sliceY;
 
if (l >= n * 2) {
s->plane[i].sliceY += n;
s->plane[i].sliceH -= n;
}
}
}
return 0;
}
 
int ff_init_slice_from_src(SwsSlice * s, uint8_t *src[4], int stride[4], int srcW, int lumY, int lumH, int chrY, int chrH, int relative)
{
int i = 0;
 
const int start[4] = {lumY,
chrY,
chrY,
lumY};
 
const int end[4] = {lumY +lumH,
chrY + chrH,
chrY + chrH,
lumY + lumH};
 
const uint8_t *src_[4] = {src[0] + (relative ? 0 : start[0]) * stride[0],
                              src[1] + (relative ? 0 : start[1]) * stride[1],
                              src[2] + (relative ? 0 : start[2]) * stride[2],
                              src[3] + (relative ? 0 : start[3]) * stride[3]};
 
s->width = srcW;
 
for (i = 0; i < 4; ++i) {
int j;
int first = s->plane[i].sliceY;
int n = s->plane[i].available_lines;
int lines = end[i] - start[i];
int tot_lines = end[i] - first;
 
if (start[i] >= first && n >= tot_lines) {
s->plane[i].sliceH = FFMAX(tot_lines, s->plane[i].sliceH);
for (j = 0; j < lines; j+= 1)
s->plane[i].line[start[i] - first + j] = src_[i] + j * stride[i];
} else {
s->plane[i].sliceY = start[i];
lines = lines > n ? n : lines;
s->plane[i].sliceH = lines;
for (j = 0; j < lines; j+= 1)
s->plane[i].line[j] = src_[i] + j * stride[i];
}
 
}
 
return 0;
}
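
/*
 * Note on @relative above: with relative == 1 the src[] pointers already
 * address the first line of the slice, so line[j] becomes src + j * stride;
 * with relative == 0 they address the top of the frame and start[i] * stride
 * is added first. Either way the line[] entries are borrowed pointers into
 * the caller's picture, not copies.
 */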
 
static void fill_ones(SwsSlice *s, int n, int is16bit)
{
int i;
for (i = 0; i < 4; ++i) {
int j;
int size = s->plane[i].available_lines;
for (j = 0; j < size; ++j) {
int k;
int end = is16bit ? n>>1: n;
// fill also one extra element
end += 1;
if (is16bit)
for (k = 0; k < end; ++k)
((int32_t*)(s->plane[i].line[j]))[k] = 1<<18;
else
for (k = 0; k < end; ++k)
((int16_t*)(s->plane[i].line[j]))[k] = 1<<14;
}
}
}
 
int ff_init_filters(SwsContext * c)
{
int i;
int index;
int num_ydesc;
int num_cdesc;
int num_vdesc = isPlanarYUV(c->dstFormat) && !isGray(c->dstFormat) ? 2 : 1;
int need_lum_conv = c->lumToYV12 || c->readLumPlanar || c->alpToYV12 || c->readAlpPlanar;
int need_chr_conv = c->chrToYV12 || c->readChrPlanar;
int need_gamma = c->is_internal_gamma;
int srcIdx, dstIdx;
int dst_stride = FFALIGN(c->dstW * sizeof(int16_t) + 66, 16);
 
uint32_t * pal = usePal(c->srcFormat) ? c->pal_yuv : (uint32_t*)c->input_rgb2yuv_table;
int res = 0;
 
if (c->dstBpc == 16)
dst_stride <<= 1;
 
num_ydesc = need_lum_conv ? 2 : 1;
num_cdesc = need_chr_conv ? 2 : 1;
 
c->numSlice = FFMAX(num_ydesc, num_cdesc) + 2;
c->numDesc = num_ydesc + num_cdesc + num_vdesc + (need_gamma ? 2 : 0);
c->descIndex[0] = num_ydesc + (need_gamma ? 1 : 0);
c->descIndex[1] = num_ydesc + num_cdesc + (need_gamma ? 1 : 0);
 
 
 
c->desc = av_mallocz_array(sizeof(SwsFilterDescriptor), c->numDesc);
if (!c->desc)
return AVERROR(ENOMEM);
    c->slice = av_mallocz_array(sizeof(SwsSlice), c->numSlice);
    if (!c->slice) {
        res = AVERROR(ENOMEM);
        goto cleanup;
    }
 
 
res = alloc_slice(&c->slice[0], c->srcFormat, c->srcH, c->chrSrcH, c->chrSrcHSubSample, c->chrSrcVSubSample, 0);
if (res < 0) goto cleanup;
for (i = 1; i < c->numSlice-2; ++i) {
res = alloc_slice(&c->slice[i], c->srcFormat, c->vLumFilterSize + MAX_LINES_AHEAD, c->vChrFilterSize + MAX_LINES_AHEAD, c->chrSrcHSubSample, c->chrSrcVSubSample, 0);
if (res < 0) goto cleanup;
res = alloc_lines(&c->slice[i], FFALIGN(c->srcW*2+78, 16), c->srcW);
if (res < 0) goto cleanup;
}
// horizontal scaler output
res = alloc_slice(&c->slice[i], c->srcFormat, c->vLumFilterSize + MAX_LINES_AHEAD, c->vChrFilterSize + MAX_LINES_AHEAD, c->chrDstHSubSample, c->chrDstVSubSample, 1);
if (res < 0) goto cleanup;
res = alloc_lines(&c->slice[i], dst_stride, c->dstW);
if (res < 0) goto cleanup;
 
fill_ones(&c->slice[i], dst_stride>>1, c->dstBpc == 16);
 
// vertical scaler output
++i;
res = alloc_slice(&c->slice[i], c->dstFormat, c->dstH, c->chrDstH, c->chrDstHSubSample, c->chrDstVSubSample, 0);
if (res < 0) goto cleanup;
 
index = 0;
srcIdx = 0;
dstIdx = 1;
 
if (need_gamma) {
res = ff_init_gamma_convert(c->desc + index, c->slice + srcIdx, c->inv_gamma);
if (res < 0) goto cleanup;
++index;
}
 
if (need_lum_conv) {
res = ff_init_desc_fmt_convert(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], pal);
if (res < 0) goto cleanup;
c->desc[index].alpha = c->alpPixBuf != 0;
++index;
srcIdx = dstIdx;
}
 
 
dstIdx = FFMAX(num_ydesc, num_cdesc);
res = ff_init_desc_hscale(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], c->hLumFilter, c->hLumFilterPos, c->hLumFilterSize, c->lumXInc);
if (res < 0) goto cleanup;
c->desc[index].alpha = c->alpPixBuf != 0;
 
 
++index;
{
srcIdx = 0;
dstIdx = 1;
if (need_chr_conv) {
res = ff_init_desc_cfmt_convert(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], pal);
if (res < 0) goto cleanup;
++index;
srcIdx = dstIdx;
}
 
dstIdx = FFMAX(num_ydesc, num_cdesc);
if (c->needs_hcscale)
res = ff_init_desc_chscale(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx], c->hChrFilter, c->hChrFilterPos, c->hChrFilterSize, c->chrXInc);
else
res = ff_init_desc_no_chr(&c->desc[index], &c->slice[srcIdx], &c->slice[dstIdx]);
if (res < 0) goto cleanup;
}
 
++index;
{
srcIdx = c->numSlice - 2;
dstIdx = c->numSlice - 1;
res = ff_init_vscale(c, c->desc + index, c->slice + srcIdx, c->slice + dstIdx);
if (res < 0) goto cleanup;
}
 
++index;
if (need_gamma) {
res = ff_init_gamma_convert(c->desc + index, c->slice + dstIdx, c->gamma);
if (res < 0) goto cleanup;
}
 
return 0;
 
cleanup:
ff_free_filters(c);
return res;
}
 
int ff_free_filters(SwsContext *c)
{
int i;
if (c->desc) {
for (i = 0; i < c->numDesc; ++i)
av_freep(&c->desc[i].instance);
av_freep(&c->desc);
}
 
if (c->slice) {
for (i = 0; i < c->numSlice; ++i)
free_slice(&c->slice[i]);
av_freep(&c->slice);
}
return 0;
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/swscale-3.def
0,0 → 1,38
EXPORTS
DllStartup
sws_addVec
sws_allocVec
sws_alloc_context
sws_alloc_set_opts
sws_cloneVec
sws_context_class DATA
sws_convVec
sws_convertPalette8ToPacked24
sws_convertPalette8ToPacked32
sws_freeContext
sws_freeFilter
sws_freeVec
sws_getCachedContext
sws_getCoefficients
sws_getColorspaceDetails
sws_getConstVec
sws_getContext
sws_getDefaultFilter
sws_getGaussianVec
sws_getIdentityVec
sws_get_class
sws_init_context
sws_isSupportedEndiannessConversion
sws_isSupportedInput
sws_isSupportedOutput
sws_normalizeVec
sws_printVec2
sws_rgb2rgb_init
sws_scale
sws_scaleVec
sws_setColorspaceDetails
sws_shiftVec
sws_subVec
swscale_configuration
swscale_license
swscale_version
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/swscale-3.orig.def
0,0 → 1,38
EXPORTS
DllStartup @1
sws_addVec @2
sws_allocVec @3
sws_alloc_context @4
sws_alloc_set_opts @5
sws_cloneVec @6
sws_context_class @7 DATA
sws_convVec @8
sws_convertPalette8ToPacked24 @9
sws_convertPalette8ToPacked32 @10
sws_freeContext @11
sws_freeFilter @12
sws_freeVec @13
sws_getCachedContext @14
sws_getCoefficients @15
sws_getColorspaceDetails @16
sws_getConstVec @17
sws_getContext @18
sws_getDefaultFilter @19
sws_getGaussianVec @20
sws_getIdentityVec @21
sws_get_class @22
sws_init_context @23
sws_isSupportedEndiannessConversion @24
sws_isSupportedInput @25
sws_isSupportedOutput @26
sws_normalizeVec @27
sws_printVec2 @28
sws_rgb2rgb_init @29
sws_scale @30
sws_scaleVec @31
sws_setColorspaceDetails @32
sws_shiftVec @33
sws_subVec @34
swscale_configuration @35
swscale_license @36
swscale_version @37
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/swscale-test.c
0,0 → 1,417
/*
* Copyright (C) 2003-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <stdarg.h>
 
#undef HAVE_AV_CONFIG_H
#include "libavutil/imgutils.h"
#include "libavutil/mem.h"
#include "libavutil/avutil.h"
#include "libavutil/crc.h"
#include "libavutil/pixdesc.h"
#include "libavutil/lfg.h"
#include "swscale.h"
 
/* HACK Duplicated from swscale_internal.h.
* Should be removed when a cleaner pixel format system exists. */
#define isGray(x) \
((x) == AV_PIX_FMT_GRAY8 || \
(x) == AV_PIX_FMT_YA8 || \
(x) == AV_PIX_FMT_GRAY16BE || \
(x) == AV_PIX_FMT_GRAY16LE || \
(x) == AV_PIX_FMT_YA16BE || \
(x) == AV_PIX_FMT_YA16LE)
#define hasChroma(x) \
(!(isGray(x) || \
(x) == AV_PIX_FMT_MONOBLACK || \
(x) == AV_PIX_FMT_MONOWHITE))
#define isALPHA(x) \
((x) == AV_PIX_FMT_BGR32 || \
(x) == AV_PIX_FMT_BGR32_1 || \
(x) == AV_PIX_FMT_RGB32 || \
(x) == AV_PIX_FMT_RGB32_1 || \
(x) == AV_PIX_FMT_YUVA420P)
 
static uint64_t getSSD(const uint8_t *src1, const uint8_t *src2, int stride1,
int stride2, int w, int h)
{
int x, y;
uint64_t ssd = 0;
 
for (y = 0; y < h; y++) {
for (x = 0; x < w; x++) {
int d = src1[x + y * stride1] - src2[x + y * stride2];
ssd += d * d;
}
}
return ssd;
}
 
struct Results {
uint64_t ssdY;
uint64_t ssdU;
uint64_t ssdV;
uint64_t ssdA;
uint32_t crc;
};
 
// Test by converting ref -> src -> dst -> out, then compare out against ref.
// ref and out are YUVA420P.
static int doTest(uint8_t *ref[4], int refStride[4], int w, int h,
enum AVPixelFormat srcFormat, enum AVPixelFormat dstFormat,
int srcW, int srcH, int dstW, int dstH, int flags,
struct Results *r)
{
const AVPixFmtDescriptor *desc_yuva420p = av_pix_fmt_desc_get(AV_PIX_FMT_YUVA420P);
const AVPixFmtDescriptor *desc_src = av_pix_fmt_desc_get(srcFormat);
const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(dstFormat);
static enum AVPixelFormat cur_srcFormat;
static int cur_srcW, cur_srcH;
static uint8_t *src[4];
static int srcStride[4];
uint8_t *dst[4] = { 0 };
uint8_t *out[4] = { 0 };
int dstStride[4] = {0};
int i;
uint64_t ssdY, ssdU = 0, ssdV = 0, ssdA = 0;
struct SwsContext *dstContext = NULL, *outContext = NULL;
uint32_t crc = 0;
int res = 0;
 
if (cur_srcFormat != srcFormat || cur_srcW != srcW || cur_srcH != srcH) {
struct SwsContext *srcContext = NULL;
int p;
 
for (p = 0; p < 4; p++)
av_freep(&src[p]);
 
av_image_fill_linesizes(srcStride, srcFormat, srcW);
for (p = 0; p < 4; p++) {
srcStride[p] = FFALIGN(srcStride[p], 16);
if (srcStride[p])
src[p] = av_mallocz(srcStride[p] * srcH + 16);
if (srcStride[p] && !src[p]) {
perror("Malloc");
res = -1;
goto end;
}
}
srcContext = sws_getContext(w, h, AV_PIX_FMT_YUVA420P, srcW, srcH,
srcFormat, SWS_BILINEAR, NULL, NULL, NULL);
if (!srcContext) {
fprintf(stderr, "Failed to get %s ---> %s\n",
desc_yuva420p->name,
desc_src->name);
res = -1;
goto end;
}
sws_scale(srcContext, (const uint8_t * const*)ref, refStride, 0, h, src, srcStride);
sws_freeContext(srcContext);
 
cur_srcFormat = srcFormat;
cur_srcW = srcW;
cur_srcH = srcH;
}
 
av_image_fill_linesizes(dstStride, dstFormat, dstW);
for (i = 0; i < 4; i++) {
/* Image buffers passed into libswscale can be allocated any way you
* prefer, as long as they're aligned enough for the architecture, and
* they're freed appropriately (such as using av_free for buffers
* allocated with av_malloc). */
/* An extra 16 bytes is being allocated because some scalers may write
* out of bounds. */
dstStride[i] = FFALIGN(dstStride[i], 16);
if (dstStride[i])
dst[i] = av_mallocz(dstStride[i] * dstH + 16);
if (dstStride[i] && !dst[i]) {
perror("Malloc");
res = -1;
 
goto end;
}
}
 
dstContext = sws_getContext(srcW, srcH, srcFormat, dstW, dstH, dstFormat,
flags, NULL, NULL, NULL);
if (!dstContext) {
fprintf(stderr, "Failed to get %s ---> %s\n",
desc_src->name, desc_dst->name);
res = -1;
goto end;
}
 
printf(" %s %dx%d -> %s %3dx%3d flags=%2d",
desc_src->name, srcW, srcH,
desc_dst->name, dstW, dstH,
flags);
fflush(stdout);
 
sws_scale(dstContext, (const uint8_t * const*)src, srcStride, 0, srcH, dst, dstStride);
 
for (i = 0; i < 4 && dstStride[i]; i++)
crc = av_crc(av_crc_get_table(AV_CRC_32_IEEE), crc, dst[i],
dstStride[i] * dstH);
 
if (r && crc == r->crc) {
ssdY = r->ssdY;
ssdU = r->ssdU;
ssdV = r->ssdV;
ssdA = r->ssdA;
} else {
for (i = 0; i < 4; i++) {
refStride[i] = FFALIGN(refStride[i], 16);
if (refStride[i])
out[i] = av_mallocz(refStride[i] * h);
if (refStride[i] && !out[i]) {
perror("Malloc");
res = -1;
goto end;
}
}
outContext = sws_getContext(dstW, dstH, dstFormat, w, h,
AV_PIX_FMT_YUVA420P, SWS_BILINEAR,
NULL, NULL, NULL);
if (!outContext) {
fprintf(stderr, "Failed to get %s ---> %s\n",
desc_dst->name,
desc_yuva420p->name);
res = -1;
goto end;
}
sws_scale(outContext, (const uint8_t * const*)dst, dstStride, 0, dstH, out, refStride);
 
ssdY = getSSD(ref[0], out[0], refStride[0], refStride[0], w, h);
if (hasChroma(srcFormat) && hasChroma(dstFormat)) {
//FIXME check that output is really gray
ssdU = getSSD(ref[1], out[1], refStride[1], refStride[1],
(w + 1) >> 1, (h + 1) >> 1);
ssdV = getSSD(ref[2], out[2], refStride[2], refStride[2],
(w + 1) >> 1, (h + 1) >> 1);
}
if (isALPHA(srcFormat) && isALPHA(dstFormat))
ssdA = getSSD(ref[3], out[3], refStride[3], refStride[3], w, h);
 
ssdY /= w * h;
ssdU /= w * h / 4;
ssdV /= w * h / 4;
ssdA /= w * h;
 
sws_freeContext(outContext);
 
for (i = 0; i < 4; i++)
if (refStride[i])
av_free(out[i]);
}
 
printf(" CRC=%08x SSD=%5"PRId64 ",%5"PRId64 ",%5"PRId64 ",%5"PRId64 "\n",
crc, ssdY, ssdU, ssdV, ssdA);
 
end:
sws_freeContext(dstContext);
 
for (i = 0; i < 4; i++)
if (dstStride[i])
av_free(dst[i]);
 
return res;
}
 
static void selfTest(uint8_t *ref[4], int refStride[4], int w, int h,
enum AVPixelFormat srcFormat_in,
enum AVPixelFormat dstFormat_in)
{
const int flags[] = { SWS_FAST_BILINEAR, SWS_BILINEAR, SWS_BICUBIC,
SWS_X, SWS_POINT, SWS_AREA, 0 };
const int srcW = w;
const int srcH = h;
const int dstW[] = { srcW - srcW / 3, srcW, srcW + srcW / 3, 0 };
const int dstH[] = { srcH - srcH / 3, srcH, srcH + srcH / 3, 0 };
enum AVPixelFormat srcFormat, dstFormat;
const AVPixFmtDescriptor *desc_src, *desc_dst;
 
for (srcFormat = srcFormat_in != AV_PIX_FMT_NONE ? srcFormat_in : 0;
srcFormat < AV_PIX_FMT_NB; srcFormat++) {
if (!sws_isSupportedInput(srcFormat) ||
!sws_isSupportedOutput(srcFormat))
continue;
 
desc_src = av_pix_fmt_desc_get(srcFormat);
 
for (dstFormat = dstFormat_in != AV_PIX_FMT_NONE ? dstFormat_in : 0;
dstFormat < AV_PIX_FMT_NB; dstFormat++) {
int i, j, k;
int res = 0;
 
if (!sws_isSupportedInput(dstFormat) ||
!sws_isSupportedOutput(dstFormat))
continue;
 
desc_dst = av_pix_fmt_desc_get(dstFormat);
 
printf("%s -> %s\n", desc_src->name, desc_dst->name);
fflush(stdout);
 
for (k = 0; flags[k] && !res; k++)
for (i = 0; dstW[i] && !res; i++)
for (j = 0; dstH[j] && !res; j++)
res = doTest(ref, refStride, w, h,
srcFormat, dstFormat,
srcW, srcH, dstW[i], dstH[j], flags[k],
NULL);
if (dstFormat_in != AV_PIX_FMT_NONE)
break;
}
if (srcFormat_in != AV_PIX_FMT_NONE)
break;
}
}
 
static int fileTest(uint8_t *ref[4], int refStride[4], int w, int h, FILE *fp,
enum AVPixelFormat srcFormat_in,
enum AVPixelFormat dstFormat_in)
{
char buf[256];
 
while (fgets(buf, sizeof(buf), fp)) {
struct Results r;
enum AVPixelFormat srcFormat;
char srcStr[12];
int srcW, srcH;
enum AVPixelFormat dstFormat;
char dstStr[12];
int dstW, dstH;
int flags;
int ret;
 
ret = sscanf(buf,
" %12s %dx%d -> %12s %dx%d flags=%d CRC=%x"
" SSD=%"SCNd64 ", %"SCNd64 ", %"SCNd64 ", %"SCNd64 "\n",
srcStr, &srcW, &srcH, dstStr, &dstW, &dstH,
&flags, &r.crc, &r.ssdY, &r.ssdU, &r.ssdV, &r.ssdA);
if (ret != 12) {
srcStr[0] = dstStr[0] = 0;
ret = sscanf(buf, "%12s -> %12s\n", srcStr, dstStr);
}
 
srcFormat = av_get_pix_fmt(srcStr);
dstFormat = av_get_pix_fmt(dstStr);
 
if (srcFormat == AV_PIX_FMT_NONE || dstFormat == AV_PIX_FMT_NONE ||
srcW > 8192U || srcH > 8192U || dstW > 8192U || dstH > 8192U) {
fprintf(stderr, "malformed input file\n");
return -1;
}
if ((srcFormat_in != AV_PIX_FMT_NONE && srcFormat_in != srcFormat) ||
(dstFormat_in != AV_PIX_FMT_NONE && dstFormat_in != dstFormat))
continue;
if (ret != 12) {
printf("%s", buf);
continue;
}
 
doTest(ref, refStride, w, h,
srcFormat, dstFormat,
srcW, srcH, dstW, dstH, flags,
&r);
}
 
return 0;
}
 
#define W 96
#define H 96
 
int main(int argc, char **argv)
{
enum AVPixelFormat srcFormat = AV_PIX_FMT_NONE;
enum AVPixelFormat dstFormat = AV_PIX_FMT_NONE;
uint8_t *rgb_data = av_malloc(W * H * 4);
const uint8_t * const rgb_src[4] = { rgb_data, NULL, NULL, NULL };
int rgb_stride[4] = { 4 * W, 0, 0, 0 };
uint8_t *data = av_malloc(4 * W * H);
uint8_t *src[4] = { data, data + W * H, data + W * H * 2, data + W * H * 3 };
int stride[4] = { W, W, W, W };
int x, y;
struct SwsContext *sws;
AVLFG rand;
int res = -1;
int i;
FILE *fp = NULL;
 
if (!rgb_data || !data)
return -1;
 
for (i = 1; i < argc; i += 2) {
if (argv[i][0] != '-' || i + 1 == argc)
goto bad_option;
if (!strcmp(argv[i], "-ref")) {
fp = fopen(argv[i + 1], "r");
if (!fp) {
fprintf(stderr, "could not open '%s'\n", argv[i + 1]);
goto error;
}
} else if (!strcmp(argv[i], "-src")) {
srcFormat = av_get_pix_fmt(argv[i + 1]);
if (srcFormat == AV_PIX_FMT_NONE) {
fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
return -1;
}
} else if (!strcmp(argv[i], "-dst")) {
dstFormat = av_get_pix_fmt(argv[i + 1]);
if (dstFormat == AV_PIX_FMT_NONE) {
fprintf(stderr, "invalid pixel format %s\n", argv[i + 1]);
return -1;
}
} else {
bad_option:
fprintf(stderr, "bad option or argument missing (%s)\n", argv[i]);
goto error;
}
}
 
sws = sws_getContext(W / 12, H / 12, AV_PIX_FMT_RGB32, W, H,
AV_PIX_FMT_YUVA420P, SWS_BILINEAR, NULL, NULL, NULL);
 
av_lfg_init(&rand, 1);
 
for (y = 0; y < H; y++)
for (x = 0; x < W * 4; x++)
rgb_data[ x + y * 4 * W] = av_lfg_get(&rand);
sws_scale(sws, rgb_src, rgb_stride, 0, H / 12, src, stride);
sws_freeContext(sws);
av_free(rgb_data);
 
    if (fp) {
res = fileTest(src, stride, W, H, fp, srcFormat, dstFormat);
fclose(fp);
} else {
selfTest(src, stride, W, H, srcFormat, dstFormat);
res = 0;
}
error:
av_free(data);
 
return res;
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/swscale.c
0,0 → 1,1276
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
 
#include "libavutil/avassert.h"
#include "libavutil/avutil.h"
#include "libavutil/bswap.h"
#include "libavutil/cpu.h"
#include "libavutil/imgutils.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/mathematics.h"
#include "libavutil/pixdesc.h"
#include "config.h"
#include "rgb2rgb.h"
#include "swscale_internal.h"
#include "swscale.h"
 
DECLARE_ALIGNED(8, const uint8_t, ff_dither_8x8_128)[9][8] = {
{ 36, 68, 60, 92, 34, 66, 58, 90, },
{ 100, 4, 124, 28, 98, 2, 122, 26, },
{ 52, 84, 44, 76, 50, 82, 42, 74, },
{ 116, 20, 108, 12, 114, 18, 106, 10, },
{ 32, 64, 56, 88, 38, 70, 62, 94, },
{ 96, 0, 120, 24, 102, 6, 126, 30, },
{ 48, 80, 40, 72, 54, 86, 46, 78, },
{ 112, 16, 104, 8, 118, 22, 110, 14, },
{ 36, 68, 60, 92, 34, 66, 58, 90, },
};
 
DECLARE_ALIGNED(8, static const uint8_t, sws_pb_64)[8] = {
64, 64, 64, 64, 64, 64, 64, 64
};
 
static void gamma_convert(uint8_t * src[], int width, uint16_t *gamma)
{
int i;
uint16_t *src1 = (uint16_t*)src[0];
 
for (i = 0; i < width; ++i) {
uint16_t r = AV_RL16(src1 + i*4 + 0);
uint16_t g = AV_RL16(src1 + i*4 + 1);
uint16_t b = AV_RL16(src1 + i*4 + 2);
 
AV_WL16(src1 + i*4 + 0, gamma[r]);
AV_WL16(src1 + i*4 + 1, gamma[g]);
AV_WL16(src1 + i*4 + 2, gamma[b]);
}
}
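
/*
 * gamma_convert() above assumes packed 16-bit-per-component pixels stored
 * little-endian with four components each (e.g. RGBA64LE): the gamma table
 * acts as a 64K-entry LUT over R, G and B, while the fourth component is
 * left untouched.
 */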
 
static av_always_inline void fillPlane(uint8_t *plane, int stride, int width,
int height, int y, uint8_t val)
{
int i;
uint8_t *ptr = plane + stride * y;
for (i = 0; i < height; i++) {
memset(ptr, val, width);
ptr += stride;
}
}
 
static void hScale16To19_c(SwsContext *c, int16_t *_dst, int dstW,
const uint8_t *_src, const int16_t *filter,
const int32_t *filterPos, int filterSize)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
int i;
int32_t *dst = (int32_t *) _dst;
const uint16_t *src = (const uint16_t *) _src;
int bits = desc->comp[0].depth_minus1;
int sh = bits - 4;
 
if((isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8) && desc->comp[0].depth_minus1<15)
sh= 9;
 
for (i = 0; i < dstW; i++) {
int j;
int srcPos = filterPos[i];
int val = 0;
 
for (j = 0; j < filterSize; j++) {
val += src[srcPos + j] * filter[filterSize * i + j];
}
        // filter=14 bit, input=16 bit, output=30 bit; >> sh (11 for 16-bit input) makes 19 bit
dst[i] = FFMIN(val >> sh, (1 << 19) - 1);
}
}
 
static void hScale16To15_c(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *_src, const int16_t *filter,
const int32_t *filterPos, int filterSize)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
int i;
const uint16_t *src = (const uint16_t *) _src;
int sh = desc->comp[0].depth_minus1;
 
if(sh<15)
sh= isAnyRGB(c->srcFormat) || c->srcFormat==AV_PIX_FMT_PAL8 ? 13 : desc->comp[0].depth_minus1;
 
for (i = 0; i < dstW; i++) {
int j;
int srcPos = filterPos[i];
int val = 0;
 
for (j = 0; j < filterSize; j++) {
val += src[srcPos + j] * filter[filterSize * i + j];
}
        // filter=14 bit, input=16 bit, output=30 bit; >> sh (15 for 16-bit input) makes 15 bit
dst[i] = FFMIN(val >> sh, (1 << 15) - 1);
}
}
 
// bilinear / bicubic scaling
static void hScale8To15_c(SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize)
{
int i;
for (i = 0; i < dstW; i++) {
int j;
int srcPos = filterPos[i];
int val = 0;
for (j = 0; j < filterSize; j++) {
val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
}
dst[i] = FFMIN(val >> 7, (1 << 15) - 1); // the cubic equation does overflow ...
}
}
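
// Bit-depth bookkeeping for the 8-bit variants above and below, following
// the comments in the 16-bit versions: with 14-bit filter coefficients,
// 8-bit input gives roughly 22-bit sums, so >> 7 yields 15 bits and
// >> 3 yields 19 bits before the FFMIN clamp.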
 
static void hScale8To19_c(SwsContext *c, int16_t *_dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize)
{
int i;
int32_t *dst = (int32_t *) _dst;
for (i = 0; i < dstW; i++) {
int j;
int srcPos = filterPos[i];
int val = 0;
for (j = 0; j < filterSize; j++) {
val += ((int)src[srcPos + j]) * filter[filterSize * i + j];
}
dst[i] = FFMIN(val >> 3, (1 << 19) - 1); // the cubic equation does overflow ...
}
}
 
// FIXME all pal and rgb srcFormats could do this conversion as well
// FIXME all scalers more complex than bilinear could do half of this transform
static void chrRangeToJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
int i;
for (i = 0; i < width; i++) {
dstU[i] = (FFMIN(dstU[i], 30775) * 4663 - 9289992) >> 12; // -264
dstV[i] = (FFMIN(dstV[i], 30775) * 4663 - 9289992) >> 12; // -264
}
}
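
/*
 * Derivation of the constants above: the samples are 15-bit (8-bit values
 * scaled by 128), and 4663 / 4096 ~= 255 / 224 stretches the limited
 * chroma width onto the full JPEG range. The bias keeps the grey point
 * fixed: for x = 128 * 128 = 16384,
 * (16384 * 4663 - 9289992) >> 12 = 16383, i.e. the center is preserved
 * to within rounding.
 */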
 
static void chrRangeFromJpeg_c(int16_t *dstU, int16_t *dstV, int width)
{
int i;
for (i = 0; i < width; i++) {
dstU[i] = (dstU[i] * 1799 + 4081085) >> 11; // 1469
dstV[i] = (dstV[i] * 1799 + 4081085) >> 11; // 1469
}
}
 
static void lumRangeToJpeg_c(int16_t *dst, int width)
{
int i;
for (i = 0; i < width; i++)
dst[i] = (FFMIN(dst[i], 30189) * 19077 - 39057361) >> 14;
}
 
static void lumRangeFromJpeg_c(int16_t *dst, int width)
{
int i;
for (i = 0; i < width; i++)
dst[i] = (dst[i] * 14071 + 33561947) >> 14;
}
 
static void chrRangeToJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
int i;
int32_t *dstU = (int32_t *) _dstU;
int32_t *dstV = (int32_t *) _dstV;
for (i = 0; i < width; i++) {
dstU[i] = (FFMIN(dstU[i], 30775 << 4) * 4663 - (9289992 << 4)) >> 12; // -264
dstV[i] = (FFMIN(dstV[i], 30775 << 4) * 4663 - (9289992 << 4)) >> 12; // -264
}
}
 
static void chrRangeFromJpeg16_c(int16_t *_dstU, int16_t *_dstV, int width)
{
int i;
int32_t *dstU = (int32_t *) _dstU;
int32_t *dstV = (int32_t *) _dstV;
for (i = 0; i < width; i++) {
dstU[i] = (dstU[i] * 1799 + (4081085 << 4)) >> 11; // 1469
dstV[i] = (dstV[i] * 1799 + (4081085 << 4)) >> 11; // 1469
}
}
 
static void lumRangeToJpeg16_c(int16_t *_dst, int width)
{
int i;
int32_t *dst = (int32_t *) _dst;
for (i = 0; i < width; i++) {
dst[i] = ((int)(FFMIN(dst[i], 30189 << 4) * 4769U - (39057361 << 2))) >> 12;
}
}
 
static void lumRangeFromJpeg16_c(int16_t *_dst, int width)
{
int i;
int32_t *dst = (int32_t *) _dst;
for (i = 0; i < width; i++)
dst[i] = (dst[i]*(14071/4) + (33561947<<4)/4)>>12;
}
 
// *** horizontal scale Y line to temp buffer
static av_always_inline void hyscale(SwsContext *c, int16_t *dst, int dstWidth,
const uint8_t *src_in[4],
int srcW, int xInc,
const int16_t *hLumFilter,
const int32_t *hLumFilterPos,
int hLumFilterSize,
uint8_t *formatConvBuffer,
uint32_t *pal, int isAlpha)
{
void (*toYV12)(uint8_t *, const uint8_t *, const uint8_t *, const uint8_t *, int, uint32_t *) =
isAlpha ? c->alpToYV12 : c->lumToYV12;
void (*convertRange)(int16_t *, int) = isAlpha ? NULL : c->lumConvertRange;
const uint8_t *src = src_in[isAlpha ? 3 : 0];
 
if (toYV12) {
toYV12(formatConvBuffer, src, src_in[1], src_in[2], srcW, pal);
src = formatConvBuffer;
} else if (c->readLumPlanar && !isAlpha) {
c->readLumPlanar(formatConvBuffer, src_in, srcW, c->input_rgb2yuv_table);
src = formatConvBuffer;
} else if (c->readAlpPlanar && isAlpha) {
c->readAlpPlanar(formatConvBuffer, src_in, srcW, NULL);
src = formatConvBuffer;
}
 
if (!c->hyscale_fast) {
c->hyScale(c, dst, dstWidth, src, hLumFilter,
hLumFilterPos, hLumFilterSize);
} else { // fast bilinear upscale / crap downscale
c->hyscale_fast(c, dst, dstWidth, src, srcW, xInc);
}
 
if (convertRange)
convertRange(dst, dstWidth);
}
 
static av_always_inline void hcscale(SwsContext *c, int16_t *dst1,
int16_t *dst2, int dstWidth,
const uint8_t *src_in[4],
int srcW, int xInc,
const int16_t *hChrFilter,
const int32_t *hChrFilterPos,
int hChrFilterSize,
uint8_t *formatConvBuffer, uint32_t *pal)
{
const uint8_t *src1 = src_in[1], *src2 = src_in[2];
if (c->chrToYV12) {
uint8_t *buf2 = formatConvBuffer +
FFALIGN(srcW*2+78, 16);
c->chrToYV12(formatConvBuffer, buf2, src_in[0], src1, src2, srcW, pal);
src1= formatConvBuffer;
src2= buf2;
} else if (c->readChrPlanar) {
uint8_t *buf2 = formatConvBuffer +
FFALIGN(srcW*2+78, 16);
c->readChrPlanar(formatConvBuffer, buf2, src_in, srcW, c->input_rgb2yuv_table);
src1 = formatConvBuffer;
src2 = buf2;
}
 
if (!c->hcscale_fast) {
c->hcScale(c, dst1, dstWidth, src1, hChrFilter, hChrFilterPos, hChrFilterSize);
c->hcScale(c, dst2, dstWidth, src2, hChrFilter, hChrFilterPos, hChrFilterSize);
} else { // fast bilinear upscale / crap downscale
c->hcscale_fast(c, dst1, dst2, dstWidth, src1, src2, srcW, xInc);
}
 
if (c->chrConvertRange)
c->chrConvertRange(dst1, dst2, dstWidth);
}
 
#define DEBUG_SWSCALE_BUFFERS 0
#define DEBUG_BUFFERS(...) \
if (DEBUG_SWSCALE_BUFFERS) \
av_log(c, AV_LOG_DEBUG, __VA_ARGS__)
 
static int swscale(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY,
int srcSliceH, uint8_t *dst[], int dstStride[])
{
    /* load a few things into local vars to make the code more readable
     * and faster */
#ifndef NEW_FILTER
const int srcW = c->srcW;
#endif
const int dstW = c->dstW;
const int dstH = c->dstH;
#ifndef NEW_FILTER
const int chrDstW = c->chrDstW;
const int chrSrcW = c->chrSrcW;
const int lumXInc = c->lumXInc;
const int chrXInc = c->chrXInc;
#endif
const enum AVPixelFormat dstFormat = c->dstFormat;
const int flags = c->flags;
int32_t *vLumFilterPos = c->vLumFilterPos;
int32_t *vChrFilterPos = c->vChrFilterPos;
#ifndef NEW_FILTER
int32_t *hLumFilterPos = c->hLumFilterPos;
int32_t *hChrFilterPos = c->hChrFilterPos;
int16_t *hLumFilter = c->hLumFilter;
int16_t *hChrFilter = c->hChrFilter;
int32_t *lumMmxFilter = c->lumMmxFilter;
int32_t *chrMmxFilter = c->chrMmxFilter;
#endif
const int vLumFilterSize = c->vLumFilterSize;
const int vChrFilterSize = c->vChrFilterSize;
#ifndef NEW_FILTER
const int hLumFilterSize = c->hLumFilterSize;
const int hChrFilterSize = c->hChrFilterSize;
int16_t **lumPixBuf = c->lumPixBuf;
int16_t **chrUPixBuf = c->chrUPixBuf;
int16_t **chrVPixBuf = c->chrVPixBuf;
#endif
int16_t **alpPixBuf = c->alpPixBuf;
const int vLumBufSize = c->vLumBufSize;
const int vChrBufSize = c->vChrBufSize;
#ifndef NEW_FILTER
uint8_t *formatConvBuffer = c->formatConvBuffer;
uint32_t *pal = c->pal_yuv;
int perform_gamma = c->is_internal_gamma;
#endif
yuv2planar1_fn yuv2plane1 = c->yuv2plane1;
yuv2planarX_fn yuv2planeX = c->yuv2planeX;
yuv2interleavedX_fn yuv2nv12cX = c->yuv2nv12cX;
yuv2packed1_fn yuv2packed1 = c->yuv2packed1;
yuv2packed2_fn yuv2packed2 = c->yuv2packed2;
yuv2packedX_fn yuv2packedX = c->yuv2packedX;
yuv2anyX_fn yuv2anyX = c->yuv2anyX;
const int chrSrcSliceY = srcSliceY >> c->chrSrcVSubSample;
const int chrSrcSliceH = FF_CEIL_RSHIFT(srcSliceH, c->chrSrcVSubSample);
int should_dither = is9_OR_10BPS(c->srcFormat) ||
is16BPS(c->srcFormat);
int lastDstY;
 
/* vars which will change and which we need to store back in the context */
int dstY = c->dstY;
int lumBufIndex = c->lumBufIndex;
int chrBufIndex = c->chrBufIndex;
int lastInLumBuf = c->lastInLumBuf;
int lastInChrBuf = c->lastInChrBuf;
 
#ifdef NEW_FILTER
int lumStart = 0;
int lumEnd = c->descIndex[0];
int chrStart = lumEnd;
int chrEnd = c->descIndex[1];
int vStart = chrEnd;
int vEnd = c->numDesc;
SwsSlice *src_slice = &c->slice[lumStart];
SwsSlice *hout_slice = &c->slice[c->numSlice-2];
SwsSlice *vout_slice = &c->slice[c->numSlice-1];
SwsFilterDescriptor *desc = c->desc;
 
int hasLumHoles = 1;
int hasChrHoles = 1;
#endif
 
#ifndef NEW_FILTER
if (!usePal(c->srcFormat)) {
pal = c->input_rgb2yuv_table;
}
#endif
 
if (isPacked(c->srcFormat)) {
src[0] =
src[1] =
src[2] =
src[3] = src[0];
srcStride[0] =
srcStride[1] =
srcStride[2] =
srcStride[3] = srcStride[0];
}
srcStride[1] <<= c->vChrDrop;
srcStride[2] <<= c->vChrDrop;
 
DEBUG_BUFFERS("swscale() %p[%d] %p[%d] %p[%d] %p[%d] -> %p[%d] %p[%d] %p[%d] %p[%d]\n",
src[0], srcStride[0], src[1], srcStride[1],
src[2], srcStride[2], src[3], srcStride[3],
dst[0], dstStride[0], dst[1], dstStride[1],
dst[2], dstStride[2], dst[3], dstStride[3]);
DEBUG_BUFFERS("srcSliceY: %d srcSliceH: %d dstY: %d dstH: %d\n",
srcSliceY, srcSliceH, dstY, dstH);
DEBUG_BUFFERS("vLumFilterSize: %d vLumBufSize: %d vChrFilterSize: %d vChrBufSize: %d\n",
vLumFilterSize, vLumBufSize, vChrFilterSize, vChrBufSize);
 
if (dstStride[0]&15 || dstStride[1]&15 ||
dstStride[2]&15 || dstStride[3]&15) {
static int warnedAlready = 0; // FIXME maybe move this into the context
if (flags & SWS_PRINT_INFO && !warnedAlready) {
av_log(c, AV_LOG_WARNING,
"Warning: dstStride is not aligned!\n"
" ->cannot do aligned memory accesses anymore\n");
warnedAlready = 1;
}
}
 
if ( (uintptr_t)dst[0]&15 || (uintptr_t)dst[1]&15 || (uintptr_t)dst[2]&15
|| (uintptr_t)src[0]&15 || (uintptr_t)src[1]&15 || (uintptr_t)src[2]&15
|| dstStride[0]&15 || dstStride[1]&15 || dstStride[2]&15 || dstStride[3]&15
|| srcStride[0]&15 || srcStride[1]&15 || srcStride[2]&15 || srcStride[3]&15
) {
static int warnedAlready=0;
int cpu_flags = av_get_cpu_flags();
if (HAVE_MMXEXT && (cpu_flags & AV_CPU_FLAG_SSE2) && !warnedAlready){
av_log(c, AV_LOG_WARNING, "Warning: data is not aligned! This can lead to a speedloss\n");
warnedAlready=1;
}
}
 
/* Note that the user might start scaling in the middle of the picture,
* so this will not get executed. This is not really intended, but it
* currently works, so people might rely on it. */
if (srcSliceY == 0) {
lumBufIndex = -1;
chrBufIndex = -1;
dstY = 0;
lastInLumBuf = -1;
lastInChrBuf = -1;
}
 
if (!should_dither) {
c->chrDither8 = c->lumDither8 = sws_pb_64;
}
lastDstY = dstY;
 
#ifdef NEW_FILTER
ff_init_vscale_pfn(c, yuv2plane1, yuv2planeX, yuv2nv12cX,
yuv2packed1, yuv2packed2, yuv2packedX, yuv2anyX, c->use_mmx_vfilter);
 
ff_init_slice_from_src(src_slice, (uint8_t**)src, srcStride, c->srcW,
srcSliceY, srcSliceH, chrSrcSliceY, chrSrcSliceH, 1);
 
ff_init_slice_from_src(vout_slice, (uint8_t**)dst, dstStride, c->dstW,
dstY, dstH, dstY >> c->chrDstVSubSample,
FF_CEIL_RSHIFT(dstH, c->chrDstVSubSample), 0);
if (srcSliceY == 0) {
hout_slice->plane[0].sliceY = lastInLumBuf + 1;
hout_slice->plane[1].sliceY = lastInChrBuf + 1;
hout_slice->plane[2].sliceY = lastInChrBuf + 1;
hout_slice->plane[3].sliceY = lastInLumBuf + 1;
 
hout_slice->plane[0].sliceH =
hout_slice->plane[1].sliceH =
hout_slice->plane[2].sliceH =
hout_slice->plane[3].sliceH = 0;
hout_slice->width = dstW;
}
#endif
 
for (; dstY < dstH; dstY++) {
const int chrDstY = dstY >> c->chrDstVSubSample;
#ifndef NEW_FILTER
uint8_t *dest[4] = {
dst[0] + dstStride[0] * dstY,
dst[1] + dstStride[1] * chrDstY,
dst[2] + dstStride[2] * chrDstY,
(CONFIG_SWSCALE_ALPHA && alpPixBuf) ? dst[3] + dstStride[3] * dstY : NULL,
};
#endif
int use_mmx_vfilter= c->use_mmx_vfilter;
 
// First line needed as input
const int firstLumSrcY = FFMAX(1 - vLumFilterSize, vLumFilterPos[dstY]);
const int firstLumSrcY2 = FFMAX(1 - vLumFilterSize, vLumFilterPos[FFMIN(dstY | ((1 << c->chrDstVSubSample) - 1), dstH - 1)]);
// First line needed as input
const int firstChrSrcY = FFMAX(1 - vChrFilterSize, vChrFilterPos[chrDstY]);
 
// Last line needed as input
int lastLumSrcY = FFMIN(c->srcH, firstLumSrcY + vLumFilterSize) - 1;
int lastLumSrcY2 = FFMIN(c->srcH, firstLumSrcY2 + vLumFilterSize) - 1;
int lastChrSrcY = FFMIN(c->chrSrcH, firstChrSrcY + vChrFilterSize) - 1;
int enough_lines;
#ifdef NEW_FILTER
int i;
int posY, cPosY, firstPosY, lastPosY, firstCPosY, lastCPosY;
#endif
 
// handle holes (FAST_BILINEAR & weird filters)
if (firstLumSrcY > lastInLumBuf) {
#ifdef NEW_FILTER
hasLumHoles = lastInLumBuf != firstLumSrcY - 1;
if (hasLumHoles) {
hout_slice->plane[0].sliceY = firstLumSrcY;
hout_slice->plane[3].sliceY = firstLumSrcY;
hout_slice->plane[0].sliceH =
hout_slice->plane[3].sliceH = 0;
}
#endif
lastInLumBuf = firstLumSrcY - 1;
}
if (firstChrSrcY > lastInChrBuf) {
#ifdef NEW_FILTER
hasChrHoles = lastInChrBuf != firstChrSrcY - 1;
if (hasChrHoles) {
hout_slice->plane[1].sliceY = firstChrSrcY;
hout_slice->plane[2].sliceY = firstChrSrcY;
hout_slice->plane[1].sliceH =
hout_slice->plane[2].sliceH = 0;
}
#endif
lastInChrBuf = firstChrSrcY - 1;
}
av_assert0(firstLumSrcY >= lastInLumBuf - vLumBufSize + 1);
av_assert0(firstChrSrcY >= lastInChrBuf - vChrBufSize + 1);
 
DEBUG_BUFFERS("dstY: %d\n", dstY);
DEBUG_BUFFERS("\tfirstLumSrcY: %d lastLumSrcY: %d lastInLumBuf: %d\n",
firstLumSrcY, lastLumSrcY, lastInLumBuf);
DEBUG_BUFFERS("\tfirstChrSrcY: %d lastChrSrcY: %d lastInChrBuf: %d\n",
firstChrSrcY, lastChrSrcY, lastInChrBuf);
 
// Do we have enough lines in this slice to output the dstY line
enough_lines = lastLumSrcY2 < srcSliceY + srcSliceH &&
lastChrSrcY < FF_CEIL_RSHIFT(srcSliceY + srcSliceH, c->chrSrcVSubSample);
 
if (!enough_lines) {
lastLumSrcY = srcSliceY + srcSliceH - 1;
lastChrSrcY = chrSrcSliceY + chrSrcSliceH - 1;
DEBUG_BUFFERS("buffering slice: lastLumSrcY %d lastChrSrcY %d\n",
lastLumSrcY, lastChrSrcY);
}
 
#ifdef NEW_FILTER
posY = hout_slice->plane[0].sliceY + hout_slice->plane[0].sliceH;
if (posY <= lastLumSrcY && !hasLumHoles) {
firstPosY = FFMAX(firstLumSrcY, posY);
lastPosY = FFMIN(lastLumSrcY + MAX_LINES_AHEAD, srcSliceY + srcSliceH - 1);
} else {
firstPosY = lastInLumBuf + 1;
lastPosY = lastLumSrcY;
}
 
cPosY = hout_slice->plane[1].sliceY + hout_slice->plane[1].sliceH;
if (cPosY <= lastChrSrcY && !hasChrHoles) {
firstCPosY = FFMAX(firstChrSrcY, cPosY);
lastCPosY = FFMIN(lastChrSrcY + MAX_LINES_AHEAD, FF_CEIL_RSHIFT(srcSliceY + srcSliceH, c->chrSrcVSubSample) - 1);
} else {
firstCPosY = lastInChrBuf + 1;
lastCPosY = lastChrSrcY;
}
 
ff_rotate_slice(hout_slice, lastPosY, lastCPosY);
 
if (posY < lastLumSrcY + 1) {
for (i = lumStart; i < lumEnd; ++i)
desc[i].process(c, &desc[i], firstPosY, lastPosY - firstPosY + 1);
}
 
lumBufIndex += lastLumSrcY - lastInLumBuf;
lastInLumBuf = lastLumSrcY;
 
if (cPosY < lastChrSrcY + 1) {
for (i = chrStart; i < chrEnd; ++i)
desc[i].process(c, &desc[i], firstCPosY, lastCPosY - firstCPosY + 1);
}
 
chrBufIndex += lastChrSrcY - lastInChrBuf;
lastInChrBuf = lastChrSrcY;
 
#else
// Do horizontal scaling
while (lastInLumBuf < lastLumSrcY) {
const uint8_t *src1[4] = {
src[0] + (lastInLumBuf + 1 - srcSliceY) * srcStride[0],
src[1] + (lastInLumBuf + 1 - srcSliceY) * srcStride[1],
src[2] + (lastInLumBuf + 1 - srcSliceY) * srcStride[2],
src[3] + (lastInLumBuf + 1 - srcSliceY) * srcStride[3],
};
lumBufIndex++;
av_assert0(lumBufIndex < 2 * vLumBufSize);
av_assert0(lastInLumBuf + 1 - srcSliceY < srcSliceH);
av_assert0(lastInLumBuf + 1 - srcSliceY >= 0);
 
if (perform_gamma)
gamma_convert((uint8_t **)src1, srcW, c->inv_gamma);
 
hyscale(c, lumPixBuf[lumBufIndex], dstW, src1, srcW, lumXInc,
hLumFilter, hLumFilterPos, hLumFilterSize,
formatConvBuffer, pal, 0);
if (CONFIG_SWSCALE_ALPHA && alpPixBuf)
hyscale(c, alpPixBuf[lumBufIndex], dstW, src1, srcW,
lumXInc, hLumFilter, hLumFilterPos, hLumFilterSize,
formatConvBuffer, pal, 1);
lastInLumBuf++;
DEBUG_BUFFERS("\t\tlumBufIndex %d: lastInLumBuf: %d\n",
lumBufIndex, lastInLumBuf);
}
while (lastInChrBuf < lastChrSrcY) {
const uint8_t *src1[4] = {
src[0] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[0],
src[1] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[1],
src[2] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[2],
src[3] + (lastInChrBuf + 1 - chrSrcSliceY) * srcStride[3],
};
chrBufIndex++;
av_assert0(chrBufIndex < 2 * vChrBufSize);
av_assert0(lastInChrBuf + 1 - chrSrcSliceY < (chrSrcSliceH));
av_assert0(lastInChrBuf + 1 - chrSrcSliceY >= 0);
// FIXME replace parameters through context struct (some at least)
 
if (c->needs_hcscale)
hcscale(c, chrUPixBuf[chrBufIndex], chrVPixBuf[chrBufIndex],
chrDstW, src1, chrSrcW, chrXInc,
hChrFilter, hChrFilterPos, hChrFilterSize,
formatConvBuffer, pal);
lastInChrBuf++;
DEBUG_BUFFERS("\t\tchrBufIndex %d: lastInChrBuf: %d\n",
chrBufIndex, lastInChrBuf);
}
#endif
// wrap buf index around to stay inside the ring buffer
if (lumBufIndex >= vLumBufSize)
lumBufIndex -= vLumBufSize;
if (chrBufIndex >= vChrBufSize)
chrBufIndex -= vChrBufSize;
if (!enough_lines)
break; // we can't output a dstY line so let's try with the next slice
 
#if HAVE_MMX_INLINE
ff_updateMMXDitherTables(c, dstY, lumBufIndex, chrBufIndex,
lastInLumBuf, lastInChrBuf);
#endif
if (should_dither) {
c->chrDither8 = ff_dither_8x8_128[chrDstY & 7];
c->lumDither8 = ff_dither_8x8_128[dstY & 7];
}
if (dstY >= dstH - 2) {
/* hmm looks like we can't use MMX here without overwriting
* this array's tail */
ff_sws_init_output_funcs(c, &yuv2plane1, &yuv2planeX, &yuv2nv12cX,
&yuv2packed1, &yuv2packed2, &yuv2packedX, &yuv2anyX);
use_mmx_vfilter= 0;
ff_init_vscale_pfn(c, yuv2plane1, yuv2planeX, yuv2nv12cX,
yuv2packed1, yuv2packed2, yuv2packedX, yuv2anyX, use_mmx_vfilter);
}
 
{
#ifdef NEW_FILTER
for (i = vStart; i < vEnd; ++i)
desc[i].process(c, &desc[i], dstY, 1);
#else
const int16_t **lumSrcPtr = (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
const int16_t **chrUSrcPtr = (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
const int16_t **chrVSrcPtr = (const int16_t **)(void*) chrVPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
const int16_t **alpSrcPtr = (CONFIG_SWSCALE_ALPHA && alpPixBuf) ?
(const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
int16_t *vLumFilter = c->vLumFilter;
int16_t *vChrFilter = c->vChrFilter;
 
if (isPlanarYUV(dstFormat) ||
(isGray(dstFormat) && !isALPHA(dstFormat))) { // YV12 like
const int chrSkipMask = (1 << c->chrDstVSubSample) - 1;
 
vLumFilter += dstY * vLumFilterSize;
vChrFilter += chrDstY * vChrFilterSize;
 
// av_assert0(use_mmx_vfilter != (
// yuv2planeX == yuv2planeX_10BE_c
// || yuv2planeX == yuv2planeX_10LE_c
// || yuv2planeX == yuv2planeX_9BE_c
// || yuv2planeX == yuv2planeX_9LE_c
// || yuv2planeX == yuv2planeX_16BE_c
// || yuv2planeX == yuv2planeX_16LE_c
// || yuv2planeX == yuv2planeX_8_c) || !ARCH_X86);
 
if(use_mmx_vfilter){
vLumFilter= (int16_t *)c->lumMmxFilter;
vChrFilter= (int16_t *)c->chrMmxFilter;
}
 
if (vLumFilterSize == 1) {
yuv2plane1(lumSrcPtr[0], dest[0], dstW, c->lumDither8, 0);
} else {
yuv2planeX(vLumFilter, vLumFilterSize,
lumSrcPtr, dest[0],
dstW, c->lumDither8, 0);
}
 
if (!((dstY & chrSkipMask) || isGray(dstFormat))) {
if (yuv2nv12cX) {
yuv2nv12cX(c, vChrFilter,
vChrFilterSize, chrUSrcPtr, chrVSrcPtr,
dest[1], chrDstW);
} else if (vChrFilterSize == 1) {
yuv2plane1(chrUSrcPtr[0], dest[1], chrDstW, c->chrDither8, 0);
yuv2plane1(chrVSrcPtr[0], dest[2], chrDstW, c->chrDither8, 3);
} else {
yuv2planeX(vChrFilter,
vChrFilterSize, chrUSrcPtr, dest[1],
chrDstW, c->chrDither8, 0);
yuv2planeX(vChrFilter,
vChrFilterSize, chrVSrcPtr, dest[2],
chrDstW, c->chrDither8, use_mmx_vfilter ? (c->uv_offx2 >> 1) : 3);
}
}
 
if (CONFIG_SWSCALE_ALPHA && alpPixBuf) {
if(use_mmx_vfilter){
vLumFilter= (int16_t *)c->alpMmxFilter;
}
if (vLumFilterSize == 1) {
yuv2plane1(alpSrcPtr[0], dest[3], dstW,
c->lumDither8, 0);
} else {
yuv2planeX(vLumFilter,
vLumFilterSize, alpSrcPtr, dest[3],
dstW, c->lumDither8, 0);
}
}
} else if (yuv2packedX) {
av_assert1(lumSrcPtr + vLumFilterSize - 1 < (const int16_t **)lumPixBuf + vLumBufSize * 2);
av_assert1(chrUSrcPtr + vChrFilterSize - 1 < (const int16_t **)chrUPixBuf + vChrBufSize * 2);
if (c->yuv2packed1 && vLumFilterSize == 1 &&
vChrFilterSize <= 2) { // unscaled RGB
int chrAlpha = vChrFilterSize == 1 ? 0 : vChrFilter[2 * dstY + 1];
yuv2packed1(c, *lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
alpPixBuf ? *alpSrcPtr : NULL,
dest[0], dstW, chrAlpha, dstY);
} else if (c->yuv2packed2 && vLumFilterSize == 2 &&
vChrFilterSize == 2) { // bilinear upscale RGB
int lumAlpha = vLumFilter[2 * dstY + 1];
int chrAlpha = vChrFilter[2 * dstY + 1];
lumMmxFilter[2] =
lumMmxFilter[3] = vLumFilter[2 * dstY] * 0x10001;
chrMmxFilter[2] =
chrMmxFilter[3] = vChrFilter[2 * chrDstY] * 0x10001;
yuv2packed2(c, lumSrcPtr, chrUSrcPtr, chrVSrcPtr,
alpPixBuf ? alpSrcPtr : NULL,
dest[0], dstW, lumAlpha, chrAlpha, dstY);
} else { // general RGB
yuv2packedX(c, vLumFilter + dstY * vLumFilterSize,
lumSrcPtr, vLumFilterSize,
vChrFilter + dstY * vChrFilterSize,
chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
alpSrcPtr, dest[0], dstW, dstY);
}
} else {
av_assert1(!yuv2packed1 && !yuv2packed2);
yuv2anyX(c, vLumFilter + dstY * vLumFilterSize,
lumSrcPtr, vLumFilterSize,
vChrFilter + dstY * vChrFilterSize,
chrUSrcPtr, chrVSrcPtr, vChrFilterSize,
alpSrcPtr, dest, dstW, dstY);
}
if (perform_gamma)
gamma_convert(dest, dstW, c->gamma);
#endif
}
}
if (isPlanar(dstFormat) && isALPHA(dstFormat) && !alpPixBuf) {
int length = dstW;
int height = dstY - lastDstY;
 
if (is16BPS(dstFormat) || isNBPS(dstFormat)) {
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(dstFormat);
fillPlane16(dst[3], dstStride[3], length, height, lastDstY,
1, desc->comp[3].depth_minus1,
isBE(dstFormat));
} else
fillPlane(dst[3], dstStride[3], length, height, lastDstY, 255);
}
 
#if HAVE_MMXEXT_INLINE
if (av_get_cpu_flags() & AV_CPU_FLAG_MMXEXT)
__asm__ volatile ("sfence" ::: "memory");
#endif
emms_c();
 
/* store changed local vars back in the context */
c->dstY = dstY;
c->lumBufIndex = lumBufIndex;
c->chrBufIndex = chrBufIndex;
c->lastInLumBuf = lastInLumBuf;
c->lastInChrBuf = lastInChrBuf;
 
return dstY - lastDstY;
}
 
av_cold void ff_sws_init_range_convert(SwsContext *c)
{
c->lumConvertRange = NULL;
c->chrConvertRange = NULL;
if (c->srcRange != c->dstRange && !isAnyRGB(c->dstFormat)) {
if (c->dstBpc <= 14) {
if (c->srcRange) {
c->lumConvertRange = lumRangeFromJpeg_c;
c->chrConvertRange = chrRangeFromJpeg_c;
} else {
c->lumConvertRange = lumRangeToJpeg_c;
c->chrConvertRange = chrRangeToJpeg_c;
}
} else {
if (c->srcRange) {
c->lumConvertRange = lumRangeFromJpeg16_c;
c->chrConvertRange = chrRangeFromJpeg16_c;
} else {
c->lumConvertRange = lumRangeToJpeg16_c;
c->chrConvertRange = chrRangeToJpeg16_c;
}
}
}
}
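 
/* A note on what the converters above compute: they map between
* limited ("MPEG", Y in [16,235], chroma in [16,240]) and full
* ("JPEG", [0,255]) range. Ignoring the fixed-point details of the
* *_c implementations, the limited-to-full mapping is approximately
*
*     fullY = (limitedY -  16) * 255 / 219
*     fullC = (limitedC - 128) * 255 / 224 + 128
*
* and the *FromJpeg* functions apply the inverse. The 16-bit variants
* perform the same scaling at higher precision for dstBpc > 14. */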
 
static av_cold void sws_init_swscale(SwsContext *c)
{
enum AVPixelFormat srcFormat = c->srcFormat;
 
ff_sws_init_output_funcs(c, &c->yuv2plane1, &c->yuv2planeX,
&c->yuv2nv12cX, &c->yuv2packed1,
&c->yuv2packed2, &c->yuv2packedX, &c->yuv2anyX);
 
ff_sws_init_input_funcs(c);
 
 
if (c->srcBpc == 8) {
if (c->dstBpc <= 14) {
c->hyScale = c->hcScale = hScale8To15_c;
if (c->flags & SWS_FAST_BILINEAR) {
c->hyscale_fast = ff_hyscale_fast_c;
c->hcscale_fast = ff_hcscale_fast_c;
}
} else {
c->hyScale = c->hcScale = hScale8To19_c;
}
} else {
c->hyScale = c->hcScale = c->dstBpc > 14 ? hScale16To19_c
: hScale16To15_c;
}
 
ff_sws_init_range_convert(c);
 
if (!(isGray(srcFormat) || isGray(c->dstFormat) ||
srcFormat == AV_PIX_FMT_MONOBLACK || srcFormat == AV_PIX_FMT_MONOWHITE))
c->needs_hcscale = 1;
}
 
SwsFunc ff_getSwsFunc(SwsContext *c)
{
sws_init_swscale(c);
 
if (ARCH_PPC)
ff_sws_init_swscale_ppc(c);
if (ARCH_X86)
ff_sws_init_swscale_x86(c);
 
return swscale;
}
 
static void reset_ptr(const uint8_t *src[], enum AVPixelFormat format)
{
if (!isALPHA(format))
src[3] = NULL;
if (!isPlanar(format)) {
src[3] = src[2] = NULL;
 
if (!usePal(format))
src[1] = NULL;
}
}
 
static int check_image_pointers(const uint8_t * const data[4], enum AVPixelFormat pix_fmt,
const int linesizes[4])
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
int i;
 
av_assert2(desc);
 
for (i = 0; i < 4; i++) {
int plane = desc->comp[i].plane;
if (!data[plane] || !linesizes[plane])
return 0;
}
 
return 1;
}
 
static void xyz12Torgb48(struct SwsContext *c, uint16_t *dst,
const uint16_t *src, int stride, int h)
{
int xp,yp;
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->srcFormat);
 
for (yp=0; yp<h; yp++) {
for (xp=0; xp+2<stride; xp+=3) {
int x, y, z, r, g, b;
 
if (desc->flags & AV_PIX_FMT_FLAG_BE) {
x = AV_RB16(src + xp + 0);
y = AV_RB16(src + xp + 1);
z = AV_RB16(src + xp + 2);
} else {
x = AV_RL16(src + xp + 0);
y = AV_RL16(src + xp + 1);
z = AV_RL16(src + xp + 2);
}
 
x = c->xyzgamma[x>>4];
y = c->xyzgamma[y>>4];
z = c->xyzgamma[z>>4];
 
// convert from XYZlinear to sRGBlinear
r = c->xyz2rgb_matrix[0][0] * x +
c->xyz2rgb_matrix[0][1] * y +
c->xyz2rgb_matrix[0][2] * z >> 12;
g = c->xyz2rgb_matrix[1][0] * x +
c->xyz2rgb_matrix[1][1] * y +
c->xyz2rgb_matrix[1][2] * z >> 12;
b = c->xyz2rgb_matrix[2][0] * x +
c->xyz2rgb_matrix[2][1] * y +
c->xyz2rgb_matrix[2][2] * z >> 12;
 
// limit values to 12-bit depth
r = av_clip_uintp2(r, 12);
g = av_clip_uintp2(g, 12);
b = av_clip_uintp2(b, 12);
 
// convert from sRGBlinear to RGB and scale from 12bit to 16bit
if (desc->flags & AV_PIX_FMT_FLAG_BE) {
AV_WB16(dst + xp + 0, c->rgbgamma[r] << 4);
AV_WB16(dst + xp + 1, c->rgbgamma[g] << 4);
AV_WB16(dst + xp + 2, c->rgbgamma[b] << 4);
} else {
AV_WL16(dst + xp + 0, c->rgbgamma[r] << 4);
AV_WL16(dst + xp + 1, c->rgbgamma[g] << 4);
AV_WL16(dst + xp + 2, c->rgbgamma[b] << 4);
}
}
src += stride;
dst += stride;
}
}
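 
/* A note on the fixed-point math above (and in rgb48Toxyz12 below):
* the 3x3 conversion matrices are stored as Q12 integers and the gamma
* LUTs are indexed with 12-bit values, hence the ">> 4" on the 16-bit
* input and the "<< 4" on output. Each matrix row thus computes e.g.
*
*     r = (m[0][0]*x + m[0][1]*y + m[0][2]*z) >> 12
*
* which keeps the intermediate sum well within 32-bit int range for
* 12-bit inputs. */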
 
static void rgb48Toxyz12(struct SwsContext *c, uint16_t *dst,
const uint16_t *src, int stride, int h)
{
int xp,yp;
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(c->dstFormat);
 
for (yp=0; yp<h; yp++) {
for (xp=0; xp+2<stride; xp+=3) {
int x, y, z, r, g, b;
 
if (desc->flags & AV_PIX_FMT_FLAG_BE) {
r = AV_RB16(src + xp + 0);
g = AV_RB16(src + xp + 1);
b = AV_RB16(src + xp + 2);
} else {
r = AV_RL16(src + xp + 0);
g = AV_RL16(src + xp + 1);
b = AV_RL16(src + xp + 2);
}
 
r = c->rgbgammainv[r>>4];
g = c->rgbgammainv[g>>4];
b = c->rgbgammainv[b>>4];
 
// convert from sRGBlinear to XYZlinear
x = c->rgb2xyz_matrix[0][0] * r +
c->rgb2xyz_matrix[0][1] * g +
c->rgb2xyz_matrix[0][2] * b >> 12;
y = c->rgb2xyz_matrix[1][0] * r +
c->rgb2xyz_matrix[1][1] * g +
c->rgb2xyz_matrix[1][2] * b >> 12;
z = c->rgb2xyz_matrix[2][0] * r +
c->rgb2xyz_matrix[2][1] * g +
c->rgb2xyz_matrix[2][2] * b >> 12;
 
// limit values to 12-bit depth
x = av_clip_uintp2(x, 12);
y = av_clip_uintp2(y, 12);
z = av_clip_uintp2(z, 12);
 
// convert from XYZlinear to X'Y'Z' and scale from 12bit to 16bit
if (desc->flags & AV_PIX_FMT_FLAG_BE) {
AV_WB16(dst + xp + 0, c->xyzgammainv[x] << 4);
AV_WB16(dst + xp + 1, c->xyzgammainv[y] << 4);
AV_WB16(dst + xp + 2, c->xyzgammainv[z] << 4);
} else {
AV_WL16(dst + xp + 0, c->xyzgammainv[x] << 4);
AV_WL16(dst + xp + 1, c->xyzgammainv[y] << 4);
AV_WL16(dst + xp + 2, c->xyzgammainv[z] << 4);
}
}
src += stride;
dst += stride;
}
}
 
/**
* swscale wrapper, so we don't need to export the SwsContext.
* Assumes planar YUV to be in YUV order instead of YVU.
*/
int attribute_align_arg sws_scale(struct SwsContext *c,
const uint8_t * const srcSlice[],
const int srcStride[], int srcSliceY,
int srcSliceH, uint8_t *const dst[],
const int dstStride[])
{
int i, ret;
const uint8_t *src2[4];
uint8_t *dst2[4];
uint8_t *rgb0_tmp = NULL;
 
if (!srcStride || !dstStride || !dst || !srcSlice) {
av_log(c, AV_LOG_ERROR, "One of the input parameters to sws_scale() is NULL, please check the calling code\n");
return 0;
}
 
if (c->gamma_flag && c->cascaded_context[0]) {
 
 
ret = sws_scale(c->cascaded_context[0],
srcSlice, srcStride, srcSliceY, srcSliceH,
c->cascaded_tmp, c->cascaded_tmpStride);
 
if (ret < 0)
return ret;
 
if (c->cascaded_context[2])
ret = sws_scale(c->cascaded_context[1], (const uint8_t * const *)c->cascaded_tmp, c->cascaded_tmpStride, srcSliceY, srcSliceH, c->cascaded1_tmp, c->cascaded1_tmpStride);
else
ret = sws_scale(c->cascaded_context[1], (const uint8_t * const *)c->cascaded_tmp, c->cascaded_tmpStride, srcSliceY, srcSliceH, dst, dstStride);
 
if (ret < 0)
return ret;
 
if (c->cascaded_context[2]) {
ret = sws_scale(c->cascaded_context[2],
(const uint8_t * const *)c->cascaded1_tmp, c->cascaded1_tmpStride, c->cascaded_context[1]->dstY - ret, c->cascaded_context[1]->dstY,
dst, dstStride);
}
return ret;
}
 
if (c->cascaded_context[0] && srcSliceY == 0 && srcSliceH == c->cascaded_context[0]->srcH) {
ret = sws_scale(c->cascaded_context[0],
srcSlice, srcStride, srcSliceY, srcSliceH,
c->cascaded_tmp, c->cascaded_tmpStride);
if (ret < 0)
return ret;
ret = sws_scale(c->cascaded_context[1],
(const uint8_t * const * )c->cascaded_tmp, c->cascaded_tmpStride, 0, c->cascaded_context[0]->dstH,
dst, dstStride);
return ret;
}
 
memcpy(src2, srcSlice, sizeof(src2));
memcpy(dst2, dst, sizeof(dst2));
 
// do not mess up sliceDir if we have a "trailing" 0-size slice
if (srcSliceH == 0)
return 0;
 
if (!check_image_pointers(srcSlice, c->srcFormat, srcStride)) {
av_log(c, AV_LOG_ERROR, "bad src image pointers\n");
return 0;
}
if (!check_image_pointers((const uint8_t* const*)dst, c->dstFormat, dstStride)) {
av_log(c, AV_LOG_ERROR, "bad dst image pointers\n");
return 0;
}
 
if (c->sliceDir == 0 && srcSliceY != 0 && srcSliceY + srcSliceH != c->srcH) {
av_log(c, AV_LOG_ERROR, "Slices start in the middle!\n");
return 0;
}
if (c->sliceDir == 0) {
if (srcSliceY == 0) c->sliceDir = 1; else c->sliceDir = -1;
}
 
if (usePal(c->srcFormat)) {
for (i = 0; i < 256; i++) {
int r, g, b, y, u, v, a = 0xff;
if (c->srcFormat == AV_PIX_FMT_PAL8) {
uint32_t p = ((const uint32_t *)(srcSlice[1]))[i];
a = (p >> 24) & 0xFF;
r = (p >> 16) & 0xFF;
g = (p >> 8) & 0xFF;
b = p & 0xFF;
} else if (c->srcFormat == AV_PIX_FMT_RGB8) {
r = ( i >> 5 ) * 36;
g = ((i >> 2) & 7) * 36;
b = ( i & 3) * 85;
} else if (c->srcFormat == AV_PIX_FMT_BGR8) {
b = ( i >> 6 ) * 85;
g = ((i >> 3) & 7) * 36;
r = ( i & 7) * 36;
} else if (c->srcFormat == AV_PIX_FMT_RGB4_BYTE) {
r = ( i >> 3 ) * 255;
g = ((i >> 1) & 3) * 85;
b = ( i & 1) * 255;
} else if (c->srcFormat == AV_PIX_FMT_GRAY8 || c->srcFormat == AV_PIX_FMT_GRAY8A) {
r = g = b = i;
} else {
av_assert1(c->srcFormat == AV_PIX_FMT_BGR4_BYTE);
b = ( i >> 3 ) * 255;
g = ((i >> 1) & 3) * 85;
r = ( i & 1) * 255;
}
#define RGB2YUV_SHIFT 15
#define BY ( (int) (0.114 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define BV (-(int) (0.081 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define BU ( (int) (0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GY ( (int) (0.587 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GV (-(int) (0.419 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define GU (-(int) (0.331 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RY ( (int) (0.299 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RV ( (int) (0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
#define RU (-(int) (0.169 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5))
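 
/* Sanity check for these constants (a worked example): for black
* (r = g = b = 0) only the rounding bias remains, so
* y = (33 << 14) >> 15 = 16, while for white (r = g = b = 255)
* y = ((RY+GY+BY)*255 + (33 << 14)) >> 15 = 235; together with the
* 257 bias centering u/v on 128, this yields the usual limited-range
* [16,235] luma and [16,240] chroma encoding. */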
 
y = av_clip_uint8((RY * r + GY * g + BY * b + ( 33 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
u = av_clip_uint8((RU * r + GU * g + BU * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
v = av_clip_uint8((RV * r + GV * g + BV * b + (257 << (RGB2YUV_SHIFT - 1))) >> RGB2YUV_SHIFT);
c->pal_yuv[i]= y + (u<<8) + (v<<16) + ((unsigned)a<<24);
 
switch (c->dstFormat) {
case AV_PIX_FMT_BGR32:
#if !HAVE_BIGENDIAN
case AV_PIX_FMT_RGB24:
#endif
c->pal_rgb[i]= r + (g<<8) + (b<<16) + ((unsigned)a<<24);
break;
case AV_PIX_FMT_BGR32_1:
#if HAVE_BIGENDIAN
case AV_PIX_FMT_BGR24:
#endif
c->pal_rgb[i]= a + (r<<8) + (g<<16) + ((unsigned)b<<24);
break;
case AV_PIX_FMT_RGB32_1:
#if HAVE_BIGENDIAN
case AV_PIX_FMT_RGB24:
#endif
c->pal_rgb[i]= a + (b<<8) + (g<<16) + ((unsigned)r<<24);
break;
case AV_PIX_FMT_RGB32:
#if !HAVE_BIGENDIAN
case AV_PIX_FMT_BGR24:
#endif
default:
c->pal_rgb[i]= b + (g<<8) + (r<<16) + ((unsigned)a<<24);
}
}
}
 
if (c->src0Alpha && !c->dst0Alpha && isALPHA(c->dstFormat)) {
uint8_t *base;
int x,y;
rgb0_tmp = av_malloc(FFABS(srcStride[0]) * srcSliceH + 32);
if (!rgb0_tmp)
return AVERROR(ENOMEM);
 
base = srcStride[0] < 0 ? rgb0_tmp - srcStride[0] * (srcSliceH-1) : rgb0_tmp;
for (y=0; y<srcSliceH; y++){
memcpy(base + srcStride[0]*y, src2[0] + srcStride[0]*y, 4*c->srcW);
for (x=c->src0Alpha-1; x<4*c->srcW; x+=4) {
base[ srcStride[0]*y + x] = 0xFF;
}
}
src2[0] = base;
}
 
if (c->srcXYZ && !(c->dstXYZ && c->srcW==c->dstW && c->srcH==c->dstH)) {
uint8_t *base;
rgb0_tmp = av_malloc(FFABS(srcStride[0]) * srcSliceH + 32);
if (!rgb0_tmp)
return AVERROR(ENOMEM);
 
base = srcStride[0] < 0 ? rgb0_tmp - srcStride[0] * (srcSliceH-1) : rgb0_tmp;
 
xyz12Torgb48(c, (uint16_t*)base, (const uint16_t*)src2[0], srcStride[0]/2, srcSliceH);
src2[0] = base;
}
 
if (!srcSliceY && (c->flags & SWS_BITEXACT) && c->dither == SWS_DITHER_ED && c->dither_error[0])
for (i = 0; i < 4; i++)
memset(c->dither_error[i], 0, sizeof(c->dither_error[0][0]) * (c->dstW+2));
 
 
// copy strides, so they can safely be modified
if (c->sliceDir == 1) {
// slices go from top to bottom
int srcStride2[4] = { srcStride[0], srcStride[1], srcStride[2],
srcStride[3] };
int dstStride2[4] = { dstStride[0], dstStride[1], dstStride[2],
dstStride[3] };
 
reset_ptr(src2, c->srcFormat);
reset_ptr((void*)dst2, c->dstFormat);
 
/* reset slice direction at end of frame */
if (srcSliceY + srcSliceH == c->srcH)
c->sliceDir = 0;
 
ret = c->swscale(c, src2, srcStride2, srcSliceY, srcSliceH, dst2,
dstStride2);
} else {
// slices go from bottom to top => we flip the image internally
int srcStride2[4] = { -srcStride[0], -srcStride[1], -srcStride[2],
-srcStride[3] };
int dstStride2[4] = { -dstStride[0], -dstStride[1], -dstStride[2],
-dstStride[3] };
 
src2[0] += (srcSliceH - 1) * srcStride[0];
if (!usePal(c->srcFormat))
src2[1] += ((srcSliceH >> c->chrSrcVSubSample) - 1) * srcStride[1];
src2[2] += ((srcSliceH >> c->chrSrcVSubSample) - 1) * srcStride[2];
src2[3] += (srcSliceH - 1) * srcStride[3];
dst2[0] += ( c->dstH - 1) * dstStride[0];
dst2[1] += ((c->dstH >> c->chrDstVSubSample) - 1) * dstStride[1];
dst2[2] += ((c->dstH >> c->chrDstVSubSample) - 1) * dstStride[2];
dst2[3] += ( c->dstH - 1) * dstStride[3];
 
reset_ptr(src2, c->srcFormat);
reset_ptr((void*)dst2, c->dstFormat);
 
/* reset slice direction at end of frame */
if (!srcSliceY)
c->sliceDir = 0;
 
ret = c->swscale(c, src2, srcStride2, c->srcH-srcSliceY-srcSliceH,
srcSliceH, dst2, dstStride2);
}
 
 
if (c->dstXYZ && !(c->srcXYZ && c->srcW==c->dstW && c->srcH==c->dstH)) {
/* convert in place on the same data */
rgb48Toxyz12(c, (uint16_t*)dst2[0], (const uint16_t*)dst2[0], dstStride[0]/2, ret);
}
 
av_free(rgb0_tmp);
return ret;
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/swscale.h
0,0 → 1,365
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#ifndef SWSCALE_SWSCALE_H
#define SWSCALE_SWSCALE_H
 
/**
* @file
* @ingroup libsws
* external API header
*/
 
#include <stdint.h>
 
#include "libavutil/avutil.h"
#include "libavutil/log.h"
#include "libavutil/pixfmt.h"
#include "version.h"
 
/**
* @defgroup libsws Color conversion and scaling
* @{
*
* Return the LIBSWSCALE_VERSION_INT constant.
*/
unsigned swscale_version(void);
 
/**
* Return the libswscale build-time configuration.
*/
const char *swscale_configuration(void);
 
/**
* Return the libswscale license.
*/
const char *swscale_license(void);
 
/* values for the flags, the stuff on the command line is different */
#define SWS_FAST_BILINEAR 1
#define SWS_BILINEAR 2
#define SWS_BICUBIC 4
#define SWS_X 8
#define SWS_POINT 0x10
#define SWS_AREA 0x20
#define SWS_BICUBLIN 0x40
#define SWS_GAUSS 0x80
#define SWS_SINC 0x100
#define SWS_LANCZOS 0x200
#define SWS_SPLINE 0x400
 
#define SWS_SRC_V_CHR_DROP_MASK 0x30000
#define SWS_SRC_V_CHR_DROP_SHIFT 16
 
#define SWS_PARAM_DEFAULT 123456
 
#define SWS_PRINT_INFO 0x1000
 
//the following 3 flags are not completely implemented
//internal chrominance subsampling info
#define SWS_FULL_CHR_H_INT 0x2000
//input subsampling info
#define SWS_FULL_CHR_H_INP 0x4000
#define SWS_DIRECT_BGR 0x8000
#define SWS_ACCURATE_RND 0x40000
#define SWS_BITEXACT 0x80000
#define SWS_ERROR_DIFFUSION 0x800000
 
#if FF_API_SWS_CPU_CAPS
/**
* CPU caps are autodetected now, those flags
* are only provided for API compatibility.
*/
#define SWS_CPU_CAPS_MMX 0x80000000
#define SWS_CPU_CAPS_MMXEXT 0x20000000
#define SWS_CPU_CAPS_MMX2 0x20000000
#define SWS_CPU_CAPS_3DNOW 0x40000000
#define SWS_CPU_CAPS_ALTIVEC 0x10000000
#if FF_API_ARCH_BFIN
#define SWS_CPU_CAPS_BFIN 0x01000000
#endif
#define SWS_CPU_CAPS_SSE2 0x02000000
#endif
 
#define SWS_MAX_REDUCE_CUTOFF 0.002
 
#define SWS_CS_ITU709 1
#define SWS_CS_FCC 4
#define SWS_CS_ITU601 5
#define SWS_CS_ITU624 5
#define SWS_CS_SMPTE170M 5
#define SWS_CS_SMPTE240M 7
#define SWS_CS_DEFAULT 5
 
/**
* Return a pointer to yuv<->rgb coefficients for the given colorspace
* suitable for sws_setColorspaceDetails().
*
* @param colorspace One of the SWS_CS_* macros. If invalid,
* SWS_CS_DEFAULT is used.
*/
const int *sws_getCoefficients(int colorspace);
 
// when used for filters they must have an odd number of elements
// coeffs cannot be shared between vectors
typedef struct SwsVector {
double *coeff; ///< pointer to the list of coefficients
int length; ///< number of coefficients in the vector
} SwsVector;
 
// vectors can be shared
typedef struct SwsFilter {
SwsVector *lumH;
SwsVector *lumV;
SwsVector *chrH;
SwsVector *chrV;
} SwsFilter;
 
struct SwsContext;
 
/**
* Return a positive value if pix_fmt is a supported input format, 0
* otherwise.
*/
int sws_isSupportedInput(enum AVPixelFormat pix_fmt);
 
/**
* Return a positive value if pix_fmt is a supported output format, 0
* otherwise.
*/
int sws_isSupportedOutput(enum AVPixelFormat pix_fmt);
 
/**
* @param[in] pix_fmt the pixel format
* @return a positive value if an endianness conversion for pix_fmt is
* supported, 0 otherwise.
*/
int sws_isSupportedEndiannessConversion(enum AVPixelFormat pix_fmt);
 
/**
* Allocate an empty SwsContext. This must be filled and passed to
* sws_init_context(). For filling see AVOptions, options.c and
* sws_setColorspaceDetails().
*/
struct SwsContext *sws_alloc_context(void);
 
/**
* Initialize the swscaler context sws_context.
*
* @return zero or positive value on success, a negative value on
* error
*/
int sws_init_context(struct SwsContext *sws_context, SwsFilter *srcFilter, SwsFilter *dstFilter);
 
/**
* Free the swscaler context swsContext.
* If swsContext is NULL, then does nothing.
*/
void sws_freeContext(struct SwsContext *swsContext);
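 
/**
 * Example: a minimal sketch of the sws_alloc_context() /
 * sws_init_context() path, which is configured through AVOptions
 * (requires libavutil/opt.h). The option names used below ("srcw",
 * "srch", ...) are the ones expected from options.c; check there for
 * the authoritative list.
 *
 * @code
 * struct SwsContext *ctx = sws_alloc_context();
 * if (!ctx)
 *     return AVERROR(ENOMEM);
 * av_opt_set_int(ctx, "srcw",       1920,               0);
 * av_opt_set_int(ctx, "srch",       1080,               0);
 * av_opt_set_int(ctx, "src_format", AV_PIX_FMT_YUV420P, 0);
 * av_opt_set_int(ctx, "dstw",       1280,               0);
 * av_opt_set_int(ctx, "dsth",        720,               0);
 * av_opt_set_int(ctx, "dst_format", AV_PIX_FMT_RGB24,   0);
 * av_opt_set_int(ctx, "sws_flags",  SWS_BILINEAR,       0);
 * if (sws_init_context(ctx, NULL, NULL) < 0) {
 *     sws_freeContext(ctx);
 *     return -1;
 * }
 * // ... sws_scale() calls ...
 * sws_freeContext(ctx);
 * @endcode
 */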
 
/**
* Allocate and return an SwsContext. You need it to perform
* scaling/conversion operations using sws_scale().
*
* @param srcW the width of the source image
* @param srcH the height of the source image
* @param srcFormat the source image format
* @param dstW the width of the destination image
* @param dstH the height of the destination image
* @param dstFormat the destination image format
* @param flags specify which algorithm and options to use for rescaling
* @param param extra parameters to tune the used scaler
* For SWS_BICUBIC param[0] and [1] tune the shape of the basis
* function, param[0] tunes f(1) and param[1] f'(1)
* For SWS_GAUSS param[0] tunes the exponent and thus cutoff
* frequency
* For SWS_LANCZOS param[0] tunes the width of the window function
* @return a pointer to an allocated context, or NULL in case of error
* @note this function is to be removed after a saner alternative is
* written
*/
struct SwsContext *sws_getContext(int srcW, int srcH, enum AVPixelFormat srcFormat,
int dstW, int dstH, enum AVPixelFormat dstFormat,
int flags, SwsFilter *srcFilter,
SwsFilter *dstFilter, const double *param);
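 
/**
 * A minimal usage sketch for sws_getContext(); sizes, formats and the
 * scaler flag are arbitrary examples:
 *
 * @code
 * struct SwsContext *ctx =
 *     sws_getContext(1920, 1080, AV_PIX_FMT_YUV420P,
 *                    1280,  720, AV_PIX_FMT_RGB24,
 *                    SWS_BICUBIC, NULL, NULL, NULL);
 * if (!ctx)
 *     return -1; // unsupported conversion or allocation failure
 * @endcode
 */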
 
/**
* Scale the image slice in srcSlice and put the resulting scaled
* slice in the image in dst. A slice is a sequence of consecutive
* rows in an image.
*
* Slices have to be provided in sequential order, either in
* top-bottom or bottom-top order. If slices are provided in
* non-sequential order the behavior of the function is undefined.
*
* @param c the scaling context previously created with
* sws_getContext()
* @param srcSlice the array containing the pointers to the planes of
* the source slice
* @param srcStride the array containing the strides for each plane of
* the source image
* @param srcSliceY the position in the source image of the slice to
* process, that is the number (counted starting from
* zero) in the image of the first row of the slice
* @param srcSliceH the height of the source slice, that is the number
* of rows in the slice
* @param dst the array containing the pointers to the planes of
* the destination image
* @param dstStride the array containing the strides for each plane of
* the destination image
* @return the height of the output slice
*/
int sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
const int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *const dst[], const int dstStride[]);
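 
/**
 * A whole frame can be processed as a single slice. A minimal sketch,
 * assuming src/srcStride describe a valid srcH-line input image and
 * dst/dstStride a matching output image for the context:
 *
 * @code
 * // returns the number of output lines written, dstH for a full frame
 * int out_h = sws_scale(ctx, (const uint8_t * const *)src, srcStride,
 *                       0, srcH, dst, dstStride);
 * @endcode
 */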
 
/**
* @param dstRange flag indicating the white-black range of the output (1=jpeg / 0=mpeg)
* @param srcRange flag indicating the white-black range of the input (1=jpeg / 0=mpeg)
* @param table the yuv2rgb coefficients describing the output yuv space, normally ff_yuv2rgb_coeffs[x]
* @param inv_table the yuv2rgb coefficients describing the input yuv space, normally ff_yuv2rgb_coeffs[x]
* @param brightness 16.16 fixed point brightness correction
* @param contrast 16.16 fixed point contrast correction
* @param saturation 16.16 fixed point saturation correction
* @return -1 if not supported
*/
int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
int srcRange, const int table[4], int dstRange,
int brightness, int contrast, int saturation);
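 
/**
 * A sketch of selecting BT.709 coefficients with neutral
 * brightness/contrast/saturation (0, 1.0 and 1.0 in 16.16 fixed
 * point) for limited-range input and output:
 *
 * @code
 * const int *coefs = sws_getCoefficients(SWS_CS_ITU709);
 * sws_setColorspaceDetails(ctx, coefs, 0, coefs, 0,
 *                          0, 1 << 16, 1 << 16);
 * @endcode
 */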
 
/**
* @return -1 if not supported
*/
int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table,
int *srcRange, int **table, int *dstRange,
int *brightness, int *contrast, int *saturation);
 
/**
* Allocate and return an uninitialized vector with length coefficients.
*/
SwsVector *sws_allocVec(int length);
 
/**
* Return a normalized Gaussian curve used for filtering.
* quality = 3 is high quality, lower is lower quality.
*/
SwsVector *sws_getGaussianVec(double variance, double quality);
 
/**
* Allocate and return a vector with length coefficients, all
* with the same value c.
*/
SwsVector *sws_getConstVec(double c, int length);
 
/**
* Allocate and return a vector with just one coefficient, with
* value 1.0.
*/
SwsVector *sws_getIdentityVec(void);
 
/**
* Scale all the coefficients of a by the scalar value.
*/
void sws_scaleVec(SwsVector *a, double scalar);
 
/**
* Scale all the coefficients of a so that their sum equals height.
*/
void sws_normalizeVec(SwsVector *a, double height);
void sws_convVec(SwsVector *a, SwsVector *b);
void sws_addVec(SwsVector *a, SwsVector *b);
void sws_subVec(SwsVector *a, SwsVector *b);
void sws_shiftVec(SwsVector *a, int shift);
 
/**
* Allocate and return a clone of the vector a, that is a vector
* with the same coefficients as a.
*/
SwsVector *sws_cloneVec(SwsVector *a);
 
/**
* Print with av_log() a textual representation of the vector a
* if log_level <= av_log_level.
*/
void sws_printVec2(SwsVector *a, AVClass *log_ctx, int log_level);
 
void sws_freeVec(SwsVector *a);
 
SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
float lumaSharpen, float chromaSharpen,
float chromaHShift, float chromaVShift,
int verbose);
void sws_freeFilter(SwsFilter *filter);
 
/**
* Check if context can be reused, otherwise reallocate a new one.
*
* If context is NULL, just calls sws_getContext() to get a new
* context. Otherwise, checks if the parameters are the ones already
* saved in context. If that is the case, returns the current
* context. Otherwise, frees context and gets a new context with
* the new parameters.
*
* Be warned that srcFilter and dstFilter are not checked; they
* are assumed to remain the same.
*/
struct SwsContext *sws_getCachedContext(struct SwsContext *context,
int srcW, int srcH, enum AVPixelFormat srcFormat,
int dstW, int dstH, enum AVPixelFormat dstFormat,
int flags, SwsFilter *srcFilter,
SwsFilter *dstFilter, const double *param);
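 
/**
 * Typical per-frame usage, a sketch: pass the (possibly changing)
 * frame parameters every time and let the cache decide whether the
 * old context can be reused.
 *
 * @code
 * ctx = sws_getCachedContext(ctx, in_w, in_h, in_fmt,
 *                            out_w, out_h, out_fmt,
 *                            SWS_BILINEAR, NULL, NULL, NULL);
 * if (!ctx)
 *     return AVERROR(EINVAL);
 * sws_scale(ctx, src, src_stride, 0, in_h, dst, dst_stride);
 * @endcode
 */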
 
/**
* Convert an 8-bit paletted frame into a frame with a color depth of 32 bits.
*
* The output frame will have the same packed format as the palette.
*
* @param src source frame buffer
* @param dst destination frame buffer
* @param num_pixels number of pixels to convert
* @param palette array with [256] entries, which must match color arrangement (RGB or BGR) of src
*/
void sws_convertPalette8ToPacked32(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette);
 
/**
* Convert an 8-bit paletted frame into a frame with a color depth of 24 bits.
*
* With the palette format "ABCD", the destination frame ends up with the format "ABC".
*
* @param src source frame buffer
* @param dst destination frame buffer
* @param num_pixels number of pixels to convert
* @param palette array with [256] entries, which must match color arrangement (RGB or BGR) of src
*/
void sws_convertPalette8ToPacked24(const uint8_t *src, uint8_t *dst, int num_pixels, const uint8_t *palette);
 
/**
* Get the AVClass for swsContext. It can be used in combination with
* AV_OPT_SEARCH_FAKE_OBJ for examining options.
*
* @see av_opt_find().
*/
const AVClass *sws_get_class(void);
 
/**
* @}
*/
 
#endif /* SWSCALE_SWSCALE_H */
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/swscale_internal.h
0,0 → 1,1056
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#ifndef SWSCALE_SWSCALE_INTERNAL_H
#define SWSCALE_SWSCALE_INTERNAL_H
 
#include "config.h"
 
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
 
#include "version.h"
 
#include "libavutil/avassert.h"
#include "libavutil/avutil.h"
#include "libavutil/common.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/log.h"
#include "libavutil/pixfmt.h"
#include "libavutil/pixdesc.h"
 
#define STR(s) AV_TOSTRING(s) // AV_STRINGIFY is too long
 
#define YUVRGB_TABLE_HEADROOM 512
#define YUVRGB_TABLE_LUMA_HEADROOM 512
 
#define MAX_FILTER_SIZE SWS_MAX_FILTER_SIZE
 
#define DITHER1XBPP
 
#if HAVE_BIGENDIAN
#define ALT32_CORR (-1)
#else
#define ALT32_CORR 1
#endif
 
#if ARCH_X86_64
# define APCK_PTR2 8
# define APCK_COEF 16
# define APCK_SIZE 24
#else
# define APCK_PTR2 4
# define APCK_COEF 8
# define APCK_SIZE 16
#endif
 
#define RETCODE_USE_CASCADE -12345
 
struct SwsContext;
 
typedef enum SwsDither {
SWS_DITHER_NONE = 0,
SWS_DITHER_AUTO,
SWS_DITHER_BAYER,
SWS_DITHER_ED,
SWS_DITHER_A_DITHER,
SWS_DITHER_X_DITHER,
NB_SWS_DITHER,
} SwsDither;
 
typedef enum SwsAlphaBlend {
SWS_ALPHA_BLEND_NONE = 0,
SWS_ALPHA_BLEND_UNIFORM,
SWS_ALPHA_BLEND_CHECKERBOARD,
SWS_ALPHA_BLEND_NB,
} SwsAlphaBlend;
 
typedef int (*SwsFunc)(struct SwsContext *context, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[]);
 
/**
* Write one line of horizontally scaled data to planar output
* without any additional vertical scaling (or point-scaling).
*
* @param src scaled source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param dest pointer to the output plane. For >8bit
* output, this is in uint16_t
* @param dstW width of destination in pixels
* @param dither ordered dither array of type int16_t and size 8
* @param offset Dither offset
*/
typedef void (*yuv2planar1_fn)(const int16_t *src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
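 
/**
 * For reference, a minimal sketch of what an 8-bit yuv2planar1_fn
 * implementation looks like (the real versions live in output.c and
 * come with SIMD variants): the 15-bit input is dithered and shifted
 * down to 8 bits.
 *
 * @code
 * static void yuv2plane1_8_sketch(const int16_t *src, uint8_t *dest,
 *                                 int dstW, const uint8_t *dither,
 *                                 int offset)
 * {
 *     int i;
 *     for (i = 0; i < dstW; i++) {
 *         int val = (src[i] + dither[(i + offset) & 7]) >> 7;
 *         dest[i] = av_clip_uint8(val);
 *     }
 * }
 * @endcode
 */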
 
/**
* Write one line of horizontally scaled data to planar output
* with multi-point vertical scaling between input pixels.
*
* @param filter vertical luma/alpha scaling coefficients, 12bit [0,4096]
* @param src scaled luma (Y) or alpha (A) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param filterSize number of vertical input lines to scale
* @param dest pointer to output plane. For >8bit
* output, this is in uint16_t
* @param dstW width of destination pixels
* @param offset Dither offset
*/
typedef void (*yuv2planarX_fn)(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset);
 
/**
* Write one line of horizontally scaled chroma to interleaved output
* with multi-point vertical scaling between input pixels.
*
* @param c SWS scaling context
* @param chrFilter vertical chroma scaling coefficients, 12bit [0,4096]
* @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrFilterSize number of vertical chroma input lines to scale
* @param dest pointer to the output plane. For >8bit
* output, this is in uint16_t
* @param dstW width of chroma planes
*/
typedef void (*yuv2interleavedX_fn)(struct SwsContext *c,
const int16_t *chrFilter,
int chrFilterSize,
const int16_t **chrUSrc,
const int16_t **chrVSrc,
uint8_t *dest, int dstW);
 
/**
* Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
* output without any additional vertical scaling (or point-scaling). Note
* that this function may do chroma scaling, see the "uvalpha" argument.
*
* @param c SWS scaling context
* @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param dest pointer to the output plane. For 16bit output, this is
* uint16_t
* @param dstW width of lumSrc and alpSrc in pixels, number of pixels
* to write into dest[]
* @param uvalpha chroma scaling coefficient for the second line of chroma
* pixels, either 2048 or 0. If 0, one chroma input is used
* for 2 output pixels (or if the SWS_FULL_CHR_H_INT flag
* is set, it generates 1 output pixel). If 2048, two chroma
* input pixels should be averaged for 2 output pixels (this
* only happens if SWS_FULL_CHR_H_INT is not set)
* @param y vertical line number for this output. This does not need
* to be used to calculate the offset in the destination,
* but can be used to generate comfort noise using dithering
* for some output formats.
*/
typedef void (*yuv2packed1_fn)(struct SwsContext *c, const int16_t *lumSrc,
const int16_t *chrUSrc[2],
const int16_t *chrVSrc[2],
const int16_t *alpSrc, uint8_t *dest,
int dstW, int uvalpha, int y);
/**
* Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
* output by doing bilinear scaling between two input lines.
*
* @param c SWS scaling context
* @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param dest pointer to the output plane. For 16bit output, this is
* uint16_t
* @param dstW width of lumSrc and alpSrc in pixels, number of pixels
* to write into dest[]
* @param yalpha luma/alpha scaling coefficients for the second input line.
* The first line's coefficients can be calculated by using
* 4096 - yalpha
* @param uvalpha chroma scaling coefficient for the second input line. The
* first line's coefficients can be calculated by using
* 4096 - uvalpha
* @param y vertical line number for this output. This does not need
* to be used to calculate the offset in the destination,
* but can be used to generate comfort noise using dithering
* for some output formats.
*/
typedef void (*yuv2packed2_fn)(struct SwsContext *c, const int16_t *lumSrc[2],
const int16_t *chrUSrc[2],
const int16_t *chrVSrc[2],
const int16_t *alpSrc[2],
uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y);
/**
* Write one line of horizontally scaled Y/U/V/A to packed-pixel YUV/RGB
* output by doing multi-point vertical scaling between input pixels.
*
* @param c SWS scaling context
* @param lumFilter vertical luma/alpha scaling coefficients, 12bit [0,4096]
* @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param lumFilterSize number of vertical luma/alpha input lines to scale
* @param chrFilter vertical chroma scaling coefficients, 12bit [0,4096]
* @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrFilterSize number of vertical chroma input lines to scale
* @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param dest pointer to the output plane. For 16bit output, this is
* uint16_t
* @param dstW width of lumSrc and alpSrc in pixels, number of pixels
* to write into dest[]
* @param y vertical line number for this output. This does not need
* to be used to calculate the offset in the destination,
* but can be used to generate comfort noise using dithering
* for some output formats.
*/
typedef void (*yuv2packedX_fn)(struct SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter,
const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t *dest,
int dstW, int y);
 
/**
* Write one line of horizontally scaled Y/U/V/A to YUV/RGB
* output by doing multi-point vertical scaling between input pixels.
*
* @param c SWS scaling context
* @param lumFilter vertical luma/alpha scaling coefficients, 12bit [0,4096]
* @param lumSrc scaled luma (Y) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param lumFilterSize number of vertical luma/alpha input lines to scale
* @param chrFilter vertical chroma scaling coefficients, 12bit [0,4096]
* @param chrUSrc scaled chroma (U) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrVSrc scaled chroma (V) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param chrFilterSize number of vertical chroma input lines to scale
* @param alpSrc scaled alpha (A) source data, 15bit for 8-10bit output,
* 19-bit for 16bit output (in int32_t)
* @param dest pointer to the output planes. For 16bit output, this is
* uint16_t
* @param dstW width of lumSrc and alpSrc in pixels, number of pixels
* to write into dest[]
* @param y vertical line number for this output. This does not need
* to be used to calculate the offset in the destination,
* but can be used to generate comfort noise using dithering
* for some output formats.
*/
typedef void (*yuv2anyX_fn)(struct SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter,
const int16_t **chrUSrc,
const int16_t **chrVSrc, int chrFilterSize,
const int16_t **alpSrc, uint8_t **dest,
int dstW, int y);
 
struct SwsSlice;
struct SwsFilterDescriptor;
 
/* This struct should be aligned on at least a 32-byte boundary. */
typedef struct SwsContext {
/**
* info on struct for av_log
*/
const AVClass *av_class;
 
/**
* Note that src, dst, srcStride, dstStride will be copied in the
* sws_scale() wrapper so they can be freely modified here.
*/
SwsFunc swscale;
int srcW; ///< Width of source luma/alpha planes.
int srcH; ///< Height of source luma/alpha planes.
int dstH; ///< Height of destination luma/alpha planes.
int chrSrcW; ///< Width of source chroma planes.
int chrSrcH; ///< Height of source chroma planes.
int chrDstW; ///< Width of destination chroma planes.
int chrDstH; ///< Height of destination chroma planes.
int lumXInc, chrXInc;
int lumYInc, chrYInc;
enum AVPixelFormat dstFormat; ///< Destination pixel format.
enum AVPixelFormat srcFormat; ///< Source pixel format.
int dstFormatBpp; ///< Number of bits per pixel of the destination pixel format.
int srcFormatBpp; ///< Number of bits per pixel of the source pixel format.
int dstBpc, srcBpc;
int chrSrcHSubSample; ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in source image.
int chrSrcVSubSample; ///< Binary logarithm of vertical subsampling factor between luma/alpha and chroma planes in source image.
int chrDstHSubSample; ///< Binary logarithm of horizontal subsampling factor between luma/alpha and chroma planes in destination image.
int chrDstVSubSample; ///< Binary logarithm of vertical subsampling factor between luma/alpha and chroma planes in destination image.
int vChrDrop; ///< Binary logarithm of extra vertical subsampling factor in source image chroma planes specified by user.
int sliceDir; ///< Direction that slices are fed to the scaler (1 = top-to-bottom, -1 = bottom-to-top).
double param[2]; ///< Input parameters for scaling algorithms that need them.
 
/* The cascaded_* fields allow splitting a scaler task into multiple
* sequential steps; this is, for example, used to limit the maximum
* downscaling factor that needs to be supported in one scaler.
*/
struct SwsContext *cascaded_context[3];
int cascaded_tmpStride[4];
uint8_t *cascaded_tmp[4];
int cascaded1_tmpStride[4];
uint8_t *cascaded1_tmp[4];
 
double gamma_value;
int gamma_flag;
int is_internal_gamma;
uint16_t *gamma;
uint16_t *inv_gamma;
 
int numDesc;
int descIndex[2];
int numSlice;
struct SwsSlice *slice;
struct SwsFilterDescriptor *desc;
 
uint32_t pal_yuv[256];
uint32_t pal_rgb[256];
 
/**
* @name Scaled horizontal lines ring buffer.
* The horizontal scaler keeps just enough scaled lines in a ring buffer
* so they may be passed to the vertical scaler. The pointers to the
* allocated buffers for each line are duplicated in sequence in the ring
* buffer to simplify indexing and avoid wrapping around between lines
* inside the vertical scaler code. The wrapping is done before the
* vertical scaler is called.
*/
//@{
int16_t **lumPixBuf; ///< Ring buffer for scaled horizontal luma plane lines to be fed to the vertical scaler.
int16_t **chrUPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler.
int16_t **chrVPixBuf; ///< Ring buffer for scaled horizontal chroma plane lines to be fed to the vertical scaler.
int16_t **alpPixBuf; ///< Ring buffer for scaled horizontal alpha plane lines to be fed to the vertical scaler.
int vLumBufSize; ///< Number of vertical luma/alpha lines allocated in the ring buffer.
int vChrBufSize; ///< Number of vertical chroma lines allocated in the ring buffer.
int lastInLumBuf; ///< Last scaled horizontal luma/alpha line from source in the ring buffer.
int lastInChrBuf; ///< Last scaled horizontal chroma line from source in the ring buffer.
int lumBufIndex; ///< Index in ring buffer of the last scaled horizontal luma/alpha line from source.
int chrBufIndex; ///< Index in ring buffer of the last scaled horizontal chroma line from source.
//@}
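 
/* Illustration of the pointer duplication (a sketch, with
* vLumBufSize == 4): the ring holds 2*4 pointer slots,
*
*     lumPixBuf: [ L0 L1 L2 L3 | L0 L1 L2 L3 ]
*
* so that an expression such as the one used in swscale(),
*
*     lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize
*
* always points at vLumFilterSize consecutive valid line pointers and
* the vertical filter loop needs no wrap-around test. */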
 
uint8_t *formatConvBuffer;
 
/**
* @name Horizontal and vertical filters.
* To better understand the following fields, here is a pseudo-code of
* their usage in filtering a horizontal line:
* @code
* for (i = 0; i < width; i++) {
* dst[i] = 0;
* for (j = 0; j < filterSize; j++)
* dst[i] += src[ filterPos[i] + j ] * filter[ filterSize * i + j ];
* dst[i] >>= FRAC_BITS; // The actual implementation is fixed-point.
* }
* @endcode
*/
//@{
int16_t *hLumFilter; ///< Array of horizontal filter coefficients for luma/alpha planes.
int16_t *hChrFilter; ///< Array of horizontal filter coefficients for chroma planes.
int16_t *vLumFilter; ///< Array of vertical filter coefficients for luma/alpha planes.
int16_t *vChrFilter; ///< Array of vertical filter coefficients for chroma planes.
int32_t *hLumFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for luma/alpha planes.
int32_t *hChrFilterPos; ///< Array of horizontal filter starting positions for each dst[i] for chroma planes.
int32_t *vLumFilterPos; ///< Array of vertical filter starting positions for each dst[i] for luma/alpha planes.
int32_t *vChrFilterPos; ///< Array of vertical filter starting positions for each dst[i] for chroma planes.
int hLumFilterSize; ///< Horizontal filter size for luma/alpha pixels.
int hChrFilterSize; ///< Horizontal filter size for chroma pixels.
int vLumFilterSize; ///< Vertical filter size for luma/alpha pixels.
int vChrFilterSize; ///< Vertical filter size for chroma pixels.
//@}
 
int lumMmxextFilterCodeSize; ///< Runtime-generated MMXEXT horizontal fast bilinear scaler code size for luma/alpha planes.
int chrMmxextFilterCodeSize; ///< Runtime-generated MMXEXT horizontal fast bilinear scaler code size for chroma planes.
uint8_t *lumMmxextFilterCode; ///< Runtime-generated MMXEXT horizontal fast bilinear scaler code for luma/alpha planes.
uint8_t *chrMmxextFilterCode; ///< Runtime-generated MMXEXT horizontal fast bilinear scaler code for chroma planes.
 
int canMMXEXTBeUsed;
int warned_unuseable_bilinear;
 
int dstY; ///< Last destination vertical line output from last slice.
int flags; ///< Flags passed by the user to select scaler algorithm, optimizations, subsampling, etc...
void *yuvTable; // pointer to the yuv->rgb table start so it can be freed
// alignment ensures the offset can be added in a single
// instruction on e.g. ARM
DECLARE_ALIGNED(16, int, table_gV)[256 + 2*YUVRGB_TABLE_HEADROOM];
uint8_t *table_rV[256 + 2*YUVRGB_TABLE_HEADROOM];
uint8_t *table_gU[256 + 2*YUVRGB_TABLE_HEADROOM];
uint8_t *table_bU[256 + 2*YUVRGB_TABLE_HEADROOM];
DECLARE_ALIGNED(16, int32_t, input_rgb2yuv_table)[16+40*4]; // This table can contain both C and SIMD formatted values, the C values are always at the XY_IDX points
#define RY_IDX 0
#define GY_IDX 1
#define BY_IDX 2
#define RU_IDX 3
#define GU_IDX 4
#define BU_IDX 5
#define RV_IDX 6
#define GV_IDX 7
#define BV_IDX 8
#define RGB2YUV_SHIFT 15
 
int *dither_error[4];
 
//Colorspace stuff
int contrast, brightness, saturation; // for sws_getColorspaceDetails
int srcColorspaceTable[4];
int dstColorspaceTable[4];
int srcRange; ///< 0 = MPG YUV range, 1 = JPG YUV range (source image).
int dstRange; ///< 0 = MPG YUV range, 1 = JPG YUV range (destination image).
int src0Alpha;
int dst0Alpha;
int srcXYZ;
int dstXYZ;
int src_h_chr_pos;
int dst_h_chr_pos;
int src_v_chr_pos;
int dst_v_chr_pos;
int yuv2rgb_y_offset;
int yuv2rgb_y_coeff;
int yuv2rgb_v2r_coeff;
int yuv2rgb_v2g_coeff;
int yuv2rgb_u2g_coeff;
int yuv2rgb_u2b_coeff;
 
#define RED_DITHER "0*8"
#define GREEN_DITHER "1*8"
#define BLUE_DITHER "2*8"
#define Y_COEFF "3*8"
#define VR_COEFF "4*8"
#define UB_COEFF "5*8"
#define VG_COEFF "6*8"
#define UG_COEFF "7*8"
#define Y_OFFSET "8*8"
#define U_OFFSET "9*8"
#define V_OFFSET "10*8"
#define LUM_MMX_FILTER_OFFSET "11*8"
#define CHR_MMX_FILTER_OFFSET "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)
#define DSTW_OFFSET "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*2"
#define ESP_OFFSET "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*2+8"
#define VROUNDER_OFFSET "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*2+16"
#define U_TEMP "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*2+24"
#define V_TEMP "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*2+32"
#define Y_TEMP "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*2+40"
#define ALP_MMX_FILTER_OFFSET "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*2+48"
#define UV_OFF_PX "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*3+48"
#define UV_OFF_BYTE "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*3+56"
#define DITHER16 "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*3+64"
#define DITHER32 "11*8+4*4*"AV_STRINGIFY(MAX_FILTER_SIZE)"*3+80"
#define DITHER32_INT (11*8+4*4*MAX_FILTER_SIZE*3+80) // value equal to above, used for checking that the struct hasn't been changed by mistake
 
DECLARE_ALIGNED(8, uint64_t, redDither);
DECLARE_ALIGNED(8, uint64_t, greenDither);
DECLARE_ALIGNED(8, uint64_t, blueDither);
 
DECLARE_ALIGNED(8, uint64_t, yCoeff);
DECLARE_ALIGNED(8, uint64_t, vrCoeff);
DECLARE_ALIGNED(8, uint64_t, ubCoeff);
DECLARE_ALIGNED(8, uint64_t, vgCoeff);
DECLARE_ALIGNED(8, uint64_t, ugCoeff);
DECLARE_ALIGNED(8, uint64_t, yOffset);
DECLARE_ALIGNED(8, uint64_t, uOffset);
DECLARE_ALIGNED(8, uint64_t, vOffset);
int32_t lumMmxFilter[4 * MAX_FILTER_SIZE];
int32_t chrMmxFilter[4 * MAX_FILTER_SIZE];
int dstW; ///< Width of destination luma/alpha planes.
DECLARE_ALIGNED(8, uint64_t, esp);
DECLARE_ALIGNED(8, uint64_t, vRounder);
DECLARE_ALIGNED(8, uint64_t, u_temp);
DECLARE_ALIGNED(8, uint64_t, v_temp);
DECLARE_ALIGNED(8, uint64_t, y_temp);
int32_t alpMmxFilter[4 * MAX_FILTER_SIZE];
// alignment of these values is not necessary, but merely here
// to maintain the same offset across x86-32 and x86-64. Once we
// use proper offset macros in the asm, they can be removed.
DECLARE_ALIGNED(8, ptrdiff_t, uv_off); ///< offset (in pixels) between u and v planes
DECLARE_ALIGNED(8, ptrdiff_t, uv_offx2); ///< offset (in bytes) between u and v planes
DECLARE_ALIGNED(8, uint16_t, dither16)[8];
DECLARE_ALIGNED(8, uint32_t, dither32)[8];
 
const uint8_t *chrDither8, *lumDither8;
 
#if HAVE_ALTIVEC
vector signed short CY;
vector signed short CRV;
vector signed short CBU;
vector signed short CGU;
vector signed short CGV;
vector signed short OY;
vector unsigned short CSHIFT;
vector signed short *vYCoeffsBank, *vCCoeffsBank;
#endif
 
int use_mmx_vfilter;
 
/* pre defined color-spaces gamma */
#define XYZ_GAMMA (2.6f)
#define RGB_GAMMA (2.2f)
int16_t *xyzgamma;
int16_t *rgbgamma;
int16_t *xyzgammainv;
int16_t *rgbgammainv;
int16_t xyz2rgb_matrix[3][4];
int16_t rgb2xyz_matrix[3][4];
 
/* function pointers for swscale() */
yuv2planar1_fn yuv2plane1;
yuv2planarX_fn yuv2planeX;
yuv2interleavedX_fn yuv2nv12cX;
yuv2packed1_fn yuv2packed1;
yuv2packed2_fn yuv2packed2;
yuv2packedX_fn yuv2packedX;
yuv2anyX_fn yuv2anyX;
 
/// Unscaled conversion of luma plane to YV12 for horizontal scaler.
void (*lumToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
int width, uint32_t *pal);
/// Unscaled conversion of alpha plane to YV12 for horizontal scaler.
void (*alpToYV12)(uint8_t *dst, const uint8_t *src, const uint8_t *src2, const uint8_t *src3,
int width, uint32_t *pal);
/// Unscaled conversion of chroma planes to YV12 for horizontal scaler.
void (*chrToYV12)(uint8_t *dstU, uint8_t *dstV,
const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
int width, uint32_t *pal);
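 
#if 0 /* illustrative sketch, not part of the original header */
/* A typical lumToYV12 implementation simply de-interleaves the luma
 * samples of a packed format; for YUYV input (Y0 U0 Y1 V0 ...) that is
 * just every second byte. The real converters also take the src2/src3
 * and palette arguments, which are unused in this case: */
static void yuy2_to_y_sketch(uint8_t *dst, const uint8_t *src, int width)
{
    int i;
    for (i = 0; i < width; i++)
        dst[i] = src[2 * i]; /* every second byte is a luma sample */
}
#endif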
 
/**
* Functions to read planar input, such as planar RGB, and convert
* internally to Y/UV/A.
*/
/** @{ */
void (*readLumPlanar)(uint8_t *dst, const uint8_t *src[4], int width, int32_t *rgb2yuv);
void (*readChrPlanar)(uint8_t *dstU, uint8_t *dstV, const uint8_t *src[4],
int width, int32_t *rgb2yuv);
void (*readAlpPlanar)(uint8_t *dst, const uint8_t *src[4], int width, int32_t *rgb2yuv);
/** @} */
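 
#if 0 /* illustrative sketch, not part of the original header */
/* Conceptually, readLumPlanar for 8-bit GBRP computes a fixed-point dot
 * product with the RY/GY/BY coefficients from input_rgb2yuv_table
 * (plane order for GBRP is G, B, R). The exact rounding and scaling of
 * the real code in input.c differ slightly; this assumes coefficients
 * normalized to RGB2YUV_SHIFT and 15-bit intermediate output: */
static void read_lum_planar_sketch(uint8_t *dst, const uint8_t *src[4],
                                   int width, int32_t *rgb2yuv)
{
    uint16_t *dst16 = (uint16_t *)dst; /* 15-bit intermediate samples */
    int32_t ry = rgb2yuv[RY_IDX], gy = rgb2yuv[GY_IDX], by = rgb2yuv[BY_IDX];
    int i;
    for (i = 0; i < width; i++) {
        int g = src[0][i], b = src[1][i], r = src[2][i];
        /* weighted sum, rounded, rescaled from 8-bit input to 15-bit */
        dst16[i] = (ry * r + gy * g + by * b + (1 << (RGB2YUV_SHIFT - 8))) >>
                   (RGB2YUV_SHIFT - 7);
    }
}
#endif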
 
/**
* Scale one horizontal line of input data using a bilinear filter
* to produce one line of output data. Compared to SwsContext->hyScale()/hcScale(),
* please take note of the following caveats when using these:
* - Scaling is done using only 7bit instead of 14bit coefficients.
* - You can use no more than 5 input pixels to produce 4 output
* pixels. Therefore, this filter should not be used for downscaling
* by more than ~20% in width (because that equals more than 5/4th
* downscaling and thus more than 5 pixels input per 4 pixels output).
* - In general, bilinear filters create artifacts during downscaling
* (even when <20%), because one output pixel will span more than one
* input pixel, and thus some pixels will need edges of both neighbor
* pixels to interpolate the output pixel. Since you can use at most
* two input pixels per output pixel in bilinear scaling, this is
* impossible and thus downscaling by any size will create artifacts.
* To enable this type of scaling, set SWS_FAST_BILINEAR
* in SwsContext->flags.
*/
/** @{ */
void (*hyscale_fast)(struct SwsContext *c,
int16_t *dst, int dstWidth,
const uint8_t *src, int srcW, int xInc);
void (*hcscale_fast)(struct SwsContext *c,
int16_t *dst1, int16_t *dst2, int dstWidth,
const uint8_t *src1, const uint8_t *src2,
int srcW, int xInc);
/** @} */
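 
#if 0 /* illustrative sketch, not part of the original header */
/* Minimal C version of the fast bilinear luma path described above,
 * assuming a 16.16 fixed-point source position and the 7-bit blend
 * weights mentioned in the caveats; see ff_hyscale_fast_c() for the
 * real implementation: */
static void hyscale_fast_sketch(int16_t *dst, int dstWidth,
                                const uint8_t *src, int xInc)
{
    unsigned int xpos = 0;
    int i;
    for (i = 0; i < dstWidth; i++) {
        unsigned int xx     = xpos >> 16;           /* integer source index */
        unsigned int xalpha = (xpos & 0xFFFF) >> 9; /* 7-bit fractional weight */
        /* 15-bit output: left sample plus weighted delta to its right neighbor */
        dst[i] = (src[xx] << 7) + (src[xx + 1] - src[xx]) * xalpha;
        xpos += xInc;
    }
}
#endif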
 
/**
* Scale one horizontal line of input data using a filter over the input
* lines, to produce one (differently sized) line of output data.
*
* @param dst pointer to destination buffer for horizontally scaled
* data. If the number of bits per component of one
* destination pixel (SwsContext->dstBpc) is <= 10, data
* will be 15bpc in 16bits (int16_t) width. Else (i.e.
* SwsContext->dstBpc == 16), data will be 19bpc in
* 32bits (int32_t) width.
* @param dstW width of destination image
* @param src pointer to source data to be scaled. If the number of
* bits per component of a source pixel (SwsContext->srcBpc)
* is 8, this is 8bpc in 8bits (uint8_t) width. Else
* (i.e. SwsContext->srcBpc > 8), this is native depth
* in 16bits (uint16_t) width. In other words, for 9-bit
* YUV input, this is 9bpc, for 10-bit YUV input, this is
* 10bpc, and for 16-bit RGB or YUV, this is 16bpc.
* @param filter filter coefficients to be used per output pixel for
* scaling. This contains 14bpp filtering coefficients.
* Guaranteed to contain dstW * filterSize entries.
* @param filterPos position of the first input pixel to be used for
* each output pixel during scaling. Guaranteed to
* contain dstW entries.
* @param filterSize the number of input coefficients to be used (and
* thus the number of input pixels to be used) for
* creating a single output pixel. Is aligned to 4
* (and input coefficients thus padded with zeroes)
* to simplify creating SIMD code.
*/
/** @{ */
void (*hyScale)(struct SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize);
void (*hcScale)(struct SwsContext *c, int16_t *dst, int dstW,
const uint8_t *src, const int16_t *filter,
const int32_t *filterPos, int filterSize);
/** @} */
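 
#if 0 /* illustrative sketch, not part of the original header */
/* Reference C loop for the 8-bit-input / 15-bit-output case documented
 * above; the real implementations (and their SIMD variants) live in
 * hscale.c: */
static void hscale_8to15_sketch(int16_t *dst, int dstW, const uint8_t *src,
                                const int16_t *filter,
                                const int32_t *filterPos, int filterSize)
{
    int i, j;
    for (i = 0; i < dstW; i++) {
        int srcPos = filterPos[i]; /* first input pixel for this output pixel */
        int val    = 0;
        for (j = 0; j < filterSize; j++)
            val += (int)src[srcPos + j] * filter[filterSize * i + j];
        /* 8bpc samples times 14-bit coefficients: >>7 yields 15bpc output */
        dst[i] = FFMIN(val >> 7, (1 << 15) - 1);
    }
}
#endif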
 
/// Color range conversion function for luma plane if needed.
void (*lumConvertRange)(int16_t *dst, int width);
/// Color range conversion function for chroma planes if needed.
void (*chrConvertRange)(int16_t *dst1, int16_t *dst2, int width);
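 
#if 0 /* conceptual sketch, not the actual range-conversion code */
/* Expanding limited-range (16..235) luma to full range (0..255) on the
 * 15-bit intermediates amounts to one affine remap per sample, roughly
 * as below; the real functions in swscale.c use precomputed fixed-point
 * constants instead of a division: */
static void lum_range_to_jpeg_sketch(int16_t *dst, int width)
{
    int i;
    for (i = 0; i < width; i++)
        dst[i] = av_clip_int16((dst[i] - (16 << 7)) * 255 / 219);
}
#endif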
 
int needs_hcscale; ///< Set if there are chroma planes to be converted.
 
SwsDither dither;
 
SwsAlphaBlend alphablend;
} SwsContext;
//FIXME check init (where 0)
 
SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c);
int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
int fullRange, int brightness,
int contrast, int saturation);
void ff_yuv2rgb_init_tables_ppc(SwsContext *c, const int inv_table[4],
int brightness, int contrast, int saturation);
 
void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
int lastInLumBuf, int lastInChrBuf);
 
av_cold void ff_sws_init_range_convert(SwsContext *c);
 
SwsFunc ff_yuv2rgb_init_x86(SwsContext *c);
SwsFunc ff_yuv2rgb_init_ppc(SwsContext *c);
 
static av_always_inline int is16BPS(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return desc->comp[0].depth_minus1 == 15;
}
 
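/* NOTE: despite its name, the function below matches any 9..14
 * bits-per-component format (depth_minus1 between 8 and 13), hence the
 * generic isNBPS() alias further down. */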
static av_always_inline int is9_OR_10BPS(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return desc->comp[0].depth_minus1 >= 8 && desc->comp[0].depth_minus1 <= 13;
}
 
#define isNBPS(x) is9_OR_10BPS(x)
 
static av_always_inline int isBE(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return desc->flags & AV_PIX_FMT_FLAG_BE;
}
 
static av_always_inline int isYUV(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return !(desc->flags & AV_PIX_FMT_FLAG_RGB) && desc->nb_components >= 2;
}
 
static av_always_inline int isPlanarYUV(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return ((desc->flags & AV_PIX_FMT_FLAG_PLANAR) && isYUV(pix_fmt));
}
 
static av_always_inline int isRGB(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return (desc->flags & AV_PIX_FMT_FLAG_RGB);
}
 
#if 0 // FIXME
#define isGray(x) \
(!(av_pix_fmt_desc_get(x)->flags & AV_PIX_FMT_FLAG_PAL) && \
av_pix_fmt_desc_get(x)->nb_components <= 2)
#else
#define isGray(x) \
((x) == AV_PIX_FMT_GRAY8 || \
(x) == AV_PIX_FMT_YA8 || \
(x) == AV_PIX_FMT_GRAY16BE || \
(x) == AV_PIX_FMT_GRAY16LE || \
(x) == AV_PIX_FMT_YA16BE || \
(x) == AV_PIX_FMT_YA16LE)
#endif
 
#define isRGBinInt(x) \
( \
(x) == AV_PIX_FMT_RGB48BE || \
(x) == AV_PIX_FMT_RGB48LE || \
(x) == AV_PIX_FMT_RGB32 || \
(x) == AV_PIX_FMT_RGB32_1 || \
(x) == AV_PIX_FMT_RGB24 || \
(x) == AV_PIX_FMT_RGB565BE || \
(x) == AV_PIX_FMT_RGB565LE || \
(x) == AV_PIX_FMT_RGB555BE || \
(x) == AV_PIX_FMT_RGB555LE || \
(x) == AV_PIX_FMT_RGB444BE || \
(x) == AV_PIX_FMT_RGB444LE || \
(x) == AV_PIX_FMT_RGB8 || \
(x) == AV_PIX_FMT_RGB4 || \
(x) == AV_PIX_FMT_RGB4_BYTE || \
(x) == AV_PIX_FMT_RGBA64BE || \
(x) == AV_PIX_FMT_RGBA64LE || \
(x) == AV_PIX_FMT_MONOBLACK || \
(x) == AV_PIX_FMT_MONOWHITE \
)
#define isBGRinInt(x) \
( \
(x) == AV_PIX_FMT_BGR48BE || \
(x) == AV_PIX_FMT_BGR48LE || \
(x) == AV_PIX_FMT_BGR32 || \
(x) == AV_PIX_FMT_BGR32_1 || \
(x) == AV_PIX_FMT_BGR24 || \
(x) == AV_PIX_FMT_BGR565BE || \
(x) == AV_PIX_FMT_BGR565LE || \
(x) == AV_PIX_FMT_BGR555BE || \
(x) == AV_PIX_FMT_BGR555LE || \
(x) == AV_PIX_FMT_BGR444BE || \
(x) == AV_PIX_FMT_BGR444LE || \
(x) == AV_PIX_FMT_BGR8 || \
(x) == AV_PIX_FMT_BGR4 || \
(x) == AV_PIX_FMT_BGR4_BYTE || \
(x) == AV_PIX_FMT_BGRA64BE || \
(x) == AV_PIX_FMT_BGRA64LE || \
(x) == AV_PIX_FMT_MONOBLACK || \
(x) == AV_PIX_FMT_MONOWHITE \
)
 
#define isRGBinBytes(x) ( \
(x) == AV_PIX_FMT_RGB48BE \
|| (x) == AV_PIX_FMT_RGB48LE \
|| (x) == AV_PIX_FMT_RGBA64BE \
|| (x) == AV_PIX_FMT_RGBA64LE \
|| (x) == AV_PIX_FMT_RGBA \
|| (x) == AV_PIX_FMT_ARGB \
|| (x) == AV_PIX_FMT_RGB24 \
)
#define isBGRinBytes(x) ( \
(x) == AV_PIX_FMT_BGR48BE \
|| (x) == AV_PIX_FMT_BGR48LE \
|| (x) == AV_PIX_FMT_BGRA64BE \
|| (x) == AV_PIX_FMT_BGRA64LE \
|| (x) == AV_PIX_FMT_BGRA \
|| (x) == AV_PIX_FMT_ABGR \
|| (x) == AV_PIX_FMT_BGR24 \
)
 
#define isBayer(x) ( \
(x)==AV_PIX_FMT_BAYER_BGGR8 \
|| (x)==AV_PIX_FMT_BAYER_BGGR16LE \
|| (x)==AV_PIX_FMT_BAYER_BGGR16BE \
|| (x)==AV_PIX_FMT_BAYER_RGGB8 \
|| (x)==AV_PIX_FMT_BAYER_RGGB16LE \
|| (x)==AV_PIX_FMT_BAYER_RGGB16BE \
|| (x)==AV_PIX_FMT_BAYER_GBRG8 \
|| (x)==AV_PIX_FMT_BAYER_GBRG16LE \
|| (x)==AV_PIX_FMT_BAYER_GBRG16BE \
|| (x)==AV_PIX_FMT_BAYER_GRBG8 \
|| (x)==AV_PIX_FMT_BAYER_GRBG16LE \
|| (x)==AV_PIX_FMT_BAYER_GRBG16BE \
)
 
#define isAnyRGB(x) \
( \
isBayer(x) || \
isRGBinInt(x) || \
isBGRinInt(x) || \
isRGB(x) \
)
 
static av_always_inline int isALPHA(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
if (pix_fmt == AV_PIX_FMT_PAL8)
return 1;
return desc->flags & AV_PIX_FMT_FLAG_ALPHA;
}
 
#if 1
#define isPacked(x) ( \
(x)==AV_PIX_FMT_PAL8 \
|| (x)==AV_PIX_FMT_YUYV422 \
|| (x)==AV_PIX_FMT_YVYU422 \
|| (x)==AV_PIX_FMT_UYVY422 \
|| (x)==AV_PIX_FMT_YA8 \
|| (x)==AV_PIX_FMT_YA16LE \
|| (x)==AV_PIX_FMT_YA16BE \
|| (x)==AV_PIX_FMT_AYUV64LE \
|| (x)==AV_PIX_FMT_AYUV64BE \
|| isRGBinInt(x) \
|| isBGRinInt(x) \
)
#else
static av_always_inline int isPacked(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return ((desc->nb_components >= 2 && !(desc->flags & AV_PIX_FMT_FLAG_PLANAR)) ||
pix_fmt == AV_PIX_FMT_PAL8);
}
 
#endif
static av_always_inline int isPlanar(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return (desc->nb_components >= 2 && (desc->flags & AV_PIX_FMT_FLAG_PLANAR));
}
 
static av_always_inline int isPackedRGB(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return ((desc->flags & (AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB)) == AV_PIX_FMT_FLAG_RGB);
}
 
static av_always_inline int isPlanarRGB(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return ((desc->flags & (AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB)) ==
(AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB));
}
 
static av_always_inline int usePal(enum AVPixelFormat pix_fmt)
{
const AVPixFmtDescriptor *desc = av_pix_fmt_desc_get(pix_fmt);
av_assert0(desc);
return (desc->flags & AV_PIX_FMT_FLAG_PAL) || (desc->flags & AV_PIX_FMT_FLAG_PSEUDOPAL);
}
 
extern const uint64_t ff_dither4[2];
extern const uint64_t ff_dither8[2];
 
extern const uint8_t ff_dither_2x2_4[3][8];
extern const uint8_t ff_dither_2x2_8[3][8];
extern const uint8_t ff_dither_4x4_16[5][8];
extern const uint8_t ff_dither_8x8_32[9][8];
extern const uint8_t ff_dither_8x8_73[9][8];
extern const uint8_t ff_dither_8x8_128[9][8];
extern const uint8_t ff_dither_8x8_220[9][8];
 
extern const int32_t ff_yuv2rgb_coeffs[8][4];
 
extern const AVClass sws_context_class;
 
/**
* Set c->swscale to an unscaled converter if one exists for the specific
* source and destination formats, bit depths, flags, etc.
*/
void ff_get_unscaled_swscale(SwsContext *c);
void ff_get_unscaled_swscale_ppc(SwsContext *c);
void ff_get_unscaled_swscale_arm(SwsContext *c);
 
/**
* Return function pointer to fastest main scaler path function depending
* on architecture and available optimizations.
*/
SwsFunc ff_getSwsFunc(SwsContext *c);
 
void ff_sws_init_input_funcs(SwsContext *c);
void ff_sws_init_output_funcs(SwsContext *c,
yuv2planar1_fn *yuv2plane1,
yuv2planarX_fn *yuv2planeX,
yuv2interleavedX_fn *yuv2nv12cX,
yuv2packed1_fn *yuv2packed1,
yuv2packed2_fn *yuv2packed2,
yuv2packedX_fn *yuv2packedX,
yuv2anyX_fn *yuv2anyX);
void ff_sws_init_swscale_ppc(SwsContext *c);
void ff_sws_init_swscale_x86(SwsContext *c);
 
void ff_hyscale_fast_c(SwsContext *c, int16_t *dst, int dstWidth,
const uint8_t *src, int srcW, int xInc);
void ff_hcscale_fast_c(SwsContext *c, int16_t *dst1, int16_t *dst2,
int dstWidth, const uint8_t *src1,
const uint8_t *src2, int srcW, int xInc);
int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
int16_t *filter, int32_t *filterPos,
int numSplits);
void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
int dstWidth, const uint8_t *src,
int srcW, int xInc);
void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
int dstWidth, const uint8_t *src1,
const uint8_t *src2, int srcW, int xInc);
 
/**
* Allocate and return an SwsContext.
* This is like sws_getContext() but does not perform the init step, allowing
* the user to set additional AVOptions.
*
* @see sws_getContext()
*/
struct SwsContext *sws_alloc_set_opts(int srcW, int srcH, enum AVPixelFormat srcFormat,
int dstW, int dstH, enum AVPixelFormat dstFormat,
int flags, const double *param);
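 
#if 0 /* illustrative usage sketch, not part of the original header */
/* A caller might combine this with the AVOptions API roughly as follows;
 * sws_init_context(), av_opt_set_int() and sws_freeContext() are the
 * existing public entry points, and "src_range" is one of the AVOptions
 * registered on SwsContext: */
static struct SwsContext *alloc_scaler_with_opts(void)
{
    struct SwsContext *ctx = sws_alloc_set_opts(1920, 1080, AV_PIX_FMT_YUV420P,
                                                1280,  720, AV_PIX_FMT_RGB24,
                                                SWS_BILINEAR, NULL);
    if (!ctx)
        return NULL;
    av_opt_set_int(ctx, "src_range", 1, 0); /* e.g. treat input as full range */
    if (sws_init_context(ctx, NULL, NULL) < 0) {
        sws_freeContext(ctx);
        return NULL;
    }
    return ctx;
}
#endif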
 
int ff_sws_alphablendaway(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[]);
 
static inline void fillPlane16(uint8_t *plane, int stride, int width, int height, int y,
int alpha, int bits, const int big_endian)
{
int i, j;
uint8_t *ptr = plane + stride * y;
int v = alpha ? 0xFFFF>>(15-bits) : (1<<bits);
for (i = 0; i < height; i++) {
#define FILL(wfunc) \
for (j = 0; j < width; j++) {\
wfunc(ptr+2*j, v);\
}
if (big_endian) {
FILL(AV_WB16);
} else {
FILL(AV_WL16);
}
ptr += stride;
}
}
 
#define MAX_SLICE_PLANES 4
 
/// Slice plane
typedef struct SwsPlane
{
int available_lines; ///< max number of lines that can be held by this plane
int sliceY; ///< index of first line
int sliceH; ///< number of lines
uint8_t **line; ///< line buffer
uint8_t **tmp; ///< Temporary line buffer used by the MMX code
} SwsPlane;
 
/**
* Struct which defines a slice of an image to be scaled or an output for
* a scaled slice.
* A slice can also be used as an intermediate ring buffer for scaling steps.
*/
typedef struct SwsSlice
{
int width; ///< Slice line width
int h_chr_sub_sample; ///< horizontal chroma subsampling factor
int v_chr_sub_sample; ///< vertical chroma subsampling factor
int is_ring; ///< flag to identify if this slice is a ring buffer
int should_free_lines; ///< flag to identify if there are dynamically allocated lines
enum AVPixelFormat fmt; ///< planes pixel format
SwsPlane plane[MAX_SLICE_PLANES]; ///< color planes
} SwsSlice;
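 
#if 0 /* conceptual sketch, not the actual slice code */
/* With is_ring set, a plane that owns available_lines buffers can map a
 * source line y in [sliceY, sliceY + sliceH) onto a physical buffer by
 * wrapping the index, along these lines (the real bookkeeping lives in
 * slice.c): */
static uint8_t *slice_get_line_sketch(const SwsPlane *p, int y)
{
    return p->line[(y - p->sliceY) % p->available_lines];
}
#endif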
 
/**
* Struct which holds all necessary data for processing a slice.
* A processing step can be a color conversion or horizontal/vertical scaling.
*/
typedef struct SwsFilterDescriptor
{
SwsSlice *src; ///< Source slice
SwsSlice *dst; ///< Output slice
 
int alpha; ///< Flag for processing alpha channel
void *instance; ///< Filter instance data
 
/// Function for processing sliceH lines of the input slice, starting at line sliceY
int (*process)(SwsContext *c, struct SwsFilterDescriptor *desc, int sliceY, int sliceH);
} SwsFilterDescriptor;
 
/// Color conversion instance data
typedef struct ColorContext
{
uint32_t *pal;
} ColorContext;
 
/// Scaler instance data
typedef struct FilterContext
{
uint16_t *filter;
int *filter_pos;
int filter_size;
int xInc;
} FilterContext;
 
// wrap input lines of the form (src + width*i + j) into slice format (line[i][j])
// relative=true means the first line is src[x][0]; otherwise the first line is src[x][lum/chr Y]
int ff_init_slice_from_src(SwsSlice * s, uint8_t *src[4], int stride[4], int srcW, int lumY, int lumH, int chrY, int chrH, int relative);
 
// Initialize scaler filter descriptor chain
int ff_init_filters(SwsContext *c);
 
// Free all filter data
int ff_free_filters(SwsContext *c);
 
/*
 Apply ring buffer logic to slice s:
 if the slice can hold @lum more luma lines, do nothing;
 otherwise drop the @lum least recently used lines.
 The same procedure is applied for @chr chroma lines.
*/
int ff_rotate_slice(SwsSlice *s, int lum, int chr);
 
/// initializes gamma conversion descriptor
int ff_init_gamma_convert(SwsFilterDescriptor *desc, SwsSlice * src, uint16_t *table);
 
/// initializes lum pixel format conversion descriptor
int ff_init_desc_fmt_convert(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst, uint32_t *pal);
 
/// initializes lum horizontal scaling descriptor
int ff_init_desc_hscale(SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst, uint16_t *filter, int * filter_pos, int filter_size, int xInc);
 
/// initializes chr pixel format conversion descriptor
int ff_init_desc_cfmt_convert(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst, uint32_t *pal);
 
/// initializes chr horizontal scaling descriptor
int ff_init_desc_chscale(SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst, uint16_t *filter, int * filter_pos, int filter_size, int xInc);
 
int ff_init_desc_no_chr(SwsFilterDescriptor *desc, SwsSlice * src, SwsSlice *dst);
 
/// initializes vertical scaling descriptors
int ff_init_vscale(SwsContext *c, SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst);
 
/// setup vertical scaler functions
void ff_init_vscale_pfn(SwsContext *c, yuv2planar1_fn yuv2plane1, yuv2planarX_fn yuv2planeX,
yuv2interleavedX_fn yuv2nv12cX, yuv2packed1_fn yuv2packed1, yuv2packed2_fn yuv2packed2,
yuv2packedX_fn yuv2packedX, yuv2anyX_fn yuv2anyX, int use_mmx);
 
// number of extra lines to process
#define MAX_LINES_AHEAD 4
 
// enable use of refactored scaler code
#define NEW_FILTER
 
#endif /* SWSCALE_SWSCALE_INTERNAL_H */
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/swscale_unscaled.c
0,0 → 1,1792
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <inttypes.h>
#include <string.h>
#include <math.h>
#include <stdio.h>
#include "config.h"
#include "swscale.h"
#include "swscale_internal.h"
#include "rgb2rgb.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/cpu.h"
#include "libavutil/avutil.h"
#include "libavutil/mathematics.h"
#include "libavutil/bswap.h"
#include "libavutil/pixdesc.h"
#include "libavutil/avassert.h"
 
DECLARE_ALIGNED(8, static const uint8_t, dithers)[8][8][8]={
{
{ 0, 1, 0, 1, 0, 1, 0, 1,},
{ 1, 0, 1, 0, 1, 0, 1, 0,},
{ 0, 1, 0, 1, 0, 1, 0, 1,},
{ 1, 0, 1, 0, 1, 0, 1, 0,},
{ 0, 1, 0, 1, 0, 1, 0, 1,},
{ 1, 0, 1, 0, 1, 0, 1, 0,},
{ 0, 1, 0, 1, 0, 1, 0, 1,},
{ 1, 0, 1, 0, 1, 0, 1, 0,},
},{
{ 1, 2, 1, 2, 1, 2, 1, 2,},
{ 3, 0, 3, 0, 3, 0, 3, 0,},
{ 1, 2, 1, 2, 1, 2, 1, 2,},
{ 3, 0, 3, 0, 3, 0, 3, 0,},
{ 1, 2, 1, 2, 1, 2, 1, 2,},
{ 3, 0, 3, 0, 3, 0, 3, 0,},
{ 1, 2, 1, 2, 1, 2, 1, 2,},
{ 3, 0, 3, 0, 3, 0, 3, 0,},
},{
{ 2, 4, 3, 5, 2, 4, 3, 5,},
{ 6, 0, 7, 1, 6, 0, 7, 1,},
{ 3, 5, 2, 4, 3, 5, 2, 4,},
{ 7, 1, 6, 0, 7, 1, 6, 0,},
{ 2, 4, 3, 5, 2, 4, 3, 5,},
{ 6, 0, 7, 1, 6, 0, 7, 1,},
{ 3, 5, 2, 4, 3, 5, 2, 4,},
{ 7, 1, 6, 0, 7, 1, 6, 0,},
},{
{ 4, 8, 7, 11, 4, 8, 7, 11,},
{ 12, 0, 15, 3, 12, 0, 15, 3,},
{ 6, 10, 5, 9, 6, 10, 5, 9,},
{ 14, 2, 13, 1, 14, 2, 13, 1,},
{ 4, 8, 7, 11, 4, 8, 7, 11,},
{ 12, 0, 15, 3, 12, 0, 15, 3,},
{ 6, 10, 5, 9, 6, 10, 5, 9,},
{ 14, 2, 13, 1, 14, 2, 13, 1,},
},{
{ 9, 17, 15, 23, 8, 16, 14, 22,},
{ 25, 1, 31, 7, 24, 0, 30, 6,},
{ 13, 21, 11, 19, 12, 20, 10, 18,},
{ 29, 5, 27, 3, 28, 4, 26, 2,},
{ 8, 16, 14, 22, 9, 17, 15, 23,},
{ 24, 0, 30, 6, 25, 1, 31, 7,},
{ 12, 20, 10, 18, 13, 21, 11, 19,},
{ 28, 4, 26, 2, 29, 5, 27, 3,},
},{
{ 18, 34, 30, 46, 17, 33, 29, 45,},
{ 50, 2, 62, 14, 49, 1, 61, 13,},
{ 26, 42, 22, 38, 25, 41, 21, 37,},
{ 58, 10, 54, 6, 57, 9, 53, 5,},
{ 16, 32, 28, 44, 19, 35, 31, 47,},
{ 48, 0, 60, 12, 51, 3, 63, 15,},
{ 24, 40, 20, 36, 27, 43, 23, 39,},
{ 56, 8, 52, 4, 59, 11, 55, 7,},
},{
{ 18, 34, 30, 46, 17, 33, 29, 45,},
{ 50, 2, 62, 14, 49, 1, 61, 13,},
{ 26, 42, 22, 38, 25, 41, 21, 37,},
{ 58, 10, 54, 6, 57, 9, 53, 5,},
{ 16, 32, 28, 44, 19, 35, 31, 47,},
{ 48, 0, 60, 12, 51, 3, 63, 15,},
{ 24, 40, 20, 36, 27, 43, 23, 39,},
{ 56, 8, 52, 4, 59, 11, 55, 7,},
},{
{ 36, 68, 60, 92, 34, 66, 58, 90,},
{ 100, 4,124, 28, 98, 2,122, 26,},
{ 52, 84, 44, 76, 50, 82, 42, 74,},
{ 116, 20,108, 12,114, 18,106, 10,},
{ 32, 64, 56, 88, 38, 70, 62, 94,},
{ 96, 0,120, 24,102, 6,126, 30,},
{ 48, 80, 40, 72, 54, 86, 46, 78,},
{ 112, 16,104, 8,118, 22,110, 14,},
}};
 
static const uint16_t dither_scale[15][16]={
{ 2, 3, 3, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,},
{ 2, 3, 7, 7, 13, 13, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25,},
{ 3, 3, 4, 15, 15, 29, 57, 57, 57, 113, 113, 113, 113, 113, 113, 113,},
{ 3, 4, 4, 5, 31, 31, 61, 121, 241, 241, 241, 241, 481, 481, 481, 481,},
{ 3, 4, 5, 5, 6, 63, 63, 125, 249, 497, 993, 993, 993, 993, 993, 1985,},
{ 3, 5, 6, 6, 6, 7, 127, 127, 253, 505, 1009, 2017, 4033, 4033, 4033, 4033,},
{ 3, 5, 6, 7, 7, 7, 8, 255, 255, 509, 1017, 2033, 4065, 8129,16257,16257,},
{ 3, 5, 6, 8, 8, 8, 8, 9, 511, 511, 1021, 2041, 4081, 8161,16321,32641,},
{ 3, 5, 7, 8, 9, 9, 9, 9, 10, 1023, 1023, 2045, 4089, 8177,16353,32705,},
{ 3, 5, 7, 8, 10, 10, 10, 10, 10, 11, 2047, 2047, 4093, 8185,16369,32737,},
{ 3, 5, 7, 8, 10, 11, 11, 11, 11, 11, 12, 4095, 4095, 8189,16377,32753,},
{ 3, 5, 7, 9, 10, 12, 12, 12, 12, 12, 12, 13, 8191, 8191,16381,32761,},
{ 3, 5, 7, 9, 10, 12, 13, 13, 13, 13, 13, 13, 14,16383,16383,32765,},
{ 3, 5, 7, 9, 10, 12, 14, 14, 14, 14, 14, 14, 14, 15,32767,32767,},
{ 3, 5, 7, 9, 11, 12, 14, 15, 15, 15, 15, 15, 15, 15, 16,65535,},
};
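 
#if 0 /* illustrative sketch, not part of the original file */
/* How such an ordered-dither matrix is typically applied when dropping
 * 'lost' bits of depth: add the per-pixel threshold before shifting
 * (dithers[lost - 1] is the 8x8 matrix for removing 'lost' bits; the
 * real per-format loops below handle strides and endianness too): */
static void dither_line_sketch(const uint16_t *src, uint8_t *dst, int width,
                               int y, int lost)
{
    const uint8_t *d = dithers[lost - 1][y & 7];
    int x;
    for (x = 0; x < width; x++)
        dst[x] = FFMIN((src[x] + d[x & 7]) >> lost, 255);
}
#endif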
 
 
static void fillPlane(uint8_t *plane, int stride, int width, int height, int y,
uint8_t val)
{
int i;
uint8_t *ptr = plane + stride * y;
for (i = 0; i < height; i++) {
memset(ptr, val, width);
ptr += stride;
}
}
 
static void copyPlane(const uint8_t *src, int srcStride,
int srcSliceY, int srcSliceH, int width,
uint8_t *dst, int dstStride)
{
dst += dstStride * srcSliceY;
if (dstStride == srcStride && srcStride > 0) {
memcpy(dst, src, srcSliceH * dstStride);
} else {
int i;
for (i = 0; i < srcSliceH; i++) {
memcpy(dst, src, width);
src += srcStride;
dst += dstStride;
}
}
}
 
static int planarToNv12Wrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY,
int srcSliceH, uint8_t *dstParam[],
int dstStride[])
{
uint8_t *dst = dstParam[1] + dstStride[1] * srcSliceY / 2;
 
copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
dstParam[0], dstStride[0]);
 
if (c->dstFormat == AV_PIX_FMT_NV12)
interleaveBytes(src[1], src[2], dst, c->srcW / 2, srcSliceH / 2,
srcStride[1], srcStride[2], dstStride[1]);
else
interleaveBytes(src[2], src[1], dst, c->srcW / 2, srcSliceH / 2,
srcStride[2], srcStride[1], dstStride[1]);
 
return srcSliceH;
}
 
static int nv12ToPlanarWrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY,
int srcSliceH, uint8_t *dstParam[],
int dstStride[])
{
uint8_t *dst1 = dstParam[1] + dstStride[1] * srcSliceY / 2;
uint8_t *dst2 = dstParam[2] + dstStride[2] * srcSliceY / 2;
 
copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
dstParam[0], dstStride[0]);
 
if (c->srcFormat == AV_PIX_FMT_NV12)
deinterleaveBytes(src[1], dst1, dst2,c->srcW / 2, srcSliceH / 2,
srcStride[1], dstStride[1], dstStride[2]);
else
deinterleaveBytes(src[1], dst2, dst1, c->srcW / 2, srcSliceH / 2,
srcStride[1], dstStride[2], dstStride[1]);
 
return srcSliceH;
}
 
static int planarToYuy2Wrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dstParam[], int dstStride[])
{
uint8_t *dst = dstParam[0] + dstStride[0] * srcSliceY;
 
yv12toyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0],
srcStride[1], dstStride[0]);
 
return srcSliceH;
}
 
static int planarToUyvyWrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dstParam[], int dstStride[])
{
uint8_t *dst = dstParam[0] + dstStride[0] * srcSliceY;
 
yv12touyvy(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0],
srcStride[1], dstStride[0]);
 
return srcSliceH;
}
 
static int yuv422pToYuy2Wrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dstParam[], int dstStride[])
{
uint8_t *dst = dstParam[0] + dstStride[0] * srcSliceY;
 
yuv422ptoyuy2(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0],
srcStride[1], dstStride[0]);
 
return srcSliceH;
}
 
static int yuv422pToUyvyWrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dstParam[], int dstStride[])
{
uint8_t *dst = dstParam[0] + dstStride[0] * srcSliceY;
 
yuv422ptouyvy(src[0], src[1], src[2], dst, c->srcW, srcSliceH, srcStride[0],
srcStride[1], dstStride[0]);
 
return srcSliceH;
}
 
static int yuyvToYuv420Wrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dstParam[], int dstStride[])
{
uint8_t *ydst = dstParam[0] + dstStride[0] * srcSliceY;
uint8_t *udst = dstParam[1] + dstStride[1] * srcSliceY / 2;
uint8_t *vdst = dstParam[2] + dstStride[2] * srcSliceY / 2;
 
yuyvtoyuv420(ydst, udst, vdst, src[0], c->srcW, srcSliceH, dstStride[0],
dstStride[1], srcStride[0]);
 
if (dstParam[3])
fillPlane(dstParam[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
 
return srcSliceH;
}
 
static int yuyvToYuv422Wrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dstParam[], int dstStride[])
{
uint8_t *ydst = dstParam[0] + dstStride[0] * srcSliceY;
uint8_t *udst = dstParam[1] + dstStride[1] * srcSliceY;
uint8_t *vdst = dstParam[2] + dstStride[2] * srcSliceY;
 
yuyvtoyuv422(ydst, udst, vdst, src[0], c->srcW, srcSliceH, dstStride[0],
dstStride[1], srcStride[0]);
 
return srcSliceH;
}
 
static int uyvyToYuv420Wrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dstParam[], int dstStride[])
{
uint8_t *ydst = dstParam[0] + dstStride[0] * srcSliceY;
uint8_t *udst = dstParam[1] + dstStride[1] * srcSliceY / 2;
uint8_t *vdst = dstParam[2] + dstStride[2] * srcSliceY / 2;
 
uyvytoyuv420(ydst, udst, vdst, src[0], c->srcW, srcSliceH, dstStride[0],
dstStride[1], srcStride[0]);
 
if (dstParam[3])
fillPlane(dstParam[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
 
return srcSliceH;
}
 
static int uyvyToYuv422Wrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dstParam[], int dstStride[])
{
uint8_t *ydst = dstParam[0] + dstStride[0] * srcSliceY;
uint8_t *udst = dstParam[1] + dstStride[1] * srcSliceY;
uint8_t *vdst = dstParam[2] + dstStride[2] * srcSliceY;
 
uyvytoyuv422(ydst, udst, vdst, src[0], c->srcW, srcSliceH, dstStride[0],
dstStride[1], srcStride[0]);
 
return srcSliceH;
}
 
static void gray8aToPacked32(const uint8_t *src, uint8_t *dst, int num_pixels,
const uint8_t *palette)
{
int i;
for (i = 0; i < num_pixels; i++)
((uint32_t *) dst)[i] = ((const uint32_t *) palette)[src[i << 1]] | (src[(i << 1) + 1] << 24);
}
 
static void gray8aToPacked32_1(const uint8_t *src, uint8_t *dst, int num_pixels,
const uint8_t *palette)
{
int i;
 
for (i = 0; i < num_pixels; i++)
((uint32_t *) dst)[i] = ((const uint32_t *) palette)[src[i << 1]] | src[(i << 1) + 1];
}
 
static void gray8aToPacked24(const uint8_t *src, uint8_t *dst, int num_pixels,
const uint8_t *palette)
{
int i;
 
for (i = 0; i < num_pixels; i++) {
//FIXME slow?
dst[0] = palette[src[i << 1] * 4 + 0];
dst[1] = palette[src[i << 1] * 4 + 1];
dst[2] = palette[src[i << 1] * 4 + 2];
dst += 3;
}
}
 
static int packed_16bpc_bswap(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int i, j, p;
 
for (p = 0; p < 4; p++) {
int srcstr = srcStride[p] / 2;
int dststr = dstStride[p] / 2;
uint16_t *dstPtr = (uint16_t *) dst[p];
const uint16_t *srcPtr = (const uint16_t *) src[p];
int min_stride = FFMIN(FFABS(srcstr), FFABS(dststr));
if(!dstPtr || !srcPtr)
continue;
for (i = 0; i < (srcSliceH >> c->chrDstVSubSample); i++) {
for (j = 0; j < min_stride; j++) {
dstPtr[j] = av_bswap16(srcPtr[j]);
}
srcPtr += srcstr;
dstPtr += dststr;
}
}
 
return srcSliceH;
}
 
static int palToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[],
int srcSliceY, int srcSliceH, uint8_t *dst[],
int dstStride[])
{
const enum AVPixelFormat srcFormat = c->srcFormat;
const enum AVPixelFormat dstFormat = c->dstFormat;
void (*conv)(const uint8_t *src, uint8_t *dst, int num_pixels,
const uint8_t *palette) = NULL;
int i;
uint8_t *dstPtr = dst[0] + dstStride[0] * srcSliceY;
const uint8_t *srcPtr = src[0];
 
if (srcFormat == AV_PIX_FMT_YA8) {
switch (dstFormat) {
case AV_PIX_FMT_RGB32 : conv = gray8aToPacked32; break;
case AV_PIX_FMT_BGR32 : conv = gray8aToPacked32; break;
case AV_PIX_FMT_BGR32_1: conv = gray8aToPacked32_1; break;
case AV_PIX_FMT_RGB32_1: conv = gray8aToPacked32_1; break;
case AV_PIX_FMT_RGB24 : conv = gray8aToPacked24; break;
case AV_PIX_FMT_BGR24 : conv = gray8aToPacked24; break;
}
} else if (usePal(srcFormat)) {
switch (dstFormat) {
case AV_PIX_FMT_RGB32 : conv = sws_convertPalette8ToPacked32; break;
case AV_PIX_FMT_BGR32 : conv = sws_convertPalette8ToPacked32; break;
case AV_PIX_FMT_BGR32_1: conv = sws_convertPalette8ToPacked32; break;
case AV_PIX_FMT_RGB32_1: conv = sws_convertPalette8ToPacked32; break;
case AV_PIX_FMT_RGB24 : conv = sws_convertPalette8ToPacked24; break;
case AV_PIX_FMT_BGR24 : conv = sws_convertPalette8ToPacked24; break;
}
}
 
if (!conv)
av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat));
else {
for (i = 0; i < srcSliceH; i++) {
conv(srcPtr, dstPtr, c->srcW, (uint8_t *) c->pal_rgb);
srcPtr += srcStride[0];
dstPtr += dstStride[0];
}
}
 
return srcSliceH;
}
 
static void packed16togbra16(const uint8_t *src, int srcStride,
uint16_t *dst[], int dstStride[], int srcSliceH,
int src_alpha, int swap, int shift, int width)
{
int x, h, i;
int dst_alpha = dst[3] != NULL;
for (h = 0; h < srcSliceH; h++) {
uint16_t *src_line = (uint16_t *)(src + srcStride * h);
switch (swap) {
case 3:
if (src_alpha && dst_alpha) {
for (x = 0; x < width; x++) {
dst[0][x] = av_bswap16(av_bswap16(*src_line++) >> shift);
dst[1][x] = av_bswap16(av_bswap16(*src_line++) >> shift);
dst[2][x] = av_bswap16(av_bswap16(*src_line++) >> shift);
dst[3][x] = av_bswap16(av_bswap16(*src_line++) >> shift);
}
} else if (dst_alpha) {
for (x = 0; x < width; x++) {
dst[0][x] = av_bswap16(av_bswap16(*src_line++) >> shift);
dst[1][x] = av_bswap16(av_bswap16(*src_line++) >> shift);
dst[2][x] = av_bswap16(av_bswap16(*src_line++) >> shift);
dst[3][x] = 0xFFFF;
}
} else if (src_alpha) {
for (x = 0; x < width; x++) {
dst[0][x] = av_bswap16(av_bswap16(*src_line++) >> shift);
dst[1][x] = av_bswap16(av_bswap16(*src_line++) >> shift);
dst[2][x] = av_bswap16(av_bswap16(*src_line++) >> shift);
src_line++;
}
} else {
for (x = 0; x < width; x++) {
dst[0][x] = av_bswap16(av_bswap16(*src_line++) >> shift);
dst[1][x] = av_bswap16(av_bswap16(*src_line++) >> shift);
dst[2][x] = av_bswap16(av_bswap16(*src_line++) >> shift);
}
}
break;
case 2:
if (src_alpha && dst_alpha) {
for (x = 0; x < width; x++) {
dst[0][x] = av_bswap16(*src_line++ >> shift);
dst[1][x] = av_bswap16(*src_line++ >> shift);
dst[2][x] = av_bswap16(*src_line++ >> shift);
dst[3][x] = av_bswap16(*src_line++ >> shift);
}
} else if (dst_alpha) {
for (x = 0; x < width; x++) {
dst[0][x] = av_bswap16(*src_line++ >> shift);
dst[1][x] = av_bswap16(*src_line++ >> shift);
dst[2][x] = av_bswap16(*src_line++ >> shift);
dst[3][x] = 0xFFFF;
}
} else if (src_alpha) {
for (x = 0; x < width; x++) {
dst[0][x] = av_bswap16(*src_line++ >> shift);
dst[1][x] = av_bswap16(*src_line++ >> shift);
dst[2][x] = av_bswap16(*src_line++ >> shift);
src_line++;
}
} else {
for (x = 0; x < width; x++) {
dst[0][x] = av_bswap16(*src_line++ >> shift);
dst[1][x] = av_bswap16(*src_line++ >> shift);
dst[2][x] = av_bswap16(*src_line++ >> shift);
}
}
break;
case 1:
if (src_alpha && dst_alpha) {
for (x = 0; x < width; x++) {
dst[0][x] = av_bswap16(*src_line++) >> shift;
dst[1][x] = av_bswap16(*src_line++) >> shift;
dst[2][x] = av_bswap16(*src_line++) >> shift;
dst[3][x] = av_bswap16(*src_line++) >> shift;
}
} else if (dst_alpha) {
for (x = 0; x < width; x++) {
dst[0][x] = av_bswap16(*src_line++) >> shift;
dst[1][x] = av_bswap16(*src_line++) >> shift;
dst[2][x] = av_bswap16(*src_line++) >> shift;
dst[3][x] = 0xFFFF;
}
} else if (src_alpha) {
for (x = 0; x < width; x++) {
dst[0][x] = av_bswap16(*src_line++) >> shift;
dst[1][x] = av_bswap16(*src_line++) >> shift;
dst[2][x] = av_bswap16(*src_line++) >> shift;
src_line++;
}
} else {
for (x = 0; x < width; x++) {
dst[0][x] = av_bswap16(*src_line++) >> shift;
dst[1][x] = av_bswap16(*src_line++) >> shift;
dst[2][x] = av_bswap16(*src_line++) >> shift;
}
}
break;
default:
if (src_alpha && dst_alpha) {
for (x = 0; x < width; x++) {
dst[0][x] = *src_line++ >> shift;
dst[1][x] = *src_line++ >> shift;
dst[2][x] = *src_line++ >> shift;
dst[3][x] = *src_line++ >> shift;
}
} else if (dst_alpha) {
for (x = 0; x < width; x++) {
dst[0][x] = *src_line++ >> shift;
dst[1][x] = *src_line++ >> shift;
dst[2][x] = *src_line++ >> shift;
dst[3][x] = 0xFFFF;
}
} else if (src_alpha) {
for (x = 0; x < width; x++) {
dst[0][x] = *src_line++ >> shift;
dst[1][x] = *src_line++ >> shift;
dst[2][x] = *src_line++ >> shift;
src_line++;
}
} else {
for (x = 0; x < width; x++) {
dst[0][x] = *src_line++ >> shift;
dst[1][x] = *src_line++ >> shift;
dst[2][x] = *src_line++ >> shift;
}
}
}
for (i = 0; i < 4; i++)
dst[i] += dstStride[i] >> 1;
}
}
 
static int Rgb16ToPlanarRgb16Wrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
uint16_t *dst2013[] = { (uint16_t *)dst[2], (uint16_t *)dst[0], (uint16_t *)dst[1], (uint16_t *)dst[3] };
uint16_t *dst1023[] = { (uint16_t *)dst[1], (uint16_t *)dst[0], (uint16_t *)dst[2], (uint16_t *)dst[3] };
int stride2013[] = { dstStride[2], dstStride[0], dstStride[1], dstStride[3] };
int stride1023[] = { dstStride[1], dstStride[0], dstStride[2], dstStride[3] };
const AVPixFmtDescriptor *src_format = av_pix_fmt_desc_get(c->srcFormat);
const AVPixFmtDescriptor *dst_format = av_pix_fmt_desc_get(c->dstFormat);
int bpc = dst_format->comp[0].depth_minus1 + 1;
int alpha = src_format->flags & AV_PIX_FMT_FLAG_ALPHA;
int swap = 0;
if ( HAVE_BIGENDIAN && !(src_format->flags & AV_PIX_FMT_FLAG_BE) ||
!HAVE_BIGENDIAN && src_format->flags & AV_PIX_FMT_FLAG_BE)
swap++;
if ( HAVE_BIGENDIAN && !(dst_format->flags & AV_PIX_FMT_FLAG_BE) ||
!HAVE_BIGENDIAN && dst_format->flags & AV_PIX_FMT_FLAG_BE)
swap += 2;
 
if ((dst_format->flags & (AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB)) !=
(AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB) || bpc < 9) {
av_log(c, AV_LOG_ERROR, "unsupported conversion to planar RGB %s -> %s\n",
src_format->name, dst_format->name);
return srcSliceH;
}
switch (c->srcFormat) {
case AV_PIX_FMT_RGB48LE:
case AV_PIX_FMT_RGB48BE:
case AV_PIX_FMT_RGBA64LE:
case AV_PIX_FMT_RGBA64BE:
packed16togbra16(src[0] + srcSliceY * srcStride[0], srcStride[0],
dst2013, stride2013, srcSliceH, alpha, swap,
16 - bpc, c->srcW);
break;
case AV_PIX_FMT_BGR48LE:
case AV_PIX_FMT_BGR48BE:
case AV_PIX_FMT_BGRA64LE:
case AV_PIX_FMT_BGRA64BE:
packed16togbra16(src[0] + srcSliceY * srcStride[0], srcStride[0],
dst1023, stride1023, srcSliceH, alpha, swap,
16 - bpc, c->srcW);
break;
default:
av_log(c, AV_LOG_ERROR,
"unsupported conversion to planar RGB %s -> %s\n",
src_format->name, dst_format->name);
}
 
return srcSliceH;
}
 
static void gbr16ptopacked16(const uint16_t *src[], int srcStride[],
uint8_t *dst, int dstStride, int srcSliceH,
int alpha, int swap, int bpp, int width)
{
int x, h, i;
int src_alpha = src[3] != NULL;
int scale_high = 16 - bpp, scale_low = (bpp - 8) * 2;
for (h = 0; h < srcSliceH; h++) {
uint16_t *dest = (uint16_t *)(dst + dstStride * h);
uint16_t component;
 
switch(swap) {
case 3:
if (alpha && !src_alpha) {
for (x = 0; x < width; x++) {
component = av_bswap16(src[0][x]);
*dest++ = av_bswap16(component << scale_high | component >> scale_low);
component = av_bswap16(src[1][x]);
*dest++ = av_bswap16(component << scale_high | component >> scale_low);
component = av_bswap16(src[2][x]);
*dest++ = av_bswap16(component << scale_high | component >> scale_low);
*dest++ = 0xffff;
}
} else if (alpha && src_alpha) {
for (x = 0; x < width; x++) {
component = av_bswap16(src[0][x]);
*dest++ = av_bswap16(component << scale_high | component >> scale_low);
component = av_bswap16(src[1][x]);
*dest++ = av_bswap16(component << scale_high | component >> scale_low);
component = av_bswap16(src[2][x]);
*dest++ = av_bswap16(component << scale_high | component >> scale_low);
component = av_bswap16(src[3][x]);
*dest++ = av_bswap16(component << scale_high | component >> scale_low);
}
} else {
for (x = 0; x < width; x++) {
component = av_bswap16(src[0][x]);
*dest++ = av_bswap16(component << scale_high | component >> scale_low);
component = av_bswap16(src[1][x]);
*dest++ = av_bswap16(component << scale_high | component >> scale_low);
component = av_bswap16(src[2][x]);
*dest++ = av_bswap16(component << scale_high | component >> scale_low);
}
}
break;
case 2:
if (alpha && !src_alpha) {
for (x = 0; x < width; x++) {
*dest++ = av_bswap16(src[0][x] << scale_high | src[0][x] >> scale_low);
*dest++ = av_bswap16(src[1][x] << scale_high | src[1][x] >> scale_low);
*dest++ = av_bswap16(src[2][x] << scale_high | src[2][x] >> scale_low);
*dest++ = 0xffff;
}
} else if (alpha && src_alpha) {
for (x = 0; x < width; x++) {
*dest++ = av_bswap16(src[0][x] << scale_high | src[0][x] >> scale_low);
*dest++ = av_bswap16(src[1][x] << scale_high | src[1][x] >> scale_low);
*dest++ = av_bswap16(src[2][x] << scale_high | src[2][x] >> scale_low);
*dest++ = av_bswap16(src[3][x] << scale_high | src[3][x] >> scale_low);
}
} else {
for (x = 0; x < width; x++) {
*dest++ = av_bswap16(src[0][x] << scale_high | src[0][x] >> scale_low);
*dest++ = av_bswap16(src[1][x] << scale_high | src[1][x] >> scale_low);
*dest++ = av_bswap16(src[2][x] << scale_high | src[2][x] >> scale_low);
}
}
break;
case 1:
if (alpha && !src_alpha) {
for (x = 0; x < width; x++) {
*dest++ = av_bswap16(src[0][x]) << scale_high | av_bswap16(src[0][x]) >> scale_low;
*dest++ = av_bswap16(src[1][x]) << scale_high | av_bswap16(src[1][x]) >> scale_low;
*dest++ = av_bswap16(src[2][x]) << scale_high | av_bswap16(src[2][x]) >> scale_low;
*dest++ = 0xffff;
}
} else if (alpha && src_alpha) {
for (x = 0; x < width; x++) {
*dest++ = av_bswap16(src[0][x]) << scale_high | av_bswap16(src[0][x]) >> scale_low;
*dest++ = av_bswap16(src[1][x]) << scale_high | av_bswap16(src[1][x]) >> scale_low;
*dest++ = av_bswap16(src[2][x]) << scale_high | av_bswap16(src[2][x]) >> scale_low;
*dest++ = av_bswap16(src[3][x]) << scale_high | av_bswap16(src[3][x]) >> scale_low;
}
} else {
for (x = 0; x < width; x++) {
*dest++ = av_bswap16(src[0][x]) << scale_high | av_bswap16(src[0][x]) >> scale_low;
*dest++ = av_bswap16(src[1][x]) << scale_high | av_bswap16(src[1][x]) >> scale_low;
*dest++ = av_bswap16(src[2][x]) << scale_high | av_bswap16(src[2][x]) >> scale_low;
}
}
break;
default:
if (alpha && !src_alpha) {
for (x = 0; x < width; x++) {
*dest++ = src[0][x] << scale_high | src[0][x] >> scale_low;
*dest++ = src[1][x] << scale_high | src[1][x] >> scale_low;
*dest++ = src[2][x] << scale_high | src[2][x] >> scale_low;
*dest++ = 0xffff;
}
} else if (alpha && src_alpha) {
for (x = 0; x < width; x++) {
*dest++ = src[0][x] << scale_high | src[0][x] >> scale_low;
*dest++ = src[1][x] << scale_high | src[1][x] >> scale_low;
*dest++ = src[2][x] << scale_high | src[2][x] >> scale_low;
*dest++ = src[3][x] << scale_high | src[3][x] >> scale_low;
}
} else {
for (x = 0; x < width; x++) {
*dest++ = src[0][x] << scale_high | src[0][x] >> scale_low;
*dest++ = src[1][x] << scale_high | src[1][x] >> scale_low;
*dest++ = src[2][x] << scale_high | src[2][x] >> scale_low;
}
}
}
for (i = 0; i < 3 + src_alpha; i++)
src[i] += srcStride[i] >> 1;
}
}
 
static int planarRgb16ToRgb16Wrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
const uint16_t *src102[] = { (uint16_t *)src[1], (uint16_t *)src[0], (uint16_t *)src[2], (uint16_t *)src[3] };
const uint16_t *src201[] = { (uint16_t *)src[2], (uint16_t *)src[0], (uint16_t *)src[1], (uint16_t *)src[3] };
int stride102[] = { srcStride[1], srcStride[0], srcStride[2], srcStride[3] };
int stride201[] = { srcStride[2], srcStride[0], srcStride[1], srcStride[3] };
const AVPixFmtDescriptor *src_format = av_pix_fmt_desc_get(c->srcFormat);
const AVPixFmtDescriptor *dst_format = av_pix_fmt_desc_get(c->dstFormat);
int bits_per_sample = src_format->comp[0].depth_minus1 + 1;
int swap = 0;
if ( HAVE_BIGENDIAN && !(src_format->flags & AV_PIX_FMT_FLAG_BE) ||
!HAVE_BIGENDIAN && src_format->flags & AV_PIX_FMT_FLAG_BE)
swap++;
if ( HAVE_BIGENDIAN && !(dst_format->flags & AV_PIX_FMT_FLAG_BE) ||
!HAVE_BIGENDIAN && dst_format->flags & AV_PIX_FMT_FLAG_BE)
swap += 2;
 
if ((src_format->flags & (AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB)) !=
(AV_PIX_FMT_FLAG_PLANAR | AV_PIX_FMT_FLAG_RGB) ||
bits_per_sample <= 8) {
av_log(c, AV_LOG_ERROR, "unsupported planar RGB conversion %s -> %s\n",
src_format->name, dst_format->name);
return srcSliceH;
}
switch (c->dstFormat) {
case AV_PIX_FMT_BGR48LE:
case AV_PIX_FMT_BGR48BE:
gbr16ptopacked16(src102, stride102,
dst[0] + srcSliceY * dstStride[0], dstStride[0],
srcSliceH, 0, swap, bits_per_sample, c->srcW);
break;
case AV_PIX_FMT_RGB48LE:
case AV_PIX_FMT_RGB48BE:
gbr16ptopacked16(src201, stride201,
dst[0] + srcSliceY * dstStride[0], dstStride[0],
srcSliceH, 0, swap, bits_per_sample, c->srcW);
break;
case AV_PIX_FMT_RGBA64LE:
case AV_PIX_FMT_RGBA64BE:
gbr16ptopacked16(src201, stride201,
dst[0] + srcSliceY * dstStride[0], dstStride[0],
srcSliceH, 1, swap, bits_per_sample, c->srcW);
break;
case AV_PIX_FMT_BGRA64LE:
case AV_PIX_FMT_BGRA64BE:
gbr16ptopacked16(src102, stride102,
dst[0] + srcSliceY * dstStride[0], dstStride[0],
srcSliceH, 1, swap, bits_per_sample, c->srcW);
break;
default:
av_log(c, AV_LOG_ERROR,
"unsupported planar RGB conversion %s -> %s\n",
src_format->name, dst_format->name);
}
 
return srcSliceH;
}
 
static void gbr24ptopacked24(const uint8_t *src[], int srcStride[],
uint8_t *dst, int dstStride, int srcSliceH,
int width)
{
int x, h, i;
for (h = 0; h < srcSliceH; h++) {
uint8_t *dest = dst + dstStride * h;
for (x = 0; x < width; x++) {
*dest++ = src[0][x];
*dest++ = src[1][x];
*dest++ = src[2][x];
}
 
for (i = 0; i < 3; i++)
src[i] += srcStride[i];
}
}
 
static void gbr24ptopacked32(const uint8_t *src[], int srcStride[],
uint8_t *dst, int dstStride, int srcSliceH,
int alpha_first, int width)
{
int x, h, i;
for (h = 0; h < srcSliceH; h++) {
uint8_t *dest = dst + dstStride * h;
 
if (alpha_first) {
for (x = 0; x < width; x++) {
*dest++ = 0xff;
*dest++ = src[0][x];
*dest++ = src[1][x];
*dest++ = src[2][x];
}
} else {
for (x = 0; x < width; x++) {
*dest++ = src[0][x];
*dest++ = src[1][x];
*dest++ = src[2][x];
*dest++ = 0xff;
}
}
 
for (i = 0; i < 3; i++)
src[i] += srcStride[i];
}
}
 
static int planarRgbToRgbWrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int alpha_first = 0;
const uint8_t *src102[] = { src[1], src[0], src[2] };
const uint8_t *src201[] = { src[2], src[0], src[1] };
int stride102[] = { srcStride[1], srcStride[0], srcStride[2] };
int stride201[] = { srcStride[2], srcStride[0], srcStride[1] };
 
if (c->srcFormat != AV_PIX_FMT_GBRP) {
av_log(c, AV_LOG_ERROR, "unsupported planar RGB conversion %s -> %s\n",
av_get_pix_fmt_name(c->srcFormat),
av_get_pix_fmt_name(c->dstFormat));
return srcSliceH;
}
 
switch (c->dstFormat) {
case AV_PIX_FMT_BGR24:
gbr24ptopacked24(src102, stride102,
dst[0] + srcSliceY * dstStride[0], dstStride[0],
srcSliceH, c->srcW);
break;
 
case AV_PIX_FMT_RGB24:
gbr24ptopacked24(src201, stride201,
dst[0] + srcSliceY * dstStride[0], dstStride[0],
srcSliceH, c->srcW);
break;
 
case AV_PIX_FMT_ARGB:
alpha_first = 1;
case AV_PIX_FMT_RGBA:
gbr24ptopacked32(src201, stride201,
dst[0] + srcSliceY * dstStride[0], dstStride[0],
srcSliceH, alpha_first, c->srcW);
break;
 
case AV_PIX_FMT_ABGR:
alpha_first = 1;
case AV_PIX_FMT_BGRA:
gbr24ptopacked32(src102, stride102,
dst[0] + srcSliceY * dstStride[0], dstStride[0],
srcSliceH, alpha_first, c->srcW);
break;
 
default:
av_log(c, AV_LOG_ERROR,
"unsupported planar RGB conversion %s -> %s\n",
av_get_pix_fmt_name(c->srcFormat),
av_get_pix_fmt_name(c->dstFormat));
}
 
return srcSliceH;
}
 
static int planarRgbToplanarRgbWrapper(SwsContext *c,
const uint8_t *src[], int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
dst[0], dstStride[0]);
copyPlane(src[1], srcStride[1], srcSliceY, srcSliceH, c->srcW,
dst[1], dstStride[1]);
copyPlane(src[2], srcStride[2], srcSliceY, srcSliceH, c->srcW,
dst[2], dstStride[2]);
if (dst[3])
fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
 
return srcSliceH;
}
 
static void packedtogbr24p(const uint8_t *src, int srcStride,
uint8_t *dst[], int dstStride[], int srcSliceH,
int alpha_first, int inc_size, int width)
{
uint8_t *dest[3];
int x, h;
 
dest[0] = dst[0];
dest[1] = dst[1];
dest[2] = dst[2];
 
if (alpha_first)
src++;
 
for (h = 0; h < srcSliceH; h++) {
for (x = 0; x < width; x++) {
dest[0][x] = src[0];
dest[1][x] = src[1];
dest[2][x] = src[2];
 
src += inc_size;
}
src += srcStride - width * inc_size;
dest[0] += dstStride[0];
dest[1] += dstStride[1];
dest[2] += dstStride[2];
}
}
 
static int rgbToPlanarRgbWrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int alpha_first = 0;
int stride102[] = { dstStride[1], dstStride[0], dstStride[2] };
int stride201[] = { dstStride[2], dstStride[0], dstStride[1] };
uint8_t *dst102[] = { dst[1] + srcSliceY * dstStride[1],
dst[0] + srcSliceY * dstStride[0],
dst[2] + srcSliceY * dstStride[2] };
uint8_t *dst201[] = { dst[2] + srcSliceY * dstStride[2],
dst[0] + srcSliceY * dstStride[0],
dst[1] + srcSliceY * dstStride[1] };
 
switch (c->srcFormat) {
case AV_PIX_FMT_RGB24:
packedtogbr24p((const uint8_t *) src[0], srcStride[0], dst201,
stride201, srcSliceH, alpha_first, 3, c->srcW);
break;
case AV_PIX_FMT_BGR24:
packedtogbr24p((const uint8_t *) src[0], srcStride[0], dst102,
stride102, srcSliceH, alpha_first, 3, c->srcW);
break;
case AV_PIX_FMT_ARGB:
alpha_first = 1;
case AV_PIX_FMT_RGBA:
packedtogbr24p((const uint8_t *) src[0], srcStride[0], dst201,
stride201, srcSliceH, alpha_first, 4, c->srcW);
break;
case AV_PIX_FMT_ABGR:
alpha_first = 1;
case AV_PIX_FMT_BGRA:
packedtogbr24p((const uint8_t *) src[0], srcStride[0], dst102,
stride102, srcSliceH, alpha_first, 4, c->srcW);
break;
default:
av_log(c, AV_LOG_ERROR,
"unsupported planar RGB conversion %s -> %s\n",
av_get_pix_fmt_name(c->srcFormat),
av_get_pix_fmt_name(c->dstFormat));
}
 
return srcSliceH;
}
 
#define BAYER_GBRG
#define BAYER_8
#define BAYER_RENAME(x) bayer_gbrg8_to_##x
#include "bayer_template.c"
 
#define BAYER_GBRG
#define BAYER_16LE
#define BAYER_RENAME(x) bayer_gbrg16le_to_##x
#include "bayer_template.c"
 
#define BAYER_GBRG
#define BAYER_16BE
#define BAYER_RENAME(x) bayer_gbrg16be_to_##x
#include "bayer_template.c"
 
#define BAYER_GRBG
#define BAYER_8
#define BAYER_RENAME(x) bayer_grbg8_to_##x
#include "bayer_template.c"
 
#define BAYER_GRBG
#define BAYER_16LE
#define BAYER_RENAME(x) bayer_grbg16le_to_##x
#include "bayer_template.c"
 
#define BAYER_GRBG
#define BAYER_16BE
#define BAYER_RENAME(x) bayer_grbg16be_to_##x
#include "bayer_template.c"
 
#define BAYER_BGGR
#define BAYER_8
#define BAYER_RENAME(x) bayer_bggr8_to_##x
#include "bayer_template.c"
 
#define BAYER_BGGR
#define BAYER_16LE
#define BAYER_RENAME(x) bayer_bggr16le_to_##x
#include "bayer_template.c"
 
#define BAYER_BGGR
#define BAYER_16BE
#define BAYER_RENAME(x) bayer_bggr16be_to_##x
#include "bayer_template.c"
 
#define BAYER_RGGB
#define BAYER_8
#define BAYER_RENAME(x) bayer_rggb8_to_##x
#include "bayer_template.c"
 
#define BAYER_RGGB
#define BAYER_16LE
#define BAYER_RENAME(x) bayer_rggb16le_to_##x
#include "bayer_template.c"
 
#define BAYER_RGGB
#define BAYER_16BE
#define BAYER_RENAME(x) bayer_rggb16be_to_##x
#include "bayer_template.c"
 
static int bayer_to_rgb24_wrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[])
{
uint8_t *dstPtr= dst[0];
const uint8_t *srcPtr= src[0];
int i;
void (*copy) (const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int width);
void (*interpolate)(const uint8_t *src, int src_stride, uint8_t *dst, int dst_stride, int width);
 
switch(c->srcFormat) {
#define CASE(pixfmt, prefix) \
case pixfmt: copy = bayer_##prefix##_to_rgb24_copy; \
interpolate = bayer_##prefix##_to_rgb24_interpolate; \
break;
CASE(AV_PIX_FMT_BAYER_BGGR8, bggr8)
CASE(AV_PIX_FMT_BAYER_BGGR16LE, bggr16le)
CASE(AV_PIX_FMT_BAYER_BGGR16BE, bggr16be)
CASE(AV_PIX_FMT_BAYER_RGGB8, rggb8)
CASE(AV_PIX_FMT_BAYER_RGGB16LE, rggb16le)
CASE(AV_PIX_FMT_BAYER_RGGB16BE, rggb16be)
CASE(AV_PIX_FMT_BAYER_GBRG8, gbrg8)
CASE(AV_PIX_FMT_BAYER_GBRG16LE, gbrg16le)
CASE(AV_PIX_FMT_BAYER_GBRG16BE, gbrg16be)
CASE(AV_PIX_FMT_BAYER_GRBG8, grbg8)
CASE(AV_PIX_FMT_BAYER_GRBG16LE, grbg16le)
CASE(AV_PIX_FMT_BAYER_GRBG16BE, grbg16be)
#undef CASE
default: return 0;
}
 
copy(srcPtr, srcStride[0], dstPtr, dstStride[0], c->srcW);
srcPtr += 2 * srcStride[0];
dstPtr += 2 * dstStride[0];
 
for (i = 2; i < srcSliceH - 2; i += 2) {
interpolate(srcPtr, srcStride[0], dstPtr, dstStride[0], c->srcW);
srcPtr += 2 * srcStride[0];
dstPtr += 2 * dstStride[0];
}
 
copy(srcPtr, srcStride[0], dstPtr, dstStride[0], c->srcW);
return srcSliceH;
}
 
static int bayer_to_yv12_wrapper(SwsContext *c, const uint8_t* src[], int srcStride[], int srcSliceY,
int srcSliceH, uint8_t* dst[], int dstStride[])
{
const uint8_t *srcPtr= src[0];
uint8_t *dstY= dst[0];
uint8_t *dstU= dst[1];
uint8_t *dstV= dst[2];
int i;
void (*copy) (const uint8_t *src, int src_stride, uint8_t *dstY, uint8_t *dstU, uint8_t *dstV, int luma_stride, int width, int32_t *rgb2yuv);
void (*interpolate)(const uint8_t *src, int src_stride, uint8_t *dstY, uint8_t *dstU, uint8_t *dstV, int luma_stride, int width, int32_t *rgb2yuv);
 
switch(c->srcFormat) {
#define CASE(pixfmt, prefix) \
case pixfmt: copy = bayer_##prefix##_to_yv12_copy; \
interpolate = bayer_##prefix##_to_yv12_interpolate; \
break;
CASE(AV_PIX_FMT_BAYER_BGGR8, bggr8)
CASE(AV_PIX_FMT_BAYER_BGGR16LE, bggr16le)
CASE(AV_PIX_FMT_BAYER_BGGR16BE, bggr16be)
CASE(AV_PIX_FMT_BAYER_RGGB8, rggb8)
CASE(AV_PIX_FMT_BAYER_RGGB16LE, rggb16le)
CASE(AV_PIX_FMT_BAYER_RGGB16BE, rggb16be)
CASE(AV_PIX_FMT_BAYER_GBRG8, gbrg8)
CASE(AV_PIX_FMT_BAYER_GBRG16LE, gbrg16le)
CASE(AV_PIX_FMT_BAYER_GBRG16BE, gbrg16be)
CASE(AV_PIX_FMT_BAYER_GRBG8, grbg8)
CASE(AV_PIX_FMT_BAYER_GRBG16LE, grbg16le)
CASE(AV_PIX_FMT_BAYER_GRBG16BE, grbg16be)
#undef CASE
default: return 0;
}
 
copy(srcPtr, srcStride[0], dstY, dstU, dstV, dstStride[0], c->srcW, c->input_rgb2yuv_table);
srcPtr += 2 * srcStride[0];
dstY += 2 * dstStride[0];
dstU += dstStride[1];
dstV += dstStride[1];
 
for (i = 2; i < srcSliceH - 2; i += 2) {
interpolate(srcPtr, srcStride[0], dstY, dstU, dstV, dstStride[0], c->srcW, c->input_rgb2yuv_table);
srcPtr += 2 * srcStride[0];
dstY += 2 * dstStride[0];
dstU += dstStride[1];
dstV += dstStride[1];
}
 
copy(srcPtr, srcStride[0], dstY, dstU, dstV, dstStride[0], c->srcW, c->input_rgb2yuv_table);
return srcSliceH;
}
 
#define isRGBA32(x) ( \
(x) == AV_PIX_FMT_ARGB \
|| (x) == AV_PIX_FMT_RGBA \
|| (x) == AV_PIX_FMT_BGRA \
|| (x) == AV_PIX_FMT_ABGR \
)
 
#define isRGBA64(x) ( \
(x) == AV_PIX_FMT_RGBA64LE \
|| (x) == AV_PIX_FMT_RGBA64BE \
|| (x) == AV_PIX_FMT_BGRA64LE \
|| (x) == AV_PIX_FMT_BGRA64BE \
)
 
#define isRGB48(x) ( \
(x) == AV_PIX_FMT_RGB48LE \
|| (x) == AV_PIX_FMT_RGB48BE \
|| (x) == AV_PIX_FMT_BGR48LE \
|| (x) == AV_PIX_FMT_BGR48BE \
)
 
/* {RGB,BGR}{15,16,24,32,32_1} -> {RGB,BGR}{15,16,24,32} */
typedef void (* rgbConvFn) (const uint8_t *, uint8_t *, int);
static rgbConvFn findRgbConvFn(SwsContext *c)
{
const enum AVPixelFormat srcFormat = c->srcFormat;
const enum AVPixelFormat dstFormat = c->dstFormat;
const int srcId = c->srcFormatBpp;
const int dstId = c->dstFormatBpp;
rgbConvFn conv = NULL;
 
#define IS_NOT_NE(bpp, desc) \
(((bpp + 7) >> 3) == 2 && \
(!(desc->flags & AV_PIX_FMT_FLAG_BE) != !HAVE_BIGENDIAN))
 
#define CONV_IS(src, dst) (srcFormat == AV_PIX_FMT_##src && dstFormat == AV_PIX_FMT_##dst)
 
if (isRGBA32(srcFormat) && isRGBA32(dstFormat)) {
if ( CONV_IS(ABGR, RGBA)
|| CONV_IS(ARGB, BGRA)
|| CONV_IS(BGRA, ARGB)
|| CONV_IS(RGBA, ABGR)) conv = shuffle_bytes_3210;
else if (CONV_IS(ABGR, ARGB)
|| CONV_IS(ARGB, ABGR)) conv = shuffle_bytes_0321;
else if (CONV_IS(ABGR, BGRA)
|| CONV_IS(ARGB, RGBA)) conv = shuffle_bytes_1230;
else if (CONV_IS(BGRA, RGBA)
|| CONV_IS(RGBA, BGRA)) conv = shuffle_bytes_2103;
else if (CONV_IS(BGRA, ABGR)
|| CONV_IS(RGBA, ARGB)) conv = shuffle_bytes_3012;
} else if (isRGB48(srcFormat) && isRGB48(dstFormat)) {
if (CONV_IS(RGB48LE, BGR48LE)
|| CONV_IS(BGR48LE, RGB48LE)
|| CONV_IS(RGB48BE, BGR48BE)
|| CONV_IS(BGR48BE, RGB48BE)) conv = rgb48tobgr48_nobswap;
else if (CONV_IS(RGB48LE, BGR48BE)
|| CONV_IS(BGR48LE, RGB48BE)
|| CONV_IS(RGB48BE, BGR48LE)
|| CONV_IS(BGR48BE, RGB48LE)) conv = rgb48tobgr48_bswap;
} else if (isRGBA64(srcFormat) && isRGB48(dstFormat)) {
if (CONV_IS(RGBA64LE, BGR48LE)
|| CONV_IS(BGRA64LE, RGB48LE)
|| CONV_IS(RGBA64BE, BGR48BE)
|| CONV_IS(BGRA64BE, RGB48BE)) conv = rgb64tobgr48_nobswap;
else if (CONV_IS(RGBA64LE, BGR48BE)
|| CONV_IS(BGRA64LE, RGB48BE)
|| CONV_IS(RGBA64BE, BGR48LE)
|| CONV_IS(BGRA64BE, RGB48LE)) conv = rgb64tobgr48_bswap;
else if (CONV_IS(RGBA64LE, RGB48LE)
|| CONV_IS(BGRA64LE, BGR48LE)
|| CONV_IS(RGBA64BE, RGB48BE)
|| CONV_IS(BGRA64BE, BGR48BE)) conv = rgb64to48_nobswap;
else if (CONV_IS(RGBA64LE, RGB48BE)
|| CONV_IS(BGRA64LE, BGR48BE)
|| CONV_IS(RGBA64BE, RGB48LE)
|| CONV_IS(BGRA64BE, BGR48LE)) conv = rgb64to48_bswap;
} else
/* BGR -> BGR */
if ((isBGRinInt(srcFormat) && isBGRinInt(dstFormat)) ||
(isRGBinInt(srcFormat) && isRGBinInt(dstFormat))) {
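        /* The switch key packs the source bits-per-pixel in the low 16 bits and
         * the destination bits-per-pixel in the high 16 bits, e.g. 0x000F0010
         * selects the 16 bpp -> 15 bpp converter. */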
switch (srcId | (dstId << 16)) {
case 0x000F000C: conv = rgb12to15; break;
case 0x000F0010: conv = rgb16to15; break;
case 0x000F0018: conv = rgb24to15; break;
case 0x000F0020: conv = rgb32to15; break;
case 0x0010000F: conv = rgb15to16; break;
case 0x00100018: conv = rgb24to16; break;
case 0x00100020: conv = rgb32to16; break;
case 0x0018000F: conv = rgb15to24; break;
case 0x00180010: conv = rgb16to24; break;
case 0x00180020: conv = rgb32to24; break;
case 0x0020000F: conv = rgb15to32; break;
case 0x00200010: conv = rgb16to32; break;
case 0x00200018: conv = rgb24to32; break;
}
} else if ((isBGRinInt(srcFormat) && isRGBinInt(dstFormat)) ||
(isRGBinInt(srcFormat) && isBGRinInt(dstFormat))) {
switch (srcId | (dstId << 16)) {
case 0x000C000C: conv = rgb12tobgr12; break;
case 0x000F000F: conv = rgb15tobgr15; break;
case 0x000F0010: conv = rgb16tobgr15; break;
case 0x000F0018: conv = rgb24tobgr15; break;
case 0x000F0020: conv = rgb32tobgr15; break;
case 0x0010000F: conv = rgb15tobgr16; break;
case 0x00100010: conv = rgb16tobgr16; break;
case 0x00100018: conv = rgb24tobgr16; break;
case 0x00100020: conv = rgb32tobgr16; break;
case 0x0018000F: conv = rgb15tobgr24; break;
case 0x00180010: conv = rgb16tobgr24; break;
case 0x00180018: conv = rgb24tobgr24; break;
case 0x00180020: conv = rgb32tobgr24; break;
case 0x0020000F: conv = rgb15tobgr32; break;
case 0x00200010: conv = rgb16tobgr32; break;
case 0x00200018: conv = rgb24tobgr32; break;
}
}
 
if ((dstFormat == AV_PIX_FMT_RGB32_1 || dstFormat == AV_PIX_FMT_BGR32_1) && !isRGBA32(srcFormat) && ALT32_CORR<0)
return NULL;
 
// Maintain symmetry between endiannesses
if (c->flags & SWS_BITEXACT)
if ((dstFormat == AV_PIX_FMT_RGB32 || dstFormat == AV_PIX_FMT_BGR32 ) && !isRGBA32(srcFormat) && ALT32_CORR>0)
return NULL;
 
return conv;
}
 
/* {RGB,BGR}{15,16,24,32,32_1} -> {RGB,BGR}{15,16,24,32} */
static int rgbToRgbWrapper(SwsContext *c, const uint8_t *src[], int srcStride[],
int srcSliceY, int srcSliceH, uint8_t *dst[],
int dstStride[])
 
{
const enum AVPixelFormat srcFormat = c->srcFormat;
const enum AVPixelFormat dstFormat = c->dstFormat;
const AVPixFmtDescriptor *desc_src = av_pix_fmt_desc_get(c->srcFormat);
const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(c->dstFormat);
const int srcBpp = (c->srcFormatBpp + 7) >> 3;
const int dstBpp = (c->dstFormatBpp + 7) >> 3;
rgbConvFn conv = findRgbConvFn(c);
 
if (!conv) {
av_log(c, AV_LOG_ERROR, "internal error %s -> %s converter\n",
av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat));
} else {
const uint8_t *srcPtr = src[0];
uint8_t *dstPtr = dst[0];
int src_bswap = IS_NOT_NE(c->srcFormatBpp, desc_src);
int dst_bswap = IS_NOT_NE(c->dstFormatBpp, desc_dst);
 
if ((srcFormat == AV_PIX_FMT_RGB32_1 || srcFormat == AV_PIX_FMT_BGR32_1) &&
!isRGBA32(dstFormat))
srcPtr += ALT32_CORR;
 
if ((dstFormat == AV_PIX_FMT_RGB32_1 || dstFormat == AV_PIX_FMT_BGR32_1) &&
!isRGBA32(srcFormat)) {
int i;
av_assert0(ALT32_CORR == 1);
for (i = 0; i < srcSliceH; i++)
dstPtr[dstStride[0] * (srcSliceY + i)] = 255;
dstPtr += ALT32_CORR;
}
 
if (dstStride[0] * srcBpp == srcStride[0] * dstBpp && srcStride[0] > 0 &&
!(srcStride[0] % srcBpp) && !dst_bswap && !src_bswap)
conv(srcPtr, dstPtr + dstStride[0] * srcSliceY,
(srcSliceH - 1) * srcStride[0] + c->srcW * srcBpp);
else {
int i, j;
dstPtr += dstStride[0] * srcSliceY;
 
for (i = 0; i < srcSliceH; i++) {
if(src_bswap) {
for(j=0; j<c->srcW; j++)
((uint16_t*)c->formatConvBuffer)[j] = av_bswap16(((uint16_t*)srcPtr)[j]);
conv(c->formatConvBuffer, dstPtr, c->srcW * srcBpp);
}else
conv(srcPtr, dstPtr, c->srcW * srcBpp);
if(dst_bswap)
for(j=0; j<c->srcW; j++)
((uint16_t*)dstPtr)[j] = av_bswap16(((uint16_t*)dstPtr)[j]);
srcPtr += srcStride[0];
dstPtr += dstStride[0];
}
}
}
return srcSliceH;
}
 
static int bgr24ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
ff_rgb24toyv12(
src[0],
dst[0] + srcSliceY * dstStride[0],
dst[1] + (srcSliceY >> 1) * dstStride[1],
dst[2] + (srcSliceY >> 1) * dstStride[2],
c->srcW, srcSliceH,
dstStride[0], dstStride[1], srcStride[0],
c->input_rgb2yuv_table);
if (dst[3])
fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
return srcSliceH;
}
 
static int yvu9ToYv12Wrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
copyPlane(src[0], srcStride[0], srcSliceY, srcSliceH, c->srcW,
dst[0], dstStride[0]);
 
planar2x(src[1], dst[1] + dstStride[1] * (srcSliceY >> 1), c->chrSrcW,
srcSliceH >> 2, srcStride[1], dstStride[1]);
planar2x(src[2], dst[2] + dstStride[2] * (srcSliceY >> 1), c->chrSrcW,
srcSliceH >> 2, srcStride[2], dstStride[2]);
if (dst[3])
fillPlane(dst[3], dstStride[3], c->srcW, srcSliceH, srcSliceY, 255);
return srcSliceH;
}
 
/* unscaled copy-like functions (assume nearly identical formats) */
static int packedCopyWrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
if (dstStride[0] == srcStride[0] && srcStride[0] > 0)
memcpy(dst[0] + dstStride[0] * srcSliceY, src[0], srcSliceH * dstStride[0]);
else {
int i;
const uint8_t *srcPtr = src[0];
uint8_t *dstPtr = dst[0] + dstStride[0] * srcSliceY;
int length = 0;
 
/* universal length finder */
while (length + c->srcW <= FFABS(dstStride[0]) &&
length + c->srcW <= FFABS(srcStride[0]))
length += c->srcW;
av_assert1(length != 0);
 
for (i = 0; i < srcSliceH; i++) {
memcpy(dstPtr, srcPtr, length);
srcPtr += srcStride[0];
dstPtr += dstStride[0];
}
}
return srcSliceH;
}
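
/* DITHER_COPY: copy a plane while reducing its bit depth, adding one row of an
 * 8x8 ordered-dither matrix before the scale-and-shift so banding is broken
 * up. 'bswap' and 'dbswap' are hooks for endianness conversion on the source
 * and destination side respectively. */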
 
#define DITHER_COPY(dst, dstStride, src, srcStride, bswap, dbswap)\
uint16_t scale= dither_scale[dst_depth-1][src_depth-1];\
int shift= src_depth-dst_depth + dither_scale[src_depth-2][dst_depth-1];\
for (i = 0; i < height; i++) {\
const uint8_t *dither= dithers[src_depth-9][i&7];\
for (j = 0; j < length-7; j+=8){\
dst[j+0] = dbswap((bswap(src[j+0]) + dither[0])*scale>>shift);\
dst[j+1] = dbswap((bswap(src[j+1]) + dither[1])*scale>>shift);\
dst[j+2] = dbswap((bswap(src[j+2]) + dither[2])*scale>>shift);\
dst[j+3] = dbswap((bswap(src[j+3]) + dither[3])*scale>>shift);\
dst[j+4] = dbswap((bswap(src[j+4]) + dither[4])*scale>>shift);\
dst[j+5] = dbswap((bswap(src[j+5]) + dither[5])*scale>>shift);\
dst[j+6] = dbswap((bswap(src[j+6]) + dither[6])*scale>>shift);\
dst[j+7] = dbswap((bswap(src[j+7]) + dither[7])*scale>>shift);\
}\
for (; j < length; j++)\
dst[j] = dbswap((bswap(src[j]) + dither[j&7])*scale>>shift);\
dst += dstStride;\
src += srcStride;\
}
 
static int planarCopyWrapper(SwsContext *c, const uint8_t *src[],
int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
const AVPixFmtDescriptor *desc_src = av_pix_fmt_desc_get(c->srcFormat);
const AVPixFmtDescriptor *desc_dst = av_pix_fmt_desc_get(c->dstFormat);
int plane, i, j;
for (plane = 0; plane < 4; plane++) {
int length = (plane == 0 || plane == 3) ? c->srcW : FF_CEIL_RSHIFT(c->srcW, c->chrDstHSubSample);
int y = (plane == 0 || plane == 3) ? srcSliceY: FF_CEIL_RSHIFT(srcSliceY, c->chrDstVSubSample);
int height = (plane == 0 || plane == 3) ? srcSliceH: FF_CEIL_RSHIFT(srcSliceH, c->chrDstVSubSample);
const uint8_t *srcPtr = src[plane];
uint8_t *dstPtr = dst[plane] + dstStride[plane] * y;
int shiftonly = plane == 1 || plane == 2 || (!c->srcRange && plane == 0);
 
if (!dst[plane])
continue;
// ignore palette for GRAY8
if (plane == 1 && !dst[2]) continue;
if (!src[plane] || (plane == 1 && !src[2])) {
if (is16BPS(c->dstFormat) || isNBPS(c->dstFormat)) {
fillPlane16(dst[plane], dstStride[plane], length, height, y,
plane == 3, desc_dst->comp[plane].depth_minus1,
isBE(c->dstFormat));
} else {
fillPlane(dst[plane], dstStride[plane], length, height, y,
(plane == 3) ? 255 : 128);
}
} else {
if(isNBPS(c->srcFormat) || isNBPS(c->dstFormat)
|| (is16BPS(c->srcFormat) != is16BPS(c->dstFormat))
) {
const int src_depth = desc_src->comp[plane].depth_minus1 + 1;
const int dst_depth = desc_dst->comp[plane].depth_minus1 + 1;
const uint16_t *srcPtr2 = (const uint16_t *) srcPtr;
uint16_t *dstPtr2 = (uint16_t*)dstPtr;
 
if (dst_depth == 8) {
if(isBE(c->srcFormat) == HAVE_BIGENDIAN){
DITHER_COPY(dstPtr, dstStride[plane], srcPtr2, srcStride[plane]/2, , )
} else {
DITHER_COPY(dstPtr, dstStride[plane], srcPtr2, srcStride[plane]/2, av_bswap16, )
}
} else if (src_depth == 8) {
for (i = 0; i < height; i++) {
#define COPY816(w)\
if (shiftonly) {\
for (j = 0; j < length; j++)\
w(&dstPtr2[j], srcPtr[j]<<(dst_depth-8));\
} else {\
for (j = 0; j < length; j++)\
w(&dstPtr2[j], (srcPtr[j]<<(dst_depth-8)) |\
(srcPtr[j]>>(2*8-dst_depth)));\
}
if(isBE(c->dstFormat)){
COPY816(AV_WB16)
} else {
COPY816(AV_WL16)
}
dstPtr2 += dstStride[plane]/2;
srcPtr += srcStride[plane];
}
} else if (src_depth <= dst_depth) {
for (i = 0; i < height; i++) {
j = 0;
if(isBE(c->srcFormat) == HAVE_BIGENDIAN &&
isBE(c->dstFormat) == HAVE_BIGENDIAN &&
shiftonly) {
unsigned shift = dst_depth - src_depth;
#if HAVE_FAST_64BIT
#define FAST_COPY_UP(shift) \
for (; j < length - 3; j += 4) { \
uint64_t v = AV_RN64A(srcPtr2 + j); \
AV_WN64A(dstPtr2 + j, v << shift); \
}
#else
#define FAST_COPY_UP(shift) \
for (; j < length - 1; j += 2) { \
uint32_t v = AV_RN32A(srcPtr2 + j); \
AV_WN32A(dstPtr2 + j, v << shift); \
}
#endif
switch (shift)
{
case 6: FAST_COPY_UP(6); break;
case 7: FAST_COPY_UP(7); break;
}
}
#define COPY_UP(r,w) \
if(shiftonly){\
for (; j < length; j++){ \
unsigned int v= r(&srcPtr2[j]);\
w(&dstPtr2[j], v<<(dst_depth-src_depth));\
}\
}else{\
for (; j < length; j++){ \
unsigned int v= r(&srcPtr2[j]);\
w(&dstPtr2[j], (v<<(dst_depth-src_depth)) | \
(v>>(2*src_depth-dst_depth)));\
}\
}
if(isBE(c->srcFormat)){
if(isBE(c->dstFormat)){
COPY_UP(AV_RB16, AV_WB16)
} else {
COPY_UP(AV_RB16, AV_WL16)
}
} else {
if(isBE(c->dstFormat)){
COPY_UP(AV_RL16, AV_WB16)
} else {
COPY_UP(AV_RL16, AV_WL16)
}
}
dstPtr2 += dstStride[plane]/2;
srcPtr2 += srcStride[plane]/2;
}
} else {
if(isBE(c->srcFormat) == HAVE_BIGENDIAN){
if(isBE(c->dstFormat) == HAVE_BIGENDIAN){
DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, , )
} else {
DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, , av_bswap16)
}
}else{
if(isBE(c->dstFormat) == HAVE_BIGENDIAN){
DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, av_bswap16, )
} else {
DITHER_COPY(dstPtr2, dstStride[plane]/2, srcPtr2, srcStride[plane]/2, av_bswap16, av_bswap16)
}
}
}
} else if (is16BPS(c->srcFormat) && is16BPS(c->dstFormat) &&
isBE(c->srcFormat) != isBE(c->dstFormat)) {
 
for (i = 0; i < height; i++) {
for (j = 0; j < length; j++)
((uint16_t *) dstPtr)[j] = av_bswap16(((const uint16_t *) srcPtr)[j]);
srcPtr += srcStride[plane];
dstPtr += dstStride[plane];
}
} else if (dstStride[plane] == srcStride[plane] &&
srcStride[plane] > 0 && srcStride[plane] == length) {
memcpy(dst[plane] + dstStride[plane] * y, src[plane],
height * dstStride[plane]);
} else {
if (is16BPS(c->srcFormat) && is16BPS(c->dstFormat))
length *= 2;
else if (!desc_src->comp[0].depth_minus1)
length >>= 3; // monowhite/black
for (i = 0; i < height; i++) {
memcpy(dstPtr, srcPtr, length);
srcPtr += srcStride[plane];
dstPtr += dstStride[plane];
}
}
}
}
return srcSliceH;
}
 
 
#define IS_DIFFERENT_ENDIANESS(src_fmt, dst_fmt, pix_fmt) \
((src_fmt == pix_fmt ## BE && dst_fmt == pix_fmt ## LE) || \
(src_fmt == pix_fmt ## LE && dst_fmt == pix_fmt ## BE))
 
 
void ff_get_unscaled_swscale(SwsContext *c)
{
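    /* Probe the special-case converters for unscaled conversions. Each
     * matching test below overwrites c->swscale, so when several apply,
     * the last one wins. */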
const enum AVPixelFormat srcFormat = c->srcFormat;
const enum AVPixelFormat dstFormat = c->dstFormat;
const int flags = c->flags;
const int dstH = c->dstH;
int needsDither;
 
needsDither = isAnyRGB(dstFormat) &&
c->dstFormatBpp < 24 &&
(c->dstFormatBpp < c->srcFormatBpp || (!isAnyRGB(srcFormat)));
 
/* yv12_to_nv12 */
if ((srcFormat == AV_PIX_FMT_YUV420P || srcFormat == AV_PIX_FMT_YUVA420P) &&
(dstFormat == AV_PIX_FMT_NV12 || dstFormat == AV_PIX_FMT_NV21)) {
c->swscale = planarToNv12Wrapper;
}
/* nv12_to_yv12 */
if (dstFormat == AV_PIX_FMT_YUV420P &&
(srcFormat == AV_PIX_FMT_NV12 || srcFormat == AV_PIX_FMT_NV21)) {
c->swscale = nv12ToPlanarWrapper;
}
/* yuv2bgr */
if ((srcFormat == AV_PIX_FMT_YUV420P || srcFormat == AV_PIX_FMT_YUV422P ||
srcFormat == AV_PIX_FMT_YUVA420P) && isAnyRGB(dstFormat) &&
!(flags & SWS_ACCURATE_RND) && (c->dither == SWS_DITHER_BAYER || c->dither == SWS_DITHER_AUTO) && !(dstH & 1)) {
c->swscale = ff_yuv2rgb_get_func_ptr(c);
}
 
if (srcFormat == AV_PIX_FMT_YUV410P && !(dstH & 3) &&
(dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
!(flags & SWS_BITEXACT)) {
c->swscale = yvu9ToYv12Wrapper;
}
 
/* bgr24toYV12 */
if (srcFormat == AV_PIX_FMT_BGR24 &&
(dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P) &&
!(flags & SWS_ACCURATE_RND))
c->swscale = bgr24ToYv12Wrapper;
 
/* RGB/BGR -> RGB/BGR (no dither needed forms) */
if (isAnyRGB(srcFormat) && isAnyRGB(dstFormat) && findRgbConvFn(c)
&& (!needsDither || (c->flags&(SWS_FAST_BILINEAR|SWS_POINT))))
c->swscale = rgbToRgbWrapper;
 
/* RGB to planar RGB */
if ((srcFormat == AV_PIX_FMT_GBRP && dstFormat == AV_PIX_FMT_GBRAP) ||
(srcFormat == AV_PIX_FMT_GBRAP && dstFormat == AV_PIX_FMT_GBRP))
c->swscale = planarRgbToplanarRgbWrapper;
 
#define isByteRGB(f) ( \
f == AV_PIX_FMT_RGB32 || \
f == AV_PIX_FMT_RGB32_1 || \
f == AV_PIX_FMT_RGB24 || \
f == AV_PIX_FMT_BGR32 || \
f == AV_PIX_FMT_BGR32_1 || \
f == AV_PIX_FMT_BGR24)
 
if (srcFormat == AV_PIX_FMT_GBRP && isPlanar(srcFormat) && isByteRGB(dstFormat))
c->swscale = planarRgbToRgbWrapper;
 
if ((srcFormat == AV_PIX_FMT_RGB48LE || srcFormat == AV_PIX_FMT_RGB48BE ||
srcFormat == AV_PIX_FMT_BGR48LE || srcFormat == AV_PIX_FMT_BGR48BE ||
srcFormat == AV_PIX_FMT_RGBA64LE || srcFormat == AV_PIX_FMT_RGBA64BE ||
srcFormat == AV_PIX_FMT_BGRA64LE || srcFormat == AV_PIX_FMT_BGRA64BE) &&
(dstFormat == AV_PIX_FMT_GBRP9LE || dstFormat == AV_PIX_FMT_GBRP9BE ||
dstFormat == AV_PIX_FMT_GBRP10LE || dstFormat == AV_PIX_FMT_GBRP10BE ||
dstFormat == AV_PIX_FMT_GBRP12LE || dstFormat == AV_PIX_FMT_GBRP12BE ||
dstFormat == AV_PIX_FMT_GBRP14LE || dstFormat == AV_PIX_FMT_GBRP14BE ||
dstFormat == AV_PIX_FMT_GBRP16LE || dstFormat == AV_PIX_FMT_GBRP16BE ||
dstFormat == AV_PIX_FMT_GBRAP16LE || dstFormat == AV_PIX_FMT_GBRAP16BE ))
c->swscale = Rgb16ToPlanarRgb16Wrapper;
 
if ((srcFormat == AV_PIX_FMT_GBRP9LE || srcFormat == AV_PIX_FMT_GBRP9BE ||
srcFormat == AV_PIX_FMT_GBRP16LE || srcFormat == AV_PIX_FMT_GBRP16BE ||
srcFormat == AV_PIX_FMT_GBRP10LE || srcFormat == AV_PIX_FMT_GBRP10BE ||
srcFormat == AV_PIX_FMT_GBRP12LE || srcFormat == AV_PIX_FMT_GBRP12BE ||
srcFormat == AV_PIX_FMT_GBRP14LE || srcFormat == AV_PIX_FMT_GBRP14BE ||
srcFormat == AV_PIX_FMT_GBRAP16LE || srcFormat == AV_PIX_FMT_GBRAP16BE) &&
(dstFormat == AV_PIX_FMT_RGB48LE || dstFormat == AV_PIX_FMT_RGB48BE ||
dstFormat == AV_PIX_FMT_BGR48LE || dstFormat == AV_PIX_FMT_BGR48BE ||
dstFormat == AV_PIX_FMT_RGBA64LE || dstFormat == AV_PIX_FMT_RGBA64BE ||
dstFormat == AV_PIX_FMT_BGRA64LE || dstFormat == AV_PIX_FMT_BGRA64BE))
c->swscale = planarRgb16ToRgb16Wrapper;
 
if (av_pix_fmt_desc_get(srcFormat)->comp[0].depth_minus1 == 7 &&
isPackedRGB(srcFormat) && dstFormat == AV_PIX_FMT_GBRP)
c->swscale = rgbToPlanarRgbWrapper;
 
if (isBayer(srcFormat)) {
if (dstFormat == AV_PIX_FMT_RGB24)
c->swscale = bayer_to_rgb24_wrapper;
else if (dstFormat == AV_PIX_FMT_YUV420P)
c->swscale = bayer_to_yv12_wrapper;
else if (!isBayer(dstFormat)) {
av_log(c, AV_LOG_ERROR, "unsupported bayer conversion\n");
av_assert0(0);
}
}
 
/* bswap 16 bits per pixel/component packed formats */
if (IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BAYER_BGGR16) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BAYER_RGGB16) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BAYER_GBRG16) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BAYER_GRBG16) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGR444) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGR48) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGRA64) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGR555) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_BGR565) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_GRAY16) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YA16) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_AYUV64) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_GBRP9) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_GBRP10) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_GBRP12) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_GBRP14) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_GBRP16) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_GBRAP16) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGB444) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGB48) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGBA64) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGB555) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_RGB565) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_XYZ12) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV420P9) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV420P10) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV420P12) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV420P14) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV420P16) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV422P9) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV422P10) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV422P12) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV422P14) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV422P16) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV440P10) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV440P12) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV444P9) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV444P10) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV444P12) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV444P14) ||
IS_DIFFERENT_ENDIANESS(srcFormat, dstFormat, AV_PIX_FMT_YUV444P16))
c->swscale = packed_16bpc_bswap;
 
if (usePal(srcFormat) && isByteRGB(dstFormat))
c->swscale = palToRgbWrapper;
 
if (srcFormat == AV_PIX_FMT_YUV422P) {
if (dstFormat == AV_PIX_FMT_YUYV422)
c->swscale = yuv422pToYuy2Wrapper;
else if (dstFormat == AV_PIX_FMT_UYVY422)
c->swscale = yuv422pToUyvyWrapper;
}
 
/* LQ converters if -sws 0 or -sws 4*/
if (c->flags&(SWS_FAST_BILINEAR|SWS_POINT)) {
/* yv12_to_yuy2 */
if (srcFormat == AV_PIX_FMT_YUV420P || srcFormat == AV_PIX_FMT_YUVA420P) {
if (dstFormat == AV_PIX_FMT_YUYV422)
c->swscale = planarToYuy2Wrapper;
else if (dstFormat == AV_PIX_FMT_UYVY422)
c->swscale = planarToUyvyWrapper;
}
}
if (srcFormat == AV_PIX_FMT_YUYV422 &&
(dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))
c->swscale = yuyvToYuv420Wrapper;
if (srcFormat == AV_PIX_FMT_UYVY422 &&
(dstFormat == AV_PIX_FMT_YUV420P || dstFormat == AV_PIX_FMT_YUVA420P))
c->swscale = uyvyToYuv420Wrapper;
if (srcFormat == AV_PIX_FMT_YUYV422 && dstFormat == AV_PIX_FMT_YUV422P)
c->swscale = yuyvToYuv422Wrapper;
if (srcFormat == AV_PIX_FMT_UYVY422 && dstFormat == AV_PIX_FMT_YUV422P)
c->swscale = uyvyToYuv422Wrapper;
 
#define isPlanarGray(x) (isGray(x) && (x) != AV_PIX_FMT_YA8 && (x) != AV_PIX_FMT_YA16LE && (x) != AV_PIX_FMT_YA16BE)
/* simple copy */
if ( srcFormat == dstFormat ||
(srcFormat == AV_PIX_FMT_YUVA420P && dstFormat == AV_PIX_FMT_YUV420P) ||
(srcFormat == AV_PIX_FMT_YUV420P && dstFormat == AV_PIX_FMT_YUVA420P) ||
(isPlanarYUV(srcFormat) && isPlanarGray(dstFormat)) ||
(isPlanarYUV(dstFormat) && isPlanarGray(srcFormat)) ||
(isPlanarGray(dstFormat) && isPlanarGray(srcFormat)) ||
(isPlanarYUV(srcFormat) && isPlanarYUV(dstFormat) &&
c->chrDstHSubSample == c->chrSrcHSubSample &&
c->chrDstVSubSample == c->chrSrcVSubSample &&
dstFormat != AV_PIX_FMT_NV12 && dstFormat != AV_PIX_FMT_NV21 &&
srcFormat != AV_PIX_FMT_NV12 && srcFormat != AV_PIX_FMT_NV21))
{
if (isPacked(c->srcFormat))
c->swscale = packedCopyWrapper;
else /* Planar YUV or gray */
c->swscale = planarCopyWrapper;
}
 
if (ARCH_PPC)
ff_get_unscaled_swscale_ppc(c);
// if (ARCH_ARM)
// ff_get_unscaled_swscale_arm(c);
}
 
/* Convert 8-bit palettized pixels to the packed 32-bit format of the palette entries */
void sws_convertPalette8ToPacked32(const uint8_t *src, uint8_t *dst,
int num_pixels, const uint8_t *palette)
{
int i;
 
for (i = 0; i < num_pixels; i++)
((uint32_t *) dst)[i] = ((const uint32_t *) palette)[src[i]];
}
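
/* Example (hypothetical variable names): expanding one PAL8 row, with the
 * 256-entry 32-bit palette that FFmpeg stores in frame->data[1]:
 *     sws_convertPalette8ToPacked32(src_row, dst_row, width, frame->data[1]);
 */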
 
/* Palette format: ABCD -> dst format: ABC */
void sws_convertPalette8ToPacked24(const uint8_t *src, uint8_t *dst,
int num_pixels, const uint8_t *palette)
{
int i;
 
for (i = 0; i < num_pixels; i++) {
//FIXME slow?
dst[0] = palette[src[i] * 4 + 0];
dst[1] = palette[src[i] * 4 + 1];
dst[2] = palette[src[i] * 4 + 2];
dst += 3;
}
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/swscaleres.rc
0,0 → 1,55
/*
* Windows resource file for libswscale
*
* Copyright (C) 2012 James Almer
* Copyright (C) 2013 Tiancheng "Timothy" Gu
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <windows.h>
#include "libswscale/version.h"
#include "libavutil/ffversion.h"
#include "config.h"
 
1 VERSIONINFO
FILEVERSION LIBSWSCALE_VERSION_MAJOR, LIBSWSCALE_VERSION_MINOR, LIBSWSCALE_VERSION_MICRO, 0
PRODUCTVERSION LIBSWSCALE_VERSION_MAJOR, LIBSWSCALE_VERSION_MINOR, LIBSWSCALE_VERSION_MICRO, 0
FILEFLAGSMASK VS_FFI_FILEFLAGSMASK
FILEOS VOS_NT_WINDOWS32
FILETYPE VFT_DLL
{
BLOCK "StringFileInfo"
{
BLOCK "040904B0"
{
VALUE "CompanyName", "FFmpeg Project"
VALUE "FileDescription", "FFmpeg image rescaling library"
VALUE "FileVersion", AV_STRINGIFY(LIBSWSCALE_VERSION)
VALUE "InternalName", "libswscale"
VALUE "LegalCopyright", "Copyright (C) 2000-" AV_STRINGIFY(CONFIG_THIS_YEAR) " FFmpeg Project"
VALUE "OriginalFilename", "swscale" BUILDSUF "-" AV_STRINGIFY(LIBSWSCALE_VERSION_MAJOR) SLIBSUF
VALUE "ProductName", "FFmpeg"
VALUE "ProductVersion", FFMPEG_VERSION
}
}
 
BLOCK "VarFileInfo"
{
VALUE "Translation", 0x0409, 0x04B0
}
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/utils.c
0,0 → 1,2386
/*
* Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "config.h"
 
#define _SVID_SOURCE // needed for MAP_ANONYMOUS
#define _DARWIN_C_SOURCE // needed for MAP_ANON
#include <inttypes.h>
#include <math.h>
#include <stdio.h>
#include <string.h>
#if HAVE_SYS_MMAN_H
#include <sys/mman.h>
#if defined(MAP_ANON) && !defined(MAP_ANONYMOUS)
#define MAP_ANONYMOUS MAP_ANON
#endif
#endif
#if HAVE_VIRTUALALLOC
#define WIN32_LEAN_AND_MEAN
#include <windows.h>
#endif
 
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/avutil.h"
#include "libavutil/bswap.h"
#include "libavutil/cpu.h"
#include "libavutil/imgutils.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/mathematics.h"
#include "libavutil/opt.h"
#include "libavutil/pixdesc.h"
#include "libavutil/ppc/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
 
static void handle_formats(SwsContext *c);
 
unsigned swscale_version(void)
{
av_assert0(LIBSWSCALE_VERSION_MICRO >= 100);
return LIBSWSCALE_VERSION_INT;
}
 
const char *swscale_configuration(void)
{
return FFMPEG_CONFIGURATION;
}
 
const char *swscale_license(void)
{
#define LICENSE_PREFIX "libswscale license: "
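    /* Compile-time string concatenation embeds the full "libswscale license: ..."
     * string in the binary; the pointer offset then skips the prefix so only
     * the license name is returned. */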
return LICENSE_PREFIX FFMPEG_LICENSE + sizeof(LICENSE_PREFIX) - 1;
}
 
typedef struct FormatEntry {
uint8_t is_supported_in :1;
uint8_t is_supported_out :1;
uint8_t is_supported_endianness :1;
} FormatEntry;
 
static const FormatEntry format_entries[AV_PIX_FMT_NB] = {
[AV_PIX_FMT_YUV420P] = { 1, 1 },
[AV_PIX_FMT_YUYV422] = { 1, 1 },
[AV_PIX_FMT_RGB24] = { 1, 1 },
[AV_PIX_FMT_BGR24] = { 1, 1 },
[AV_PIX_FMT_YUV422P] = { 1, 1 },
[AV_PIX_FMT_YUV444P] = { 1, 1 },
[AV_PIX_FMT_YUV410P] = { 1, 1 },
[AV_PIX_FMT_YUV411P] = { 1, 1 },
[AV_PIX_FMT_GRAY8] = { 1, 1 },
[AV_PIX_FMT_MONOWHITE] = { 1, 1 },
[AV_PIX_FMT_MONOBLACK] = { 1, 1 },
[AV_PIX_FMT_PAL8] = { 1, 0 },
[AV_PIX_FMT_YUVJ420P] = { 1, 1 },
[AV_PIX_FMT_YUVJ411P] = { 1, 1 },
[AV_PIX_FMT_YUVJ422P] = { 1, 1 },
[AV_PIX_FMT_YUVJ444P] = { 1, 1 },
[AV_PIX_FMT_YVYU422] = { 1, 1 },
[AV_PIX_FMT_UYVY422] = { 1, 1 },
[AV_PIX_FMT_UYYVYY411] = { 0, 0 },
[AV_PIX_FMT_BGR8] = { 1, 1 },
[AV_PIX_FMT_BGR4] = { 0, 1 },
[AV_PIX_FMT_BGR4_BYTE] = { 1, 1 },
[AV_PIX_FMT_RGB8] = { 1, 1 },
[AV_PIX_FMT_RGB4] = { 0, 1 },
[AV_PIX_FMT_RGB4_BYTE] = { 1, 1 },
[AV_PIX_FMT_NV12] = { 1, 1 },
[AV_PIX_FMT_NV21] = { 1, 1 },
[AV_PIX_FMT_ARGB] = { 1, 1 },
[AV_PIX_FMT_RGBA] = { 1, 1 },
[AV_PIX_FMT_ABGR] = { 1, 1 },
[AV_PIX_FMT_BGRA] = { 1, 1 },
[AV_PIX_FMT_0RGB] = { 1, 1 },
[AV_PIX_FMT_RGB0] = { 1, 1 },
[AV_PIX_FMT_0BGR] = { 1, 1 },
[AV_PIX_FMT_BGR0] = { 1, 1 },
[AV_PIX_FMT_GRAY16BE] = { 1, 1 },
[AV_PIX_FMT_GRAY16LE] = { 1, 1 },
[AV_PIX_FMT_YUV440P] = { 1, 1 },
[AV_PIX_FMT_YUVJ440P] = { 1, 1 },
[AV_PIX_FMT_YUV440P10LE] = { 1, 1 },
[AV_PIX_FMT_YUV440P10BE] = { 1, 1 },
[AV_PIX_FMT_YUV440P12LE] = { 1, 1 },
[AV_PIX_FMT_YUV440P12BE] = { 1, 1 },
[AV_PIX_FMT_YUVA420P] = { 1, 1 },
[AV_PIX_FMT_YUVA422P] = { 1, 1 },
[AV_PIX_FMT_YUVA444P] = { 1, 1 },
[AV_PIX_FMT_YUVA420P9BE] = { 1, 1 },
[AV_PIX_FMT_YUVA420P9LE] = { 1, 1 },
[AV_PIX_FMT_YUVA422P9BE] = { 1, 1 },
[AV_PIX_FMT_YUVA422P9LE] = { 1, 1 },
[AV_PIX_FMT_YUVA444P9BE] = { 1, 1 },
[AV_PIX_FMT_YUVA444P9LE] = { 1, 1 },
[AV_PIX_FMT_YUVA420P10BE]= { 1, 1 },
[AV_PIX_FMT_YUVA420P10LE]= { 1, 1 },
[AV_PIX_FMT_YUVA422P10BE]= { 1, 1 },
[AV_PIX_FMT_YUVA422P10LE]= { 1, 1 },
[AV_PIX_FMT_YUVA444P10BE]= { 1, 1 },
[AV_PIX_FMT_YUVA444P10LE]= { 1, 1 },
[AV_PIX_FMT_YUVA420P16BE]= { 1, 1 },
[AV_PIX_FMT_YUVA420P16LE]= { 1, 1 },
[AV_PIX_FMT_YUVA422P16BE]= { 1, 1 },
[AV_PIX_FMT_YUVA422P16LE]= { 1, 1 },
[AV_PIX_FMT_YUVA444P16BE]= { 1, 1 },
[AV_PIX_FMT_YUVA444P16LE]= { 1, 1 },
[AV_PIX_FMT_RGB48BE] = { 1, 1 },
[AV_PIX_FMT_RGB48LE] = { 1, 1 },
[AV_PIX_FMT_RGBA64BE] = { 1, 1, 1 },
[AV_PIX_FMT_RGBA64LE] = { 1, 1, 1 },
[AV_PIX_FMT_RGB565BE] = { 1, 1 },
[AV_PIX_FMT_RGB565LE] = { 1, 1 },
[AV_PIX_FMT_RGB555BE] = { 1, 1 },
[AV_PIX_FMT_RGB555LE] = { 1, 1 },
[AV_PIX_FMT_BGR565BE] = { 1, 1 },
[AV_PIX_FMT_BGR565LE] = { 1, 1 },
[AV_PIX_FMT_BGR555BE] = { 1, 1 },
[AV_PIX_FMT_BGR555LE] = { 1, 1 },
[AV_PIX_FMT_YUV420P16LE] = { 1, 1 },
[AV_PIX_FMT_YUV420P16BE] = { 1, 1 },
[AV_PIX_FMT_YUV422P16LE] = { 1, 1 },
[AV_PIX_FMT_YUV422P16BE] = { 1, 1 },
[AV_PIX_FMT_YUV444P16LE] = { 1, 1 },
[AV_PIX_FMT_YUV444P16BE] = { 1, 1 },
[AV_PIX_FMT_RGB444LE] = { 1, 1 },
[AV_PIX_FMT_RGB444BE] = { 1, 1 },
[AV_PIX_FMT_BGR444LE] = { 1, 1 },
[AV_PIX_FMT_BGR444BE] = { 1, 1 },
[AV_PIX_FMT_YA8] = { 1, 1 },
[AV_PIX_FMT_YA16BE] = { 1, 0 },
[AV_PIX_FMT_YA16LE] = { 1, 0 },
[AV_PIX_FMT_BGR48BE] = { 1, 1 },
[AV_PIX_FMT_BGR48LE] = { 1, 1 },
[AV_PIX_FMT_BGRA64BE] = { 1, 1, 1 },
[AV_PIX_FMT_BGRA64LE] = { 1, 1, 1 },
[AV_PIX_FMT_YUV420P9BE] = { 1, 1 },
[AV_PIX_FMT_YUV420P9LE] = { 1, 1 },
[AV_PIX_FMT_YUV420P10BE] = { 1, 1 },
[AV_PIX_FMT_YUV420P10LE] = { 1, 1 },
[AV_PIX_FMT_YUV420P12BE] = { 1, 1 },
[AV_PIX_FMT_YUV420P12LE] = { 1, 1 },
[AV_PIX_FMT_YUV420P14BE] = { 1, 1 },
[AV_PIX_FMT_YUV420P14LE] = { 1, 1 },
[AV_PIX_FMT_YUV422P9BE] = { 1, 1 },
[AV_PIX_FMT_YUV422P9LE] = { 1, 1 },
[AV_PIX_FMT_YUV422P10BE] = { 1, 1 },
[AV_PIX_FMT_YUV422P10LE] = { 1, 1 },
[AV_PIX_FMT_YUV422P12BE] = { 1, 1 },
[AV_PIX_FMT_YUV422P12LE] = { 1, 1 },
[AV_PIX_FMT_YUV422P14BE] = { 1, 1 },
[AV_PIX_FMT_YUV422P14LE] = { 1, 1 },
[AV_PIX_FMT_YUV444P9BE] = { 1, 1 },
[AV_PIX_FMT_YUV444P9LE] = { 1, 1 },
[AV_PIX_FMT_YUV444P10BE] = { 1, 1 },
[AV_PIX_FMT_YUV444P10LE] = { 1, 1 },
[AV_PIX_FMT_YUV444P12BE] = { 1, 1 },
[AV_PIX_FMT_YUV444P12LE] = { 1, 1 },
[AV_PIX_FMT_YUV444P14BE] = { 1, 1 },
[AV_PIX_FMT_YUV444P14LE] = { 1, 1 },
[AV_PIX_FMT_GBRP] = { 1, 1 },
[AV_PIX_FMT_GBRP9LE] = { 1, 1 },
[AV_PIX_FMT_GBRP9BE] = { 1, 1 },
[AV_PIX_FMT_GBRP10LE] = { 1, 1 },
[AV_PIX_FMT_GBRP10BE] = { 1, 1 },
[AV_PIX_FMT_GBRP12LE] = { 1, 1 },
[AV_PIX_FMT_GBRP12BE] = { 1, 1 },
[AV_PIX_FMT_GBRP14LE] = { 1, 1 },
[AV_PIX_FMT_GBRP14BE] = { 1, 1 },
[AV_PIX_FMT_GBRP16LE] = { 1, 0 },
[AV_PIX_FMT_GBRP16BE] = { 1, 0 },
[AV_PIX_FMT_GBRAP] = { 1, 1 },
[AV_PIX_FMT_GBRAP16LE] = { 1, 0 },
[AV_PIX_FMT_GBRAP16BE] = { 1, 0 },
[AV_PIX_FMT_BAYER_BGGR8] = { 1, 0 },
[AV_PIX_FMT_BAYER_RGGB8] = { 1, 0 },
[AV_PIX_FMT_BAYER_GBRG8] = { 1, 0 },
[AV_PIX_FMT_BAYER_GRBG8] = { 1, 0 },
[AV_PIX_FMT_BAYER_BGGR16LE] = { 1, 0 },
[AV_PIX_FMT_BAYER_BGGR16BE] = { 1, 0 },
[AV_PIX_FMT_BAYER_RGGB16LE] = { 1, 0 },
[AV_PIX_FMT_BAYER_RGGB16BE] = { 1, 0 },
[AV_PIX_FMT_BAYER_GBRG16LE] = { 1, 0 },
[AV_PIX_FMT_BAYER_GBRG16BE] = { 1, 0 },
[AV_PIX_FMT_BAYER_GRBG16LE] = { 1, 0 },
[AV_PIX_FMT_BAYER_GRBG16BE] = { 1, 0 },
[AV_PIX_FMT_XYZ12BE] = { 1, 1, 1 },
[AV_PIX_FMT_XYZ12LE] = { 1, 1, 1 },
[AV_PIX_FMT_AYUV64LE] = { 1, 1},
};
 
int sws_isSupportedInput(enum AVPixelFormat pix_fmt)
{
return (unsigned)pix_fmt < AV_PIX_FMT_NB ?
format_entries[pix_fmt].is_supported_in : 0;
}
 
int sws_isSupportedOutput(enum AVPixelFormat pix_fmt)
{
return (unsigned)pix_fmt < AV_PIX_FMT_NB ?
format_entries[pix_fmt].is_supported_out : 0;
}
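
/* Typical caller-side check (hypothetical variables):
 *     if (!sws_isSupportedInput(src_fmt) || !sws_isSupportedOutput(dst_fmt))
 *         return AVERROR(EINVAL);
 */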
 
int sws_isSupportedEndiannessConversion(enum AVPixelFormat pix_fmt)
{
return (unsigned)pix_fmt < AV_PIX_FMT_NB ?
format_entries[pix_fmt].is_supported_endianness : 0;
}
 
static double getSplineCoeff(double a, double b, double c, double d,
double dist)
{
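    /* Horner evaluation of a + b*dist + c*dist^2 + d*dist^3; for dist > 1 the
     * coefficients are re-expanded around dist = 1 and evaluation recurses
     * into the next spline segment. */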
if (dist <= 1.0)
return ((d * dist + c) * dist + b) * dist + a;
else
return getSplineCoeff(0.0,
b + 2.0 * c + 3.0 * d,
c + 3.0 * d,
-b - 3.0 * c - 6.0 * d,
dist - 1.0);
}
 
static av_cold int get_local_pos(SwsContext *s, int chr_subsample, int pos, int dir)
{
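    /* 'pos' is a chroma sample position in 1/256-luma-sample units; -1 (or an
     * out-of-range value <= -513) selects the default position, centered in
     * the (1 << chr_subsample)-wide span of luma samples. */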
if (pos == -1 || pos <= -513) {
pos = (128 << chr_subsample) - 128;
}
pos += 128; // relative to ideal left edge
return pos >> chr_subsample;
}
 
typedef struct {
int flag; ///< flag associated to the algorithm
const char *description; ///< human-readable description
int size_factor; ///< size factor used when initializing the filters
} ScaleAlgorithm;
 
static const ScaleAlgorithm scale_algorithms[] = {
{ SWS_AREA, "area averaging", 1 /* downscale only, for upscale it is bilinear */ },
{ SWS_BICUBIC, "bicubic", 4 },
{ SWS_BICUBLIN, "luma bicubic / chroma bilinear", -1 },
{ SWS_BILINEAR, "bilinear", 2 },
{ SWS_FAST_BILINEAR, "fast bilinear", -1 },
{ SWS_GAUSS, "Gaussian", 8 /* infinite ;) */ },
{ SWS_LANCZOS, "Lanczos", -1 /* custom */ },
{ SWS_POINT, "nearest neighbor / point", -1 },
{ SWS_SINC, "sinc", 20 /* infinite ;) */ },
{ SWS_SPLINE, "bicubic spline", 20 /* infinite :)*/ },
{ SWS_X, "experimental", 8 },
};
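
/* Build the scaling filter for one direction: for every destination sample,
 * compute a source position (filterPos) and a row of fixed-point coefficients
 * (outFilter), then reduce, align, border-clip and normalize them to 'one'. */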
 
static av_cold int initFilter(int16_t **outFilter, int32_t **filterPos,
int *outFilterSize, int xInc, int srcW,
int dstW, int filterAlign, int one,
int flags, int cpu_flags,
SwsVector *srcFilter, SwsVector *dstFilter,
double param[2], int srcPos, int dstPos)
{
int i;
int filterSize;
int filter2Size;
int minFilterSize;
int64_t *filter = NULL;
int64_t *filter2 = NULL;
const int64_t fone = 1LL << (54 - FFMIN(av_log2(srcW/dstW), 8));
int ret = -1;
 
emms_c(); // FIXME should not be required but IS (even for non-MMX versions)
 
// NOTE: the +3 is for the MMX(+1) / SSE(+3) scaler which reads over the end
FF_ALLOC_ARRAY_OR_GOTO(NULL, *filterPos, (dstW + 3), sizeof(**filterPos), fail);
 
if (FFABS(xInc - 0x10000) < 10 && srcPos == dstPos) { // unscaled
int i;
filterSize = 1;
FF_ALLOCZ_ARRAY_OR_GOTO(NULL, filter,
dstW, sizeof(*filter) * filterSize, fail);
 
for (i = 0; i < dstW; i++) {
filter[i * filterSize] = fone;
(*filterPos)[i] = i;
}
} else if (flags & SWS_POINT) { // lame looking point sampling mode
int i;
int64_t xDstInSrc;
filterSize = 1;
FF_ALLOC_ARRAY_OR_GOTO(NULL, filter,
dstW, sizeof(*filter) * filterSize, fail);
 
xDstInSrc = ((dstPos*(int64_t)xInc)>>8) - ((srcPos*0x8000LL)>>7);
for (i = 0; i < dstW; i++) {
int xx = (xDstInSrc - ((filterSize - 1) << 15) + (1 << 15)) >> 16;
 
(*filterPos)[i] = xx;
filter[i] = fone;
xDstInSrc += xInc;
}
} else if ((xInc <= (1 << 16) && (flags & SWS_AREA)) ||
(flags & SWS_FAST_BILINEAR)) { // bilinear upscale
int i;
int64_t xDstInSrc;
filterSize = 2;
FF_ALLOC_ARRAY_OR_GOTO(NULL, filter,
dstW, sizeof(*filter) * filterSize, fail);
 
xDstInSrc = ((dstPos*(int64_t)xInc)>>8) - ((srcPos*0x8000LL)>>7);
for (i = 0; i < dstW; i++) {
int xx = (xDstInSrc - ((filterSize - 1) << 15) + (1 << 15)) >> 16;
int j;
 
(*filterPos)[i] = xx;
// bilinear upscale / linear interpolate / area averaging
for (j = 0; j < filterSize; j++) {
int64_t coeff= fone - FFABS(((int64_t)xx<<16) - xDstInSrc)*(fone>>16);
if (coeff < 0)
coeff = 0;
filter[i * filterSize + j] = coeff;
xx++;
}
xDstInSrc += xInc;
}
} else {
int64_t xDstInSrc;
int sizeFactor = -1;
 
for (i = 0; i < FF_ARRAY_ELEMS(scale_algorithms); i++) {
if (flags & scale_algorithms[i].flag && scale_algorithms[i].size_factor > 0) {
sizeFactor = scale_algorithms[i].size_factor;
break;
}
}
if (flags & SWS_LANCZOS)
sizeFactor = param[0] != SWS_PARAM_DEFAULT ? ceil(2 * param[0]) : 6;
av_assert0(sizeFactor > 0);
 
if (xInc <= 1 << 16)
filterSize = 1 + sizeFactor; // upscale
else
filterSize = 1 + (sizeFactor * srcW + dstW - 1) / dstW;
 
filterSize = FFMIN(filterSize, srcW - 2);
filterSize = FFMAX(filterSize, 1);
 
FF_ALLOC_ARRAY_OR_GOTO(NULL, filter,
dstW, sizeof(*filter) * filterSize, fail);
 
xDstInSrc = ((dstPos*(int64_t)xInc)>>7) - ((srcPos*0x10000LL)>>7);
for (i = 0; i < dstW; i++) {
int xx = (xDstInSrc - (filterSize - 2) * (1LL<<16)) / (1 << 17);
int j;
(*filterPos)[i] = xx;
for (j = 0; j < filterSize; j++) {
int64_t d = (FFABS(((int64_t)xx * (1 << 17)) - xDstInSrc)) << 13;
double floatd;
int64_t coeff;
 
if (xInc > 1 << 16)
d = d * dstW / srcW;
floatd = d * (1.0 / (1 << 30));
 
if (flags & SWS_BICUBIC) {
int64_t B = (param[0] != SWS_PARAM_DEFAULT ? param[0] : 0) * (1 << 24);
int64_t C = (param[1] != SWS_PARAM_DEFAULT ? param[1] : 0.6) * (1 << 24);
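
                        /* Fixed-point Mitchell-Netravali bicubic kernel (d in Q30, B/C in Q24):
                         *   |d| < 1:    (12-9B-6C)|d|^3 + (-18+12B+6C)|d|^2 + (6-2B)
                         *   1<=|d|<2:   (-B-6C)|d|^3 + (6B+30C)|d|^2 + (-12B-48C)|d| + (8B+24C)
                         * The usual 1/6 factor is dropped; the filter is renormalized later. */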
 
if (d >= 1LL << 31) {
coeff = 0.0;
} else {
int64_t dd = (d * d) >> 30;
int64_t ddd = (dd * d) >> 30;
 
if (d < 1LL << 30)
coeff = (12 * (1 << 24) - 9 * B - 6 * C) * ddd +
(-18 * (1 << 24) + 12 * B + 6 * C) * dd +
(6 * (1 << 24) - 2 * B) * (1 << 30);
else
coeff = (-B - 6 * C) * ddd +
(6 * B + 30 * C) * dd +
(-12 * B - 48 * C) * d +
(8 * B + 24 * C) * (1 << 30);
}
coeff /= (1LL<<54)/fone;
}
#if 0
else if (flags & SWS_X) {
                        double p = param[0] != SWS_PARAM_DEFAULT ? param[0] * 0.01 : 0.3;
coeff = d ? sin(d * M_PI) / (d * M_PI) : 1.0;
coeff *= pow(2.0, -p * d * d);
}
#endif
else if (flags & SWS_X) {
double A = param[0] != SWS_PARAM_DEFAULT ? param[0] : 1.0;
double c;
 
if (floatd < 1.0)
c = cos(floatd * M_PI);
else
c = -1.0;
if (c < 0.0)
c = -pow(-c, A);
else
c = pow(c, A);
coeff = (c * 0.5 + 0.5) * fone;
} else if (flags & SWS_AREA) {
int64_t d2 = d - (1 << 29);
if (d2 * xInc < -(1LL << (29 + 16)))
coeff = 1.0 * (1LL << (30 + 16));
else if (d2 * xInc < (1LL << (29 + 16)))
coeff = -d2 * xInc + (1LL << (29 + 16));
else
coeff = 0.0;
coeff *= fone >> (30 + 16);
} else if (flags & SWS_GAUSS) {
double p = param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
coeff = (pow(2.0, -p * floatd * floatd)) * fone;
} else if (flags & SWS_SINC) {
coeff = (d ? sin(floatd * M_PI) / (floatd * M_PI) : 1.0) * fone;
} else if (flags & SWS_LANCZOS) {
double p = param[0] != SWS_PARAM_DEFAULT ? param[0] : 3.0;
coeff = (d ? sin(floatd * M_PI) * sin(floatd * M_PI / p) /
(floatd * floatd * M_PI * M_PI / p) : 1.0) * fone;
if (floatd > p)
coeff = 0;
} else if (flags & SWS_BILINEAR) {
coeff = (1 << 30) - d;
if (coeff < 0)
coeff = 0;
coeff *= fone >> 30;
} else if (flags & SWS_SPLINE) {
double p = -2.196152422706632;
coeff = getSplineCoeff(1.0, 0.0, p, -p - 1.0, floatd) * fone;
} else {
av_assert0(0);
}
 
filter[i * filterSize + j] = coeff;
xx++;
}
xDstInSrc += 2 * xInc;
}
}
 
    /* apply srcFilter and dstFilter to filter -> filter2 */
av_assert0(filterSize > 0);
filter2Size = filterSize;
if (srcFilter)
filter2Size += srcFilter->length - 1;
if (dstFilter)
filter2Size += dstFilter->length - 1;
av_assert0(filter2Size > 0);
FF_ALLOCZ_ARRAY_OR_GOTO(NULL, filter2, dstW, filter2Size * sizeof(*filter2), fail);
 
for (i = 0; i < dstW; i++) {
int j, k;
 
if (srcFilter) {
for (k = 0; k < srcFilter->length; k++) {
for (j = 0; j < filterSize; j++)
filter2[i * filter2Size + k + j] +=
srcFilter->coeff[k] * filter[i * filterSize + j];
}
} else {
for (j = 0; j < filterSize; j++)
filter2[i * filter2Size + j] = filter[i * filterSize + j];
}
// FIXME dstFilter
 
(*filterPos)[i] += (filterSize - 1) / 2 - (filter2Size - 1) / 2;
}
av_freep(&filter);
 
/* try to reduce the filter-size (step1 find size and shift left) */
// Assume it is near normalized (*0.5 or *2.0 is OK but * 0.001 is not).
minFilterSize = 0;
for (i = dstW - 1; i >= 0; i--) {
int min = filter2Size;
int j;
int64_t cutOff = 0.0;
 
/* get rid of near zero elements on the left by shifting left */
for (j = 0; j < filter2Size; j++) {
int k;
cutOff += FFABS(filter2[i * filter2Size]);
 
if (cutOff > SWS_MAX_REDUCE_CUTOFF * fone)
break;
 
/* preserve monotonicity because the core can't handle the
* filter otherwise */
if (i < dstW - 1 && (*filterPos)[i] >= (*filterPos)[i + 1])
break;
 
// move filter coefficients left
for (k = 1; k < filter2Size; k++)
filter2[i * filter2Size + k - 1] = filter2[i * filter2Size + k];
filter2[i * filter2Size + k - 1] = 0;
(*filterPos)[i]++;
}
 
cutOff = 0;
/* count near zeros on the right */
for (j = filter2Size - 1; j > 0; j--) {
cutOff += FFABS(filter2[i * filter2Size + j]);
 
if (cutOff > SWS_MAX_REDUCE_CUTOFF * fone)
break;
min--;
}
 
if (min > minFilterSize)
minFilterSize = min;
}
 
if (PPC_ALTIVEC(cpu_flags)) {
// we can handle the special case 4, so we don't want to go the full 8
if (minFilterSize < 5)
filterAlign = 4;
 
/* We really don't want to waste our time doing useless computation, so
* fall back on the scalar C code for very small filters.
* Vectorizing is worth it only if you have a decent-sized vector. */
if (minFilterSize < 3)
filterAlign = 1;
}
 
if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_MMX) {
// special case for unscaled vertical filtering
if (minFilterSize == 1 && filterAlign == 2)
filterAlign = 1;
}
 
av_assert0(minFilterSize > 0);
filterSize = (minFilterSize + (filterAlign - 1)) & (~(filterAlign - 1));
av_assert0(filterSize > 0);
filter = av_malloc_array(dstW, filterSize * sizeof(*filter));
if (!filter)
goto fail;
if (filterSize >= MAX_FILTER_SIZE * 16 /
((flags & SWS_ACCURATE_RND) ? APCK_SIZE : 16)) {
ret = RETCODE_USE_CASCADE;
goto fail;
}
*outFilterSize = filterSize;
 
if (flags & SWS_PRINT_INFO)
av_log(NULL, AV_LOG_VERBOSE,
"SwScaler: reducing / aligning filtersize %d -> %d\n",
filter2Size, filterSize);
/* try to reduce the filter-size (step2 reduce it) */
for (i = 0; i < dstW; i++) {
int j;
 
for (j = 0; j < filterSize; j++) {
if (j >= filter2Size)
filter[i * filterSize + j] = 0;
else
filter[i * filterSize + j] = filter2[i * filter2Size + j];
if ((flags & SWS_BITEXACT) && j >= minFilterSize)
filter[i * filterSize + j] = 0;
}
}
 
// FIXME try to align filterPos if possible
 
// fix borders
for (i = 0; i < dstW; i++) {
int j;
if ((*filterPos)[i] < 0) {
// move filter coefficients left to compensate for filterPos
for (j = 1; j < filterSize; j++) {
int left = FFMAX(j + (*filterPos)[i], 0);
filter[i * filterSize + left] += filter[i * filterSize + j];
filter[i * filterSize + j] = 0;
}
(*filterPos)[i]= 0;
}
 
if ((*filterPos)[i] + filterSize > srcW) {
int shift = (*filterPos)[i] + FFMIN(filterSize - srcW, 0);
int64_t acc = 0;
 
for (j = filterSize - 1; j >= 0; j--) {
if ((*filterPos)[i] + j >= srcW) {
acc += filter[i * filterSize + j];
filter[i * filterSize + j] = 0;
}
}
for (j = filterSize - 1; j >= 0; j--) {
if (j < shift) {
filter[i * filterSize + j] = 0;
} else {
filter[i * filterSize + j] = filter[i * filterSize + j - shift];
}
}
 
(*filterPos)[i]-= shift;
filter[i * filterSize + srcW - 1 - (*filterPos)[i]] += acc;
}
av_assert0((*filterPos)[i] >= 0);
av_assert0((*filterPos)[i] < srcW);
if ((*filterPos)[i] + filterSize > srcW) {
for (j = 0; j < filterSize; j++) {
av_assert0((*filterPos)[i] + j < srcW || !filter[i * filterSize + j]);
}
}
}
 
// Note the +1 is for the MMX scaler which reads over the end
/* align at 16 for AltiVec (needed by hScale_altivec_real) */
FF_ALLOCZ_ARRAY_OR_GOTO(NULL, *outFilter,
(dstW + 3), *outFilterSize * sizeof(int16_t), fail);
 
/* normalize & store in outFilter */
for (i = 0; i < dstW; i++) {
int j;
int64_t error = 0;
int64_t sum = 0;
 
for (j = 0; j < filterSize; j++) {
sum += filter[i * filterSize + j];
}
sum = (sum + one / 2) / one;
if (!sum) {
av_log(NULL, AV_LOG_WARNING, "SwScaler: zero vector in scaling\n");
sum = 1;
}
for (j = 0; j < *outFilterSize; j++) {
int64_t v = filter[i * filterSize + j] + error;
int intV = ROUNDED_DIV(v, sum);
(*outFilter)[i * (*outFilterSize) + j] = intV;
error = v - intV * sum;
}
}
 
(*filterPos)[dstW + 0] =
(*filterPos)[dstW + 1] =
(*filterPos)[dstW + 2] = (*filterPos)[dstW - 1]; /* the MMX/SSE scaler will
* read over the end */
for (i = 0; i < *outFilterSize; i++) {
int k = (dstW - 1) * (*outFilterSize) + i;
(*outFilter)[k + 1 * (*outFilterSize)] =
(*outFilter)[k + 2 * (*outFilterSize)] =
(*outFilter)[k + 3 * (*outFilterSize)] = (*outFilter)[k];
}
 
ret = 0;
 
fail:
if(ret < 0)
av_log(NULL, ret == RETCODE_USE_CASCADE ? AV_LOG_DEBUG : AV_LOG_ERROR, "sws: initFilter failed\n");
av_free(filter);
av_free(filter2);
return ret;
}
 
static void fill_rgb2yuv_table(SwsContext *c, const int table[4], int dstRange)
{
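    /* Build the fixed-point RGB->YUV coefficient table as the inverse of the
     * given YUV->RGB 'table'; the 'map' below then mirrors the nine
     * coefficients into a duplicated little-endian 16-bit layout appended to
     * input_rgb2yuv_table, presumably for the SIMD input paths. */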
int64_t W, V, Z, Cy, Cu, Cv;
int64_t vr = table[0];
int64_t ub = table[1];
int64_t ug = -table[2];
int64_t vg = -table[3];
int64_t ONE = 65536;
int64_t cy = ONE;
uint8_t *p = (uint8_t*)c->input_rgb2yuv_table;
int i;
static const int8_t map[] = {
BY_IDX, GY_IDX, -1 , BY_IDX, BY_IDX, GY_IDX, -1 , BY_IDX,
RY_IDX, -1 , GY_IDX, RY_IDX, RY_IDX, -1 , GY_IDX, RY_IDX,
RY_IDX, GY_IDX, -1 , RY_IDX, RY_IDX, GY_IDX, -1 , RY_IDX,
BY_IDX, -1 , GY_IDX, BY_IDX, BY_IDX, -1 , GY_IDX, BY_IDX,
BU_IDX, GU_IDX, -1 , BU_IDX, BU_IDX, GU_IDX, -1 , BU_IDX,
RU_IDX, -1 , GU_IDX, RU_IDX, RU_IDX, -1 , GU_IDX, RU_IDX,
RU_IDX, GU_IDX, -1 , RU_IDX, RU_IDX, GU_IDX, -1 , RU_IDX,
BU_IDX, -1 , GU_IDX, BU_IDX, BU_IDX, -1 , GU_IDX, BU_IDX,
BV_IDX, GV_IDX, -1 , BV_IDX, BV_IDX, GV_IDX, -1 , BV_IDX,
RV_IDX, -1 , GV_IDX, RV_IDX, RV_IDX, -1 , GV_IDX, RV_IDX,
RV_IDX, GV_IDX, -1 , RV_IDX, RV_IDX, GV_IDX, -1 , RV_IDX,
BV_IDX, -1 , GV_IDX, BV_IDX, BV_IDX, -1 , GV_IDX, BV_IDX,
RY_IDX, BY_IDX, RY_IDX, BY_IDX, RY_IDX, BY_IDX, RY_IDX, BY_IDX,
BY_IDX, RY_IDX, BY_IDX, RY_IDX, BY_IDX, RY_IDX, BY_IDX, RY_IDX,
GY_IDX, -1 , GY_IDX, -1 , GY_IDX, -1 , GY_IDX, -1 ,
-1 , GY_IDX, -1 , GY_IDX, -1 , GY_IDX, -1 , GY_IDX,
RU_IDX, BU_IDX, RU_IDX, BU_IDX, RU_IDX, BU_IDX, RU_IDX, BU_IDX,
BU_IDX, RU_IDX, BU_IDX, RU_IDX, BU_IDX, RU_IDX, BU_IDX, RU_IDX,
GU_IDX, -1 , GU_IDX, -1 , GU_IDX, -1 , GU_IDX, -1 ,
-1 , GU_IDX, -1 , GU_IDX, -1 , GU_IDX, -1 , GU_IDX,
RV_IDX, BV_IDX, RV_IDX, BV_IDX, RV_IDX, BV_IDX, RV_IDX, BV_IDX,
BV_IDX, RV_IDX, BV_IDX, RV_IDX, BV_IDX, RV_IDX, BV_IDX, RV_IDX,
GV_IDX, -1 , GV_IDX, -1 , GV_IDX, -1 , GV_IDX, -1 ,
-1 , GV_IDX, -1 , GV_IDX, -1 , GV_IDX, -1 , GV_IDX, //23
-1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , //24
-1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , //25
-1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , //26
-1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , //27
-1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , //28
-1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , //29
-1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , //30
-1 , -1 , -1 , -1 , -1 , -1 , -1 , -1 , //31
BY_IDX, GY_IDX, RY_IDX, -1 , -1 , -1 , -1 , -1 , //32
BU_IDX, GU_IDX, RU_IDX, -1 , -1 , -1 , -1 , -1 , //33
BV_IDX, GV_IDX, RV_IDX, -1 , -1 , -1 , -1 , -1 , //34
};
 
dstRange = 0; //FIXME range = 1 is handled elsewhere
 
if (!dstRange) {
cy = cy * 255 / 219;
} else {
vr = vr * 224 / 255;
ub = ub * 224 / 255;
ug = ug * 224 / 255;
vg = vg * 224 / 255;
}
W = ROUNDED_DIV(ONE*ONE*ug, ub);
V = ROUNDED_DIV(ONE*ONE*vg, vr);
Z = ONE*ONE-W-V;
 
Cy = ROUNDED_DIV(cy*Z, ONE);
Cu = ROUNDED_DIV(ub*Z, ONE);
Cv = ROUNDED_DIV(vr*Z, ONE);
 
c->input_rgb2yuv_table[RY_IDX] = -ROUNDED_DIV((1 << RGB2YUV_SHIFT)*V , Cy);
c->input_rgb2yuv_table[GY_IDX] = ROUNDED_DIV((1 << RGB2YUV_SHIFT)*ONE*ONE , Cy);
c->input_rgb2yuv_table[BY_IDX] = -ROUNDED_DIV((1 << RGB2YUV_SHIFT)*W , Cy);
 
c->input_rgb2yuv_table[RU_IDX] = ROUNDED_DIV((1 << RGB2YUV_SHIFT)*V , Cu);
c->input_rgb2yuv_table[GU_IDX] = -ROUNDED_DIV((1 << RGB2YUV_SHIFT)*ONE*ONE , Cu);
c->input_rgb2yuv_table[BU_IDX] = ROUNDED_DIV((1 << RGB2YUV_SHIFT)*(Z+W) , Cu);
 
c->input_rgb2yuv_table[RV_IDX] = ROUNDED_DIV((1 << RGB2YUV_SHIFT)*(V+Z) , Cv);
c->input_rgb2yuv_table[GV_IDX] = -ROUNDED_DIV((1 << RGB2YUV_SHIFT)*ONE*ONE , Cv);
c->input_rgb2yuv_table[BV_IDX] = ROUNDED_DIV((1 << RGB2YUV_SHIFT)*W , Cv);
 
if(/*!dstRange && */!memcmp(table, ff_yuv2rgb_coeffs[SWS_CS_DEFAULT], sizeof(ff_yuv2rgb_coeffs[SWS_CS_DEFAULT]))) {
c->input_rgb2yuv_table[BY_IDX] = ((int)(0.114 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5));
c->input_rgb2yuv_table[BV_IDX] = (-(int)(0.081 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5));
c->input_rgb2yuv_table[BU_IDX] = ((int)(0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5));
c->input_rgb2yuv_table[GY_IDX] = ((int)(0.587 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5));
c->input_rgb2yuv_table[GV_IDX] = (-(int)(0.419 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5));
c->input_rgb2yuv_table[GU_IDX] = (-(int)(0.331 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5));
c->input_rgb2yuv_table[RY_IDX] = ((int)(0.299 * 219 / 255 * (1 << RGB2YUV_SHIFT) + 0.5));
c->input_rgb2yuv_table[RV_IDX] = ((int)(0.500 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5));
c->input_rgb2yuv_table[RU_IDX] = (-(int)(0.169 * 224 / 255 * (1 << RGB2YUV_SHIFT) + 0.5));
}
for(i=0; i<FF_ARRAY_ELEMS(map); i++)
AV_WL16(p + 16*4 + 2*i, map[i] >= 0 ? c->input_rgb2yuv_table[map[i]] : 0);
}
 
static void fill_xyztables(struct SwsContext *c)
{
int i;
double xyzgamma = XYZ_GAMMA;
double rgbgamma = 1.0 / RGB_GAMMA;
double xyzgammainv = 1.0 / XYZ_GAMMA;
double rgbgammainv = RGB_GAMMA;
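    /* 3x4 matrices in 1.3.12 fixed point; the values match the standard
     * sRGB (D65) <-> CIE XYZ conversion matrices scaled by 4096. */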
static const int16_t xyz2rgb_matrix[3][4] = {
{13270, -6295, -2041},
{-3969, 7682, 170},
{ 228, -835, 4329} };
static const int16_t rgb2xyz_matrix[3][4] = {
{1689, 1464, 739},
{ 871, 2929, 296},
{ 79, 488, 3891} };
static int16_t xyzgamma_tab[4096], rgbgamma_tab[4096], xyzgammainv_tab[4096], rgbgammainv_tab[4096];
 
memcpy(c->xyz2rgb_matrix, xyz2rgb_matrix, sizeof(c->xyz2rgb_matrix));
memcpy(c->rgb2xyz_matrix, rgb2xyz_matrix, sizeof(c->rgb2xyz_matrix));
c->xyzgamma = xyzgamma_tab;
c->rgbgamma = rgbgamma_tab;
c->xyzgammainv = xyzgammainv_tab;
c->rgbgammainv = rgbgammainv_tab;
 
if (rgbgamma_tab[4095])
return;
 
/* set gamma vectors */
for (i = 0; i < 4096; i++) {
xyzgamma_tab[i] = lrint(pow(i / 4095.0, xyzgamma) * 4095.0);
rgbgamma_tab[i] = lrint(pow(i / 4095.0, rgbgamma) * 4095.0);
xyzgammainv_tab[i] = lrint(pow(i / 4095.0, xyzgammainv) * 4095.0);
rgbgammainv_tab[i] = lrint(pow(i / 4095.0, rgbgammainv) * 4095.0);
}
}
 
int sws_setColorspaceDetails(struct SwsContext *c, const int inv_table[4],
int srcRange, const int table[4], int dstRange,
int brightness, int contrast, int saturation)
{
const AVPixFmtDescriptor *desc_dst;
const AVPixFmtDescriptor *desc_src;
int need_reinit = 0;
 
handle_formats(c);
desc_dst = av_pix_fmt_desc_get(c->dstFormat);
desc_src = av_pix_fmt_desc_get(c->srcFormat);
 
if(!isYUV(c->dstFormat) && !isGray(c->dstFormat))
dstRange = 0;
if(!isYUV(c->srcFormat) && !isGray(c->srcFormat))
srcRange = 0;
 
if (c->srcRange != srcRange ||
c->dstRange != dstRange ||
c->brightness != brightness ||
c->contrast != contrast ||
c->saturation != saturation ||
memcmp(c->srcColorspaceTable, inv_table, sizeof(int) * 4) ||
memcmp(c->dstColorspaceTable, table, sizeof(int) * 4)
)
need_reinit = 1;
 
memmove(c->srcColorspaceTable, inv_table, sizeof(int) * 4);
memmove(c->dstColorspaceTable, table, sizeof(int) * 4);
 
 
 
c->brightness = brightness;
c->contrast = contrast;
c->saturation = saturation;
c->srcRange = srcRange;
c->dstRange = dstRange;
 
//The srcBpc check is possibly wrong but we seem to lack a definitive reference to test this
//and what we have in ticket 2939 looks better with this check
if (need_reinit && (c->srcBpc == 8 || !isYUV(c->srcFormat)))
ff_sws_init_range_convert(c);
 
c->dstFormatBpp = av_get_bits_per_pixel(desc_dst);
c->srcFormatBpp = av_get_bits_per_pixel(desc_src);
 
if (c->cascaded_context[0])
return sws_setColorspaceDetails(c->cascaded_context[0],inv_table, srcRange,table, dstRange, brightness, contrast, saturation);
 
if (!need_reinit)
return 0;
 
if ((isYUV(c->dstFormat) || isGray(c->dstFormat)) && (isYUV(c->srcFormat) || isGray(c->srcFormat))) {
if (!c->cascaded_context[0] &&
memcmp(c->dstColorspaceTable, c->srcColorspaceTable, sizeof(int) * 4) &&
c->srcW && c->srcH && c->dstW && c->dstH) {
enum AVPixelFormat tmp_format;
int tmp_width, tmp_height;
int srcW = c->srcW;
int srcH = c->srcH;
int dstW = c->dstW;
int dstH = c->dstH;
int ret;
av_log(c, AV_LOG_VERBOSE, "YUV color matrix differs for YUV->YUV, using intermediate RGB to convert\n");
 
if (isNBPS(c->dstFormat) || is16BPS(c->dstFormat)) {
if (isALPHA(c->srcFormat) && isALPHA(c->dstFormat)) {
tmp_format = AV_PIX_FMT_BGRA64;
} else {
tmp_format = AV_PIX_FMT_BGR48;
}
} else {
if (isALPHA(c->srcFormat) && isALPHA(c->dstFormat)) {
tmp_format = AV_PIX_FMT_BGRA;
} else {
tmp_format = AV_PIX_FMT_BGR24;
}
}
 
if (srcW*srcH > dstW*dstH) {
tmp_width = dstW;
tmp_height = dstH;
} else {
tmp_width = srcW;
tmp_height = srcH;
}
 
ret = av_image_alloc(c->cascaded_tmp, c->cascaded_tmpStride,
tmp_width, tmp_height, tmp_format, 64);
if (ret < 0)
return ret;
 
c->cascaded_context[0] = sws_alloc_set_opts(srcW, srcH, c->srcFormat,
tmp_width, tmp_height, tmp_format,
c->flags, c->param);
if (!c->cascaded_context[0])
return -1;
 
c->cascaded_context[0]->alphablend = c->alphablend;
ret = sws_init_context(c->cascaded_context[0], NULL , NULL);
if (ret < 0)
return ret;
    // we set both src and dst tables, since the RGB side of the conversion will be ignored
sws_setColorspaceDetails(c->cascaded_context[0], inv_table,
srcRange, table, dstRange,
brightness, contrast, saturation);
 
c->cascaded_context[1] = sws_getContext(tmp_width, tmp_height, tmp_format,
dstW, dstH, c->dstFormat,
c->flags, NULL, NULL, c->param);
if (!c->cascaded_context[1])
return -1;
sws_setColorspaceDetails(c->cascaded_context[1], inv_table,
srcRange, table, dstRange,
0, 1 << 16, 1 << 16);
return 0;
}
return -1;
}
 
if (!isYUV(c->dstFormat) && !isGray(c->dstFormat)) {
ff_yuv2rgb_c_init_tables(c, inv_table, srcRange, brightness,
contrast, saturation);
// FIXME factorize
 
if (ARCH_PPC)
ff_yuv2rgb_init_tables_ppc(c, inv_table, brightness,
contrast, saturation);
}
 
fill_rgb2yuv_table(c, table, dstRange);
 
return 0;
}
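
/* Typical use (hypothetical context): switch the colorspace matrix to BT.709
 * while keeping the other parameters at their current values:
 *     int *inv, *tab, sr, dr, b, cst, sat;
 *     if (sws_getColorspaceDetails(ctx, &inv, &sr, &tab, &dr, &b, &cst, &sat) >= 0)
 *         sws_setColorspaceDetails(ctx, sws_getCoefficients(SWS_CS_ITU709), sr,
 *                                  tab, dr, b, cst, sat);
 */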
 
int sws_getColorspaceDetails(struct SwsContext *c, int **inv_table,
int *srcRange, int **table, int *dstRange,
int *brightness, int *contrast, int *saturation)
{
if (!c )
return -1;
 
*inv_table = c->srcColorspaceTable;
*table = c->dstColorspaceTable;
*srcRange = c->srcRange;
*dstRange = c->dstRange;
*brightness = c->brightness;
*contrast = c->contrast;
*saturation = c->saturation;
 
return 0;
}
 
static int handle_jpeg(enum AVPixelFormat *format)
{
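    /* Map full-range "J" (JPEG) pixel formats onto their limited-range
     * equivalents; the nonzero return tells the caller (via srcRange/dstRange)
     * that the data is actually full range. Gray formats are treated as full
     * range as well. */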
switch (*format) {
case AV_PIX_FMT_YUVJ420P:
*format = AV_PIX_FMT_YUV420P;
return 1;
case AV_PIX_FMT_YUVJ411P:
*format = AV_PIX_FMT_YUV411P;
return 1;
case AV_PIX_FMT_YUVJ422P:
*format = AV_PIX_FMT_YUV422P;
return 1;
case AV_PIX_FMT_YUVJ444P:
*format = AV_PIX_FMT_YUV444P;
return 1;
case AV_PIX_FMT_YUVJ440P:
*format = AV_PIX_FMT_YUV440P;
return 1;
case AV_PIX_FMT_GRAY8:
case AV_PIX_FMT_GRAY16LE:
case AV_PIX_FMT_GRAY16BE:
return 1;
default:
return 0;
}
}
 
static int handle_0alpha(enum AVPixelFormat *format)
{
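    /* Replace formats with an unused byte (0RGB, RGB0, ...) by their
     * alpha-carrying twins; the nonzero return, ORed into src0Alpha/dst0Alpha,
     * records that the alpha channel carries no real data. */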
switch (*format) {
case AV_PIX_FMT_0BGR : *format = AV_PIX_FMT_ABGR ; return 1;
case AV_PIX_FMT_BGR0 : *format = AV_PIX_FMT_BGRA ; return 4;
case AV_PIX_FMT_0RGB : *format = AV_PIX_FMT_ARGB ; return 1;
case AV_PIX_FMT_RGB0 : *format = AV_PIX_FMT_RGBA ; return 4;
default: return 0;
}
}
 
static int handle_xyz(enum AVPixelFormat *format)
{
switch (*format) {
case AV_PIX_FMT_XYZ12BE : *format = AV_PIX_FMT_RGB48BE; return 1;
case AV_PIX_FMT_XYZ12LE : *format = AV_PIX_FMT_RGB48LE; return 1;
default: return 0;
}
}
 
static void handle_formats(SwsContext *c)
{
c->src0Alpha |= handle_0alpha(&c->srcFormat);
c->dst0Alpha |= handle_0alpha(&c->dstFormat);
c->srcXYZ |= handle_xyz(&c->srcFormat);
c->dstXYZ |= handle_xyz(&c->dstFormat);
if (c->srcXYZ || c->dstXYZ)
fill_xyztables(c);
}
 
SwsContext *sws_alloc_context(void)
{
SwsContext *c = av_mallocz(sizeof(SwsContext));
 
av_assert0(offsetof(SwsContext, redDither) + DITHER32_INT == offsetof(SwsContext, dither32));
 
if (c) {
c->av_class = &sws_context_class;
av_opt_set_defaults(c);
}
 
return c;
}
 
static uint16_t * alloc_gamma_tbl(double e)
{
int i = 0;
uint16_t * tbl;
    tbl = (uint16_t*)av_malloc(sizeof(uint16_t) * (1 << 16));
if (!tbl)
return NULL;
 
for (i = 0; i < 65536; ++i) {
tbl[i] = pow(i / 65535.0, e) * 65535.0;
}
return tbl;
}
 
static enum AVPixelFormat alphaless_fmt(enum AVPixelFormat fmt)
{
switch(fmt) {
case AV_PIX_FMT_ARGB: return AV_PIX_FMT_RGB24;
case AV_PIX_FMT_RGBA: return AV_PIX_FMT_RGB24;
case AV_PIX_FMT_ABGR: return AV_PIX_FMT_BGR24;
case AV_PIX_FMT_BGRA: return AV_PIX_FMT_BGR24;
case AV_PIX_FMT_YA8: return AV_PIX_FMT_GRAY8;
 
case AV_PIX_FMT_YUVA420P: return AV_PIX_FMT_YUV420P;
case AV_PIX_FMT_YUVA422P: return AV_PIX_FMT_YUV422P;
case AV_PIX_FMT_YUVA444P: return AV_PIX_FMT_YUV444P;
 
case AV_PIX_FMT_GBRAP: return AV_PIX_FMT_GBRP;
 
case AV_PIX_FMT_GBRAP16LE: return AV_PIX_FMT_GBRP16;
case AV_PIX_FMT_GBRAP16BE: return AV_PIX_FMT_GBRP16;
 
case AV_PIX_FMT_RGBA64LE: return AV_PIX_FMT_RGB48;
case AV_PIX_FMT_RGBA64BE: return AV_PIX_FMT_RGB48;
case AV_PIX_FMT_BGRA64LE: return AV_PIX_FMT_BGR48;
case AV_PIX_FMT_BGRA64BE: return AV_PIX_FMT_BGR48;
 
case AV_PIX_FMT_YA16BE: return AV_PIX_FMT_GRAY16;
case AV_PIX_FMT_YA16LE: return AV_PIX_FMT_GRAY16;
 
case AV_PIX_FMT_YUVA420P9BE: return AV_PIX_FMT_YUV420P9;
case AV_PIX_FMT_YUVA422P9BE: return AV_PIX_FMT_YUV422P9;
case AV_PIX_FMT_YUVA444P9BE: return AV_PIX_FMT_YUV444P9;
case AV_PIX_FMT_YUVA420P9LE: return AV_PIX_FMT_YUV420P9;
case AV_PIX_FMT_YUVA422P9LE: return AV_PIX_FMT_YUV422P9;
case AV_PIX_FMT_YUVA444P9LE: return AV_PIX_FMT_YUV444P9;
case AV_PIX_FMT_YUVA420P10BE: return AV_PIX_FMT_YUV420P10;
case AV_PIX_FMT_YUVA422P10BE: return AV_PIX_FMT_YUV422P10;
case AV_PIX_FMT_YUVA444P10BE: return AV_PIX_FMT_YUV444P10;
case AV_PIX_FMT_YUVA420P10LE: return AV_PIX_FMT_YUV420P10;
case AV_PIX_FMT_YUVA422P10LE: return AV_PIX_FMT_YUV422P10;
case AV_PIX_FMT_YUVA444P10LE: return AV_PIX_FMT_YUV444P10;
case AV_PIX_FMT_YUVA420P16BE: return AV_PIX_FMT_YUV420P16;
case AV_PIX_FMT_YUVA422P16BE: return AV_PIX_FMT_YUV422P16;
case AV_PIX_FMT_YUVA444P16BE: return AV_PIX_FMT_YUV444P16;
case AV_PIX_FMT_YUVA420P16LE: return AV_PIX_FMT_YUV420P16;
case AV_PIX_FMT_YUVA422P16LE: return AV_PIX_FMT_YUV422P16;
case AV_PIX_FMT_YUVA444P16LE: return AV_PIX_FMT_YUV444P16;
 
// case AV_PIX_FMT_AYUV64LE:
// case AV_PIX_FMT_AYUV64BE:
// case AV_PIX_FMT_PAL8:
default: return AV_PIX_FMT_NONE;
}
}
 
av_cold int sws_init_context(SwsContext *c, SwsFilter *srcFilter,
SwsFilter *dstFilter)
{
int i, j;
int usesVFilter, usesHFilter;
int unscaled;
SwsFilter dummyFilter = { NULL, NULL, NULL, NULL };
int srcW = c->srcW;
int srcH = c->srcH;
int dstW = c->dstW;
int dstH = c->dstH;
int dst_stride = FFALIGN(dstW * sizeof(int16_t) + 66, 16);
int flags, cpu_flags;
enum AVPixelFormat srcFormat = c->srcFormat;
enum AVPixelFormat dstFormat = c->dstFormat;
const AVPixFmtDescriptor *desc_src;
const AVPixFmtDescriptor *desc_dst;
int ret = 0;
enum AVPixelFormat tmpFmt;
 
cpu_flags = av_get_cpu_flags();
flags = c->flags;
emms_c();
if (!rgb15to16)
sws_rgb2rgb_init();
 
unscaled = (srcW == dstW && srcH == dstH);
 
c->srcRange |= handle_jpeg(&c->srcFormat);
c->dstRange |= handle_jpeg(&c->dstFormat);
 
if (srcFormat != c->srcFormat || dstFormat != c->dstFormat)
av_log(c, AV_LOG_WARNING, "deprecated pixel format used, make sure you set the range correctly\n");
 
if (!c->contrast && !c->saturation && !c->dstFormatBpp)
sws_setColorspaceDetails(c, ff_yuv2rgb_coeffs[SWS_CS_DEFAULT], c->srcRange,
ff_yuv2rgb_coeffs[SWS_CS_DEFAULT],
c->dstRange, 0, 1 << 16, 1 << 16);
 
handle_formats(c);
srcFormat = c->srcFormat;
dstFormat = c->dstFormat;
desc_src = av_pix_fmt_desc_get(srcFormat);
desc_dst = av_pix_fmt_desc_get(dstFormat);
 
if (!(unscaled && sws_isSupportedEndiannessConversion(srcFormat) &&
av_pix_fmt_swap_endianness(srcFormat) == dstFormat)) {
if (!sws_isSupportedInput(srcFormat)) {
av_log(c, AV_LOG_ERROR, "%s is not supported as input pixel format\n",
av_get_pix_fmt_name(srcFormat));
return AVERROR(EINVAL);
}
if (!sws_isSupportedOutput(dstFormat)) {
av_log(c, AV_LOG_ERROR, "%s is not supported as output pixel format\n",
av_get_pix_fmt_name(dstFormat));
return AVERROR(EINVAL);
}
}
av_assert2(desc_src && desc_dst);
 
i = flags & (SWS_POINT |
SWS_AREA |
SWS_BILINEAR |
SWS_FAST_BILINEAR |
SWS_BICUBIC |
SWS_X |
SWS_GAUSS |
SWS_LANCZOS |
SWS_SINC |
SWS_SPLINE |
SWS_BICUBLIN);
 
/* provide a default scaler if not set by caller */
if (!i) {
flags |= SWS_BICUBIC; /* the same default regardless of up/downscaling */
c->flags = flags;
} else if (i & (i - 1)) {
av_log(c, AV_LOG_ERROR,
"Exactly one scaler algorithm must be chosen, got %X\n", i);
return AVERROR(EINVAL);
}
/* sanity check */
if (srcW < 1 || srcH < 1 || dstW < 1 || dstH < 1) {
/* FIXME check if these are enough and try to lower them after
* fixing the relevant parts of the code */
av_log(c, AV_LOG_ERROR, "%dx%d -> %dx%d is invalid scaling dimension\n",
srcW, srcH, dstW, dstH);
return AVERROR(EINVAL);
}
if (flags & SWS_FAST_BILINEAR) {
if (srcW < 8 || dstW < 8) {
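/* too narrow for the fast path: the xor clears SWS_FAST_BILINEAR
* (known to be set here) and sets SWS_BILINEAR in one operation */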
flags ^= SWS_FAST_BILINEAR | SWS_BILINEAR;
c->flags = flags;
}
}
 
if (!dstFilter)
dstFilter = &dummyFilter;
if (!srcFilter)
srcFilter = &dummyFilter;
 
c->lumXInc = (((int64_t)srcW << 16) + (dstW >> 1)) / dstW;
c->lumYInc = (((int64_t)srcH << 16) + (dstH >> 1)) / dstH;
c->dstFormatBpp = av_get_bits_per_pixel(desc_dst);
c->srcFormatBpp = av_get_bits_per_pixel(desc_src);
c->vRounder = 4 * 0x0001000100010001ULL;
 
usesVFilter = (srcFilter->lumV && srcFilter->lumV->length > 1) ||
(srcFilter->chrV && srcFilter->chrV->length > 1) ||
(dstFilter->lumV && dstFilter->lumV->length > 1) ||
(dstFilter->chrV && dstFilter->chrV->length > 1);
usesHFilter = (srcFilter->lumH && srcFilter->lumH->length > 1) ||
(srcFilter->chrH && srcFilter->chrH->length > 1) ||
(dstFilter->lumH && dstFilter->lumH->length > 1) ||
(dstFilter->chrH && dstFilter->chrH->length > 1);
 
av_pix_fmt_get_chroma_sub_sample(srcFormat, &c->chrSrcHSubSample, &c->chrSrcVSubSample);
av_pix_fmt_get_chroma_sub_sample(dstFormat, &c->chrDstHSubSample, &c->chrDstVSubSample);
 
if (isAnyRGB(dstFormat) && !(flags&SWS_FULL_CHR_H_INT)) {
if (dstW&1) {
av_log(c, AV_LOG_DEBUG, "Forcing full internal H chroma due to odd output size\n");
flags |= SWS_FULL_CHR_H_INT;
c->flags = flags;
}
 
if ( c->chrSrcHSubSample == 0
&& c->chrSrcVSubSample == 0
&& c->dither != SWS_DITHER_BAYER //SWS_FULL_CHR_H_INT is currently not supported with SWS_DITHER_BAYER
&& !(c->flags & SWS_FAST_BILINEAR)
) {
av_log(c, AV_LOG_DEBUG, "Forcing full internal H chroma due to input having non-subsampled chroma\n");
flags |= SWS_FULL_CHR_H_INT;
c->flags = flags;
}
}
 
if (c->dither == SWS_DITHER_AUTO) {
if (flags & SWS_ERROR_DIFFUSION)
c->dither = SWS_DITHER_ED;
}
 
if(dstFormat == AV_PIX_FMT_BGR4_BYTE ||
dstFormat == AV_PIX_FMT_RGB4_BYTE ||
dstFormat == AV_PIX_FMT_BGR8 ||
dstFormat == AV_PIX_FMT_RGB8) {
if (c->dither == SWS_DITHER_AUTO)
c->dither = (flags & SWS_FULL_CHR_H_INT) ? SWS_DITHER_ED : SWS_DITHER_BAYER;
if (!(flags & SWS_FULL_CHR_H_INT)) {
if (c->dither == SWS_DITHER_ED || c->dither == SWS_DITHER_A_DITHER || c->dither == SWS_DITHER_X_DITHER) {
av_log(c, AV_LOG_DEBUG,
"Desired dithering only supported in full chroma interpolation for destination format '%s'\n",
av_get_pix_fmt_name(dstFormat));
flags |= SWS_FULL_CHR_H_INT;
c->flags = flags;
}
}
if (flags & SWS_FULL_CHR_H_INT) {
if (c->dither == SWS_DITHER_BAYER) {
av_log(c, AV_LOG_DEBUG,
"Ordered dither is not supported in full chroma interpolation for destination format '%s'\n",
av_get_pix_fmt_name(dstFormat));
c->dither = SWS_DITHER_ED;
}
}
}
if (isPlanarRGB(dstFormat)) {
if (!(flags & SWS_FULL_CHR_H_INT)) {
av_log(c, AV_LOG_DEBUG,
"%s output is not supported with half chroma resolution, switching to full\n",
av_get_pix_fmt_name(dstFormat));
flags |= SWS_FULL_CHR_H_INT;
c->flags = flags;
}
}
 
/* reuse chroma for 2 pixels RGB/BGR unless user wants full
* chroma interpolation */
if (flags & SWS_FULL_CHR_H_INT &&
isAnyRGB(dstFormat) &&
!isPlanarRGB(dstFormat) &&
dstFormat != AV_PIX_FMT_RGBA64LE &&
dstFormat != AV_PIX_FMT_RGBA64BE &&
dstFormat != AV_PIX_FMT_BGRA64LE &&
dstFormat != AV_PIX_FMT_BGRA64BE &&
dstFormat != AV_PIX_FMT_RGB48LE &&
dstFormat != AV_PIX_FMT_RGB48BE &&
dstFormat != AV_PIX_FMT_BGR48LE &&
dstFormat != AV_PIX_FMT_BGR48BE &&
dstFormat != AV_PIX_FMT_RGBA &&
dstFormat != AV_PIX_FMT_ARGB &&
dstFormat != AV_PIX_FMT_BGRA &&
dstFormat != AV_PIX_FMT_ABGR &&
dstFormat != AV_PIX_FMT_RGB24 &&
dstFormat != AV_PIX_FMT_BGR24 &&
dstFormat != AV_PIX_FMT_BGR4_BYTE &&
dstFormat != AV_PIX_FMT_RGB4_BYTE &&
dstFormat != AV_PIX_FMT_BGR8 &&
dstFormat != AV_PIX_FMT_RGB8
) {
av_log(c, AV_LOG_WARNING,
"full chroma interpolation for destination format '%s' not yet implemented\n",
av_get_pix_fmt_name(dstFormat));
flags &= ~SWS_FULL_CHR_H_INT;
c->flags = flags;
}
if (isAnyRGB(dstFormat) && !(flags & SWS_FULL_CHR_H_INT))
c->chrDstHSubSample = 1;
 
// drop some chroma lines if the user wants it
c->vChrDrop = (flags & SWS_SRC_V_CHR_DROP_MASK) >>
SWS_SRC_V_CHR_DROP_SHIFT;
c->chrSrcVSubSample += c->vChrDrop;
 
/* drop every other pixel for chroma calculation unless user
* wants full chroma */
if (isAnyRGB(srcFormat) && !(flags & SWS_FULL_CHR_H_INP) &&
srcFormat != AV_PIX_FMT_RGB8 && srcFormat != AV_PIX_FMT_BGR8 &&
srcFormat != AV_PIX_FMT_RGB4 && srcFormat != AV_PIX_FMT_BGR4 &&
srcFormat != AV_PIX_FMT_RGB4_BYTE && srcFormat != AV_PIX_FMT_BGR4_BYTE &&
srcFormat != AV_PIX_FMT_GBRP9BE && srcFormat != AV_PIX_FMT_GBRP9LE &&
srcFormat != AV_PIX_FMT_GBRP10BE && srcFormat != AV_PIX_FMT_GBRP10LE &&
srcFormat != AV_PIX_FMT_GBRP12BE && srcFormat != AV_PIX_FMT_GBRP12LE &&
srcFormat != AV_PIX_FMT_GBRP14BE && srcFormat != AV_PIX_FMT_GBRP14LE &&
srcFormat != AV_PIX_FMT_GBRP16BE && srcFormat != AV_PIX_FMT_GBRP16LE &&
((dstW >> c->chrDstHSubSample) <= (srcW >> 1) ||
(flags & SWS_FAST_BILINEAR)))
c->chrSrcHSubSample = 1;
 
// Note the FF_CEIL_RSHIFT is so that we always round toward +inf.
c->chrSrcW = FF_CEIL_RSHIFT(srcW, c->chrSrcHSubSample);
c->chrSrcH = FF_CEIL_RSHIFT(srcH, c->chrSrcVSubSample);
c->chrDstW = FF_CEIL_RSHIFT(dstW, c->chrDstHSubSample);
c->chrDstH = FF_CEIL_RSHIFT(dstH, c->chrDstVSubSample);
 
FF_ALLOCZ_OR_GOTO(c, c->formatConvBuffer, FFALIGN(srcW*2+78, 16) * 2, fail);
 
c->srcBpc = 1 + desc_src->comp[0].depth_minus1;
if (c->srcBpc < 8)
c->srcBpc = 8;
c->dstBpc = 1 + desc_dst->comp[0].depth_minus1;
if (c->dstBpc < 8)
c->dstBpc = 8;
if (isAnyRGB(srcFormat) || srcFormat == AV_PIX_FMT_PAL8)
c->srcBpc = 16;
if (c->dstBpc == 16)
dst_stride <<= 1;
 
if (INLINE_MMXEXT(cpu_flags) && c->srcBpc == 8 && c->dstBpc <= 14) {
c->canMMXEXTBeUsed = dstW >= srcW && (dstW & 31) == 0 &&
c->chrDstW >= c->chrSrcW &&
(srcW & 15) == 0;
if (!c->canMMXEXTBeUsed && dstW >= srcW && c->chrDstW >= c->chrSrcW && (srcW & 15) == 0
&& (flags & SWS_FAST_BILINEAR)) {
if (flags & SWS_PRINT_INFO)
av_log(c, AV_LOG_INFO,
"output width is not a multiple of 32 -> no MMXEXT scaler\n");
}
if (usesHFilter || isNBPS(c->srcFormat) || is16BPS(c->srcFormat) || isAnyRGB(c->srcFormat))
c->canMMXEXTBeUsed = 0;
} else
c->canMMXEXTBeUsed = 0;
 
c->chrXInc = (((int64_t)c->chrSrcW << 16) + (c->chrDstW >> 1)) / c->chrDstW;
c->chrYInc = (((int64_t)c->chrSrcH << 16) + (c->chrDstH >> 1)) / c->chrDstH;
 
/* Match pixel 0 of the src to pixel 0 of dst and match pixel n-2 of src
* to pixel n-2 of dst, but only for the FAST_BILINEAR mode otherwise do
* correct scaling.
* n-2 is the last chrominance sample available.
* This is not perfect, but no one should notice the difference, the more
* correct variant would be like the vertical one, but that would require
* some special code for the first and last pixel */
if (flags & SWS_FAST_BILINEAR) {
if (c->canMMXEXTBeUsed) {
c->lumXInc += 20;
c->chrXInc += 20;
}
// we don't use the x86 asm scaler if MMX is available
else if (INLINE_MMX(cpu_flags) && c->dstBpc <= 14) {
c->lumXInc = ((int64_t)(srcW - 2) << 16) / (dstW - 2) - 20;
c->chrXInc = ((int64_t)(c->chrSrcW - 2) << 16) / (c->chrDstW - 2) - 20;
}
}
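/* The *Inc values are 16.16 fixed-point source steps per output pixel,
* rounded to nearest. A hypothetical worked example (numbers not from
* this build): srcW = 640, dstW = 1280 gives
* lumXInc = ((640 << 16) + 640) / 1280 = 32768,
* i.e. 0.5 source pixels advanced per destination pixel. */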
 
// hardcoded for now
c->gamma_value = 2.2;
tmpFmt = AV_PIX_FMT_RGBA64LE;
 
 
if (!unscaled && c->gamma_flag && (srcFormat != tmpFmt || dstFormat != tmpFmt)) {
SwsContext *c2;
c->cascaded_context[0] = NULL;
 
ret = av_image_alloc(c->cascaded_tmp, c->cascaded_tmpStride,
srcW, srcH, tmpFmt, 64);
if (ret < 0)
return ret;
 
c->cascaded_context[0] = sws_getContext(srcW, srcH, srcFormat,
srcW, srcH, tmpFmt,
flags, NULL, NULL, c->param);
if (!c->cascaded_context[0]) {
return -1;
}
 
c->cascaded_context[1] = sws_getContext(srcW, srcH, tmpFmt,
dstW, dstH, tmpFmt,
flags, srcFilter, dstFilter, c->param);
 
if (!c->cascaded_context[1])
return -1;
 
c2 = c->cascaded_context[1];
c2->is_internal_gamma = 1;
c2->gamma = alloc_gamma_tbl( c->gamma_value);
c2->inv_gamma = alloc_gamma_tbl(1.f/c->gamma_value);
if (!c2->gamma || !c2->inv_gamma)
return AVERROR(ENOMEM);
 
// is_internal_gamma is set only after the context was created, so the
// filter chain must be rebuilt for the gamma-convert FilterDescriptor
// to be set up properly
ff_free_filters(c2);
if (ff_init_filters(c2) < 0) {
sws_freeContext(c2);
return -1;
}
 
c->cascaded_context[2] = NULL;
if (dstFormat != tmpFmt) {
ret = av_image_alloc(c->cascaded1_tmp, c->cascaded1_tmpStride,
dstW, dstH, tmpFmt, 64);
if (ret < 0)
return ret;
 
c->cascaded_context[2] = sws_getContext(dstW, dstH, tmpFmt,
dstW, dstH, dstFormat,
flags, NULL, NULL, c->param);
if (!c->cascaded_context[2])
return -1;
}
return 0;
}
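/* Gamma-aware scaling is thus a cascade of up to three contexts:
* [0] srcFormat -> RGBA64LE (plain conversion),
* [1] RGBA64LE -> RGBA64LE scaling, with the gamma/inv_gamma tables
* applied around the scaler (is_internal_gamma),
* [2] RGBA64LE -> dstFormat, created only when dstFormat differs. */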
 
if (isBayer(srcFormat)) {
if (!unscaled ||
(dstFormat != AV_PIX_FMT_RGB24 && dstFormat != AV_PIX_FMT_YUV420P)) {
enum AVPixelFormat tmpFormat = AV_PIX_FMT_RGB24;
 
ret = av_image_alloc(c->cascaded_tmp, c->cascaded_tmpStride,
srcW, srcH, tmpFormat, 64);
if (ret < 0)
return ret;
 
c->cascaded_context[0] = sws_getContext(srcW, srcH, srcFormat,
srcW, srcH, tmpFormat,
flags, srcFilter, NULL, c->param);
if (!c->cascaded_context[0])
return -1;
 
c->cascaded_context[1] = sws_getContext(srcW, srcH, tmpFormat,
dstW, dstH, dstFormat,
flags, NULL, dstFilter, c->param);
if (!c->cascaded_context[1])
return -1;
return 0;
}
}
 
if (CONFIG_SWSCALE_ALPHA && isALPHA(srcFormat) && !isALPHA(dstFormat)) {
enum AVPixelFormat tmpFormat = alphaless_fmt(srcFormat);
 
if (tmpFormat != AV_PIX_FMT_NONE && c->alphablend != SWS_ALPHA_BLEND_NONE)
if (!unscaled ||
dstFormat != tmpFormat ||
usesHFilter || usesVFilter ||
c->srcRange != c->dstRange
) {
ret = av_image_alloc(c->cascaded_tmp, c->cascaded_tmpStride,
srcW, srcH, tmpFormat, 64);
if (ret < 0)
return ret;
 
c->cascaded_context[0] = sws_alloc_set_opts(srcW, srcH, srcFormat,
srcW, srcH, tmpFormat,
flags, c->param);
if (!c->cascaded_context[0])
return -1;
c->cascaded_context[0]->alphablend = c->alphablend;
ret = sws_init_context(c->cascaded_context[0], NULL , NULL);
if (ret < 0)
return ret;
 
c->cascaded_context[1] = sws_getContext(srcW, srcH, tmpFormat,
dstW, dstH, dstFormat,
flags, srcFilter, dstFilter, c->param);
if (!c->cascaded_context[1])
return -1;
return 0;
}
}
 
#define USE_MMAP (HAVE_MMAP && HAVE_MPROTECT && defined MAP_ANONYMOUS)
 
/* precalculate horizontal scaler filter coefficients */
{
#if HAVE_MMXEXT_INLINE
// can't downscale !!!
if (c->canMMXEXTBeUsed && (flags & SWS_FAST_BILINEAR)) {
c->lumMmxextFilterCodeSize = ff_init_hscaler_mmxext(dstW, c->lumXInc, NULL,
NULL, NULL, 8);
c->chrMmxextFilterCodeSize = ff_init_hscaler_mmxext(c->chrDstW, c->chrXInc,
NULL, NULL, NULL, 4);
 
#if USE_MMAP
c->lumMmxextFilterCode = mmap(NULL, c->lumMmxextFilterCodeSize,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1, 0);
c->chrMmxextFilterCode = mmap(NULL, c->chrMmxextFilterCodeSize,
PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS,
-1, 0);
#elif HAVE_VIRTUALALLOC
c->lumMmxextFilterCode = VirtualAlloc(NULL,
c->lumMmxextFilterCodeSize,
MEM_COMMIT,
PAGE_EXECUTE_READWRITE);
c->chrMmxextFilterCode = VirtualAlloc(NULL,
c->chrMmxextFilterCodeSize,
MEM_COMMIT,
PAGE_EXECUTE_READWRITE);
#else
c->lumMmxextFilterCode = av_malloc(c->lumMmxextFilterCodeSize);
c->chrMmxextFilterCode = av_malloc(c->chrMmxextFilterCodeSize);
#endif
 
#ifdef MAP_ANONYMOUS
if (c->lumMmxextFilterCode == MAP_FAILED || c->chrMmxextFilterCode == MAP_FAILED)
#else
if (!c->lumMmxextFilterCode || !c->chrMmxextFilterCode)
#endif
{
av_log(c, AV_LOG_ERROR, "Failed to allocate MMX2FilterCode\n");
return AVERROR(ENOMEM);
}
 
FF_ALLOCZ_OR_GOTO(c, c->hLumFilter, (dstW / 8 + 8) * sizeof(int16_t), fail);
FF_ALLOCZ_OR_GOTO(c, c->hChrFilter, (c->chrDstW / 4 + 8) * sizeof(int16_t), fail);
FF_ALLOCZ_OR_GOTO(c, c->hLumFilterPos, (dstW / 2 / 8 + 8) * sizeof(int32_t), fail);
FF_ALLOCZ_OR_GOTO(c, c->hChrFilterPos, (c->chrDstW / 2 / 4 + 8) * sizeof(int32_t), fail);
 
ff_init_hscaler_mmxext( dstW, c->lumXInc, c->lumMmxextFilterCode,
c->hLumFilter, (uint32_t*)c->hLumFilterPos, 8);
ff_init_hscaler_mmxext(c->chrDstW, c->chrXInc, c->chrMmxextFilterCode,
c->hChrFilter, (uint32_t*)c->hChrFilterPos, 4);
 
#if USE_MMAP
if ( mprotect(c->lumMmxextFilterCode, c->lumMmxextFilterCodeSize, PROT_EXEC | PROT_READ) == -1
|| mprotect(c->chrMmxextFilterCode, c->chrMmxextFilterCodeSize, PROT_EXEC | PROT_READ) == -1) {
av_log(c, AV_LOG_ERROR, "mprotect failed, cannot use fast bilinear scaler\n");
goto fail;
}
#endif
} else
#endif /* HAVE_MMXEXT_INLINE */
{
const int filterAlign = X86_MMX(cpu_flags) ? 4 :
PPC_ALTIVEC(cpu_flags) ? 8 : 1;
 
if ((ret = initFilter(&c->hLumFilter, &c->hLumFilterPos,
&c->hLumFilterSize, c->lumXInc,
srcW, dstW, filterAlign, 1 << 14,
(flags & SWS_BICUBLIN) ? (flags | SWS_BICUBIC) : flags,
cpu_flags, srcFilter->lumH, dstFilter->lumH,
c->param,
get_local_pos(c, 0, 0, 0),
get_local_pos(c, 0, 0, 0))) < 0)
goto fail;
if ((ret = initFilter(&c->hChrFilter, &c->hChrFilterPos,
&c->hChrFilterSize, c->chrXInc,
c->chrSrcW, c->chrDstW, filterAlign, 1 << 14,
(flags & SWS_BICUBLIN) ? (flags | SWS_BILINEAR) : flags,
cpu_flags, srcFilter->chrH, dstFilter->chrH,
c->param,
get_local_pos(c, c->chrSrcHSubSample, c->src_h_chr_pos, 0),
get_local_pos(c, c->chrDstHSubSample, c->dst_h_chr_pos, 0))) < 0)
goto fail;
}
} // initialize horizontal stuff
 
/* precalculate vertical scaler filter coefficients */
{
const int filterAlign = X86_MMX(cpu_flags) ? 2 :
PPC_ALTIVEC(cpu_flags) ? 8 : 1;
 
if ((ret = initFilter(&c->vLumFilter, &c->vLumFilterPos, &c->vLumFilterSize,
c->lumYInc, srcH, dstH, filterAlign, (1 << 12),
(flags & SWS_BICUBLIN) ? (flags | SWS_BICUBIC) : flags,
cpu_flags, srcFilter->lumV, dstFilter->lumV,
c->param,
get_local_pos(c, 0, 0, 1),
get_local_pos(c, 0, 0, 1))) < 0)
goto fail;
if ((ret = initFilter(&c->vChrFilter, &c->vChrFilterPos, &c->vChrFilterSize,
c->chrYInc, c->chrSrcH, c->chrDstH,
filterAlign, (1 << 12),
(flags & SWS_BICUBLIN) ? (flags | SWS_BILINEAR) : flags,
cpu_flags, srcFilter->chrV, dstFilter->chrV,
c->param,
get_local_pos(c, c->chrSrcVSubSample, c->src_v_chr_pos, 1),
get_local_pos(c, c->chrDstVSubSample, c->dst_v_chr_pos, 1))) < 0)
goto fail;
 
#if HAVE_ALTIVEC
FF_ALLOC_OR_GOTO(c, c->vYCoeffsBank, sizeof(vector signed short) * c->vLumFilterSize * c->dstH, fail);
FF_ALLOC_OR_GOTO(c, c->vCCoeffsBank, sizeof(vector signed short) * c->vChrFilterSize * c->chrDstH, fail);
 
for (i = 0; i < c->vLumFilterSize * c->dstH; i++) {
int j;
short *p = (short *)&c->vYCoeffsBank[i];
for (j = 0; j < 8; j++)
p[j] = c->vLumFilter[i];
}
 
for (i = 0; i < c->vChrFilterSize * c->chrDstH; i++) {
int j;
short *p = (short *)&c->vCCoeffsBank[i];
for (j = 0; j < 8; j++)
p[j] = c->vChrFilter[i];
}
#endif
}
 
// calculate buffer sizes so that they won't run out while handling these damn slices
c->vLumBufSize = c->vLumFilterSize;
c->vChrBufSize = c->vChrFilterSize;
for (i = 0; i < dstH; i++) {
int chrI = (int64_t)i * c->chrDstH / dstH;
int nextSlice = FFMAX(c->vLumFilterPos[i] + c->vLumFilterSize - 1,
((c->vChrFilterPos[chrI] + c->vChrFilterSize - 1)
<< c->chrSrcVSubSample));
 
nextSlice >>= c->chrSrcVSubSample;
nextSlice <<= c->chrSrcVSubSample;
if (c->vLumFilterPos[i] + c->vLumBufSize < nextSlice)
c->vLumBufSize = nextSlice - c->vLumFilterPos[i];
if (c->vChrFilterPos[chrI] + c->vChrBufSize <
(nextSlice >> c->chrSrcVSubSample))
c->vChrBufSize = (nextSlice >> c->chrSrcVSubSample) -
c->vChrFilterPos[chrI];
}
 
for (i = 0; i < 4; i++)
FF_ALLOCZ_OR_GOTO(c, c->dither_error[i], (c->dstW+2) * sizeof(int), fail);
 
/* Allocate pixbufs (we use dynamic allocation because otherwise we would
* need to allocate several megabytes to handle all possible cases) */
FF_ALLOCZ_OR_GOTO(c, c->lumPixBuf, c->vLumBufSize * 3 * sizeof(int16_t *), fail);
FF_ALLOCZ_OR_GOTO(c, c->chrUPixBuf, c->vChrBufSize * 3 * sizeof(int16_t *), fail);
FF_ALLOCZ_OR_GOTO(c, c->chrVPixBuf, c->vChrBufSize * 3 * sizeof(int16_t *), fail);
if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat) && isALPHA(c->dstFormat))
FF_ALLOCZ_OR_GOTO(c, c->alpPixBuf, c->vLumBufSize * 3 * sizeof(int16_t *), fail);
/* Note we need at least one pixel more at the end because of the MMX code
* (just in case someone wants to replace the 4000/8000). */
/* align at 16 bytes for AltiVec */
for (i = 0; i < c->vLumBufSize; i++) {
FF_ALLOCZ_OR_GOTO(c, c->lumPixBuf[i + c->vLumBufSize],
dst_stride + 16, fail);
c->lumPixBuf[i] = c->lumPixBuf[i + c->vLumBufSize];
}
// 64 / c->scalingBpp is the same as 16 / sizeof(scaling_intermediate)
c->uv_off = (dst_stride>>1) + 64 / (c->dstBpc &~ 7);
c->uv_offx2 = dst_stride + 16;
for (i = 0; i < c->vChrBufSize; i++) {
FF_ALLOC_OR_GOTO(c, c->chrUPixBuf[i + c->vChrBufSize],
dst_stride * 2 + 32, fail);
c->chrUPixBuf[i] = c->chrUPixBuf[i + c->vChrBufSize];
c->chrVPixBuf[i] = c->chrVPixBuf[i + c->vChrBufSize]
= c->chrUPixBuf[i] + (dst_stride >> 1) + 8;
}
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf)
for (i = 0; i < c->vLumBufSize; i++) {
FF_ALLOCZ_OR_GOTO(c, c->alpPixBuf[i + c->vLumBufSize],
dst_stride + 16, fail);
c->alpPixBuf[i] = c->alpPixBuf[i + c->vLumBufSize];
}
 
// try to avoid drawing green stuff between the right end and the stride end
for (i = 0; i < c->vChrBufSize; i++)
if(desc_dst->comp[0].depth_minus1 == 15){
av_assert0(c->dstBpc > 14);
for(j=0; j<dst_stride/2+1; j++)
((int32_t*)(c->chrUPixBuf[i]))[j] = 1<<18;
} else
for(j=0; j<dst_stride+1; j++)
((int16_t*)(c->chrUPixBuf[i]))[j] = 1<<14;
 
av_assert0(c->chrDstH <= dstH);
 
if (flags & SWS_PRINT_INFO) {
const char *scaler = NULL, *cpucaps;
 
for (i = 0; i < FF_ARRAY_ELEMS(scale_algorithms); i++) {
if (flags & scale_algorithms[i].flag) {
scaler = scale_algorithms[i].description;
break;
}
}
if (!scaler)
scaler = "ehh flags invalid?!";
av_log(c, AV_LOG_INFO, "%s scaler, from %s to %s%s ",
scaler,
av_get_pix_fmt_name(srcFormat),
#ifdef DITHER1XBPP
dstFormat == AV_PIX_FMT_BGR555 || dstFormat == AV_PIX_FMT_BGR565 ||
dstFormat == AV_PIX_FMT_RGB444BE || dstFormat == AV_PIX_FMT_RGB444LE ||
dstFormat == AV_PIX_FMT_BGR444BE || dstFormat == AV_PIX_FMT_BGR444LE ?
"dithered " : "",
#else
"",
#endif
av_get_pix_fmt_name(dstFormat));
 
if (INLINE_MMXEXT(cpu_flags))
cpucaps = "MMXEXT";
else if (INLINE_AMD3DNOW(cpu_flags))
cpucaps = "3DNOW";
else if (INLINE_MMX(cpu_flags))
cpucaps = "MMX";
else if (PPC_ALTIVEC(cpu_flags))
cpucaps = "AltiVec";
else
cpucaps = "C";
 
av_log(c, AV_LOG_INFO, "using %s\n", cpucaps);
 
av_log(c, AV_LOG_VERBOSE, "%dx%d -> %dx%d\n", srcW, srcH, dstW, dstH);
av_log(c, AV_LOG_DEBUG,
"lum srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
c->srcW, c->srcH, c->dstW, c->dstH, c->lumXInc, c->lumYInc);
av_log(c, AV_LOG_DEBUG,
"chr srcW=%d srcH=%d dstW=%d dstH=%d xInc=%d yInc=%d\n",
c->chrSrcW, c->chrSrcH, c->chrDstW, c->chrDstH,
c->chrXInc, c->chrYInc);
}
 
/* alpha-blend special case; note this path has been split off via cascaded contexts if it is scaled */
if (unscaled && !usesHFilter && !usesVFilter &&
c->alphablend != SWS_ALPHA_BLEND_NONE &&
isALPHA(srcFormat) &&
(c->srcRange == c->dstRange || isAnyRGB(dstFormat)) &&
alphaless_fmt(srcFormat) == dstFormat
) {
c->swscale = ff_sws_alphablendaway;
 
if (flags & SWS_PRINT_INFO)
av_log(c, AV_LOG_INFO,
"using alpha blendaway %s -> %s special converter\n",
av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat));
return 0;
}
 
/* unscaled special cases */
if (unscaled && !usesHFilter && !usesVFilter &&
(c->srcRange == c->dstRange || isAnyRGB(dstFormat))) {
ff_get_unscaled_swscale(c);
 
if (c->swscale) {
if (flags & SWS_PRINT_INFO)
av_log(c, AV_LOG_INFO,
"using unscaled %s -> %s special converter\n",
av_get_pix_fmt_name(srcFormat), av_get_pix_fmt_name(dstFormat));
return 0;
}
}
 
c->swscale = ff_getSwsFunc(c);
return ff_init_filters(c);
fail: // FIXME replace these with appropriate error codes
if (ret == RETCODE_USE_CASCADE) {
int tmpW = sqrt(srcW * (int64_t)dstW);
int tmpH = sqrt(srcH * (int64_t)dstH);
enum AVPixelFormat tmpFormat = AV_PIX_FMT_YUV420P;
 
if (isALPHA(srcFormat))
tmpFormat = AV_PIX_FMT_YUVA420P;
 
if (srcW*(int64_t)srcH <= 4LL*dstW*dstH)
return AVERROR(EINVAL);
 
ret = av_image_alloc(c->cascaded_tmp, c->cascaded_tmpStride,
tmpW, tmpH, tmpFormat, 64);
if (ret < 0)
return ret;
 
c->cascaded_context[0] = sws_getContext(srcW, srcH, srcFormat,
tmpW, tmpH, tmpFormat,
flags, srcFilter, NULL, c->param);
if (!c->cascaded_context[0])
return -1;
 
c->cascaded_context[1] = sws_getContext(tmpW, tmpH, tmpFormat,
dstW, dstH, dstFormat,
flags, NULL, dstFilter, c->param);
if (!c->cascaded_context[1])
return -1;
return 0;
}
return -1;
}
 
SwsContext *sws_alloc_set_opts(int srcW, int srcH, enum AVPixelFormat srcFormat,
int dstW, int dstH, enum AVPixelFormat dstFormat,
int flags, const double *param)
{
SwsContext *c;
 
if (!(c = sws_alloc_context()))
return NULL;
 
c->flags = flags;
c->srcW = srcW;
c->srcH = srcH;
c->dstW = dstW;
c->dstH = dstH;
c->srcFormat = srcFormat;
c->dstFormat = dstFormat;
 
if (param) {
c->param[0] = param[0];
c->param[1] = param[1];
}
 
return c;
}
 
SwsContext *sws_getContext(int srcW, int srcH, enum AVPixelFormat srcFormat,
int dstW, int dstH, enum AVPixelFormat dstFormat,
int flags, SwsFilter *srcFilter,
SwsFilter *dstFilter, const double *param)
{
SwsContext *c;
 
c = sws_alloc_set_opts(srcW, srcH, srcFormat,
dstW, dstH, dstFormat,
flags, param);
if (!c)
return NULL;
 
if (sws_init_context(c, srcFilter, dstFilter) < 0) {
sws_freeContext(c);
return NULL;
}
 
return c;
}
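/* Typical lifecycle of a context built with sws_getContext(); an
* illustrative sketch with made-up dimensions and buffer names, not code
* from this file:
*
* SwsContext *ctx = sws_getContext(1920, 1080, AV_PIX_FMT_YUV420P,
* 1280, 720, AV_PIX_FMT_RGB24,
* SWS_BICUBIC, NULL, NULL, NULL);
* if (ctx) {
* sws_scale(ctx, (const uint8_t * const *)src_data, src_linesize,
* 0, 1080, dst_data, dst_linesize);
* sws_freeContext(ctx);
* }
*/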
 
static int isnan_vec(SwsVector *a)
{
int i;
for (i=0; i<a->length; i++)
if (isnan(a->coeff[i]))
return 1;
return 0;
}
 
static void makenan_vec(SwsVector *a)
{
int i;
for (i=0; i<a->length; i++)
a->coeff[i] = NAN;
}
 
SwsFilter *sws_getDefaultFilter(float lumaGBlur, float chromaGBlur,
float lumaSharpen, float chromaSharpen,
float chromaHShift, float chromaVShift,
int verbose)
{
SwsFilter *filter = av_malloc(sizeof(SwsFilter));
if (!filter)
return NULL;
 
if (lumaGBlur != 0.0) {
filter->lumH = sws_getGaussianVec(lumaGBlur, 3.0);
filter->lumV = sws_getGaussianVec(lumaGBlur, 3.0);
} else {
filter->lumH = sws_getIdentityVec();
filter->lumV = sws_getIdentityVec();
}
 
if (chromaGBlur != 0.0) {
filter->chrH = sws_getGaussianVec(chromaGBlur, 3.0);
filter->chrV = sws_getGaussianVec(chromaGBlur, 3.0);
} else {
filter->chrH = sws_getIdentityVec();
filter->chrV = sws_getIdentityVec();
}
 
if (!filter->lumH || !filter->lumV || !filter->chrH || !filter->chrV)
goto fail;
 
if (chromaSharpen != 0.0) {
SwsVector *id = sws_getIdentityVec();
if (!id)
goto fail;
sws_scaleVec(filter->chrH, -chromaSharpen);
sws_scaleVec(filter->chrV, -chromaSharpen);
sws_addVec(filter->chrH, id);
sws_addVec(filter->chrV, id);
sws_freeVec(id);
}
 
if (lumaSharpen != 0.0) {
SwsVector *id = sws_getIdentityVec();
if (!id)
goto fail;
sws_scaleVec(filter->lumH, -lumaSharpen);
sws_scaleVec(filter->lumV, -lumaSharpen);
sws_addVec(filter->lumH, id);
sws_addVec(filter->lumV, id);
sws_freeVec(id);
}
 
if (chromaHShift != 0.0)
sws_shiftVec(filter->chrH, (int)(chromaHShift + 0.5));
 
if (chromaVShift != 0.0)
sws_shiftVec(filter->chrV, (int)(chromaVShift + 0.5));
 
sws_normalizeVec(filter->chrH, 1.0);
sws_normalizeVec(filter->chrV, 1.0);
sws_normalizeVec(filter->lumH, 1.0);
sws_normalizeVec(filter->lumV, 1.0);
 
if (isnan_vec(filter->chrH) ||
isnan_vec(filter->chrV) ||
isnan_vec(filter->lumH) ||
isnan_vec(filter->lumV))
goto fail;
 
if (verbose)
sws_printVec2(filter->chrH, NULL, AV_LOG_DEBUG);
if (verbose)
sws_printVec2(filter->lumH, NULL, AV_LOG_DEBUG);
 
return filter;
 
fail:
sws_freeVec(filter->lumH);
sws_freeVec(filter->lumV);
sws_freeVec(filter->chrH);
sws_freeVec(filter->chrV);
av_freep(&filter);
return NULL;
}
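/* The sharpening above is a classic unsharp mask: sws_scaleVec(v, -s)
* followed by sws_addVec(v, id) yields v' = id - s * v, i.e. the identity
* minus a scaled blur kernel; sws_normalizeVec() then restores unit DC
* gain so overall brightness is preserved. */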
 
SwsVector *sws_allocVec(int length)
{
SwsVector *vec;
 
if (length <= 0 || length > INT_MAX / sizeof(double))
return NULL;
 
vec = av_malloc(sizeof(SwsVector));
if (!vec)
return NULL;
vec->length = length;
vec->coeff = av_malloc(sizeof(double) * length);
if (!vec->coeff)
av_freep(&vec);
return vec;
}
 
SwsVector *sws_getGaussianVec(double variance, double quality)
{
const int length = (int)(variance * quality + 0.5) | 1;
int i;
double middle = (length - 1) * 0.5;
SwsVector *vec;
 
if(variance < 0 || quality < 0)
return NULL;
 
vec = sws_allocVec(length);
 
if (!vec)
return NULL;
 
for (i = 0; i < length; i++) {
double dist = i - middle;
vec->coeff[i] = exp(-dist * dist / (2 * variance * variance)) /
sqrt(2 * variance * M_PI);
}
 
sws_normalizeVec(vec, 1.0);
 
return vec;
}
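/* Worked example (hypothetical inputs, shown for clarity): variance = 1.0,
* quality = 3.0 gives length = (int)(3.0 + 0.5) | 1 = 3 and middle = 1,
* so the raw taps are exp(-1/2), exp(0), exp(-1/2) (the common
* 1/sqrt(2*pi*variance^2) factor cancels in normalization); after
* sws_normalizeVec() the taps are about 0.2741, 0.4519, 0.2741. */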
 
SwsVector *sws_getConstVec(double c, int length)
{
int i;
SwsVector *vec = sws_allocVec(length);
 
if (!vec)
return NULL;
 
for (i = 0; i < length; i++)
vec->coeff[i] = c;
 
return vec;
}
 
SwsVector *sws_getIdentityVec(void)
{
return sws_getConstVec(1.0, 1);
}
 
static double sws_dcVec(SwsVector *a)
{
int i;
double sum = 0;
 
for (i = 0; i < a->length; i++)
sum += a->coeff[i];
 
return sum;
}
 
void sws_scaleVec(SwsVector *a, double scalar)
{
int i;
 
for (i = 0; i < a->length; i++)
a->coeff[i] *= scalar;
}
 
void sws_normalizeVec(SwsVector *a, double height)
{
sws_scaleVec(a, height / sws_dcVec(a));
}
 
static SwsVector *sws_getConvVec(SwsVector *a, SwsVector *b)
{
int length = a->length + b->length - 1;
int i, j;
SwsVector *vec = sws_getConstVec(0.0, length);
 
if (!vec)
return NULL;
 
for (i = 0; i < a->length; i++) {
for (j = 0; j < b->length; j++) {
vec->coeff[i + j] += a->coeff[i] * b->coeff[j];
}
}
 
return vec;
}
 
static SwsVector *sws_sumVec(SwsVector *a, SwsVector *b)
{
int length = FFMAX(a->length, b->length);
int i;
SwsVector *vec = sws_getConstVec(0.0, length);
 
if (!vec)
return NULL;
 
for (i = 0; i < a->length; i++)
vec->coeff[i + (length - 1) / 2 - (a->length - 1) / 2] += a->coeff[i];
for (i = 0; i < b->length; i++)
vec->coeff[i + (length - 1) / 2 - (b->length - 1) / 2] += b->coeff[i];
 
return vec;
}
 
static SwsVector *sws_diffVec(SwsVector *a, SwsVector *b)
{
int length = FFMAX(a->length, b->length);
int i;
SwsVector *vec = sws_getConstVec(0.0, length);
 
if (!vec)
return NULL;
 
for (i = 0; i < a->length; i++)
vec->coeff[i + (length - 1) / 2 - (a->length - 1) / 2] += a->coeff[i];
for (i = 0; i < b->length; i++)
vec->coeff[i + (length - 1) / 2 - (b->length - 1) / 2] -= b->coeff[i];
 
return vec;
}
 
/* shift the vector to the left, or to the right if "shift" is negative */
static SwsVector *sws_getShiftedVec(SwsVector *a, int shift)
{
int length = a->length + FFABS(shift) * 2;
int i;
SwsVector *vec = sws_getConstVec(0.0, length);
 
if (!vec)
return NULL;
 
for (i = 0; i < a->length; i++) {
vec->coeff[i + (length - 1) / 2 -
(a->length - 1) / 2 - shift] = a->coeff[i];
}
 
return vec;
}
 
void sws_shiftVec(SwsVector *a, int shift)
{
SwsVector *shifted = sws_getShiftedVec(a, shift);
if (!shifted) {
makenan_vec(a);
return;
}
av_free(a->coeff);
a->coeff = shifted->coeff;
a->length = shifted->length;
av_free(shifted);
}
 
void sws_addVec(SwsVector *a, SwsVector *b)
{
SwsVector *sum = sws_sumVec(a, b);
if (!sum) {
makenan_vec(a);
return;
}
av_free(a->coeff);
a->coeff = sum->coeff;
a->length = sum->length;
av_free(sum);
}
 
void sws_subVec(SwsVector *a, SwsVector *b)
{
SwsVector *diff = sws_diffVec(a, b);
if (!diff) {
makenan_vec(a);
return;
}
av_free(a->coeff);
a->coeff = diff->coeff;
a->length = diff->length;
av_free(diff);
}
 
void sws_convVec(SwsVector *a, SwsVector *b)
{
SwsVector *conv = sws_getConvVec(a, b);
if (!conv) {
makenan_vec(a);
return;
}
av_free(a->coeff);
a->coeff = conv->coeff;
a->length = conv->length;
av_free(conv);
}
 
SwsVector *sws_cloneVec(SwsVector *a)
{
SwsVector *vec = sws_allocVec(a->length);
 
if (!vec)
return NULL;
 
memcpy(vec->coeff, a->coeff, a->length * sizeof(*a->coeff));
 
return vec;
}
 
void sws_printVec2(SwsVector *a, AVClass *log_ctx, int log_level)
{
int i;
double max = 0;
double min = 0;
double range;
 
for (i = 0; i < a->length; i++)
if (a->coeff[i] > max)
max = a->coeff[i];
 
for (i = 0; i < a->length; i++)
if (a->coeff[i] < min)
min = a->coeff[i];
 
range = max - min;
 
for (i = 0; i < a->length; i++) {
int x = (int)((a->coeff[i] - min) * 60.0 / range + 0.5);
av_log(log_ctx, log_level, "%1.3f ", a->coeff[i]);
for (; x > 0; x--)
av_log(log_ctx, log_level, " ");
av_log(log_ctx, log_level, "|\n");
}
}
 
void sws_freeVec(SwsVector *a)
{
if (!a)
return;
av_freep(&a->coeff);
a->length = 0;
av_free(a);
}
 
void sws_freeFilter(SwsFilter *filter)
{
if (!filter)
return;
 
sws_freeVec(filter->lumH);
sws_freeVec(filter->lumV);
sws_freeVec(filter->chrH);
sws_freeVec(filter->chrV);
av_free(filter);
}
 
void sws_freeContext(SwsContext *c)
{
int i;
if (!c)
return;
 
if (c->lumPixBuf) {
for (i = 0; i < c->vLumBufSize; i++)
av_freep(&c->lumPixBuf[i]);
av_freep(&c->lumPixBuf);
}
 
if (c->chrUPixBuf) {
for (i = 0; i < c->vChrBufSize; i++)
av_freep(&c->chrUPixBuf[i]);
av_freep(&c->chrUPixBuf);
av_freep(&c->chrVPixBuf);
}
 
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
for (i = 0; i < c->vLumBufSize; i++)
av_freep(&c->alpPixBuf[i]);
av_freep(&c->alpPixBuf);
}
 
for (i = 0; i < 4; i++)
av_freep(&c->dither_error[i]);
 
av_freep(&c->vLumFilter);
av_freep(&c->vChrFilter);
av_freep(&c->hLumFilter);
av_freep(&c->hChrFilter);
#if HAVE_ALTIVEC
av_freep(&c->vYCoeffsBank);
av_freep(&c->vCCoeffsBank);
#endif
 
av_freep(&c->vLumFilterPos);
av_freep(&c->vChrFilterPos);
av_freep(&c->hLumFilterPos);
av_freep(&c->hChrFilterPos);
 
#if HAVE_MMX_INLINE
#if USE_MMAP
if (c->lumMmxextFilterCode)
munmap(c->lumMmxextFilterCode, c->lumMmxextFilterCodeSize);
if (c->chrMmxextFilterCode)
munmap(c->chrMmxextFilterCode, c->chrMmxextFilterCodeSize);
#elif HAVE_VIRTUALALLOC
if (c->lumMmxextFilterCode)
VirtualFree(c->lumMmxextFilterCode, 0, MEM_RELEASE);
if (c->chrMmxextFilterCode)
VirtualFree(c->chrMmxextFilterCode, 0, MEM_RELEASE);
#else
av_free(c->lumMmxextFilterCode);
av_free(c->chrMmxextFilterCode);
#endif
c->lumMmxextFilterCode = NULL;
c->chrMmxextFilterCode = NULL;
#endif /* HAVE_MMX_INLINE */
 
av_freep(&c->yuvTable);
av_freep(&c->formatConvBuffer);
 
sws_freeContext(c->cascaded_context[0]);
sws_freeContext(c->cascaded_context[1]);
sws_freeContext(c->cascaded_context[2]);
memset(c->cascaded_context, 0, sizeof(c->cascaded_context));
av_freep(&c->cascaded_tmp[0]);
av_freep(&c->cascaded1_tmp[0]);
 
av_freep(&c->gamma);
av_freep(&c->inv_gamma);
 
ff_free_filters(c);
 
av_free(c);
}
 
struct SwsContext *sws_getCachedContext(struct SwsContext *context, int srcW,
int srcH, enum AVPixelFormat srcFormat,
int dstW, int dstH,
enum AVPixelFormat dstFormat, int flags,
SwsFilter *srcFilter,
SwsFilter *dstFilter,
const double *param)
{
static const double default_param[2] = { SWS_PARAM_DEFAULT,
SWS_PARAM_DEFAULT };
int64_t src_h_chr_pos = -513, dst_h_chr_pos = -513,
src_v_chr_pos = -513, dst_v_chr_pos = -513;
 
if (!param)
param = default_param;
 
if (context &&
(context->srcW != srcW ||
context->srcH != srcH ||
context->srcFormat != srcFormat ||
context->dstW != dstW ||
context->dstH != dstH ||
context->dstFormat != dstFormat ||
context->flags != flags ||
context->param[0] != param[0] ||
context->param[1] != param[1])) {
 
av_opt_get_int(context, "src_h_chr_pos", 0, &src_h_chr_pos);
av_opt_get_int(context, "src_v_chr_pos", 0, &src_v_chr_pos);
av_opt_get_int(context, "dst_h_chr_pos", 0, &dst_h_chr_pos);
av_opt_get_int(context, "dst_v_chr_pos", 0, &dst_v_chr_pos);
sws_freeContext(context);
context = NULL;
}
 
if (!context) {
if (!(context = sws_alloc_context()))
return NULL;
context->srcW = srcW;
context->srcH = srcH;
context->srcFormat = srcFormat;
context->dstW = dstW;
context->dstH = dstH;
context->dstFormat = dstFormat;
context->flags = flags;
context->param[0] = param[0];
context->param[1] = param[1];
 
av_opt_set_int(context, "src_h_chr_pos", src_h_chr_pos, 0);
av_opt_set_int(context, "src_v_chr_pos", src_v_chr_pos, 0);
av_opt_set_int(context, "dst_h_chr_pos", dst_h_chr_pos, 0);
av_opt_set_int(context, "dst_v_chr_pos", dst_v_chr_pos, 0);
 
if (sws_init_context(context, srcFilter, dstFilter) < 0) {
sws_freeContext(context);
return NULL;
}
}
return context;
}
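/* sws_getCachedContext() is intended for per-frame call sites: feed the
* previous return value back in, and the context is only torn down and
* rebuilt when a parameter actually changed (the chr_pos options are
* carried over). A sketch of the intended loop -- get_frame() and the
* f/out_* names are hypothetical, not part of this API:
*
* SwsContext *ctx = NULL;
* while (get_frame(&f)) {
* ctx = sws_getCachedContext(ctx, f.w, f.h, f.fmt,
* out_w, out_h, AV_PIX_FMT_YUV420P,
* SWS_BILINEAR, NULL, NULL, NULL);
* if (!ctx)
* break;
* // ... sws_scale(ctx, ...) on the frame ...
* }
* sws_freeContext(ctx);
*/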
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/version.h
0,0 → 1,56
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#ifndef SWSCALE_VERSION_H
#define SWSCALE_VERSION_H
 
/**
* @file
* swscale version macros
*/
 
#include "libavutil/version.h"
 
#define LIBSWSCALE_VERSION_MAJOR 3
#define LIBSWSCALE_VERSION_MINOR 1
#define LIBSWSCALE_VERSION_MICRO 101
 
#define LIBSWSCALE_VERSION_INT AV_VERSION_INT(LIBSWSCALE_VERSION_MAJOR, \
LIBSWSCALE_VERSION_MINOR, \
LIBSWSCALE_VERSION_MICRO)
#define LIBSWSCALE_VERSION AV_VERSION(LIBSWSCALE_VERSION_MAJOR, \
LIBSWSCALE_VERSION_MINOR, \
LIBSWSCALE_VERSION_MICRO)
#define LIBSWSCALE_BUILD LIBSWSCALE_VERSION_INT
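/* AV_VERSION_INT packs (major, minor, micro) as (a << 16) | (b << 8) | c,
* so for 3.1.101: (3 << 16) | (1 << 8) | 101 = 196608 + 256 + 101 = 196965. */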
 
#define LIBSWSCALE_IDENT "SwS" AV_STRINGIFY(LIBSWSCALE_VERSION)
 
/**
* FF_API_* defines may be placed below to indicate public API that will be
* dropped at a future version bump. The defines themselves are not part of
* the public API and may change, break or disappear at any time.
*/
 
#ifndef FF_API_SWS_CPU_CAPS
#define FF_API_SWS_CPU_CAPS (LIBSWSCALE_VERSION_MAJOR < 4)
#endif
#ifndef FF_API_ARCH_BFIN
#define FF_API_ARCH_BFIN (LIBSWSCALE_VERSION_MAJOR < 4)
#endif
 
#endif /* SWSCALE_VERSION_H */
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/vscale.c
0,0 → 1,315
/*
* Copyright (C) 2015 Pedro Arthur <bygrandao@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "swscale_internal.h"
 
typedef struct VScalerContext
{
uint16_t *filter[2];
int32_t *filter_pos;
int filter_size;
int isMMX;
void *pfn;
yuv2packedX_fn yuv2packedX;
} VScalerContext;
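/* filter[0]: luma or chroma coefficients; filter[1]: alpha coefficients
* (planar path only); filter_pos: first source line contributing to each
* output line; isMMX: use the MMX coefficient layout; pfn: the output
* writer picked in ff_init_vscale_pfn() (yuv2planar1, yuv2planarX,
* yuv2packed1, ...). */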
 
 
static int lum_planar_vscale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
VScalerContext *inst = desc->instance;
int dstW = desc->dst->width;
 
int first = FFMAX(1-inst->filter_size, inst->filter_pos[sliceY]);
int sp = first - desc->src->plane[0].sliceY;
int dp = sliceY - desc->dst->plane[0].sliceY;
uint8_t **src = desc->src->plane[0].line + sp;
uint8_t **dst = desc->dst->plane[0].line + dp;
uint16_t *filter = inst->filter[0] + (inst->isMMX ? 0 : sliceY * inst->filter_size);
 
if (inst->filter_size == 1)
((yuv2planar1_fn)inst->pfn)((const int16_t*)src[0], dst[0], dstW, c->lumDither8, 0);
else
((yuv2planarX_fn)inst->pfn)(filter, inst->filter_size, (const int16_t**)src, dst[0], dstW, c->lumDither8, 0);
 
if (desc->alpha) {
int sp = first - desc->src->plane[3].sliceY;
int dp = sliceY - desc->dst->plane[3].sliceY;
uint8_t **src = desc->src->plane[3].line + sp;
uint8_t **dst = desc->dst->plane[3].line + dp;
uint16_t *filter = inst->filter[1] + (inst->isMMX ? 0 : sliceY * inst->filter_size);
 
if (inst->filter_size == 1)
((yuv2planar1_fn)inst->pfn)((const int16_t*)src[0], dst[0], dstW, c->lumDither8, 0);
else
((yuv2planarX_fn)inst->pfn)(filter, inst->filter_size, (const int16_t**)src, dst[0], dstW, c->lumDither8, 0);
}
 
return 1;
}
 
static int chr_planar_vscale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
const int chrSkipMask = (1 << desc->dst->v_chr_sub_sample) - 1;
if (sliceY & chrSkipMask)
return 0;
else {
VScalerContext *inst = desc->instance;
int dstW = FF_CEIL_RSHIFT(desc->dst->width, desc->dst->h_chr_sub_sample);
int chrSliceY = sliceY >> desc->dst->v_chr_sub_sample;
 
int first = FFMAX(1-inst->filter_size, inst->filter_pos[chrSliceY]);
int sp1 = first - desc->src->plane[1].sliceY;
int sp2 = first - desc->src->plane[2].sliceY;
int dp1 = chrSliceY - desc->dst->plane[1].sliceY;
int dp2 = chrSliceY - desc->dst->plane[2].sliceY;
uint8_t **src1 = desc->src->plane[1].line + sp1;
uint8_t **src2 = desc->src->plane[2].line + sp2;
uint8_t **dst1 = desc->dst->plane[1].line + dp1;
uint8_t **dst2 = desc->dst->plane[2].line + dp2;
uint16_t *filter = inst->filter[0] + (inst->isMMX ? 0 : chrSliceY * inst->filter_size);
 
if (c->yuv2nv12cX) {
((yuv2interleavedX_fn)inst->pfn)(c, filter, inst->filter_size, (const int16_t**)src1, (const int16_t**)src2, dst1[0], dstW);
} else if (inst->filter_size == 1) {
((yuv2planar1_fn)inst->pfn)((const int16_t*)src1[0], dst1[0], dstW, c->chrDither8, 0);
((yuv2planar1_fn)inst->pfn)((const int16_t*)src2[0], dst2[0], dstW, c->chrDither8, 3);
} else {
((yuv2planarX_fn)inst->pfn)(filter, inst->filter_size, (const int16_t**)src1, dst1[0], dstW, c->chrDither8, 0);
((yuv2planarX_fn)inst->pfn)(filter, inst->filter_size, (const int16_t**)src2, dst2[0], dstW, c->chrDither8, inst->isMMX ? (c->uv_offx2 >> 1) : 3);
}
}
 
return 1;
}
 
static int packed_vscale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
VScalerContext *inst = desc->instance;
int dstW = desc->dst->width;
int chrSliceY = sliceY >> desc->dst->v_chr_sub_sample;
 
int lum_fsize = inst[0].filter_size;
int chr_fsize = inst[1].filter_size;
uint16_t *lum_filter = inst[0].filter[0];
uint16_t *chr_filter = inst[1].filter[0];
 
int firstLum = FFMAX(1-lum_fsize, inst[0].filter_pos[chrSliceY]);
int firstChr = FFMAX(1-chr_fsize, inst[1].filter_pos[chrSliceY]);
 
int sp0 = firstLum - desc->src->plane[0].sliceY;
int sp1 = firstChr - desc->src->plane[1].sliceY;
int sp2 = firstChr - desc->src->plane[2].sliceY;
int sp3 = firstLum - desc->src->plane[3].sliceY;
int dp = sliceY - desc->dst->plane[0].sliceY;
uint8_t **src0 = desc->src->plane[0].line + sp0;
uint8_t **src1 = desc->src->plane[1].line + sp1;
uint8_t **src2 = desc->src->plane[2].line + sp2;
uint8_t **src3 = desc->alpha ? desc->src->plane[3].line + sp3 : NULL;
uint8_t **dst = desc->dst->plane[0].line + dp;
 
 
if (c->yuv2packed1 && lum_fsize == 1 && chr_fsize == 1) { // unscaled RGB
((yuv2packed1_fn)inst->pfn)(c, (const int16_t*)*src0, (const int16_t**)src1, (const int16_t**)src2,
(const int16_t*)(desc->alpha ? *src3 : NULL), *dst, dstW, 0, sliceY);
} else if (c->yuv2packed1 && lum_fsize == 1 && chr_fsize == 2 &&
chr_filter[2 * chrSliceY + 1] + chr_filter[2 * chrSliceY] == 4096 &&
chr_filter[2 * chrSliceY + 1] <= 4096U) { // unscaled RGB
// packed output has no vertical chroma subsampling, so chrSliceY ==
// sliceY; index the chroma filter consistently with chrSliceY
int chrAlpha = chr_filter[2 * chrSliceY + 1];
((yuv2packed1_fn)inst->pfn)(c, (const int16_t*)*src0, (const int16_t**)src1, (const int16_t**)src2,
(const int16_t*)(desc->alpha ? *src3 : NULL), *dst, dstW, chrAlpha, sliceY);
} else if (c->yuv2packed2 && lum_fsize == 2 && chr_fsize == 2 &&
lum_filter[2 * sliceY + 1] + lum_filter[2 * sliceY] == 4096 &&
lum_filter[2 * sliceY + 1] <= 4096U &&
chr_filter[2 * chrSliceY + 1] + chr_filter[2 * chrSliceY] == 4096 &&
chr_filter[2 * chrSliceY + 1] <= 4096U
) { // bilinear upscale RGB
int lumAlpha = lum_filter[2 * sliceY + 1];
int chrAlpha = chr_filter[2 * chrSliceY + 1];
c->lumMmxFilter[2] =
c->lumMmxFilter[3] = lum_filter[2 * sliceY] * 0x10001;
c->chrMmxFilter[2] =
c->chrMmxFilter[3] = chr_filter[2 * chrSliceY] * 0x10001;
((yuv2packed2_fn)inst->pfn)(c, (const int16_t**)src0, (const int16_t**)src1, (const int16_t**)src2, (const int16_t**)src3,
*dst, dstW, lumAlpha, chrAlpha, sliceY);
} else { // general RGB
if ((c->yuv2packed1 && lum_fsize == 1 && chr_fsize == 2) ||
(c->yuv2packed2 && lum_fsize == 2 && chr_fsize == 2)) {
if (!c->warned_unuseable_bilinear)
av_log(c, AV_LOG_INFO, "Optimized 2 tap filter code cannot be used\n");
c->warned_unuseable_bilinear = 1;
}
 
inst->yuv2packedX(c, lum_filter + sliceY * lum_fsize,
(const int16_t**)src0, lum_fsize, chr_filter + sliceY * chr_fsize,
(const int16_t**)src1, (const int16_t**)src2, chr_fsize, (const int16_t**)src3, *dst, dstW, sliceY);
}
return 1;
}
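/* The "== 4096" comparisons in packed_vscale() rely on the vertical
* filters being normalized to a fixed-point sum of 1 << 12 by initFilter()
* (see the vertical-filter setup in sws_init_context): two taps summing to
* 4096 describe a plain bilinear blend that the 2-tap fast paths handle. */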
 
static int any_vscale(SwsContext *c, SwsFilterDescriptor *desc, int sliceY, int sliceH)
{
VScalerContext *inst = desc->instance;
int dstW = desc->dst->width;
int chrSliceY = sliceY >> desc->dst->v_chr_sub_sample;
 
int lum_fsize = inst[0].filter_size;
int chr_fsize = inst[1].filter_size;
uint16_t *lum_filter = inst[0].filter[0];
uint16_t *chr_filter = inst[1].filter[0];
 
int firstLum = FFMAX(1-lum_fsize, inst[0].filter_pos[chrSliceY]);
int firstChr = FFMAX(1-chr_fsize, inst[1].filter_pos[chrSliceY]);
 
int sp0 = firstLum - desc->src->plane[0].sliceY;
int sp1 = firstChr - desc->src->plane[1].sliceY;
int sp2 = firstChr - desc->src->plane[2].sliceY;
int sp3 = firstLum - desc->src->plane[3].sliceY;
int dp0 = sliceY - desc->dst->plane[0].sliceY;
int dp1 = chrSliceY - desc->dst->plane[1].sliceY;
int dp2 = chrSliceY - desc->dst->plane[2].sliceY;
int dp3 = sliceY - desc->dst->plane[3].sliceY;
 
uint8_t **src0 = desc->src->plane[0].line + sp0;
uint8_t **src1 = desc->src->plane[1].line + sp1;
uint8_t **src2 = desc->src->plane[2].line + sp2;
uint8_t **src3 = desc->alpha ? desc->src->plane[3].line + sp3 : NULL;
uint8_t *dst[4] = { desc->dst->plane[0].line[dp0],
desc->dst->plane[1].line[dp1],
desc->dst->plane[2].line[dp2],
desc->alpha ? desc->dst->plane[3].line[dp3] : NULL };
 
av_assert1(!c->yuv2packed1 && !c->yuv2packed2);
((yuv2anyX_fn)inst->pfn)(c, lum_filter + sliceY * lum_fsize,
(const int16_t**)src0, lum_fsize, chr_filter + sliceY * chr_fsize,
(const int16_t**)src1, (const int16_t**)src2, chr_fsize, (const int16_t**)src3, dst, dstW, sliceY);
 
return 1;
 
}
 
int ff_init_vscale(SwsContext *c, SwsFilterDescriptor *desc, SwsSlice *src, SwsSlice *dst)
{
VScalerContext *lumCtx = NULL;
VScalerContext *chrCtx = NULL;
 
if (isPlanarYUV(c->dstFormat) || (isGray(c->dstFormat) && !isALPHA(c->dstFormat))) {
lumCtx = av_mallocz(sizeof(VScalerContext));
if (!lumCtx)
return AVERROR(ENOMEM);
 
 
desc[0].process = lum_planar_vscale;
desc[0].instance = lumCtx;
desc[0].src = src;
desc[0].dst = dst;
desc[0].alpha = c->alpPixBuf != 0;
 
if (!isGray(c->dstFormat)) {
chrCtx = av_mallocz(sizeof(VScalerContext));
if (!chrCtx)
return AVERROR(ENOMEM);
desc[1].process = chr_planar_vscale;
desc[1].instance = chrCtx;
desc[1].src = src;
desc[1].dst = dst;
}
} else {
lumCtx = av_mallocz_array(sizeof(VScalerContext), 2);
if (!lumCtx)
return AVERROR(ENOMEM);
chrCtx = &lumCtx[1];
 
desc[0].process = c->yuv2packedX ? packed_vscale : any_vscale;
desc[0].instance = lumCtx;
desc[0].src = src;
desc[0].dst = dst;
desc[0].alpha = c->alpPixBuf != 0;
}
 
ff_init_vscale_pfn(c, c->yuv2plane1, c->yuv2planeX, c->yuv2nv12cX,
c->yuv2packed1, c->yuv2packed2, c->yuv2packedX, c->yuv2anyX, c->use_mmx_vfilter);
return 0;
}
 
void ff_init_vscale_pfn(SwsContext *c,
yuv2planar1_fn yuv2plane1,
yuv2planarX_fn yuv2planeX,
yuv2interleavedX_fn yuv2nv12cX,
yuv2packed1_fn yuv2packed1,
yuv2packed2_fn yuv2packed2,
yuv2packedX_fn yuv2packedX,
yuv2anyX_fn yuv2anyX, int use_mmx)
{
VScalerContext *lumCtx = NULL;
VScalerContext *chrCtx = NULL;
int idx = c->numDesc - (c->is_internal_gamma ? 2 : 1);
 
if (isPlanarYUV(c->dstFormat) || (isGray(c->dstFormat) && !isALPHA(c->dstFormat))) {
if (!isGray(c->dstFormat)) {
chrCtx = c->desc[idx].instance;
 
chrCtx->filter[0] = use_mmx ? (int16_t*)c->chrMmxFilter : c->vChrFilter;
chrCtx->filter_size = c->vChrFilterSize;
chrCtx->filter_pos = c->vChrFilterPos;
chrCtx->isMMX = use_mmx;
 
--idx;
if (yuv2nv12cX) chrCtx->pfn = yuv2nv12cX;
else if (c->vChrFilterSize == 1) chrCtx->pfn = yuv2plane1;
else chrCtx->pfn = yuv2planeX;
}
 
lumCtx = c->desc[idx].instance;
 
lumCtx->filter[0] = use_mmx ? (int16_t*)c->lumMmxFilter : c->vLumFilter;
lumCtx->filter[1] = use_mmx ? (int16_t*)c->alpMmxFilter : c->vLumFilter;
lumCtx->filter_size = c->vLumFilterSize;
lumCtx->filter_pos = c->vLumFilterPos;
lumCtx->isMMX = use_mmx;
 
if (c->vLumFilterSize == 1) lumCtx->pfn = yuv2plane1;
else lumCtx->pfn = yuv2planeX;
 
} else {
lumCtx = c->desc[idx].instance;
chrCtx = &lumCtx[1];
 
lumCtx->filter[0] = c->vLumFilter;
lumCtx->filter_size = c->vLumFilterSize;
lumCtx->filter_pos = c->vLumFilterPos;
 
chrCtx->filter[0] = c->vChrFilter;
chrCtx->filter_size = c->vChrFilterSize;
chrCtx->filter_pos = c->vChrFilterPos;
 
lumCtx->isMMX = use_mmx;
chrCtx->isMMX = use_mmx;
 
if (yuv2packedX) {
if (c->yuv2packed1 && c->vLumFilterSize == 1 && c->vChrFilterSize <= 2)
lumCtx->pfn = yuv2packed1;
else if (c->yuv2packed2 && c->vLumFilterSize == 2 && c->vChrFilterSize == 2)
lumCtx->pfn = yuv2packed2;
lumCtx->yuv2packedX = yuv2packedX;
} else
lumCtx->pfn = yuv2anyX;
}
}
 
 
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/x86/Makefile
0,0 → 1,13
$(SUBDIR)x86/swscale_mmx.o: CFLAGS += $(NOREDZONE_FLAGS)
 
OBJS += x86/rgb2rgb.o \
x86/swscale.o \
x86/yuv2rgb.o \
 
MMX-OBJS += x86/hscale_fast_bilinear_simd.o \
 
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
 
YASM-OBJS += x86/input.o \
x86/output.o \
x86/scale.o \
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/x86/hscale_fast_bilinear_simd.c
0,0 → 1,359
/*
* Copyright (C) 2001-2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "../swscale_internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
 
#define RET 0xC3 // near return opcode for x86
#define PREFETCH "prefetchnta"
 
#if HAVE_INLINE_ASM
av_cold int ff_init_hscaler_mmxext(int dstW, int xInc, uint8_t *filterCode,
int16_t *filter, int32_t *filterPos,
int numSplits)
{
uint8_t *fragmentA;
x86_reg imm8OfPShufW1A;
x86_reg imm8OfPShufW2A;
x86_reg fragmentLengthA;
uint8_t *fragmentB;
x86_reg imm8OfPShufW1B;
x86_reg imm8OfPShufW2B;
x86_reg fragmentLengthB;
int fragmentPos;
 
int xpos, i;
 
// create an optimized horizontal scaling routine
/* This scaler is made of runtime-generated MMXEXT code using specially tuned
* pshufw instructions. For every four output pixels, if four input pixels
* are enough for the fast bilinear scaling, then a chunk of fragmentB is
* used. If five input pixels are needed, then a chunk of fragmentA is used.
*/
 
// code fragment
 
__asm__ volatile (
"jmp 9f \n\t"
// Begin
"0: \n\t"
"movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
"movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
"movd 1(%%"REG_c", %%"REG_S"), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"pshufw $0xFF, %%mm1, %%mm1 \n\t"
"1: \n\t"
"pshufw $0xFF, %%mm0, %%mm0 \n\t"
"2: \n\t"
"psubw %%mm1, %%mm0 \n\t"
"movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
"pmullw %%mm3, %%mm0 \n\t"
"psllw $7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
 
"movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
 
"add $8, %%"REG_a" \n\t"
// End
"9: \n\t"
"lea " LOCAL_MANGLE(0b) ", %0 \n\t"
"lea " LOCAL_MANGLE(1b) ", %1 \n\t"
"lea " LOCAL_MANGLE(2b) ", %2 \n\t"
"dec %1 \n\t"
"dec %2 \n\t"
"sub %0, %1 \n\t"
"sub %0, %2 \n\t"
"lea " LOCAL_MANGLE(9b) ", %3 \n\t"
"sub %0, %3 \n\t"
 
 
: "=r" (fragmentA), "=r" (imm8OfPShufW1A), "=r" (imm8OfPShufW2A),
"=r" (fragmentLengthA)
);
 
__asm__ volatile (
"jmp 9f \n\t"
// Begin
"0: \n\t"
"movq (%%"REG_d", %%"REG_a"), %%mm3 \n\t"
"movd (%%"REG_c", %%"REG_S"), %%mm0 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"pshufw $0xFF, %%mm0, %%mm1 \n\t"
"1: \n\t"
"pshufw $0xFF, %%mm0, %%mm0 \n\t"
"2: \n\t"
"psubw %%mm1, %%mm0 \n\t"
"movl 8(%%"REG_b", %%"REG_a"), %%esi \n\t"
"pmullw %%mm3, %%mm0 \n\t"
"psllw $7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
 
"movq %%mm0, (%%"REG_D", %%"REG_a") \n\t"
 
"add $8, %%"REG_a" \n\t"
// End
"9: \n\t"
"lea " LOCAL_MANGLE(0b) ", %0 \n\t"
"lea " LOCAL_MANGLE(1b) ", %1 \n\t"
"lea " LOCAL_MANGLE(2b) ", %2 \n\t"
"dec %1 \n\t"
"dec %2 \n\t"
"sub %0, %1 \n\t"
"sub %0, %2 \n\t"
"lea " LOCAL_MANGLE(9b) ", %3 \n\t"
"sub %0, %3 \n\t"
 
 
: "=r" (fragmentB), "=r" (imm8OfPShufW1B), "=r" (imm8OfPShufW2B),
"=r" (fragmentLengthB)
);
 
xpos = 0; // lumXInc/2 - 0x8000; // difference between pixel centers
fragmentPos = 0;
 
for (i = 0; i < dstW / numSplits; i++) {
int xx = xpos >> 16;
 
if ((i & 3) == 0) {
int a = 0;
int b = ((xpos + xInc) >> 16) - xx;
int c = ((xpos + xInc * 2) >> 16) - xx;
int d = ((xpos + xInc * 3) >> 16) - xx;
int inc = (d + 1 < 4);
uint8_t *fragment = inc ? fragmentB : fragmentA;
x86_reg imm8OfPShufW1 = inc ? imm8OfPShufW1B : imm8OfPShufW1A;
x86_reg imm8OfPShufW2 = inc ? imm8OfPShufW2B : imm8OfPShufW2A;
x86_reg fragmentLength = inc ? fragmentLengthB : fragmentLengthA;
int maxShift = 3 - (d + inc);
int shift = 0;
 
if (filterCode) {
filter[i] = ((xpos & 0xFFFF) ^ 0xFFFF) >> 9;
filter[i + 1] = (((xpos + xInc) & 0xFFFF) ^ 0xFFFF) >> 9;
filter[i + 2] = (((xpos + xInc * 2) & 0xFFFF) ^ 0xFFFF) >> 9;
filter[i + 3] = (((xpos + xInc * 3) & 0xFFFF) ^ 0xFFFF) >> 9;
filterPos[i / 2] = xx;
 
memcpy(filterCode + fragmentPos, fragment, fragmentLength);
 
filterCode[fragmentPos + imm8OfPShufW1] = (a + inc) |
((b + inc) << 2) |
((c + inc) << 4) |
((d + inc) << 6);
filterCode[fragmentPos + imm8OfPShufW2] = a | (b << 2) |
(c << 4) |
(d << 6);
 
if (i + 4 - inc >= dstW)
shift = maxShift; // avoid overread
else if ((filterPos[i / 2] & 3) <= maxShift)
shift = filterPos[i / 2] & 3; // align
 
if (shift && i >= shift) {
filterCode[fragmentPos + imm8OfPShufW1] += 0x55 * shift;
filterCode[fragmentPos + imm8OfPShufW2] += 0x55 * shift;
filterPos[i / 2] -= shift;
}
}
 
fragmentPos += fragmentLength;
 
if (filterCode)
filterCode[fragmentPos] = RET;
}
xpos += xInc;
}
if (filterCode)
filterPos[((i / 2) + 1) & (~1)] = xpos >> 16; // needed to jump to the next part
 
return fragmentPos + 1;
}
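/* Calling convention: utils.c first calls this with filterCode == NULL to
* learn the generated code size (the return value), allocates an
* executable buffer, then calls again with real pointers to emit the
* fragments (see the MMXEXT branch of sws_init_context()). */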
 
void ff_hyscale_fast_mmxext(SwsContext *c, int16_t *dst,
int dstWidth, const uint8_t *src,
int srcW, int xInc)
{
int32_t *filterPos = c->hLumFilterPos;
int16_t *filter = c->hLumFilter;
void *mmxextFilterCode = c->lumMmxextFilterCode;
int i;
#if ARCH_X86_64
uint64_t retsave;
#else
#if defined(PIC)
uint64_t ebxsave;
#endif
#endif
 
__asm__ volatile(
#if ARCH_X86_64
"mov -8(%%rsp), %%"REG_a" \n\t"
"mov %%"REG_a", %5 \n\t" // retsave
#else
#if defined(PIC)
"mov %%"REG_b", %5 \n\t" // ebxsave
#endif
#endif
"pxor %%mm7, %%mm7 \n\t"
"mov %0, %%"REG_c" \n\t"
"mov %1, %%"REG_D" \n\t"
"mov %2, %%"REG_d" \n\t"
"mov %3, %%"REG_b" \n\t"
"xor %%"REG_a", %%"REG_a" \n\t" // i
PREFETCH" (%%"REG_c") \n\t"
PREFETCH" 32(%%"REG_c") \n\t"
PREFETCH" 64(%%"REG_c") \n\t"
 
#if ARCH_X86_64
#define CALL_MMXEXT_FILTER_CODE \
"movl (%%"REG_b"), %%esi \n\t"\
"call *%4 \n\t"\
"movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\
"add %%"REG_S", %%"REG_c" \n\t"\
"add %%"REG_a", %%"REG_D" \n\t"\
"xor %%"REG_a", %%"REG_a" \n\t"\
 
#else
#define CALL_MMXEXT_FILTER_CODE \
"movl (%%"REG_b"), %%esi \n\t"\
"call *%4 \n\t"\
"addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\
"add %%"REG_a", %%"REG_D" \n\t"\
"xor %%"REG_a", %%"REG_a" \n\t"\
 
#endif /* ARCH_X86_64 */
 
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
 
#if ARCH_X86_64
"mov %5, %%"REG_a" \n\t"
"mov %%"REG_a", -8(%%rsp) \n\t"
#else
#if defined(PIC)
"mov %5, %%"REG_b" \n\t"
#endif
#endif
:: "m" (src), "m" (dst), "m" (filter), "m" (filterPos),
"m" (mmxextFilterCode)
#if ARCH_X86_64
,"m"(retsave)
#else
#if defined(PIC)
,"m" (ebxsave)
#endif
#endif
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if ARCH_X86_64 || !defined(PIC)
,"%"REG_b
#endif
);
 
for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--)
dst[i] = src[srcW-1]*128;
}
 
void ff_hcscale_fast_mmxext(SwsContext *c, int16_t *dst1, int16_t *dst2,
int dstWidth, const uint8_t *src1,
const uint8_t *src2, int srcW, int xInc)
{
int32_t *filterPos = c->hChrFilterPos;
int16_t *filter = c->hChrFilter;
void *mmxextFilterCode = c->chrMmxextFilterCode;
int i;
#if ARCH_X86_64
DECLARE_ALIGNED(8, uint64_t, retsave);
#else
#if defined(PIC)
DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
#endif
__asm__ volatile(
#if ARCH_X86_64
"mov -8(%%rsp), %%"REG_a" \n\t"
"mov %%"REG_a", %7 \n\t" // retsave
#else
#if defined(PIC)
"mov %%"REG_b", %7 \n\t" // ebxsave
#endif
#endif
"pxor %%mm7, %%mm7 \n\t"
"mov %0, %%"REG_c" \n\t"
"mov %1, %%"REG_D" \n\t"
"mov %2, %%"REG_d" \n\t"
"mov %3, %%"REG_b" \n\t"
"xor %%"REG_a", %%"REG_a" \n\t" // i
PREFETCH" (%%"REG_c") \n\t"
PREFETCH" 32(%%"REG_c") \n\t"
PREFETCH" 64(%%"REG_c") \n\t"
 
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
"xor %%"REG_a", %%"REG_a" \n\t" // i
"mov %5, %%"REG_c" \n\t" // src2
"mov %6, %%"REG_D" \n\t" // dst2
PREFETCH" (%%"REG_c") \n\t"
PREFETCH" 32(%%"REG_c") \n\t"
PREFETCH" 64(%%"REG_c") \n\t"
 
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
CALL_MMXEXT_FILTER_CODE
 
#if ARCH_X86_64
"mov %7, %%"REG_a" \n\t"
"mov %%"REG_a", -8(%%rsp) \n\t"
#else
#if defined(PIC)
"mov %7, %%"REG_b" \n\t"
#endif
#endif
:: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
"m" (mmxextFilterCode), "m" (src2), "m"(dst2)
#if ARCH_X86_64
,"m"(retsave)
#else
#if defined(PIC)
,"m" (ebxsave)
#endif
#endif
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if ARCH_X86_64 || !defined(PIC)
,"%"REG_b
#endif
);
 
for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
dst1[i] = src1[srcW-1]*128;
dst2[i] = src2[srcW-1]*128;
}
}
#endif //HAVE_INLINE_ASM
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/x86/input.asm
0,0 → 1,740
;******************************************************************************
;* x86-optimized input routines; these shuffle packed
;* YUV formats into individual planes and also convert
;* RGB into YUV planes.
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
%define RY 0x20DE
%define GY 0x4087
%define BY 0x0C88
%define RU 0xECFF
%define GU 0xDAC8
%define BU 0x3838
%define RV 0x3838
%define GV 0xD0E3
%define BV 0xF6E4
 
rgb_Yrnd: times 4 dd 0x80100 ; 16.5 << 15
rgb_UVrnd: times 4 dd 0x400100 ; 128.5 << 15
%define bgr_Ycoeff_12x4 16*4 + 16* 0 + tableq
%define bgr_Ycoeff_3x56 16*4 + 16* 1 + tableq
%define rgb_Ycoeff_12x4 16*4 + 16* 2 + tableq
%define rgb_Ycoeff_3x56 16*4 + 16* 3 + tableq
%define bgr_Ucoeff_12x4 16*4 + 16* 4 + tableq
%define bgr_Ucoeff_3x56 16*4 + 16* 5 + tableq
%define rgb_Ucoeff_12x4 16*4 + 16* 6 + tableq
%define rgb_Ucoeff_3x56 16*4 + 16* 7 + tableq
%define bgr_Vcoeff_12x4 16*4 + 16* 8 + tableq
%define bgr_Vcoeff_3x56 16*4 + 16* 9 + tableq
%define rgb_Vcoeff_12x4 16*4 + 16*10 + tableq
%define rgb_Vcoeff_3x56 16*4 + 16*11 + tableq
 
%define rgba_Ycoeff_rb 16*4 + 16*12 + tableq
%define rgba_Ycoeff_br 16*4 + 16*13 + tableq
%define rgba_Ycoeff_ga 16*4 + 16*14 + tableq
%define rgba_Ycoeff_ag 16*4 + 16*15 + tableq
%define rgba_Ucoeff_rb 16*4 + 16*16 + tableq
%define rgba_Ucoeff_br 16*4 + 16*17 + tableq
%define rgba_Ucoeff_ga 16*4 + 16*18 + tableq
%define rgba_Ucoeff_ag 16*4 + 16*19 + tableq
%define rgba_Vcoeff_rb 16*4 + 16*20 + tableq
%define rgba_Vcoeff_br 16*4 + 16*21 + tableq
%define rgba_Vcoeff_ga 16*4 + 16*22 + tableq
%define rgba_Vcoeff_ag 16*4 + 16*23 + tableq
 
; bgr_Ycoeff_12x4: times 2 dw BY, GY, 0, BY
; bgr_Ycoeff_3x56: times 2 dw RY, 0, GY, RY
; rgb_Ycoeff_12x4: times 2 dw RY, GY, 0, RY
; rgb_Ycoeff_3x56: times 2 dw BY, 0, GY, BY
; bgr_Ucoeff_12x4: times 2 dw BU, GU, 0, BU
; bgr_Ucoeff_3x56: times 2 dw RU, 0, GU, RU
; rgb_Ucoeff_12x4: times 2 dw RU, GU, 0, RU
; rgb_Ucoeff_3x56: times 2 dw BU, 0, GU, BU
; bgr_Vcoeff_12x4: times 2 dw BV, GV, 0, BV
; bgr_Vcoeff_3x56: times 2 dw RV, 0, GV, RV
; rgb_Vcoeff_12x4: times 2 dw RV, GV, 0, RV
; rgb_Vcoeff_3x56: times 2 dw BV, 0, GV, BV
 
; rgba_Ycoeff_rb: times 4 dw RY, BY
; rgba_Ycoeff_br: times 4 dw BY, RY
; rgba_Ycoeff_ga: times 4 dw GY, 0
; rgba_Ycoeff_ag: times 4 dw 0, GY
; rgba_Ucoeff_rb: times 4 dw RU, BU
; rgba_Ucoeff_br: times 4 dw BU, RU
; rgba_Ucoeff_ga: times 4 dw GU, 0
; rgba_Ucoeff_ag: times 4 dw 0, GU
; rgba_Vcoeff_rb: times 4 dw RV, BV
; rgba_Vcoeff_br: times 4 dw BV, RV
; rgba_Vcoeff_ga: times 4 dw GV, 0
; rgba_Vcoeff_ag: times 4 dw 0, GV
 
shuf_rgb_12x4: db 0, 0x80, 1, 0x80, 2, 0x80, 3, 0x80, \
6, 0x80, 7, 0x80, 8, 0x80, 9, 0x80
shuf_rgb_3x56: db 2, 0x80, 3, 0x80, 4, 0x80, 5, 0x80, \
8, 0x80, 9, 0x80, 10, 0x80, 11, 0x80
 
SECTION .text
 
;-----------------------------------------------------------------------------
; RGB to Y/UV.
;
; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
; and
; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
; const uint8_t *unused, int w);
;-----------------------------------------------------------------------------
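; As a rough scalar sketch (for reference only, not part of the build),
; the bgr24-to-Y loop below effectively computes, per pixel, using the
; BY/GY/RY constants and the rgb_Yrnd rounding value defined above
; (the rgb24 variant swaps the R and B coefficients):
;
;   for (i = 0; i < w; i++) {
;       int b = src[3 * i + 0], g = src[3 * i + 1], r = src[3 * i + 2];
;       ((int16_t *) dst)[i] = (b * BY + g * GY + r * RY + 0x80100) >> 9;
;   }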
 
; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_Y_FN 2-3
cglobal %2 %+ 24ToY, 6, 6, %1, dst, src, u1, u2, w, table
%if mmsize == 8
mova m5, [%2_Ycoeff_12x4]
mova m6, [%2_Ycoeff_3x56]
%define coeff1 m5
%define coeff2 m6
%elif ARCH_X86_64
mova m8, [%2_Ycoeff_12x4]
mova m9, [%2_Ycoeff_3x56]
%define coeff1 m8
%define coeff2 m9
%else ; x86-32 && mmsize == 16
%define coeff1 [%2_Ycoeff_12x4]
%define coeff2 [%2_Ycoeff_3x56]
%endif ; x86-32/64 && mmsize == 8/16
%if (ARCH_X86_64 || mmsize == 8) && %0 == 3
jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToY %+ SUFFIX).body
%else ; (ARCH_X86_64 || mmsize == 8) && %0 == 3
.body:
%if cpuflag(ssse3)
mova m7, [shuf_rgb_12x4]
%define shuf_rgb1 m7
%if ARCH_X86_64
mova m10, [shuf_rgb_3x56]
%define shuf_rgb2 m10
%else ; x86-32
%define shuf_rgb2 [shuf_rgb_3x56]
%endif ; x86-32/64
%endif ; cpuflag(ssse3)
%if ARCH_X86_64
movsxd wq, wd
%endif
add wq, wq
add dstq, wq
neg wq
%if notcpuflag(ssse3)
pxor m7, m7
%endif ; !cpuflag(ssse3)
mova m4, [rgb_Yrnd]
.loop:
%if cpuflag(ssse3)
movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3]
movu m2, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7]
pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
pshufb m3, m2, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
pshufb m2, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
%else ; !cpuflag(ssse3)
movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 }
movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
movd m2, [srcq+6] ; (byte) { B2, G2, R2, B3 }
movd m3, [srcq+8] ; (byte) { R2, B3, G3, R3 }
%if mmsize == 16 ; i.e. sse2
punpckldq m0, m2 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpckldq m1, m3 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
movd m2, [srcq+12] ; (byte) { B4, G4, R4, B5 }
movd m3, [srcq+14] ; (byte) { R4, B5, G5, R5 }
movd m5, [srcq+18] ; (byte) { B6, G6, R6, B7 }
movd m6, [srcq+20] ; (byte) { R6, B7, G7, R7 }
punpckldq m2, m5 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpckldq m3, m6 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; mmsize == 16
punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
punpcklbw m2, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpcklbw m3, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; cpuflag(ssse3)
add srcq, 3 * mmsize / 2
pmaddwd m0, coeff1 ; (dword) { B0*BY + G0*GY, B1*BY, B2*BY + G2*GY, B3*BY }
pmaddwd m1, coeff2 ; (dword) { R0*RY, G1*GY + R1*RY, R2*RY, G3*GY + R3*RY }
pmaddwd m2, coeff1 ; (dword) { B4*BY + G4*GY, B5*BY, B6*BY + G6*GY, B7*BY }
pmaddwd m3, coeff2 ; (dword) { R4*RY, G5*GY + R5*RY, R6*RY, G7*GY + R7*RY }
paddd m0, m1 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[0-3]
paddd m2, m3 ; (dword) { Bx*BY + Gx*GY + Rx*RY }[4-7]
paddd m0, m4 ; += rgb_Yrnd, i.e. (dword) { Y[0-3] }
paddd m2, m4 ; += rgb_Yrnd, i.e. (dword) { Y[4-7] }
psrad m0, 9
psrad m2, 9
packssdw m0, m2 ; (word) { Y[0-7] }
mova [dstq+wq], m0
add wq, mmsize
jl .loop
REP_RET
%endif ; (ARCH_X86_64 || mmsize == 8) && %0 == 3
%endmacro
 
; %1 = nr. of XMM registers
; %2 = rgb or bgr
%macro RGB24_TO_UV_FN 2-3
cglobal %2 %+ 24ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
%if ARCH_X86_64
mova m8, [%2_Ucoeff_12x4]
mova m9, [%2_Ucoeff_3x56]
mova m10, [%2_Vcoeff_12x4]
mova m11, [%2_Vcoeff_3x56]
%define coeffU1 m8
%define coeffU2 m9
%define coeffV1 m10
%define coeffV2 m11
%else ; x86-32
%define coeffU1 [%2_Ucoeff_12x4]
%define coeffU2 [%2_Ucoeff_3x56]
%define coeffV1 [%2_Vcoeff_12x4]
%define coeffV2 [%2_Vcoeff_3x56]
%endif ; x86-32/64
%if ARCH_X86_64 && %0 == 3
jmp mangle(private_prefix %+ _ %+ %3 %+ 24ToUV %+ SUFFIX).body
%else ; ARCH_X86_64 && %0 == 3
.body:
%if cpuflag(ssse3)
mova m7, [shuf_rgb_12x4]
%define shuf_rgb1 m7
%if ARCH_X86_64
mova m12, [shuf_rgb_3x56]
%define shuf_rgb2 m12
%else ; x86-32
%define shuf_rgb2 [shuf_rgb_3x56]
%endif ; x86-32/64
%endif ; cpuflag(ssse3)
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add wq, wq
add dstUq, wq
add dstVq, wq
neg wq
mova m6, [rgb_UVrnd]
%if notcpuflag(ssse3)
pxor m7, m7
%endif
.loop:
%if cpuflag(ssse3)
movu m0, [srcq+0] ; (byte) { Bx, Gx, Rx }[0-3]
movu m4, [srcq+12] ; (byte) { Bx, Gx, Rx }[4-7]
pshufb m1, m0, shuf_rgb2 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
pshufb m0, shuf_rgb1 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
%else ; !cpuflag(ssse3)
movd m0, [srcq+0] ; (byte) { B0, G0, R0, B1 }
movd m1, [srcq+2] ; (byte) { R0, B1, G1, R1 }
movd m4, [srcq+6] ; (byte) { B2, G2, R2, B3 }
movd m5, [srcq+8] ; (byte) { R2, B3, G3, R3 }
%if mmsize == 16
punpckldq m0, m4 ; (byte) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpckldq m1, m5 ; (byte) { R0, B1, G1, R1, R2, B3, G3, R3 }
movd m4, [srcq+12] ; (byte) { B4, G4, R4, B5 }
movd m5, [srcq+14] ; (byte) { R4, B5, G5, R5 }
%endif ; mmsize == 16
punpcklbw m0, m7 ; (word) { B0, G0, R0, B1, B2, G2, R2, B3 }
punpcklbw m1, m7 ; (word) { R0, B1, G1, R1, R2, B3, G3, R3 }
%endif ; cpuflag(ssse3)
pmaddwd m2, m0, coeffV1 ; (dword) { B0*BV + G0*GV, B1*BV, B2*BV + G2*GV, B3*BV }
pmaddwd m3, m1, coeffV2 ; (dword) { R0*RV, G1*GV + R1*RV, R2*RV, G3*GV + R3*RV }
pmaddwd m0, coeffU1 ; (dword) { B0*BU + G0*GU, B1*BU, B2*BU + G2*GU, B3*BU }
pmaddwd m1, coeffU2 ; (dword) { R0*RU, G1*GU + R1*RU, R2*RU, G3*GU + R3*RU }
paddd m0, m1 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[0-3]
paddd m2, m3 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[0-3]
%if cpuflag(ssse3)
pshufb m5, m4, shuf_rgb2 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
pshufb m4, shuf_rgb1 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
%else ; !cpuflag(ssse3)
%if mmsize == 16
movd m1, [srcq+18] ; (byte) { B6, G6, R6, B7 }
movd m3, [srcq+20] ; (byte) { R6, B7, G7, R7 }
punpckldq m4, m1 ; (byte) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpckldq m5, m3 ; (byte) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; mmsize == 16 && !cpuflag(ssse3)
punpcklbw m4, m7 ; (word) { B4, G4, R4, B5, B6, G6, R6, B7 }
punpcklbw m5, m7 ; (word) { R4, B5, G5, R5, R6, B7, G7, R7 }
%endif ; cpuflag(ssse3)
add srcq, 3 * mmsize / 2
pmaddwd m1, m4, coeffU1 ; (dword) { B4*BU + G4*GU, B5*BU, B6*BU + G6*GU, B7*BU }
pmaddwd m3, m5, coeffU2 ; (dword) { R4*RU, G5*GU + R5*RU, R6*RU, G7*GU + R7*RU }
pmaddwd m4, coeffV1 ; (dword) { B4*BV + G4*GV, B5*BV, B6*BV + G6*GV, B7*BV }
pmaddwd m5, coeffV2 ; (dword) { R4*RV, G5*GV + R5*RV, R6*RV, G7*GV + R7*RV }
paddd m1, m3 ; (dword) { Bx*BU + Gx*GU + Rx*RU }[4-7]
paddd m4, m5 ; (dword) { Bx*BV + Gx*GV + Rx*RV }[4-7]
paddd m0, m6 ; += rgb_UVrnd, i.e. (dword) { U[0-3] }
paddd m2, m6 ; += rgb_UVrnd, i.e. (dword) { V[0-3] }
paddd m1, m6 ; += rgb_UVrnd, i.e. (dword) { U[4-7] }
paddd m4, m6 ; += rgb_UVrnd, i.e. (dword) { V[4-7] }
psrad m0, 9
psrad m2, 9
psrad m1, 9
psrad m4, 9
packssdw m0, m1 ; (word) { U[0-7] }
packssdw m2, m4 ; (word) { V[0-7] }
%if mmsize == 8
mova [dstUq+wq], m0
mova [dstVq+wq], m2
%else ; mmsize == 16
mova [dstUq+wq], m0
mova [dstVq+wq], m2
%endif ; mmsize == 8/16
add wq, mmsize
jl .loop
REP_RET
%endif ; ARCH_X86_64 && %0 == 3
%endmacro
 
; %1 = nr. of XMM registers for rgb-to-Y func
; %2 = nr. of XMM registers for rgb-to-UV func
%macro RGB24_FUNCS 2
RGB24_TO_Y_FN %1, rgb
RGB24_TO_Y_FN %1, bgr, rgb
RGB24_TO_UV_FN %2, rgb
RGB24_TO_UV_FN %2, bgr, rgb
%endmacro
 
%if ARCH_X86_32
INIT_MMX mmx
RGB24_FUNCS 0, 0
%endif
 
INIT_XMM sse2
RGB24_FUNCS 10, 12
 
INIT_XMM ssse3
RGB24_FUNCS 11, 13
 
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
RGB24_FUNCS 11, 13
%endif
 
; %1 = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters)
%macro RGB32_TO_Y_FN 5-6
cglobal %2%3%4%5 %+ ToY, 6, 6, %1, dst, src, u1, u2, w, table
mova m5, [rgba_Ycoeff_%2%4]
mova m6, [rgba_Ycoeff_%3%5]
%if %0 == 6
jmp mangle(private_prefix %+ _ %+ %6 %+ ToY %+ SUFFIX).body
%else ; %0 == 6
.body:
%if ARCH_X86_64
movsxd wq, wd
%endif
add wq, wq
sub wq, mmsize - 1
lea srcq, [srcq+wq*2]
add dstq, wq
neg wq
mova m4, [rgb_Yrnd]
pcmpeqb m7, m7
psrlw m7, 8 ; (word) { 0x00ff } x4
.loop:
; FIXME check alignment and use mova
movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
movu m2, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3]
pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3]
pmaddwd m3, m5 ; (dword) { Bx*BY + Rx*RY }[4-7]
pmaddwd m2, m6 ; (dword) { Gx*GY }[4-7]
paddd m0, m4 ; += rgb_Yrnd
paddd m2, m4 ; += rgb_Yrnd
paddd m0, m1 ; (dword) { Y[0-3] }
paddd m2, m3 ; (dword) { Y[4-7] }
psrad m0, 9
psrad m2, 9
packssdw m0, m2 ; (word) { Y[0-7] }
mova [dstq+wq], m0
add wq, mmsize
jl .loop
sub wq, mmsize - 1
jz .end
add srcq, 2*mmsize - 2
add dstq, mmsize - 1
.loop2:
movd m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
DEINTB 1, 0, 3, 2, 7 ; (word) { Gx, xx (m0/m2) or Bx, Rx (m1/m3) }[0-3]/[4-7]
pmaddwd m1, m5 ; (dword) { Bx*BY + Rx*RY }[0-3]
pmaddwd m0, m6 ; (dword) { Gx*GY }[0-3]
paddd m0, m4 ; += rgb_Yrnd
paddd m0, m1 ; (dword) { Y[0-3] }
psrad m0, 9
packssdw m0, m0 ; (word) { Y[0-7] }
movd [dstq+wq], m0
add wq, 2
jl .loop2
.end:
REP_RET
%endif ; %0 == 6
%endmacro
 
; %1 = nr. of XMM registers
; %2-5 = rgba, bgra, argb or abgr (in individual characters)
%macro RGB32_TO_UV_FN 5-6
cglobal %2%3%4%5 %+ ToUV, 7, 7, %1, dstU, dstV, u1, src, u2, w, table
%if ARCH_X86_64
mova m8, [rgba_Ucoeff_%2%4]
mova m9, [rgba_Ucoeff_%3%5]
mova m10, [rgba_Vcoeff_%2%4]
mova m11, [rgba_Vcoeff_%3%5]
%define coeffU1 m8
%define coeffU2 m9
%define coeffV1 m10
%define coeffV2 m11
%else ; x86-32
%define coeffU1 [rgba_Ucoeff_%2%4]
%define coeffU2 [rgba_Ucoeff_%3%5]
%define coeffV1 [rgba_Vcoeff_%2%4]
%define coeffV2 [rgba_Vcoeff_%3%5]
%endif ; x86-64/32
%if ARCH_X86_64 && %0 == 6
jmp mangle(private_prefix %+ _ %+ %6 %+ ToUV %+ SUFFIX).body
%else ; ARCH_X86_64 && %0 == 6
.body:
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add wq, wq
sub wq, mmsize - 1
add dstUq, wq
add dstVq, wq
lea srcq, [srcq+wq*2]
neg wq
pcmpeqb m7, m7
psrlw m7, 8 ; (word) { 0x00ff } x4
mova m6, [rgb_UVrnd]
.loop:
; FIXME check alignment and use mova
movu m0, [srcq+wq*2+0] ; (byte) { Bx, Gx, Rx, xx }[0-3]
movu m4, [srcq+wq*2+mmsize] ; (byte) { Bx, Gx, Rx, xx }[4-7]
DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3]
pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3]
pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3]
pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3]
paddd m3, m6 ; += rgb_UVrnd
paddd m1, m6 ; += rgb_UVrnd
paddd m2, m3 ; (dword) { V[0-3] }
paddd m0, m1 ; (dword) { U[0-3] }
pmaddwd m3, m5, coeffV1 ; (dword) { Bx*BV + Rx*RV }[4-7]
pmaddwd m1, m4, coeffV2 ; (dword) { Gx*GV }[4-7]
pmaddwd m5, coeffU1 ; (dword) { Bx*BU + Rx*RU }[4-7]
pmaddwd m4, coeffU2 ; (dword) { Gx*GU }[4-7]
paddd m3, m6 ; += rgb_UVrnd
paddd m5, m6 ; += rgb_UVrnd
psrad m0, 9
paddd m1, m3 ; (dword) { V[4-7] }
paddd m4, m5 ; (dword) { U[4-7] }
psrad m2, 9
psrad m4, 9
psrad m1, 9
packssdw m0, m4 ; (word) { U[0-7] }
packssdw m2, m1 ; (word) { V[0-7] }
%if mmsize == 8
mova [dstUq+wq], m0
mova [dstVq+wq], m2
%else ; mmsize == 16
mova [dstUq+wq], m0
mova [dstVq+wq], m2
%endif ; mmsize == 8/16
add wq, mmsize
jl .loop
sub wq, mmsize - 1
jz .end
add srcq , 2*mmsize - 2
add dstUq, mmsize - 1
add dstVq, mmsize - 1
.loop2:
movd m0, [srcq+wq*2] ; (byte) { Bx, Gx, Rx, xx }[0-3]
DEINTB 1, 0, 5, 4, 7 ; (word) { Gx, xx (m0/m4) or Bx, Rx (m1/m5) }[0-3]/[4-7]
pmaddwd m3, m1, coeffV1 ; (dword) { Bx*BV + Rx*RV }[0-3]
pmaddwd m2, m0, coeffV2 ; (dword) { Gx*GV }[0-3]
pmaddwd m1, coeffU1 ; (dword) { Bx*BU + Rx*RU }[0-3]
pmaddwd m0, coeffU2 ; (dword) { Gx*GU }[0-3]
paddd m3, m6 ; += rgb_UVrnd
paddd m1, m6 ; += rgb_UVrnd
paddd m2, m3 ; (dword) { V[0-3] }
paddd m0, m1 ; (dword) { U[0-3] }
psrad m0, 9
psrad m2, 9
packssdw m0, m0 ; (word) { U[0-7] }
packssdw m2, m2 ; (word) { V[0-7] }
movd [dstUq+wq], m0
movd [dstVq+wq], m2
add wq, 2
jl .loop2
.end:
REP_RET
%endif ; ARCH_X86_64 && %0 == 6
%endmacro
 
; %1 = nr. of XMM registers for rgb-to-Y func
; %2 = nr. of XMM registers for rgb-to-UV func
%macro RGB32_FUNCS 2
RGB32_TO_Y_FN %1, r, g, b, a
RGB32_TO_Y_FN %1, b, g, r, a, rgba
RGB32_TO_Y_FN %1, a, r, g, b, rgba
RGB32_TO_Y_FN %1, a, b, g, r, rgba
 
RGB32_TO_UV_FN %2, r, g, b, a
RGB32_TO_UV_FN %2, b, g, r, a, rgba
RGB32_TO_UV_FN %2, a, r, g, b, rgba
RGB32_TO_UV_FN %2, a, b, g, r, rgba
%endmacro
 
%if ARCH_X86_32
INIT_MMX mmx
RGB32_FUNCS 0, 0
%endif
 
INIT_XMM sse2
RGB32_FUNCS 8, 12
 
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
RGB32_FUNCS 8, 12
%endif
 
;-----------------------------------------------------------------------------
; YUYV/UYVY/NV12/NV21 packed pixel shuffling.
;
; void <fmt>ToY_<opt>(uint8_t *dst, const uint8_t *src, int w);
; and
; void <fmt>toUV_<opt>(uint8_t *dstU, uint8_t *dstV, const uint8_t *src,
; const uint8_t *unused, int w);
;-----------------------------------------------------------------------------
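; As a rough scalar sketch (for reference; the C fallbacks live in
; libswscale/input.c), yuyv unpacking is a plain byte gather:
;
;   dst[i] = src[2 * i]; /* yuyvToY */
;   dstU[i] = src[4 * i + 1]; dstV[i] = src[4 * i + 3]; /* yuyvToUV */
;
; uyvy swaps the luma/chroma byte positions, and nv12/nv21 deinterleave
; the alternating U/V (or V/U) bytes of their chroma plane.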
 
; %1 = a (aligned) or u (unaligned)
; %2 = yuyv or uyvy
%macro LOOP_YUYV_TO_Y 2
.loop_%1:
mov%1 m0, [srcq+wq*2] ; (byte) { Y0, U0, Y1, V0, ... }
mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
%ifidn %2, yuyv
pand m0, m2 ; (word) { Y0, Y1, ..., Y7 }
pand m1, m2 ; (word) { Y8, Y9, ..., Y15 }
%else ; uyvy
psrlw m0, 8 ; (word) { Y0, Y1, ..., Y7 }
psrlw m1, 8 ; (word) { Y8, Y9, ..., Y15 }
%endif ; yuyv/uyvy
packuswb m0, m1 ; (byte) { Y0, ..., Y15 }
mova [dstq+wq], m0
add wq, mmsize
jl .loop_%1
REP_RET
%endmacro
 
; %1 = nr. of XMM registers
; %2 = yuyv or uyvy
; %3 = if specified, it means that unaligned and aligned code in loop
; will be the same (i.e. YUYV+AVX), and thus we don't need to
; split the loop into an aligned and an unaligned case
%macro YUYV_TO_Y_FN 2-3
cglobal %2ToY, 5, 5, %1, dst, unused0, unused1, src, w
%if ARCH_X86_64
movsxd wq, wd
%endif
add dstq, wq
%if mmsize == 16
test srcq, 15
%endif
lea srcq, [srcq+wq*2]
%ifidn %2, yuyv
pcmpeqb m2, m2 ; (byte) { 0xff } x 16
psrlw m2, 8 ; (word) { 0x00ff } x 8
%endif ; yuyv
%if mmsize == 16
jnz .loop_u_start
neg wq
LOOP_YUYV_TO_Y a, %2
.loop_u_start:
neg wq
LOOP_YUYV_TO_Y u, %2
%else ; mmsize == 8
neg wq
LOOP_YUYV_TO_Y a, %2
%endif ; mmsize == 8/16
%endmacro
 
; %1 = a (aligned) or u (unaligned)
; %2 = yuyv or uyvy
%macro LOOP_YUYV_TO_UV 2
.loop_%1:
%ifidn %2, yuyv
mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... }
mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
psrlw m0, 8 ; (word) { U0, V0, ..., U3, V3 }
psrlw m1, 8 ; (word) { U4, V4, ..., U7, V7 }
%else ; uyvy
%if cpuflag(avx)
vpand m0, m2, [srcq+wq*4] ; (word) { U0, V0, ..., U3, V3 }
vpand m1, m2, [srcq+wq*4+mmsize] ; (word) { U4, V4, ..., U7, V7 }
%else
mov%1 m0, [srcq+wq*4] ; (byte) { Y0, U0, Y1, V0, ... }
mov%1 m1, [srcq+wq*4+mmsize] ; (byte) { Y8, U4, Y9, V4, ... }
pand m0, m2 ; (word) { U0, V0, ..., U3, V3 }
pand m1, m2 ; (word) { U4, V4, ..., U7, V7 }
%endif
%endif ; yuyv/uyvy
packuswb m0, m1 ; (byte) { U0, V0, ..., U7, V7 }
pand m1, m0, m2 ; (word) { U0, U1, ..., U7 }
psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
%if mmsize == 16
packuswb m1, m0 ; (byte) { U0, ... U7, V0, ... V7 }
movh [dstUq+wq], m1
movhps [dstVq+wq], m1
%else ; mmsize == 8
packuswb m1, m1 ; (byte) { U0, ... U3 }
packuswb m0, m0 ; (byte) { V0, ... V3 }
movh [dstUq+wq], m1
movh [dstVq+wq], m0
%endif ; mmsize == 8/16
add wq, mmsize / 2
jl .loop_%1
REP_RET
%endmacro
 
; %1 = nr. of XMM registers
; %2 = yuyv or uyvy
; %3 = if specified, it means that unaligned and aligned code in loop
; will be the same (i.e. UYVY+AVX), and thus we don't need to
; split the loop into an aligned and an unaligned case
%macro YUYV_TO_UV_FN 2-3
cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add dstUq, wq
add dstVq, wq
%if mmsize == 16 && %0 == 2
test srcq, 15
%endif
lea srcq, [srcq+wq*4]
pcmpeqb m2, m2 ; (byte) { 0xff } x 16
psrlw m2, 8 ; (word) { 0x00ff } x 8
; NOTE: if uyvy+avx, u/a are identical
%if mmsize == 16 && %0 == 2
jnz .loop_u_start
neg wq
LOOP_YUYV_TO_UV a, %2
.loop_u_start:
neg wq
LOOP_YUYV_TO_UV u, %2
%else ; mmsize == 8
neg wq
LOOP_YUYV_TO_UV a, %2
%endif ; mmsize == 8/16
%endmacro
 
; %1 = a (aligned) or u (unaligned)
; %2 = nv12 or nv21
%macro LOOP_NVXX_TO_UV 2
.loop_%1:
mov%1 m0, [srcq+wq*2] ; (byte) { U0, V0, U1, V1, ... }
mov%1 m1, [srcq+wq*2+mmsize] ; (byte) { U8, V8, U9, V9, ... }
pand m2, m0, m5 ; (word) { U0, U1, ..., U7 }
pand m3, m1, m5 ; (word) { U8, U9, ..., U15 }
psrlw m0, 8 ; (word) { V0, V1, ..., V7 }
psrlw m1, 8 ; (word) { V8, V9, ..., V15 }
packuswb m2, m3 ; (byte) { U0, ..., U15 }
packuswb m0, m1 ; (byte) { V0, ..., V15 }
%ifidn %2, nv12
mova [dstUq+wq], m2
mova [dstVq+wq], m0
%else ; nv21
mova [dstVq+wq], m2
mova [dstUq+wq], m0
%endif ; nv12/21
add wq, mmsize
jl .loop_%1
REP_RET
%endmacro
 
; %1 = nr. of XMM registers
; %2 = nv12 or nv21
%macro NVXX_TO_UV_FN 2
cglobal %2ToUV, 4, 5, %1, dstU, dstV, unused, src, w
%if ARCH_X86_64
movsxd wq, dword r5m
%else ; x86-32
mov wq, r5m
%endif
add dstUq, wq
add dstVq, wq
%if mmsize == 16
test srcq, 15
%endif
lea srcq, [srcq+wq*2]
pcmpeqb m5, m5 ; (byte) { 0xff } x 16
psrlw m5, 8 ; (word) { 0x00ff } x 8
%if mmsize == 16
jnz .loop_u_start
neg wq
LOOP_NVXX_TO_UV a, %2
.loop_u_start:
neg wq
LOOP_NVXX_TO_UV u, %2
%else ; mmsize == 8
neg wq
LOOP_NVXX_TO_UV a, %2
%endif ; mmsize == 8/16
%endmacro
 
%if ARCH_X86_32
INIT_MMX mmx
YUYV_TO_Y_FN 0, yuyv
YUYV_TO_Y_FN 0, uyvy
YUYV_TO_UV_FN 0, yuyv
YUYV_TO_UV_FN 0, uyvy
NVXX_TO_UV_FN 0, nv12
NVXX_TO_UV_FN 0, nv21
%endif
 
INIT_XMM sse2
YUYV_TO_Y_FN 3, yuyv
YUYV_TO_Y_FN 2, uyvy
YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
 
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
; in theory, we could write a yuy2-to-y function using vpand (i.e. AVX), but
; that's not faster in practice
YUYV_TO_UV_FN 3, yuyv
YUYV_TO_UV_FN 3, uyvy, 1
NVXX_TO_UV_FN 5, nv12
NVXX_TO_UV_FN 5, nv21
%endif
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/x86/output.asm
0,0 → 1,413
;******************************************************************************
;* x86-optimized vertical line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;* Kieran Kunhya <kieran@kunhya.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
minshort: times 8 dw 0x8000
yuv2yuvX_16_start: times 4 dd 0x4000 - 0x40000000
yuv2yuvX_10_start: times 4 dd 0x10000
yuv2yuvX_9_start: times 4 dd 0x20000
yuv2yuvX_10_upper: times 8 dw 0x3ff
yuv2yuvX_9_upper: times 8 dw 0x1ff
pd_4: times 4 dd 4
pd_4min0x40000: times 4 dd 4 - (0x40000)
pw_16: times 8 dw 16
pw_32: times 8 dw 32
pw_512: times 8 dw 512
pw_1024: times 8 dw 1024
 
SECTION .text
 
;-----------------------------------------------------------------------------
; vertical line scaling
;
; void yuv2plane1_<output_size>_<opt>(const int16_t *src, uint8_t *dst, int dstW,
; const uint8_t *dither, int offset)
; and
; void yuv2planeX_<output_size>_<opt>(const int16_t *filter, int filterSize,
; const int16_t **src, uint8_t *dst, int dstW,
; const uint8_t *dither, int offset)
;
; Scale one or $filterSize lines of source data to generate one line of output
; data. The input is 15-bit in int16_t if $output_size is [8,10] and 19-bit in
; int32_t if $output_size is 16. $filter is 12 bits. $filterSize is a multiple
; of 2. $offset is either 0 or 3. $dither holds 8 values.
;-----------------------------------------------------------------------------
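; A scalar C sketch of the 8-bit cases (for reference; the C versions
; live in libswscale/output.c):
;
;   /* yuv2plane1_8 */
;   for (i = 0; i < dstW; i++)
;       dst[i] = av_clip_uint8((src[i] + dither[(i + offset) & 7]) >> 7);
;
;   /* yuv2planeX_8 */
;   for (i = 0; i < dstW; i++) {
;       int val = dither[(i + offset) & 7] << 12;
;       for (j = 0; j < filterSize; j++)
;           val += src[j][i] * filter[j];
;       dst[i] = av_clip_uint8(val >> 19);
;   }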
 
%macro yuv2planeX_fn 3
 
%if ARCH_X86_32
%define cntr_reg fltsizeq
%define movsx mov
%else
%define cntr_reg r7
%define movsx movsxd
%endif
 
cglobal yuv2planeX_%1, %3, 8, %2, filter, fltsize, src, dst, w, dither, offset
%if %1 == 8 || %1 == 9 || %1 == 10
pxor m6, m6
%endif ; %1 == 8/9/10
 
%if %1 == 8
%if ARCH_X86_32
%assign pad 0x2c - (stack_offset & 15)
SUB rsp, pad
%define m_dith m7
%else ; x86-64
%define m_dith m9
%endif ; x86-32
 
; create registers holding dither
movq m_dith, [ditherq] ; dither
test offsetd, offsetd
jz .no_rot
%if mmsize == 16
punpcklqdq m_dith, m_dith
%endif ; mmsize == 16
PALIGNR m_dith, m_dith, 3, m0
.no_rot:
%if mmsize == 16
punpcklbw m_dith, m6
%if ARCH_X86_64
punpcklwd m8, m_dith, m6
pslld m8, 12
%else ; x86-32
punpcklwd m5, m_dith, m6
pslld m5, 12
%endif ; x86-32/64
punpckhwd m_dith, m6
pslld m_dith, 12
%if ARCH_X86_32
mova [rsp+ 0], m5
mova [rsp+16], m_dith
%endif
%else ; mmsize == 8
punpcklbw m5, m_dith, m6
punpckhbw m_dith, m6
punpcklwd m4, m5, m6
punpckhwd m5, m6
punpcklwd m3, m_dith, m6
punpckhwd m_dith, m6
pslld m4, 12
pslld m5, 12
pslld m3, 12
pslld m_dith, 12
mova [rsp+ 0], m4
mova [rsp+ 8], m5
mova [rsp+16], m3
mova [rsp+24], m_dith
%endif ; mmsize == 8/16
%endif ; %1 == 8
 
xor r5, r5
 
.pixelloop:
%assign %%i 0
; the rep here is for the 8bit output mmx case, where dither covers
; 8 pixels but we can only handle 2 pixels per register, and thus 4
; pixels per iteration. In order to not have to keep track of where
; we are w.r.t. dithering, we unroll the mmx/8bit loop x2.
%if %1 == 8
%assign %%repcnt 16/mmsize
%else
%assign %%repcnt 1
%endif
 
%rep %%repcnt
 
%if %1 == 8
%if ARCH_X86_32
mova m2, [rsp+mmsize*(0+%%i)]
mova m1, [rsp+mmsize*(1+%%i)]
%else ; x86-64
mova m2, m8
mova m1, m_dith
%endif ; x86-32/64
%else ; %1 == 9/10/16
mova m1, [yuv2yuvX_%1_start]
mova m2, m1
%endif ; %1 == 8/9/10/16
movsx cntr_reg, fltsizem
.filterloop_ %+ %%i:
; input pixels
mov r6, [srcq+gprsize*cntr_reg-2*gprsize]
%if %1 == 16
mova m3, [r6+r5*4]
mova m5, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
mova m3, [r6+r5*2]
%endif ; %1 == 8/9/10/16
mov r6, [srcq+gprsize*cntr_reg-gprsize]
%if %1 == 16
mova m4, [r6+r5*4]
mova m6, [r6+r5*4+mmsize]
%else ; %1 == 8/9/10
mova m4, [r6+r5*2]
%endif ; %1 == 8/9/10/16
 
; coefficients
movd m0, [filterq+2*cntr_reg-4] ; coeff[0], coeff[1]
%if %1 == 16
pshuflw m7, m0, 0 ; coeff[0]
pshuflw m0, m0, 0x55 ; coeff[1]
pmovsxwd m7, m7 ; word -> dword
pmovsxwd m0, m0 ; word -> dword
 
pmulld m3, m7
pmulld m5, m7
pmulld m4, m0
pmulld m6, m0
 
paddd m2, m3
paddd m1, m5
paddd m2, m4
paddd m1, m6
%else ; %1 == 10/9/8
punpcklwd m5, m3, m4
punpckhwd m3, m4
SPLATD m0
 
pmaddwd m5, m0
pmaddwd m3, m0
 
paddd m2, m5
paddd m1, m3
%endif ; %1 == 8/9/10/16
 
sub cntr_reg, 2
jg .filterloop_ %+ %%i
 
%if %1 == 16
psrad m2, 31 - %1
psrad m1, 31 - %1
%else ; %1 == 10/9/8
psrad m2, 27 - %1
psrad m1, 27 - %1
%endif ; %1 == 8/9/10/16
 
%if %1 == 8
packssdw m2, m1
packuswb m2, m2
movh [dstq+r5*1], m2
%else ; %1 == 9/10/16
%if %1 == 16
packssdw m2, m1
paddw m2, [minshort]
%else ; %1 == 9/10
%if cpuflag(sse4)
packusdw m2, m1
%else ; mmxext/sse2
packssdw m2, m1
pmaxsw m2, m6
%endif ; mmxext/sse2/sse4/avx
pminsw m2, [yuv2yuvX_%1_upper]
%endif ; %1 == 9/10/16
mova [dstq+r5*2], m2
%endif ; %1 == 8/9/10/16
 
add r5, mmsize/2
sub wd, mmsize/2
 
%assign %%i %%i+2
%endrep
jg .pixelloop
 
%if %1 == 8
%if ARCH_X86_32
ADD rsp, pad
RET
%else ; x86-64
REP_RET
%endif ; x86-32/64
%else ; %1 == 9/10/16
REP_RET
%endif ; %1 == 8/9/10/16
%endmacro
 
%if ARCH_X86_32
INIT_MMX mmxext
yuv2planeX_fn 8, 0, 7
yuv2planeX_fn 9, 0, 5
yuv2planeX_fn 10, 0, 5
%endif
 
INIT_XMM sse2
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
 
INIT_XMM sse4
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
yuv2planeX_fn 16, 8, 5
 
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2planeX_fn 8, 10, 7
yuv2planeX_fn 9, 7, 5
yuv2planeX_fn 10, 7, 5
%endif
 
; %1=output bpc, %2=alignment (u/a)
%macro yuv2plane1_mainloop 2
.loop_%2:
%if %1 == 8
paddsw m0, m2, [srcq+wq*2+mmsize*0]
paddsw m1, m3, [srcq+wq*2+mmsize*1]
psraw m0, 7
psraw m1, 7
packuswb m0, m1
mov%2 [dstq+wq], m0
%elif %1 == 16
paddd m0, m4, [srcq+wq*4+mmsize*0]
paddd m1, m4, [srcq+wq*4+mmsize*1]
paddd m2, m4, [srcq+wq*4+mmsize*2]
paddd m3, m4, [srcq+wq*4+mmsize*3]
psrad m0, 3
psrad m1, 3
psrad m2, 3
psrad m3, 3
%if cpuflag(sse4) ; avx/sse4
packusdw m0, m1
packusdw m2, m3
%else ; mmx/sse2
packssdw m0, m1
packssdw m2, m3
paddw m0, m5
paddw m2, m5
%endif ; mmx/sse2/sse4/avx
mov%2 [dstq+wq*2+mmsize*0], m0
mov%2 [dstq+wq*2+mmsize*1], m2
%else ; %1 == 9/10
paddsw m0, m2, [srcq+wq*2+mmsize*0]
paddsw m1, m2, [srcq+wq*2+mmsize*1]
psraw m0, 15 - %1
psraw m1, 15 - %1
pmaxsw m0, m4
pmaxsw m1, m4
pminsw m0, m3
pminsw m1, m3
mov%2 [dstq+wq*2+mmsize*0], m0
mov%2 [dstq+wq*2+mmsize*1], m1
%endif
add wq, mmsize
jl .loop_%2
%endmacro
 
%macro yuv2plane1_fn 3
cglobal yuv2plane1_%1, %3, %3, %2, src, dst, w, dither, offset
movsxdifnidn wq, wd
add wq, mmsize - 1
and wq, ~(mmsize - 1)
%if %1 == 8
add dstq, wq
%else ; %1 != 8
lea dstq, [dstq+wq*2]
%endif ; %1 == 8
%if %1 == 16
lea srcq, [srcq+wq*4]
%else ; %1 != 16
lea srcq, [srcq+wq*2]
%endif ; %1 == 16
neg wq
 
%if %1 == 8
pxor m4, m4 ; zero
 
; create registers holding dither
movq m3, [ditherq] ; dither
test offsetd, offsetd
jz .no_rot
%if mmsize == 16
punpcklqdq m3, m3
%endif ; mmsize == 16
PALIGNR m3, m3, 3, m2
.no_rot:
%if mmsize == 8
mova m2, m3
punpckhbw m3, m4 ; byte->word
punpcklbw m2, m4 ; byte->word
%else
punpcklbw m3, m4
mova m2, m3
%endif
%elif %1 == 9
pxor m4, m4
mova m3, [pw_512]
mova m2, [pw_32]
%elif %1 == 10
pxor m4, m4
mova m3, [pw_1024]
mova m2, [pw_16]
%else ; %1 == 16
%if cpuflag(sse4) ; sse4/avx
mova m4, [pd_4]
%else ; mmx/sse2
mova m4, [pd_4min0x40000]
mova m5, [minshort]
%endif ; mmx/sse2/sse4/avx
%endif ; %1 == ..
 
; actual pixel scaling
%if mmsize == 8
yuv2plane1_mainloop %1, a
%else ; mmsize == 16
test dstq, 15
jnz .unaligned
yuv2plane1_mainloop %1, a
REP_RET
.unaligned:
yuv2plane1_mainloop %1, u
%endif ; mmsize == 8/16
REP_RET
%endmacro
 
%if ARCH_X86_32
INIT_MMX mmx
yuv2plane1_fn 8, 0, 5
yuv2plane1_fn 16, 0, 3
 
INIT_MMX mmxext
yuv2plane1_fn 9, 0, 3
yuv2plane1_fn 10, 0, 3
%endif
 
INIT_XMM sse2
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 6, 3
 
INIT_XMM sse4
yuv2plane1_fn 16, 5, 3
 
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
yuv2plane1_fn 8, 5, 5
yuv2plane1_fn 9, 5, 3
yuv2plane1_fn 10, 5, 3
yuv2plane1_fn 16, 5, 3
%endif
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/x86/rgb2rgb.c
0,0 → 1,164
/*
* software RGB to RGB converter
* plus software PAL8 to RGB converter
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <stdint.h>
 
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/bswap.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
 
#if HAVE_INLINE_ASM
 
DECLARE_ASM_CONST(8, uint64_t, mmx_ff) = 0x00000000000000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_null) = 0x0000000000000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_one) = 0xFFFFFFFFFFFFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask32b) = 0x000000FF000000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mask32g) = 0x0000FF000000FF00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32r) = 0x00FF000000FF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32a) = 0xFF000000FF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask32) = 0x00FFFFFF00FFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216br) = 0x00F800F800F800F8ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3216g) = 0x0000FC000000FC00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask3215g) = 0x0000F8000000F800ULL;
DECLARE_ASM_CONST(8, uint64_t, mul3216) = 0x2000000420000004ULL;
DECLARE_ASM_CONST(8, uint64_t, mul3215) = 0x2000000820000008ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24b) = 0x00FF0000FF0000FFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24g) = 0xFF0000FF0000FF00ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24r) = 0x0000FF0000FF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24l) = 0x0000000000FFFFFFULL;
DECLARE_ASM_CONST(8, uint64_t, mask24h) = 0x0000FFFFFF000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hh) = 0xffff000000000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hhh) = 0xffffffff00000000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask24hhhh) = 0xffffffffffff0000ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15b) = 0x001F001F001F001FULL; /* 00000000 00011111 xxB */
DECLARE_ASM_CONST(8, uint64_t, mask15rg) = 0x7FE07FE07FE07FE0ULL; /* 01111111 11100000 RGx */
DECLARE_ASM_CONST(8, uint64_t, mask15s) = 0xFFE0FFE0FFE0FFE0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15g) = 0x03E003E003E003E0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask15r) = 0x7C007C007C007C00ULL;
#define mask16b mask15b
DECLARE_ASM_CONST(8, uint64_t, mask16g) = 0x07E007E007E007E0ULL;
DECLARE_ASM_CONST(8, uint64_t, mask16r) = 0xF800F800F800F800ULL;
DECLARE_ASM_CONST(8, uint64_t, red_16mask) = 0x0000f8000000f800ULL;
DECLARE_ASM_CONST(8, uint64_t, green_16mask) = 0x000007e0000007e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_16mask) = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, red_15mask) = 0x00007c0000007c00ULL;
DECLARE_ASM_CONST(8, uint64_t, green_15mask) = 0x000003e0000003e0ULL;
DECLARE_ASM_CONST(8, uint64_t, blue_15mask) = 0x0000001f0000001fULL;
DECLARE_ASM_CONST(8, uint64_t, mul15_mid) = 0x4200420042004200ULL;
DECLARE_ASM_CONST(8, uint64_t, mul15_hi) = 0x0210021002100210ULL;
DECLARE_ASM_CONST(8, uint64_t, mul16_mid) = 0x2080208020802080ULL;
 
DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2YOffset);
DECLARE_ALIGNED(8, extern const uint64_t, ff_w1111);
DECLARE_ALIGNED(8, extern const uint64_t, ff_bgr2UVOffset);
 
#define BY ((int)( 0.098*(1<<RGB2YUV_SHIFT)+0.5))
#define BV ((int)(-0.071*(1<<RGB2YUV_SHIFT)+0.5))
#define BU ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define GY ((int)( 0.504*(1<<RGB2YUV_SHIFT)+0.5))
#define GV ((int)(-0.368*(1<<RGB2YUV_SHIFT)+0.5))
#define GU ((int)(-0.291*(1<<RGB2YUV_SHIFT)+0.5))
#define RY ((int)( 0.257*(1<<RGB2YUV_SHIFT)+0.5))
#define RV ((int)( 0.439*(1<<RGB2YUV_SHIFT)+0.5))
#define RU ((int)(-0.148*(1<<RGB2YUV_SHIFT)+0.5))
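/* These are the studio-swing BT.601 coefficients scaled by
 * 1 << RGB2YUV_SHIFT (15, from swscale_internal.h). As a rough sketch
 * of their intended use on 8-bit RGB input:
 *
 *     Y = ((RY*r + GY*g + BY*b) >> RGB2YUV_SHIFT) + 16;
 *     U = ((RU*r + GU*g + BU*b) >> RGB2YUV_SHIFT) + 128;
 *     V = ((RV*r + GV*g + BV*b) >> RGB2YUV_SHIFT) + 128;
 */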
 
// Note: We have C, MMX, MMXEXT, 3DNOW versions, there is no 3DNOW + MMXEXT one.
 
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_AMD3DNOW 0
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_AVX 0
 
//MMX versions
#undef RENAME
#define RENAME(a) a ## _mmx
#include "rgb2rgb_template.c"
 
// MMXEXT versions
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _mmxext
#include "rgb2rgb_template.c"
 
//SSE2 versions
#undef RENAME
#undef COMPILE_TEMPLATE_SSE2
#define COMPILE_TEMPLATE_SSE2 1
#define RENAME(a) a ## _sse2
#include "rgb2rgb_template.c"
 
//AVX versions
#undef RENAME
#undef COMPILE_TEMPLATE_AVX
#define COMPILE_TEMPLATE_AVX 1
#define RENAME(a) a ## _avx
#include "rgb2rgb_template.c"
 
//3DNOW versions
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_AVX
#undef COMPILE_TEMPLATE_AMD3DNOW
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_AVX 0
#define COMPILE_TEMPLATE_AMD3DNOW 1
#define RENAME(a) a ## _3dnow
#include "rgb2rgb_template.c"
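/* Each #include above re-instantiates the whole template with a
 * different RENAME() suffix, e.g. RENAME(rgb24tobgr32) becomes
 * rgb24tobgr32_mmx, _mmxext, _sse2, _avx or _3dnow;
 * rgb2rgb_init_x86() below picks the best variant at runtime. */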
 
/*
RGB15->RGB16 original by Strepto/Astral
ported to gcc & bugfixed: A'rpi
MMXEXT, 3DNOW optimization by Nick Kurshev
32-bit C version, and the and&add trick by Michael Niedermayer
*/
 
#endif /* HAVE_INLINE_ASM */
 
av_cold void rgb2rgb_init_x86(void)
{
#if HAVE_INLINE_ASM
int cpu_flags = av_get_cpu_flags();
 
if (INLINE_MMX(cpu_flags))
rgb2rgb_init_mmx();
if (INLINE_AMD3DNOW(cpu_flags))
rgb2rgb_init_3dnow();
if (INLINE_MMXEXT(cpu_flags))
rgb2rgb_init_mmxext();
if (INLINE_SSE2(cpu_flags))
rgb2rgb_init_sse2();
if (INLINE_AVX(cpu_flags))
rgb2rgb_init_avx();
#endif /* HAVE_INLINE_ASM */
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/x86/rgb2rgb_template.c
0,0 → 1,2601
/*
* software RGB to RGB converter
* plus software PAL8 to RGB converter
* software YUV to YUV converter
* software YUV to RGB converter
* Written by Nick Kurshev.
* palette & YUV & runtime CPU stuff by Michael (michaelni@gmx.at)
* lots of big-endian byte order fixes by Alex Beregszaszi
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <stddef.h>
#include <stdint.h>
 
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
 
#undef PREFETCH
#undef MOVNTQ
#undef EMMS
#undef SFENCE
#undef PAVGB
 
#if COMPILE_TEMPLATE_AMD3DNOW
#define PREFETCH "prefetch"
#define PAVGB "pavgusb"
#elif COMPILE_TEMPLATE_MMXEXT
#define PREFETCH "prefetchnta"
#define PAVGB "pavgb"
#else
#define PREFETCH " # nop"
#endif
 
#if COMPILE_TEMPLATE_AMD3DNOW
/* On K6 femms is faster than emms. On K7 femms is directly mapped to emms. */
#define EMMS "femms"
#else
#define EMMS "emms"
#endif
 
#if COMPILE_TEMPLATE_MMXEXT
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
 
#if !COMPILE_TEMPLATE_SSE2
 
#if !COMPILE_TEMPLATE_AMD3DNOW
 
static inline void RENAME(rgb24tobgr32)(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *dest = dst;
const uint8_t *s = src;
const uint8_t *end;
const uint8_t *mm_end;
end = s + src_size;
__asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
mm_end = end - 23;
__asm__ volatile("movq %0, %%mm7"::"m"(mask32a):"memory");
while (s < mm_end) {
__asm__ volatile(
PREFETCH" 32(%1) \n\t"
"movd (%1), %%mm0 \n\t"
"punpckldq 3(%1), %%mm0 \n\t"
"movd 6(%1), %%mm1 \n\t"
"punpckldq 9(%1), %%mm1 \n\t"
"movd 12(%1), %%mm2 \n\t"
"punpckldq 15(%1), %%mm2 \n\t"
"movd 18(%1), %%mm3 \n\t"
"punpckldq 21(%1), %%mm3 \n\t"
"por %%mm7, %%mm0 \n\t"
"por %%mm7, %%mm1 \n\t"
"por %%mm7, %%mm2 \n\t"
"por %%mm7, %%mm3 \n\t"
MOVNTQ" %%mm0, (%0) \n\t"
MOVNTQ" %%mm1, 8(%0) \n\t"
MOVNTQ" %%mm2, 16(%0) \n\t"
MOVNTQ" %%mm3, 24(%0)"
:: "r"(dest), "r"(s)
:"memory");
dest += 32;
s += 24;
}
__asm__ volatile(SFENCE:::"memory");
__asm__ volatile(EMMS:::"memory");
while (s < end) {
*dest++ = *s++;
*dest++ = *s++;
*dest++ = *s++;
*dest++ = 255;
}
}
 
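/* Pack eight 32-bit BGRX pixels (held twice, in the register pairs
 * mm0/mm2, mm1/mm3, mm4/mm6 and mm5/mm7) into 24 contiguous BGR bytes:
 * drop every fourth (X) byte, then splice the 3-byte pixels together
 * across three output qwords. */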
#define STORE_BGR24_MMX \
"psrlq $8, %%mm2 \n\t" \
"psrlq $8, %%mm3 \n\t" \
"psrlq $8, %%mm6 \n\t" \
"psrlq $8, %%mm7 \n\t" \
"pand "MANGLE(mask24l)", %%mm0\n\t" \
"pand "MANGLE(mask24l)", %%mm1\n\t" \
"pand "MANGLE(mask24l)", %%mm4\n\t" \
"pand "MANGLE(mask24l)", %%mm5\n\t" \
"pand "MANGLE(mask24h)", %%mm2\n\t" \
"pand "MANGLE(mask24h)", %%mm3\n\t" \
"pand "MANGLE(mask24h)", %%mm6\n\t" \
"pand "MANGLE(mask24h)", %%mm7\n\t" \
"por %%mm2, %%mm0 \n\t" \
"por %%mm3, %%mm1 \n\t" \
"por %%mm6, %%mm4 \n\t" \
"por %%mm7, %%mm5 \n\t" \
\
"movq %%mm1, %%mm2 \n\t" \
"movq %%mm4, %%mm3 \n\t" \
"psllq $48, %%mm2 \n\t" \
"psllq $32, %%mm3 \n\t" \
"por %%mm2, %%mm0 \n\t" \
"psrlq $16, %%mm1 \n\t" \
"psrlq $32, %%mm4 \n\t" \
"psllq $16, %%mm5 \n\t" \
"por %%mm3, %%mm1 \n\t" \
"por %%mm5, %%mm4 \n\t" \
\
MOVNTQ" %%mm0, (%0) \n\t" \
MOVNTQ" %%mm1, 8(%0) \n\t" \
MOVNTQ" %%mm4, 16(%0)"
 
 
static inline void RENAME(rgb32tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
uint8_t *dest = dst;
const uint8_t *s = src;
const uint8_t *end;
const uint8_t *mm_end;
end = s + src_size;
__asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
mm_end = end - 31;
while (s < mm_end) {
__asm__ volatile(
PREFETCH" 32(%1) \n\t"
"movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm1 \n\t"
"movq 16(%1), %%mm4 \n\t"
"movq 24(%1), %%mm5 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm1, %%mm3 \n\t"
"movq %%mm4, %%mm6 \n\t"
"movq %%mm5, %%mm7 \n\t"
STORE_BGR24_MMX
:: "r"(dest), "r"(s)
NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
:"memory");
dest += 24;
s += 32;
}
__asm__ volatile(SFENCE:::"memory");
__asm__ volatile(EMMS:::"memory");
while (s < end) {
*dest++ = *s++;
*dest++ = *s++;
*dest++ = *s++;
s++;
}
}
 
/*
original by Strepto/Astral
ported to gcc & bugfixed: A'rpi
MMXEXT, 3DNOW optimization by Nick Kurshev
32-bit C version, and the and&add trick by Michael Niedermayer
*/
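/* The and&add trick: (x & 0x7FFF) + (x & 0x7FE0) adds the R and G
 * fields of an RGB555 word to themselves, shifting them up one bit
 * while B stays put, which turns RGB555 into RGB565 (with the new
 * green LSB zero). The MMX loop below does the same with mask15s. */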
static inline void RENAME(rgb15to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
register const uint8_t* s=src;
register uint8_t* d=dst;
register const uint8_t *end;
const uint8_t *mm_end;
end = s + src_size;
__asm__ volatile(PREFETCH" %0"::"m"(*s));
__asm__ volatile("movq %0, %%mm4"::"m"(mask15s));
mm_end = end - 15;
while (s<mm_end) {
__asm__ volatile(
PREFETCH" 32(%1) \n\t"
"movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"pand %%mm4, %%mm0 \n\t"
"pand %%mm4, %%mm2 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm3, %%mm2 \n\t"
MOVNTQ" %%mm0, (%0) \n\t"
MOVNTQ" %%mm2, 8(%0)"
:: "r"(d), "r"(s)
);
d+=16;
s+=16;
}
__asm__ volatile(SFENCE:::"memory");
__asm__ volatile(EMMS:::"memory");
mm_end = end - 3;
while (s < mm_end) {
register unsigned x= *((const uint32_t *)s);
*((uint32_t *)d) = (x&0x7FFF7FFF) + (x&0x7FE07FE0);
d+=4;
s+=4;
}
if (s < end) {
register unsigned short x= *((const uint16_t *)s);
*((uint16_t *)d) = (x&0x7FFF) + (x&0x7FE0);
}
}
 
static inline void RENAME(rgb16to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
register const uint8_t* s=src;
register uint8_t* d=dst;
register const uint8_t *end;
const uint8_t *mm_end;
end = s + src_size;
__asm__ volatile(PREFETCH" %0"::"m"(*s));
__asm__ volatile("movq %0, %%mm7"::"m"(mask15rg));
__asm__ volatile("movq %0, %%mm6"::"m"(mask15b));
mm_end = end - 15;
while (s<mm_end) {
__asm__ volatile(
PREFETCH" 32(%1) \n\t"
"movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"psrlq $1, %%mm0 \n\t"
"psrlq $1, %%mm2 \n\t"
"pand %%mm7, %%mm0 \n\t"
"pand %%mm7, %%mm2 \n\t"
"pand %%mm6, %%mm1 \n\t"
"pand %%mm6, %%mm3 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm3, %%mm2 \n\t"
MOVNTQ" %%mm0, (%0) \n\t"
MOVNTQ" %%mm2, 8(%0)"
:: "r"(d), "r"(s)
);
d+=16;
s+=16;
}
__asm__ volatile(SFENCE:::"memory");
__asm__ volatile(EMMS:::"memory");
mm_end = end - 3;
while (s < mm_end) {
register uint32_t x= *((const uint32_t*)s);
*((uint32_t *)d) = ((x>>1)&0x7FE07FE0) | (x&0x001F001F);
s+=4;
d+=4;
}
if (s < end) {
register uint16_t x= *((const uint16_t*)s);
*((uint16_t *)d) = ((x>>1)&0x7FE0) | (x&0x001F);
}
}
 
static inline void RENAME(rgb32to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
const uint8_t *s = src;
const uint8_t *end;
const uint8_t *mm_end;
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
mm_end = end - 15;
__asm__ volatile(
"movq %3, %%mm5 \n\t"
"movq %4, %%mm6 \n\t"
"movq %5, %%mm7 \n\t"
"jmp 2f \n\t"
".p2align 4 \n\t"
"1: \n\t"
PREFETCH" 32(%1) \n\t"
"movd (%1), %%mm0 \n\t"
"movd 4(%1), %%mm3 \n\t"
"punpckldq 8(%1), %%mm0 \n\t"
"punpckldq 12(%1), %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm3, %%mm4 \n\t"
"pand %%mm6, %%mm0 \n\t"
"pand %%mm6, %%mm3 \n\t"
"pmaddwd %%mm7, %%mm0 \n\t"
"pmaddwd %%mm7, %%mm3 \n\t"
"pand %%mm5, %%mm1 \n\t"
"pand %%mm5, %%mm4 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm4, %%mm3 \n\t"
"psrld $5, %%mm0 \n\t"
"pslld $11, %%mm3 \n\t"
"por %%mm3, %%mm0 \n\t"
MOVNTQ" %%mm0, (%0) \n\t"
"add $16, %1 \n\t"
"add $8, %0 \n\t"
"2: \n\t"
"cmp %2, %1 \n\t"
" jb 1b \n\t"
: "+r" (d), "+r"(s)
: "r" (mm_end), "m" (mask3216g), "m" (mask3216br), "m" (mul3216)
);
__asm__ volatile(SFENCE:::"memory");
__asm__ volatile(EMMS:::"memory");
while (s < end) {
register int rgb = *(const uint32_t*)s; s += 4;
*d++ = ((rgb&0xFF)>>3) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>8);
}
}
 
static inline void RENAME(rgb32tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
{
const uint8_t *s = src;
const uint8_t *end;
const uint8_t *mm_end;
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
__asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm__ volatile(
"movq %0, %%mm7 \n\t"
"movq %1, %%mm6 \n\t"
::"m"(red_16mask),"m"(green_16mask));
mm_end = end - 15;
while (s < mm_end) {
__asm__ volatile(
PREFETCH" 32(%1) \n\t"
"movd (%1), %%mm0 \n\t"
"movd 4(%1), %%mm3 \n\t"
"punpckldq 8(%1), %%mm0 \n\t"
"punpckldq 12(%1), %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm3, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
"psllq $8, %%mm0 \n\t"
"psllq $8, %%mm3 \n\t"
"pand %%mm7, %%mm0 \n\t"
"pand %%mm7, %%mm3 \n\t"
"psrlq $5, %%mm1 \n\t"
"psrlq $5, %%mm4 \n\t"
"pand %%mm6, %%mm1 \n\t"
"pand %%mm6, %%mm4 \n\t"
"psrlq $19, %%mm2 \n\t"
"psrlq $19, %%mm5 \n\t"
"pand %2, %%mm2 \n\t"
"pand %2, %%mm5 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm4, %%mm3 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm5, %%mm3 \n\t"
"psllq $16, %%mm3 \n\t"
"por %%mm3, %%mm0 \n\t"
MOVNTQ" %%mm0, (%0) \n\t"
:: "r"(d),"r"(s),"m"(blue_16mask):"memory");
d += 4;
s += 16;
}
__asm__ volatile(SFENCE:::"memory");
__asm__ volatile(EMMS:::"memory");
while (s < end) {
register int rgb = *(const uint32_t*)s; s += 4;
*d++ = ((rgb&0xF8)<<8) + ((rgb&0xFC00)>>5) + ((rgb&0xF80000)>>19);
}
}
 
static inline void RENAME(rgb32to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
const uint8_t *s = src;
const uint8_t *end;
const uint8_t *mm_end;
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
mm_end = end - 15;
__asm__ volatile(
"movq %3, %%mm5 \n\t"
"movq %4, %%mm6 \n\t"
"movq %5, %%mm7 \n\t"
"jmp 2f \n\t"
".p2align 4 \n\t"
"1: \n\t"
PREFETCH" 32(%1) \n\t"
"movd (%1), %%mm0 \n\t"
"movd 4(%1), %%mm3 \n\t"
"punpckldq 8(%1), %%mm0 \n\t"
"punpckldq 12(%1), %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm3, %%mm4 \n\t"
"pand %%mm6, %%mm0 \n\t"
"pand %%mm6, %%mm3 \n\t"
"pmaddwd %%mm7, %%mm0 \n\t"
"pmaddwd %%mm7, %%mm3 \n\t"
"pand %%mm5, %%mm1 \n\t"
"pand %%mm5, %%mm4 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm4, %%mm3 \n\t"
"psrld $6, %%mm0 \n\t"
"pslld $10, %%mm3 \n\t"
"por %%mm3, %%mm0 \n\t"
MOVNTQ" %%mm0, (%0) \n\t"
"add $16, %1 \n\t"
"add $8, %0 \n\t"
"2: \n\t"
"cmp %2, %1 \n\t"
" jb 1b \n\t"
: "+r" (d), "+r"(s)
: "r" (mm_end), "m" (mask3215g), "m" (mask3216br), "m" (mul3215)
);
__asm__ volatile(SFENCE:::"memory");
__asm__ volatile(EMMS:::"memory");
while (s < end) {
register int rgb = *(const uint32_t*)s; s += 4;
*d++ = ((rgb&0xFF)>>3) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>9);
}
}
 
static inline void RENAME(rgb32tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
{
const uint8_t *s = src;
const uint8_t *end;
const uint8_t *mm_end;
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
__asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm__ volatile(
"movq %0, %%mm7 \n\t"
"movq %1, %%mm6 \n\t"
::"m"(red_15mask),"m"(green_15mask));
mm_end = end - 15;
while (s < mm_end) {
__asm__ volatile(
PREFETCH" 32(%1) \n\t"
"movd (%1), %%mm0 \n\t"
"movd 4(%1), %%mm3 \n\t"
"punpckldq 8(%1), %%mm0 \n\t"
"punpckldq 12(%1), %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm3, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
"psllq $7, %%mm0 \n\t"
"psllq $7, %%mm3 \n\t"
"pand %%mm7, %%mm0 \n\t"
"pand %%mm7, %%mm3 \n\t"
"psrlq $6, %%mm1 \n\t"
"psrlq $6, %%mm4 \n\t"
"pand %%mm6, %%mm1 \n\t"
"pand %%mm6, %%mm4 \n\t"
"psrlq $19, %%mm2 \n\t"
"psrlq $19, %%mm5 \n\t"
"pand %2, %%mm2 \n\t"
"pand %2, %%mm5 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm4, %%mm3 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm5, %%mm3 \n\t"
"psllq $16, %%mm3 \n\t"
"por %%mm3, %%mm0 \n\t"
MOVNTQ" %%mm0, (%0) \n\t"
::"r"(d),"r"(s),"m"(blue_15mask):"memory");
d += 4;
s += 16;
}
__asm__ volatile(SFENCE:::"memory");
__asm__ volatile(EMMS:::"memory");
while (s < end) {
register int rgb = *(const uint32_t*)s; s += 4;
*d++ = ((rgb&0xF8)<<7) + ((rgb&0xF800)>>6) + ((rgb&0xF80000)>>19);
}
}
 
static inline void RENAME(rgb24tobgr16)(const uint8_t *src, uint8_t *dst, int src_size)
{
const uint8_t *s = src;
const uint8_t *end;
const uint8_t *mm_end;
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
__asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm__ volatile(
"movq %0, %%mm7 \n\t"
"movq %1, %%mm6 \n\t"
::"m"(red_16mask),"m"(green_16mask));
mm_end = end - 11;
while (s < mm_end) {
__asm__ volatile(
PREFETCH" 32(%1) \n\t"
"movd (%1), %%mm0 \n\t"
"movd 3(%1), %%mm3 \n\t"
"punpckldq 6(%1), %%mm0 \n\t"
"punpckldq 9(%1), %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm3, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
"psrlq $3, %%mm0 \n\t"
"psrlq $3, %%mm3 \n\t"
"pand %2, %%mm0 \n\t"
"pand %2, %%mm3 \n\t"
"psrlq $5, %%mm1 \n\t"
"psrlq $5, %%mm4 \n\t"
"pand %%mm6, %%mm1 \n\t"
"pand %%mm6, %%mm4 \n\t"
"psrlq $8, %%mm2 \n\t"
"psrlq $8, %%mm5 \n\t"
"pand %%mm7, %%mm2 \n\t"
"pand %%mm7, %%mm5 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm4, %%mm3 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm5, %%mm3 \n\t"
"psllq $16, %%mm3 \n\t"
"por %%mm3, %%mm0 \n\t"
MOVNTQ" %%mm0, (%0) \n\t"
::"r"(d),"r"(s),"m"(blue_16mask):"memory");
d += 4;
s += 12;
}
__asm__ volatile(SFENCE:::"memory");
__asm__ volatile(EMMS:::"memory");
while (s < end) {
const int b = *s++;
const int g = *s++;
const int r = *s++;
*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
}
}
 
static inline void RENAME(rgb24to16)(const uint8_t *src, uint8_t *dst, int src_size)
{
const uint8_t *s = src;
const uint8_t *end;
const uint8_t *mm_end;
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
__asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm__ volatile(
"movq %0, %%mm7 \n\t"
"movq %1, %%mm6 \n\t"
::"m"(red_16mask),"m"(green_16mask));
mm_end = end - 15;
while (s < mm_end) {
__asm__ volatile(
PREFETCH" 32(%1) \n\t"
"movd (%1), %%mm0 \n\t"
"movd 3(%1), %%mm3 \n\t"
"punpckldq 6(%1), %%mm0 \n\t"
"punpckldq 9(%1), %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm3, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
"psllq $8, %%mm0 \n\t"
"psllq $8, %%mm3 \n\t"
"pand %%mm7, %%mm0 \n\t"
"pand %%mm7, %%mm3 \n\t"
"psrlq $5, %%mm1 \n\t"
"psrlq $5, %%mm4 \n\t"
"pand %%mm6, %%mm1 \n\t"
"pand %%mm6, %%mm4 \n\t"
"psrlq $19, %%mm2 \n\t"
"psrlq $19, %%mm5 \n\t"
"pand %2, %%mm2 \n\t"
"pand %2, %%mm5 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm4, %%mm3 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm5, %%mm3 \n\t"
"psllq $16, %%mm3 \n\t"
"por %%mm3, %%mm0 \n\t"
MOVNTQ" %%mm0, (%0) \n\t"
::"r"(d),"r"(s),"m"(blue_16mask):"memory");
d += 4;
s += 12;
}
__asm__ volatile(SFENCE:::"memory");
__asm__ volatile(EMMS:::"memory");
while (s < end) {
const int r = *s++;
const int g = *s++;
const int b = *s++;
*d++ = (b>>3) | ((g&0xFC)<<3) | ((r&0xF8)<<8);
}
}
 
static inline void RENAME(rgb24tobgr15)(const uint8_t *src, uint8_t *dst, int src_size)
{
const uint8_t *s = src;
const uint8_t *end;
const uint8_t *mm_end;
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
__asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm__ volatile(
"movq %0, %%mm7 \n\t"
"movq %1, %%mm6 \n\t"
::"m"(red_15mask),"m"(green_15mask));
mm_end = end - 11;
while (s < mm_end) {
__asm__ volatile(
PREFETCH" 32(%1) \n\t"
"movd (%1), %%mm0 \n\t"
"movd 3(%1), %%mm3 \n\t"
"punpckldq 6(%1), %%mm0 \n\t"
"punpckldq 9(%1), %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm3, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
"psrlq $3, %%mm0 \n\t"
"psrlq $3, %%mm3 \n\t"
"pand %2, %%mm0 \n\t"
"pand %2, %%mm3 \n\t"
"psrlq $6, %%mm1 \n\t"
"psrlq $6, %%mm4 \n\t"
"pand %%mm6, %%mm1 \n\t"
"pand %%mm6, %%mm4 \n\t"
"psrlq $9, %%mm2 \n\t"
"psrlq $9, %%mm5 \n\t"
"pand %%mm7, %%mm2 \n\t"
"pand %%mm7, %%mm5 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm4, %%mm3 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm5, %%mm3 \n\t"
"psllq $16, %%mm3 \n\t"
"por %%mm3, %%mm0 \n\t"
MOVNTQ" %%mm0, (%0) \n\t"
::"r"(d),"r"(s),"m"(blue_15mask):"memory");
d += 4;
s += 12;
}
__asm__ volatile(SFENCE:::"memory");
__asm__ volatile(EMMS:::"memory");
while (s < end) {
const int b = *s++;
const int g = *s++;
const int r = *s++;
*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
}
}
 
static inline void RENAME(rgb24to15)(const uint8_t *src, uint8_t *dst, int src_size)
{
const uint8_t *s = src;
const uint8_t *end;
const uint8_t *mm_end;
uint16_t *d = (uint16_t *)dst;
end = s + src_size;
__asm__ volatile(PREFETCH" %0"::"m"(*src):"memory");
__asm__ volatile(
"movq %0, %%mm7 \n\t"
"movq %1, %%mm6 \n\t"
::"m"(red_15mask),"m"(green_15mask));
mm_end = end - 15;
while (s < mm_end) {
__asm__ volatile(
PREFETCH" 32(%1) \n\t"
"movd (%1), %%mm0 \n\t"
"movd 3(%1), %%mm3 \n\t"
"punpckldq 6(%1), %%mm0 \n\t"
"punpckldq 9(%1), %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm3, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
"psllq $7, %%mm0 \n\t"
"psllq $7, %%mm3 \n\t"
"pand %%mm7, %%mm0 \n\t"
"pand %%mm7, %%mm3 \n\t"
"psrlq $6, %%mm1 \n\t"
"psrlq $6, %%mm4 \n\t"
"pand %%mm6, %%mm1 \n\t"
"pand %%mm6, %%mm4 \n\t"
"psrlq $19, %%mm2 \n\t"
"psrlq $19, %%mm5 \n\t"
"pand %2, %%mm2 \n\t"
"pand %2, %%mm5 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm4, %%mm3 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm5, %%mm3 \n\t"
"psllq $16, %%mm3 \n\t"
"por %%mm3, %%mm0 \n\t"
MOVNTQ" %%mm0, (%0) \n\t"
::"r"(d),"r"(s),"m"(blue_15mask):"memory");
d += 4;
s += 12;
}
__asm__ volatile(SFENCE:::"memory");
__asm__ volatile(EMMS:::"memory");
while (s < end) {
const int r = *s++;
const int g = *s++;
const int b = *s++;
*d++ = (b>>3) | ((g&0xF8)<<2) | ((r&0xF8)<<7);
}
}
 
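/* 15/16-bit -> 24-bit expansion replicates each field's top bits into
 * the low bits (e.g. 5-bit B -> (B << 3) | (B >> 2), so 0x1F maps to
 * 0xFF); the pmulhw with the mul15_mid/mul15_hi/mul16_mid constants
 * performs an equivalent expansion in the SIMD paths below. */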
static inline void RENAME(rgb15tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
const uint16_t *end;
const uint16_t *mm_end;
uint8_t *d = dst;
const uint16_t *s = (const uint16_t*)src;
end = s + src_size/2;
__asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
mm_end = end - 7;
while (s < mm_end) {
__asm__ volatile(
PREFETCH" 32(%1) \n\t"
"movq (%1), %%mm0 \n\t"
"movq (%1), %%mm1 \n\t"
"movq (%1), %%mm2 \n\t"
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
"psllq $5, %%mm0 \n\t"
"pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
"pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
"pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
"movq %%mm0, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"movq %%mm2, %%mm5 \n\t"
"punpcklwd %5, %%mm0 \n\t"
"punpcklwd %5, %%mm1 \n\t"
"punpcklwd %5, %%mm2 \n\t"
"punpckhwd %5, %%mm3 \n\t"
"punpckhwd %5, %%mm4 \n\t"
"punpckhwd %5, %%mm5 \n\t"
"psllq $8, %%mm1 \n\t"
"psllq $16, %%mm2 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm2, %%mm0 \n\t"
"psllq $8, %%mm4 \n\t"
"psllq $16, %%mm5 \n\t"
"por %%mm4, %%mm3 \n\t"
"por %%mm5, %%mm3 \n\t"
 
"movq %%mm0, %%mm6 \n\t"
"movq %%mm3, %%mm7 \n\t"
 
"movq 8(%1), %%mm0 \n\t"
"movq 8(%1), %%mm1 \n\t"
"movq 8(%1), %%mm2 \n\t"
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
"psllq $5, %%mm0 \n\t"
"pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
"pmulhw "MANGLE(mul15_mid)", %%mm1 \n\t"
"pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
"movq %%mm0, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"movq %%mm2, %%mm5 \n\t"
"punpcklwd %5, %%mm0 \n\t"
"punpcklwd %5, %%mm1 \n\t"
"punpcklwd %5, %%mm2 \n\t"
"punpckhwd %5, %%mm3 \n\t"
"punpckhwd %5, %%mm4 \n\t"
"punpckhwd %5, %%mm5 \n\t"
"psllq $8, %%mm1 \n\t"
"psllq $16, %%mm2 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm2, %%mm0 \n\t"
"psllq $8, %%mm4 \n\t"
"psllq $16, %%mm5 \n\t"
"por %%mm4, %%mm3 \n\t"
"por %%mm5, %%mm3 \n\t"
 
:"=m"(*d)
:"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r), "m"(mmx_null)
NAMED_CONSTRAINTS_ADD(mul15_mid,mul15_hi)
:"memory");
/* reuse the 32-to-24-bit store (STORE_BGR24_MMX) */
__asm__ volatile(
"movq %%mm0, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
"movq %%mm6, %%mm0 \n\t"
"movq %%mm7, %%mm1 \n\t"
 
"movq %%mm4, %%mm6 \n\t"
"movq %%mm5, %%mm7 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm1, %%mm3 \n\t"
 
STORE_BGR24_MMX
 
:: "r"(d), "m"(*s)
NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
:"memory");
d += 24;
s += 8;
}
__asm__ volatile(SFENCE:::"memory");
__asm__ volatile(EMMS:::"memory");
while (s < end) {
register uint16_t bgr;
bgr = *s++;
*d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
*d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
}
}
 
static inline void RENAME(rgb16tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
const uint16_t *end;
const uint16_t *mm_end;
uint8_t *d = (uint8_t *)dst;
const uint16_t *s = (const uint16_t *)src;
end = s + src_size/2;
__asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
mm_end = end - 7;
while (s < mm_end) {
__asm__ volatile(
PREFETCH" 32(%1) \n\t"
"movq (%1), %%mm0 \n\t"
"movq (%1), %%mm1 \n\t"
"movq (%1), %%mm2 \n\t"
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
"psllq $5, %%mm0 \n\t"
"psrlq $1, %%mm2 \n\t"
"pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
"pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
"pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
"movq %%mm0, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"movq %%mm2, %%mm5 \n\t"
"punpcklwd %5, %%mm0 \n\t"
"punpcklwd %5, %%mm1 \n\t"
"punpcklwd %5, %%mm2 \n\t"
"punpckhwd %5, %%mm3 \n\t"
"punpckhwd %5, %%mm4 \n\t"
"punpckhwd %5, %%mm5 \n\t"
"psllq $8, %%mm1 \n\t"
"psllq $16, %%mm2 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm2, %%mm0 \n\t"
"psllq $8, %%mm4 \n\t"
"psllq $16, %%mm5 \n\t"
"por %%mm4, %%mm3 \n\t"
"por %%mm5, %%mm3 \n\t"
 
"movq %%mm0, %%mm6 \n\t"
"movq %%mm3, %%mm7 \n\t"
 
"movq 8(%1), %%mm0 \n\t"
"movq 8(%1), %%mm1 \n\t"
"movq 8(%1), %%mm2 \n\t"
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
"psllq $5, %%mm0 \n\t"
"psrlq $1, %%mm2 \n\t"
"pmulhw "MANGLE(mul15_mid)", %%mm0 \n\t"
"pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
"pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
"movq %%mm0, %%mm3 \n\t"
"movq %%mm1, %%mm4 \n\t"
"movq %%mm2, %%mm5 \n\t"
"punpcklwd %5, %%mm0 \n\t"
"punpcklwd %5, %%mm1 \n\t"
"punpcklwd %5, %%mm2 \n\t"
"punpckhwd %5, %%mm3 \n\t"
"punpckhwd %5, %%mm4 \n\t"
"punpckhwd %5, %%mm5 \n\t"
"psllq $8, %%mm1 \n\t"
"psllq $16, %%mm2 \n\t"
"por %%mm1, %%mm0 \n\t"
"por %%mm2, %%mm0 \n\t"
"psllq $8, %%mm4 \n\t"
"psllq $16, %%mm5 \n\t"
"por %%mm4, %%mm3 \n\t"
"por %%mm5, %%mm3 \n\t"
:"=m"(*d)
:"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mmx_null)
NAMED_CONSTRAINTS_ADD(mul15_mid,mul16_mid,mul15_hi)
:"memory");
/* pack the 32-bit intermediates down to 24 bits, borrowed from the 32-to-24 code (STORE_BGR24_MMX) */
__asm__ volatile(
"movq %%mm0, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
"movq %%mm6, %%mm0 \n\t"
"movq %%mm7, %%mm1 \n\t"
 
"movq %%mm4, %%mm6 \n\t"
"movq %%mm5, %%mm7 \n\t"
"movq %%mm0, %%mm2 \n\t"
"movq %%mm1, %%mm3 \n\t"
 
STORE_BGR24_MMX
 
:: "r"(d), "m"(*s)
NAMED_CONSTRAINTS_ADD(mask24l,mask24h)
:"memory");
d += 24;
s += 8;
}
__asm__ volatile(SFENCE:::"memory");
__asm__ volatile(EMMS:::"memory");
while (s < end) {
register uint16_t bgr;
bgr = *s++;
*d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
*d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
}
}
 
/*
* mm0 = 00 B3 00 B2 00 B1 00 B0
* mm1 = 00 G3 00 G2 00 G1 00 G0
* mm2 = 00 R3 00 R2 00 R1 00 R0
* mm6 = FF FF FF FF FF FF FF FF
* mm7 = 00 00 00 00 00 00 00 00
*/
#define PACK_RGB32 \
"packuswb %%mm7, %%mm0 \n\t" /* 00 00 00 00 B3 B2 B1 B0 */ \
"packuswb %%mm7, %%mm1 \n\t" /* 00 00 00 00 G3 G2 G1 G0 */ \
"packuswb %%mm7, %%mm2 \n\t" /* 00 00 00 00 R3 R2 R1 R0 */ \
"punpcklbw %%mm1, %%mm0 \n\t" /* G3 B3 G2 B2 G1 B1 G0 B0 */ \
"punpcklbw %%mm6, %%mm2 \n\t" /* FF R3 FF R2 FF R1 FF R0 */ \
"movq %%mm0, %%mm3 \n\t" \
"punpcklwd %%mm2, %%mm0 \n\t" /* FF R1 G1 B1 FF R0 G0 B0 */ \
"punpckhwd %%mm2, %%mm3 \n\t" /* FF R3 G3 B3 FF R2 G2 B2 */ \
MOVNTQ" %%mm0, (%0) \n\t" \
MOVNTQ" %%mm3, 8(%0) \n\t" \
 
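/*
* 15-bit RGB to 32-bit RGB with the alpha byte forced to 255 (PACK_RGB32
* takes the FF bytes from mm6, preloaded via pcmpeqd). Channels are widened
* with the same pmulhw trick as the 24-bit routines; 4 pixels per iteration.
*/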
static inline void RENAME(rgb15to32)(const uint8_t *src, uint8_t *dst, int src_size)
{
const uint16_t *end;
const uint16_t *mm_end;
uint8_t *d = dst;
const uint16_t *s = (const uint16_t *)src;
end = s + src_size/2;
__asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
__asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
__asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
mm_end = end - 3;
while (s < mm_end) {
__asm__ volatile(
PREFETCH" 32(%1) \n\t"
"movq (%1), %%mm0 \n\t"
"movq (%1), %%mm1 \n\t"
"movq (%1), %%mm2 \n\t"
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
"psllq $5, %%mm0 \n\t"
"pmulhw %5, %%mm0 \n\t"
"pmulhw %5, %%mm1 \n\t"
"pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
PACK_RGB32
::"r"(d),"r"(s),"m"(mask15b),"m"(mask15g),"m"(mask15r) ,"m"(mul15_mid)
NAMED_CONSTRAINTS_ADD(mul15_hi)
:"memory");
d += 16;
s += 4;
}
__asm__ volatile(SFENCE:::"memory");
__asm__ volatile(EMMS:::"memory");
while (s < end) {
register uint16_t bgr;
bgr = *s++;
*d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
*d++ = ((bgr&0x3E0)>>2) | ((bgr&0x3E0)>>7);
*d++ = ((bgr&0x7C00)>>7) | ((bgr&0x7C00)>>12);
*d++ = 255;
}
}
 
static inline void RENAME(rgb16to32)(const uint8_t *src, uint8_t *dst, int src_size)
{
const uint16_t *end;
const uint16_t *mm_end;
uint8_t *d = dst;
const uint16_t *s = (const uint16_t*)src;
end = s + src_size/2;
__asm__ volatile(PREFETCH" %0"::"m"(*s):"memory");
__asm__ volatile("pxor %%mm7,%%mm7 \n\t":::"memory");
__asm__ volatile("pcmpeqd %%mm6,%%mm6 \n\t":::"memory");
mm_end = end - 3;
while (s < mm_end) {
__asm__ volatile(
PREFETCH" 32(%1) \n\t"
"movq (%1), %%mm0 \n\t"
"movq (%1), %%mm1 \n\t"
"movq (%1), %%mm2 \n\t"
"pand %2, %%mm0 \n\t"
"pand %3, %%mm1 \n\t"
"pand %4, %%mm2 \n\t"
"psllq $5, %%mm0 \n\t"
"psrlq $1, %%mm2 \n\t"
"pmulhw %5, %%mm0 \n\t"
"pmulhw "MANGLE(mul16_mid)", %%mm1 \n\t"
"pmulhw "MANGLE(mul15_hi)", %%mm2 \n\t"
PACK_RGB32
::"r"(d),"r"(s),"m"(mask16b),"m"(mask16g),"m"(mask16r),"m"(mul15_mid)
NAMED_CONSTRAINTS_ADD(mul16_mid,mul15_hi)
:"memory");
d += 16;
s += 4;
}
__asm__ volatile(SFENCE:::"memory");
__asm__ volatile(EMMS:::"memory");
while (s < end) {
register uint16_t bgr;
bgr = *s++;
*d++ = ((bgr&0x1F)<<3) | ((bgr&0x1F)>>2);
*d++ = ((bgr&0x7E0)>>3) | ((bgr&0x7E0)>>9);
*d++ = ((bgr&0xF800)>>8) | ((bgr&0xF800)>>13);
*d++ = 255;
}
}
 
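/*
* Swap bytes 0 and 2 of every 32-bit pixel, i.e. exchange R and B in
* BGRA/RGBA data. The MMXEXT path does the swap with one pshufw per 8 bytes;
* plain MMX emulates the shuffle with mask/shift/or. idx starts negative
* and counts up to 0, so the loop exits on a plain sign test ("js 1b").
*/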
static inline void RENAME(shuffle_bytes_2103)(const uint8_t *src, uint8_t *dst, int src_size)
{
x86_reg idx = 15 - src_size;
const uint8_t *s = src-idx;
uint8_t *d = dst-idx;
__asm__ volatile(
"test %0, %0 \n\t"
"jns 2f \n\t"
PREFETCH" (%1, %0) \n\t"
"movq %3, %%mm7 \n\t"
"pxor %4, %%mm7 \n\t"
"movq %%mm7, %%mm6 \n\t"
"pxor %5, %%mm7 \n\t"
".p2align 4 \n\t"
"1: \n\t"
PREFETCH" 32(%1, %0) \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq 8(%1, %0), %%mm1 \n\t"
# if COMPILE_TEMPLATE_MMXEXT
"pshufw $177, %%mm0, %%mm3 \n\t"
"pshufw $177, %%mm1, %%mm5 \n\t"
"pand %%mm7, %%mm0 \n\t"
"pand %%mm6, %%mm3 \n\t"
"pand %%mm7, %%mm1 \n\t"
"pand %%mm6, %%mm5 \n\t"
"por %%mm3, %%mm0 \n\t"
"por %%mm5, %%mm1 \n\t"
# else
"movq %%mm0, %%mm2 \n\t"
"movq %%mm1, %%mm4 \n\t"
"pand %%mm7, %%mm0 \n\t"
"pand %%mm6, %%mm2 \n\t"
"pand %%mm7, %%mm1 \n\t"
"pand %%mm6, %%mm4 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"pslld $16, %%mm2 \n\t"
"psrld $16, %%mm3 \n\t"
"pslld $16, %%mm4 \n\t"
"psrld $16, %%mm5 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm4, %%mm1 \n\t"
"por %%mm3, %%mm0 \n\t"
"por %%mm5, %%mm1 \n\t"
# endif
MOVNTQ" %%mm0, (%2, %0) \n\t"
MOVNTQ" %%mm1, 8(%2, %0) \n\t"
"add $16, %0 \n\t"
"js 1b \n\t"
SFENCE" \n\t"
EMMS" \n\t"
"2: \n\t"
: "+&r"(idx)
: "r" (s), "r" (d), "m" (mask32b), "m" (mask32r), "m" (mmx_one)
: "memory");
for (; idx<15; idx+=4) {
register unsigned v = *(const uint32_t *)&s[idx], g = v & 0xff00ff00;
v &= 0xff00ff;
*(uint32_t *)&d[idx] = (v>>16) + g + (v<<16);
}
}
 
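/*
* Swap R and B inside packed 24-bit pixels. The three masks mask24r/g/b
* select the channels from overlapping 8-byte loads so that each iteration
* reorders 8 pixels (24 bytes) with pure pand/por work, no byte shuffles.
*/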
static inline void RENAME(rgb24tobgr24)(const uint8_t *src, uint8_t *dst, int src_size)
{
unsigned i;
x86_reg mmx_size= 23 - src_size;
__asm__ volatile (
"test %%"REG_a", %%"REG_a" \n\t"
"jns 2f \n\t"
"movq "MANGLE(mask24r)", %%mm5 \n\t"
"movq "MANGLE(mask24g)", %%mm6 \n\t"
"movq "MANGLE(mask24b)", %%mm7 \n\t"
".p2align 4 \n\t"
"1: \n\t"
PREFETCH" 32(%1, %%"REG_a") \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
"movq (%1, %%"REG_a"), %%mm1 \n\t" // BGR BGR BG
"movq 2(%1, %%"REG_a"), %%mm2 \n\t" // R BGR BGR B
"psllq $16, %%mm0 \n\t" // 00 BGR BGR
"pand %%mm5, %%mm0 \n\t"
"pand %%mm6, %%mm1 \n\t"
"pand %%mm7, %%mm2 \n\t"
"por %%mm0, %%mm1 \n\t"
"por %%mm2, %%mm1 \n\t"
"movq 6(%1, %%"REG_a"), %%mm0 \n\t" // BGR BGR BG
MOVNTQ" %%mm1, (%2, %%"REG_a") \n\t" // RGB RGB RG
"movq 8(%1, %%"REG_a"), %%mm1 \n\t" // R BGR BGR B
"movq 10(%1, %%"REG_a"), %%mm2 \n\t" // GR BGR BGR
"pand %%mm7, %%mm0 \n\t"
"pand %%mm5, %%mm1 \n\t"
"pand %%mm6, %%mm2 \n\t"
"por %%mm0, %%mm1 \n\t"
"por %%mm2, %%mm1 \n\t"
"movq 14(%1, %%"REG_a"), %%mm0 \n\t" // R BGR BGR B
MOVNTQ" %%mm1, 8(%2, %%"REG_a") \n\t" // B RGB RGB R
"movq 16(%1, %%"REG_a"), %%mm1 \n\t" // GR BGR BGR
"movq 18(%1, %%"REG_a"), %%mm2 \n\t" // BGR BGR BG
"pand %%mm6, %%mm0 \n\t"
"pand %%mm7, %%mm1 \n\t"
"pand %%mm5, %%mm2 \n\t"
"por %%mm0, %%mm1 \n\t"
"por %%mm2, %%mm1 \n\t"
MOVNTQ" %%mm1, 16(%2, %%"REG_a") \n\t"
"add $24, %%"REG_a" \n\t"
" js 1b \n\t"
"2: \n\t"
: "+a" (mmx_size)
: "r" (src-mmx_size), "r"(dst-mmx_size)
NAMED_CONSTRAINTS_ADD(mask24r,mask24g,mask24b)
);
 
__asm__ volatile(SFENCE:::"memory");
__asm__ volatile(EMMS:::"memory");
 
if (mmx_size==23) return; // finished: src_size was a multiple of 8 pixels (24 bytes)
 
src+= src_size;
dst+= src_size;
src_size= 23-mmx_size;
src-= src_size;
dst-= src_size;
for (i=0; i<src_size; i+=3) {
register uint8_t x;
x = src[i + 2];
dst[i + 1] = src[i + 1];
dst[i + 2] = src[i + 0];
dst[i + 0] = x;
}
}
 
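/*
* Interleave planar Y, U and V into packed YUY2. Each asm iteration merges
* 16 luma samples with 8 U and 8 V samples into 32 output bytes; the chroma
* pointers advance only every vertLumPerChroma input lines (2 for 4:2:0
* input, 1 for 4:2:2).
*/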
static inline void RENAME(yuvPlanartoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
int y;
const x86_reg chromWidth= width>>1;
for (y=0; y<height; y++) {
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
__asm__ volatile(
"xor %%"REG_a", %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
PREFETCH" 32(%2, %%"REG_a") \n\t"
PREFETCH" 32(%3, %%"REG_a") \n\t"
"movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
"movq %%mm0, %%mm2 \n\t" // U(0)
"movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
 
"movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
"movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
"movq %%mm3, %%mm4 \n\t" // Y(0)
"movq %%mm5, %%mm6 \n\t" // Y(8)
"punpcklbw %%mm0, %%mm3 \n\t" // YUYV YUYV(0)
"punpckhbw %%mm0, %%mm4 \n\t" // YUYV YUYV(4)
"punpcklbw %%mm2, %%mm5 \n\t" // YUYV YUYV(8)
"punpckhbw %%mm2, %%mm6 \n\t" // YUYV YUYV(12)
 
MOVNTQ" %%mm3, (%0, %%"REG_a", 4) \n\t"
MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
MOVNTQ" %%mm5, 16(%0, %%"REG_a", 4) \n\t"
MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
 
"add $8, %%"REG_a" \n\t"
"cmp %4, %%"REG_a" \n\t"
" jb 1b \n\t"
::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
: "%"REG_a
);
if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
__asm__(EMMS" \n\t"
SFENCE" \n\t"
:::"memory");
}
 
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void RENAME(yv12toyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride)
{
//FIXME interpolate chroma
RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
 
static inline void RENAME(yuvPlanartouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride, int vertLumPerChroma)
{
int y;
const x86_reg chromWidth= width>>1;
for (y=0; y<height; y++) {
//FIXME handle 2 lines at once (fewer prefetches, reuse some chroma, but very likely memory-limited anyway)
__asm__ volatile(
"xor %%"REG_a", %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
PREFETCH" 32(%1, %%"REG_a", 2) \n\t"
PREFETCH" 32(%2, %%"REG_a") \n\t"
PREFETCH" 32(%3, %%"REG_a") \n\t"
"movq (%2, %%"REG_a"), %%mm0 \n\t" // U(0)
"movq %%mm0, %%mm2 \n\t" // U(0)
"movq (%3, %%"REG_a"), %%mm1 \n\t" // V(0)
"punpcklbw %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
"punpckhbw %%mm1, %%mm2 \n\t" // UVUV UVUV(8)
 
"movq (%1, %%"REG_a",2), %%mm3 \n\t" // Y(0)
"movq 8(%1, %%"REG_a",2), %%mm5 \n\t" // Y(8)
"movq %%mm0, %%mm4 \n\t" // Y(0)
"movq %%mm2, %%mm6 \n\t" // Y(8)
"punpcklbw %%mm3, %%mm0 \n\t" // YUYV YUYV(0)
"punpckhbw %%mm3, %%mm4 \n\t" // YUYV YUYV(4)
"punpcklbw %%mm5, %%mm2 \n\t" // YUYV YUYV(8)
"punpckhbw %%mm5, %%mm6 \n\t" // YUYV YUYV(12)
 
MOVNTQ" %%mm0, (%0, %%"REG_a", 4) \n\t"
MOVNTQ" %%mm4, 8(%0, %%"REG_a", 4) \n\t"
MOVNTQ" %%mm2, 16(%0, %%"REG_a", 4) \n\t"
MOVNTQ" %%mm6, 24(%0, %%"REG_a", 4) \n\t"
 
"add $8, %%"REG_a" \n\t"
"cmp %4, %%"REG_a" \n\t"
" jb 1b \n\t"
::"r"(dst), "r"(ysrc), "r"(usrc), "r"(vsrc), "g" (chromWidth)
: "%"REG_a
);
if ((y&(vertLumPerChroma-1)) == vertLumPerChroma-1) {
usrc += chromStride;
vsrc += chromStride;
}
ysrc += lumStride;
dst += dstStride;
}
__asm__(EMMS" \n\t"
SFENCE" \n\t"
:::"memory");
}
 
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
*/
static inline void RENAME(yv12touyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride)
{
//FIXME interpolate chroma
RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 2);
}
 
/**
* Width should be a multiple of 16.
*/
static inline void RENAME(yuv422ptouyvy)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride)
{
RENAME(yuvPlanartouyvy)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
 
/**
* Width should be a multiple of 16.
*/
static inline void RENAME(yuv422ptoyuy2)(const uint8_t *ysrc, const uint8_t *usrc, const uint8_t *vsrc, uint8_t *dst,
int width, int height,
int lumStride, int chromStride, int dstStride)
{
RENAME(yuvPlanartoyuy2)(ysrc, usrc, vsrc, dst, width, height, lumStride, chromStride, dstStride, 1);
}
 
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line, others are ignored.
*/
static inline void RENAME(yuy2toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const x86_reg chromWidth= width>>1;
for (y=0; y<height; y+=2) {
__asm__ volatile(
"xor %%"REG_a", %%"REG_a" \n\t"
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
".p2align 4 \n\t"
"1: \n\t"
PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
"movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
"movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
"movq %%mm0, %%mm2 \n\t" // YUYV YUYV(0)
"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(4)
"psrlw $8, %%mm0 \n\t" // U0V0 U0V0(0)
"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(4)
"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
 
MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
 
"movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(8)
"movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(12)
"movq %%mm1, %%mm3 \n\t" // YUYV YUYV(8)
"movq %%mm2, %%mm4 \n\t" // YUYV YUYV(12)
"psrlw $8, %%mm1 \n\t" // U0V0 U0V0(8)
"psrlw $8, %%mm2 \n\t" // U0V0 U0V0(12)
"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
"pand %%mm7, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
 
MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
 
"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
"psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
"psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
"pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
"pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
 
MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
 
"add $8, %%"REG_a" \n\t"
"cmp %4, %%"REG_a" \n\t"
" jb 1b \n\t"
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
: "memory", "%"REG_a
);
 
ydst += lumStride;
src += srcStride;
 
__asm__ volatile(
"xor %%"REG_a", %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
"movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
"movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
"movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
"movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
"pand %%mm7, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
"pand %%mm7, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
"pand %%mm7, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
"pand %%mm7, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
 
MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
 
"add $8, %%"REG_a" \n\t"
"cmp %4, %%"REG_a" \n\t"
" jb 1b \n\t"
 
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
: "memory", "%"REG_a
);
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
__asm__ volatile(EMMS" \n\t"
SFENCE" \n\t"
:::"memory");
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
 
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
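/*
* Upscale one plane by 2x horizontally and vertically. Interior output
* pixels are 3:1 / 1:3 blends of the two nearest source pixels, which the
* asm approximates with chained PAVGB averages; the first and last lines
* are handled in C.
*/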
static inline void RENAME(planar2x)(const uint8_t *src, uint8_t *dst, int srcWidth, int srcHeight, int srcStride, int dstStride)
{
int x,y;
 
dst[0]= src[0];
 
// first line
for (x=0; x<srcWidth-1; x++) {
dst[2*x+1]= (3*src[x] + src[x+1])>>2;
dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
}
dst[2*srcWidth-1]= src[srcWidth-1];
 
dst+= dstStride;
 
for (y=1; y<srcHeight; y++) {
const x86_reg mmxSize= srcWidth&~15;
__asm__ volatile(
"mov %4, %%"REG_a" \n\t"
"movq "MANGLE(mmx_ff)", %%mm0 \n\t"
"movq (%0, %%"REG_a"), %%mm4 \n\t"
"movq %%mm4, %%mm2 \n\t"
"psllq $8, %%mm4 \n\t"
"pand %%mm0, %%mm2 \n\t"
"por %%mm2, %%mm4 \n\t"
"movq (%1, %%"REG_a"), %%mm5 \n\t"
"movq %%mm5, %%mm3 \n\t"
"psllq $8, %%mm5 \n\t"
"pand %%mm0, %%mm3 \n\t"
"por %%mm3, %%mm5 \n\t"
"1: \n\t"
"movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq (%1, %%"REG_a"), %%mm1 \n\t"
"movq 1(%0, %%"REG_a"), %%mm2 \n\t"
"movq 1(%1, %%"REG_a"), %%mm3 \n\t"
PAVGB" %%mm0, %%mm5 \n\t"
PAVGB" %%mm0, %%mm3 \n\t"
PAVGB" %%mm0, %%mm5 \n\t"
PAVGB" %%mm0, %%mm3 \n\t"
PAVGB" %%mm1, %%mm4 \n\t"
PAVGB" %%mm1, %%mm2 \n\t"
PAVGB" %%mm1, %%mm4 \n\t"
PAVGB" %%mm1, %%mm2 \n\t"
"movq %%mm5, %%mm7 \n\t"
"movq %%mm4, %%mm6 \n\t"
"punpcklbw %%mm3, %%mm5 \n\t"
"punpckhbw %%mm3, %%mm7 \n\t"
"punpcklbw %%mm2, %%mm4 \n\t"
"punpckhbw %%mm2, %%mm6 \n\t"
MOVNTQ" %%mm5, (%2, %%"REG_a", 2) \n\t"
MOVNTQ" %%mm7, 8(%2, %%"REG_a", 2) \n\t"
MOVNTQ" %%mm4, (%3, %%"REG_a", 2) \n\t"
MOVNTQ" %%mm6, 8(%3, %%"REG_a", 2) \n\t"
"add $8, %%"REG_a" \n\t"
"movq -1(%0, %%"REG_a"), %%mm4 \n\t"
"movq -1(%1, %%"REG_a"), %%mm5 \n\t"
" js 1b \n\t"
:: "r" (src + mmxSize ), "r" (src + srcStride + mmxSize ),
"r" (dst + mmxSize*2), "r" (dst + dstStride + mmxSize*2),
"g" (-mmxSize)
NAMED_CONSTRAINTS_ADD(mmx_ff)
: "%"REG_a
);
 
for (x=mmxSize-1; x<srcWidth-1; x++) {
dst[2*x +1]= (3*src[x+0] + src[x+srcStride+1])>>2;
dst[2*x+dstStride+2]= ( src[x+0] + 3*src[x+srcStride+1])>>2;
dst[2*x+dstStride+1]= ( src[x+1] + 3*src[x+srcStride ])>>2;
dst[2*x +2]= (3*src[x+1] + src[x+srcStride ])>>2;
}
dst[srcWidth*2 -1 ]= (3*src[srcWidth-1] + src[srcWidth-1 + srcStride])>>2;
dst[srcWidth*2 -1 + dstStride]= ( src[srcWidth-1] + 3*src[srcWidth-1 + srcStride])>>2;
 
dst+=dstStride*2;
src+=srcStride;
}
 
// last line
dst[0]= src[0];
 
for (x=0; x<srcWidth-1; x++) {
dst[2*x+1]= (3*src[x] + src[x+1])>>2;
dst[2*x+2]= ( src[x] + 3*src[x+1])>>2;
}
dst[2*srcWidth-1]= src[srcWidth-1];
 
__asm__ volatile(EMMS" \n\t"
SFENCE" \n\t"
:::"memory");
}
#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
 
#if !COMPILE_TEMPLATE_AMD3DNOW
/**
* Height should be a multiple of 2 and width should be a multiple of 16.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line, others are ignored.
* FIXME: Write HQ version.
*/
static inline void RENAME(uyvytoyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const x86_reg chromWidth= width>>1;
for (y=0; y<height; y+=2) {
__asm__ volatile(
"xor %%"REG_a", %%"REG_a" \n\t"
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $8, %%mm7 \n\t" // FF,00,FF,00...
".p2align 4 \n\t"
"1: \n\t"
PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
"movq (%0, %%"REG_a", 4), %%mm0 \n\t" // UYVY UYVY(0)
"movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(4)
"movq %%mm0, %%mm2 \n\t" // UYVY UYVY(0)
"movq %%mm1, %%mm3 \n\t" // UYVY UYVY(4)
"pand %%mm7, %%mm0 \n\t" // U0V0 U0V0(0)
"pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(4)
"psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(0)
"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(4)
"packuswb %%mm1, %%mm0 \n\t" // UVUV UVUV(0)
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(0)
 
MOVNTQ" %%mm2, (%1, %%"REG_a", 2) \n\t"
 
"movq 16(%0, %%"REG_a", 4), %%mm1 \n\t" // UYVY UYVY(8)
"movq 24(%0, %%"REG_a", 4), %%mm2 \n\t" // UYVY UYVY(12)
"movq %%mm1, %%mm3 \n\t" // UYVY UYVY(8)
"movq %%mm2, %%mm4 \n\t" // UYVY UYVY(12)
"pand %%mm7, %%mm1 \n\t" // U0V0 U0V0(8)
"pand %%mm7, %%mm2 \n\t" // U0V0 U0V0(12)
"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(8)
"psrlw $8, %%mm4 \n\t" // Y0Y0 Y0Y0(12)
"packuswb %%mm2, %%mm1 \n\t" // UVUV UVUV(8)
"packuswb %%mm4, %%mm3 \n\t" // YYYY YYYY(8)
 
MOVNTQ" %%mm3, 8(%1, %%"REG_a", 2) \n\t"
 
"movq %%mm0, %%mm2 \n\t" // UVUV UVUV(0)
"movq %%mm1, %%mm3 \n\t" // UVUV UVUV(8)
"psrlw $8, %%mm0 \n\t" // V0V0 V0V0(0)
"psrlw $8, %%mm1 \n\t" // V0V0 V0V0(8)
"pand %%mm7, %%mm2 \n\t" // U0U0 U0U0(0)
"pand %%mm7, %%mm3 \n\t" // U0U0 U0U0(8)
"packuswb %%mm1, %%mm0 \n\t" // VVVV VVVV(0)
"packuswb %%mm3, %%mm2 \n\t" // UUUU UUUU(0)
 
MOVNTQ" %%mm0, (%3, %%"REG_a") \n\t"
MOVNTQ" %%mm2, (%2, %%"REG_a") \n\t"
 
"add $8, %%"REG_a" \n\t"
"cmp %4, %%"REG_a" \n\t"
" jb 1b \n\t"
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
: "memory", "%"REG_a
);
 
ydst += lumStride;
src += srcStride;
 
__asm__ volatile(
"xor %%"REG_a", %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
PREFETCH" 64(%0, %%"REG_a", 4) \n\t"
"movq (%0, %%"REG_a", 4), %%mm0 \n\t" // YUYV YUYV(0)
"movq 8(%0, %%"REG_a", 4), %%mm1 \n\t" // YUYV YUYV(4)
"movq 16(%0, %%"REG_a", 4), %%mm2 \n\t" // YUYV YUYV(8)
"movq 24(%0, %%"REG_a", 4), %%mm3 \n\t" // YUYV YUYV(12)
"psrlw $8, %%mm0 \n\t" // Y0Y0 Y0Y0(0)
"psrlw $8, %%mm1 \n\t" // Y0Y0 Y0Y0(4)
"psrlw $8, %%mm2 \n\t" // Y0Y0 Y0Y0(8)
"psrlw $8, %%mm3 \n\t" // Y0Y0 Y0Y0(12)
"packuswb %%mm1, %%mm0 \n\t" // YYYY YYYY(0)
"packuswb %%mm3, %%mm2 \n\t" // YYYY YYYY(8)
 
MOVNTQ" %%mm0, (%1, %%"REG_a", 2) \n\t"
MOVNTQ" %%mm2, 8(%1, %%"REG_a", 2) \n\t"
 
"add $8, %%"REG_a" \n\t"
"cmp %4, %%"REG_a" \n\t"
" jb 1b \n\t"
 
::"r"(src), "r"(ydst), "r"(udst), "r"(vdst), "g" (chromWidth)
: "memory", "%"REG_a
);
udst += chromStride;
vdst += chromStride;
ydst += lumStride;
src += srcStride;
}
__asm__ volatile(EMMS" \n\t"
SFENCE" \n\t"
:::"memory");
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
 
/**
* Height should be a multiple of 2 and width should be a multiple of 2.
* (If this is a problem for anyone then tell me, and I will fix it.)
* Chrominance data is only taken from every second line,
* others are ignored in the C version.
* FIXME: Write HQ version.
*/
#if HAVE_7REGS
static inline void RENAME(rgb24toyv12)(const uint8_t *src, uint8_t *ydst, uint8_t *udst, uint8_t *vdst,
int width, int height,
int lumStride, int chromStride, int srcStride,
int32_t *rgb2yuv)
{
#define BGR2Y_IDX "16*4+16*32"
#define BGR2U_IDX "16*4+16*33"
#define BGR2V_IDX "16*4+16*34"
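/*
* String-pasted byte offsets of the pmaddwd-ready Y, U and V coefficient
* vectors inside the rgb2yuv table.
*/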
int y;
const x86_reg chromWidth= width>>1;
 
if (height > 2) {
ff_rgb24toyv12_c(src, ydst, udst, vdst, width, 2, lumStride, chromStride, srcStride, rgb2yuv);
src += 2*srcStride;
ydst += 2*lumStride;
udst += chromStride;
vdst += chromStride;
height -= 2;
}
 
for (y=0; y<height-2; y+=2) {
int i;
for (i=0; i<2; i++) {
__asm__ volatile(
"mov %2, %%"REG_a" \n\t"
"movq "BGR2Y_IDX"(%3), %%mm6 \n\t"
"movq "MANGLE(ff_w1111)", %%mm5 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
".p2align 4 \n\t"
"1: \n\t"
PREFETCH" 64(%0, %%"REG_d") \n\t"
"movd (%0, %%"REG_d"), %%mm0 \n\t"
"movd 3(%0, %%"REG_d"), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"movd 6(%0, %%"REG_d"), %%mm2 \n\t"
"movd 9(%0, %%"REG_d"), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm0 \n\t"
"pmaddwd %%mm6, %%mm1 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
"psrad $8, %%mm0 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
"packssdw %%mm1, %%mm0 \n\t"
"packssdw %%mm3, %%mm2 \n\t"
"pmaddwd %%mm5, %%mm0 \n\t"
"pmaddwd %%mm5, %%mm2 \n\t"
"packssdw %%mm2, %%mm0 \n\t"
"psraw $7, %%mm0 \n\t"
 
"movd 12(%0, %%"REG_d"), %%mm4 \n\t"
"movd 15(%0, %%"REG_d"), %%mm1 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"movd 18(%0, %%"REG_d"), %%mm2 \n\t"
"movd 21(%0, %%"REG_d"), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm4 \n\t"
"pmaddwd %%mm6, %%mm1 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
"pmaddwd %%mm6, %%mm3 \n\t"
"psrad $8, %%mm4 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
"packssdw %%mm1, %%mm4 \n\t"
"packssdw %%mm3, %%mm2 \n\t"
"pmaddwd %%mm5, %%mm4 \n\t"
"pmaddwd %%mm5, %%mm2 \n\t"
"add $24, %%"REG_d" \n\t"
"packssdw %%mm2, %%mm4 \n\t"
"psraw $7, %%mm4 \n\t"
 
"packuswb %%mm4, %%mm0 \n\t"
"paddusb "MANGLE(ff_bgr2YOffset)", %%mm0 \n\t"
 
MOVNTQ" %%mm0, (%1, %%"REG_a") \n\t"
"add $8, %%"REG_a" \n\t"
" js 1b \n\t"
: : "r" (src+width*3), "r" (ydst+width), "g" ((x86_reg)-width), "r"(rgb2yuv)
NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2YOffset)
: "%"REG_a, "%"REG_d
);
ydst += lumStride;
src += srcStride;
}
src -= srcStride*2;
__asm__ volatile(
"mov %4, %%"REG_a" \n\t"
"movq "MANGLE(ff_w1111)", %%mm5 \n\t"
"movq "BGR2U_IDX"(%5), %%mm6 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_d" \n\t"
"add %%"REG_d", %%"REG_d" \n\t"
".p2align 4 \n\t"
"1: \n\t"
PREFETCH" 64(%0, %%"REG_d") \n\t"
PREFETCH" 64(%1, %%"REG_d") \n\t"
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
"movq (%0, %%"REG_d"), %%mm0 \n\t"
"movq (%1, %%"REG_d"), %%mm1 \n\t"
"movq 6(%0, %%"REG_d"), %%mm2 \n\t"
"movq 6(%1, %%"REG_d"), %%mm3 \n\t"
PAVGB" %%mm1, %%mm0 \n\t"
PAVGB" %%mm3, %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"psrlq $24, %%mm0 \n\t"
"psrlq $24, %%mm2 \n\t"
PAVGB" %%mm1, %%mm0 \n\t"
PAVGB" %%mm3, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
#else
"movd (%0, %%"REG_d"), %%mm0 \n\t"
"movd (%1, %%"REG_d"), %%mm1 \n\t"
"movd 3(%0, %%"REG_d"), %%mm2 \n\t"
"movd 3(%1, %%"REG_d"), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"movd 6(%0, %%"REG_d"), %%mm4 \n\t"
"movd 6(%1, %%"REG_d"), %%mm1 \n\t"
"movd 9(%0, %%"REG_d"), %%mm2 \n\t"
"movd 9(%1, %%"REG_d"), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm4 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm4, %%mm2 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm2 \n\t"
#endif
"movq "BGR2V_IDX"(%5), %%mm1 \n\t"
"movq "BGR2V_IDX"(%5), %%mm3 \n\t"
 
"pmaddwd %%mm0, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm0 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
"psrad $8, %%mm0 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
"packssdw %%mm2, %%mm0 \n\t"
"packssdw %%mm3, %%mm1 \n\t"
"pmaddwd %%mm5, %%mm0 \n\t"
"pmaddwd %%mm5, %%mm1 \n\t"
"packssdw %%mm1, %%mm0 \n\t" // V1 V0 U1 U0
"psraw $7, %%mm0 \n\t"
 
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
"movq 12(%0, %%"REG_d"), %%mm4 \n\t"
"movq 12(%1, %%"REG_d"), %%mm1 \n\t"
"movq 18(%0, %%"REG_d"), %%mm2 \n\t"
"movq 18(%1, %%"REG_d"), %%mm3 \n\t"
PAVGB" %%mm1, %%mm4 \n\t"
PAVGB" %%mm3, %%mm2 \n\t"
"movq %%mm4, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"psrlq $24, %%mm4 \n\t"
"psrlq $24, %%mm2 \n\t"
PAVGB" %%mm1, %%mm4 \n\t"
PAVGB" %%mm3, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
#else
"movd 12(%0, %%"REG_d"), %%mm4 \n\t"
"movd 12(%1, %%"REG_d"), %%mm1 \n\t"
"movd 15(%0, %%"REG_d"), %%mm2 \n\t"
"movd 15(%1, %%"REG_d"), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm4 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm2, %%mm4 \n\t"
"movd 18(%0, %%"REG_d"), %%mm5 \n\t"
"movd 18(%1, %%"REG_d"), %%mm1 \n\t"
"movd 21(%0, %%"REG_d"), %%mm2 \n\t"
"movd 21(%1, %%"REG_d"), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"paddw %%mm1, %%mm5 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm5, %%mm2 \n\t"
"movq "MANGLE(ff_w1111)", %%mm5 \n\t"
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm2 \n\t"
#endif
"movq "BGR2V_IDX"(%5), %%mm1 \n\t"
"movq "BGR2V_IDX"(%5), %%mm3 \n\t"
 
"pmaddwd %%mm4, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm6, %%mm4 \n\t"
"pmaddwd %%mm6, %%mm2 \n\t"
"psrad $8, %%mm4 \n\t"
"psrad $8, %%mm1 \n\t"
"psrad $8, %%mm2 \n\t"
"psrad $8, %%mm3 \n\t"
"packssdw %%mm2, %%mm4 \n\t"
"packssdw %%mm3, %%mm1 \n\t"
"pmaddwd %%mm5, %%mm4 \n\t"
"pmaddwd %%mm5, %%mm1 \n\t"
"add $24, %%"REG_d" \n\t"
"packssdw %%mm1, %%mm4 \n\t" // V3 V2 U3 U2
"psraw $7, %%mm4 \n\t"
 
"movq %%mm0, %%mm1 \n\t"
"punpckldq %%mm4, %%mm0 \n\t"
"punpckhdq %%mm4, %%mm1 \n\t"
"packsswb %%mm1, %%mm0 \n\t"
"paddb "MANGLE(ff_bgr2UVOffset)", %%mm0 \n\t"
"movd %%mm0, (%2, %%"REG_a") \n\t"
"punpckhdq %%mm0, %%mm0 \n\t"
"movd %%mm0, (%3, %%"REG_a") \n\t"
"add $4, %%"REG_a" \n\t"
" js 1b \n\t"
: : "r" (src+chromWidth*6), "r" (src+srcStride+chromWidth*6), "r" (udst+chromWidth), "r" (vdst+chromWidth), "g" (-chromWidth), "r"(rgb2yuv)
NAMED_CONSTRAINTS_ADD(ff_w1111,ff_bgr2UVOffset)
: "%"REG_a, "%"REG_d
);
 
udst += chromStride;
vdst += chromStride;
src += srcStride*2;
}
 
__asm__ volatile(EMMS" \n\t"
SFENCE" \n\t"
:::"memory");
 
ff_rgb24toyv12_c(src, ydst, udst, vdst, width, height-y, lumStride, chromStride, srcStride, rgb2yuv);
}
#endif /* HAVE_7REGS */
#endif /* !COMPILE_TEMPLATE_SSE2 */
 
#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
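/*
* Interleave two byte planes: dest[2*i] = src1[i], dest[2*i+1] = src2[i]
* (e.g. merging separate U and V planes into NV12-style chroma). Uses
* aligned SSE2 stores when all three pointers are 16-byte aligned, MMX
* otherwise; the last width & 15 bytes of each row are done in C.
*/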
static void RENAME(interleaveBytes)(const uint8_t *src1, const uint8_t *src2, uint8_t *dest,
int width, int height, int src1Stride,
int src2Stride, int dstStride)
{
int h;
 
for (h=0; h < height; h++) {
int w;
 
if (width >= 16) {
#if COMPILE_TEMPLATE_SSE2
if (!((((intptr_t)src1) | ((intptr_t)src2) | ((intptr_t)dest))&15)) {
__asm__(
"xor %%"REG_a", %%"REG_a" \n\t"
"1: \n\t"
PREFETCH" 64(%1, %%"REG_a") \n\t"
PREFETCH" 64(%2, %%"REG_a") \n\t"
"movdqa (%1, %%"REG_a"), %%xmm0 \n\t"
"movdqa (%1, %%"REG_a"), %%xmm1 \n\t"
"movdqa (%2, %%"REG_a"), %%xmm2 \n\t"
"punpcklbw %%xmm2, %%xmm0 \n\t"
"punpckhbw %%xmm2, %%xmm1 \n\t"
"movntdq %%xmm0, (%0, %%"REG_a", 2) \n\t"
"movntdq %%xmm1, 16(%0, %%"REG_a", 2) \n\t"
"add $16, %%"REG_a" \n\t"
"cmp %3, %%"REG_a" \n\t"
" jb 1b \n\t"
::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
: "memory", XMM_CLOBBERS("xmm0", "xmm1", "xmm2",) "%"REG_a
);
} else
#endif
__asm__(
"xor %%"REG_a", %%"REG_a" \n\t"
"1: \n\t"
PREFETCH" 64(%1, %%"REG_a") \n\t"
PREFETCH" 64(%2, %%"REG_a") \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq 8(%1, %%"REG_a"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq (%2, %%"REG_a"), %%mm4 \n\t"
"movq 8(%2, %%"REG_a"), %%mm5 \n\t"
"punpcklbw %%mm4, %%mm0 \n\t"
"punpckhbw %%mm4, %%mm1 \n\t"
"punpcklbw %%mm5, %%mm2 \n\t"
"punpckhbw %%mm5, %%mm3 \n\t"
MOVNTQ" %%mm0, (%0, %%"REG_a", 2) \n\t"
MOVNTQ" %%mm1, 8(%0, %%"REG_a", 2) \n\t"
MOVNTQ" %%mm2, 16(%0, %%"REG_a", 2) \n\t"
MOVNTQ" %%mm3, 24(%0, %%"REG_a", 2) \n\t"
"add $16, %%"REG_a" \n\t"
"cmp %3, %%"REG_a" \n\t"
" jb 1b \n\t"
::"r"(dest), "r"(src1), "r"(src2), "r" ((x86_reg)width-15)
: "memory", "%"REG_a
);
 
}
for (w= (width&(~15)); w < width; w++) {
dest[2*w+0] = src1[w];
dest[2*w+1] = src2[w];
}
dest += dstStride;
src1 += src1Stride;
src2 += src2Stride;
}
__asm__(
EMMS" \n\t"
SFENCE" \n\t"
::: "memory"
);
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
 
#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
void RENAME(ff_nv12ToUV)(uint8_t *dstU, uint8_t *dstV,
const uint8_t *unused,
const uint8_t *src1,
const uint8_t *src2,
int w,
uint32_t *unused2);
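 
/*
* Split interleaved bytes into two planes by calling the assembly
* NV12 -> U/V unpack routine once per row.
*/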
static void RENAME(deinterleaveBytes)(const uint8_t *src, uint8_t *dst1, uint8_t *dst2,
int width, int height, int srcStride,
int dst1Stride, int dst2Stride)
{
int h;
 
for (h = 0; h < height; h++) {
RENAME(ff_nv12ToUV)(dst1, dst2, NULL, src, NULL, width, NULL);
src += srcStride;
dst1 += dst1Stride;
dst2 += dst2Stride;
}
__asm__(
#if !COMPILE_TEMPLATE_SSE2
EMMS" \n\t"
#endif
SFENCE" \n\t"
::: "memory"
);
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM */
#endif /* !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL */
 
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
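/*
* Double the two chroma planes in both directions (quarter-size YVU9 chroma
* to half-size YV12 chroma): each source line is emitted twice, and
* punpcklbw/punpckhbw of a register with itself duplicates every byte.
*/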
static inline void RENAME(vu9_to_vu12)(const uint8_t *src1, const uint8_t *src2,
uint8_t *dst1, uint8_t *dst2,
int width, int height,
int srcStride1, int srcStride2,
int dstStride1, int dstStride2)
{
x86_reg x, y;
int w,h;
w=width/2; h=height/2;
__asm__ volatile(
PREFETCH" %0 \n\t"
PREFETCH" %1 \n\t"
::"m"(*(src1+srcStride1)),"m"(*(src2+srcStride2)):"memory");
for (y=0;y<h;y++) {
const uint8_t* s1=src1+srcStride1*(y>>1);
uint8_t* d=dst1+dstStride1*y;
x=0;
for (;x<w-31;x+=32) {
__asm__ volatile(
PREFETCH" 32(%1,%2) \n\t"
"movq (%1,%2), %%mm0 \n\t"
"movq 8(%1,%2), %%mm2 \n\t"
"movq 16(%1,%2), %%mm4 \n\t"
"movq 24(%1,%2), %%mm6 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"movq %%mm6, %%mm7 \n\t"
"punpcklbw %%mm0, %%mm0 \n\t"
"punpckhbw %%mm1, %%mm1 \n\t"
"punpcklbw %%mm2, %%mm2 \n\t"
"punpckhbw %%mm3, %%mm3 \n\t"
"punpcklbw %%mm4, %%mm4 \n\t"
"punpckhbw %%mm5, %%mm5 \n\t"
"punpcklbw %%mm6, %%mm6 \n\t"
"punpckhbw %%mm7, %%mm7 \n\t"
MOVNTQ" %%mm0, (%0,%2,2) \n\t"
MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
MOVNTQ" %%mm7, 56(%0,%2,2)"
:: "r"(d), "r"(s1), "r"(x)
:"memory");
}
for (;x<w;x++) d[2*x]=d[2*x+1]=s1[x];
}
for (y=0;y<h;y++) {
const uint8_t* s2=src2+srcStride2*(y>>1);
uint8_t* d=dst2+dstStride2*y;
x=0;
for (;x<w-31;x+=32) {
__asm__ volatile(
PREFETCH" 32(%1,%2) \n\t"
"movq (%1,%2), %%mm0 \n\t"
"movq 8(%1,%2), %%mm2 \n\t"
"movq 16(%1,%2), %%mm4 \n\t"
"movq 24(%1,%2), %%mm6 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"movq %%mm6, %%mm7 \n\t"
"punpcklbw %%mm0, %%mm0 \n\t"
"punpckhbw %%mm1, %%mm1 \n\t"
"punpcklbw %%mm2, %%mm2 \n\t"
"punpckhbw %%mm3, %%mm3 \n\t"
"punpcklbw %%mm4, %%mm4 \n\t"
"punpckhbw %%mm5, %%mm5 \n\t"
"punpcklbw %%mm6, %%mm6 \n\t"
"punpckhbw %%mm7, %%mm7 \n\t"
MOVNTQ" %%mm0, (%0,%2,2) \n\t"
MOVNTQ" %%mm1, 8(%0,%2,2) \n\t"
MOVNTQ" %%mm2, 16(%0,%2,2) \n\t"
MOVNTQ" %%mm3, 24(%0,%2,2) \n\t"
MOVNTQ" %%mm4, 32(%0,%2,2) \n\t"
MOVNTQ" %%mm5, 40(%0,%2,2) \n\t"
MOVNTQ" %%mm6, 48(%0,%2,2) \n\t"
MOVNTQ" %%mm7, 56(%0,%2,2)"
:: "r"(d), "r"(s2), "r"(x)
:"memory");
}
for (;x<w;x++) d[2*x]=d[2*x+1]=s2[x];
}
__asm__(
EMMS" \n\t"
SFENCE" \n\t"
::: "memory"
);
}
 
static inline void RENAME(yvu9_to_yuy2)(const uint8_t *src1, const uint8_t *src2, const uint8_t *src3,
uint8_t *dst,
int width, int height,
int srcStride1, int srcStride2,
int srcStride3, int dstStride)
{
x86_reg x;
int y,w,h;
w=width/2; h=height;
for (y=0;y<h;y++) {
const uint8_t* yp=src1+srcStride1*y;
const uint8_t* up=src2+srcStride2*(y>>2);
const uint8_t* vp=src3+srcStride3*(y>>2);
uint8_t* d=dst+dstStride*y;
x=0;
for (;x<w-7;x+=8) {
__asm__ volatile(
PREFETCH" 32(%1, %0) \n\t"
PREFETCH" 32(%2, %0) \n\t"
PREFETCH" 32(%3, %0) \n\t"
"movq (%1, %0, 4), %%mm0 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
"movq (%2, %0), %%mm1 \n\t" /* U0U1U2U3U4U5U6U7 */
"movq (%3, %0), %%mm2 \n\t" /* V0V1V2V3V4V5V6V7 */
"movq %%mm0, %%mm3 \n\t" /* Y0Y1Y2Y3Y4Y5Y6Y7 */
"movq %%mm1, %%mm4 \n\t" /* U0U1U2U3U4U5U6U7 */
"movq %%mm2, %%mm5 \n\t" /* V0V1V2V3V4V5V6V7 */
"punpcklbw %%mm1, %%mm1 \n\t" /* U0U0 U1U1 U2U2 U3U3 */
"punpcklbw %%mm2, %%mm2 \n\t" /* V0V0 V1V1 V2V2 V3V3 */
"punpckhbw %%mm4, %%mm4 \n\t" /* U4U4 U5U5 U6U6 U7U7 */
"punpckhbw %%mm5, %%mm5 \n\t" /* V4V4 V5V5 V6V6 V7V7 */
 
"movq %%mm1, %%mm6 \n\t"
"punpcklbw %%mm2, %%mm1 \n\t" /* U0V0 U0V0 U1V1 U1V1*/
"punpcklbw %%mm1, %%mm0 \n\t" /* Y0U0 Y1V0 Y2U0 Y3V0*/
"punpckhbw %%mm1, %%mm3 \n\t" /* Y4U1 Y5V1 Y6U1 Y7V1*/
MOVNTQ" %%mm0, (%4, %0, 8) \n\t"
MOVNTQ" %%mm3, 8(%4, %0, 8) \n\t"
 
"punpckhbw %%mm2, %%mm6 \n\t" /* U2V2 U2V2 U3V3 U3V3*/
"movq 8(%1, %0, 4), %%mm0 \n\t"
"movq %%mm0, %%mm3 \n\t"
"punpcklbw %%mm6, %%mm0 \n\t" /* Y U2 Y V2 Y U2 Y V2*/
"punpckhbw %%mm6, %%mm3 \n\t" /* Y U3 Y V3 Y U3 Y V3*/
MOVNTQ" %%mm0, 16(%4, %0, 8) \n\t"
MOVNTQ" %%mm3, 24(%4, %0, 8) \n\t"
 
"movq %%mm4, %%mm6 \n\t"
"movq 16(%1, %0, 4), %%mm0 \n\t"
"movq %%mm0, %%mm3 \n\t"
"punpcklbw %%mm5, %%mm4 \n\t"
"punpcklbw %%mm4, %%mm0 \n\t" /* Y U4 Y V4 Y U4 Y V4*/
"punpckhbw %%mm4, %%mm3 \n\t" /* Y U5 Y V5 Y U5 Y V5*/
MOVNTQ" %%mm0, 32(%4, %0, 8) \n\t"
MOVNTQ" %%mm3, 40(%4, %0, 8) \n\t"
 
"punpckhbw %%mm5, %%mm6 \n\t"
"movq 24(%1, %0, 4), %%mm0 \n\t"
"movq %%mm0, %%mm3 \n\t"
"punpcklbw %%mm6, %%mm0 \n\t" /* Y U6 Y V6 Y U6 Y V6*/
"punpckhbw %%mm6, %%mm3 \n\t" /* Y U7 Y V7 Y U7 Y V7*/
MOVNTQ" %%mm0, 48(%4, %0, 8) \n\t"
MOVNTQ" %%mm3, 56(%4, %0, 8) \n\t"
 
: "+r" (x)
: "r"(yp), "r" (up), "r"(vp), "r"(d)
:"memory");
}
for (; x<w; x++) {
const int x2 = x<<2;
d[8*x+0] = yp[x2];
d[8*x+1] = up[x];
d[8*x+2] = yp[x2+1];
d[8*x+3] = vp[x];
d[8*x+4] = yp[x2+2];
d[8*x+5] = up[x];
d[8*x+6] = yp[x2+3];
d[8*x+7] = vp[x];
}
}
__asm__(
EMMS" \n\t"
SFENCE" \n\t"
::: "memory"
);
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
 
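/*
* Copy every byte at an even offset of src to dst (e.g. the luma bytes of
* a YUYV line). dst/src are pre-biased by count and count is negated, so
* the loop runs its index from -count up to 0 and exits on the sign flag.
*/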
static void RENAME(extract_even)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
dst += count;
src += 2*count;
count= - count;
 
if(count <= -16) {
count += 15;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $8, %%mm7 \n\t"
"1: \n\t"
"movq -30(%1, %0, 2), %%mm0 \n\t"
"movq -22(%1, %0, 2), %%mm1 \n\t"
"movq -14(%1, %0, 2), %%mm2 \n\t"
"movq -6(%1, %0, 2), %%mm3 \n\t"
"pand %%mm7, %%mm0 \n\t"
"pand %%mm7, %%mm1 \n\t"
"pand %%mm7, %%mm2 \n\t"
"pand %%mm7, %%mm3 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"packuswb %%mm3, %%mm2 \n\t"
MOVNTQ" %%mm0,-15(%2, %0) \n\t"
MOVNTQ" %%mm2,- 7(%2, %0) \n\t"
"add $16, %0 \n\t"
" js 1b \n\t"
: "+r"(count)
: "r"(src), "r"(dst)
);
count -= 15;
}
while(count<0) {
dst[count]= src[2*count];
count++;
}
}
 
static void RENAME(extract_odd)(const uint8_t *src, uint8_t *dst, x86_reg count)
{
src ++;
dst += count;
src += 2*count;
count= - count;
 
if(count < -16) {
count += 16;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $8, %%mm7 \n\t"
"1: \n\t"
"movq -32(%1, %0, 2), %%mm0 \n\t"
"movq -24(%1, %0, 2), %%mm1 \n\t"
"movq -16(%1, %0, 2), %%mm2 \n\t"
"movq -8(%1, %0, 2), %%mm3 \n\t"
"pand %%mm7, %%mm0 \n\t"
"pand %%mm7, %%mm1 \n\t"
"pand %%mm7, %%mm2 \n\t"
"pand %%mm7, %%mm3 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"packuswb %%mm3, %%mm2 \n\t"
MOVNTQ" %%mm0,-16(%2, %0) \n\t"
MOVNTQ" %%mm2,- 8(%2, %0) \n\t"
"add $16, %0 \n\t"
" js 1b \n\t"
: "+r"(count)
: "r"(src), "r"(dst)
);
count -= 16;
}
while(count<0) {
dst[count]= src[2*count];
count++;
}
}
 
#if !COMPILE_TEMPLATE_AMD3DNOW
static void RENAME(extract_even2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
dst0+= count;
dst1+= count;
src += 4*count;
count= - count;
if(count <= -8) {
count += 7;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $8, %%mm7 \n\t"
"1: \n\t"
"movq -28(%1, %0, 4), %%mm0 \n\t"
"movq -20(%1, %0, 4), %%mm1 \n\t"
"movq -12(%1, %0, 4), %%mm2 \n\t"
"movq -4(%1, %0, 4), %%mm3 \n\t"
"pand %%mm7, %%mm0 \n\t"
"pand %%mm7, %%mm1 \n\t"
"pand %%mm7, %%mm2 \n\t"
"pand %%mm7, %%mm3 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"packuswb %%mm3, %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"psrlw $8, %%mm0 \n\t"
"psrlw $8, %%mm2 \n\t"
"pand %%mm7, %%mm1 \n\t"
"pand %%mm7, %%mm3 \n\t"
"packuswb %%mm2, %%mm0 \n\t"
"packuswb %%mm3, %%mm1 \n\t"
MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
"add $8, %0 \n\t"
" js 1b \n\t"
: "+r"(count)
: "r"(src), "r"(dst0), "r"(dst1)
);
count -= 7;
}
while(count<0) {
dst0[count]= src[4*count+0];
dst1[count]= src[4*count+2];
count++;
}
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
 
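/*
* Like extract_even2, but each output byte is the PAVGB average of two
* source lines; used below for the vertically subsampled chroma of the
* 4:2:0 outputs.
*/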
static void RENAME(extract_even2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
dst0 += count;
dst1 += count;
src0 += 4*count;
src1 += 4*count;
count= - count;
#ifdef PAVGB
if(count <= -8) {
count += 7;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $8, %%mm7 \n\t"
"1: \n\t"
"movq -28(%1, %0, 4), %%mm0 \n\t"
"movq -20(%1, %0, 4), %%mm1 \n\t"
"movq -12(%1, %0, 4), %%mm2 \n\t"
"movq -4(%1, %0, 4), %%mm3 \n\t"
PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
"pand %%mm7, %%mm0 \n\t"
"pand %%mm7, %%mm1 \n\t"
"pand %%mm7, %%mm2 \n\t"
"pand %%mm7, %%mm3 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"packuswb %%mm3, %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"psrlw $8, %%mm0 \n\t"
"psrlw $8, %%mm2 \n\t"
"pand %%mm7, %%mm1 \n\t"
"pand %%mm7, %%mm3 \n\t"
"packuswb %%mm2, %%mm0 \n\t"
"packuswb %%mm3, %%mm1 \n\t"
MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
"add $8, %0 \n\t"
" js 1b \n\t"
: "+r"(count)
: "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
);
count -= 7;
}
#endif
while(count<0) {
dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
count++;
}
}
 
#if !COMPILE_TEMPLATE_AMD3DNOW
static void RENAME(extract_odd2)(const uint8_t *src, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
dst0+= count;
dst1+= count;
src += 4*count;
count= - count;
if(count <= -8) {
count += 7;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $8, %%mm7 \n\t"
"1: \n\t"
"movq -28(%1, %0, 4), %%mm0 \n\t"
"movq -20(%1, %0, 4), %%mm1 \n\t"
"movq -12(%1, %0, 4), %%mm2 \n\t"
"movq -4(%1, %0, 4), %%mm3 \n\t"
"psrlw $8, %%mm0 \n\t"
"psrlw $8, %%mm1 \n\t"
"psrlw $8, %%mm2 \n\t"
"psrlw $8, %%mm3 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"packuswb %%mm3, %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"psrlw $8, %%mm0 \n\t"
"psrlw $8, %%mm2 \n\t"
"pand %%mm7, %%mm1 \n\t"
"pand %%mm7, %%mm3 \n\t"
"packuswb %%mm2, %%mm0 \n\t"
"packuswb %%mm3, %%mm1 \n\t"
MOVNTQ" %%mm0,- 7(%3, %0) \n\t"
MOVNTQ" %%mm1,- 7(%2, %0) \n\t"
"add $8, %0 \n\t"
" js 1b \n\t"
: "+r"(count)
: "r"(src), "r"(dst0), "r"(dst1)
);
count -= 7;
}
src++;
while(count<0) {
dst0[count]= src[4*count+0];
dst1[count]= src[4*count+2];
count++;
}
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
 
static void RENAME(extract_odd2avg)(const uint8_t *src0, const uint8_t *src1, uint8_t *dst0, uint8_t *dst1, x86_reg count)
{
dst0 += count;
dst1 += count;
src0 += 4*count;
src1 += 4*count;
count= - count;
#ifdef PAVGB
if(count <= -8) {
count += 7;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $8, %%mm7 \n\t"
"1: \n\t"
"movq -28(%1, %0, 4), %%mm0 \n\t"
"movq -20(%1, %0, 4), %%mm1 \n\t"
"movq -12(%1, %0, 4), %%mm2 \n\t"
"movq -4(%1, %0, 4), %%mm3 \n\t"
PAVGB" -28(%2, %0, 4), %%mm0 \n\t"
PAVGB" -20(%2, %0, 4), %%mm1 \n\t"
PAVGB" -12(%2, %0, 4), %%mm2 \n\t"
PAVGB" - 4(%2, %0, 4), %%mm3 \n\t"
"psrlw $8, %%mm0 \n\t"
"psrlw $8, %%mm1 \n\t"
"psrlw $8, %%mm2 \n\t"
"psrlw $8, %%mm3 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"packuswb %%mm3, %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"psrlw $8, %%mm0 \n\t"
"psrlw $8, %%mm2 \n\t"
"pand %%mm7, %%mm1 \n\t"
"pand %%mm7, %%mm3 \n\t"
"packuswb %%mm2, %%mm0 \n\t"
"packuswb %%mm3, %%mm1 \n\t"
MOVNTQ" %%mm0,- 7(%4, %0) \n\t"
MOVNTQ" %%mm1,- 7(%3, %0) \n\t"
"add $8, %0 \n\t"
" js 1b \n\t"
: "+r"(count)
: "r"(src0), "r"(src1), "r"(dst0), "r"(dst1)
);
count -= 7;
}
#endif
src0++;
src1++;
while(count<0) {
dst0[count]= (src0[4*count+0]+src1[4*count+0])>>1;
dst1[count]= (src0[4*count+2]+src1[4*count+2])>>1;
count++;
}
}
 
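/*
* Packed YUYV 4:2:2 to planar 4:2:0: luma is extracted from every line,
* chroma only on odd lines, averaged with the line above to halve the
* vertical chroma resolution.
*/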
static void RENAME(yuyvtoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
 
for (y=0; y<height; y++) {
RENAME(extract_even)(src, ydst, width);
if(y&1) {
RENAME(extract_odd2avg)(src-srcStride, src, udst, vdst, chromWidth);
udst+= chromStride;
vdst+= chromStride;
}
 
src += srcStride;
ydst+= lumStride;
}
__asm__(
EMMS" \n\t"
SFENCE" \n\t"
::: "memory"
);
}
 
#if !COMPILE_TEMPLATE_AMD3DNOW
static void RENAME(yuyvtoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
 
for (y=0; y<height; y++) {
RENAME(extract_even)(src, ydst, width);
RENAME(extract_odd2)(src, udst, vdst, chromWidth);
 
src += srcStride;
ydst+= lumStride;
udst+= chromStride;
vdst+= chromStride;
}
__asm__(
EMMS" \n\t"
SFENCE" \n\t"
::: "memory"
);
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
 
static void RENAME(uyvytoyuv420)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
 
for (y=0; y<height; y++) {
RENAME(extract_odd)(src, ydst, width);
if(y&1) {
RENAME(extract_even2avg)(src-srcStride, src, udst, vdst, chromWidth);
udst+= chromStride;
vdst+= chromStride;
}
 
src += srcStride;
ydst+= lumStride;
}
__asm__(
EMMS" \n\t"
SFENCE" \n\t"
::: "memory"
);
}
 
#if !COMPILE_TEMPLATE_AMD3DNOW
static void RENAME(uyvytoyuv422)(uint8_t *ydst, uint8_t *udst, uint8_t *vdst, const uint8_t *src,
int width, int height,
int lumStride, int chromStride, int srcStride)
{
int y;
const int chromWidth = FF_CEIL_RSHIFT(width, 1);
 
for (y=0; y<height; y++) {
RENAME(extract_odd)(src, ydst, width);
RENAME(extract_even2)(src, udst, vdst, chromWidth);
 
src += srcStride;
ydst+= lumStride;
udst+= chromStride;
vdst+= chromStride;
}
__asm__(
EMMS" \n\t"
SFENCE" \n\t"
::: "memory"
);
}
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
#endif /* !COMPILE_TEMPLATE_SSE2 */
 
static av_cold void RENAME(rgb2rgb_init)(void)
{
#if !COMPILE_TEMPLATE_SSE2
#if !COMPILE_TEMPLATE_AMD3DNOW
rgb15to16 = RENAME(rgb15to16);
rgb15tobgr24 = RENAME(rgb15tobgr24);
rgb15to32 = RENAME(rgb15to32);
rgb16tobgr24 = RENAME(rgb16tobgr24);
rgb16to32 = RENAME(rgb16to32);
rgb16to15 = RENAME(rgb16to15);
rgb24tobgr16 = RENAME(rgb24tobgr16);
rgb24tobgr15 = RENAME(rgb24tobgr15);
rgb24tobgr32 = RENAME(rgb24tobgr32);
rgb32to16 = RENAME(rgb32to16);
rgb32to15 = RENAME(rgb32to15);
rgb32tobgr24 = RENAME(rgb32tobgr24);
rgb24to15 = RENAME(rgb24to15);
rgb24to16 = RENAME(rgb24to16);
rgb24tobgr24 = RENAME(rgb24tobgr24);
shuffle_bytes_2103 = RENAME(shuffle_bytes_2103);
rgb32tobgr16 = RENAME(rgb32tobgr16);
rgb32tobgr15 = RENAME(rgb32tobgr15);
yv12toyuy2 = RENAME(yv12toyuy2);
yv12touyvy = RENAME(yv12touyvy);
yuv422ptoyuy2 = RENAME(yuv422ptoyuy2);
yuv422ptouyvy = RENAME(yuv422ptouyvy);
yuy2toyv12 = RENAME(yuy2toyv12);
vu9_to_vu12 = RENAME(vu9_to_vu12);
yvu9_to_yuy2 = RENAME(yvu9_to_yuy2);
uyvytoyuv422 = RENAME(uyvytoyuv422);
yuyvtoyuv422 = RENAME(yuyvtoyuv422);
#endif /* !COMPILE_TEMPLATE_AMD3DNOW */
 
#if COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW
planar2x = RENAME(planar2x);
#endif /* COMPILE_TEMPLATE_MMXEXT || COMPILE_TEMPLATE_AMD3DNOW */
#if HAVE_7REGS
ff_rgb24toyv12 = RENAME(rgb24toyv12);
#endif /* HAVE_7REGS */
 
yuyvtoyuv420 = RENAME(yuyvtoyuv420);
uyvytoyuv420 = RENAME(uyvytoyuv420);
#endif /* !COMPILE_TEMPLATE_SSE2 */
 
#if !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX
interleaveBytes = RENAME(interleaveBytes);
#endif /* !COMPILE_TEMPLATE_AMD3DNOW && !COMPILE_TEMPLATE_AVX */
#if !COMPILE_TEMPLATE_AVX || HAVE_AVX_EXTERNAL
#if !COMPILE_TEMPLATE_AMD3DNOW && (ARCH_X86_32 || COMPILE_TEMPLATE_SSE2) && COMPILE_TEMPLATE_MMXEXT == COMPILE_TEMPLATE_SSE2 && HAVE_YASM
deinterleaveBytes = RENAME(deinterleaveBytes);
#endif
#endif
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/x86/scale.asm
0,0 → 1,431
;******************************************************************************
;* x86-optimized horizontal line scaling functions
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
max_19bit_int: times 4 dd 0x7ffff
max_19bit_flt: times 4 dd 524287.0
minshort: times 8 dw 0x8000
unicoeff: times 4 dd 0x20000000
 
SECTION .text
 
;-----------------------------------------------------------------------------
; horizontal line scaling
;
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
; (SwsContext *c, int{16,32}_t *dst,
; int dstW, const uint{8,16}_t *src,
; const int16_t *filter,
; const int32_t *filterPos, int filterSize);
;
; Scale one horizontal line. Input is either 8-bit or 9-16-bit wide
; ($source_width can be 8, 9, 10, 12, 14 or 16; the difference is whether we
; have to downscale before multiplying). The filter is 14-bit. Output is either
; 15-bit (in int16_t) or 19-bit (in int32_t), as given in $intermediate_nbits. Each
; output pixel is generated from $filterSize input pixels, the position of
; the first pixel is given in filterPos[nOutputPixel].
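;
; For example, hscale8to15_4_sse2 (one of the instantiations generated by the
; macros below) reads 4 unsigned 8-bit source pixels per output pixel and
; writes saturated 15-bit results as int16_t.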
;-----------------------------------------------------------------------------
 
; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
%macro SCALE_FUNC 6
%ifnidn %3, X
cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
%else
cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
%endif
%if ARCH_X86_64
movsxd wq, wd
%define mov32 movsxd
%else ; x86-32
%define mov32 mov
%endif ; x86-64
%if %2 == 19
%if mmsize == 8 ; mmx
mova m2, [max_19bit_int]
%elif cpuflag(sse4)
mova m2, [max_19bit_int]
%else ; ssse3/sse2
mova m2, [max_19bit_flt]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
mova m6, [minshort]
mova m7, [unicoeff]
%elif %1 == 8
pxor m3, m3
%endif ; %1 == 8/16
 
%if %1 == 8
%define movlh movd
%define movbh movh
%define srcmul 1
%else ; %1 == 9-16
%define movlh movq
%define movbh movu
%define srcmul 2
%endif ; %1 == 8/9-16
 
%ifnidn %3, X
 
; setup loop
%if %3 == 8
shl wq, 1 ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
%define wshr 1
%else ; %3 == 4
%define wshr 0
%endif ; %3 == 8
lea filterq, [filterq+wq*8]
%if %2 == 15
lea dstq, [dstq+wq*(2>>wshr)]
%else ; %2 == 19
lea dstq, [dstq+wq*(4>>wshr)]
%endif ; %2 == 15/19
lea fltposq, [fltposq+wq*(4>>wshr)]
neg wq
 
.loop:
%if %3 == 4 ; filterSize == 4 scaling
; load 2x4 or 4x4 source pixels into m0/m1
mov32 pos0q, dword [fltposq+wq*4+ 0] ; filterPos[0]
mov32 pos1q, dword [fltposq+wq*4+ 4] ; filterPos[1]
movlh m0, [srcq+pos0q*srcmul] ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
movlh m1, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
movhps m0, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
movd m4, [srcq+pos1q*srcmul] ; src[filterPos[1] + {0,1,2,3}]
%endif
mov32 pos0q, dword [fltposq+wq*4+ 8] ; filterPos[2]
mov32 pos1q, dword [fltposq+wq*4+12] ; filterPos[3]
movlh m1, [srcq+pos0q*srcmul] ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
movhps m1, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
movd m5, [srcq+pos1q*srcmul] ; src[filterPos[3] + {0,1,2,3}]
punpckldq m0, m4
punpckldq m1, m5
%endif ; %1 == 8
%endif ; mmsize == 8/16
%if %1 == 8
punpcklbw m0, m3 ; byte -> word
punpcklbw m1, m3 ; byte -> word
%endif ; %1 == 8
 
; multiply with filter coefficients
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
psubw m1, m6
%endif ; %1 == 16
pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
 
; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if mmsize == 8 ; mmx
movq m4, m0
punpckldq m0, m1
punpckhdq m4, m1
paddd m0, m4
%elif notcpuflag(ssse3) ; sse2
mova m4, m0
shufps m0, m1, 10001000b
shufps m4, m1, 11011101b
paddd m0, m4
%else ; ssse3/sse4
phaddd m0, m1 ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; mmx/sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
mov32 pos0q, dword [fltposq+wq*2+0] ; filterPos[0]
mov32 pos1q, dword [fltposq+wq*2+4] ; filterPos[1]
movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
movbh m1, [srcq+(pos0q+4)*srcmul] ; src[filterPos[0] + {4,5,6,7}]
movbh m4, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3}]
movbh m5, [srcq+(pos1q+4)*srcmul] ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
movbh m1, [srcq+ pos1q *srcmul] ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
mov32 pos0q, dword [fltposq+wq*2+8] ; filterPos[2]
mov32 pos1q, dword [fltposq+wq*2+12] ; filterPos[3]
movbh m4, [srcq+ pos0q *srcmul] ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
movbh m5, [srcq+ pos1q *srcmul] ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
punpcklbw m0, m3 ; byte -> word
punpcklbw m1, m3 ; byte -> word
punpcklbw m4, m3 ; byte -> word
punpcklbw m5, m3 ; byte -> word
%endif ; %1 == 8
 
; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
psubw m1, m6
psubw m4, m6
psubw m5, m6
%endif ; %1 == 16
pmaddwd m0, [filterq+wq*8+mmsize*0] ; *= filter[{0,1,..,6,7}]
pmaddwd m1, [filterq+wq*8+mmsize*1] ; *= filter[{8,9,..,14,15}]
pmaddwd m4, [filterq+wq*8+mmsize*2] ; *= filter[{16,17,..,22,23}]
pmaddwd m5, [filterq+wq*8+mmsize*3] ; *= filter[{24,25,..,30,31}]
 
; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if mmsize == 8
paddd m0, m1
paddd m4, m5
movq m1, m0
punpckldq m0, m4
punpckhdq m1, m4
paddd m0, m1
%elif notcpuflag(ssse3) ; sse2
%if %1 == 8
%define mex m6
%else
%define mex m3
%endif
; emulate horizontal add as transpose + vertical add
mova mex, m0
punpckldq m0, m1
punpckhdq mex, m1
paddd m0, mex
mova m1, m4
punpckldq m4, m5
punpckhdq m1, m5
paddd m4, m1
mova m1, m0
punpcklqdq m0, m4
punpckhqdq m1, m4
paddd m0, m1
%else ; ssse3/sse4
; FIXME if we rearrange the filter in pairs of 4, we can
; load pixels likewise and use 2 x paddd + phaddd instead
; of 3 x phaddd here, faster on older cpus
phaddd m0, m1
phaddd m4, m5
phaddd m0, m4 ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %3 == 4/8
 
%else ; %3 == X, i.e. any filterSize scaling
 
%ifidn %4, X4
%define dlt 4
%else ; %4 == X || %4 == X8
%define dlt 0
%endif ; %4 ==/!= X4
%if ARCH_X86_64
%define srcq r8
%define pos1q r7
%define srcendq r9
movsxd fltsizeq, fltsized ; filterSize
lea srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
%else ; x86-32
%define srcq srcmemq
%define pos1q dstq
%define srcendq r6m
lea pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
mov srcendq, pos0q
%endif ; x86-32/64
lea fltposq, [fltposq+wq*4]
%if %2 == 15
lea dstq, [dstq+wq*2]
%else ; %2 == 19
lea dstq, [dstq+wq*4]
%endif ; %2 == 15/19
movifnidn dstmp, dstq
neg wq
 
.loop:
mov32 pos0q, dword [fltposq+wq*4+0] ; filterPos[0]
mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
pxor m4, m4
pxor m5, m5
mov srcq, srcmemmp
 
.innerloop:
; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
movbh m0, [srcq+ pos0q *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
movbh m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
punpcklbw m0, m3
punpcklbw m1, m3
%endif ; %1 == 8
 
; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
psubw m1, m6
%endif ; %1 == 16
pmaddwd m0, [filterq] ; filter[{0,1,2,3(,4,5,6,7)}]
pmaddwd m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
paddd m4, m0
paddd m5, m1
add filterq, mmsize
add srcq, srcmul*mmsize/2
cmp srcq, srcendq ; while (src += 4) < &src[filterSize]
jl .innerloop
 
%ifidn %4, X4
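; X4 tail: filterSize is a multiple of 4 but not of 8, so the last 4 taps of
; dstpx[0] and the first 4 taps of dstpx[1] are merged into one vector pass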
mov32 pos1q, dword [fltposq+wq*4+4] ; filterPos[1]
movlh m0, [srcq+ pos0q *srcmul] ; split last 4 srcpx of dstpx[0]
sub pos1q, fltsizeq ; and first 4 srcpx of dstpx[1]
%if %1 > 8
movhps m0, [srcq+(pos1q+dlt)*srcmul]
%else ; %1 == 8
movd m1, [srcq+(pos1q+dlt)*srcmul]
punpckldq m0, m1
%endif ; %1 == 8
%if %1 == 8
punpcklbw m0, m3
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
; add back 0x8000 * sum(coeffs) after the horizontal add
psubw m0, m6
%endif ; %1 == 16
pmaddwd m0, [filterq]
%endif ; %4 == X4
 
lea filterq, [filterq+(fltsizeq+dlt)*2]
 
%if mmsize == 8 ; mmx
movq m0, m4
punpckldq m4, m5
punpckhdq m0, m5
paddd m0, m4
%else ; mmsize == 16
%if notcpuflag(ssse3) ; sse2
mova m1, m4
punpcklqdq m4, m5
punpckhqdq m1, m5
paddd m4, m1
%else ; ssse3/sse4
phaddd m4, m5
%endif ; sse2/ssse3/sse4
%ifidn %4, X4
paddd m4, m0
%endif ; %4 == X4
%if notcpuflag(ssse3) ; sse2
pshufd m4, m4, 11011000b
movhlps m0, m4
paddd m0, m4
%else ; ssse3/sse4
phaddd m4, m4
SWAP 0, 4
%endif ; sse2/ssse3/sse4
%endif ; mmsize == 8/16
%endif ; %3 ==/!= X
 
%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
paddd m0, m7
%endif ; %1 == 16
 
; clip, store
psrad m0, 14 + %1 - %2
%ifidn %3, X
movifnidn dstq, dstmp
%endif ; %3 == X
%if %2 == 15
packssdw m0, m0
%ifnidn %3, X
movh [dstq+wq*(2>>wshr)], m0
%else ; %3 == X
movd [dstq+wq*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
%if mmsize == 8
PMINSD_MMX m0, m2, m4
%elif cpuflag(sse4)
pminsd m0, m2
%else ; sse2/ssse3
cvtdq2ps m0, m0
minps m0, m2
cvtps2dq m0, m0
%endif ; mmx/sse2/ssse3/sse4
%ifnidn %3, X
mova [dstq+wq*(4>>wshr)], m0
%else ; %3 == X
movq [dstq+wq*4], m0
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
add wq, (mmsize<<wshr)/4 ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
; per iteration; see the "shl wq,1" above for why we do this
%else ; %3 == X
add wq, 2
%endif ; %3 ==/!= X
jl .loop
REP_RET
%endmacro
 
; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
%macro SCALE_FUNCS 3
SCALE_FUNC %1, %2, 4, 4, 6, %3
SCALE_FUNC %1, %2, 8, 8, 6, %3
%if mmsize == 8
SCALE_FUNC %1, %2, X, X, 7, %3
%else
SCALE_FUNC %1, %2, X, X4, 7, %3
SCALE_FUNC %1, %2, X, X8, 7, %3
%endif
%endmacro
 
; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
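; (each argument is the n_xmm count handed down to SCALE_FUNC for the 8-bit,
; 9/10-bit and 16-bit source-depth variants, respectively)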
%macro SCALE_FUNCS2 3
%if notcpuflag(sse4)
SCALE_FUNCS 8, 15, %1
SCALE_FUNCS 9, 15, %2
SCALE_FUNCS 10, 15, %2
SCALE_FUNCS 12, 15, %2
SCALE_FUNCS 14, 15, %2
SCALE_FUNCS 16, 15, %3
%endif ; !sse4
SCALE_FUNCS 8, 19, %1
SCALE_FUNCS 9, 19, %2
SCALE_FUNCS 10, 19, %2
SCALE_FUNCS 12, 19, %2
SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro
 
%if ARCH_X86_32
INIT_MMX mmx
SCALE_FUNCS2 0, 0, 0
%endif
INIT_XMM sse2
SCALE_FUNCS2 7, 6, 8
INIT_XMM ssse3
SCALE_FUNCS2 6, 6, 8
INIT_XMM sse4
SCALE_FUNCS2 6, 6, 8
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/x86/swscale.c
0,0 → 1,607
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <inttypes.h>
#include "config.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/intreadwrite.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
#include "libavutil/pixdesc.h"
 
#if HAVE_INLINE_ASM
 
#define DITHER1XBPP
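/* enables the ordered-dither adds in the 15/16 bpp RGB writers of the included template */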
 
DECLARE_ASM_CONST(8, uint64_t, bF8)= 0xF8F8F8F8F8F8F8F8LL;
DECLARE_ASM_CONST(8, uint64_t, bFC)= 0xFCFCFCFCFCFCFCFCLL;
DECLARE_ASM_CONST(8, uint64_t, w10)= 0x0010001000100010LL;
DECLARE_ASM_CONST(8, uint64_t, w02)= 0x0002000200020002LL;
 
const DECLARE_ALIGNED(8, uint64_t, ff_dither4)[2] = {
0x0103010301030103LL,
0x0200020002000200LL,};
 
const DECLARE_ALIGNED(8, uint64_t, ff_dither8)[2] = {
0x0602060206020602LL,
0x0004000400040004LL,};
 
DECLARE_ASM_CONST(8, uint64_t, b16Mask)= 0x001F001F001F001FLL;
DECLARE_ASM_CONST(8, uint64_t, g16Mask)= 0x07E007E007E007E0LL;
DECLARE_ASM_CONST(8, uint64_t, r16Mask)= 0xF800F800F800F800LL;
DECLARE_ASM_CONST(8, uint64_t, b15Mask)= 0x001F001F001F001FLL;
DECLARE_ASM_CONST(8, uint64_t, g15Mask)= 0x03E003E003E003E0LL;
DECLARE_ASM_CONST(8, uint64_t, r15Mask)= 0x7C007C007C007C00LL;
 
DECLARE_ALIGNED(8, const uint64_t, ff_M24A) = 0x00FF0000FF0000FFLL;
DECLARE_ALIGNED(8, const uint64_t, ff_M24B) = 0xFF0000FF0000FF00LL;
DECLARE_ALIGNED(8, const uint64_t, ff_M24C) = 0x0000FF0000FF0000LL;
 
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2YOffset) = 0x1010101010101010ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_bgr2UVOffset) = 0x8080808080808080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_w1111) = 0x0001000100010001ULL;
 
 
//MMX versions
#if HAVE_MMX_INLINE
#undef RENAME
#define COMPILE_TEMPLATE_MMXEXT 0
#define RENAME(a) a ## _mmx
#include "swscale_template.c"
#endif
 
// MMXEXT versions
#if HAVE_MMXEXT_INLINE
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _mmxext
#include "swscale_template.c"
#endif
 
void ff_updateMMXDitherTables(SwsContext *c, int dstY, int lumBufIndex, int chrBufIndex,
int lastInLumBuf, int lastInChrBuf)
{
const int dstH= c->dstH;
const int flags= c->flags;
#ifdef NEW_FILTER
SwsPlane *lumPlane = &c->slice[c->numSlice-2].plane[0];
SwsPlane *chrUPlane = &c->slice[c->numSlice-2].plane[1];
SwsPlane *alpPlane = &c->slice[c->numSlice-2].plane[3];
#else
int16_t **lumPixBuf= c->lumPixBuf;
int16_t **chrUPixBuf= c->chrUPixBuf;
int16_t **alpPixBuf= c->alpPixBuf;
const int vLumBufSize= c->vLumBufSize;
const int vChrBufSize= c->vChrBufSize;
#endif
int hasAlpha = c->alpPixBuf != NULL;
int32_t *vLumFilterPos= c->vLumFilterPos;
int32_t *vChrFilterPos= c->vChrFilterPos;
int16_t *vLumFilter= c->vLumFilter;
int16_t *vChrFilter= c->vChrFilter;
int32_t *lumMmxFilter= c->lumMmxFilter;
int32_t *chrMmxFilter= c->chrMmxFilter;
int32_t av_unused *alpMmxFilter= c->alpMmxFilter;
const int vLumFilterSize= c->vLumFilterSize;
const int vChrFilterSize= c->vChrFilterSize;
const int chrDstY= dstY>>c->chrDstVSubSample;
const int firstLumSrcY= vLumFilterPos[dstY]; //First line needed as input
const int firstChrSrcY= vChrFilterPos[chrDstY]; //First line needed as input
 
c->blueDither= ff_dither8[dstY&1];
if (c->dstFormat == AV_PIX_FMT_RGB555 || c->dstFormat == AV_PIX_FMT_BGR555)
c->greenDither= ff_dither8[dstY&1];
else
c->greenDither= ff_dither4[dstY&1];
c->redDither= ff_dither8[(dstY+1)&1];
if (dstY < dstH - 2) {
#ifdef NEW_FILTER
const int16_t **lumSrcPtr = (const int16_t **)(void*) lumPlane->line + firstLumSrcY - lumPlane->sliceY;
const int16_t **chrUSrcPtr = (const int16_t **)(void*) chrUPlane->line + firstChrSrcY - chrUPlane->sliceY;
const int16_t **alpSrcPtr = (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) ? (const int16_t **)(void*) alpPlane->line + firstLumSrcY - alpPlane->sliceY : NULL;
#else
const int16_t **lumSrcPtr= (const int16_t **)(void*) lumPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize;
const int16_t **chrUSrcPtr= (const int16_t **)(void*) chrUPixBuf + chrBufIndex + firstChrSrcY - lastInChrBuf + vChrBufSize;
const int16_t **alpSrcPtr= (CONFIG_SWSCALE_ALPHA && alpPixBuf) ? (const int16_t **)(void*) alpPixBuf + lumBufIndex + firstLumSrcY - lastInLumBuf + vLumBufSize : NULL;
#endif
int i;
if (firstLumSrcY < 0 || firstLumSrcY + vLumFilterSize > c->srcH) {
#ifdef NEW_FILTER
const int16_t **tmpY = (const int16_t **) lumPlane->tmp;
#else
const int16_t **tmpY = (const int16_t **) lumPixBuf + 2 * vLumBufSize;
#endif
int neg = -firstLumSrcY, i, end = FFMIN(c->srcH - firstLumSrcY, vLumFilterSize);
for (i = 0; i < neg; i++)
tmpY[i] = lumSrcPtr[neg];
for ( ; i < end; i++)
tmpY[i] = lumSrcPtr[i];
for ( ; i < vLumFilterSize; i++)
tmpY[i] = tmpY[i-1];
lumSrcPtr = tmpY;
 
if (alpSrcPtr) {
#ifdef NEW_FILTER
const int16_t **tmpA = (const int16_t **) alpPlane->tmp;
#else
const int16_t **tmpA = (const int16_t **) alpPixBuf + 2 * vLumBufSize;
#endif
for (i = 0; i < neg; i++)
tmpA[i] = alpSrcPtr[neg];
for ( ; i < end; i++)
tmpA[i] = alpSrcPtr[i];
for ( ; i < vLumFilterSize; i++)
tmpA[i] = tmpA[i - 1];
alpSrcPtr = tmpA;
}
}
if (firstChrSrcY < 0 || firstChrSrcY + vChrFilterSize > c->chrSrcH) {
#ifdef NEW_FILTER
const int16_t **tmpU = (const int16_t **) chrUPlane->tmp;
#else
const int16_t **tmpU = (const int16_t **) chrUPixBuf + 2 * vChrBufSize;
#endif
int neg = -firstChrSrcY, i, end = FFMIN(c->chrSrcH - firstChrSrcY, vChrFilterSize);
for (i = 0; i < neg; i++) {
tmpU[i] = chrUSrcPtr[neg];
}
for ( ; i < end; i++) {
tmpU[i] = chrUSrcPtr[i];
}
for ( ; i < vChrFilterSize; i++) {
tmpU[i] = tmpU[i - 1];
}
chrUSrcPtr = tmpU;
}
 
if (flags & SWS_ACCURATE_RND) {
int s= APCK_SIZE / 8;
for (i=0; i<vLumFilterSize; i+=2) {
*(const void**)&lumMmxFilter[s*i ]= lumSrcPtr[i ];
*(const void**)&lumMmxFilter[s*i+APCK_PTR2/4 ]= lumSrcPtr[i+(vLumFilterSize>1)];
lumMmxFilter[s*i+APCK_COEF/4 ]=
lumMmxFilter[s*i+APCK_COEF/4+1]= vLumFilter[dstY*vLumFilterSize + i ]
+ (vLumFilterSize>1 ? vLumFilter[dstY*vLumFilterSize + i + 1]<<16 : 0);
if (CONFIG_SWSCALE_ALPHA && hasAlpha) {
*(const void**)&alpMmxFilter[s*i ]= alpSrcPtr[i ];
*(const void**)&alpMmxFilter[s*i+APCK_PTR2/4 ]= alpSrcPtr[i+(vLumFilterSize>1)];
alpMmxFilter[s*i+APCK_COEF/4 ]=
alpMmxFilter[s*i+APCK_COEF/4+1]= lumMmxFilter[s*i+APCK_COEF/4 ];
}
}
for (i=0; i<vChrFilterSize; i+=2) {
*(const void**)&chrMmxFilter[s*i ]= chrUSrcPtr[i ];
*(const void**)&chrMmxFilter[s*i+APCK_PTR2/4 ]= chrUSrcPtr[i+(vChrFilterSize>1)];
chrMmxFilter[s*i+APCK_COEF/4 ]=
chrMmxFilter[s*i+APCK_COEF/4+1]= vChrFilter[chrDstY*vChrFilterSize + i ]
+ (vChrFilterSize>1 ? vChrFilter[chrDstY*vChrFilterSize + i + 1]<<16 : 0);
}
} else {
for (i=0; i<vLumFilterSize; i++) {
*(const void**)&lumMmxFilter[4*i+0]= lumSrcPtr[i];
lumMmxFilter[4*i+2]=
lumMmxFilter[4*i+3]=
((uint16_t)vLumFilter[dstY*vLumFilterSize + i])*0x10001U;
if (CONFIG_SWSCALE_ALPHA && hasAlpha) {
*(const void**)&alpMmxFilter[4*i+0]= alpSrcPtr[i];
alpMmxFilter[4*i+2]=
alpMmxFilter[4*i+3]= lumMmxFilter[4*i+2];
}
}
for (i=0; i<vChrFilterSize; i++) {
*(const void**)&chrMmxFilter[4*i+0]= chrUSrcPtr[i];
chrMmxFilter[4*i+2]=
chrMmxFilter[4*i+3]=
((uint16_t)vChrFilter[chrDstY*vChrFilterSize + i])*0x10001U;
}
}
}
}
 
#if HAVE_MMXEXT
static void yuv2yuvX_sse3(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
{
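/* movntdq needs a 16-byte-aligned destination; fall back to the MMXEXT version otherwise */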
if (((uintptr_t)dest) & 15) {
yuv2yuvX_mmxext(filter, filterSize, src, dest, dstW, dither, offset);
return;
}
filterSize--;
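/* the decremented filterSize is broadcast, scaled by 8 and added to the dither
* words before the >>4 pre-shift, forming the accumulators' initial rounding bias */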
#define MAIN_FUNCTION \
"pxor %%xmm0, %%xmm0 \n\t" \
"punpcklbw %%xmm0, %%xmm3 \n\t" \
"movd %4, %%xmm1 \n\t" \
"punpcklwd %%xmm1, %%xmm1 \n\t" \
"punpckldq %%xmm1, %%xmm1 \n\t" \
"punpcklqdq %%xmm1, %%xmm1 \n\t" \
"psllw $3, %%xmm1 \n\t" \
"paddw %%xmm1, %%xmm3 \n\t" \
"psraw $4, %%xmm3 \n\t" \
"movdqa %%xmm3, %%xmm4 \n\t" \
"movdqa %%xmm3, %%xmm7 \n\t" \
"movl %3, %%ecx \n\t" \
"mov %0, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
".p2align 4 \n\t" /* FIXME Unroll? */\
"1: \n\t"\
"movddup 8(%%"REG_d"), %%xmm0 \n\t" /* filterCoeff */\
"movdqa (%%"REG_S", %%"REG_c", 2), %%xmm2 \n\t" /* srcData */\
"movdqa 16(%%"REG_S", %%"REG_c", 2), %%xmm5 \n\t" /* srcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
"pmulhw %%xmm0, %%xmm2 \n\t"\
"pmulhw %%xmm0, %%xmm5 \n\t"\
"paddw %%xmm2, %%xmm3 \n\t"\
"paddw %%xmm5, %%xmm4 \n\t"\
" jnz 1b \n\t"\
"psraw $3, %%xmm3 \n\t"\
"psraw $3, %%xmm4 \n\t"\
"packuswb %%xmm4, %%xmm3 \n\t"\
"movntdq %%xmm3, (%1, %%"REG_c")\n\t"\
"add $16, %%"REG_c" \n\t"\
"cmp %2, %%"REG_c" \n\t"\
"movdqa %%xmm7, %%xmm3 \n\t" \
"movdqa %%xmm7, %%xmm4 \n\t" \
"mov %0, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"jb 1b \n\t"
 
if (offset) {
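/* writing at an offset: rotate the 8-byte dither pattern right by 3 bytes first
* (mirrors what dither_8to16() does in the MMX template) */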
__asm__ volatile(
"movq %5, %%xmm3 \n\t"
"movdqa %%xmm3, %%xmm4 \n\t"
"psrlq $24, %%xmm3 \n\t"
"psllq $40, %%xmm4 \n\t"
"por %%xmm4, %%xmm3 \n\t"
MAIN_FUNCTION
:: "g" (filter),
"r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
"m"(filterSize), "m"(((uint64_t *) dither)[0])
: XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
"%"REG_d, "%"REG_S, "%"REG_c
);
} else {
__asm__ volatile(
"movq %5, %%xmm3 \n\t"
MAIN_FUNCTION
:: "g" (filter),
"r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset),
"m"(filterSize), "m"(((uint64_t *) dither)[0])
: XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , "%xmm4" , "%xmm5" , "%xmm7" ,)
"%"REG_d, "%"REG_S, "%"REG_c
);
}
}
#endif
 
#endif /* HAVE_INLINE_ASM */
 
#define SCALE_FUNC(filter_n, from_bpc, to_bpc, opt) \
void ff_hscale ## from_bpc ## to ## to_bpc ## _ ## filter_n ## _ ## opt( \
SwsContext *c, int16_t *data, \
int dstW, const uint8_t *src, \
const int16_t *filter, \
const int32_t *filterPos, int filterSize)
 
#define SCALE_FUNCS(filter_n, opt) \
SCALE_FUNC(filter_n, 8, 15, opt); \
SCALE_FUNC(filter_n, 9, 15, opt); \
SCALE_FUNC(filter_n, 10, 15, opt); \
SCALE_FUNC(filter_n, 12, 15, opt); \
SCALE_FUNC(filter_n, 14, 15, opt); \
SCALE_FUNC(filter_n, 16, 15, opt); \
SCALE_FUNC(filter_n, 8, 19, opt); \
SCALE_FUNC(filter_n, 9, 19, opt); \
SCALE_FUNC(filter_n, 10, 19, opt); \
SCALE_FUNC(filter_n, 12, 19, opt); \
SCALE_FUNC(filter_n, 14, 19, opt); \
SCALE_FUNC(filter_n, 16, 19, opt)
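/* e.g. SCALE_FUNCS(4, sse2) declares ff_hscale8to15_4_sse2, ff_hscale8to19_4_sse2, ... */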
 
#define SCALE_FUNCS_MMX(opt) \
SCALE_FUNCS(4, opt); \
SCALE_FUNCS(8, opt); \
SCALE_FUNCS(X, opt)
 
#define SCALE_FUNCS_SSE(opt) \
SCALE_FUNCS(4, opt); \
SCALE_FUNCS(8, opt); \
SCALE_FUNCS(X4, opt); \
SCALE_FUNCS(X8, opt)
 
#if ARCH_X86_32
SCALE_FUNCS_MMX(mmx);
#endif
SCALE_FUNCS_SSE(sse2);
SCALE_FUNCS_SSE(ssse3);
SCALE_FUNCS_SSE(sse4);
 
#define VSCALEX_FUNC(size, opt) \
void ff_yuv2planeX_ ## size ## _ ## opt(const int16_t *filter, int filterSize, \
const int16_t **src, uint8_t *dest, int dstW, \
const uint8_t *dither, int offset)
#define VSCALEX_FUNCS(opt) \
VSCALEX_FUNC(8, opt); \
VSCALEX_FUNC(9, opt); \
VSCALEX_FUNC(10, opt)
 
#if ARCH_X86_32
VSCALEX_FUNCS(mmxext);
#endif
VSCALEX_FUNCS(sse2);
VSCALEX_FUNCS(sse4);
VSCALEX_FUNC(16, sse4);
VSCALEX_FUNCS(avx);
 
#define VSCALE_FUNC(size, opt) \
void ff_yuv2plane1_ ## size ## _ ## opt(const int16_t *src, uint8_t *dst, int dstW, \
const uint8_t *dither, int offset)
#define VSCALE_FUNCS(opt1, opt2) \
VSCALE_FUNC(8, opt1); \
VSCALE_FUNC(9, opt2); \
VSCALE_FUNC(10, opt2); \
VSCALE_FUNC(16, opt1)
 
#if ARCH_X86_32
VSCALE_FUNCS(mmx, mmxext);
#endif
VSCALE_FUNCS(sse2, sse2);
VSCALE_FUNC(16, sse4);
VSCALE_FUNCS(avx, avx);
 
#define INPUT_Y_FUNC(fmt, opt) \
void ff_ ## fmt ## ToY_ ## opt(uint8_t *dst, const uint8_t *src, \
const uint8_t *unused1, const uint8_t *unused2, \
int w, uint32_t *unused)
#define INPUT_UV_FUNC(fmt, opt) \
void ff_ ## fmt ## ToUV_ ## opt(uint8_t *dstU, uint8_t *dstV, \
const uint8_t *unused0, \
const uint8_t *src1, \
const uint8_t *src2, \
int w, uint32_t *unused)
#define INPUT_FUNC(fmt, opt) \
INPUT_Y_FUNC(fmt, opt); \
INPUT_UV_FUNC(fmt, opt)
#define INPUT_FUNCS(opt) \
INPUT_FUNC(uyvy, opt); \
INPUT_FUNC(yuyv, opt); \
INPUT_UV_FUNC(nv12, opt); \
INPUT_UV_FUNC(nv21, opt); \
INPUT_FUNC(rgba, opt); \
INPUT_FUNC(bgra, opt); \
INPUT_FUNC(argb, opt); \
INPUT_FUNC(abgr, opt); \
INPUT_FUNC(rgb24, opt); \
INPUT_FUNC(bgr24, opt)
 
#if ARCH_X86_32
INPUT_FUNCS(mmx);
#endif
INPUT_FUNCS(sse2);
INPUT_FUNCS(ssse3);
INPUT_FUNCS(avx);
 
av_cold void ff_sws_init_swscale_x86(SwsContext *c)
{
int cpu_flags = av_get_cpu_flags();
 
#if HAVE_MMX_INLINE
if (INLINE_MMX(cpu_flags))
sws_init_swscale_mmx(c);
#endif
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags))
sws_init_swscale_mmxext(c);
if (cpu_flags & AV_CPU_FLAG_SSE3) {
if (c->use_mmx_vfilter && !(c->flags & SWS_ACCURATE_RND))
c->yuv2planeX = yuv2yuvX_sse3;
}
#endif
 
#define ASSIGN_SCALE_FUNC2(hscalefn, filtersize, opt1, opt2) do { \
if (c->srcBpc == 8) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale8to15_ ## filtersize ## _ ## opt2 : \
ff_hscale8to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 9) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale9to15_ ## filtersize ## _ ## opt2 : \
ff_hscale9to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 10) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale10to15_ ## filtersize ## _ ## opt2 : \
ff_hscale10to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 12) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale12to15_ ## filtersize ## _ ## opt2 : \
ff_hscale12to19_ ## filtersize ## _ ## opt1; \
} else if (c->srcBpc == 14 || ((c->srcFormat==AV_PIX_FMT_PAL8||isAnyRGB(c->srcFormat)) && av_pix_fmt_desc_get(c->srcFormat)->comp[0].depth_minus1<15)) { \
hscalefn = c->dstBpc <= 14 ? ff_hscale14to15_ ## filtersize ## _ ## opt2 : \
ff_hscale14to19_ ## filtersize ## _ ## opt1; \
} else { /* c->srcBpc == 16 */ \
av_assert0(c->srcBpc == 16);\
hscalefn = c->dstBpc <= 14 ? ff_hscale16to15_ ## filtersize ## _ ## opt2 : \
ff_hscale16to19_ ## filtersize ## _ ## opt1; \
} \
} while (0)
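/* filter sizes 4 and 8 get dedicated kernels; anything else uses the generic X
* loop (split into X4/X8 variants for the SSE versions below) */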
#define ASSIGN_MMX_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
switch (filtersize) { \
case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
default: ASSIGN_SCALE_FUNC2(hscalefn, X, opt1, opt2); break; \
}
#define ASSIGN_VSCALEX_FUNC(vscalefn, opt, do_16_case, condition_8bit) \
switch(c->dstBpc){ \
case 16: do_16_case; break; \
case 10: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_10_ ## opt; break; \
case 9: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2planeX_9_ ## opt; break; \
case 8: if ((condition_8bit) && !c->use_mmx_vfilter) vscalefn = ff_yuv2planeX_8_ ## opt; break; \
}
#define ASSIGN_VSCALE_FUNC(vscalefn, opt1, opt2, opt2chk) \
switch(c->dstBpc){ \
case 16: if (!isBE(c->dstFormat)) vscalefn = ff_yuv2plane1_16_ ## opt1; break; \
case 10: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_10_ ## opt2; break; \
case 9: if (!isBE(c->dstFormat) && opt2chk) vscalefn = ff_yuv2plane1_9_ ## opt2; break; \
case 8: vscalefn = ff_yuv2plane1_8_ ## opt1; break; \
default: av_assert0(c->dstBpc>8); \
}
#define case_rgb(x, X, opt) \
case AV_PIX_FMT_ ## X: \
c->lumToYV12 = ff_ ## x ## ToY_ ## opt; \
if (!c->chrSrcHSubSample) \
c->chrToYV12 = ff_ ## x ## ToUV_ ## opt; \
break
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
ASSIGN_MMX_SCALE_FUNC(c->hyScale, c->hLumFilterSize, mmx, mmx);
ASSIGN_MMX_SCALE_FUNC(c->hcScale, c->hChrFilterSize, mmx, mmx);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, mmx, mmxext, cpu_flags & AV_CPU_FLAG_MMXEXT);
 
switch (c->srcFormat) {
case AV_PIX_FMT_YA8:
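/* YA8 interleaves luma and alpha the way YUYV/UYVY interleave luma and chroma:
* the YUYV luma reader picks the even bytes (Y), the UYVY one the odd bytes (A) */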
c->lumToYV12 = ff_yuyvToY_mmx;
if (c->alpPixBuf)
c->alpToYV12 = ff_uyvyToY_mmx;
break;
case AV_PIX_FMT_YUYV422:
c->lumToYV12 = ff_yuyvToY_mmx;
c->chrToYV12 = ff_yuyvToUV_mmx;
break;
case AV_PIX_FMT_UYVY422:
c->lumToYV12 = ff_uyvyToY_mmx;
c->chrToYV12 = ff_uyvyToUV_mmx;
break;
case AV_PIX_FMT_NV12:
c->chrToYV12 = ff_nv12ToUV_mmx;
break;
case AV_PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_mmx;
break;
case_rgb(rgb24, RGB24, mmx);
case_rgb(bgr24, BGR24, mmx);
case_rgb(bgra, BGRA, mmx);
case_rgb(rgba, RGBA, mmx);
case_rgb(abgr, ABGR, mmx);
case_rgb(argb, ARGB, mmx);
default:
break;
}
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, mmxext, , 1);
}
#endif /* ARCH_X86_32 */
#define ASSIGN_SSE_SCALE_FUNC(hscalefn, filtersize, opt1, opt2) \
switch (filtersize) { \
case 4: ASSIGN_SCALE_FUNC2(hscalefn, 4, opt1, opt2); break; \
case 8: ASSIGN_SCALE_FUNC2(hscalefn, 8, opt1, opt2); break; \
default: if (filtersize & 4) ASSIGN_SCALE_FUNC2(hscalefn, X4, opt1, opt2); \
else ASSIGN_SCALE_FUNC2(hscalefn, X8, opt1, opt2); \
break; \
}
if (EXTERNAL_SSE2(cpu_flags)) {
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse2, sse2);
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse2, sse2);
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse2, ,
HAVE_ALIGNED_STACK || ARCH_X86_64);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, sse2, sse2, 1);
 
switch (c->srcFormat) {
case AV_PIX_FMT_YA8:
c->lumToYV12 = ff_yuyvToY_sse2;
if (c->alpPixBuf)
c->alpToYV12 = ff_uyvyToY_sse2;
break;
case AV_PIX_FMT_YUYV422:
c->lumToYV12 = ff_yuyvToY_sse2;
c->chrToYV12 = ff_yuyvToUV_sse2;
break;
case AV_PIX_FMT_UYVY422:
c->lumToYV12 = ff_uyvyToY_sse2;
c->chrToYV12 = ff_uyvyToUV_sse2;
break;
case AV_PIX_FMT_NV12:
c->chrToYV12 = ff_nv12ToUV_sse2;
break;
case AV_PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_sse2;
break;
case_rgb(rgb24, RGB24, sse2);
case_rgb(bgr24, BGR24, sse2);
case_rgb(bgra, BGRA, sse2);
case_rgb(rgba, RGBA, sse2);
case_rgb(abgr, ABGR, sse2);
case_rgb(argb, ARGB, sse2);
default:
break;
}
}
if (EXTERNAL_SSSE3(cpu_flags)) {
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, ssse3, ssse3);
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, ssse3, ssse3);
switch (c->srcFormat) {
case_rgb(rgb24, RGB24, ssse3);
case_rgb(bgr24, BGR24, ssse3);
default:
break;
}
}
if (EXTERNAL_SSE4(cpu_flags)) {
/* the Xto15 variants don't need special sse4 functions; the ssse3 ones are reused */
ASSIGN_SSE_SCALE_FUNC(c->hyScale, c->hLumFilterSize, sse4, ssse3);
ASSIGN_SSE_SCALE_FUNC(c->hcScale, c->hChrFilterSize, sse4, ssse3);
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, sse4,
if (!isBE(c->dstFormat)) c->yuv2planeX = ff_yuv2planeX_16_sse4,
HAVE_ALIGNED_STACK || ARCH_X86_64);
if (c->dstBpc == 16 && !isBE(c->dstFormat))
c->yuv2plane1 = ff_yuv2plane1_16_sse4;
}
 
if (EXTERNAL_AVX(cpu_flags)) {
ASSIGN_VSCALEX_FUNC(c->yuv2planeX, avx, ,
HAVE_ALIGNED_STACK || ARCH_X86_64);
ASSIGN_VSCALE_FUNC(c->yuv2plane1, avx, avx, 1);
 
switch (c->srcFormat) {
case AV_PIX_FMT_YUYV422:
c->chrToYV12 = ff_yuyvToUV_avx;
break;
case AV_PIX_FMT_UYVY422:
c->chrToYV12 = ff_uyvyToUV_avx;
break;
case AV_PIX_FMT_NV12:
c->chrToYV12 = ff_nv12ToUV_avx;
break;
case AV_PIX_FMT_NV21:
c->chrToYV12 = ff_nv21ToUV_avx;
break;
case_rgb(rgb24, RGB24, avx);
case_rgb(bgr24, BGR24, avx);
case_rgb(bgra, BGRA, avx);
case_rgb(rgba, RGBA, avx);
case_rgb(abgr, ABGR, avx);
case_rgb(argb, ARGB, avx);
default:
break;
}
}
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/x86/swscale_template.c
0,0 → 1,1576
/*
* Copyright (C) 2001-2011 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <stdint.h>
 
#include "libavutil/x86/asm.h"
#include "libswscale/swscale_internal.h"
 
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef MOVNTQ2
#undef PREFETCH
 
 
#if COMPILE_TEMPLATE_MMXEXT
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#define MOVNTQ2 "movntq "
#else
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#define MOVNTQ2 "movq "
#endif
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)
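/* MOVNTQ is a non-temporal (cache-bypassing) store on MMXEXT builds and a plain movq otherwise */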
 
#if !COMPILE_TEMPLATE_MMXEXT
static av_always_inline void
dither_8to16(const uint8_t *srcDither, int rot)
{
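/* expand the 8 dither bytes to words in mm3/mm4; when rot is set, the pattern
* is first rotated right by 3 bytes */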
if (rot) {
__asm__ volatile("pxor %%mm0, %%mm0\n\t"
"movq (%0), %%mm3\n\t"
"movq %%mm3, %%mm4\n\t"
"psrlq $24, %%mm3\n\t"
"psllq $40, %%mm4\n\t"
"por %%mm4, %%mm3\n\t"
"movq %%mm3, %%mm4\n\t"
"punpcklbw %%mm0, %%mm3\n\t"
"punpckhbw %%mm0, %%mm4\n\t"
:: "r"(srcDither)
);
} else {
__asm__ volatile("pxor %%mm0, %%mm0\n\t"
"movq (%0), %%mm3\n\t"
"movq %%mm3, %%mm4\n\t"
"punpcklbw %%mm0, %%mm3\n\t"
"punpckhbw %%mm0, %%mm4\n\t"
:: "r"(srcDither)
);
}
}
#endif
 
static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
const int16_t **src, uint8_t *dest, int dstW,
const uint8_t *dither, int offset)
{
dither_8to16(dither, offset);
filterSize--;
__asm__ volatile(
"movd %0, %%mm1\n\t"
"punpcklwd %%mm1, %%mm1\n\t"
"punpckldq %%mm1, %%mm1\n\t"
"psllw $3, %%mm1\n\t"
"paddw %%mm1, %%mm3\n\t"
"paddw %%mm1, %%mm4\n\t"
"psraw $4, %%mm3\n\t"
"psraw $4, %%mm4\n\t"
::"m"(filterSize)
);
 
__asm__ volatile(\
"movq %%mm3, %%mm6\n\t"
"movq %%mm4, %%mm7\n\t"
"movl %3, %%ecx\n\t"
"mov %0, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
".p2align 4 \n\t" /* FIXME Unroll? */\
"1: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\
"movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
" jnz 1b \n\t"\
"psraw $3, %%mm3 \n\t"\
"psraw $3, %%mm4 \n\t"\
"packuswb %%mm4, %%mm3 \n\t"
MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t"
"add $8, %%"REG_c" \n\t"\
"cmp %2, %%"REG_c" \n\t"\
"movq %%mm6, %%mm3\n\t"
"movq %%mm7, %%mm4\n\t"
"mov %0, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"jb 1b \n\t"\
:: "g" (filter),
"r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
: "%"REG_d, "%"REG_S, "%"REG_c
);
}
 
#define YSCALEYUV2PACKEDX_UV \
__asm__ volatile(\
"xor %%"REG_a", %%"REG_a" \n\t"\
".p2align 4 \n\t"\
"nop \n\t"\
"1: \n\t"\
"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
"movq %%mm3, %%mm4 \n\t"\
".p2align 4 \n\t"\
"2: \n\t"\
"movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
"movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
"add %6, %%"REG_S" \n\t" \
"movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"pmulhw %%mm0, %%mm2 \n\t"\
"pmulhw %%mm0, %%mm5 \n\t"\
"paddw %%mm2, %%mm3 \n\t"\
"paddw %%mm5, %%mm4 \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
" jnz 2b \n\t"\
 
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
"lea "offset"(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
"movq "#dst1", "#dst2" \n\t"\
".p2align 4 \n\t"\
"2: \n\t"\
"movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
"movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
"movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
"add $16, %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"pmulhw "#coeff", "#src1" \n\t"\
"pmulhw "#coeff", "#src2" \n\t"\
"paddw "#src1", "#dst1" \n\t"\
"paddw "#src2", "#dst2" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
" jnz 2b \n\t"\
 
#define YSCALEYUV2PACKEDX \
YSCALEYUV2PACKEDX_UV \
YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \
 
#define YSCALEYUV2PACKEDX_END \
:: "r" (&c->redDither), \
"m" (dummy), "m" (dummy), "m" (dummy),\
"r" (dest), "m" (dstW_reg), "m"(uv_off) \
NAMED_CONSTRAINTS_ADD(bF8,bFC) \
: "%"REG_a, "%"REG_d, "%"REG_S \
);
 
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
__asm__ volatile(\
"xor %%"REG_a", %%"REG_a" \n\t"\
".p2align 4 \n\t"\
"nop \n\t"\
"1: \n\t"\
"lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"pxor %%mm4, %%mm4 \n\t"\
"pxor %%mm5, %%mm5 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
".p2align 4 \n\t"\
"2: \n\t"\
"movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
"add %6, %%"REG_S" \n\t" \
"movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
"mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
"movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
"movq %%mm0, %%mm3 \n\t"\
"punpcklwd %%mm1, %%mm0 \n\t"\
"punpckhwd %%mm1, %%mm3 \n\t"\
"movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
"pmaddwd %%mm1, %%mm0 \n\t"\
"pmaddwd %%mm1, %%mm3 \n\t"\
"paddd %%mm0, %%mm4 \n\t"\
"paddd %%mm3, %%mm5 \n\t"\
"add %6, %%"REG_S" \n\t" \
"movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
"mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
"add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"punpcklwd %%mm3, %%mm2 \n\t"\
"punpckhwd %%mm3, %%mm0 \n\t"\
"pmaddwd %%mm1, %%mm2 \n\t"\
"pmaddwd %%mm1, %%mm0 \n\t"\
"paddd %%mm2, %%mm6 \n\t"\
"paddd %%mm0, %%mm7 \n\t"\
" jnz 2b \n\t"\
"psrad $16, %%mm4 \n\t"\
"psrad $16, %%mm5 \n\t"\
"psrad $16, %%mm6 \n\t"\
"psrad $16, %%mm7 \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
"packssdw %%mm5, %%mm4 \n\t"\
"packssdw %%mm7, %%mm6 \n\t"\
"paddw %%mm0, %%mm4 \n\t"\
"paddw %%mm0, %%mm6 \n\t"\
"movq %%mm4, "U_TEMP"(%0) \n\t"\
"movq %%mm6, "V_TEMP"(%0) \n\t"\
 
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
"lea "offset"(%0), %%"REG_d" \n\t"\
"mov (%%"REG_d"), %%"REG_S" \n\t"\
"pxor %%mm1, %%mm1 \n\t"\
"pxor %%mm5, %%mm5 \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
".p2align 4 \n\t"\
"2: \n\t"\
"movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
"movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
"mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
"movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
"movq %%mm0, %%mm3 \n\t"\
"punpcklwd %%mm4, %%mm0 \n\t"\
"punpckhwd %%mm4, %%mm3 \n\t"\
"movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
"pmaddwd %%mm4, %%mm0 \n\t"\
"pmaddwd %%mm4, %%mm3 \n\t"\
"paddd %%mm0, %%mm1 \n\t"\
"paddd %%mm3, %%mm5 \n\t"\
"movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
"mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
"add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
"test %%"REG_S", %%"REG_S" \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"punpcklwd %%mm3, %%mm2 \n\t"\
"punpckhwd %%mm3, %%mm0 \n\t"\
"pmaddwd %%mm4, %%mm2 \n\t"\
"pmaddwd %%mm4, %%mm0 \n\t"\
"paddd %%mm2, %%mm7 \n\t"\
"paddd %%mm0, %%mm6 \n\t"\
" jnz 2b \n\t"\
"psrad $16, %%mm1 \n\t"\
"psrad $16, %%mm5 \n\t"\
"psrad $16, %%mm7 \n\t"\
"psrad $16, %%mm6 \n\t"\
"movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
"packssdw %%mm5, %%mm1 \n\t"\
"packssdw %%mm6, %%mm7 \n\t"\
"paddw %%mm0, %%mm1 \n\t"\
"paddw %%mm0, %%mm7 \n\t"\
"movq "U_TEMP"(%0), %%mm3 \n\t"\
"movq "V_TEMP"(%0), %%mm4 \n\t"\
 
#define YSCALEYUV2PACKEDX_ACCURATE \
YSCALEYUV2PACKEDX_ACCURATE_UV \
YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)
 
#define YSCALEYUV2RGBX \
"psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
"pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
"pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
"psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
"pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
 
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
"movq "#b", "#q2" \n\t" /* B */\
"movq "#r", "#t" \n\t" /* R */\
"punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
"punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
"punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
"punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
"movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
"movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
"punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
"punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
"punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
"punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
\
MOVNTQ( q0, (dst, index, 4))\
MOVNTQ( b, 8(dst, index, 4))\
MOVNTQ( q2, 16(dst, index, 4))\
MOVNTQ( q3, 24(dst, index, 4))\
\
"add $8, "#index" \n\t"\
"cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)
 
static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, int dstW, int dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_offx2;
 
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
"movq %%mm2, "U_TEMP"(%0) \n\t"
"movq %%mm4, "V_TEMP"(%0) \n\t"
"movq %%mm5, "Y_TEMP"(%0) \n\t"
YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
"movq "Y_TEMP"(%0), %%mm5 \n\t"
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
"packuswb %%mm7, %%mm1 \n\t"
WRITEBGR32(%4, "%5", %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
YSCALEYUV2PACKEDX_END
} else {
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
"pcmpeqd %%mm7, %%mm7 \n\t"
WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
YSCALEYUV2PACKEDX_END
}
}
 
static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, int dstW, int dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_offx2;
 
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
"packuswb %%mm7, %%mm1 \n\t"
WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
YSCALEYUV2PACKEDX_END
} else {
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
"pcmpeqd %%mm7, %%mm7 \n\t"
WRITEBGR32(%4, "%5", %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
YSCALEYUV2PACKEDX_END
}
}
 
static void RENAME(yuv2bgr32_X)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, int dstW, int dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_offx2;
 
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
"packuswb %%mm7, %%mm1 \n\t"
WRITEBGR32(%4, "%5", %%REGa, %%mm5, %%mm4, %%mm2, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
YSCALEYUV2PACKEDX_END
} else {
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
"pcmpeqd %%mm7, %%mm7 \n\t"
WRITEBGR32(%4, "%5", %%REGa, %%mm5, %%mm4, %%mm2, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
YSCALEYUV2PACKEDX_END
}
}
 
#define REAL_WRITERGB16(dst, dstw, index) \
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
"pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
"psrlq $3, %%mm2 \n\t"\
\
"movq %%mm2, %%mm1 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm5, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm5, %%mm1 \n\t"\
\
"psllq $3, %%mm3 \n\t"\
"psllq $3, %%mm4 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm4, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (dst, index, 2))\
MOVNTQ(%%mm1, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
"cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)
 
static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, int dstW, int dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_offx2;
 
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
"paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
"paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
WRITERGB16(%4, "%5", %%REGa)
YSCALEYUV2PACKEDX_END
}
 
static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, int dstW, int dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_offx2;
 
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
"paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
"paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
WRITERGB16(%4, "%5", %%REGa)
YSCALEYUV2PACKEDX_END
}
 
#define REAL_WRITERGB15(dst, dstw, index) \
"pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
"pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
"pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
"psrlq $3, %%mm2 \n\t"\
"psrlq $1, %%mm5 \n\t"\
\
"movq %%mm2, %%mm1 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm5, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm5, %%mm1 \n\t"\
\
"psllq $2, %%mm3 \n\t"\
"psllq $2, %%mm4 \n\t"\
\
"por %%mm3, %%mm2 \n\t"\
"por %%mm4, %%mm1 \n\t"\
\
MOVNTQ(%%mm2, (dst, index, 2))\
MOVNTQ(%%mm1, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
"cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)
 
static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, int dstW, int dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_offx2;
 
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
"paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
"paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
WRITERGB15(%4, "%5", %%REGa)
YSCALEYUV2PACKEDX_END
}
 
static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, int dstW, int dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_offx2;
 
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
"paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
"paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
WRITERGB15(%4, "%5", %%REGa)
YSCALEYUV2PACKEDX_END
}
 
#define WRITEBGR24MMX(dst, dstw, index) \
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq %%mm2, %%mm1 \n\t" /* B */\
"movq %%mm5, %%mm6 \n\t" /* R */\
"punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
"punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
"punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
"punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
"movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
"movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
"punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
"punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
"punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
"punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
\
"movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
"movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
"movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
"movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
\
"psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
"psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
"psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
"psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
\
"punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
"punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
"punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
"punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
\
"psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
"movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
"psllq $40, %%mm2 \n\t" /* GB000000 1 */\
"por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
MOVNTQ(%%mm0, (dst))\
\
"psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
"movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
"psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
"por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
MOVNTQ(%%mm6, 8(dst))\
\
"psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
"psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
"por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
MOVNTQ(%%mm5, 16(dst))\
\
"add $24, "#dst" \n\t"\
\
"add $8, "#index" \n\t"\
"cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
 
#define WRITEBGR24MMXEXT(dst, dstw, index) \
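/* uses pshufw, an MMXEXT instruction, hence the separate plain-MMX variant above */\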
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
"movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
"movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
"pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
"pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
"pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
\
"pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
"pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
"pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
\
"psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
"por %%mm1, %%mm6 \n\t"\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, (dst))\
\
"psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
"pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
"pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
"pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
\
"pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
"pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
"pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
\
"por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, 8(dst))\
\
"pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
"pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
"pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
\
"pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
"pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
"pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
\
"por %%mm1, %%mm3 \n\t"\
"por %%mm3, %%mm6 \n\t"\
MOVNTQ(%%mm6, 16(dst))\
\
"add $24, "#dst" \n\t"\
\
"add $8, "#index" \n\t"\
"cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
 
#if COMPILE_TEMPLATE_MMXEXT
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMXEXT(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index) WRITEBGR24MMX(dst, dstw, index)
#endif
 
#if HAVE_6REGS
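/* these writers need six free GP registers; on some x86-32 builds ebx (PIC
* register) or ebp (frame pointer) is reserved, so they are compiled out */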
static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, int dstW, int dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_offx2;
 
YSCALEYUV2PACKEDX_ACCURATE
YSCALEYUV2RGBX
"pxor %%mm7, %%mm7 \n\t"
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
"add %4, %%"REG_c" \n\t"
WRITEBGR24(%%REGc, "%5", %%REGa)
:: "r" (&c->redDither),
"m" (dummy), "m" (dummy), "m" (dummy),
"r" (dest), "m" (dstW_reg), "m"(uv_off)
NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
);
}
 
static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, int dstW, int dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_offx2;
 
YSCALEYUV2PACKEDX
YSCALEYUV2RGBX
"pxor %%mm7, %%mm7 \n\t"
"lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
"add %4, %%"REG_c" \n\t"
WRITEBGR24(%%REGc, "%5", %%REGa)
:: "r" (&c->redDither),
"m" (dummy), "m" (dummy), "m" (dummy),
"r" (dest), "m" (dstW_reg), "m"(uv_off)
NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
: "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
);
}
#endif /* HAVE_6REGS */
 
#define REAL_WRITEYUY2(dst, dstw, index) \
"packuswb %%mm3, %%mm3 \n\t"\
"packuswb %%mm4, %%mm4 \n\t"\
"packuswb %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm4, %%mm3 \n\t"\
"movq %%mm1, %%mm7 \n\t"\
"punpcklbw %%mm3, %%mm1 \n\t"\
"punpckhbw %%mm3, %%mm7 \n\t"\
\
MOVNTQ(%%mm1, (dst, index, 2))\
MOVNTQ(%%mm7, 8(dst, index, 2))\
\
"add $8, "#index" \n\t"\
"cmp "dstw", "#index" \n\t"\
" jb 1b \n\t"
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)
 
static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, int dstW, int dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_offx2;
 
YSCALEYUV2PACKEDX_ACCURATE
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
"psraw $3, %%mm3 \n\t"
"psraw $3, %%mm4 \n\t"
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
WRITEYUY2(%4, "%5", %%REGa)
YSCALEYUV2PACKEDX_END
}
 
static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
const int16_t **lumSrc, int lumFilterSize,
const int16_t *chrFilter, const int16_t **chrUSrc,
const int16_t **chrVSrc,
int chrFilterSize, const int16_t **alpSrc,
uint8_t *dest, int dstW, int dstY)
{
x86_reg dummy=0;
x86_reg dstW_reg = dstW;
x86_reg uv_off = c->uv_offx2;
 
YSCALEYUV2PACKEDX
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
"psraw $3, %%mm3 \n\t"
"psraw $3, %%mm4 \n\t"
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm7 \n\t"
WRITEYUY2(%4, "%5", %%REGa)
YSCALEYUV2PACKEDX_END
}
 
#define REAL_YSCALEYUV2RGB_UV(index, c) \
"xor "#index", "#index" \n\t"\
".p2align 4 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
 
#define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \
"movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
"movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
 
#define REAL_YSCALEYUV2RGB_COEFF(c) \
"pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
 
#define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2)
 
#define YSCALEYUV2RGB(index, c) \
REAL_YSCALEYUV2RGB_UV(index, c) \
REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \
REAL_YSCALEYUV2RGB_COEFF(c)
 
/**
* vertical bilinear scale YV12 to RGB
*/
static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf[2], uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y)
{
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
#if ARCH_X86_64
__asm__ volatile(
YSCALEYUV2RGB(%%r8, %5)
YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
"psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
"psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
"packuswb %%mm7, %%mm1 \n\t"
WRITEBGR32(%4, DSTW_OFFSET"(%5)", %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
"a" (&c->redDither),
"r" (abuf0), "r" (abuf1)
: "%r8"
);
#else
c->u_temp=(intptr_t)abuf0;
c->v_temp=(intptr_t)abuf1;
__asm__ volatile(
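/* ebx and ebp are needed as extra data registers here, so stash ebx in the
* context (ESP_OFFSET) and push ebp before entering the loop */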
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB(%%REGBP, %5)
"push %0 \n\t"
"push %1 \n\t"
"mov "U_TEMP"(%5), %0 \n\t"
"mov "V_TEMP"(%5), %1 \n\t"
YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
"psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
"psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
"packuswb %%mm7, %%mm1 \n\t"
"pop %1 \n\t"
"pop %0 \n\t"
WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
);
#endif
} else {
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB(%%REGBP, %5)
"pcmpeqd %%mm7, %%mm7 \n\t"
WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
);
}
}
 
static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf[2], uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y)
{
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB(%%REGBP, %5)
"pxor %%mm7, %%mm7 \n\t"
WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
);
}
 
static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf[2], uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y)
{
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB(%%REGBP, %5)
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
NAMED_CONSTRAINTS_ADD(bF8)
);
}
 
static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf[2], uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y)
{
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB(%%REGBP, %5)
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
NAMED_CONSTRAINTS_ADD(bF8,bFC)
);
}
 
#define REAL_YSCALEYUV2PACKED(index, c) \
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
"movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
"psraw $3, %%mm0 \n\t"\
"psraw $3, %%mm1 \n\t"\
"movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
"movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
"xor "#index", "#index" \n\t"\
".p2align 4 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
"psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
"movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
"pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
"pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
"psraw $7, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $7, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
"movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
"movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
"movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
"movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
"psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
"psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
"psraw $7, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $7, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
"paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
 
#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
 
static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf[2], uint8_t *dest,
int dstW, int yalpha, int uvalpha, int y)
{
const int16_t *buf0 = buf[0], *buf1 = buf[1],
*ubuf0 = ubuf[0], *ubuf1 = ubuf[1];
 
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2PACKED(%%REGBP, %5)
WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
);
}
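
/* Sketch only (kept in #if 0, never compiled): per-sample form of the
 * two-line blend performed by YSCALEYUV2PACKED above.  s0/s1 are samples
 * from two adjacent source lines and w is the pre-scaled fixed-point
 * weight from the filter tables; psubw/pmulhw/psraw/paddw implement this
 * expression in each 16-bit lane.  Names are illustrative assumptions. */
#if 0
static int blend_sample(int s0, int s1, int w)
{
    return (((s0 - s1) * w) >> 16) + (s1 >> 7);
}
#endif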
 
#define REAL_YSCALEYUV2RGB1(index, c) \
"xor "#index", "#index" \n\t"\
".p2align 4 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
"add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
"psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
 
#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
 
// vertical chrominance interpolation: average the two neighbouring chroma lines
#define REAL_YSCALEYUV2RGB1b(index, c) \
"xor "#index", "#index" \n\t"\
".p2align 4 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
"psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
"psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
"psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
"movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
"movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
"pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
"pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
/* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
"pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
"pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
"psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
"psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
"pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
"pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
/* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
"paddw %%mm3, %%mm4 \n\t"\
"movq %%mm2, %%mm0 \n\t"\
"movq %%mm5, %%mm6 \n\t"\
"movq %%mm4, %%mm3 \n\t"\
"punpcklwd %%mm2, %%mm2 \n\t"\
"punpcklwd %%mm5, %%mm5 \n\t"\
"punpcklwd %%mm4, %%mm4 \n\t"\
"paddw %%mm1, %%mm2 \n\t"\
"paddw %%mm1, %%mm5 \n\t"\
"paddw %%mm1, %%mm4 \n\t"\
"punpckhwd %%mm0, %%mm0 \n\t"\
"punpckhwd %%mm6, %%mm6 \n\t"\
"punpckhwd %%mm3, %%mm3 \n\t"\
"paddw %%mm7, %%mm0 \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw %%mm7, %%mm3 \n\t"\
/* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
"packuswb %%mm0, %%mm2 \n\t"\
"packuswb %%mm6, %%mm5 \n\t"\
"packuswb %%mm3, %%mm4 \n\t"\
 
#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
 
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
"movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
"movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
"psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
"psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
"packuswb %%mm1, %%mm7 \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
 
/**
* YV12 to RGB without scaling or interpolating
*/
static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, int y)
{
const int16_t *ubuf0 = ubuf[0];
const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
 
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
const int16_t *ubuf1 = ubuf[0];
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1(%%REGBP, %5)
YSCALEYUV2RGB1_ALPHA(%%REGBP)
WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
);
} else {
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1(%%REGBP, %5)
"pcmpeqd %%mm7, %%mm7 \n\t"
WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
);
}
} else {
const int16_t *ubuf1 = ubuf[1];
if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1b(%%REGBP, %5)
YSCALEYUV2RGB1_ALPHA(%%REGBP)
WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
);
} else {
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1b(%%REGBP, %5)
"pcmpeqd %%mm7, %%mm7 \n\t"
WRITEBGR32(%%REGb, DSTW_OFFSET"(%5)", %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
);
}
}
}
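
/* Sketch only (kept in #if 0, never compiled): the unscaled "_1" writers
 * pick between two chroma strategies.  For uvalpha < 2048 the nearest
 * chroma line is reused (YSCALEYUV2RGB1, psraw $4); otherwise the two
 * neighbouring lines are averaged (YSCALEYUV2RGB1b, paddw + psrlw $5,
 * which folds the same >>4 descaling into the average).  Names below are
 * illustrative assumptions. */
#if 0
static int chroma_1tap(const int16_t *u0, const int16_t *u1,
                       int x, int uvalpha)
{
    return uvalpha < 2048 ? u0[x] >> 4            /* reuse nearest line */
                          : (u0[x] + u1[x]) >> 5; /* average both lines */
}
#endif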
 
static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, int y)
{
const int16_t *ubuf0 = ubuf[0];
const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
 
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
const int16_t *ubuf1 = ubuf[0];
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1(%%REGBP, %5)
"pxor %%mm7, %%mm7 \n\t"
WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
);
} else {
const int16_t *ubuf1 = ubuf[1];
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1b(%%REGBP, %5)
"pxor %%mm7, %%mm7 \n\t"
WRITEBGR24(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
NAMED_CONSTRAINTS_ADD(ff_M24A,ff_M24C,ff_M24B)
);
}
}
 
static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, int y)
{
const int16_t *ubuf0 = ubuf[0];
const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
 
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
const int16_t *ubuf1 = ubuf[0];
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1(%%REGBP, %5)
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
NAMED_CONSTRAINTS_ADD(bF8)
);
} else {
const int16_t *ubuf1 = ubuf[1];
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1b(%%REGBP, %5)
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
WRITERGB15(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
NAMED_CONSTRAINTS_ADD(bF8)
);
}
}
 
static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, int y)
{
const int16_t *ubuf0 = ubuf[0];
const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
 
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
const int16_t *ubuf1 = ubuf[0];
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1(%%REGBP, %5)
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
NAMED_CONSTRAINTS_ADD(bF8,bFC)
);
} else {
const int16_t *ubuf1 = ubuf[1];
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2RGB1b(%%REGBP, %5)
"pxor %%mm7, %%mm7 \n\t"
/* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
"paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
"paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
"paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
WRITERGB16(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
NAMED_CONSTRAINTS_ADD(bF8,bFC)
);
}
}
 
#define REAL_YSCALEYUV2PACKED1(index, c) \
"xor "#index", "#index" \n\t"\
".p2align 4 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
"add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
"sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"psraw $7, %%mm3 \n\t" \
"psraw $7, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t" \
 
#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
 
#define REAL_YSCALEYUV2PACKED1b(index, c) \
"xor "#index", "#index" \n\t"\
".p2align 4 \n\t"\
"1: \n\t"\
"movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
"movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
"add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
"movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
"sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
"paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
"paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
"psrlw $8, %%mm3 \n\t" \
"psrlw $8, %%mm4 \n\t" \
"movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
"movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
"psraw $7, %%mm1 \n\t" \
"psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
 
static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
const int16_t *ubuf[2], const int16_t *vbuf[2],
const int16_t *abuf0, uint8_t *dest,
int dstW, int uvalpha, int y)
{
const int16_t *ubuf0 = ubuf[0];
const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1
 
if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
const int16_t *ubuf1 = ubuf[0];
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2PACKED1(%%REGBP, %5)
WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
);
} else {
const int16_t *ubuf1 = ubuf[1];
__asm__ volatile(
"mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
"mov %4, %%"REG_b" \n\t"
"push %%"REG_BP" \n\t"
YSCALEYUV2PACKED1b(%%REGBP, %5)
WRITEYUY2(%%REGb, DSTW_OFFSET"(%5)", %%REGBP)
"pop %%"REG_BP" \n\t"
"mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
:: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
"a" (&c->redDither)
);
}
}
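
/* Naming convention used by the writers above: "_1" functions read a
 * single source line (unscaled), "_2" functions blend two lines, and the
 * "_X" functions apply a full multi-tap vertical filter;
 * sws_init_swscale() below selects among them per destination format. */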
static av_cold void RENAME(sws_init_swscale)(SwsContext *c)
{
enum AVPixelFormat dstFormat = c->dstFormat;
 
c->use_mmx_vfilter= 0;
if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12
&& dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) {
if (c->flags & SWS_ACCURATE_RND) {
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break;
#if HAVE_6REGS
case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break;
#endif
case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break;
case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break;
case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break;
default: break;
}
}
} else {
c->use_mmx_vfilter= 1;
c->yuv2planeX = RENAME(yuv2yuvX );
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break;
case AV_PIX_FMT_BGR32: c->yuv2packedX = RENAME(yuv2bgr32_X); break;
#if HAVE_6REGS
case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break;
#endif
case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break;
case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break;
case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break;
default: break;
}
}
}
if (!(c->flags & SWS_FULL_CHR_H_INT)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32:
c->yuv2packed1 = RENAME(yuv2rgb32_1);
c->yuv2packed2 = RENAME(yuv2rgb32_2);
break;
case AV_PIX_FMT_BGR24:
c->yuv2packed1 = RENAME(yuv2bgr24_1);
c->yuv2packed2 = RENAME(yuv2bgr24_2);
break;
case AV_PIX_FMT_RGB555:
c->yuv2packed1 = RENAME(yuv2rgb555_1);
c->yuv2packed2 = RENAME(yuv2rgb555_2);
break;
case AV_PIX_FMT_RGB565:
c->yuv2packed1 = RENAME(yuv2rgb565_1);
c->yuv2packed2 = RENAME(yuv2rgb565_2);
break;
case AV_PIX_FMT_YUYV422:
c->yuv2packed1 = RENAME(yuv2yuyv422_1);
c->yuv2packed2 = RENAME(yuv2yuyv422_2);
break;
default:
break;
}
}
}
 
if (c->srcBpc == 8 && c->dstBpc <= 14) {
// Use the MMXEXT fast-bilinear scaler when it can be used; otherwise clear the fast-path pointers so the generic scaler is selected.
#if COMPILE_TEMPLATE_MMXEXT
if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) {
c->hyscale_fast = ff_hyscale_fast_mmxext;
c->hcscale_fast = ff_hcscale_fast_mmxext;
} else {
#endif /* COMPILE_TEMPLATE_MMXEXT */
c->hyscale_fast = NULL;
c->hcscale_fast = NULL;
#if COMPILE_TEMPLATE_MMXEXT
}
#endif /* COMPILE_TEMPLATE_MMXEXT */
}
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/x86/w64xmmtest.c
0,0 → 1,31
/*
* check XMM registers for clobbers on Win64
* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/x86/w64xmmtest.h"
#include "libswscale/swscale.h"
 
wrap(sws_scale(struct SwsContext *c, const uint8_t *const srcSlice[],
const int srcStride[], int srcSliceY, int srcSliceH,
uint8_t *const dst[], const int dstStride[]))
{
testxmmclobbers(sws_scale, c, srcSlice, srcStride, srcSliceY,
srcSliceH, dst, dstStride);
}
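
/* Descriptive note: wrap()/testxmmclobbers() come from
 * libavutil/x86/w64xmmtest.h; they interpose on sws_scale() and check that
 * the XMM registers which the Win64 ABI designates callee-saved survive
 * the call unmodified. */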
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/x86/yuv2rgb.c
0,0 → 1,119
/*
* software YUV to RGB converter
*
* Copyright (C) 2009 Konstantin Shishkov
*
* MMX/MMXEXT template stuff (needed for fast movntq support),
* 1,4,8bpp support and context / deglobalize stuff
* by Michael Niedermayer (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
 
#include "config.h"
#include "libswscale/rgb2rgb.h"
#include "libswscale/swscale.h"
#include "libswscale/swscale_internal.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/cpu.h"
 
#if HAVE_INLINE_ASM
 
#define DITHER1XBPP // only for MMX
 
/* hope these constant values are cache line aligned */
DECLARE_ASM_CONST(8, uint64_t, mmx_00ffw) = 0x00ff00ff00ff00ffULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_redmask) = 0xf8f8f8f8f8f8f8f8ULL;
DECLARE_ASM_CONST(8, uint64_t, mmx_grnmask) = 0xfcfcfcfcfcfcfcfcULL;
DECLARE_ASM_CONST(8, uint64_t, pb_e0) = 0xe0e0e0e0e0e0e0e0ULL;
DECLARE_ASM_CONST(8, uint64_t, pb_03) = 0x0303030303030303ULL;
DECLARE_ASM_CONST(8, uint64_t, pb_07) = 0x0707070707070707ULL;
 
//MMX versions
#if HAVE_MMX_INLINE && HAVE_6REGS
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 0
#define RENAME(a) a ## _mmx
#include "yuv2rgb_template.c"
#endif /* HAVE_MMX_INLINE && HAVE_6REGS */
 
// MMXEXT versions
#if HAVE_MMXEXT_INLINE && HAVE_6REGS
#undef RENAME
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define RENAME(a) a ## _mmxext
#include "yuv2rgb_template.c"
#endif /* HAVE_MMXEXT_INLINE && HAVE_6REGS */
 
#endif /* HAVE_INLINE_ASM */
 
av_cold SwsFunc ff_yuv2rgb_init_x86(SwsContext *c)
{
#if HAVE_MMX_INLINE && HAVE_6REGS
int cpu_flags = av_get_cpu_flags();
 
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB24:
return yuv420_rgb24_mmxext;
case AV_PIX_FMT_BGR24:
return yuv420_bgr24_mmxext;
}
}
#endif
 
if (INLINE_MMX(cpu_flags)) {
switch (c->dstFormat) {
case AV_PIX_FMT_RGB32:
if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
return yuva420_rgb32_mmx;
#endif
break;
} else
return yuv420_rgb32_mmx;
case AV_PIX_FMT_BGR32:
if (c->srcFormat == AV_PIX_FMT_YUVA420P) {
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
return yuva420_bgr32_mmx;
#endif
break;
} else
return yuv420_bgr32_mmx;
case AV_PIX_FMT_RGB24:
return yuv420_rgb24_mmx;
case AV_PIX_FMT_BGR24:
return yuv420_bgr24_mmx;
case AV_PIX_FMT_RGB565:
return yuv420_rgb16_mmx;
case AV_PIX_FMT_RGB555:
return yuv420_rgb15_mmx;
}
}
#endif /* HAVE_MMX_INLINE && HAVE_6REGS */
 
return NULL;
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/x86/yuv2rgb_template.c
0,0 → 1,467
/*
* software YUV to RGB converter
*
* Copyright (C) 2001-2007 Michael Niedermayer
* (c) 2010 Konstantin Shishkov
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <stdint.h>
 
#include "libavutil/x86/asm.h"
#include "libswscale/swscale_internal.h"
 
#undef MOVNTQ
#undef EMMS
#undef SFENCE
 
#if COMPILE_TEMPLATE_MMXEXT
#define MOVNTQ "movntq"
#define SFENCE "sfence"
#else
#define MOVNTQ "movq"
#define SFENCE " # nop"
#endif
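
/* Descriptive note: in the MMXEXT build MOVNTQ is the non-temporal store,
 * so the function epilogue (YUV2RGB_ENDFUNC below) issues SFENCE before
 * EMMS; in the plain-MMX build MOVNTQ is an ordinary movq and SFENCE
 * expands to a comment-only nop. */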
 
#define REG_BLUE "0"
#define REG_RED "1"
#define REG_GREEN "2"
#define REG_ALPHA "3"
 
#define YUV2RGB_LOOP(depth) \
h_size = (c->dstW + 7) & ~7; \
if (h_size * depth > FFABS(dstStride[0])) \
h_size -= 8; \
\
vshift = c->srcFormat != AV_PIX_FMT_YUV422P; \
\
__asm__ volatile ("pxor %mm4, %mm4\n\t"); \
for (y = 0; y < srcSliceH; y++) { \
uint8_t *image = dst[0] + (y + srcSliceY) * dstStride[0]; \
const uint8_t *py = src[0] + y * srcStride[0]; \
const uint8_t *pu = src[1] + (y >> vshift) * srcStride[1]; \
const uint8_t *pv = src[2] + (y >> vshift) * srcStride[2]; \
x86_reg index = -h_size / 2; \
 
#define YUV2RGB_INITIAL_LOAD \
__asm__ volatile ( \
"movq (%5, %0, 2), %%mm6\n\t" \
"movd (%2, %0), %%mm0\n\t" \
"movd (%3, %0), %%mm1\n\t" \
"1: \n\t" \
 
/* YUV2RGB core
* Conversion is performed in the usual way:
* R = Y' * Ycoef + Vred * V'
* G = Y' * Ycoef + Vgreen * V' + Ugreen * U'
* B = Y' * Ycoef + Ublue * U'
*
* where X' = X * 8 - Xoffset (multiplication is performed to increase
* precision a bit).
* Since it operates in the YUV420 colorspace, the Y component is
* additionally split into Y1 and Y2 for even and odd pixels.
*
* Input:
* mm0 - U (4 elems), mm1 - V (4 elems), mm6 - Y (8 elems), mm4 - zero register
* Output:
* mm1 - R, mm2 - G, mm0 - B
*/
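
/* Scalar sketch of the math above (kept in #if 0, never compiled).  The
 * ">> 16" stands in for pmulhw, which keeps the high word of a 16x16
 * signed multiply; parameter names mirror the comment and are assumptions,
 * not actual SwsContext fields.  The macro below is the MMX form. */
#if 0
static void yuv2rgb_pixel(int Y, int U, int V,
                          int Ycoef, int Ublue, int Ugreen,
                          int Vgreen, int Vred,
                          int Yoffset, int Uoffset, int Voffset,
                          int *R, int *G, int *B)
{
    int y = (((Y << 3) - Yoffset) * Ycoef) >> 16; /* Y' = Y*8 - Yoffset */
    int u =  (U << 3) - Uoffset;                  /* U' = U*8 - Uoffset */
    int v =  (V << 3) - Voffset;                  /* V' = V*8 - Voffset */

    *R = y + ((Vred   * v) >> 16);
    *G = y + ((Vgreen * v) >> 16) + ((Ugreen * u) >> 16);
    *B = y + ((Ublue  * u) >> 16);
}
#endif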
#define YUV2RGB \
/* convert Y, U, V into Y1', Y2', U', V' */ \
"movq %%mm6, %%mm7\n\t" \
"punpcklbw %%mm4, %%mm0\n\t" \
"punpcklbw %%mm4, %%mm1\n\t" \
"pand "MANGLE(mmx_00ffw)", %%mm6\n\t" \
"psrlw $8, %%mm7\n\t" \
"psllw $3, %%mm0\n\t" \
"psllw $3, %%mm1\n\t" \
"psllw $3, %%mm6\n\t" \
"psllw $3, %%mm7\n\t" \
"psubsw "U_OFFSET"(%4), %%mm0\n\t" \
"psubsw "V_OFFSET"(%4), %%mm1\n\t" \
"psubw "Y_OFFSET"(%4), %%mm6\n\t" \
"psubw "Y_OFFSET"(%4), %%mm7\n\t" \
\
/* multiply by coefficients */ \
"movq %%mm0, %%mm2\n\t" \
"movq %%mm1, %%mm3\n\t" \
"pmulhw "UG_COEFF"(%4), %%mm2\n\t" \
"pmulhw "VG_COEFF"(%4), %%mm3\n\t" \
"pmulhw "Y_COEFF" (%4), %%mm6\n\t" \
"pmulhw "Y_COEFF" (%4), %%mm7\n\t" \
"pmulhw "UB_COEFF"(%4), %%mm0\n\t" \
"pmulhw "VR_COEFF"(%4), %%mm1\n\t" \
"paddsw %%mm3, %%mm2\n\t" \
/* now: mm0 = UB, mm1 = VR, mm2 = CG */ \
/* mm6 = Y1, mm7 = Y2 */ \
\
/* produce RGB */ \
"movq %%mm7, %%mm3\n\t" \
"movq %%mm7, %%mm5\n\t" \
"paddsw %%mm0, %%mm3\n\t" \
"paddsw %%mm1, %%mm5\n\t" \
"paddsw %%mm2, %%mm7\n\t" \
"paddsw %%mm6, %%mm0\n\t" \
"paddsw %%mm6, %%mm1\n\t" \
"paddsw %%mm6, %%mm2\n\t" \
 
#define RGB_PACK_INTERLEAVE \
/* pack and interleave even/odd pixels */ \
"packuswb %%mm1, %%mm0\n\t" \
"packuswb %%mm5, %%mm3\n\t" \
"packuswb %%mm2, %%mm2\n\t" \
"movq %%mm0, %%mm1\n\n" \
"packuswb %%mm7, %%mm7\n\t" \
"punpcklbw %%mm3, %%mm0\n\t" \
"punpckhbw %%mm3, %%mm1\n\t" \
"punpcklbw %%mm7, %%mm2\n\t" \
 
#define YUV2RGB_ENDLOOP(depth) \
"movq 8 (%5, %0, 2), %%mm6\n\t" \
"movd 4 (%3, %0), %%mm1\n\t" \
"movd 4 (%2, %0), %%mm0\n\t" \
"add $"AV_STRINGIFY(depth * 8)", %1\n\t" \
"add $4, %0\n\t" \
"js 1b\n\t" \
 
#if COMPILE_TEMPLATE_MMXEXT
#undef RGB_PACK24_B_OPERANDS
#define RGB_PACK24_B_OPERANDS NAMED_CONSTRAINTS_ARRAY_ADD(mask1101,mask0110,mask0100,mask0010,mask1001)
#else
#undef RGB_PACK24_B_OPERANDS
#define RGB_PACK24_B_OPERANDS
#endif
 
#define YUV2RGB_OPERANDS \
: "+r" (index), "+r" (image) \
: "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
"r" (py - 2*index) \
NAMED_CONSTRAINTS_ADD(mmx_00ffw,pb_03,pb_07,mmx_redmask,pb_e0) \
RGB_PACK24_B_OPERANDS \
: "memory" \
); \
} \
 
#define YUV2RGB_OPERANDS_ALPHA \
: "+r" (index), "+r" (image) \
: "r" (pu - index), "r" (pv - index), "r"(&c->redDither), \
"r" (py - 2*index), "r" (pa - 2*index) \
NAMED_CONSTRAINTS_ADD(mmx_00ffw) \
: "memory" \
); \
} \
 
#define YUV2RGB_ENDFUNC \
__asm__ volatile (SFENCE"\n\t" \
"emms \n\t"); \
return srcSliceH; \
 
#define IF0(x)
#define IF1(x) x
 
#define RGB_PACK16(gmask, is15) \
"pand "MANGLE(mmx_redmask)", %%mm0\n\t" \
"pand "MANGLE(mmx_redmask)", %%mm1\n\t" \
"movq %%mm2, %%mm3\n\t" \
"psllw $"AV_STRINGIFY(3-is15)", %%mm2\n\t" \
"psrlw $"AV_STRINGIFY(5+is15)", %%mm3\n\t" \
"psrlw $3, %%mm0\n\t" \
IF##is15("psrlw $1, %%mm1\n\t") \
"pand "MANGLE(pb_e0)", %%mm2\n\t" \
"pand "MANGLE(gmask)", %%mm3\n\t" \
"por %%mm2, %%mm0\n\t" \
"por %%mm3, %%mm1\n\t" \
"movq %%mm0, %%mm2\n\t" \
"punpcklbw %%mm1, %%mm0\n\t" \
"punpckhbw %%mm1, %%mm2\n\t" \
MOVNTQ " %%mm0, (%1)\n\t" \
MOVNTQ " %%mm2, 8(%1)\n\t" \
 
#define DITHER_RGB \
"paddusb "BLUE_DITHER"(%4), %%mm0\n\t" \
"paddusb "GREEN_DITHER"(%4), %%mm2\n\t" \
"paddusb "RED_DITHER"(%4), %%mm1\n\t" \
 
#if !COMPILE_TEMPLATE_MMXEXT
static inline int RENAME(yuv420_rgb15)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
 
YUV2RGB_LOOP(2)
 
#ifdef DITHER1XBPP
c->blueDither = ff_dither8[y & 1];
c->greenDither = ff_dither8[y & 1];
c->redDither = ff_dither8[(y + 1) & 1];
#endif
 
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
#ifdef DITHER1XBPP
DITHER_RGB
#endif
RGB_PACK16(pb_03, 1)
 
YUV2RGB_ENDLOOP(2)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
 
static inline int RENAME(yuv420_rgb16)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
 
YUV2RGB_LOOP(2)
 
#ifdef DITHER1XBPP
c->blueDither = ff_dither8[y & 1];
c->greenDither = ff_dither4[y & 1];
c->redDither = ff_dither8[(y + 1) & 1];
#endif
 
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
#ifdef DITHER1XBPP
DITHER_RGB
#endif
RGB_PACK16(pb_07, 0)
 
YUV2RGB_ENDLOOP(2)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
#endif /* !COMPILE_TEMPLATE_MMXEXT */
 
#define RGB_PACK24(blue, red)\
"packuswb %%mm3, %%mm0 \n" /* R0 R2 R4 R6 R1 R3 R5 R7 */\
"packuswb %%mm5, %%mm1 \n" /* B0 B2 B4 B6 B1 B3 B5 B7 */\
"packuswb %%mm7, %%mm2 \n" /* G0 G2 G4 G6 G1 G3 G5 G7 */\
"movq %%mm"red", %%mm3 \n"\
"movq %%mm"blue", %%mm6 \n"\
"psrlq $32, %%mm"red" \n" /* R1 R3 R5 R7 */\
"punpcklbw %%mm2, %%mm3 \n" /* R0 G0 R2 G2 R4 G4 R6 G6 */\
"punpcklbw %%mm"red", %%mm6 \n" /* B0 R1 B2 R3 B4 R5 B6 R7 */\
"movq %%mm3, %%mm5 \n"\
"punpckhbw %%mm"blue", %%mm2 \n" /* G1 B1 G3 B3 G5 B5 G7 B7 */\
"punpcklwd %%mm6, %%mm3 \n" /* R0 G0 B0 R1 R2 G2 B2 R3 */\
"punpckhwd %%mm6, %%mm5 \n" /* R4 G4 B4 R5 R6 G6 B6 R7 */\
RGB_PACK24_B
 
#if COMPILE_TEMPLATE_MMXEXT
DECLARE_ASM_CONST(8, int16_t, mask1101[4]) = {-1,-1, 0,-1};
DECLARE_ASM_CONST(8, int16_t, mask0010[4]) = { 0, 0,-1, 0};
DECLARE_ASM_CONST(8, int16_t, mask0110[4]) = { 0,-1,-1, 0};
DECLARE_ASM_CONST(8, int16_t, mask1001[4]) = {-1, 0, 0,-1};
DECLARE_ASM_CONST(8, int16_t, mask0100[4]) = { 0,-1, 0, 0};
#undef RGB_PACK24_B
#define RGB_PACK24_B\
"pshufw $0xc6, %%mm2, %%mm1 \n"\
"pshufw $0x84, %%mm3, %%mm6 \n"\
"pshufw $0x38, %%mm5, %%mm7 \n"\
"pand "MANGLE(mask1101)", %%mm6 \n" /* R0 G0 B0 R1 -- -- R2 G2 */\
"movq %%mm1, %%mm0 \n"\
"pand "MANGLE(mask0110)", %%mm7 \n" /* -- -- R6 G6 B6 R7 -- -- */\
"movq %%mm1, %%mm2 \n"\
"pand "MANGLE(mask0100)", %%mm1 \n" /* -- -- G3 B3 -- -- -- -- */\
"psrlq $48, %%mm3 \n" /* B2 R3 -- -- -- -- -- -- */\
"pand "MANGLE(mask0010)", %%mm0 \n" /* -- -- -- -- G1 B1 -- -- */\
"psllq $32, %%mm5 \n" /* -- -- -- -- R4 G4 B4 R5 */\
"pand "MANGLE(mask1001)", %%mm2 \n" /* G5 B5 -- -- -- -- G7 B7 */\
"por %%mm3, %%mm1 \n"\
"por %%mm6, %%mm0 \n"\
"por %%mm5, %%mm1 \n"\
"por %%mm7, %%mm2 \n"\
MOVNTQ" %%mm0, (%1) \n"\
MOVNTQ" %%mm1, 8(%1) \n"\
MOVNTQ" %%mm2, 16(%1) \n"\
 
#else
#undef RGB_PACK24_B
#define RGB_PACK24_B\
"movd %%mm3, (%1) \n" /* R0 G0 B0 R1 */\
"movd %%mm2, 4(%1) \n" /* G1 B1 */\
"psrlq $32, %%mm3 \n"\
"psrlq $16, %%mm2 \n"\
"movd %%mm3, 6(%1) \n" /* R2 G2 B2 R3 */\
"movd %%mm2, 10(%1) \n" /* G3 B3 */\
"psrlq $16, %%mm2 \n"\
"movd %%mm5, 12(%1) \n" /* R4 G4 B4 R5 */\
"movd %%mm2, 16(%1) \n" /* G5 B5 */\
"psrlq $32, %%mm5 \n"\
"movd %%mm2, 20(%1) \n" /* -- -- G7 B7 */\
"movd %%mm5, 18(%1) \n" /* R6 G6 B6 R7 */\
 
#endif
 
static inline int RENAME(yuv420_rgb24)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
 
YUV2RGB_LOOP(3)
 
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK24(REG_BLUE, REG_RED)
 
YUV2RGB_ENDLOOP(3)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
 
static inline int RENAME(yuv420_bgr24)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
 
YUV2RGB_LOOP(3)
 
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK24(REG_RED, REG_BLUE)
 
YUV2RGB_ENDLOOP(3)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
 
 
#define SET_EMPTY_ALPHA \
"pcmpeqd %%mm"REG_ALPHA", %%mm"REG_ALPHA"\n\t" /* set alpha to 0xFF */ \
 
#define LOAD_ALPHA \
"movq (%6, %0, 2), %%mm"REG_ALPHA"\n\t" \
 
#define RGB_PACK32(red, green, blue, alpha) \
"movq %%mm"blue", %%mm5\n\t" \
"movq %%mm"red", %%mm6\n\t" \
"punpckhbw %%mm"green", %%mm5\n\t" \
"punpcklbw %%mm"green", %%mm"blue"\n\t" \
"punpckhbw %%mm"alpha", %%mm6\n\t" \
"punpcklbw %%mm"alpha", %%mm"red"\n\t" \
"movq %%mm"blue", %%mm"green"\n\t" \
"movq %%mm5, %%mm"alpha"\n\t" \
"punpcklwd %%mm"red", %%mm"blue"\n\t" \
"punpckhwd %%mm"red", %%mm"green"\n\t" \
"punpcklwd %%mm6, %%mm5\n\t" \
"punpckhwd %%mm6, %%mm"alpha"\n\t" \
MOVNTQ " %%mm"blue", 0(%1)\n\t" \
MOVNTQ " %%mm"green", 8(%1)\n\t" \
MOVNTQ " %%mm5, 16(%1)\n\t" \
MOVNTQ " %%mm"alpha", 24(%1)\n\t" \
 
#if !COMPILE_TEMPLATE_MMXEXT
static inline int RENAME(yuv420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
 
YUV2RGB_LOOP(4)
 
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
SET_EMPTY_ALPHA
RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
 
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
 
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
static inline int RENAME(yuva420_rgb32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
 
YUV2RGB_LOOP(4)
 
const uint8_t *pa = src[3] + y * srcStride[3];
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
LOAD_ALPHA
RGB_PACK32(REG_RED, REG_GREEN, REG_BLUE, REG_ALPHA)
 
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS_ALPHA
YUV2RGB_ENDFUNC
}
#endif
 
static inline int RENAME(yuv420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
 
YUV2RGB_LOOP(4)
 
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
SET_EMPTY_ALPHA
RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
 
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS
YUV2RGB_ENDFUNC
}
 
#if HAVE_7REGS && CONFIG_SWSCALE_ALPHA
static inline int RENAME(yuva420_bgr32)(SwsContext *c, const uint8_t *src[],
int srcStride[],
int srcSliceY, int srcSliceH,
uint8_t *dst[], int dstStride[])
{
int y, h_size, vshift;
 
YUV2RGB_LOOP(4)
 
const uint8_t *pa = src[3] + y * srcStride[3];
YUV2RGB_INITIAL_LOAD
YUV2RGB
RGB_PACK_INTERLEAVE
LOAD_ALPHA
RGB_PACK32(REG_BLUE, REG_GREEN, REG_RED, REG_ALPHA)
 
YUV2RGB_ENDLOOP(4)
YUV2RGB_OPERANDS_ALPHA
YUV2RGB_ENDFUNC
}
#endif
 
#endif /* !COMPILE_TEMPLATE_MMXEXT */
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswscale/yuv2rgb.c
0,0 → 1,977
/*
* software YUV to RGB converter
*
* Copyright (C) 2009 Konstantin Shishkov
*
* 1,4,8bpp support and context / deglobalize stuff
* by Michael Niedermayer (michaelni@gmx.at)
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <stdio.h>
#include <stdlib.h>
#include <inttypes.h>
 
#include "libavutil/cpu.h"
#include "libavutil/bswap.h"
#include "config.h"
#include "rgb2rgb.h"
#include "swscale.h"
#include "swscale_internal.h"
#include "libavutil/pixdesc.h"
 
const int32_t ff_yuv2rgb_coeffs[8][4] = {
{ 117504, 138453, 13954, 34903 }, /* no sequence_display_extension */
{ 117504, 138453, 13954, 34903 }, /* ITU-R Rec. 709 (1990) */
{ 104597, 132201, 25675, 53279 }, /* unspecified */
{ 104597, 132201, 25675, 53279 }, /* reserved */
{ 104448, 132798, 24759, 53109 }, /* FCC */
{ 104597, 132201, 25675, 53279 }, /* ITU-R Rec. 624-4 System B, G */
{ 104597, 132201, 25675, 53279 }, /* SMPTE 170M */
{ 117579, 136230, 16907, 35559 } /* SMPTE 240M (1987) */
};
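
/* Descriptive note: the rows above follow the MPEG-2 matrix_coefficients
 * numbering (0-7, as the per-row comments indicate); each entry is
 * {crv, cbu, cgu, cgv} in 16.16 fixed point, consumed by
 * ff_yuv2rgb_c_init_tables() below. */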
 
const int *sws_getCoefficients(int colorspace)
{
if (colorspace > 7 || colorspace < 0)
colorspace = SWS_CS_DEFAULT;
return ff_yuv2rgb_coeffs[colorspace];
}
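
/* Typical use (sketch, not from this file): pass a SWS_CS_* constant and
 * feed the returned table to sws_setColorspaceDetails(), e.g.
 *
 *     const int *coeffs = sws_getCoefficients(SWS_CS_ITU709);
 *     sws_setColorspaceDetails(ctx, coeffs, 0, coeffs, 0,
 *                              0, 1 << 16, 1 << 16);
 *
 * where "ctx" is an already-configured SwsContext and the trailing values
 * are the neutral brightness/contrast/saturation defaults. */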
 
#define LOADCHROMA(i) \
U = pu[i]; \
V = pv[i]; \
r = (void *)c->table_rV[V+YUVRGB_TABLE_HEADROOM]; \
g = (void *)(c->table_gU[U+YUVRGB_TABLE_HEADROOM] + c->table_gV[V+YUVRGB_TABLE_HEADROOM]); \
b = (void *)c->table_bU[U+YUVRGB_TABLE_HEADROOM];
 
#define PUTRGB(dst, src, i) \
Y = src[2 * i]; \
dst[2 * i] = r[Y] + g[Y] + b[Y]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y] + g[Y] + b[Y];
 
#define PUTRGB24(dst, src, i) \
Y = src[2 * i]; \
dst[6 * i + 0] = r[Y]; \
dst[6 * i + 1] = g[Y]; \
dst[6 * i + 2] = b[Y]; \
Y = src[2 * i + 1]; \
dst[6 * i + 3] = r[Y]; \
dst[6 * i + 4] = g[Y]; \
dst[6 * i + 5] = b[Y];
 
#define PUTBGR24(dst, src, i) \
Y = src[2 * i]; \
dst[6 * i + 0] = b[Y]; \
dst[6 * i + 1] = g[Y]; \
dst[6 * i + 2] = r[Y]; \
Y = src[2 * i + 1]; \
dst[6 * i + 3] = b[Y]; \
dst[6 * i + 4] = g[Y]; \
dst[6 * i + 5] = r[Y];
 
#define PUTRGBA(dst, ysrc, asrc, i, s) \
Y = ysrc[2 * i]; \
dst[2 * i] = r[Y] + g[Y] + b[Y] + (asrc[2 * i] << s); \
Y = ysrc[2 * i + 1]; \
dst[2 * i + 1] = r[Y] + g[Y] + b[Y] + (asrc[2 * i + 1] << s);
 
#define PUTRGB48(dst, src, i) \
Y = src[ 2 * i]; \
dst[12 * i + 0] = dst[12 * i + 1] = r[Y]; \
dst[12 * i + 2] = dst[12 * i + 3] = g[Y]; \
dst[12 * i + 4] = dst[12 * i + 5] = b[Y]; \
Y = src[ 2 * i + 1]; \
dst[12 * i + 6] = dst[12 * i + 7] = r[Y]; \
dst[12 * i + 8] = dst[12 * i + 9] = g[Y]; \
dst[12 * i + 10] = dst[12 * i + 11] = b[Y];
 
#define PUTBGR48(dst, src, i) \
Y = src[2 * i]; \
dst[12 * i + 0] = dst[12 * i + 1] = b[Y]; \
dst[12 * i + 2] = dst[12 * i + 3] = g[Y]; \
dst[12 * i + 4] = dst[12 * i + 5] = r[Y]; \
Y = src[2 * i + 1]; \
dst[12 * i + 6] = dst[12 * i + 7] = b[Y]; \
dst[12 * i + 8] = dst[12 * i + 9] = g[Y]; \
dst[12 * i + 10] = dst[12 * i + 11] = r[Y];
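
/* Descriptive sketch of the table-driven scheme behind LOADCHROMA and the
 * PUTRGB* macros above: r, g and b are pointers into per-chroma lookup
 * tables, so each luma sample costs three loads and two adds, with no
 * multiplies in the inner loop.  Kept in #if 0; never compiled. */
#if 0
static uint32_t put_pixel32(const uint32_t *r, const uint32_t *g,
                            const uint32_t *b, int Y)
{
    /* each table entry already holds its channel shifted into position */
    return r[Y] + g[Y] + b[Y];
}
#endif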
 
#define YUV2RGBFUNC(func_name, dst_type, alpha) \
static int func_name(SwsContext *c, const uint8_t *src[], \
int srcStride[], int srcSliceY, int srcSliceH, \
uint8_t *dst[], int dstStride[]) \
{ \
int y; \
\
if (!alpha && c->srcFormat == AV_PIX_FMT_YUV422P) { \
srcStride[1] *= 2; \
srcStride[2] *= 2; \
} \
for (y = 0; y < srcSliceH; y += 2) { \
dst_type *dst_1 = \
(dst_type *)(dst[0] + (y + srcSliceY) * dstStride[0]); \
dst_type *dst_2 = \
(dst_type *)(dst[0] + (y + srcSliceY + 1) * dstStride[0]); \
dst_type av_unused *r, *g, *b; \
const uint8_t *py_1 = src[0] + y * srcStride[0]; \
const uint8_t *py_2 = py_1 + srcStride[0]; \
const uint8_t *pu = src[1] + (y >> 1) * srcStride[1]; \
const uint8_t *pv = src[2] + (y >> 1) * srcStride[2]; \
const uint8_t av_unused *pa_1, *pa_2; \
unsigned int h_size = c->dstW >> 3; \
if (alpha) { \
pa_1 = src[3] + y * srcStride[3]; \
pa_2 = pa_1 + srcStride[3]; \
} \
while (h_size--) { \
int av_unused U, V, Y; \
 
#define ENDYUV2RGBLINE(dst_delta, ss) \
pu += 4 >> ss; \
pv += 4 >> ss; \
py_1 += 8 >> ss; \
py_2 += 8 >> ss; \
dst_1 += dst_delta >> ss; \
dst_2 += dst_delta >> ss; \
} \
if (c->dstW & (4 >> ss)) { \
int av_unused Y, U, V; \
 
#define ENDYUV2RGBFUNC() \
} \
} \
return srcSliceH; \
}
 
#define CLOSEYUV2RGBFUNC(dst_delta) \
ENDYUV2RGBLINE(dst_delta, 0) \
ENDYUV2RGBFUNC()
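
/* Descriptive note: YUV2RGBFUNC opens a converter definition (the loop
 * over pairs of output lines), ENDYUV2RGBLINE closes the unrolled
 * 8-pixel inner body and opens a tail for leftover pixels when dstW is
 * not a multiple of 8, and ENDYUV2RGBFUNC/CLOSEYUV2RGBFUNC close the
 * function; every converter below is assembled from these pieces. */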
 
YUV2RGBFUNC(yuv2rgb_c_48, uint8_t, 0)
LOADCHROMA(0);
PUTRGB48(dst_1, py_1, 0);
PUTRGB48(dst_2, py_2, 0);
 
LOADCHROMA(1);
PUTRGB48(dst_2, py_2, 1);
PUTRGB48(dst_1, py_1, 1);
 
LOADCHROMA(2);
PUTRGB48(dst_1, py_1, 2);
PUTRGB48(dst_2, py_2, 2);
 
LOADCHROMA(3);
PUTRGB48(dst_2, py_2, 3);
PUTRGB48(dst_1, py_1, 3);
ENDYUV2RGBLINE(48, 0)
LOADCHROMA(0);
PUTRGB48(dst_1, py_1, 0);
PUTRGB48(dst_2, py_2, 0);
 
LOADCHROMA(1);
PUTRGB48(dst_2, py_2, 1);
PUTRGB48(dst_1, py_1, 1);
ENDYUV2RGBLINE(48, 1)
LOADCHROMA(0);
PUTRGB48(dst_1, py_1, 0);
PUTRGB48(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
 
YUV2RGBFUNC(yuv2rgb_c_bgr48, uint8_t, 0)
LOADCHROMA(0);
PUTBGR48(dst_1, py_1, 0);
PUTBGR48(dst_2, py_2, 0);
 
LOADCHROMA(1);
PUTBGR48(dst_2, py_2, 1);
PUTBGR48(dst_1, py_1, 1);
 
LOADCHROMA(2);
PUTBGR48(dst_1, py_1, 2);
PUTBGR48(dst_2, py_2, 2);
 
LOADCHROMA(3);
PUTBGR48(dst_2, py_2, 3);
PUTBGR48(dst_1, py_1, 3);
ENDYUV2RGBLINE(48, 0)
LOADCHROMA(0);
PUTBGR48(dst_1, py_1, 0);
PUTBGR48(dst_2, py_2, 0);
 
LOADCHROMA(1);
PUTBGR48(dst_2, py_2, 1);
PUTBGR48(dst_1, py_1, 1);
ENDYUV2RGBLINE(48, 1)
LOADCHROMA(0);
PUTBGR48(dst_1, py_1, 0);
PUTBGR48(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
 
YUV2RGBFUNC(yuv2rgb_c_32, uint32_t, 0)
LOADCHROMA(0);
PUTRGB(dst_1, py_1, 0);
PUTRGB(dst_2, py_2, 0);
 
LOADCHROMA(1);
PUTRGB(dst_2, py_2, 1);
PUTRGB(dst_1, py_1, 1);
 
LOADCHROMA(2);
PUTRGB(dst_1, py_1, 2);
PUTRGB(dst_2, py_2, 2);
 
LOADCHROMA(3);
PUTRGB(dst_2, py_2, 3);
PUTRGB(dst_1, py_1, 3);
ENDYUV2RGBLINE(8, 0)
LOADCHROMA(0);
PUTRGB(dst_1, py_1, 0);
PUTRGB(dst_2, py_2, 0);
 
LOADCHROMA(1);
PUTRGB(dst_2, py_2, 1);
PUTRGB(dst_1, py_1, 1);
ENDYUV2RGBLINE(8, 1)
LOADCHROMA(0);
PUTRGB(dst_1, py_1, 0);
PUTRGB(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
 
YUV2RGBFUNC(yuva2rgba_c, uint32_t, 1)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 24);
PUTRGBA(dst_2, py_2, pa_2, 0, 24);
 
LOADCHROMA(1);
PUTRGBA(dst_2, py_2, pa_2, 1, 24);
PUTRGBA(dst_1, py_1, pa_1, 1, 24);
 
LOADCHROMA(2);
PUTRGBA(dst_1, py_1, pa_1, 2, 24);
PUTRGBA(dst_2, py_2, pa_2, 2, 24);
 
LOADCHROMA(3);
PUTRGBA(dst_2, py_2, pa_2, 3, 24);
PUTRGBA(dst_1, py_1, pa_1, 3, 24);
pa_1 += 8;
pa_2 += 8;
ENDYUV2RGBLINE(8, 0)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 24);
PUTRGBA(dst_2, py_2, pa_2, 0, 24);
 
LOADCHROMA(1);
PUTRGBA(dst_2, py_2, pa_2, 1, 24);
PUTRGBA(dst_1, py_1, pa_1, 1, 24);
pa_1 += 4;
pa_2 += 4;
ENDYUV2RGBLINE(8, 1)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 24);
PUTRGBA(dst_2, py_2, pa_2, 0, 24);
ENDYUV2RGBFUNC()
 
YUV2RGBFUNC(yuva2argb_c, uint32_t, 1)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 0);
PUTRGBA(dst_2, py_2, pa_2, 0, 0);
 
LOADCHROMA(1);
PUTRGBA(dst_2, py_2, pa_2, 1, 0);
PUTRGBA(dst_1, py_1, pa_1, 1, 0);
 
LOADCHROMA(2);
PUTRGBA(dst_1, py_1, pa_1, 2, 0);
PUTRGBA(dst_2, py_2, pa_2, 2, 0);
 
LOADCHROMA(3);
PUTRGBA(dst_2, py_2, pa_2, 3, 0);
PUTRGBA(dst_1, py_1, pa_1, 3, 0);
pa_1 += 8;
pa_2 += 8;
ENDYUV2RGBLINE(8, 0)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 0);
PUTRGBA(dst_2, py_2, pa_2, 0, 0);
 
LOADCHROMA(1);
PUTRGBA(dst_2, py_2, pa_2, 1, 0);
PUTRGBA(dst_1, py_1, pa_1, 1, 0);
pa_1 += 4;
pa_2 += 4;
ENDYUV2RGBLINE(8, 1)
LOADCHROMA(0);
PUTRGBA(dst_1, py_1, pa_1, 0, 0);
PUTRGBA(dst_2, py_2, pa_2, 0, 0);
ENDYUV2RGBFUNC()
 
YUV2RGBFUNC(yuv2rgb_c_24_rgb, uint8_t, 0)
LOADCHROMA(0);
PUTRGB24(dst_1, py_1, 0);
PUTRGB24(dst_2, py_2, 0);
 
LOADCHROMA(1);
PUTRGB24(dst_2, py_2, 1);
PUTRGB24(dst_1, py_1, 1);
 
LOADCHROMA(2);
PUTRGB24(dst_1, py_1, 2);
PUTRGB24(dst_2, py_2, 2);
 
LOADCHROMA(3);
PUTRGB24(dst_2, py_2, 3);
PUTRGB24(dst_1, py_1, 3);
ENDYUV2RGBLINE(24, 0)
LOADCHROMA(0);
PUTRGB24(dst_1, py_1, 0);
PUTRGB24(dst_2, py_2, 0);
 
LOADCHROMA(1);
PUTRGB24(dst_2, py_2, 1);
PUTRGB24(dst_1, py_1, 1);
ENDYUV2RGBLINE(24, 1)
LOADCHROMA(0);
PUTRGB24(dst_1, py_1, 0);
PUTRGB24(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
 
// only trivial mods from yuv2rgb_c_24_rgb
YUV2RGBFUNC(yuv2rgb_c_24_bgr, uint8_t, 0)
LOADCHROMA(0);
PUTBGR24(dst_1, py_1, 0);
PUTBGR24(dst_2, py_2, 0);
 
LOADCHROMA(1);
PUTBGR24(dst_2, py_2, 1);
PUTBGR24(dst_1, py_1, 1);
 
LOADCHROMA(2);
PUTBGR24(dst_1, py_1, 2);
PUTBGR24(dst_2, py_2, 2);
 
LOADCHROMA(3);
PUTBGR24(dst_2, py_2, 3);
PUTBGR24(dst_1, py_1, 3);
ENDYUV2RGBLINE(24, 0)
LOADCHROMA(0);
PUTBGR24(dst_1, py_1, 0);
PUTBGR24(dst_2, py_2, 0);
 
LOADCHROMA(1);
PUTBGR24(dst_2, py_2, 1);
PUTBGR24(dst_1, py_1, 1);
ENDYUV2RGBLINE(24, 1)
LOADCHROMA(0);
PUTBGR24(dst_1, py_1, 0);
PUTBGR24(dst_2, py_2, 0);
ENDYUV2RGBFUNC()
 
YUV2RGBFUNC(yuv2rgb_c_16_ordered_dither, uint16_t, 0)
const uint8_t *d16 = ff_dither_2x2_8[y & 1];
const uint8_t *e16 = ff_dither_2x2_4[y & 1];
const uint8_t *f16 = ff_dither_2x2_8[(y & 1)^1];
 
#define PUTRGB16(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d16[0 + o]] + \
g[Y + e16[0 + o]] + \
b[Y + f16[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d16[1 + o]] + \
g[Y + e16[1 + o]] + \
b[Y + f16[1 + o]];
LOADCHROMA(0);
PUTRGB16(dst_1, py_1, 0, 0);
PUTRGB16(dst_2, py_2, 0, 0 + 8);
 
LOADCHROMA(1);
PUTRGB16(dst_2, py_2, 1, 2 + 8);
PUTRGB16(dst_1, py_1, 1, 2);
 
LOADCHROMA(2);
PUTRGB16(dst_1, py_1, 2, 4);
PUTRGB16(dst_2, py_2, 2, 4 + 8);
 
LOADCHROMA(3);
PUTRGB16(dst_2, py_2, 3, 6 + 8);
PUTRGB16(dst_1, py_1, 3, 6);
CLOSEYUV2RGBFUNC(8)
 
YUV2RGBFUNC(yuv2rgb_c_15_ordered_dither, uint16_t, 0)
const uint8_t *d16 = ff_dither_2x2_8[y & 1];
const uint8_t *e16 = ff_dither_2x2_8[(y & 1)^1];
 
#define PUTRGB15(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d16[0 + o]] + \
g[Y + d16[1 + o]] + \
b[Y + e16[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d16[1 + o]] + \
g[Y + d16[0 + o]] + \
b[Y + e16[1 + o]];
LOADCHROMA(0);
PUTRGB15(dst_1, py_1, 0, 0);
PUTRGB15(dst_2, py_2, 0, 0 + 8);
 
LOADCHROMA(1);
PUTRGB15(dst_2, py_2, 1, 2 + 8);
PUTRGB15(dst_1, py_1, 1, 2);
 
LOADCHROMA(2);
PUTRGB15(dst_1, py_1, 2, 4);
PUTRGB15(dst_2, py_2, 2, 4 + 8);
 
LOADCHROMA(3);
PUTRGB15(dst_2, py_2, 3, 6 + 8);
PUTRGB15(dst_1, py_1, 3, 6);
CLOSEYUV2RGBFUNC(8)
 
// r, g, b, dst_1, dst_2
YUV2RGBFUNC(yuv2rgb_c_12_ordered_dither, uint16_t, 0)
const uint8_t *d16 = ff_dither_4x4_16[y & 3];
 
#define PUTRGB12(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d16[0 + o]] + \
g[Y + d16[0 + o]] + \
b[Y + d16[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d16[1 + o]] + \
g[Y + d16[1 + o]] + \
b[Y + d16[1 + o]];
 
LOADCHROMA(0);
PUTRGB12(dst_1, py_1, 0, 0);
PUTRGB12(dst_2, py_2, 0, 0 + 8);
 
LOADCHROMA(1);
PUTRGB12(dst_2, py_2, 1, 2 + 8);
PUTRGB12(dst_1, py_1, 1, 2);
 
LOADCHROMA(2);
PUTRGB12(dst_1, py_1, 2, 4);
PUTRGB12(dst_2, py_2, 2, 4 + 8);
 
LOADCHROMA(3);
PUTRGB12(dst_2, py_2, 3, 6 + 8);
PUTRGB12(dst_1, py_1, 3, 6);
CLOSEYUV2RGBFUNC(8)
 
// r, g, b, dst_1, dst_2
YUV2RGBFUNC(yuv2rgb_c_8_ordered_dither, uint8_t, 0)
const uint8_t *d32 = ff_dither_8x8_32[y & 7];
const uint8_t *d64 = ff_dither_8x8_73[y & 7];
 
#define PUTRGB8(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d32[0 + o]] + \
g[Y + d32[0 + o]] + \
b[Y + d64[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d32[1 + o]] + \
g[Y + d32[1 + o]] + \
b[Y + d64[1 + o]];
 
LOADCHROMA(0);
PUTRGB8(dst_1, py_1, 0, 0);
PUTRGB8(dst_2, py_2, 0, 0 + 8);
 
LOADCHROMA(1);
PUTRGB8(dst_2, py_2, 1, 2 + 8);
PUTRGB8(dst_1, py_1, 1, 2);
 
LOADCHROMA(2);
PUTRGB8(dst_1, py_1, 2, 4);
PUTRGB8(dst_2, py_2, 2, 4 + 8);
 
LOADCHROMA(3);
PUTRGB8(dst_2, py_2, 3, 6 + 8);
PUTRGB8(dst_1, py_1, 3, 6);
 
ENDYUV2RGBLINE(8, 0)
const uint8_t *d32 = ff_dither_8x8_32[y & 7];
const uint8_t *d64 = ff_dither_8x8_73[y & 7];
LOADCHROMA(0);
PUTRGB8(dst_1, py_1, 0, 0);
PUTRGB8(dst_2, py_2, 0, 0 + 8);
 
LOADCHROMA(1);
PUTRGB8(dst_2, py_2, 1, 2 + 8);
PUTRGB8(dst_1, py_1, 1, 2);
 
ENDYUV2RGBLINE(8, 1)
const uint8_t *d32 = ff_dither_8x8_32[y & 7];
const uint8_t *d64 = ff_dither_8x8_73[y & 7];
LOADCHROMA(0);
PUTRGB8(dst_1, py_1, 0, 0);
PUTRGB8(dst_2, py_2, 0, 0 + 8);
 
ENDYUV2RGBFUNC()
 
 
YUV2RGBFUNC(yuv2rgb_c_4_ordered_dither, uint8_t, 0)
const uint8_t * d64 = ff_dither_8x8_73[y & 7];
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
int acc;
 
#define PUTRGB4D(dst, src, i, o) \
Y = src[2 * i]; \
acc = r[Y + d128[0 + o]] + \
g[Y + d64[0 + o]] + \
b[Y + d128[0 + o]]; \
Y = src[2 * i + 1]; \
acc |= (r[Y + d128[1 + o]] + \
g[Y + d64[1 + o]] + \
b[Y + d128[1 + o]]) << 4; \
dst[i] = acc;
 
LOADCHROMA(0);
PUTRGB4D(dst_1, py_1, 0, 0);
PUTRGB4D(dst_2, py_2, 0, 0 + 8);
 
LOADCHROMA(1);
PUTRGB4D(dst_2, py_2, 1, 2 + 8);
PUTRGB4D(dst_1, py_1, 1, 2);
 
LOADCHROMA(2);
PUTRGB4D(dst_1, py_1, 2, 4);
PUTRGB4D(dst_2, py_2, 2, 4 + 8);
 
LOADCHROMA(3);
PUTRGB4D(dst_2, py_2, 3, 6 + 8);
PUTRGB4D(dst_1, py_1, 3, 6);
 
ENDYUV2RGBLINE(4, 0)
const uint8_t * d64 = ff_dither_8x8_73[y & 7];
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
int acc;
LOADCHROMA(0);
PUTRGB4D(dst_1, py_1, 0, 0);
PUTRGB4D(dst_2, py_2, 0, 0 + 8);
 
LOADCHROMA(1);
PUTRGB4D(dst_2, py_2, 1, 2 + 8);
PUTRGB4D(dst_1, py_1, 1, 2);
 
ENDYUV2RGBLINE(4, 1)
const uint8_t * d64 = ff_dither_8x8_73[y & 7];
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
int acc;
LOADCHROMA(0);
PUTRGB4D(dst_1, py_1, 0, 0);
PUTRGB4D(dst_2, py_2, 0, 0 + 8);
ENDYUV2RGBFUNC()
 
YUV2RGBFUNC(yuv2rgb_c_4b_ordered_dither, uint8_t, 0)
const uint8_t *d64 = ff_dither_8x8_73[y & 7];
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
 
#define PUTRGB4DB(dst, src, i, o) \
Y = src[2 * i]; \
dst[2 * i] = r[Y + d128[0 + o]] + \
g[Y + d64[0 + o]] + \
b[Y + d128[0 + o]]; \
Y = src[2 * i + 1]; \
dst[2 * i + 1] = r[Y + d128[1 + o]] + \
g[Y + d64[1 + o]] + \
b[Y + d128[1 + o]];
 
LOADCHROMA(0);
PUTRGB4DB(dst_1, py_1, 0, 0);
PUTRGB4DB(dst_2, py_2, 0, 0 + 8);
 
LOADCHROMA(1);
PUTRGB4DB(dst_2, py_2, 1, 2 + 8);
PUTRGB4DB(dst_1, py_1, 1, 2);
 
LOADCHROMA(2);
PUTRGB4DB(dst_1, py_1, 2, 4);
PUTRGB4DB(dst_2, py_2, 2, 4 + 8);
 
LOADCHROMA(3);
PUTRGB4DB(dst_2, py_2, 3, 6 + 8);
PUTRGB4DB(dst_1, py_1, 3, 6);
ENDYUV2RGBLINE(8, 0)
const uint8_t *d64 = ff_dither_8x8_73[y & 7];
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
LOADCHROMA(0);
PUTRGB4DB(dst_1, py_1, 0, 0);
PUTRGB4DB(dst_2, py_2, 0, 0 + 8);
 
LOADCHROMA(1);
PUTRGB4DB(dst_2, py_2, 1, 2 + 8);
PUTRGB4DB(dst_1, py_1, 1, 2);
ENDYUV2RGBLINE(8, 1)
const uint8_t *d64 = ff_dither_8x8_73[y & 7];
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
LOADCHROMA(0);
PUTRGB4DB(dst_1, py_1, 0, 0);
PUTRGB4DB(dst_2, py_2, 0, 0 + 8);
ENDYUV2RGBFUNC()
 
YUV2RGBFUNC(yuv2rgb_c_1_ordered_dither, uint8_t, 0)
const uint8_t *d128 = ff_dither_8x8_220[y & 7];
char out_1 = 0, out_2 = 0;
g = c->table_gU[128 + YUVRGB_TABLE_HEADROOM] + c->table_gV[128 + YUVRGB_TABLE_HEADROOM];
 
#define PUTRGB1(out, src, i, o) \
Y = src[2 * i]; \
out += out + g[Y + d128[0 + o]]; \
Y = src[2 * i + 1]; \
out += out + g[Y + d128[1 + o]];
 
PUTRGB1(out_1, py_1, 0, 0);
PUTRGB1(out_2, py_2, 0, 0 + 8);
 
PUTRGB1(out_2, py_2, 1, 2 + 8);
PUTRGB1(out_1, py_1, 1, 2);
 
PUTRGB1(out_1, py_1, 2, 4);
PUTRGB1(out_2, py_2, 2, 4 + 8);
 
PUTRGB1(out_2, py_2, 3, 6 + 8);
PUTRGB1(out_1, py_1, 3, 6);
 
dst_1[0] = out_1;
dst_2[0] = out_2;
CLOSEYUV2RGBFUNC(1)
 
SwsFunc ff_yuv2rgb_get_func_ptr(SwsContext *c)
{
SwsFunc t = NULL;
 
if (ARCH_PPC)
t = ff_yuv2rgb_init_ppc(c);
if (ARCH_X86)
t = ff_yuv2rgb_init_x86(c);
 
if (t)
return t;
 
av_log(c, AV_LOG_WARNING,
"No accelerated colorspace conversion found from %s to %s.\n",
av_get_pix_fmt_name(c->srcFormat), av_get_pix_fmt_name(c->dstFormat));
 
switch (c->dstFormat) {
case AV_PIX_FMT_BGR48BE:
case AV_PIX_FMT_BGR48LE:
return yuv2rgb_c_bgr48;
case AV_PIX_FMT_RGB48BE:
case AV_PIX_FMT_RGB48LE:
return yuv2rgb_c_48;
case AV_PIX_FMT_ARGB:
case AV_PIX_FMT_ABGR:
if (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat))
return yuva2argb_c;
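/* fall through: ARGB/ABGR without a source alpha plane are packed like
 * RGBA/BGRA below (the channel positions come from the tables) */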
case AV_PIX_FMT_RGBA:
case AV_PIX_FMT_BGRA:
return (CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat)) ? yuva2rgba_c : yuv2rgb_c_32;
case AV_PIX_FMT_RGB24:
return yuv2rgb_c_24_rgb;
case AV_PIX_FMT_BGR24:
return yuv2rgb_c_24_bgr;
case AV_PIX_FMT_RGB565:
case AV_PIX_FMT_BGR565:
return yuv2rgb_c_16_ordered_dither;
case AV_PIX_FMT_RGB555:
case AV_PIX_FMT_BGR555:
return yuv2rgb_c_15_ordered_dither;
case AV_PIX_FMT_RGB444:
case AV_PIX_FMT_BGR444:
return yuv2rgb_c_12_ordered_dither;
case AV_PIX_FMT_RGB8:
case AV_PIX_FMT_BGR8:
return yuv2rgb_c_8_ordered_dither;
case AV_PIX_FMT_RGB4:
case AV_PIX_FMT_BGR4:
return yuv2rgb_c_4_ordered_dither;
case AV_PIX_FMT_RGB4_BYTE:
case AV_PIX_FMT_BGR4_BYTE:
return yuv2rgb_c_4b_ordered_dither;
case AV_PIX_FMT_MONOBLACK:
return yuv2rgb_c_1_ordered_dither;
}
return NULL;
}
 
static void fill_table(uint8_t* table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize,
const int64_t inc, void *y_tab)
{
int i;
uint8_t *y_table = y_tab;
 
y_table -= elemsize * (inc >> 9);
 
for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
int64_t cb = av_clip_uint8(i-YUVRGB_TABLE_HEADROOM)*inc;
table[i] = y_table + elemsize * (cb >> 16);
}
}
 
static void fill_gv_table(int table[256 + 2*YUVRGB_TABLE_HEADROOM], const int elemsize, const int64_t inc)
{
int i;
int off = -(inc >> 9);
 
for (i = 0; i < 256 + 2*YUVRGB_TABLE_HEADROOM; i++) {
int64_t cb = av_clip_uint8(i-YUVRGB_TABLE_HEADROOM)*inc;
table[i] = elemsize * (off + (cb >> 16));
}
}
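
/* Descriptive note on the two helpers above: for table index i the chroma
 * contribution is roughly (i - 128) * inc in 16.16 fixed point, so
 * "cb >> 16" selects a precomputed luma-table entry (or, for table_gV, a
 * byte offset) and the "inc >> 9" term centres the table on chroma 128. */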
 
static uint16_t roundToInt16(int64_t f)
{
int r = (f + (1 << 15)) >> 16;
 
if (r < -0x7FFF)
return 0x8000;
else if (r > 0x7FFF)
return 0x7FFF;
else
return r;
}
 
av_cold int ff_yuv2rgb_c_init_tables(SwsContext *c, const int inv_table[4],
int fullRange, int brightness,
int contrast, int saturation)
{
const int isRgb = c->dstFormat == AV_PIX_FMT_RGB32 ||
c->dstFormat == AV_PIX_FMT_RGB32_1 ||
c->dstFormat == AV_PIX_FMT_BGR24 ||
c->dstFormat == AV_PIX_FMT_RGB565BE ||
c->dstFormat == AV_PIX_FMT_RGB565LE ||
c->dstFormat == AV_PIX_FMT_RGB555BE ||
c->dstFormat == AV_PIX_FMT_RGB555LE ||
c->dstFormat == AV_PIX_FMT_RGB444BE ||
c->dstFormat == AV_PIX_FMT_RGB444LE ||
c->dstFormat == AV_PIX_FMT_RGB8 ||
c->dstFormat == AV_PIX_FMT_RGB4 ||
c->dstFormat == AV_PIX_FMT_RGB4_BYTE ||
c->dstFormat == AV_PIX_FMT_MONOBLACK;
const int isNotNe = c->dstFormat == AV_PIX_FMT_NE(RGB565LE, RGB565BE) ||
c->dstFormat == AV_PIX_FMT_NE(RGB555LE, RGB555BE) ||
c->dstFormat == AV_PIX_FMT_NE(RGB444LE, RGB444BE) ||
c->dstFormat == AV_PIX_FMT_NE(BGR565LE, BGR565BE) ||
c->dstFormat == AV_PIX_FMT_NE(BGR555LE, BGR555BE) ||
c->dstFormat == AV_PIX_FMT_NE(BGR444LE, BGR444BE);
const int bpp = c->dstFormatBpp;
uint8_t *y_table;
uint16_t *y_table16;
uint32_t *y_table32;
int i, base, rbase, gbase, bbase, av_uninit(abase), needAlpha;
const int yoffs = (fullRange ? 384 : 326) + YUVRGB_TABLE_LUMA_HEADROOM;
const int table_plane_size = 1024 + 2*YUVRGB_TABLE_LUMA_HEADROOM;
 
int64_t crv = inv_table[0];
int64_t cbu = inv_table[1];
int64_t cgu = -inv_table[2];
int64_t cgv = -inv_table[3];
int64_t cy = 1 << 16;
int64_t oy = 0;
int64_t yb = 0;
 
if (!fullRange) {
cy = (cy * 255) / 219;
oy = 16 << 16;
} else {
crv = (crv * 224) / 255;
cbu = (cbu * 224) / 255;
cgu = (cgu * 224) / 255;
cgv = (cgv * 224) / 255;
}
 
cy = (cy * contrast) >> 16;
crv = (crv * contrast * saturation) >> 32;
cbu = (cbu * contrast * saturation) >> 32;
cgu = (cgu * contrast * saturation) >> 32;
cgv = (cgv * contrast * saturation) >> 32;
oy -= 256 * brightness;
 
c->uOffset = 0x0400040004000400LL;
c->vOffset = 0x0400040004000400LL;
c->yCoeff = roundToInt16(cy * 8192) * 0x0001000100010001ULL;
c->vrCoeff = roundToInt16(crv * 8192) * 0x0001000100010001ULL;
c->ubCoeff = roundToInt16(cbu * 8192) * 0x0001000100010001ULL;
c->vgCoeff = roundToInt16(cgv * 8192) * 0x0001000100010001ULL;
c->ugCoeff = roundToInt16(cgu * 8192) * 0x0001000100010001ULL;
c->yOffset = roundToInt16(oy * 8) * 0x0001000100010001ULL;
 
c->yuv2rgb_y_coeff = (int16_t)roundToInt16(cy << 13);
c->yuv2rgb_y_offset = (int16_t)roundToInt16(oy << 9);
c->yuv2rgb_v2r_coeff = (int16_t)roundToInt16(crv << 13);
c->yuv2rgb_v2g_coeff = (int16_t)roundToInt16(cgv << 13);
c->yuv2rgb_u2g_coeff = (int16_t)roundToInt16(cgu << 13);
c->yuv2rgb_u2b_coeff = (int16_t)roundToInt16(cbu << 13);
 
//scale coefficients by cy
crv = ((crv << 16) + 0x8000) / FFMAX(cy, 1);
cbu = ((cbu << 16) + 0x8000) / FFMAX(cy, 1);
cgu = ((cgu << 16) + 0x8000) / FFMAX(cy, 1);
cgv = ((cgv << 16) + 0x8000) / FFMAX(cy, 1);
 
av_freep(&c->yuvTable);
 
#define ALLOC_YUV_TABLE(x) \
c->yuvTable = av_malloc(x); \
if (!c->yuvTable) \
return AVERROR(ENOMEM);
switch (bpp) {
case 1:
ALLOC_YUV_TABLE(table_plane_size);
y_table = c->yuvTable;
yb = -(384 << 16) - YUVRGB_TABLE_LUMA_HEADROOM*cy - oy;
for (i = 0; i < table_plane_size - 110; i++) {
y_table[i + 110] = av_clip_uint8((yb + 0x8000) >> 16) >> 7;
yb += cy;
}
fill_table(c->table_gU, 1, cgu, y_table + yoffs);
fill_gv_table(c->table_gV, 1, cgv);
break;
case 4:
case 4 | 128:
rbase = isRgb ? 3 : 0;
gbase = 1;
bbase = isRgb ? 0 : 3;
ALLOC_YUV_TABLE(table_plane_size * 3);
y_table = c->yuvTable;
yb = -(384 << 16) - YUVRGB_TABLE_LUMA_HEADROOM*cy - oy;
for (i = 0; i < table_plane_size - 110; i++) {
int yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table[i + 110] = (yval >> 7) << rbase;
y_table[i + 37 + table_plane_size] = ((yval + 43) / 85) << gbase;
y_table[i + 110 + 2*table_plane_size] = (yval >> 7) << bbase;
yb += cy;
}
fill_table(c->table_rV, 1, crv, y_table + yoffs);
fill_table(c->table_gU, 1, cgu, y_table + yoffs + table_plane_size);
fill_table(c->table_bU, 1, cbu, y_table + yoffs + 2*table_plane_size);
fill_gv_table(c->table_gV, 1, cgv);
break;
case 8:
rbase = isRgb ? 5 : 0;
gbase = isRgb ? 2 : 3;
bbase = isRgb ? 0 : 6;
ALLOC_YUV_TABLE(table_plane_size * 3);
y_table = c->yuvTable;
yb = -(384 << 16) - YUVRGB_TABLE_LUMA_HEADROOM*cy - oy;
for (i = 0; i < table_plane_size - 38; i++) {
int yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table[i + 16] = ((yval + 18) / 36) << rbase;
y_table[i + 16 + table_plane_size] = ((yval + 18) / 36) << gbase;
y_table[i + 37 + 2*table_plane_size] = ((yval + 43) / 85) << bbase;
yb += cy;
}
fill_table(c->table_rV, 1, crv, y_table + yoffs);
fill_table(c->table_gU, 1, cgu, y_table + yoffs + table_plane_size);
fill_table(c->table_bU, 1, cbu, y_table + yoffs + 2*table_plane_size);
fill_gv_table(c->table_gV, 1, cgv);
break;
case 12:
rbase = isRgb ? 8 : 0;
gbase = 4;
bbase = isRgb ? 0 : 8;
ALLOC_YUV_TABLE(table_plane_size * 3 * 2);
y_table16 = c->yuvTable;
yb = -(384 << 16) - YUVRGB_TABLE_LUMA_HEADROOM*cy - oy;
for (i = 0; i < table_plane_size; i++) {
uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table16[i] = (yval >> 4) << rbase;
y_table16[i + table_plane_size] = (yval >> 4) << gbase;
y_table16[i + 2*table_plane_size] = (yval >> 4) << bbase;
yb += cy;
}
if (isNotNe)
for (i = 0; i < table_plane_size * 3; i++)
y_table16[i] = av_bswap16(y_table16[i]);
fill_table(c->table_rV, 2, crv, y_table16 + yoffs);
fill_table(c->table_gU, 2, cgu, y_table16 + yoffs + table_plane_size);
fill_table(c->table_bU, 2, cbu, y_table16 + yoffs + 2*table_plane_size);
fill_gv_table(c->table_gV, 2, cgv);
break;
case 15:
case 16:
rbase = isRgb ? bpp - 5 : 0;
gbase = 5;
bbase = isRgb ? 0 : (bpp - 5);
ALLOC_YUV_TABLE(table_plane_size * 3 * 2);
y_table16 = c->yuvTable;
yb = -(384 << 16) - YUVRGB_TABLE_LUMA_HEADROOM*cy - oy;
for (i = 0; i < table_plane_size; i++) {
uint8_t yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table16[i] = (yval >> 3) << rbase;
y_table16[i + table_plane_size] = (yval >> (18 - bpp)) << gbase;
y_table16[i + 2*table_plane_size] = (yval >> 3) << bbase;
yb += cy;
}
if (isNotNe)
for (i = 0; i < table_plane_size * 3; i++)
y_table16[i] = av_bswap16(y_table16[i]);
fill_table(c->table_rV, 2, crv, y_table16 + yoffs);
fill_table(c->table_gU, 2, cgu, y_table16 + yoffs + table_plane_size);
fill_table(c->table_bU, 2, cbu, y_table16 + yoffs + 2*table_plane_size);
fill_gv_table(c->table_gV, 2, cgv);
break;
case 24:
case 48:
ALLOC_YUV_TABLE(table_plane_size);
y_table = c->yuvTable;
yb = -(384 << 16) - YUVRGB_TABLE_LUMA_HEADROOM*cy - oy;
for (i = 0; i < table_plane_size; i++) {
y_table[i] = av_clip_uint8((yb + 0x8000) >> 16);
yb += cy;
}
fill_table(c->table_rV, 1, crv, y_table + yoffs);
fill_table(c->table_gU, 1, cgu, y_table + yoffs);
fill_table(c->table_bU, 1, cbu, y_table + yoffs);
fill_gv_table(c->table_gV, 1, cgv);
break;
case 32:
case 64:
base = (c->dstFormat == AV_PIX_FMT_RGB32_1 ||
c->dstFormat == AV_PIX_FMT_BGR32_1) ? 8 : 0;
rbase = base + (isRgb ? 16 : 0);
gbase = base + 8;
bbase = base + (isRgb ? 0 : 16);
needAlpha = CONFIG_SWSCALE_ALPHA && isALPHA(c->srcFormat);
if (!needAlpha)
abase = (base + 24) & 31;
ALLOC_YUV_TABLE(table_plane_size * 3 * 4);
y_table32 = c->yuvTable;
yb = -(384 << 16) - YUVRGB_TABLE_LUMA_HEADROOM*cy - oy;
for (i = 0; i < table_plane_size; i++) {
unsigned yval = av_clip_uint8((yb + 0x8000) >> 16);
y_table32[i] = (yval << rbase) +
(needAlpha ? 0 : (255u << abase));
y_table32[i + table_plane_size] = yval << gbase;
y_table32[i + 2*table_plane_size] = yval << bbase;
yb += cy;
}
fill_table(c->table_rV, 4, crv, y_table32 + yoffs);
fill_table(c->table_gU, 4, cgu, y_table32 + yoffs + table_plane_size);
fill_table(c->table_bU, 4, cbu, y_table32 + yoffs + 2*table_plane_size);
fill_gv_table(c->table_gV, 4, cgv);
break;
default:
if (!isPlanar(c->dstFormat) || bpp <= 24)
av_log(c, AV_LOG_ERROR, "%ibpp not supported by yuv2rgb\n", bpp);
return -1;
}
return 0;
}