Subversion Repositories: Kolibri OS

Compare Revisions

Rev 4348 → Rev 4349

/contrib/sdk/sources/ffmpeg/libavcodec/x86/Makefile
0,0 → 1,104
OBJS += x86/constants.o \
x86/fmtconvert_init.o \
 
OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o
OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o
OBJS-$(CONFIG_DCT) += x86/dct_init.o
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o
OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_init.o \
x86/dsputil_x86.o
OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o \
x86/fdct.o \
x86/motion_est.o
OBJS-$(CONFIG_FFT) += x86/fft_init.o
OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o
OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o
OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o
OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o
OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o
OBJS-$(CONFIG_LPC) += x86/lpc.o
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o
OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o
OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o
OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o
OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o
OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp_init.o
OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp_init.o \
x86/rv40dsp_init.o
OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_init.o
OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o
OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o
OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o
OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o
OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp_init.o
OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o
OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
 
MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o \
x86/fpel_mmx.o \
x86/idct_mmx_xvid.o \
x86/idct_sse2_xvid.o \
x86/rnd_mmx.o \
x86/simple_idct.o
MMX-OBJS-$(CONFIG_DIRAC_DECODER) += x86/dirac_dwt.o
MMX-OBJS-$(CONFIG_HPELDSP) += x86/fpel_mmx.o \
x86/hpeldsp_mmx.o \
x86/rnd_mmx.o
MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o
MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o
MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o
 
YASM-OBJS += x86/deinterlace.o \
x86/fmtconvert.o \
 
YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o
YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o
YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o
YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_mmx.o x86/diracdsp_yasm.o\
x86/dwt_yasm.o
YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \
x86/fpel.o \
x86/mpeg4qpel.o \
x86/qpel.o
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o
YASM-OBJS-$(CONFIG_FFT) += x86/fft.o
YASM-OBJS-$(CONFIG_H263_DECODER) += x86/h263_loopfilter.o
YASM-OBJS-$(CONFIG_H263_ENCODER) += x86/h263_loopfilter.o
YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \
x86/h264_chromamc_10bit.o
YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \
x86/h264_deblock_10bit.o \
x86/h264_idct.o \
x86/h264_idct_10bit.o \
x86/h264_weight.o \
x86/h264_weight_10bit.o
YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \
x86/h264_intrapred_10bit.o
YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \
x86/h264_qpel_10bit.o \
x86/fpel.o \
x86/qpel.o
YASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \
x86/hpeldsp.o
YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o
YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o
YASM-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp.o
YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \
x86/rv40dsp.o
YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o
YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o
YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o
YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o
YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o
YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o
YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o
YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp.o
YASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o
/contrib/sdk/sources/ffmpeg/libavcodec/x86/ac3dsp.asm
0,0 → 1,421
;*****************************************************************************
;* x86-optimized AC-3 DSP utils
;* Copyright (c) 2011 Justin Ruggles
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
; 16777216.0f - used in ff_float_to_fixed24()
pf_1_24: times 4 dd 0x4B800000
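; (0x4B800000 is the IEEE-754 single-precision encoding of 16777216.0f = 2^24,
;  the scale factor that turns a float sample into 24-bit fixed point)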
 
; used in ff_ac3_compute_mantissa_size()
cextern ac3_bap_bits
pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768
pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7
 
; used in ff_ac3_extract_exponents()
pd_1: times 4 dd 1
pd_151: times 4 dd 151
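; (151 = 127, the single-precision exponent bias, plus 24: subtracting the
;  biased exponent of 2*|coef|+1 below gives 23 - floor(log2(|coef|)), or 24
;  for a zero coefficient, matching the behaviour of the scalar C version)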
 
SECTION .text
 
;-----------------------------------------------------------------------------
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;-----------------------------------------------------------------------------
 
%macro AC3_EXPONENT_MIN 0
cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset
shl reuse_blksq, 8
jz .end
LOOP_ALIGN
.nextexp:
mov offsetq, reuse_blksq
mova m0, [expq+offsetq]
sub offsetq, 256
LOOP_ALIGN
.nextblk:
PMINUB m0, [expq+offsetq], m1
sub offsetq, 256
jae .nextblk
mova [expq], m0
add expq, mmsize
sub expnq, mmsize
jg .nextexp
.end:
REP_RET
%endmacro
 
%define LOOP_ALIGN
INIT_MMX mmx
AC3_EXPONENT_MIN
%if HAVE_MMXEXT_EXTERNAL
%define LOOP_ALIGN ALIGN 16
INIT_MMX mmxext
AC3_EXPONENT_MIN
%endif
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXPONENT_MIN
%endif
%undef LOOP_ALIGN
 
;-----------------------------------------------------------------------------
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len)
;
; This function uses 2 different methods to calculate a valid result.
; 1) logical 'or' of abs of each element
; This is used for ssse3 because of the pabsw instruction.
; It is also used for mmx because of the lack of min/max instructions.
; 2) calculate min/max for the array, then or(abs(min),abs(max))
; This is used for mmxext and sse2 because they have pminsw/pmaxsw.
;-----------------------------------------------------------------------------
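; Rough scalar equivalent of the "or_abs" variants (sketch for orientation
; only, not part of this file's build):
;
;     int v = 0;
;     for (i = 0; i < len; i++)
;         v |= abs(src[i]);
;     return v;
;
; The "min_max" variants arrive at the same value via or(abs(min), abs(max)).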
 
; logical 'or' of 4 or 8 words in an mmx or xmm register into the low word
%macro OR_WORDS_HORIZ 2 ; src, tmp
%if cpuflag(sse2)
movhlps %2, %1
por %1, %2
pshuflw %2, %1, q0032
por %1, %2
pshuflw %2, %1, q0001
por %1, %2
%elif cpuflag(mmxext)
pshufw %2, %1, q0032
por %1, %2
pshufw %2, %1, q0001
por %1, %2
%else ; mmx
movq %2, %1
psrlq %2, 32
por %1, %2
movq %2, %1
psrlq %2, 16
por %1, %2
%endif
%endmacro
 
%macro AC3_MAX_MSB_ABS_INT16 1
cglobal ac3_max_msb_abs_int16, 2,2,5, src, len
pxor m2, m2
pxor m3, m3
.loop:
%ifidn %1, min_max
mova m0, [srcq]
mova m1, [srcq+mmsize]
pminsw m2, m0
pminsw m2, m1
pmaxsw m3, m0
pmaxsw m3, m1
%else ; or_abs
%if notcpuflag(ssse3)
mova m0, [srcq]
mova m1, [srcq+mmsize]
ABS2 m0, m1, m3, m4
%else ; ssse3
; using memory args is faster for ssse3
pabsw m0, [srcq]
pabsw m1, [srcq+mmsize]
%endif
por m2, m0
por m2, m1
%endif
add srcq, mmsize*2
sub lend, mmsize
ja .loop
%ifidn %1, min_max
ABS2 m2, m3, m0, m1
por m2, m3
%endif
OR_WORDS_HORIZ m2, m0
movd eax, m2
and eax, 0xFFFF
RET
%endmacro
 
INIT_MMX mmx
AC3_MAX_MSB_ABS_INT16 or_abs
INIT_MMX mmxext
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM sse2
AC3_MAX_MSB_ABS_INT16 min_max
INIT_XMM ssse3
AC3_MAX_MSB_ABS_INT16 or_abs
 
;-----------------------------------------------------------------------------
; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32()
;-----------------------------------------------------------------------------
 
%macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction
cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift
movd m0, shiftd
.loop:
mova m1, [srcq ]
mova m2, [srcq+mmsize ]
mova m3, [srcq+mmsize*2]
mova m4, [srcq+mmsize*3]
%3 m1, m0
%3 m2, m0
%3 m3, m0
%3 m4, m0
mova [srcq ], m1
mova [srcq+mmsize ], m2
mova [srcq+mmsize*2], m3
mova [srcq+mmsize*3], m4
add srcq, mmsize*4
sub lend, mmsize*32/%2
ja .loop
.end:
REP_RET
%endmacro
 
;-----------------------------------------------------------------------------
; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------
 
INIT_MMX mmx
AC3_SHIFT l, 16, psllw
INIT_XMM sse2
AC3_SHIFT l, 16, psllw
 
;-----------------------------------------------------------------------------
; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift)
;-----------------------------------------------------------------------------
 
INIT_MMX mmx
AC3_SHIFT r, 32, psrad
INIT_XMM sse2
AC3_SHIFT r, 32, psrad
 
;-----------------------------------------------------------------------------
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
;-----------------------------------------------------------------------------
 
; The 3DNow! version is not bit-identical because pf2id uses truncation rather
; than round-to-nearest.
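; All versions below implement, up to the rounding mode noted above, the
; scalar reference dst[i] = lrintf(src[i] * 16777216.0f), i.e. conversion of
; float samples to 24-bit fixed point.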
INIT_MMX 3dnow
cglobal float_to_fixed24, 3, 3, 0, dst, src, len
movq m0, [pf_1_24]
.loop:
movq m1, [srcq ]
movq m2, [srcq+8 ]
movq m3, [srcq+16]
movq m4, [srcq+24]
pfmul m1, m0
pfmul m2, m0
pfmul m3, m0
pfmul m4, m0
pf2id m1, m1
pf2id m2, m2
pf2id m3, m3
pf2id m4, m4
movq [dstq ], m1
movq [dstq+8 ], m2
movq [dstq+16], m3
movq [dstq+24], m4
add srcq, 32
add dstq, 32
sub lend, 8
ja .loop
femms
RET
 
INIT_XMM sse
cglobal float_to_fixed24, 3, 3, 3, dst, src, len
movaps m0, [pf_1_24]
.loop:
movaps m1, [srcq ]
movaps m2, [srcq+16]
mulps m1, m0
mulps m2, m0
cvtps2pi mm0, m1
movhlps m1, m1
cvtps2pi mm1, m1
cvtps2pi mm2, m2
movhlps m2, m2
cvtps2pi mm3, m2
movq [dstq ], mm0
movq [dstq+ 8], mm1
movq [dstq+16], mm2
movq [dstq+24], mm3
add srcq, 32
add dstq, 32
sub lend, 8
ja .loop
emms
RET
 
INIT_XMM sse2
cglobal float_to_fixed24, 3, 3, 9, dst, src, len
movaps m0, [pf_1_24]
.loop:
movaps m1, [srcq ]
movaps m2, [srcq+16 ]
movaps m3, [srcq+32 ]
movaps m4, [srcq+48 ]
%ifdef m8
movaps m5, [srcq+64 ]
movaps m6, [srcq+80 ]
movaps m7, [srcq+96 ]
movaps m8, [srcq+112]
%endif
mulps m1, m0
mulps m2, m0
mulps m3, m0
mulps m4, m0
%ifdef m8
mulps m5, m0
mulps m6, m0
mulps m7, m0
mulps m8, m0
%endif
cvtps2dq m1, m1
cvtps2dq m2, m2
cvtps2dq m3, m3
cvtps2dq m4, m4
%ifdef m8
cvtps2dq m5, m5
cvtps2dq m6, m6
cvtps2dq m7, m7
cvtps2dq m8, m8
%endif
movdqa [dstq ], m1
movdqa [dstq+16 ], m2
movdqa [dstq+32 ], m3
movdqa [dstq+48 ], m4
%ifdef m8
movdqa [dstq+64 ], m5
movdqa [dstq+80 ], m6
movdqa [dstq+96 ], m7
movdqa [dstq+112], m8
add srcq, 128
add dstq, 128
sub lenq, 32
%else
add srcq, 64
add dstq, 64
sub lenq, 16
%endif
ja .loop
REP_RET
 
;------------------------------------------------------------------------------
; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16])
;------------------------------------------------------------------------------
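; For orientation: the function returns the total number of bits needed to
; code the mantissas, given per-block counts mant_cnt[blk][bap].  The pmaddwd
; with ac3_bap_bits weights the counts by the table's bit costs, and the
; pmulhuw/pmaddwd part below appears to account for bap 1, 2 and 4, whose
; mantissas are grouped 3, 3 and 2 to a group of 5, 7 and 7 bits respectively.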
 
%macro PHADDD4 2 ; xmm src, xmm tmp
movhlps %2, %1
paddd %1, %2
pshufd %2, %1, 0x1
paddd %1, %2
%endmacro
 
INIT_XMM sse2
cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum
movdqa m0, [mant_cntq ]
movdqa m1, [mant_cntq+ 1*16]
paddw m0, [mant_cntq+ 2*16]
paddw m1, [mant_cntq+ 3*16]
paddw m0, [mant_cntq+ 4*16]
paddw m1, [mant_cntq+ 5*16]
paddw m0, [mant_cntq+ 6*16]
paddw m1, [mant_cntq+ 7*16]
paddw m0, [mant_cntq+ 8*16]
paddw m1, [mant_cntq+ 9*16]
paddw m0, [mant_cntq+10*16]
paddw m1, [mant_cntq+11*16]
pmaddwd m0, [ac3_bap_bits ]
pmaddwd m1, [ac3_bap_bits+16]
paddd m0, m1
PHADDD4 m0, m1
movd sumd, m0
movdqa m3, [pw_bap_mul1]
movhpd m0, [mant_cntq +2]
movlpd m0, [mant_cntq+1*32+2]
movhpd m1, [mant_cntq+2*32+2]
movlpd m1, [mant_cntq+3*32+2]
movhpd m2, [mant_cntq+4*32+2]
movlpd m2, [mant_cntq+5*32+2]
pmulhuw m0, m3
pmulhuw m1, m3
pmulhuw m2, m3
paddusw m0, m1
paddusw m0, m2
pmaddwd m0, [pw_bap_mul2]
PHADDD4 m0, m1
movd eax, m0
add eax, sumd
RET
 
;------------------------------------------------------------------------------
; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
;------------------------------------------------------------------------------
 
%macro PABSD 1-2 ; src/dst, unused
%if cpuflag(ssse3)
pabsd %1, %1
%else ; src/dst, tmp
pxor %2, %2
pcmpgtd %2, %1
pxor %1, %2
psubd %1, %2
%endif
%endmacro
 
%macro AC3_EXTRACT_EXPONENTS 0
cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len
add expq, lenq
lea coefq, [coefq+4*lenq]
neg lenq
mova m2, [pd_1]
mova m3, [pd_151]
.loop:
; move 4 32-bit coefs to xmm0
mova m0, [coefq+4*lenq]
; absolute value
PABSD m0, m1
; convert to float and extract exponents
pslld m0, 1
por m0, m2
cvtdq2ps m1, m0
psrld m1, 23
mova m0, m3
psubd m0, m1
; move the lowest byte in each of 4 dwords to the low dword
; NOTE: We cannot just extract the low bytes with pshufb because the dword
; result for 16777215 is -1 due to float inaccuracy. Using packuswb
; clips this to 0, which is the correct exponent.
packssdw m0, m0
packuswb m0, m0
movd [expq+lenq], m0
 
add lenq, 4
jl .loop
REP_RET
%endmacro
 
%if HAVE_SSE2_EXTERNAL
INIT_XMM sse2
AC3_EXTRACT_EXPONENTS
%endif
%if HAVE_SSSE3_EXTERNAL
INIT_XMM ssse3
AC3_EXTRACT_EXPONENTS
%endif
/contrib/sdk/sources/ffmpeg/libavcodec/x86/ac3dsp_init.c
0,0 → 1,231
/*
* x86-optimized AC-3 DSP utils
* Copyright (c) 2011 Justin Ruggles
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "dsputil_x86.h"
#include "libavcodec/ac3.h"
#include "libavcodec/ac3dsp.h"
 
void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs);
void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs);
 
int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len);
int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len);
 
void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift);
void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift);
 
void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift);
void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift);
 
void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len);
void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len);
 
int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]);
 
void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs);
void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs);
 
#if ARCH_X86_32 && defined(__INTEL_COMPILER)
# undef HAVE_7REGS
# define HAVE_7REGS 0
#endif
 
#if HAVE_SSE_INLINE && HAVE_7REGS
 
#define IF1(x) x
#define IF0(x)
 
#define MIX5(mono, stereo) \
__asm__ volatile ( \
"movss 0(%1), %%xmm5 \n" \
"movss 8(%1), %%xmm6 \n" \
"movss 24(%1), %%xmm7 \n" \
"shufps $0, %%xmm5, %%xmm5 \n" \
"shufps $0, %%xmm6, %%xmm6 \n" \
"shufps $0, %%xmm7, %%xmm7 \n" \
"1: \n" \
"movaps (%0, %2), %%xmm0 \n" \
"movaps (%0, %3), %%xmm1 \n" \
"movaps (%0, %4), %%xmm2 \n" \
"movaps (%0, %5), %%xmm3 \n" \
"movaps (%0, %6), %%xmm4 \n" \
"mulps %%xmm5, %%xmm0 \n" \
"mulps %%xmm6, %%xmm1 \n" \
"mulps %%xmm5, %%xmm2 \n" \
"mulps %%xmm7, %%xmm3 \n" \
"mulps %%xmm7, %%xmm4 \n" \
stereo("addps %%xmm1, %%xmm0 \n") \
"addps %%xmm1, %%xmm2 \n" \
"addps %%xmm3, %%xmm0 \n" \
"addps %%xmm4, %%xmm2 \n" \
mono("addps %%xmm2, %%xmm0 \n") \
"movaps %%xmm0, (%0, %2) \n" \
stereo("movaps %%xmm2, (%0, %3) \n") \
"add $16, %0 \n" \
"jl 1b \n" \
: "+&r"(i) \
: "r"(matrix), \
"r"(samples[0] + len), \
"r"(samples[1] + len), \
"r"(samples[2] + len), \
"r"(samples[3] + len), \
"r"(samples[4] + len) \
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \
"%xmm4", "%xmm5", "%xmm6", "%xmm7",) \
"memory" \
);
 
#define MIX_MISC(stereo) \
__asm__ volatile ( \
"mov %5, %2 \n" \
"1: \n" \
"mov -%c7(%6, %2, %c8), %3 \n" \
"movaps (%3, %0), %%xmm0 \n" \
stereo("movaps %%xmm0, %%xmm1 \n") \
"mulps %%xmm4, %%xmm0 \n" \
stereo("mulps %%xmm5, %%xmm1 \n") \
"2: \n" \
"mov (%6, %2, %c8), %1 \n" \
"movaps (%1, %0), %%xmm2 \n" \
stereo("movaps %%xmm2, %%xmm3 \n") \
"mulps (%4, %2, 8), %%xmm2 \n" \
stereo("mulps 16(%4, %2, 8), %%xmm3 \n") \
"addps %%xmm2, %%xmm0 \n" \
stereo("addps %%xmm3, %%xmm1 \n") \
"add $4, %2 \n" \
"jl 2b \n" \
"mov %5, %2 \n" \
stereo("mov (%6, %2, %c8), %1 \n") \
"movaps %%xmm0, (%3, %0) \n" \
stereo("movaps %%xmm1, (%1, %0) \n") \
"add $16, %0 \n" \
"jl 1b \n" \
: "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m) \
: "r"(matrix_simd + in_ch), \
"g"((intptr_t) - 4 * (in_ch - 1)), \
"r"(samp + in_ch), \
"i"(sizeof(float *)), "i"(sizeof(float *)/4) \
: "memory" \
);
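/*
 * For orientation, a rough scalar equivalent of the downmix done by the
 * inline asm above and below (sketch only; results are written back in place,
 * as in the real code - MIX5 is a fast path for the common 5.0 -> stereo/mono
 * matrix layouts checked in ac3_downmix_sse(), MIX_MISC handles the rest):
 *
 *     for (i = 0; i < len; i++) {
 *         float v0 = 0, v1 = 0;
 *         for (k = 0; k < in_ch; k++) {
 *             v0 += samples[k][i] * matrix[k][0];
 *             v1 += samples[k][i] * matrix[k][1];   // only used if out_ch == 2
 *         }
 *         samples[0][i] = v0;
 *         if (out_ch == 2)
 *             samples[1][i] = v1;
 *     }
 */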
 
static void ac3_downmix_sse(float **samples, float (*matrix)[2],
int out_ch, int in_ch, int len)
{
int (*matrix_cmp)[2] = (int(*)[2])matrix;
intptr_t i, j, k, m;
 
i = -len * sizeof(float);
if (in_ch == 5 && out_ch == 2 &&
!(matrix_cmp[0][1] | matrix_cmp[2][0] |
matrix_cmp[3][1] | matrix_cmp[4][0] |
(matrix_cmp[1][0] ^ matrix_cmp[1][1]) |
(matrix_cmp[0][0] ^ matrix_cmp[2][1]))) {
MIX5(IF0, IF1);
} else if (in_ch == 5 && out_ch == 1 &&
matrix_cmp[0][0] == matrix_cmp[2][0] &&
matrix_cmp[3][0] == matrix_cmp[4][0]) {
MIX5(IF1, IF0);
} else {
DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4];
float *samp[AC3_MAX_CHANNELS];
 
for (j = 0; j < in_ch; j++)
samp[j] = samples[j] + len;
 
j = 2 * in_ch * sizeof(float);
__asm__ volatile (
"1: \n"
"sub $8, %0 \n"
"movss (%2, %0), %%xmm4 \n"
"movss 4(%2, %0), %%xmm5 \n"
"shufps $0, %%xmm4, %%xmm4 \n"
"shufps $0, %%xmm5, %%xmm5 \n"
"movaps %%xmm4, (%1, %0, 4) \n"
"movaps %%xmm5, 16(%1, %0, 4) \n"
"jg 1b \n"
: "+&r"(j)
: "r"(matrix_simd), "r"(matrix)
: "memory"
);
if (out_ch == 2) {
MIX_MISC(IF1);
} else {
MIX_MISC(IF0);
}
}
}
 
#endif /* HAVE_SSE_INLINE && HAVE_7REGS */
 
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact)
{
int cpu_flags = av_get_cpu_flags();
 
if (EXTERNAL_MMX(cpu_flags)) {
c->ac3_exponent_min = ff_ac3_exponent_min_mmx;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx;
c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx;
c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
if (!bit_exact) {
c->float_to_fixed24 = ff_float_to_fixed24_3dnow;
}
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->ac3_exponent_min = ff_ac3_exponent_min_mmxext;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext;
}
if (EXTERNAL_SSE(cpu_flags)) {
c->float_to_fixed24 = ff_float_to_fixed24_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->ac3_exponent_min = ff_ac3_exponent_min_sse2;
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2;
c->float_to_fixed24 = ff_float_to_fixed24_sse2;
c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2;
c->extract_exponents = ff_ac3_extract_exponents_sse2;
if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2;
c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2;
}
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3;
if (!(cpu_flags & AV_CPU_FLAG_ATOM)) {
c->extract_exponents = ff_ac3_extract_exponents_ssse3;
}
}
 
#if HAVE_SSE_INLINE && HAVE_7REGS
if (INLINE_SSE(cpu_flags)) {
c->downmix = ac3_downmix_sse;
}
#endif
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/cabac.h
0,0 → 1,299
/*
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#ifndef AVCODEC_X86_CABAC_H
#define AVCODEC_X86_CABAC_H
 
#include "libavcodec/cabac.h"
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/internal.h"
#include "config.h"
 
#if (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\
|| ( !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1)
# define BROKEN_COMPILER 1
#else
# define BROKEN_COMPILER 0
#endif
 
#if HAVE_INLINE_ASM
 
#ifndef UNCHECKED_BITSTREAM_READER
#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER
#endif
 
#if UNCHECKED_BITSTREAM_READER
#define END_CHECK(end) ""
#else
#define END_CHECK(end) \
"cmp "end" , %%"REG_c" \n\t"\
"jge 1f \n\t"
#endif
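/* END_CHECK guards the bytestream pointer update in the checked-reader case:
 * once the current read position has reached bytestream_end, the "add $2"
 * that would advance the pointer is skipped, so the reader never advances
 * past the end of the buffer. */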
 
#ifdef BROKEN_RELOCATIONS
#define TABLES_ARG , "r"(tables)
 
#if HAVE_FAST_CMOV
#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
"cmp "low" , "tmp" \n\t"\
"cmova %%ecx , "range" \n\t"\
"sbb %%rcx , %%rcx \n\t"\
"and %%ecx , "tmp" \n\t"\
"xor %%rcx , "retq" \n\t"\
"sub "tmp" , "low" \n\t"
#else /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
/* P4 Prescott has crappy cmov,sbb,64bit shift so avoid them */ \
"sub "low" , "tmp" \n\t"\
"sar $31 , "tmp" \n\t"\
"sub %%ecx , "range" \n\t"\
"and "tmp" , "range" \n\t"\
"add %%ecx , "range" \n\t"\
"shl $17 , %%ecx \n\t"\
"and "tmp" , %%ecx \n\t"\
"sub %%ecx , "low" \n\t"\
"xor "tmp" , "ret" \n\t"\
"movslq "ret" , "retq" \n\t"
#endif /* HAVE_FAST_CMOV */
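/* BRANCHLESS_GET_CABAC below is the asm counterpart of the C get_cabac_inline():
 * it looks up RangeLPS from the context state and the top bits of the range,
 * computes the MPS range as range - RangeLPS, derives an all-ones/all-zeros
 * mask by comparing low with the MPS range shifted left by 17, and uses that
 * mask to select the LPS path without a branch (keep RangeLPS as the new
 * range, subtract the scaled MPS range from low, flip the decoded bit).  It
 * then stores the next state from the (m)lps table, renormalizes range and
 * low by the shift from the norm table, and refills low from the bytestream
 * when its low 16 bits become zero. */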
 
#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
"movzbl "statep" , "ret" \n\t"\
"mov "range" , "tmp" \n\t"\
"and $0xC0 , "range" \n\t"\
"lea ("ret", "range", 2), %%ecx \n\t"\
"movzbl "lps_off"("tables", %%rcx), "range" \n\t"\
"sub "range" , "tmp" \n\t"\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \
"movzbl "norm_off"("tables", "rangeq"), %%ecx \n\t"\
"shl %%cl , "range" \n\t"\
"movzbl "mlps_off"+128("tables", "retq"), "tmp" \n\t"\
"shl %%cl , "low" \n\t"\
"mov "tmpbyte" , "statep" \n\t"\
"test "lowword" , "lowword" \n\t"\
"jnz 2f \n\t"\
"mov "byte" , %%"REG_c" \n\t"\
END_CHECK(end)\
"add"OPSIZE" $2 , "byte" \n\t"\
"1: \n\t"\
"movzwl (%%"REG_c") , "tmp" \n\t"\
"lea -1("low") , %%ecx \n\t"\
"xor "low" , %%ecx \n\t"\
"shr $15 , %%ecx \n\t"\
"bswap "tmp" \n\t"\
"shr $15 , "tmp" \n\t"\
"movzbl "norm_off"("tables", %%rcx), %%ecx \n\t"\
"sub $0xFFFF , "tmp" \n\t"\
"neg %%ecx \n\t"\
"add $7 , %%ecx \n\t"\
"shl %%cl , "tmp" \n\t"\
"add "tmp" , "low" \n\t"\
"2: \n\t"
 
#else /* BROKEN_RELOCATIONS */
#define TABLES_ARG
#define RIP_ARG
 
#if HAVE_FAST_CMOV
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
"cmp "low" , "tmp" \n\t"\
"cmova %%ecx , "range" \n\t"\
"sbb %%ecx , %%ecx \n\t"\
"and %%ecx , "tmp" \n\t"\
"xor %%ecx , "ret" \n\t"\
"sub "tmp" , "low" \n\t"
#else /* HAVE_FAST_CMOV */
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\
"mov "tmp" , %%ecx \n\t"\
"shl $17 , "tmp" \n\t"\
"sub "low" , "tmp" \n\t"\
"sar $31 , "tmp" \n\t" /*lps_mask*/\
"sub %%ecx , "range" \n\t" /*RangeLPS - range*/\
"and "tmp" , "range" \n\t" /*(RangeLPS - range)&lps_mask*/\
"add %%ecx , "range" \n\t" /*new range*/\
"shl $17 , %%ecx \n\t"\
"and "tmp" , %%ecx \n\t"\
"sub %%ecx , "low" \n\t"\
"xor "tmp" , "ret" \n\t"
#endif /* HAVE_FAST_CMOV */
 
#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \
"movzbl "statep" , "ret" \n\t"\
"mov "range" , "tmp" \n\t"\
"and $0xC0 , "range" \n\t"\
"movzbl "MANGLE(ff_h264_cabac_tables)"+"lps_off"("ret", "range", 2), "range" \n\t"\
"sub "range" , "tmp" \n\t"\
BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp) \
"movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"("range"), %%ecx \n\t"\
"shl %%cl , "range" \n\t"\
"movzbl "MANGLE(ff_h264_cabac_tables)"+"mlps_off"+128("ret"), "tmp" \n\t"\
"shl %%cl , "low" \n\t"\
"mov "tmpbyte" , "statep" \n\t"\
"test "lowword" , "lowword" \n\t"\
" jnz 2f \n\t"\
"mov "byte" , %%"REG_c" \n\t"\
END_CHECK(end)\
"add"OPSIZE" $2 , "byte" \n\t"\
"1: \n\t"\
"movzwl (%%"REG_c") , "tmp" \n\t"\
"lea -1("low") , %%ecx \n\t"\
"xor "low" , %%ecx \n\t"\
"shr $15 , %%ecx \n\t"\
"bswap "tmp" \n\t"\
"shr $15 , "tmp" \n\t"\
"movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"(%%ecx), %%ecx \n\t"\
"sub $0xFFFF , "tmp" \n\t"\
"neg %%ecx \n\t"\
"add $7 , %%ecx \n\t"\
"shl %%cl , "tmp" \n\t"\
"add "tmp" , "low" \n\t"\
"2: \n\t"
 
#endif /* BROKEN_RELOCATIONS */
 
#if HAVE_7REGS && !BROKEN_COMPILER
#define get_cabac_inline get_cabac_inline_x86
static av_always_inline int get_cabac_inline_x86(CABACContext *c,
uint8_t *const state)
{
int bit, tmp;
#ifdef BROKEN_RELOCATIONS
void *tables;
 
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
);
#endif
 
__asm__ volatile(
BRANCHLESS_GET_CABAC("%0", "%q0", "(%4)", "%1", "%w1",
"%2", "%q2", "%3", "%b3",
"%c6(%5)", "%c7(%5)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%8")
: "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp)
: "r"(state), "r"(c),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end))
TABLES_ARG
,"1"(c->low), "2"(c->range)
: "%"REG_c, "memory"
);
return bit & 1;
}
#endif /* HAVE_7REGS && !BROKEN_COMPILER */
 
#if !BROKEN_COMPILER
#define get_cabac_bypass_sign get_cabac_bypass_sign_x86
static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val)
{
x86_reg tmp;
__asm__ volatile(
"movl %c6(%2), %k1 \n\t"
"movl %c3(%2), %%eax \n\t"
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
"sub %k1, %%eax \n\t"
"cltd \n\t"
"and %%edx, %k1 \n\t"
"add %k1, %%eax \n\t"
"xor %%edx, %%ecx \n\t"
"sub %%edx, %%ecx \n\t"
"test %%ax, %%ax \n\t"
"jnz 1f \n\t"
"mov %c4(%2), %1 \n\t"
"subl $0xFFFF, %%eax \n\t"
"movzwl (%1), %%edx \n\t"
"bswap %%edx \n\t"
"shrl $15, %%edx \n\t"
#if UNCHECKED_BITSTREAM_READER
"add $2, %1 \n\t"
"addl %%edx, %%eax \n\t"
"mov %1, %c4(%2) \n\t"
#else
"addl %%edx, %%eax \n\t"
"cmp %c5(%2), %1 \n\t"
"jge 1f \n\t"
"add"OPSIZE" $2, %c4(%2) \n\t"
#endif
"1: \n\t"
"movl %%eax, %c3(%2) \n\t"
 
: "+c"(val), "=&r"(tmp)
: "r"(c),
"i"(offsetof(CABACContext, low)),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(offsetof(CABACContext, range))
: "%eax", "%edx", "memory"
);
return val;
}
 
#define get_cabac_bypass get_cabac_bypass_x86
static av_always_inline int get_cabac_bypass_x86(CABACContext *c)
{
x86_reg tmp;
int res;
__asm__ volatile(
"movl %c6(%2), %k1 \n\t"
"movl %c3(%2), %%eax \n\t"
"shl $17, %k1 \n\t"
"add %%eax, %%eax \n\t"
"sub %k1, %%eax \n\t"
"cltd \n\t"
"and %%edx, %k1 \n\t"
"add %k1, %%eax \n\t"
"inc %%edx \n\t"
"test %%ax, %%ax \n\t"
"jnz 1f \n\t"
"mov %c4(%2), %1 \n\t"
"subl $0xFFFF, %%eax \n\t"
"movzwl (%1), %%ecx \n\t"
"bswap %%ecx \n\t"
"shrl $15, %%ecx \n\t"
"addl %%ecx, %%eax \n\t"
"cmp %c5(%2), %1 \n\t"
"jge 1f \n\t"
"add"OPSIZE" $2, %c4(%2) \n\t"
"1: \n\t"
"movl %%eax, %c3(%2) \n\t"
 
: "=&d"(res), "=&r"(tmp)
: "r"(c),
"i"(offsetof(CABACContext, low)),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(offsetof(CABACContext, range))
: "%eax", "%ecx", "memory"
);
return res;
}
#endif /* !BROKEN_COMPILER */
 
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_CABAC_H */
/contrib/sdk/sources/ffmpeg/libavcodec/x86/cavsdsp.c
0,0 → 1,558
/*
* Chinese AVS video (AVS1-P2, JiZhun profile) decoder.
* Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de>
*
* MMX-optimized DSP functions, based on H.264 optimizations by
* Michael Niedermayer and Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/attributes.h"
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/cavsdsp.h"
#include "constants.h"
#include "dsputil_x86.h"
#include "config.h"
 
#if HAVE_MMX_INLINE
 
/* in/out: mma=mma+mmb, mmb=mmb-mma */
#define SUMSUB_BA( a, b ) \
"paddw "#b", "#a" \n\t"\
"paddw "#b", "#b" \n\t"\
"psubw "#a", "#b" \n\t"
 
/*****************************************************************************
*
* inverse transform
*
****************************************************************************/
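/* cavs_idct8_1d() computes one 1-D pass of the 8x8 CAVS inverse transform on
 * four columns at a time (each mm register holds four 16-bit coefficients of
 * one row); the per-instruction comments track the a0..a7 and b4..b7
 * intermediates of the factored transform. */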
 
static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
{
__asm__ volatile(
"movq 112(%0), %%mm4 \n\t" /* mm4 = src7 */
"movq 16(%0), %%mm5 \n\t" /* mm5 = src1 */
"movq 80(%0), %%mm2 \n\t" /* mm2 = src5 */
"movq 48(%0), %%mm7 \n\t" /* mm7 = src3 */
"movq %%mm4, %%mm0 \n\t"
"movq %%mm5, %%mm3 \n\t"
"movq %%mm2, %%mm6 \n\t"
"movq %%mm7, %%mm1 \n\t"
 
"paddw %%mm4, %%mm4 \n\t" /* mm4 = 2*src7 */
"paddw %%mm3, %%mm3 \n\t" /* mm3 = 2*src1 */
"paddw %%mm6, %%mm6 \n\t" /* mm6 = 2*src5 */
"paddw %%mm1, %%mm1 \n\t" /* mm1 = 2*src3 */
"paddw %%mm4, %%mm0 \n\t" /* mm0 = 3*src7 */
"paddw %%mm3, %%mm5 \n\t" /* mm5 = 3*src1 */
"paddw %%mm6, %%mm2 \n\t" /* mm2 = 3*src5 */
"paddw %%mm1, %%mm7 \n\t" /* mm7 = 3*src3 */
"psubw %%mm4, %%mm5 \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */
"paddw %%mm6, %%mm7 \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */
"psubw %%mm2, %%mm1 \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */
"paddw %%mm0, %%mm3 \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */
 
"movq %%mm5, %%mm4 \n\t"
"movq %%mm7, %%mm6 \n\t"
"movq %%mm3, %%mm0 \n\t"
"movq %%mm1, %%mm2 \n\t"
SUMSUB_BA( %%mm7, %%mm5 ) /* mm7 = a0 + a1 mm5 = a0 - a1 */
"paddw %%mm3, %%mm7 \n\t" /* mm7 = a0 + a1 + a3 */
"paddw %%mm1, %%mm5 \n\t" /* mm5 = a0 - a1 + a2 */
"paddw %%mm7, %%mm7 \n\t"
"paddw %%mm5, %%mm5 \n\t"
"paddw %%mm6, %%mm7 \n\t" /* mm7 = b4 */
"paddw %%mm4, %%mm5 \n\t" /* mm5 = b5 */
 
SUMSUB_BA( %%mm1, %%mm3 ) /* mm1 = a3 + a2 mm3 = a3 - a2 */
"psubw %%mm1, %%mm4 \n\t" /* mm4 = a0 - a2 - a3 */
"movq %%mm4, %%mm1 \n\t" /* mm1 = a0 - a2 - a3 */
"psubw %%mm6, %%mm3 \n\t" /* mm3 = a3 - a2 - a1 */
"paddw %%mm1, %%mm1 \n\t"
"paddw %%mm3, %%mm3 \n\t"
"psubw %%mm2, %%mm1 \n\t" /* mm1 = b7 */
"paddw %%mm0, %%mm3 \n\t" /* mm3 = b6 */
 
"movq 32(%0), %%mm2 \n\t" /* mm2 = src2 */
"movq 96(%0), %%mm6 \n\t" /* mm6 = src6 */
"movq %%mm2, %%mm4 \n\t"
"movq %%mm6, %%mm0 \n\t"
"psllw $2, %%mm4 \n\t" /* mm4 = 4*src2 */
"psllw $2, %%mm6 \n\t" /* mm6 = 4*src6 */
"paddw %%mm4, %%mm2 \n\t" /* mm2 = 5*src2 */
"paddw %%mm6, %%mm0 \n\t" /* mm0 = 5*src6 */
"paddw %%mm2, %%mm2 \n\t"
"paddw %%mm0, %%mm0 \n\t"
"psubw %%mm0, %%mm4 \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */
"paddw %%mm2, %%mm6 \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */
 
"movq (%0), %%mm2 \n\t" /* mm2 = src0 */
"movq 64(%0), %%mm0 \n\t" /* mm0 = src4 */
SUMSUB_BA( %%mm0, %%mm2 ) /* mm0 = src0+src4 mm2 = src0-src4 */
"psllw $3, %%mm0 \n\t"
"psllw $3, %%mm2 \n\t"
"paddw %1, %%mm0 \n\t" /* add rounding bias */
"paddw %1, %%mm2 \n\t" /* add rounding bias */
 
SUMSUB_BA( %%mm6, %%mm0 ) /* mm6 = a4 + a6 mm0 = a4 - a6 */
SUMSUB_BA( %%mm4, %%mm2 ) /* mm4 = a5 + a7 mm2 = a5 - a7 */
SUMSUB_BA( %%mm7, %%mm6 ) /* mm7 = dst0 mm6 = dst7 */
SUMSUB_BA( %%mm5, %%mm4 ) /* mm5 = dst1 mm4 = dst6 */
SUMSUB_BA( %%mm3, %%mm2 ) /* mm3 = dst2 mm2 = dst5 */
SUMSUB_BA( %%mm1, %%mm0 ) /* mm1 = dst3 mm0 = dst4 */
:: "r"(block), "m"(bias)
);
}
 
#define SBUTTERFLY(a,b,t,n,m)\
"mov" #m " " #a ", " #t " \n\t" /* abcd */\
"punpckl" #n " " #b ", " #a " \n\t" /* aebf */\
"punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\
 
#define TRANSPOSE4(a,b,c,d,t)\
SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
 
static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
{
int i;
DECLARE_ALIGNED(8, int16_t, b2)[64];
 
for(i=0; i<2; i++){
DECLARE_ALIGNED(8, uint64_t, tmp);
 
cavs_idct8_1d(block+4*i, ff_pw_4.a);
 
__asm__ volatile(
"psraw $3, %%mm7 \n\t"
"psraw $3, %%mm6 \n\t"
"psraw $3, %%mm5 \n\t"
"psraw $3, %%mm4 \n\t"
"psraw $3, %%mm3 \n\t"
"psraw $3, %%mm2 \n\t"
"psraw $3, %%mm1 \n\t"
"psraw $3, %%mm0 \n\t"
"movq %%mm7, %0 \n\t"
TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
"movq %%mm0, 8(%1) \n\t"
"movq %%mm6, 24(%1) \n\t"
"movq %%mm7, 40(%1) \n\t"
"movq %%mm4, 56(%1) \n\t"
"movq %0, %%mm7 \n\t"
TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
"movq %%mm7, (%1) \n\t"
"movq %%mm1, 16(%1) \n\t"
"movq %%mm0, 32(%1) \n\t"
"movq %%mm3, 48(%1) \n\t"
: "=m"(tmp)
: "r"(b2+32*i)
: "memory"
);
}
 
for(i=0; i<2; i++){
cavs_idct8_1d(b2+4*i, ff_pw_64.a);
 
__asm__ volatile(
"psraw $7, %%mm7 \n\t"
"psraw $7, %%mm6 \n\t"
"psraw $7, %%mm5 \n\t"
"psraw $7, %%mm4 \n\t"
"psraw $7, %%mm3 \n\t"
"psraw $7, %%mm2 \n\t"
"psraw $7, %%mm1 \n\t"
"psraw $7, %%mm0 \n\t"
"movq %%mm7, (%0) \n\t"
"movq %%mm5, 16(%0) \n\t"
"movq %%mm3, 32(%0) \n\t"
"movq %%mm1, 48(%0) \n\t"
"movq %%mm0, 64(%0) \n\t"
"movq %%mm2, 80(%0) \n\t"
"movq %%mm4, 96(%0) \n\t"
"movq %%mm6, 112(%0) \n\t"
:: "r"(b2+4*i)
: "memory"
);
}
 
ff_add_pixels_clamped_mmx(b2, dst, stride);
}
 
#endif /* HAVE_MMX_INLINE */
 
#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE)
 
/*****************************************************************************
*
* motion compensation
*
****************************************************************************/
 
/* vertical filter [-1 -2 96 42 -7 0] */
#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"pmullw %5, %%mm6 \n\t"\
"movq "#D", %%mm7 \n\t"\
"pmullw "MANGLE(MUL2)", %%mm7\n\t"\
"psllw $3, "#E" \n\t"\
"psubw "#E", %%mm6 \n\t"\
"psraw $3, "#E" \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw "#E", %%mm6 \n\t"\
"paddw "#B", "#B" \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psraw $1, "#B" \n\t"\
"psubw "#A", %%mm6 \n\t"\
"paddw %4, %%mm6 \n\t"\
"psraw $7, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
 
/* vertical filter [ 0 -1 5 5 -1 0] */
#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"paddw "#D", %%mm6 \n\t"\
"pmullw %5, %%mm6 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psubw "#E", %%mm6 \n\t"\
"paddw %4, %%mm6 \n\t"\
"psraw $3, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
 
/* vertical filter [ 0 -7 42 96 -2 -1] */
#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \
"movd (%0), "#F" \n\t"\
"movq "#C", %%mm6 \n\t"\
"pmullw "MANGLE(MUL2)", %%mm6\n\t"\
"movq "#D", %%mm7 \n\t"\
"pmullw %5, %%mm7 \n\t"\
"psllw $3, "#B" \n\t"\
"psubw "#B", %%mm6 \n\t"\
"psraw $3, "#B" \n\t"\
"paddw %%mm7, %%mm6 \n\t"\
"paddw "#B", %%mm6 \n\t"\
"paddw "#E", "#E" \n\t"\
"pxor %%mm7, %%mm7 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, "#F" \n\t"\
"psubw "#E", %%mm6 \n\t"\
"psraw $1, "#E" \n\t"\
"psubw "#F", %%mm6 \n\t"\
"paddw %4, %%mm6 \n\t"\
"psraw $7, %%mm6 \n\t"\
"packuswb %%mm6, %%mm6 \n\t"\
OP(%%mm6, (%1), A, d) \
"add %3, %1 \n\t"
 
 
#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
int w= 2;\
src -= 2*srcStride;\
\
while(w--){\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movd (%0), %%mm0 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm1 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm3 \n\t"\
"add %2, %0 \n\t"\
"movd (%0), %%mm4 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpcklbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
: "memory"\
);\
if(h==16){\
__asm__ volatile(\
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
\
: "+a"(src), "+c"(dst)\
: "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
: "memory"\
);\
}\
src += 4-(h+5)*srcStride;\
dst += 4-h*dstStride;\
}
 
#define QPEL_CAVS(OPNAME, OP, MMX)\
static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
int h=8;\
__asm__ volatile(\
"pxor %%mm7, %%mm7 \n\t"\
"movq %5, %%mm6 \n\t"\
"1: \n\t"\
"movq (%0), %%mm0 \n\t"\
"movq 1(%0), %%mm2 \n\t"\
"movq %%mm0, %%mm1 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpckhbw %%mm7, %%mm1 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"paddw %%mm2, %%mm0 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"pmullw %%mm6, %%mm0 \n\t"\
"pmullw %%mm6, %%mm1 \n\t"\
"movq -1(%0), %%mm2 \n\t"\
"movq 2(%0), %%mm4 \n\t"\
"movq %%mm2, %%mm3 \n\t"\
"movq %%mm4, %%mm5 \n\t"\
"punpcklbw %%mm7, %%mm2 \n\t"\
"punpckhbw %%mm7, %%mm3 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"punpckhbw %%mm7, %%mm5 \n\t"\
"paddw %%mm4, %%mm2 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psubw %%mm2, %%mm0 \n\t"\
"psubw %%mm5, %%mm1 \n\t"\
"movq %6, %%mm5 \n\t"\
"paddw %%mm5, %%mm0 \n\t"\
"paddw %%mm5, %%mm1 \n\t"\
"psraw $3, %%mm0 \n\t"\
"psraw $3, %%mm1 \n\t"\
"packuswb %%mm1, %%mm0 \n\t"\
OP(%%mm0, (%1),%%mm5, q) \
"add %3, %0 \n\t"\
"add %4, %1 \n\t"\
"decl %2 \n\t"\
" jnz 1b \n\t"\
: "+a"(src), "+c"(dst), "+m"(h)\
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
: "memory"\
);\
}\
\
static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \
}\
\
static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \
}\
\
static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\
OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\
OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
 
#define CAVS_MC(OPNAME, SIZE, MMX) \
static void OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
}\
 
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgusb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
#define AVG_MMXEXT_OP(a, b, temp, size) \
"mov" #size " " #b ", " #temp " \n\t"\
"pavgb " #temp ", " #a " \n\t"\
"mov" #size " " #a ", " #b " \n\t"
 
#endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */
 
#if HAVE_MMX_INLINE
static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_put_pixels8_mmx(dst, src, stride, 8);
}
 
static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels8_mmx(dst, src, stride, 8);
}
 
static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_put_pixels16_mmx(dst, src, stride, 16);
}
 
static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels16_mmx(dst, src, stride, 16);
}
 
static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
AVCodecContext *avctx)
{
c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx;
c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;
 
c->cavs_idct8_add = cavs_idct8_add_mmx;
c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
}
#endif /* HAVE_MMX_INLINE */
 
#define DSPFUNC(PFX, IDX, NUM, EXT) \
c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \
c->PFX ## _cavs_qpel_pixels_tab[IDX][ 4] = PFX ## _cavs_qpel ## NUM ## _mc01_ ## EXT; \
c->PFX ## _cavs_qpel_pixels_tab[IDX][ 8] = PFX ## _cavs_qpel ## NUM ## _mc02_ ## EXT; \
c->PFX ## _cavs_qpel_pixels_tab[IDX][12] = PFX ## _cavs_qpel ## NUM ## _mc03_ ## EXT; \
 
#if HAVE_MMXEXT_INLINE
QPEL_CAVS(put_, PUT_OP, mmxext)
QPEL_CAVS(avg_, AVG_MMXEXT_OP, mmxext)
 
CAVS_MC(put_, 8, mmxext)
CAVS_MC(put_, 16, mmxext)
CAVS_MC(avg_, 8, mmxext)
CAVS_MC(avg_, 16, mmxext)
 
static av_cold void cavsdsp_init_mmxext(CAVSDSPContext *c,
AVCodecContext *avctx)
{
DSPFUNC(put, 0, 16, mmxext);
DSPFUNC(put, 1, 8, mmxext);
DSPFUNC(avg, 0, 16, mmxext);
DSPFUNC(avg, 1, 8, mmxext);
}
#endif /* HAVE_MMXEXT_INLINE */
 
#if HAVE_AMD3DNOW_INLINE
QPEL_CAVS(put_, PUT_OP, 3dnow)
QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)
 
CAVS_MC(put_, 8, 3dnow)
CAVS_MC(put_, 16,3dnow)
CAVS_MC(avg_, 8, 3dnow)
CAVS_MC(avg_, 16,3dnow)
 
static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c,
AVCodecContext *avctx)
{
DSPFUNC(put, 0, 16, 3dnow);
DSPFUNC(put, 1, 8, 3dnow);
DSPFUNC(avg, 0, 16, 3dnow);
DSPFUNC(avg, 1, 8, 3dnow);
}
#endif /* HAVE_AMD3DNOW_INLINE */
 
av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
{
#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
 
if (INLINE_MMX(cpu_flags))
cavsdsp_init_mmx(c, avctx);
#endif /* HAVE_MMX_INLINE */
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags))
cavsdsp_init_mmxext(c, avctx);
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_AMD3DNOW_INLINE
if (INLINE_AMD3DNOW(cpu_flags))
cavsdsp_init_3dnow(c, avctx);
#endif /* HAVE_AMD3DNOW_INLINE */
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/constants.c
0,0 → 1,53
/*
* MMX/SSE constants used across x86 dsp optimizations.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h" // for xmm_reg
#include "constants.h"
 
DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL;
 
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL;
DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL;
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL };
 
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
/contrib/sdk/sources/ffmpeg/libavcodec/x86/constants.h
0,0 → 1,51
/*
* MMX/SSE constants used across x86 dsp optimizations.
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#ifndef AVCODEC_X86_CONSTANTS_H
#define AVCODEC_X86_CONSTANTS_H
 
#include <stdint.h>
 
#include "libavutil/x86/asm.h"
 
extern const uint64_t ff_wtwo;
 
extern const xmm_reg ff_pw_3;
extern const xmm_reg ff_pw_4;
extern const xmm_reg ff_pw_5;
extern const xmm_reg ff_pw_8;
extern const uint64_t ff_pw_15;
extern const xmm_reg ff_pw_16;
extern const xmm_reg ff_pw_18;
extern const uint64_t ff_pw_20;
extern const xmm_reg ff_pw_32;
extern const uint64_t ff_pw_42;
extern const uint64_t ff_pw_53;
extern const xmm_reg ff_pw_64;
extern const uint64_t ff_pw_96;
extern const uint64_t ff_pw_128;
extern const uint64_t ff_pw_255;
 
extern const xmm_reg ff_pb_1;
extern const xmm_reg ff_pb_3;
extern const xmm_reg ff_pb_F8;
extern const uint64_t ff_pb_FC;
 
#endif /* AVCODEC_X86_CONSTANTS_H */
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dct32.asm
0,0 → 1,490
;******************************************************************************
;* 32 point SSE-optimized DCT transform
;* Copyright (c) 2010 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA 32
 
align 32
ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043
dd 0.553104, 0.582935, 0.622504, 0.674808
dd -10.190008, -3.407609, -2.057781, -1.484165
dd -1.169440, -0.972568, -0.839350, -0.744536
dd 0.502419, 0.522499, 0.566944, 0.646822
dd 0.788155, 1.060678, 1.722447, 5.101149
dd 0.509796, 0.601345, 0.899976, 2.562916
dd 0.509796, 0.601345, 0.899976, 2.562916
dd 1.000000, 1.000000, 1.306563, 0.541196
dd 1.000000, 1.000000, 1.306563, 0.541196
dd 1.000000, 0.707107, 1.000000, -0.707107
dd 1.000000, 0.707107, 1.000000, -0.707107
dd 0.707107, 0.707107, 0.707107, 0.707107
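; (the first rows are the pass-1 butterfly scale factors 0.5/cos((2*i+1)*pi/64);
;  the following rows hold the analogous factors for the later, shorter passes,
;  ending with sqrt(1/2) at offset 192 - compare the ps_cos_vec offsets used in
;  the code below)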
 
align 32
ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000
 
%macro BUTTERFLY 4
subps %4, %1, %2
addps %2, %2, %1
mulps %1, %4, %3
%endmacro
 
%macro BUTTERFLY0 5
%if cpuflag(sse2) && notcpuflag(avx)
pshufd %4, %1, %5
xorps %1, %2
addps %1, %4
mulps %1, %3
%else
shufps %4, %1, %1, %5
xorps %1, %1, %2
addps %4, %4, %1
mulps %1, %4, %3
%endif
%endmacro
 
%macro BUTTERFLY2 4
BUTTERFLY0 %1, %2, %3, %4, 0x1b
%endmacro
 
%macro BUTTERFLY3 4
BUTTERFLY0 %1, %2, %3, %4, 0xb1
%endmacro
 
%macro BUTTERFLY3V 5
movaps m%5, m%1
addps m%1, m%2
subps m%5, m%2
SWAP %2, %5
mulps m%2, [ps_cos_vec+192]
movaps m%5, m%3
addps m%3, m%4
subps m%4, m%5
mulps m%4, [ps_cos_vec+192]
%endmacro
 
%macro PASS6_AND_PERMUTE 0
mov tmpd, [outq+4]
movss m7, [outq+72]
addss m7, [outq+76]
movss m3, [outq+56]
addss m3, [outq+60]
addss m4, m3
movss m2, [outq+52]
addss m2, m3
movss m3, [outq+104]
addss m3, [outq+108]
addss m1, m3
addss m5, m4
movss [outq+ 16], m1
movss m1, [outq+100]
addss m1, m3
movss m3, [outq+40]
movss [outq+ 48], m1
addss m3, [outq+44]
movss m1, [outq+100]
addss m4, m3
addss m3, m2
addss m1, [outq+108]
movss [outq+ 40], m3
addss m2, [outq+36]
movss m3, [outq+8]
movss [outq+ 56], m2
addss m3, [outq+12]
movss [outq+ 32], m3
movss m3, [outq+80]
movss [outq+ 8], m5
movss [outq+ 80], m1
movss m2, [outq+52]
movss m5, [outq+120]
addss m5, [outq+124]
movss m1, [outq+64]
addss m2, [outq+60]
addss m0, m5
addss m5, [outq+116]
mov [outq+64], tmpd
addss m6, m0
addss m1, m6
mov tmpd, [outq+12]
mov [outq+ 96], tmpd
movss [outq+ 4], m1
movss m1, [outq+24]
movss [outq+ 24], m4
movss m4, [outq+88]
addss m4, [outq+92]
addss m3, m4
addss m4, [outq+84]
mov tmpd, [outq+108]
addss m1, [outq+28]
addss m0, m1
addss m1, m5
addss m6, m3
addss m3, m0
addss m0, m7
addss m5, [outq+20]
addss m7, m1
movss [outq+ 12], m6
mov [outq+112], tmpd
movss m6, [outq+28]
movss [outq+ 28], m0
movss m0, [outq+36]
movss [outq+ 36], m7
addss m1, m4
movss m7, [outq+116]
addss m0, m2
addss m7, [outq+124]
movss [outq+ 72], m0
movss m0, [outq+44]
addss m2, m0
movss [outq+ 44], m1
movss [outq+ 88], m2
addss m0, [outq+60]
mov tmpd, [outq+60]
mov [outq+120], tmpd
movss [outq+104], m0
addss m4, m5
addss m5, [outq+68]
movss [outq+52], m4
movss [outq+60], m5
movss m4, [outq+68]
movss m5, [outq+20]
movss [outq+ 20], m3
addss m5, m7
addss m7, m6
addss m4, m5
movss m2, [outq+84]
addss m2, [outq+92]
addss m5, m2
movss [outq+ 68], m4
addss m2, m7
movss m4, [outq+76]
movss [outq+ 84], m2
movss [outq+ 76], m5
addss m7, m4
addss m6, [outq+124]
addss m4, m6
addss m6, [outq+92]
movss [outq+100], m4
movss [outq+108], m6
movss m6, [outq+92]
movss [outq+92], m7
addss m6, [outq+124]
movss [outq+116], m6
%endmacro
 
INIT_YMM avx
SECTION_TEXT
%if HAVE_AVX_EXTERNAL
; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in)
cglobal dct32_float, 2,3,8, out, in, tmp
; pass 1
vmovaps m4, [inq+0]
vinsertf128 m5, m5, [inq+96], 1
vinsertf128 m5, m5, [inq+112], 0
vshufps m5, m5, m5, 0x1b
BUTTERFLY m4, m5, [ps_cos_vec], m6
 
vmovaps m2, [inq+64]
vinsertf128 m6, m6, [inq+32], 1
vinsertf128 m6, m6, [inq+48], 0
vshufps m6, m6, m6, 0x1b
BUTTERFLY m2, m6, [ps_cos_vec+32], m0
 
; pass 2
 
BUTTERFLY m5, m6, [ps_cos_vec+64], m0
BUTTERFLY m4, m2, [ps_cos_vec+64], m7
 
 
; pass 3
vperm2f128 m3, m6, m4, 0x31
vperm2f128 m1, m6, m4, 0x20
vshufps m3, m3, m3, 0x1b
 
BUTTERFLY m1, m3, [ps_cos_vec+96], m6
 
 
vperm2f128 m4, m5, m2, 0x20
vperm2f128 m5, m5, m2, 0x31
vshufps m5, m5, m5, 0x1b
 
BUTTERFLY m4, m5, [ps_cos_vec+96], m6
 
; pass 4
vmovaps m6, [ps_p1p1m1m1+0]
vmovaps m2, [ps_cos_vec+128]
 
BUTTERFLY2 m5, m6, m2, m7
BUTTERFLY2 m4, m6, m2, m7
BUTTERFLY2 m1, m6, m2, m7
BUTTERFLY2 m3, m6, m2, m7
 
 
; pass 5
vshufps m6, m6, m6, 0xcc
vmovaps m2, [ps_cos_vec+160]
 
BUTTERFLY3 m5, m6, m2, m7
BUTTERFLY3 m4, m6, m2, m7
BUTTERFLY3 m1, m6, m2, m7
BUTTERFLY3 m3, m6, m2, m7
 
vperm2f128 m6, m3, m3, 0x31
vmovaps [outq], m3
 
vextractf128 [outq+64], m5, 1
vextractf128 [outq+32], m5, 0
 
vextractf128 [outq+80], m4, 1
vextractf128 [outq+48], m4, 0
 
vperm2f128 m0, m1, m1, 0x31
vmovaps [outq+96], m1
 
vzeroupper
 
; pass 6, no SIMD...
INIT_XMM
PASS6_AND_PERMUTE
RET
%endif
 
%if ARCH_X86_64
%define SPILL SWAP
%define UNSPILL SWAP
 
%macro PASS5 0
nop ; FIXME code alignment
SWAP 5, 8
SWAP 4, 12
SWAP 6, 14
SWAP 7, 13
SWAP 0, 15
PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13
TRANSPOSE4x4PS 8, 9, 10, 11, 0
BUTTERFLY3V 8, 9, 10, 11, 0
addps m10, m11
TRANSPOSE4x4PS 12, 13, 14, 15, 0
BUTTERFLY3V 12, 13, 14, 15, 0
addps m14, m15
addps m12, m14
addps m14, m13
addps m13, m15
%endmacro
 
%macro PASS6 0
SWAP 9, 12
SWAP 11, 14
movss [outq+0x00], m8
pshuflw m0, m8, 0xe
movss [outq+0x10], m9
pshuflw m1, m9, 0xe
movss [outq+0x20], m10
pshuflw m2, m10, 0xe
movss [outq+0x30], m11
pshuflw m3, m11, 0xe
movss [outq+0x40], m12
pshuflw m4, m12, 0xe
movss [outq+0x50], m13
pshuflw m5, m13, 0xe
movss [outq+0x60], m14
pshuflw m6, m14, 0xe
movaps [outq+0x70], m15
pshuflw m7, m15, 0xe
addss m0, m1
addss m1, m2
movss [outq+0x08], m0
addss m2, m3
movss [outq+0x18], m1
addss m3, m4
movss [outq+0x28], m2
addss m4, m5
movss [outq+0x38], m3
addss m5, m6
movss [outq+0x48], m4
addss m6, m7
movss [outq+0x58], m5
movss [outq+0x68], m6
movss [outq+0x78], m7
 
PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7
movhlps m0, m1
pshufd m1, m1, 3
SWAP 0, 2, 4, 6, 8, 10, 12, 14
SWAP 1, 3, 5, 7, 9, 11, 13, 15
%rep 7
movhlps m0, m1
pshufd m1, m1, 3
addss m15, m1
SWAP 0, 2, 4, 6, 8, 10, 12, 14
SWAP 1, 3, 5, 7, 9, 11, 13, 15
%endrep
%assign i 4
%rep 15
addss m0, m1
movss [outq+i], m0
SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
%assign i i+8
%endrep
%endmacro
 
%else ; ARCH_X86_32
%macro SPILL 2 ; xmm#, mempos
movaps [outq+(%2-8)*16], m%1
%endmacro
%macro UNSPILL 2
movaps m%1, [outq+(%2-8)*16]
%endmacro
 
%define PASS6 PASS6_AND_PERMUTE
%macro PASS5 0
movaps m2, [ps_cos_vec+160]
shufps m3, m3, 0xcc
 
BUTTERFLY3 m5, m3, m2, m1
SPILL 5, 8
 
UNSPILL 1, 9
BUTTERFLY3 m1, m3, m2, m5
SPILL 1, 14
 
BUTTERFLY3 m4, m3, m2, m5
SPILL 4, 12
 
BUTTERFLY3 m7, m3, m2, m5
SPILL 7, 13
 
UNSPILL 5, 10
BUTTERFLY3 m5, m3, m2, m7
SPILL 5, 10
 
UNSPILL 4, 11
BUTTERFLY3 m4, m3, m2, m7
SPILL 4, 11
 
BUTTERFLY3 m6, m3, m2, m7
SPILL 6, 9
 
BUTTERFLY3 m0, m3, m2, m7
SPILL 0, 15
%endmacro
%endif
 
 
; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in)
%macro DCT32_FUNC 0
cglobal dct32_float, 2, 3, 16, out, in, tmp
; pass 1
 
movaps m0, [inq+0]
LOAD_INV m1, [inq+112]
BUTTERFLY m0, m1, [ps_cos_vec], m3
 
movaps m7, [inq+64]
LOAD_INV m4, [inq+48]
BUTTERFLY m7, m4, [ps_cos_vec+32], m3
 
; pass 2
movaps m2, [ps_cos_vec+64]
BUTTERFLY m1, m4, m2, m3
SPILL 1, 11
SPILL 4, 8
 
; pass 1
movaps m1, [inq+16]
LOAD_INV m6, [inq+96]
BUTTERFLY m1, m6, [ps_cos_vec+16], m3
 
movaps m4, [inq+80]
LOAD_INV m5, [inq+32]
BUTTERFLY m4, m5, [ps_cos_vec+48], m3
 
; pass 2
BUTTERFLY m0, m7, m2, m3
 
movaps m2, [ps_cos_vec+80]
BUTTERFLY m6, m5, m2, m3
 
BUTTERFLY m1, m4, m2, m3
 
; pass 3
movaps m2, [ps_cos_vec+96]
shufps m1, m1, 0x1b
BUTTERFLY m0, m1, m2, m3
SPILL 0, 15
SPILL 1, 14
 
UNSPILL 0, 8
shufps m5, m5, 0x1b
BUTTERFLY m0, m5, m2, m3
 
UNSPILL 1, 11
shufps m6, m6, 0x1b
BUTTERFLY m1, m6, m2, m3
SPILL 1, 11
 
shufps m4, m4, 0x1b
BUTTERFLY m7, m4, m2, m3
 
; pass 4
movaps m3, [ps_p1p1m1m1+0]
movaps m2, [ps_cos_vec+128]
 
BUTTERFLY2 m5, m3, m2, m1
 
BUTTERFLY2 m0, m3, m2, m1
SPILL 0, 9
 
BUTTERFLY2 m6, m3, m2, m1
SPILL 6, 10
 
UNSPILL 0, 11
BUTTERFLY2 m0, m3, m2, m1
SPILL 0, 11
 
BUTTERFLY2 m4, m3, m2, m1
 
BUTTERFLY2 m7, m3, m2, m1
 
UNSPILL 6, 14
BUTTERFLY2 m6, m3, m2, m1
 
UNSPILL 0, 15
BUTTERFLY2 m0, m3, m2, m1
 
PASS5
PASS6
RET
%endmacro
 
%macro LOAD_INV 2
%if cpuflag(sse2)
pshufd %1, %2, 0x1b
%elif cpuflag(sse)
movaps %1, %2
shufps %1, %1, 0x1b
%endif
%endmacro
 
INIT_XMM sse
DCT32_FUNC
INIT_XMM sse2
DCT32_FUNC
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dct_init.c
0,0 → 1,39
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dct.h"
 
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in);
void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in);
void ff_dct32_float_avx(FFTSample *out, const FFTSample *in);
 
av_cold void ff_dct_init_x86(DCTContext *s)
{
int cpu_flags = av_get_cpu_flags();
 
if (EXTERNAL_SSE(cpu_flags))
s->dct32 = ff_dct32_float_sse;
if (EXTERNAL_SSE2(cpu_flags))
s->dct32 = ff_dct32_float_sse2;
if (EXTERNAL_AVX(cpu_flags))
s->dct32 = ff_dct32_float_avx;
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/deinterlace.asm
0,0 → 1,82
;******************************************************************************
;* MMX optimized deinterlacing functions
;* Copyright (c) 2010 Vitor Sessak
;* Copyright (c) 2002 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
cextern pw_4
 
SECTION .text
 
%macro DEINTERLACE 1
%ifidn %1, inplace
;void ff_deinterlace_line_inplace_mmx(const uint8_t *lum_m4, const uint8_t *lum_m3, const uint8_t *lum_m2, const uint8_t *lum_m1, const uint8_t *lum, int size)
cglobal deinterlace_line_inplace_mmx, 6,6,7, lum_m4, lum_m3, lum_m2, lum_m1, lum, size
%else
;void ff_deinterlace_line_mmx(uint8_t *dst, const uint8_t *lum_m4, const uint8_t *lum_m3, const uint8_t *lum_m2, const uint8_t *lum_m1, const uint8_t *lum, int size)
cglobal deinterlace_line_mmx, 7,7,7, dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size
%endif
pxor mm7, mm7
movq mm6, [pw_4]
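; each output = clamp_u8((4*(lum_m3 + lum_m1) + 2*lum_m2 + 4 - (lum_m4 + lum)) >> 3),
; i.e. the (-1,4,2,4,-1)/8 vertical filter, 4 pixels per iteration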
.nextrow:
movd mm0, [lum_m4q]
movd mm1, [lum_m3q]
movd mm2, [lum_m2q]
%ifidn %1, inplace
movd [lum_m4q], mm2
%endif
movd mm3, [lum_m1q]
movd mm4, [lumq]
punpcklbw mm0, mm7
punpcklbw mm1, mm7
punpcklbw mm2, mm7
punpcklbw mm3, mm7
punpcklbw mm4, mm7
paddw mm1, mm3
psllw mm2, 1
paddw mm0, mm4
psllw mm1, 2
paddw mm2, mm6
paddw mm1, mm2
psubusw mm1, mm0
psrlw mm1, 3
packuswb mm1, mm7
%ifidn %1, inplace
movd [lum_m2q], mm1
%else
movd [dstq], mm1
add dstq, 4
%endif
add lum_m4q, 4
add lum_m3q, 4
add lum_m2q, 4
add lum_m1q, 4
add lumq, 4
sub sized, 4
jg .nextrow
REP_RET
%endmacro
 
DEINTERLACE ""
 
DEINTERLACE inplace
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dirac_dwt.c
0,0 → 1,202
/*
* MMX optimized discrete wavelet transform
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
* Copyright (c) 2010 David Conrad
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/x86/asm.h"
#include "dsputil_x86.h"
#include "dirac_dwt.h"
 
#define COMPOSE_VERTICAL(ext, align) \
void ff_vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
void ff_vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \
void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \
void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \
void ff_horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
void ff_horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\
\
static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
{ \
int i, width_align = width&~(align-1); \
\
for(i=width_align; i<width; i++) \
b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \
\
ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \
{ \
int i, width_align = width&~(align-1); \
\
for(i=width_align; i<width; i++) \
b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \
\
ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \
} \
\
static void vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
IDWTELEM *b3, IDWTELEM *b4, int width) \
{ \
int i, width_align = width&~(align-1); \
\
for(i=width_align; i<width; i++) \
b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \
} \
\
static void vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \
IDWTELEM *b3, IDWTELEM *b4, int width) \
{ \
int i, width_align = width&~(align-1); \
\
for(i=width_align; i<width; i++) \
b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \
\
ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \
} \
static void vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width) \
{ \
int i, width_align = width&~(align-1); \
\
for(i=width_align; i<width; i++) { \
b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \
b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \
} \
\
ff_vertical_compose_haar##ext(b0, b1, width_align); \
} \
static void horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
{\
int w2= w>>1;\
int x= w2 - (w2&(align-1));\
ff_horizontal_compose_haar0i##ext(b, tmp, w);\
\
for (; x < w2; x++) {\
b[2*x ] = tmp[x];\
b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\
}\
}\
static void horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\
{\
int w2= w>>1;\
int x= w2 - (w2&(align-1));\
ff_horizontal_compose_haar1i##ext(b, tmp, w);\
\
for (; x < w2; x++) {\
b[2*x ] = (tmp[x] + 1)>>1;\
b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\
}\
}\
\
 
#if HAVE_YASM
#if !ARCH_X86_64
COMPOSE_VERTICAL(_mmx, 4)
#endif
COMPOSE_VERTICAL(_sse2, 8)
 
 
void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w);
 
static void horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w)
{
int w2= w>>1;
int x= w2 - (w2&7);
ff_horizontal_compose_dd97i_ssse3(b, tmp, w);
 
for (; x < w2; x++) {
b[2*x ] = (tmp[x] + 1)>>1;
b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1;
}
}
#endif
 
void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type)
{
#if HAVE_YASM
int mm_flags = av_get_cpu_flags();
 
#if !ARCH_X86_64
if (!(mm_flags & AV_CPU_FLAG_MMX))
return;
 
switch (type) {
case DWT_DIRAC_DD9_7:
d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
break;
case DWT_DIRAC_LEGALL5_3:
d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx;
d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx;
break;
case DWT_DIRAC_DD13_7:
d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx;
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx;
break;
case DWT_DIRAC_HAAR0:
d->vertical_compose = (void*)vertical_compose_haar_mmx;
d->horizontal_compose = horizontal_compose_haar0i_mmx;
break;
case DWT_DIRAC_HAAR1:
d->vertical_compose = (void*)vertical_compose_haar_mmx;
d->horizontal_compose = horizontal_compose_haar1i_mmx;
break;
}
#endif
 
if (!(mm_flags & AV_CPU_FLAG_SSE2))
return;
 
switch (type) {
case DWT_DIRAC_DD9_7:
d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
break;
case DWT_DIRAC_LEGALL5_3:
d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2;
d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2;
break;
case DWT_DIRAC_DD13_7:
d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2;
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2;
break;
case DWT_DIRAC_HAAR0:
d->vertical_compose = (void*)vertical_compose_haar_sse2;
d->horizontal_compose = horizontal_compose_haar0i_sse2;
break;
case DWT_DIRAC_HAAR1:
d->vertical_compose = (void*)vertical_compose_haar_sse2;
d->horizontal_compose = horizontal_compose_haar1i_sse2;
break;
}
 
if (!(mm_flags & AV_CPU_FLAG_SSSE3))
return;
 
switch (type) {
case DWT_DIRAC_DD9_7:
d->horizontal_compose = horizontal_compose_dd97i_ssse3;
break;
}
#endif // HAVE_YASM
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dirac_dwt.h
0,0 → 1,30
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#ifndef AVCODEC_X86_DIRAC_DWT_H
#define AVCODEC_X86_DIRAC_DWT_H
 
#include "libavcodec/dirac_dwt.h"
 
void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
void ff_horizontal_compose_haar1i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
void ff_horizontal_compose_haar0i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x);
 
void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type);
 
#endif
/contrib/sdk/sources/ffmpeg/libavcodec/x86/diracdsp_mmx.c
0,0 → 1,104
/*
* Copyright (C) 2010 David Conrad
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "dsputil_x86.h"
#include "diracdsp_mmx.h"
 
void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height);
 
#define HPEL_FILTER(MMSIZE, EXT) \
void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \
void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int); \
\
static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \
const uint8_t *src, int stride, int width, int height) \
{ \
while( height-- ) \
{ \
ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \
ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width); \
ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width); \
\
dsth += stride; \
dstv += stride; \
dstc += stride; \
src += stride; \
} \
}
 
#if !ARCH_X86_64
HPEL_FILTER(8, mmx)
#endif
HPEL_FILTER(16, sse2)
 
#define PIXFUNC(PFX, IDX, EXT) \
/*MMXDISABLEDc->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT;*/ \
c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \
c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT
 
void ff_diracdsp_init_mmx(DiracDSPContext* c)
{
int mm_flags = av_get_cpu_flags();
 
if (!(mm_flags & AV_CPU_FLAG_MMX))
return;
 
#if HAVE_YASM
c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx;
#if !ARCH_X86_64
c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx;
c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx;
c->dirac_hpel_filter = dirac_hpel_filter_mmx;
c->add_rect_clamped = ff_add_rect_clamped_mmx;
c->put_signed_rect_clamped = ff_put_signed_rect_clamped_mmx;
#endif
#endif
 
#if HAVE_MMX_INLINE
PIXFUNC(put, 0, mmx);
PIXFUNC(avg, 0, mmx);
#endif
 
#if HAVE_MMXEXT_INLINE
if (mm_flags & AV_CPU_FLAG_MMX2) {
PIXFUNC(avg, 0, mmxext);
}
#endif
 
if (mm_flags & AV_CPU_FLAG_SSE2) {
#if HAVE_YASM
c->dirac_hpel_filter = dirac_hpel_filter_sse2;
c->add_rect_clamped = ff_add_rect_clamped_sse2;
c->put_signed_rect_clamped = ff_put_signed_rect_clamped_sse2;
 
c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2;
c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2;
#endif
#if HAVE_SSE2_INLINE
c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2;
c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2;
c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2;
c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2;
#endif
}
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/diracdsp_mmx.h
0,0 → 1,47
/*
* Copyright (c) 2010 David Conrad
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#ifndef AVCODEC_X86_DIRACDSP_H
#define AVCODEC_X86_DIRACDSP_H
 
#include "libavcodec/diracdsp.h"
 
void ff_diracdsp_init_mmx(DiracDSPContext* c);
 
DECL_DIRAC_PIXOP(put, mmx);
DECL_DIRAC_PIXOP(avg, mmx);
DECL_DIRAC_PIXOP(avg, mmxext);
 
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h);
 
void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int);
 
void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
 
void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen);
 
#endif
/contrib/sdk/sources/ffmpeg/libavcodec/x86/diracdsp_yasm.asm
0,0 → 1,264
;******************************************************************************
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
pw_3: times 8 dw 3
pw_7: times 8 dw 7
pw_16: times 8 dw 16
pw_32: times 8 dw 32
pb_128: times 16 db 128
 
section .text
 
%macro UNPACK_ADD 6
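; %1/%2 = low/high words of (%3 + %4), bytes zero-extended via m7 (must be 0);
; %5/%6 select aligned (a) or unaligned (u) loads; clobbers m4/m5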
mov%5 %1, %3
mov%6 m5, %4
mova m4, %1
mova %2, m5
punpcklbw %1, m7
punpcklbw m5, m7
punpckhbw m4, m7
punpckhbw %2, m7
paddw %1, m5
paddw %2, m4
%endmacro
 
%macro HPEL_FILTER 1
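; 8-tap Dirac half-pel filter:
; out = clamp_u8((21*(s[0]+s[1]) + 3*(s[-2]+s[3]) - 7*(s[-1]+s[2]) - (s[-3]+s[4]) + 16) >> 5)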
; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width);
cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3
mov src0q, srcq
lea stridex3q, [3*strideq]
sub src0q, stridex3q
pxor m7, m7
.loop:
; 7*(src[0] + src[1])
UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a
pmullw m0, [pw_7]
pmullw m1, [pw_7]
 
; 3*( ... + src[-2] + src[3])
UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a
paddw m0, m2
paddw m1, m3
pmullw m0, [pw_3]
pmullw m1, [pw_3]
 
; ... - 7*(src[-1] + src[2])
UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a
pmullw m2, [pw_7]
pmullw m3, [pw_7]
psubw m0, m2
psubw m1, m3
 
; ... - (src[-3] + src[4])
UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a
psubw m0, m2
psubw m1, m3
 
paddw m0, [pw_16]
paddw m1, [pw_16]
psraw m0, 5
psraw m1, 5
packuswb m0, m1
mova [dstq], m0
add dstq, mmsize
add srcq, mmsize
add src0q, mmsize
sub widthd, mmsize
jg .loop
RET
 
; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width);
cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width
dec widthd
pxor m7, m7
and widthd, ~(mmsize-1)
.loop:
; 7*(src[0] + src[1])
UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u
pmullw m0, [pw_7]
pmullw m1, [pw_7]
 
; 3*( ... + src[-2] + src[3])
UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u
paddw m0, m2
paddw m1, m3
pmullw m0, [pw_3]
pmullw m1, [pw_3]
 
; ... - 7*(src[-1] + src[2])
UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u
pmullw m2, [pw_7]
pmullw m3, [pw_7]
psubw m0, m2
psubw m1, m3
 
; ... - (src[-3] + src[4])
UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u
psubw m0, m2
psubw m1, m3
 
paddw m0, [pw_16]
paddw m1, [pw_16]
psraw m0, 5
psraw m1, 5
packuswb m0, m1
mova [dstq + widthq], m0
sub widthd, mmsize
jge .loop
RET
%endmacro
 
%macro PUT_RECT 1
; void put_signed_rect_clamped(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height)
cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2
mova m0, [pb_128]
add wd, (mmsize-1)
and wd, ~(mmsize-1)
 
%if ARCH_X86_64
movsxd dst_strideq, dst_strided
movsxd src_strideq, src_strided
mov r7d, r5m
mov r8d, wd
%define wspill r8d
%define hd r7d
%else
mov r4m, wd
%define wspill r4m
%define hd r5mp
%endif
 
.loopy:
lea src2q, [srcq+src_strideq*2]
lea dst2q, [dstq+dst_strideq]
.loopx:
sub wd, mmsize
mova m1, [srcq +2*wq]
mova m2, [src2q+2*wq]
packsswb m1, [srcq +2*wq+mmsize]
packsswb m2, [src2q+2*wq+mmsize]
paddb m1, m0
paddb m2, m0
mova [dstq +wq], m1
mova [dst2q+wq], m2
jg .loopx
 
lea srcq, [srcq+src_strideq*4]
lea dstq, [dstq+dst_strideq*2]
sub hd, 2
mov wd, wspill
jg .loopy
RET
%endm
 
%macro ADD_RECT 1
; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height)
cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h
mova m0, [pw_32]
add wd, (mmsize-1)
and wd, ~(mmsize-1)
 
%if ARCH_X86_64
movsxd strideq, strided
movsxd idwt_strideq, idwt_strided
mov r8d, wd
%define wspill r8d
%else
mov r5m, wd
%define wspill r5m
%endif
 
.loop:
sub wd, mmsize
movu m1, [srcq +2*wq] ; FIXME: ensure alignment
paddw m1, m0
psraw m1, 6
movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment
paddw m2, m0
psraw m2, 6
paddw m1, [idwtq+2*wq]
paddw m2, [idwtq+2*wq+mmsize]
packuswb m1, m2
mova [dstq +wq], m1
jg .loop
 
lea srcq, [srcq + 2*strideq]
add dstq, strideq
lea idwtq, [idwtq+ 2*idwt_strideq]
sub hd, 1
mov wd, wspill
jg .loop
RET
%endm
 
%macro ADD_OBMC 2
; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen)
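; dst[i] += src[i] * obmc_weight[i] for %1 pixels per row (16-bit dst, 8-bit src/weights);
; the weight row stride is fixed at 32 bytes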
cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen
pxor m4, m4
.loop:
%assign i 0
%rep %1 / mmsize
mova m0, [srcq+i]
mova m1, m0
punpcklbw m0, m4
punpckhbw m1, m4
mova m2, [obmcq+i]
mova m3, m2
punpcklbw m2, m4
punpckhbw m3, m4
pmullw m0, m2
pmullw m1, m3
movu m2, [dstq+2*i]
movu m3, [dstq+2*i+mmsize]
paddw m0, m2
paddw m1, m3
movu [dstq+2*i], m0
movu [dstq+2*i+mmsize], m1
%assign i i+mmsize
%endrep
lea srcq, [srcq+strideq]
lea dstq, [dstq+2*strideq]
add obmcq, 32
sub yblend, 1
jg .loop
RET
%endm
 
INIT_MMX
%if ARCH_X86_64 == 0
PUT_RECT mmx
ADD_RECT mmx
 
HPEL_FILTER mmx
ADD_OBMC 32, mmx
ADD_OBMC 16, mmx
%endif
ADD_OBMC 8, mmx
 
INIT_XMM
PUT_RECT sse2
ADD_RECT sse2
 
HPEL_FILTER sse2
ADD_OBMC 32, sse2
ADD_OBMC 16, sse2
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dnxhdenc.c
0,0 → 1,67
/*
* VC3/DNxHD SIMD functions
* Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com>
*
* VC-3 encoder funded by the British Broadcasting Corporation
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/attributes.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dnxhdenc.h"
 
#if HAVE_SSE2_INLINE
 
static void get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels, int line_size)
{
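/* Load 4 rows of 8 pixels, widen them to 16 bits and store them as rows 0-3,
 * then mirror them as rows 4-7 (3,2,1,0), giving a vertically symmetric 8x8 block. */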
__asm__ volatile(
"pxor %%xmm5, %%xmm5 \n\t"
"movq (%0), %%xmm0 \n\t"
"add %2, %0 \n\t"
"movq (%0), %%xmm1 \n\t"
"movq (%0, %2), %%xmm2 \n\t"
"movq (%0, %2,2), %%xmm3 \n\t"
"punpcklbw %%xmm5, %%xmm0 \n\t"
"punpcklbw %%xmm5, %%xmm1 \n\t"
"punpcklbw %%xmm5, %%xmm2 \n\t"
"punpcklbw %%xmm5, %%xmm3 \n\t"
"movdqa %%xmm0, (%1) \n\t"
"movdqa %%xmm1, 16(%1) \n\t"
"movdqa %%xmm2, 32(%1) \n\t"
"movdqa %%xmm3, 48(%1) \n\t"
"movdqa %%xmm3 , 64(%1) \n\t"
"movdqa %%xmm2 , 80(%1) \n\t"
"movdqa %%xmm1 , 96(%1) \n\t"
"movdqa %%xmm0, 112(%1) \n\t"
: "+r" (pixels)
: "r" (block), "r" ((x86_reg)line_size)
);
}
 
#endif /* HAVE_SSE2_INLINE */
 
av_cold void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx)
{
#if HAVE_SSE2_INLINE
if (INLINE_SSE2(av_get_cpu_flags())) {
if (ctx->cid_table->bit_depth == 8)
ctx->get_pixels_8x4_sym = get_pixels_8x4_sym_sse2;
}
#endif /* HAVE_SSE2_INLINE */
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dsputil.asm
0,0 → 1,653
;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
pb_f: times 16 db 15
pb_zzzzzzzz77777777: times 8 db -1
pb_7: times 8 db 7
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13
pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0
pd_16384: times 4 dd 16384
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12
 
SECTION_TEXT
 
%macro SCALARPRODUCT 0
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order)
cglobal scalarproduct_int16, 3,3,3, v1, v2, order
shl orderq, 1
add v1q, orderq
add v2q, orderq
neg orderq
pxor m2, m2
.loop:
movu m0, [v1q + orderq]
movu m1, [v1q + orderq + mmsize]
pmaddwd m0, [v2q + orderq]
pmaddwd m1, [v2q + orderq + mmsize]
paddd m2, m0
paddd m2, m1
add orderq, mmsize*2
jl .loop
%if mmsize == 16
movhlps m0, m2
paddd m2, m0
pshuflw m0, m2, 0x4e
%else
pshufw m0, m2, 0x4e
%endif
paddd m2, m0
movd eax, m2
RET
 
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul
shl orderq, 1
movd m7, mulm
%if mmsize == 16
pshuflw m7, m7, 0
punpcklqdq m7, m7
%else
pshufw m7, m7, 0
%endif
pxor m6, m6
add v1q, orderq
add v2q, orderq
add v3q, orderq
neg orderq
.loop:
movu m0, [v2q + orderq]
movu m1, [v2q + orderq + mmsize]
mova m4, [v1q + orderq]
mova m5, [v1q + orderq + mmsize]
movu m2, [v3q + orderq]
movu m3, [v3q + orderq + mmsize]
pmaddwd m0, m4
pmaddwd m1, m5
pmullw m2, m7
pmullw m3, m7
paddd m6, m0
paddd m6, m1
paddw m2, m4
paddw m3, m5
mova [v1q + orderq], m2
mova [v1q + orderq + mmsize], m3
add orderq, mmsize*2
jl .loop
%if mmsize == 16
movhlps m0, m6
paddd m6, m0
pshuflw m0, m6, 0x4e
%else
pshufw m0, m6, 0x4e
%endif
paddd m6, m0
movd eax, m6
RET
%endmacro
 
INIT_MMX mmxext
SCALARPRODUCT
INIT_XMM sse2
SCALARPRODUCT
 
%macro SCALARPRODUCT_LOOP 1
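; %1 = byte misalignment of v2/v3 within a 16-byte line; aligned loads are
; stitched back together with palignr so no unaligned loads are needed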
align 16
.loop%1:
sub orderq, mmsize*2
%if %1
mova m1, m4
mova m4, [v2q + orderq]
mova m0, [v2q + orderq + mmsize]
palignr m1, m0, %1
palignr m0, m4, %1
mova m3, m5
mova m5, [v3q + orderq]
mova m2, [v3q + orderq + mmsize]
palignr m3, m2, %1
palignr m2, m5, %1
%else
mova m0, [v2q + orderq]
mova m1, [v2q + orderq + mmsize]
mova m2, [v3q + orderq]
mova m3, [v3q + orderq + mmsize]
%endif
%define t0 [v1q + orderq]
%define t1 [v1q + orderq + mmsize]
%if ARCH_X86_64
mova m8, t0
mova m9, t1
%define t0 m8
%define t1 m9
%endif
pmaddwd m0, t0
pmaddwd m1, t1
pmullw m2, m7
pmullw m3, m7
paddw m2, t0
paddw m3, t1
paddd m6, m0
paddd m6, m1
mova [v1q + orderq], m2
mova [v1q + orderq + mmsize], m3
jg .loop%1
%if %1
jmp .end
%endif
%endmacro
 
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul)
INIT_XMM ssse3
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul
shl orderq, 1
movd m7, mulm
pshuflw m7, m7, 0
punpcklqdq m7, m7
pxor m6, m6
mov r4d, v2d
and r4d, 15
and v2q, ~15
and v3q, ~15
mova m4, [v2q + orderq]
mova m5, [v3q + orderq]
; linear is faster than branch tree or jump table, because the branches taken are cyclic (i.e. predictable)
cmp r4d, 0
je .loop0
cmp r4d, 2
je .loop2
cmp r4d, 4
je .loop4
cmp r4d, 6
je .loop6
cmp r4d, 8
je .loop8
cmp r4d, 10
je .loop10
cmp r4d, 12
je .loop12
SCALARPRODUCT_LOOP 14
SCALARPRODUCT_LOOP 12
SCALARPRODUCT_LOOP 10
SCALARPRODUCT_LOOP 8
SCALARPRODUCT_LOOP 6
SCALARPRODUCT_LOOP 4
SCALARPRODUCT_LOOP 2
SCALARPRODUCT_LOOP 0
.end:
movhlps m0, m6
paddd m6, m0
pshuflw m0, m6, 0x4e
paddd m6, m0
movd eax, m6
RET
 
 
;-----------------------------------------------------------------------------
; void ff_apply_window_int16(int16_t *output, const int16_t *input,
; const int16_t *window, unsigned int len)
;-----------------------------------------------------------------------------
 
%macro REVERSE_WORDS 1-2
%if cpuflag(ssse3) && notcpuflag(atom)
pshufb %1, %2
%elif cpuflag(sse2)
pshuflw %1, %1, 0x1B
pshufhw %1, %1, 0x1B
pshufd %1, %1, 0x4E
%elif cpuflag(mmxext)
pshufw %1, %1, 0x1B
%endif
%endmacro
 
%macro MUL16FIXED 3
%if cpuflag(ssse3) ; dst, src, unused
; dst = ((dst * src) + (1<<14)) >> 15
pmulhrsw %1, %2
%elif cpuflag(mmxext) ; dst, src, temp
; dst = (dst * src) >> 15
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back
; in from the pmullw result.
mova %3, %1
pmulhw %1, %2
pmullw %3, %2
psrlw %3, 15
psllw %1, 1
por %1, %3
%endif
%endmacro
 
%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version
%if %1
cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2
%else
cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2
%endif
lea offset2q, [offsetq-mmsize]
%if cpuflag(ssse3) && notcpuflag(atom)
mova m5, [pb_revwords]
ALIGN 16
%elif %1
mova m5, [pd_16384]
%endif
.loop:
%if cpuflag(ssse3)
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The ssse3 version is bit-identical.
mova m0, [windowq+offset2q]
mova m1, [ inputq+offset2q]
pmulhrsw m1, m0
REVERSE_WORDS m0, m5
pmulhrsw m0, [ inputq+offsetq ]
mova [outputq+offset2q], m1
mova [outputq+offsetq ], m0
%elif %1
; This version expands 16-bit to 32-bit, multiplies by the window,
; adds 16384 for rounding, right shifts 15, then repacks back to words to
; save to the output. The window is reversed for the second half.
mova m3, [windowq+offset2q]
mova m4, [ inputq+offset2q]
pxor m0, m0
punpcklwd m0, m3
punpcklwd m1, m4
pmaddwd m0, m1
paddd m0, m5
psrad m0, 15
pxor m2, m2
punpckhwd m2, m3
punpckhwd m1, m4
pmaddwd m2, m1
paddd m2, m5
psrad m2, 15
packssdw m0, m2
mova [outputq+offset2q], m0
REVERSE_WORDS m3
mova m4, [ inputq+offsetq]
pxor m0, m0
punpcklwd m0, m3
punpcklwd m1, m4
pmaddwd m0, m1
paddd m0, m5
psrad m0, 15
pxor m2, m2
punpckhwd m2, m3
punpckhwd m1, m4
pmaddwd m2, m1
paddd m2, m5
psrad m2, 15
packssdw m0, m2
mova [outputq+offsetq], m0
%else
; This version does the 16x16->16 multiplication in-place without expanding
; to 32-bit. The mmxext and sse2 versions do not use rounding, and
; therefore are not bit-identical to the C version.
mova m0, [windowq+offset2q]
mova m1, [ inputq+offset2q]
mova m2, [ inputq+offsetq ]
MUL16FIXED m1, m0, m3
REVERSE_WORDS m0
MUL16FIXED m2, m0, m3
mova [outputq+offset2q], m1
mova [outputq+offsetq ], m2
%endif
add offsetd, mmsize
sub offset2d, mmsize
jae .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
APPLY_WINDOW_INT16 0
INIT_XMM sse2
APPLY_WINDOW_INT16 0
 
INIT_MMX mmxext
APPLY_WINDOW_INT16 1
INIT_XMM sse2
APPLY_WINDOW_INT16 1
INIT_XMM ssse3
APPLY_WINDOW_INT16 1
INIT_XMM ssse3, atom
APPLY_WINDOW_INT16 1
 
 
; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top)
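; per byte: dst[i] = median(left, top[i], left + top[i] - topleft) + diff[i],
; processed 8 pixels at a time with left/topleft carried between iterations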
INIT_MMX mmxext
cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top
movq mm0, [topq]
movq mm2, mm0
movd mm4, [left_topq]
psllq mm2, 8
movq mm1, mm0
por mm4, mm2
movd mm3, [leftq]
psubb mm0, mm4 ; t-tl
add dstq, wq
add topq, wq
add diffq, wq
neg wq
jmp .skip
.loop:
movq mm4, [topq+wq]
movq mm0, mm4
psllq mm4, 8
por mm4, mm1
movq mm1, mm0 ; t
psubb mm0, mm4 ; t-tl
.skip:
movq mm2, [diffq+wq]
%assign i 0
%rep 8
movq mm4, mm0
paddb mm4, mm3 ; t-tl+l
movq mm5, mm3
pmaxub mm3, mm1
pminub mm5, mm1
pminub mm3, mm4
pmaxub mm3, mm5 ; median
paddb mm3, mm2 ; +residual
%if i==0
movq mm7, mm3
psllq mm7, 56
%else
movq mm6, mm3
psrlq mm7, 8
psllq mm6, 56
por mm7, mm6
%endif
%if i<7
psrlq mm0, 8
psrlq mm1, 8
psrlq mm2, 8
%endif
%assign i i+1
%endrep
movq [dstq+wq], mm7
add wq, 8
jl .loop
movzx r2d, byte [dstq-1]
mov [leftq], r2d
movzx r2d, byte [topq-1]
mov [left_topq], r2d
RET
 
 
%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned
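; byte-wise prefix sum over each vector in log2(mmsize) shift/shuffle + add
; steps, then the running left value (broadcast in m0) is added on top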
add srcq, wq
add dstq, wq
neg wq
%%.loop:
%if %2
mova m1, [srcq+wq]
%else
movu m1, [srcq+wq]
%endif
mova m2, m1
psllw m1, 8
paddb m1, m2
mova m2, m1
pshufb m1, m3
paddb m1, m2
pshufb m0, m5
mova m2, m1
pshufb m1, m4
paddb m1, m2
%if mmsize == 16
mova m2, m1
pshufb m1, m6
paddb m1, m2
%endif
paddb m0, m1
%if %1
mova [dstq+wq], m0
%else
movq [dstq+wq], m0
movhps [dstq+wq+8], m0
%endif
add wq, mmsize
jl %%.loop
mov eax, mmsize-1
sub eax, wd
movd m1, eax
pshufb m0, m1
movd eax, m0
RET
%endmacro
 
; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left)
INIT_MMX ssse3
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
.skip_prologue:
mova m5, [pb_7]
mova m4, [pb_zzzz3333zzzzbbbb]
mova m3, [pb_zz11zz55zz99zzdd]
movd m0, leftm
psllq m0, 56
ADD_HFYU_LEFT_LOOP 1, 1
 
INIT_XMM sse4
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left
mova m5, [pb_f]
mova m6, [pb_zzzzzzzz77777777]
mova m4, [pb_zzzz3333zzzzbbbb]
mova m3, [pb_zz11zz55zz99zzdd]
movd m0, leftm
pslldq m0, 15
test srcq, 15
jnz .src_unaligned
test dstq, 15
jnz .dst_unaligned
ADD_HFYU_LEFT_LOOP 1, 1
.dst_unaligned:
ADD_HFYU_LEFT_LOOP 0, 1
.src_unaligned:
ADD_HFYU_LEFT_LOOP 0, 0
 
;-----------------------------------------------------------------------------
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min,
; int32_t max, unsigned int len)
;-----------------------------------------------------------------------------
 
; %1 = number of xmm registers used
; %2 = number of inline load/process/store loops per asm loop
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2)
; %5 = suffix
%macro VECTOR_CLIP_INT32 4-5
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len
%if %4
cvtsi2ss m4, minm
cvtsi2ss m5, maxm
%else
movd m4, minm
movd m5, maxm
%endif
SPLATD m4
SPLATD m5
.loop:
%assign %%i 1
%rep %2
mova m0, [srcq+mmsize*0*%%i]
mova m1, [srcq+mmsize*1*%%i]
mova m2, [srcq+mmsize*2*%%i]
mova m3, [srcq+mmsize*3*%%i]
%if %3
mova m7, [srcq+mmsize*4*%%i]
mova m8, [srcq+mmsize*5*%%i]
mova m9, [srcq+mmsize*6*%%i]
mova m10, [srcq+mmsize*7*%%i]
%endif
CLIPD m0, m4, m5, m6
CLIPD m1, m4, m5, m6
CLIPD m2, m4, m5, m6
CLIPD m3, m4, m5, m6
%if %3
CLIPD m7, m4, m5, m6
CLIPD m8, m4, m5, m6
CLIPD m9, m4, m5, m6
CLIPD m10, m4, m5, m6
%endif
mova [dstq+mmsize*0*%%i], m0
mova [dstq+mmsize*1*%%i], m1
mova [dstq+mmsize*2*%%i], m2
mova [dstq+mmsize*3*%%i], m3
%if %3
mova [dstq+mmsize*4*%%i], m7
mova [dstq+mmsize*5*%%i], m8
mova [dstq+mmsize*6*%%i], m9
mova [dstq+mmsize*7*%%i], m10
%endif
%assign %%i %%i+1
%endrep
add srcq, mmsize*4*(%2+%3)
add dstq, mmsize*4*(%2+%3)
sub lend, mmsize*(%2+%3)
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmx
%define CLIPD CLIPD_MMX
VECTOR_CLIP_INT32 0, 1, 0, 0
INIT_XMM sse2
VECTOR_CLIP_INT32 6, 1, 0, 0, _int
%define CLIPD CLIPD_SSE2
VECTOR_CLIP_INT32 6, 2, 0, 1
INIT_XMM sse4
%define CLIPD CLIPD_SSE41
%ifdef m8
VECTOR_CLIP_INT32 11, 1, 1, 0
%else
VECTOR_CLIP_INT32 6, 1, 0, 0
%endif
 
; %1 = aligned/unaligned
%macro BSWAP_LOOPS 1
mov r3, r2
sar r2, 3
jz .left4_%1
.loop8_%1:
mov%1 m0, [r1 + 0]
mov%1 m1, [r1 + 16]
%if cpuflag(ssse3)
pshufb m0, m2
pshufb m1, m2
mov%1 [r0 + 0], m0
mov%1 [r0 + 16], m1
%else
pshuflw m0, m0, 10110001b
pshuflw m1, m1, 10110001b
pshufhw m0, m0, 10110001b
pshufhw m1, m1, 10110001b
mova m2, m0
mova m3, m1
psllw m0, 8
psllw m1, 8
psrlw m2, 8
psrlw m3, 8
por m2, m0
por m3, m1
mov%1 [r0 + 0], m2
mov%1 [r0 + 16], m3
%endif
add r0, 32
add r1, 32
dec r2
jnz .loop8_%1
.left4_%1:
mov r2, r3
and r3, 4
jz .left
mov%1 m0, [r1]
%if cpuflag(ssse3)
pshufb m0, m2
mov%1 [r0], m0
%else
pshuflw m0, m0, 10110001b
pshufhw m0, m0, 10110001b
mova m2, m0
psllw m0, 8
psrlw m2, 8
por m2, m0
mov%1 [r0], m2
%endif
add r1, 16
add r0, 16
%endmacro
 
; void bswap_buf(uint32_t *dst, const uint32_t *src, int w);
%macro BSWAP32_BUF 0
%if cpuflag(ssse3)
cglobal bswap32_buf, 3,4,3
mov r3, r1
mova m2, [pb_bswap32]
%else
cglobal bswap32_buf, 3,4,5
mov r3, r1
%endif
or r3, r0
and r3, 15
jz .start_align
BSWAP_LOOPS u
jmp .left
.start_align:
BSWAP_LOOPS a
.left:
%if cpuflag(ssse3)
mov r3, r2
and r2, 2
jz .left1
movq m0, [r1]
pshufb m0, m2
movq [r0], m0
add r1, 8
add r0, 8
.left1:
and r3, 1
jz .end
mov r2d, [r1]
bswap r2d
mov [r0], r2d
%else
and r2, 3
jz .end
.loop2:
mov r3d, [r1]
bswap r3d
mov [r0], r3d
add r1, 4
add r0, 4
dec r2
jnz .loop2
%endif
.end:
RET
%endmacro
 
INIT_XMM sse2
BSWAP32_BUF
 
INIT_XMM ssse3
BSWAP32_BUF
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dsputil_init.c
0,0 → 1,733
/*
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/simple_idct.h"
#include "dsputil_x86.h"
#include "idct_xvid.h"
 
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1,
uint8_t *src2, int dstStride,
int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride,
int h);
void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride, int h);
void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride, int h);
void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride,
int h);
void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src,
int dstStride, int srcStride);
#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext
#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext
 
void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale);
void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale);
 
int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2,
int order);
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2,
int order);
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2,
const int16_t *v3,
int order, int mul);
 
void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input,
const int16_t *window, unsigned int len);
 
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w);
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w);
 
void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
int *left, int *left_top);
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src,
int w, int left);
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src,
int w, int left);
 
void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src,
int32_t min, int32_t max, unsigned int len);
 
#if HAVE_YASM
 
PIXELS16(static, ff_avg, , , _mmxext)
PIXELS16(static, ff_put, , , _mmxext)
 
#define QPEL_OP(OPNAME, RND, MMX) \
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
stride, 8); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
stride, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \
stride, 8); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
8, stride); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \
stride, stride, 8); \
} \
\
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \
stride, stride); \
} \
\
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[8]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \
8, stride); \
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half) + 64; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \
stride, 8, 8); \
} \
\
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \
8, stride, 9); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[8 + 9]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \
stride, 9); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
\
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[9]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \
stride, 9); \
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \
stride, 8); \
} \
\
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
stride, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \
stride, stride, 16);\
} \
\
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \
stride, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \
stride, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
stride); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \
stride, stride); \
} \
\
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t temp[32]; \
uint8_t * const half = (uint8_t*)temp; \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \
stride); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \
stride, stride, 16); \
} \
\
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[16 * 2 + 17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half) + 256; \
uint8_t * const halfHV = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \
16, 16); \
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \
stride, 16, 16); \
} \
\
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \
stride, 17); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \
stride, 17); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
} \
\
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \
ptrdiff_t stride) \
{ \
uint64_t half[17 * 2]; \
uint8_t * const halfH = ((uint8_t*)half); \
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \
stride, 17); \
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \
stride, 16); \
}
 
QPEL_OP(put_, _, mmxext)
QPEL_OP(avg_, _, mmxext)
QPEL_OP(put_no_rnd_, _no_rnd_, mmxext)
#endif /* HAVE_YASM */
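/* SET_QPEL_FUNCS fills the 16-entry quarter-pel motion compensation table:
 * entry x + 4 * y points at the _mcXY function for quarter-pel offset (x, y),
 * so index 0 is _mc00 (the full-pel copy) and index 5 is _mc11. */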
 
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
do { \
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
} while (0)
 
static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx,
int cpu_flags)
{
#if HAVE_MMX_INLINE
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
 
c->put_pixels_clamped = ff_put_pixels_clamped_mmx;
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx;
c->add_pixels_clamped = ff_add_pixels_clamped_mmx;
 
if (!high_bit_depth) {
c->clear_block = ff_clear_block_mmx;
c->clear_blocks = ff_clear_blocks_mmx;
c->draw_edges = ff_draw_edges_mmx;
}
 
#if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM)
c->gmc = ff_gmc_mmx;
#endif
 
c->add_bytes = ff_add_bytes_mmx;
#endif /* HAVE_MMX_INLINE */
 
#if HAVE_MMX_EXTERNAL
if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) {
c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx;
c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx;
}
 
c->vector_clip_int32 = ff_vector_clip_int32_mmx;
#endif /* HAVE_MMX_EXTERNAL */
}
 
static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx,
int cpu_flags)
{
#if HAVE_MMXEXT_INLINE
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
 
if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX && avctx->lowres == 0) {
c->idct_put = ff_idct_xvid_mmxext_put;
c->idct_add = ff_idct_xvid_mmxext_add;
c->idct = ff_idct_xvid_mmxext;
}
#endif /* HAVE_MMXEXT_INLINE */
 
#if HAVE_MMXEXT_EXTERNAL
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, );
 
SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, );
 
/* slower than cmov version on AMD */
if (!(cpu_flags & AV_CPU_FLAG_3DNOW))
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext;
 
c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext;
 
if (avctx->flags & CODEC_FLAG_BITEXACT) {
c->apply_window_int16 = ff_apply_window_int16_mmxext;
} else {
c->apply_window_int16 = ff_apply_window_int16_round_mmxext;
}
#endif /* HAVE_MMXEXT_EXTERNAL */
}
 
static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx,
int cpu_flags)
{
#if HAVE_SSE_INLINE
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
 
if (!high_bit_depth) {
if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) {
/* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */
c->clear_block = ff_clear_block_sse;
c->clear_blocks = ff_clear_blocks_sse;
}
}
 
c->vector_clipf = ff_vector_clipf_sse;
#endif /* HAVE_SSE_INLINE */
 
#if HAVE_YASM
#if HAVE_INLINE_ASM && CONFIG_VIDEODSP
c->gmc = ff_gmc_sse;
#endif
#endif /* HAVE_YASM */
}
 
static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx,
int cpu_flags)
{
#if HAVE_SSE2_INLINE
const int high_bit_depth = avctx->bits_per_raw_sample > 8;
 
if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX && avctx->lowres == 0) {
c->idct_put = ff_idct_xvid_sse2_put;
c->idct_add = ff_idct_xvid_sse2_add;
c->idct = ff_idct_xvid_sse2;
c->idct_permutation_type = FF_SSE2_IDCT_PERM;
}
#endif /* HAVE_SSE2_INLINE */
 
#if HAVE_SSE2_EXTERNAL
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2;
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2;
if (cpu_flags & AV_CPU_FLAG_ATOM) {
c->vector_clip_int32 = ff_vector_clip_int32_int_sse2;
} else {
c->vector_clip_int32 = ff_vector_clip_int32_sse2;
}
if (avctx->flags & CODEC_FLAG_BITEXACT) {
c->apply_window_int16 = ff_apply_window_int16_sse2;
} else if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
c->apply_window_int16 = ff_apply_window_int16_round_sse2;
}
c->bswap_buf = ff_bswap32_buf_sse2;
#endif /* HAVE_SSE2_EXTERNAL */
}
 
static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx,
int cpu_flags)
{
#if HAVE_SSSE3_EXTERNAL
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3;
if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4;
 
if (cpu_flags & AV_CPU_FLAG_ATOM)
c->apply_window_int16 = ff_apply_window_int16_ssse3_atom;
else
c->apply_window_int16 = ff_apply_window_int16_ssse3;
if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3;
c->bswap_buf = ff_bswap32_buf_ssse3;
#endif /* HAVE_SSSE3_EXTERNAL */
}
 
static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx,
int cpu_flags)
{
#if HAVE_SSE4_EXTERNAL
c->vector_clip_int32 = ff_vector_clip_int32_sse4;
#endif /* HAVE_SSE4_EXTERNAL */
}
 
av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
 
#if HAVE_7REGS && HAVE_INLINE_ASM
if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_CMOV)
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_cmov;
#endif
 
if (X86_MMX(cpu_flags)) {
#if HAVE_INLINE_ASM
const int idct_algo = avctx->idct_algo;
 
if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) {
if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) {
c->idct_put = ff_simple_idct_put_mmx;
c->idct_add = ff_simple_idct_add_mmx;
c->idct = ff_simple_idct_mmx;
c->idct_permutation_type = FF_SIMPLE_IDCT_PERM;
} else if (idct_algo == FF_IDCT_XVIDMMX) {
c->idct_put = ff_idct_xvid_mmx_put;
c->idct_add = ff_idct_xvid_mmx_add;
c->idct = ff_idct_xvid_mmx;
}
}
#endif /* HAVE_INLINE_ASM */
 
dsputil_init_mmx(c, avctx, cpu_flags);
}
 
if (X86_MMXEXT(cpu_flags))
dsputil_init_mmxext(c, avctx, cpu_flags);
 
if (X86_SSE(cpu_flags))
dsputil_init_sse(c, avctx, cpu_flags);
 
if (X86_SSE2(cpu_flags))
dsputil_init_sse2(c, avctx, cpu_flags);
 
if (EXTERNAL_SSSE3(cpu_flags))
dsputil_init_ssse3(c, avctx, cpu_flags);
 
if (EXTERNAL_SSE4(cpu_flags))
dsputil_init_sse4(c, avctx, cpu_flags);
 
if (CONFIG_ENCODERS)
ff_dsputilenc_init_mmx(c, avctx);
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dsputil_mmx.c
0,0 → 1,637
/*
* MMX optimized DSP utils
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*/
 
#include "config.h"
#include "libavutil/avassert.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/videodsp.h"
#include "constants.h"
#include "dsputil_x86.h"
#include "diracdsp_mmx.h"
 
#if HAVE_INLINE_ASM
 
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
int line_size)
{
const int16_t *p;
uint8_t *pix;
 
/* read the pixels */
p = block;
pix = pixels;
/* unrolled loop */
__asm__ volatile (
"movq (%3), %%mm0 \n\t"
"movq 8(%3), %%mm1 \n\t"
"movq 16(%3), %%mm2 \n\t"
"movq 24(%3), %%mm3 \n\t"
"movq 32(%3), %%mm4 \n\t"
"movq 40(%3), %%mm5 \n\t"
"movq 48(%3), %%mm6 \n\t"
"movq 56(%3), %%mm7 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"packuswb %%mm3, %%mm2 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
"packuswb %%mm7, %%mm6 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm2, (%0, %1) \n\t"
"movq %%mm4, (%0, %1, 2) \n\t"
"movq %%mm6, (%0, %2) \n\t"
:: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3),
"r"(p)
: "memory");
pix += line_size * 4;
p += 32;
 
// If this block were an exact copy of the one above, the compiler would
// generate some very strange code, hence the separate "r" constraints.
__asm__ volatile (
"movq (%3), %%mm0 \n\t"
"movq 8(%3), %%mm1 \n\t"
"movq 16(%3), %%mm2 \n\t"
"movq 24(%3), %%mm3 \n\t"
"movq 32(%3), %%mm4 \n\t"
"movq 40(%3), %%mm5 \n\t"
"movq 48(%3), %%mm6 \n\t"
"movq 56(%3), %%mm7 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"packuswb %%mm3, %%mm2 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
"packuswb %%mm7, %%mm6 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm2, (%0, %1) \n\t"
"movq %%mm4, (%0, %1, 2) \n\t"
"movq %%mm6, (%0, %2) \n\t"
:: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p)
: "memory");
}
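/* Helper for ff_put_signed_pixels_clamped_mmx() below: packs four rows of
 * signed 16-bit coefficients to signed bytes and adds the 0x80 bias held in
 * mm0 (ff_pb_80), mapping the result to unsigned pixel values. */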
 
#define put_signed_pixels_clamped_mmx_half(off) \
"movq "#off"(%2), %%mm1 \n\t" \
"movq 16 + "#off"(%2), %%mm2 \n\t" \
"movq 32 + "#off"(%2), %%mm3 \n\t" \
"movq 48 + "#off"(%2), %%mm4 \n\t" \
"packsswb 8 + "#off"(%2), %%mm1 \n\t" \
"packsswb 24 + "#off"(%2), %%mm2 \n\t" \
"packsswb 40 + "#off"(%2), %%mm3 \n\t" \
"packsswb 56 + "#off"(%2), %%mm4 \n\t" \
"paddb %%mm0, %%mm1 \n\t" \
"paddb %%mm0, %%mm2 \n\t" \
"paddb %%mm0, %%mm3 \n\t" \
"paddb %%mm0, %%mm4 \n\t" \
"movq %%mm1, (%0) \n\t" \
"movq %%mm2, (%0, %3) \n\t" \
"movq %%mm3, (%0, %3, 2) \n\t" \
"movq %%mm4, (%0, %1) \n\t"
 
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
int line_size)
{
x86_reg line_skip = line_size;
x86_reg line_skip3;
 
__asm__ volatile (
"movq "MANGLE(ff_pb_80)", %%mm0 \n\t"
"lea (%3, %3, 2), %1 \n\t"
put_signed_pixels_clamped_mmx_half(0)
"lea (%0, %3, 4), %0 \n\t"
put_signed_pixels_clamped_mmx_half(64)
: "+&r"(pixels), "=&r"(line_skip3)
: "r"(block), "r"(line_skip)
: "memory");
}
 
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels,
int line_size)
{
const int16_t *p;
uint8_t *pix;
int i;
 
/* read the pixels */
p = block;
pix = pixels;
MOVQ_ZERO(mm7);
i = 4;
do {
__asm__ volatile (
"movq (%2), %%mm0 \n\t"
"movq 8(%2), %%mm1 \n\t"
"movq 16(%2), %%mm2 \n\t"
"movq 24(%2), %%mm3 \n\t"
"movq %0, %%mm4 \n\t"
"movq %1, %%mm6 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddsw %%mm4, %%mm0 \n\t"
"paddsw %%mm5, %%mm1 \n\t"
"movq %%mm6, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm6 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddsw %%mm6, %%mm2 \n\t"
"paddsw %%mm5, %%mm3 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"packuswb %%mm3, %%mm2 \n\t"
"movq %%mm0, %0 \n\t"
"movq %%mm2, %1 \n\t"
: "+m"(*pix), "+m"(*(pix + line_size))
: "r"(p)
: "memory");
pix += line_size * 2;
p += 16;
} while (--i);
}
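/* CLEAR_BLOCKS(name, n): zero n consecutive 64-coefficient (128-byte) int16
 * blocks. The loop counts a negative byte offset up to zero, clearing
 * 32 bytes of the blocks array per iteration. */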
 
#define CLEAR_BLOCKS(name, n) \
void name(int16_t *blocks) \
{ \
__asm__ volatile ( \
"pxor %%mm7, %%mm7 \n\t" \
"mov %1, %%"REG_a" \n\t" \
"1: \n\t" \
"movq %%mm7, (%0, %%"REG_a") \n\t" \
"movq %%mm7, 8(%0, %%"REG_a") \n\t" \
"movq %%mm7, 16(%0, %%"REG_a") \n\t" \
"movq %%mm7, 24(%0, %%"REG_a") \n\t" \
"add $32, %%"REG_a" \n\t" \
"js 1b \n\t" \
:: "r"(((uint8_t *)blocks) + 128 * n), \
"i"(-128 * n) \
: "%"REG_a \
); \
}
CLEAR_BLOCKS(ff_clear_blocks_mmx, 6)
CLEAR_BLOCKS(ff_clear_block_mmx, 1)
 
void ff_clear_block_sse(int16_t *block)
{
__asm__ volatile (
"xorps %%xmm0, %%xmm0 \n"
"movaps %%xmm0, (%0) \n"
"movaps %%xmm0, 16(%0) \n"
"movaps %%xmm0, 32(%0) \n"
"movaps %%xmm0, 48(%0) \n"
"movaps %%xmm0, 64(%0) \n"
"movaps %%xmm0, 80(%0) \n"
"movaps %%xmm0, 96(%0) \n"
"movaps %%xmm0, 112(%0) \n"
:: "r"(block)
: "memory"
);
}
 
void ff_clear_blocks_sse(int16_t *blocks)
{
__asm__ volatile (
"xorps %%xmm0, %%xmm0 \n"
"mov %1, %%"REG_a" \n"
"1: \n"
"movaps %%xmm0, (%0, %%"REG_a") \n"
"movaps %%xmm0, 16(%0, %%"REG_a") \n"
"movaps %%xmm0, 32(%0, %%"REG_a") \n"
"movaps %%xmm0, 48(%0, %%"REG_a") \n"
"movaps %%xmm0, 64(%0, %%"REG_a") \n"
"movaps %%xmm0, 80(%0, %%"REG_a") \n"
"movaps %%xmm0, 96(%0, %%"REG_a") \n"
"movaps %%xmm0, 112(%0, %%"REG_a") \n"
"add $128, %%"REG_a" \n"
"js 1b \n"
:: "r"(((uint8_t *)blocks) + 128 * 6),
"i"(-128 * 6)
: "%"REG_a
);
}
 
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w)
{
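/* dst[i] += src[i] for 0 <= i < w: the MMX loop below handles 16 bytes per
 * iteration while at least 16 bytes remain, the scalar loop finishes the
 * tail. */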
x86_reg i = 0;
__asm__ volatile (
"jmp 2f \n\t"
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq (%2, %0), %%mm1 \n\t"
"paddb %%mm0, %%mm1 \n\t"
"movq %%mm1, (%2, %0) \n\t"
"movq 8(%1, %0), %%mm0 \n\t"
"movq 8(%2, %0), %%mm1 \n\t"
"paddb %%mm0, %%mm1 \n\t"
"movq %%mm1, 8(%2, %0) \n\t"
"add $16, %0 \n\t"
"2: \n\t"
"cmp %3, %0 \n\t"
"js 1b \n\t"
: "+r"(i)
: "r"(src), "r"(dst), "r"((x86_reg)w - 15)
);
for ( ; i < w; i++)
dst[i + 0] += src[i + 0];
}
 
/* Draw the edges of width 'w' of an image of size width x height.
 * This MMX version can only handle w == 4, w == 8 or w == 16. */
void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
int w, int h, int sides)
{
uint8_t *ptr, *last_line;
int i;
 
last_line = buf + (height - 1) * wrap;
/* left and right */
ptr = buf;
if (w == 8) {
__asm__ volatile (
"1: \n\t"
"movd (%0), %%mm0 \n\t"
"punpcklbw %%mm0, %%mm0 \n\t"
"punpcklwd %%mm0, %%mm0 \n\t"
"punpckldq %%mm0, %%mm0 \n\t"
"movq %%mm0, -8(%0) \n\t"
"movq -8(%0, %2), %%mm1 \n\t"
"punpckhbw %%mm1, %%mm1 \n\t"
"punpckhwd %%mm1, %%mm1 \n\t"
"punpckhdq %%mm1, %%mm1 \n\t"
"movq %%mm1, (%0, %2) \n\t"
"add %1, %0 \n\t"
"cmp %3, %0 \n\t"
"jb 1b \n\t"
: "+r"(ptr)
: "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
);
} else if(w==16){
__asm__ volatile (
"1: \n\t"
"movd (%0), %%mm0 \n\t"
"punpcklbw %%mm0, %%mm0 \n\t"
"punpcklwd %%mm0, %%mm0 \n\t"
"punpckldq %%mm0, %%mm0 \n\t"
"movq %%mm0, -8(%0) \n\t"
"movq %%mm0, -16(%0) \n\t"
"movq -8(%0, %2), %%mm1 \n\t"
"punpckhbw %%mm1, %%mm1 \n\t"
"punpckhwd %%mm1, %%mm1 \n\t"
"punpckhdq %%mm1, %%mm1 \n\t"
"movq %%mm1, (%0, %2) \n\t"
"movq %%mm1, 8(%0, %2) \n\t"
"add %1, %0 \n\t"
"cmp %3, %0 \n\t"
"jb 1b \n\t"
: "+r"(ptr)
: "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
);
} else {
av_assert1(w == 4);
__asm__ volatile (
"1: \n\t"
"movd (%0), %%mm0 \n\t"
"punpcklbw %%mm0, %%mm0 \n\t"
"punpcklwd %%mm0, %%mm0 \n\t"
"movd %%mm0, -4(%0) \n\t"
"movd -4(%0, %2), %%mm1 \n\t"
"punpcklbw %%mm1, %%mm1 \n\t"
"punpckhwd %%mm1, %%mm1 \n\t"
"punpckhdq %%mm1, %%mm1 \n\t"
"movd %%mm1, (%0, %2) \n\t"
"add %1, %0 \n\t"
"cmp %3, %0 \n\t"
"jb 1b \n\t"
: "+r"(ptr)
: "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height)
);
}
 
/* top and bottom (and hopefully also the corners) */
if (sides & EDGE_TOP) {
for (i = 0; i < h; i += 4) {
ptr = buf - (i + 1) * wrap - w;
__asm__ volatile (
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm0, (%0, %2) \n\t"
"movq %%mm0, (%0, %2, 2) \n\t"
"movq %%mm0, (%0, %3) \n\t"
"add $8, %0 \n\t"
"cmp %4, %0 \n\t"
"jb 1b \n\t"
: "+r"(ptr)
: "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap),
"r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w)
);
}
}
 
if (sides & EDGE_BOTTOM) {
for (i = 0; i < h; i += 4) {
ptr = last_line + (i + 1) * wrap - w;
__asm__ volatile (
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm0, (%0, %2) \n\t"
"movq %%mm0, (%0, %2, 2) \n\t"
"movq %%mm0, (%0, %3) \n\t"
"add $8, %0 \n\t"
"cmp %4, %0 \n\t"
"jb 1b \n\t"
: "+r"(ptr)
: "r"((x86_reg)last_line - (x86_reg)ptr - w),
"r"((x86_reg)wrap), "r"((x86_reg)wrap * 3),
"r"(ptr + width + 2 * w)
);
}
}
}
 
typedef void emulated_edge_mc_func(uint8_t *dst, ptrdiff_t dst_stride,
const uint8_t *src, ptrdiff_t src_linesize,
int block_w, int block_h,
int src_x, int src_y, int w, int h);
 
static av_always_inline void gmc(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height,
emulated_edge_mc_func *emu_edge_fn)
{
const int w = 8;
const int ix = ox >> (16 + shift);
const int iy = oy >> (16 + shift);
const int oxs = ox >> 4;
const int oys = oy >> 4;
const int dxxs = dxx >> 4;
const int dxys = dxy >> 4;
const int dyxs = dyx >> 4;
const int dyys = dyy >> 4;
const uint16_t r4[4] = { r, r, r, r };
const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys };
const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys };
const uint64_t shift2 = 2 * shift;
#define MAX_STRIDE 4096U
#define MAX_H 8U
uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE];
int x, y;
 
const int dxw = (dxx - (1 << (16 + shift))) * (w - 1);
const int dyh = (dyy - (1 << (16 + shift))) * (h - 1);
const int dxh = dxy * (h - 1);
const int dyw = dyx * (w - 1);
int need_emu = (unsigned)ix >= width - w ||
(unsigned)iy >= height - h;
 
if ( // non-constant fullpel offset (3% of blocks)
((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) |
(oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift)
// uses more than 16 bits of subpel mv (only at huge resolution)
|| (dxx | dxy | dyx | dyy) & 15
|| (need_emu && (h > MAX_H || stride > MAX_STRIDE))) {
// FIXME could still use mmx for some of the rows
ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy,
shift, r, width, height);
return;
}
 
src += ix + iy * stride;
if (need_emu) {
emu_edge_fn(edge_buf, stride, src, stride, w + 1, h + 1, ix, iy, width, height);
src = edge_buf;
}
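/* Broadcast s = 1 << shift into all four words of mm6 and clear mm7, which
 * the bilinear loop below uses for byte unpacking. */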
 
__asm__ volatile (
"movd %0, %%mm6 \n\t"
"pxor %%mm7, %%mm7 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
"punpcklwd %%mm6, %%mm6 \n\t"
:: "r"(1<<shift)
);
 
for (x = 0; x < w; x += 4) {
uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0),
oxs - dxys + dxxs * (x + 1),
oxs - dxys + dxxs * (x + 2),
oxs - dxys + dxxs * (x + 3) };
uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0),
oys - dyys + dyxs * (x + 1),
oys - dyys + dyxs * (x + 2),
oys - dyys + dyxs * (x + 3) };
 
for (y = 0; y < h; y++) {
__asm__ volatile (
"movq %0, %%mm4 \n\t"
"movq %1, %%mm5 \n\t"
"paddw %2, %%mm4 \n\t"
"paddw %3, %%mm5 \n\t"
"movq %%mm4, %0 \n\t"
"movq %%mm5, %1 \n\t"
"psrlw $12, %%mm4 \n\t"
"psrlw $12, %%mm5 \n\t"
: "+m"(*dx4), "+m"(*dy4)
: "m"(*dxy4), "m"(*dyy4)
);
 
__asm__ volatile (
"movq %%mm6, %%mm2 \n\t"
"movq %%mm6, %%mm1 \n\t"
"psubw %%mm4, %%mm2 \n\t"
"psubw %%mm5, %%mm1 \n\t"
"movq %%mm2, %%mm0 \n\t"
"movq %%mm4, %%mm3 \n\t"
"pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy)
"pmullw %%mm5, %%mm3 \n\t" // dx * dy
"pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy
"pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy)
 
"movd %4, %%mm5 \n\t"
"movd %3, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy
"pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy
 
"movd %2, %%mm5 \n\t"
"movd %1, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy)
"pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy)
"paddw %5, %%mm1 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm2, %%mm0 \n\t"
 
"psrlw %6, %%mm0 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"movd %%mm0, %0 \n\t"
 
: "=m"(dst[x + y * stride])
: "m"(src[0]), "m"(src[1]),
"m"(src[stride]), "m"(src[stride + 1]),
"m"(*r4), "m"(shift2)
);
src += stride;
}
src += 4 - h * stride;
}
}
 
#if CONFIG_VIDEODSP
#if HAVE_YASM
#if ARCH_X86_32
void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height)
{
gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
width, height, &ff_emulated_edge_mc_8);
}
#endif
void ff_gmc_sse(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height)
{
gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
width, height, &ff_emulated_edge_mc_8);
}
#else
void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height)
{
gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r,
width, height, &ff_emulated_edge_mc_8);
}
#endif
#endif
 
#if CONFIG_DIRAC_DECODER
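/* DIRAC_PIXOP: Dirac decoder wrappers that use the plain 8/16-pixel
 * put/avg ops when the block height is a multiple of 4 and fall back to
 * the C versions otherwise. */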
#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\
void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
if (h&3)\
ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\
else\
OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
if (h&3)\
ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\
else\
OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\
}\
void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\
{\
if (h&3) {\
ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\
} else {\
OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\
OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\
}\
}
 
#if HAVE_MMX_INLINE
PIXELS16(static, ff_avg, , , _mmxext)
DIRAC_PIXOP(put, ff_put, mmx)
DIRAC_PIXOP(avg, ff_avg, mmx)
#endif
 
#if HAVE_YASM
DIRAC_PIXOP(avg, ff_avg, mmxext)
 
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
if (h&3)
ff_put_dirac_pixels16_c(dst, src, stride, h);
else
ff_put_pixels16_sse2(dst, src[0], stride, h);
}
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
if (h&3)
ff_avg_dirac_pixels16_c(dst, src, stride, h);
else
ff_avg_pixels16_sse2(dst, src[0], stride, h);
}
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
if (h&3) {
ff_put_dirac_pixels32_c(dst, src, stride, h);
} else {
ff_put_pixels16_sse2(dst , src[0] , stride, h);
ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
}
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h)
{
if (h&3) {
ff_avg_dirac_pixels32_c(dst, src, stride, h);
} else {
ff_avg_pixels16_sse2(dst , src[0] , stride, h);
ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h);
}
}
#endif
#endif
 
void ff_vector_clipf_sse(float *dst, const float *src,
float min, float max, int len)
{
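/* Clamps len floats of src into [min, max] and writes them to dst. The loop
 * uses aligned movaps on 64-byte chunks, so this assumes len is a multiple
 * of 16 and that src/dst are 16-byte aligned. */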
x86_reg i = (len - 16) * 4;
__asm__ volatile (
"movss %3, %%xmm4 \n\t"
"movss %4, %%xmm5 \n\t"
"shufps $0, %%xmm4, %%xmm4 \n\t"
"shufps $0, %%xmm5, %%xmm5 \n\t"
"1: \n\t"
"movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel
"movaps 16(%2, %0), %%xmm1 \n\t"
"movaps 32(%2, %0), %%xmm2 \n\t"
"movaps 48(%2, %0), %%xmm3 \n\t"
"maxps %%xmm4, %%xmm0 \n\t"
"maxps %%xmm4, %%xmm1 \n\t"
"maxps %%xmm4, %%xmm2 \n\t"
"maxps %%xmm4, %%xmm3 \n\t"
"minps %%xmm5, %%xmm0 \n\t"
"minps %%xmm5, %%xmm1 \n\t"
"minps %%xmm5, %%xmm2 \n\t"
"minps %%xmm5, %%xmm3 \n\t"
"movaps %%xmm0, (%1, %0) \n\t"
"movaps %%xmm1, 16(%1, %0) \n\t"
"movaps %%xmm2, 32(%1, %0) \n\t"
"movaps %%xmm3, 48(%1, %0) \n\t"
"sub $64, %0 \n\t"
"jge 1b \n\t"
: "+&r"(i)
: "r"(dst), "r"(src), "m"(min), "m"(max)
: "memory"
);
}
 
#endif /* HAVE_INLINE_ASM */
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dsputil_qns_template.c
0,0 → 1,101
/*
* DSP utils : QNS functions are compiled 3 times for mmx/3dnow/ssse3
* Copyright (c) 2004 Michael Niedermayer
*
* MMX optimization by Michael Niedermayer <michaelni@gmx.at>
* 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0))
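/* Helpers for the encoder's quantization noise shaping search (a sketch of
 * their role, inferred from the callers in mpegvideo_enc): try_8x8basis()
 * estimates the weighted squared error of rem[] after adding basis[] scaled
 * by 'scale'; add_8x8basis() actually applies that update to rem[]. */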
 
static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale)
{
x86_reg i=0;
 
av_assert2(FFABS(scale) < MAX_ABS);
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
 
SET_RND(mm6);
__asm__ volatile(
"pxor %%mm7, %%mm7 \n\t"
"movd %4, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq 8(%1, %0), %%mm1 \n\t"
PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
"paddw (%2, %0), %%mm0 \n\t"
"paddw 8(%2, %0), %%mm1 \n\t"
"psraw $6, %%mm0 \n\t"
"psraw $6, %%mm1 \n\t"
"pmullw (%3, %0), %%mm0 \n\t"
"pmullw 8(%3, %0), %%mm1 \n\t"
"pmaddwd %%mm0, %%mm0 \n\t"
"pmaddwd %%mm1, %%mm1 \n\t"
"paddd %%mm1, %%mm0 \n\t"
"psrld $4, %%mm0 \n\t"
"paddd %%mm0, %%mm7 \n\t"
"add $16, %0 \n\t"
"cmp $128, %0 \n\t" //FIXME optimize & bench
" jb 1b \n\t"
PHADDD(%%mm7, %%mm6)
"psrld $2, %%mm7 \n\t"
"movd %%mm7, %0 \n\t"
 
: "+r" (i)
: "r"(basis), "r"(rem), "r"(weight), "g"(scale)
);
return i;
}
 
static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale)
{
x86_reg i=0;
 
if(FFABS(scale) < MAX_ABS){
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT;
SET_RND(mm6);
__asm__ volatile(
"movd %3, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
"punpcklwd %%mm5, %%mm5 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %0), %%mm0 \n\t"
"movq 8(%1, %0), %%mm1 \n\t"
PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6)
"paddw (%2, %0), %%mm0 \n\t"
"paddw 8(%2, %0), %%mm1 \n\t"
"movq %%mm0, (%2, %0) \n\t"
"movq %%mm1, 8(%2, %0) \n\t"
"add $16, %0 \n\t"
"cmp $128, %0 \n\t" // FIXME optimize & bench
" jb 1b \n\t"
 
: "+r" (i)
: "r"(basis), "r"(rem), "g"(scale)
);
}else{
for(i=0; i<8*8; i++){
rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT);
}
}
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dsputil_x86.c
0,0 → 1,65
/*
* Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "config.h"
#include "libavutil/x86/asm.h"
#include "dsputil_x86.h"
 
#if HAVE_INLINE_ASM
 
#if HAVE_7REGS
void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
int *left, int *left_top)
{
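/* HuffYUV median prediction (same behaviour as the C reference in
 * dsputil.c): dst[i] = diff[i] + mid_pred(l, top[i], top[i] + l - tl),
 * with l and tl updated as the row is scanned; *left and *left_top carry
 * the running values across calls. */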
x86_reg w2 = -w;
x86_reg x;
int l = *left & 0xff;
int tl = *left_top & 0xff;
int t;
__asm__ volatile (
"mov %7, %3 \n"
"1: \n"
"movzbl (%3, %4), %2 \n"
"mov %2, %k3 \n"
"sub %b1, %b3 \n"
"add %b0, %b3 \n"
"mov %2, %1 \n"
"cmp %0, %2 \n"
"cmovg %0, %2 \n"
"cmovg %1, %0 \n"
"cmp %k3, %0 \n"
"cmovg %k3, %0 \n"
"mov %7, %3 \n"
"cmp %2, %0 \n"
"cmovl %2, %0 \n"
"add (%6, %4), %b0 \n"
"mov %b0, (%5, %4) \n"
"inc %4 \n"
"jl 1b \n"
: "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2)
: "r"(dst + w), "r"(diff + w), "rm"(top + w)
);
*left = l;
*left_top = tl;
}
#endif
 
#endif /* HAVE_INLINE_ASM */
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dsputil_x86.h
0,0 → 1,198
/*
* MMX optimized DSP utils
* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#ifndef AVCODEC_X86_DSPUTIL_MMX_H
#define AVCODEC_X86_DSPUTIL_MMX_H
 
#include <stddef.h>
#include <stdint.h>
 
#include "libavcodec/dsputil.h"
#include "libavutil/x86/asm.h"
#include "constants.h"
 
#define MOVQ_WONE(regd) \
__asm__ volatile ( \
"pcmpeqd %%" #regd ", %%" #regd " \n\t" \
"psrlw $15, %%" #regd ::)
 
#define JUMPALIGN() __asm__ volatile (".p2align 3"::)
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::)
 
#define MOVQ_BFE(regd) \
__asm__ volatile ( \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"paddb %%"#regd", %%"#regd" \n\t" ::)
 
#ifndef PIC
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo))
#else
// for shared libraries it is better to access constants this way
// pcmpeqd -> -1
#define MOVQ_WTWO(regd) \
__asm__ volatile ( \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"psrlw $15, %%"#regd" \n\t" \
"psllw $1, %%"#regd" \n\t"::)
 
#endif
 
// using regr as temporary and for the output result
// first argument is unmodified and second is trashed
// regfe is supposed to contain 0xfefefefefefefefe
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \
"movq "#rega", "#regr" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pand "#regfe", "#regb" \n\t" \
"psrlq $1, "#regb" \n\t" \
"paddb "#regb", "#regr" \n\t"
 
#define PAVGB_MMX(rega, regb, regr, regfe) \
"movq "#rega", "#regr" \n\t" \
"por "#regb", "#regr" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pand "#regfe", "#regb" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psubb "#regb", "#regr" \n\t"
 
// mm6 is supposed to contain 0xfefefefefefefefe
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pand "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psrlq $1, "#regd" \n\t" \
"paddb "#regb", "#regr" \n\t" \
"paddb "#regd", "#regp" \n\t"
 
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"por "#regb", "#regr" \n\t" \
"por "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psubb "#regb", "#regr" \n\t" \
"psubb "#regd", "#regp" \n\t"
 
void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx);
void ff_dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx);
 
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size);
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size);
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size);
 
void ff_clear_block_mmx(int16_t *block);
void ff_clear_block_sse(int16_t *block);
void ff_clear_blocks_mmx(int16_t *blocks);
void ff_clear_blocks_sse(int16_t *blocks);
 
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w);
 
void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top,
const uint8_t *diff, int w,
int *left, int *left_top);
 
void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height,
int w, int h, int sides);
 
void ff_gmc_mmx(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height);
 
void ff_gmc_sse(uint8_t *dst, uint8_t *src,
int stride, int h, int ox, int oy,
int dxx, int dxy, int dyx, int dyy,
int shift, int r, int width, int height);
 
void ff_vector_clipf_sse(float *dst, const float *src,
float min, float max, int len);
 
void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
 
void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
 
void ff_avg_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
 
void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
 
 
void ff_mmx_idct(int16_t *block);
void ff_mmxext_idct(int16_t *block);
 
void ff_deinterlace_line_mmx(uint8_t *dst,
const uint8_t *lum_m4, const uint8_t *lum_m3,
const uint8_t *lum_m2, const uint8_t *lum_m1,
const uint8_t *lum,
int size);
 
void ff_deinterlace_line_inplace_mmx(const uint8_t *lum_m4,
const uint8_t *lum_m3,
const uint8_t *lum_m2,
const uint8_t *lum_m1,
const uint8_t *lum, int size);
 
#define PIXELS16(STATIC, PFX1, PFX2, TYPE, CPUEXT) \
STATIC void PFX1 ## _pixels16 ## TYPE ## CPUEXT(uint8_t *block, \
const uint8_t *pixels, \
ptrdiff_t line_size, \
int h) \
{ \
PFX2 ## PFX1 ## _pixels8 ## TYPE ## CPUEXT(block, pixels, \
line_size, h); \
PFX2 ## PFX1 ## _pixels8 ## TYPE ## CPUEXT(block + 8, pixels + 8, \
line_size, h); \
}
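/* Example: PIXELS16(static, ff_avg, , , _mmxext) in dsputil_mmx.c expands to
 * a static ff_avg_pixels16_mmxext() that applies ff_avg_pixels8_mmxext() to
 * the left and right 8-pixel halves of the block. */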
 
#endif /* AVCODEC_X86_DSPUTIL_MMX_H */
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dsputilenc.asm
0,0 → 1,487
;*****************************************************************************
;* MMX optimized DSP utils
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION .text
 
%macro DIFF_PIXELS_1 4
movh %1, %3
movh %2, %4
punpcklbw %2, %1
punpcklbw %1, %1
psubw %1, %2
%endmacro
 
; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
%macro DIFF_PIXELS_8 6
DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3]
DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3]
DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
add %1, %5
add %2, %5
DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3]
DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3]
DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3]
%ifdef m8
DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
mova [%6], m0
DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
mova m0, [%6]
%endif
sub %1, %5
sub %2, %5
%endmacro
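; HADAMARD8: three rounds of SUMSUB_BADC butterflies over m0..m7, i.e. an
; unnormalised 8-point Hadamard transform across the eight registers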
 
%macro HADAMARD8 0
SUMSUB_BADC w, 0, 1, 2, 3
SUMSUB_BADC w, 4, 5, 6, 7
SUMSUB_BADC w, 0, 2, 1, 3
SUMSUB_BADC w, 4, 6, 5, 7
SUMSUB_BADC w, 0, 4, 1, 5
SUMSUB_BADC w, 2, 6, 3, 7
%endmacro
 
%macro ABS1_SUM 3
ABS1 %1, %2
paddusw %3, %1
%endmacro
 
%macro ABS2_SUM 6
ABS2 %1, %2, %3, %4
paddusw %5, %1
paddusw %6, %2
%endmacro
 
%macro ABS_SUM_8x8_64 1
ABS2 m0, m1, m8, m9
ABS2_SUM m2, m3, m8, m9, m0, m1
ABS2_SUM m4, m5, m8, m9, m0, m1
ABS2_SUM m6, m7, m8, m9, m0, m1
paddusw m0, m1
%endmacro
 
%macro ABS_SUM_8x8_32 1
mova [%1], m7
ABS1 m0, m7
ABS1 m1, m7
ABS1_SUM m2, m7, m0
ABS1_SUM m3, m7, m1
ABS1_SUM m4, m7, m0
ABS1_SUM m5, m7, m1
ABS1_SUM m6, m7, m0
mova m2, [%1]
ABS1_SUM m2, m7, m1
paddusw m0, m1
%endmacro
 
; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
%macro HSUM 3
%if cpuflag(sse2)
movhlps %2, %1
paddusw %1, %2
pshuflw %2, %1, 0xE
paddusw %1, %2
pshuflw %2, %1, 0x1
paddusw %1, %2
movd %3, %1
%elif cpuflag(mmxext)
pshufw %2, %1, 0xE
paddusw %1, %2
pshufw %2, %1, 0x1
paddusw %1, %2
movd %3, %1
%elif cpuflag(mmx)
mova %2, %1
psrlq %1, 32
paddusw %1, %2
mova %2, %1
psrlq %1, 16
paddusw %1, %2
movd %3, %1
%endif
%endmacro
 
%macro STORE4 5
mova [%1+mmsize*0], %2
mova [%1+mmsize*1], %3
mova [%1+mmsize*2], %4
mova [%1+mmsize*3], %5
%endmacro
 
%macro LOAD4 5
mova %2, [%1+mmsize*0]
mova %3, [%1+mmsize*1]
mova %4, [%1+mmsize*2]
mova %5, [%1+mmsize*3]
%endmacro
 
%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
%assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
SUB rsp, pad
%endif
call hadamard8x8_diff %+ SUFFIX
%ifndef m8
ADD rsp, pad
%endif
RET
 
cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
%assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
SUB rsp, pad
%endif
 
call hadamard8x8_diff %+ SUFFIX
mov r5d, eax
 
add r1, 8
add r2, 8
call hadamard8x8_diff %+ SUFFIX
add r5d, eax
 
cmp r4d, 16
jne .done
 
lea r1, [r1+r3*8-8]
lea r2, [r2+r3*8-8]
call hadamard8x8_diff %+ SUFFIX
add r5d, eax
 
add r1, 8
add r2, 8
call hadamard8x8_diff %+ SUFFIX
add r5d, eax
 
.done:
mov eax, r5d
%ifndef m8
ADD rsp, pad
%endif
RET
%endmacro
 
%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
hadamard8x8_diff %+ SUFFIX:
lea r0, [r3*3]
DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize
HADAMARD8
%if ARCH_X86_64
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
HADAMARD8
ABS_SUM_8x8 rsp+gprsize
HSUM m0, m1, eax
and eax, 0xFFFF
ret
 
hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
; int stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so 16x16
; can simply call this 2x2 times (and that's why we access rsp+gprsize
; everywhere, which is the rsp of the calling function)
hadamard8x8_diff %+ SUFFIX:
lea r0, [r3*3]
 
; first 4x8 pixels
DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60
HADAMARD8
mova [rsp+gprsize+0x60], m7
TRANSPOSE4x4W 0, 1, 2, 3, 7
STORE4 rsp+gprsize, m0, m1, m2, m3
mova m7, [rsp+gprsize+0x60]
TRANSPOSE4x4W 4, 5, 6, 7, 0
STORE4 rsp+gprsize+0x40, m4, m5, m6, m7
 
; second 4x8 pixels
DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60
HADAMARD8
mova [rsp+gprsize+0x60], m7
TRANSPOSE4x4W 0, 1, 2, 3, 7
STORE4 rsp+gprsize+0x20, m0, m1, m2, m3
mova m7, [rsp+gprsize+0x60]
TRANSPOSE4x4W 4, 5, 6, 7, 0
 
LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3
HADAMARD8
ABS_SUM_8x8_32 rsp+gprsize+0x60
mova [rsp+gprsize+0x60], m0
 
LOAD4 rsp+gprsize , m0, m1, m2, m3
LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7
HADAMARD8
ABS_SUM_8x8_32 rsp+gprsize
paddusw m0, [rsp+gprsize+0x60]
 
HSUM m0, m1, eax
and rax, 0xFFFF
ret
 
hadamard8_16_wrapper 0, 14
%endif
%endmacro
 
INIT_MMX mmx
HADAMARD8_DIFF
 
INIT_MMX mmxext
HADAMARD8_DIFF
 
INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF 10
 
INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9
 
INIT_XMM sse2
; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h)
cglobal sse16, 5, 5, 8
shr r4d, 1
pxor m0, m0 ; mm0 = 0
pxor m7, m7 ; mm7 holds the sum
 
.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
movu m1, [r1 ] ; mm1 = pix1[0][0-15]
movu m2, [r2 ] ; mm2 = pix2[0][0-15]
movu m3, [r1+r3] ; mm3 = pix1[1][0-15]
movu m4, [r2+r3] ; mm4 = pix2[1][0-15]
 
; todo: mm1-mm2, mm3-mm4
; algo: subtract mm1 from mm2 with saturation and vice versa
; OR the result to get the absolute difference
mova m5, m1
mova m6, m3
psubusb m1, m2
psubusb m3, m4
psubusb m2, m5
psubusb m4, m6
 
por m2, m1
por m4, m3
 
; now convert to 16-bit vectors so we can square them
mova m1, m2
mova m3, m4
 
punpckhbw m2, m0
punpckhbw m4, m0
punpcklbw m1, m0 ; mm1 now spread over (mm1,mm2)
punpcklbw m3, m0 ; mm4 now spread over (mm3,mm4)
 
pmaddwd m2, m2
pmaddwd m4, m4
pmaddwd m1, m1
pmaddwd m3, m3
 
lea r1, [r1+r3*2] ; pix1 += 2*line_size
lea r2, [r2+r3*2] ; pix2 += 2*line_size
 
paddd m1, m2
paddd m3, m4
paddd m7, m1
paddd m7, m3
 
dec r4
jnz .next2lines
 
mova m1, m7
psrldq m7, 8 ; shift hi qword to lo
paddd m7, m1
mova m1, m7
psrldq m7, 4 ; shift hi dword to lo
paddd m7, m1
movd eax, m7 ; return value
RET
 
INIT_MMX mmx
; get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
cglobal get_pixels, 3,4
movsxdifnidn r2, r2d
add r0, 128
mov r3, -128
pxor m7, m7
.loop:
mova m0, [r1]
mova m2, [r1+r2]
mova m1, m0
mova m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
mova [r0+r3+ 0], m0
mova [r0+r3+ 8], m1
mova [r0+r3+16], m2
mova [r0+r3+24], m3
lea r1, [r1+r2*2]
add r3, 32
js .loop
REP_RET
 
INIT_XMM sse2
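; get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size)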
cglobal get_pixels, 3, 4
movsxdifnidn r2, r2d
lea r3, [r2*3]
pxor m4, m4
movh m0, [r1]
movh m1, [r1+r2]
movh m2, [r1+r2*2]
movh m3, [r1+r3]
lea r1, [r1+r2*4]
punpcklbw m0, m4
punpcklbw m1, m4
punpcklbw m2, m4
punpcklbw m3, m4
mova [r0], m0
mova [r0+0x10], m1
mova [r0+0x20], m2
mova [r0+0x30], m3
movh m0, [r1]
movh m1, [r1+r2*1]
movh m2, [r1+r2*2]
movh m3, [r1+r3]
punpcklbw m0, m4
punpcklbw m1, m4
punpcklbw m2, m4
punpcklbw m3, m4
mova [r0+0x40], m0
mova [r0+0x50], m1
mova [r0+0x60], m2
mova [r0+0x70], m3
RET
 
INIT_MMX mmx
; diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride)
cglobal diff_pixels, 4,5
movsxdifnidn r3, r3d
pxor m7, m7
add r0, 128
mov r4, -128
.loop:
mova m0, [r1]
mova m2, [r2]
mova m1, m0
mova m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
psubw m0, m2
psubw m1, m3
mova [r0+r4+0], m0
mova [r0+r4+8], m1
add r1, r3
add r2, r3
add r4, 16
jne .loop
REP_RET
 
INIT_MMX mmx
; pix_sum16_mmx(uint8_t * pix, int line_size)
cglobal pix_sum16, 2, 3
movsxdifnidn r1, r1d
mov r2, r1
neg r2
shl r2, 4
sub r0, r2
pxor m7, m7
pxor m6, m6
.loop:
mova m0, [r0+r2+0]
mova m1, [r0+r2+0]
mova m2, [r0+r2+8]
mova m3, [r0+r2+8]
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
paddw m1, m0
paddw m3, m2
paddw m3, m1
paddw m6, m3
add r2, r1
js .loop
mova m5, m6
psrlq m6, 32
paddw m6, m5
mova m5, m6
psrlq m6, 16
paddw m6, m5
movd eax, m6
and eax, 0xffff
RET
 
INIT_MMX mmx
; pix_norm1_mmx(uint8_t *pix, int line_size)
cglobal pix_norm1, 2, 4
movsxdifnidn r1, r1d
mov r2, 16
pxor m0, m0
pxor m7, m7
.loop:
mova m2, [r0+0]
mova m3, [r0+8]
mova m1, m2
punpckhbw m1, m0
punpcklbw m2, m0
mova m4, m3
punpckhbw m3, m0
punpcklbw m4, m0
pmaddwd m1, m1
pmaddwd m2, m2
pmaddwd m3, m3
pmaddwd m4, m4
paddd m2, m1
paddd m4, m3
paddd m7, m2
add r0, r1
paddd m7, m4
dec r2
jne .loop
mova m1, m7
psrlq m7, 32
paddd m1, m7
movd eax, m1
RET
 
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dsputilenc_mmx.c
0,0 → 1,1061
/*
* MMX optimized DSP utils
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dct.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/mpegvideo.h"
#include "libavcodec/mathops.h"
#include "dsputil_x86.h"
 
void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size);
void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size);
void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride);
int ff_pix_sum16_mmx(uint8_t * pix, int line_size);
int ff_pix_norm1_mmx(uint8_t *pix, int line_size);
 
#if HAVE_INLINE_ASM
 
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
int tmp;
__asm__ volatile (
"movl %4,%%ecx\n"
"shr $1,%%ecx\n"
"pxor %%mm0,%%mm0\n" /* mm0 = 0 */
"pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
"1:\n"
"movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */
"movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */
"movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */
"movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */
 
/* todo: mm1-mm2, mm3-mm4 */
/* algo: subtract mm1 from mm2 with saturation and vice versa */
/* OR the results to get absolute difference */
"movq %%mm1,%%mm5\n"
"movq %%mm3,%%mm6\n"
"psubusb %%mm2,%%mm1\n"
"psubusb %%mm4,%%mm3\n"
"psubusb %%mm5,%%mm2\n"
"psubusb %%mm6,%%mm4\n"
 
"por %%mm1,%%mm2\n"
"por %%mm3,%%mm4\n"
 
/* now convert to 16-bit vectors so we can square them */
"movq %%mm2,%%mm1\n"
"movq %%mm4,%%mm3\n"
 
"punpckhbw %%mm0,%%mm2\n"
"punpckhbw %%mm0,%%mm4\n"
"punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
"punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
 
"pmaddwd %%mm2,%%mm2\n"
"pmaddwd %%mm4,%%mm4\n"
"pmaddwd %%mm1,%%mm1\n"
"pmaddwd %%mm3,%%mm3\n"
 
"lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */
"lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */
 
"paddd %%mm2,%%mm1\n"
"paddd %%mm4,%%mm3\n"
"paddd %%mm1,%%mm7\n"
"paddd %%mm3,%%mm7\n"
 
"decl %%ecx\n"
"jnz 1b\n"
 
"movq %%mm7,%%mm1\n"
"psrlq $32, %%mm7\n" /* shift hi dword to lo */
"paddd %%mm7,%%mm1\n"
"movd %%mm1,%2\n"
: "+r" (pix1), "+r" (pix2), "=r"(tmp)
: "r" ((x86_reg)line_size) , "m" (h)
: "%ecx");
return tmp;
}
 
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
int tmp;
__asm__ volatile (
"movl %4,%%ecx\n"
"pxor %%mm0,%%mm0\n" /* mm0 = 0 */
"pxor %%mm7,%%mm7\n" /* mm7 holds the sum */
"1:\n"
"movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */
"movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */
"movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */
"movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */
 
/* todo: mm1-mm2, mm3-mm4 */
/* algo: subtract mm1 from mm2 with saturation and vice versa */
/* OR the results to get absolute difference */
"movq %%mm1,%%mm5\n"
"movq %%mm3,%%mm6\n"
"psubusb %%mm2,%%mm1\n"
"psubusb %%mm4,%%mm3\n"
"psubusb %%mm5,%%mm2\n"
"psubusb %%mm6,%%mm4\n"
 
"por %%mm1,%%mm2\n"
"por %%mm3,%%mm4\n"
 
/* now convert to 16-bit vectors so we can square them */
"movq %%mm2,%%mm1\n"
"movq %%mm4,%%mm3\n"
 
"punpckhbw %%mm0,%%mm2\n"
"punpckhbw %%mm0,%%mm4\n"
"punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */
"punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */
 
"pmaddwd %%mm2,%%mm2\n"
"pmaddwd %%mm4,%%mm4\n"
"pmaddwd %%mm1,%%mm1\n"
"pmaddwd %%mm3,%%mm3\n"
 
"add %3,%0\n"
"add %3,%1\n"
 
"paddd %%mm2,%%mm1\n"
"paddd %%mm4,%%mm3\n"
"paddd %%mm1,%%mm7\n"
"paddd %%mm3,%%mm7\n"
 
"decl %%ecx\n"
"jnz 1b\n"
 
"movq %%mm7,%%mm1\n"
"psrlq $32, %%mm7\n" /* shift hi dword to lo */
"paddd %%mm7,%%mm1\n"
"movd %%mm1,%2\n"
: "+r" (pix1), "+r" (pix2), "=r"(tmp)
: "r" ((x86_reg)line_size) , "m" (h)
: "%ecx");
return tmp;
}
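/* Rough high-frequency "noise" measure used by the nsse* functions below:
 * accumulates the absolute difference between the horizontal gradients of
 * consecutive lines of the block. */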
 
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) {
int tmp;
__asm__ volatile (
"movl %3,%%ecx\n"
"pxor %%mm7,%%mm7\n"
"pxor %%mm6,%%mm6\n"
 
"movq (%0),%%mm0\n"
"movq %%mm0, %%mm1\n"
"psllq $8, %%mm0\n"
"psrlq $8, %%mm1\n"
"psrlq $8, %%mm0\n"
"movq %%mm0, %%mm2\n"
"movq %%mm1, %%mm3\n"
"punpcklbw %%mm7,%%mm0\n"
"punpcklbw %%mm7,%%mm1\n"
"punpckhbw %%mm7,%%mm2\n"
"punpckhbw %%mm7,%%mm3\n"
"psubw %%mm1, %%mm0\n"
"psubw %%mm3, %%mm2\n"
 
"add %2,%0\n"
 
"movq (%0),%%mm4\n"
"movq %%mm4, %%mm1\n"
"psllq $8, %%mm4\n"
"psrlq $8, %%mm1\n"
"psrlq $8, %%mm4\n"
"movq %%mm4, %%mm5\n"
"movq %%mm1, %%mm3\n"
"punpcklbw %%mm7,%%mm4\n"
"punpcklbw %%mm7,%%mm1\n"
"punpckhbw %%mm7,%%mm5\n"
"punpckhbw %%mm7,%%mm3\n"
"psubw %%mm1, %%mm4\n"
"psubw %%mm3, %%mm5\n"
"psubw %%mm4, %%mm0\n"
"psubw %%mm5, %%mm2\n"
"pxor %%mm3, %%mm3\n"
"pxor %%mm1, %%mm1\n"
"pcmpgtw %%mm0, %%mm3\n\t"
"pcmpgtw %%mm2, %%mm1\n\t"
"pxor %%mm3, %%mm0\n"
"pxor %%mm1, %%mm2\n"
"psubw %%mm3, %%mm0\n"
"psubw %%mm1, %%mm2\n"
"paddw %%mm0, %%mm2\n"
"paddw %%mm2, %%mm6\n"
 
"add %2,%0\n"
"1:\n"
 
"movq (%0),%%mm0\n"
"movq %%mm0, %%mm1\n"
"psllq $8, %%mm0\n"
"psrlq $8, %%mm1\n"
"psrlq $8, %%mm0\n"
"movq %%mm0, %%mm2\n"
"movq %%mm1, %%mm3\n"
"punpcklbw %%mm7,%%mm0\n"
"punpcklbw %%mm7,%%mm1\n"
"punpckhbw %%mm7,%%mm2\n"
"punpckhbw %%mm7,%%mm3\n"
"psubw %%mm1, %%mm0\n"
"psubw %%mm3, %%mm2\n"
"psubw %%mm0, %%mm4\n"
"psubw %%mm2, %%mm5\n"
"pxor %%mm3, %%mm3\n"
"pxor %%mm1, %%mm1\n"
"pcmpgtw %%mm4, %%mm3\n\t"
"pcmpgtw %%mm5, %%mm1\n\t"
"pxor %%mm3, %%mm4\n"
"pxor %%mm1, %%mm5\n"
"psubw %%mm3, %%mm4\n"
"psubw %%mm1, %%mm5\n"
"paddw %%mm4, %%mm5\n"
"paddw %%mm5, %%mm6\n"
 
"add %2,%0\n"
 
"movq (%0),%%mm4\n"
"movq %%mm4, %%mm1\n"
"psllq $8, %%mm4\n"
"psrlq $8, %%mm1\n"
"psrlq $8, %%mm4\n"
"movq %%mm4, %%mm5\n"
"movq %%mm1, %%mm3\n"
"punpcklbw %%mm7,%%mm4\n"
"punpcklbw %%mm7,%%mm1\n"
"punpckhbw %%mm7,%%mm5\n"
"punpckhbw %%mm7,%%mm3\n"
"psubw %%mm1, %%mm4\n"
"psubw %%mm3, %%mm5\n"
"psubw %%mm4, %%mm0\n"
"psubw %%mm5, %%mm2\n"
"pxor %%mm3, %%mm3\n"
"pxor %%mm1, %%mm1\n"
"pcmpgtw %%mm0, %%mm3\n\t"
"pcmpgtw %%mm2, %%mm1\n\t"
"pxor %%mm3, %%mm0\n"
"pxor %%mm1, %%mm2\n"
"psubw %%mm3, %%mm0\n"
"psubw %%mm1, %%mm2\n"
"paddw %%mm0, %%mm2\n"
"paddw %%mm2, %%mm6\n"
 
"add %2,%0\n"
"subl $2, %%ecx\n"
" jnz 1b\n"
 
"movq %%mm6, %%mm0\n"
"punpcklwd %%mm7,%%mm0\n"
"punpckhwd %%mm7,%%mm6\n"
"paddd %%mm0, %%mm6\n"
 
"movq %%mm6,%%mm0\n"
"psrlq $32, %%mm6\n"
"paddd %%mm6,%%mm0\n"
"movd %%mm0,%1\n"
: "+r" (pix1), "=r"(tmp)
: "r" ((x86_reg)line_size) , "g" (h-2)
: "%ecx");
return tmp;
}
 
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) {
int tmp;
uint8_t * pix= pix1;
__asm__ volatile (
"movl %3,%%ecx\n"
"pxor %%mm7,%%mm7\n"
"pxor %%mm6,%%mm6\n"
 
"movq (%0),%%mm0\n"
"movq 1(%0),%%mm1\n"
"movq %%mm0, %%mm2\n"
"movq %%mm1, %%mm3\n"
"punpcklbw %%mm7,%%mm0\n"
"punpcklbw %%mm7,%%mm1\n"
"punpckhbw %%mm7,%%mm2\n"
"punpckhbw %%mm7,%%mm3\n"
"psubw %%mm1, %%mm0\n"
"psubw %%mm3, %%mm2\n"
 
"add %2,%0\n"
 
"movq (%0),%%mm4\n"
"movq 1(%0),%%mm1\n"
"movq %%mm4, %%mm5\n"
"movq %%mm1, %%mm3\n"
"punpcklbw %%mm7,%%mm4\n"
"punpcklbw %%mm7,%%mm1\n"
"punpckhbw %%mm7,%%mm5\n"
"punpckhbw %%mm7,%%mm3\n"
"psubw %%mm1, %%mm4\n"
"psubw %%mm3, %%mm5\n"
"psubw %%mm4, %%mm0\n"
"psubw %%mm5, %%mm2\n"
"pxor %%mm3, %%mm3\n"
"pxor %%mm1, %%mm1\n"
"pcmpgtw %%mm0, %%mm3\n\t"
"pcmpgtw %%mm2, %%mm1\n\t"
"pxor %%mm3, %%mm0\n"
"pxor %%mm1, %%mm2\n"
"psubw %%mm3, %%mm0\n"
"psubw %%mm1, %%mm2\n"
"paddw %%mm0, %%mm2\n"
"paddw %%mm2, %%mm6\n"
 
"add %2,%0\n"
"1:\n"
 
"movq (%0),%%mm0\n"
"movq 1(%0),%%mm1\n"
"movq %%mm0, %%mm2\n"
"movq %%mm1, %%mm3\n"
"punpcklbw %%mm7,%%mm0\n"
"punpcklbw %%mm7,%%mm1\n"
"punpckhbw %%mm7,%%mm2\n"
"punpckhbw %%mm7,%%mm3\n"
"psubw %%mm1, %%mm0\n"
"psubw %%mm3, %%mm2\n"
"psubw %%mm0, %%mm4\n"
"psubw %%mm2, %%mm5\n"
"pxor %%mm3, %%mm3\n"
"pxor %%mm1, %%mm1\n"
"pcmpgtw %%mm4, %%mm3\n\t"
"pcmpgtw %%mm5, %%mm1\n\t"
"pxor %%mm3, %%mm4\n"
"pxor %%mm1, %%mm5\n"
"psubw %%mm3, %%mm4\n"
"psubw %%mm1, %%mm5\n"
"paddw %%mm4, %%mm5\n"
"paddw %%mm5, %%mm6\n"
 
"add %2,%0\n"
 
"movq (%0),%%mm4\n"
"movq 1(%0),%%mm1\n"
"movq %%mm4, %%mm5\n"
"movq %%mm1, %%mm3\n"
"punpcklbw %%mm7,%%mm4\n"
"punpcklbw %%mm7,%%mm1\n"
"punpckhbw %%mm7,%%mm5\n"
"punpckhbw %%mm7,%%mm3\n"
"psubw %%mm1, %%mm4\n"
"psubw %%mm3, %%mm5\n"
"psubw %%mm4, %%mm0\n"
"psubw %%mm5, %%mm2\n"
"pxor %%mm3, %%mm3\n"
"pxor %%mm1, %%mm1\n"
"pcmpgtw %%mm0, %%mm3\n\t"
"pcmpgtw %%mm2, %%mm1\n\t"
"pxor %%mm3, %%mm0\n"
"pxor %%mm1, %%mm2\n"
"psubw %%mm3, %%mm0\n"
"psubw %%mm1, %%mm2\n"
"paddw %%mm0, %%mm2\n"
"paddw %%mm2, %%mm6\n"
 
"add %2,%0\n"
"subl $2, %%ecx\n"
" jnz 1b\n"
 
"movq %%mm6, %%mm0\n"
"punpcklwd %%mm7,%%mm0\n"
"punpckhwd %%mm7,%%mm6\n"
"paddd %%mm0, %%mm6\n"
 
"movq %%mm6,%%mm0\n"
"psrlq $32, %%mm6\n"
"paddd %%mm6,%%mm0\n"
"movd %%mm0,%1\n"
: "+r" (pix1), "=r"(tmp)
: "r" ((x86_reg)line_size) , "g" (h-2)
: "%ecx");
return tmp + hf_noise8_mmx(pix+8, line_size, h);
}
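
/* The NSSE ("noise preserving SSE") comparison functions below combine the
 * plain SSE score with the difference in high-frequency content of the two
 * blocks: score = sse(pix1, pix2) + nsse_weight * |hf_noise(pix1) - hf_noise(pix2)|,
 * where hf_noise8/16 above sum absolute second-order differences. */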
 
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
MpegEncContext *c = p;
int score1, score2;
 
if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h);
else score1 = sse16_mmx(c, pix1, pix2, line_size, h);
score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h);
 
if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
else return score1 + FFABS(score2)*8;
}
 
static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
MpegEncContext *c = p;
int score1= sse8_mmx(c, pix1, pix2, line_size, h);
int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h);
 
if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight;
else return score1 + FFABS(score2)*8;
}
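
/* vsad_intra16: sum of absolute differences between vertically adjacent
 * pixels of a 16-byte-wide block, i.e. sum over x,y of |pix[y][x] - pix[y-1][x]|.
 * The MMX version accumulates in 16-bit lanes, hence the final "& 0xFFFF". */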
 
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) {
int tmp;
 
av_assert2( (((int)pix) & 7) == 0);
av_assert2((line_size &7) ==0);
 
#define SUM(in0, in1, out0, out1) \
"movq (%0), %%mm2\n"\
"movq 8(%0), %%mm3\n"\
"add %2,%0\n"\
"movq %%mm2, " #out0 "\n"\
"movq %%mm3, " #out1 "\n"\
"psubusb " #in0 ", %%mm2\n"\
"psubusb " #in1 ", %%mm3\n"\
"psubusb " #out0 ", " #in0 "\n"\
"psubusb " #out1 ", " #in1 "\n"\
"por %%mm2, " #in0 "\n"\
"por %%mm3, " #in1 "\n"\
"movq " #in0 ", %%mm2\n"\
"movq " #in1 ", %%mm3\n"\
"punpcklbw %%mm7, " #in0 "\n"\
"punpcklbw %%mm7, " #in1 "\n"\
"punpckhbw %%mm7, %%mm2\n"\
"punpckhbw %%mm7, %%mm3\n"\
"paddw " #in1 ", " #in0 "\n"\
"paddw %%mm3, %%mm2\n"\
"paddw %%mm2, " #in0 "\n"\
"paddw " #in0 ", %%mm6\n"
 
 
__asm__ volatile (
"movl %3,%%ecx\n"
"pxor %%mm6,%%mm6\n"
"pxor %%mm7,%%mm7\n"
"movq (%0),%%mm0\n"
"movq 8(%0),%%mm1\n"
"add %2,%0\n"
"jmp 2f\n"
"1:\n"
 
SUM(%%mm4, %%mm5, %%mm0, %%mm1)
"2:\n"
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
 
"subl $2, %%ecx\n"
"jnz 1b\n"
 
"movq %%mm6,%%mm0\n"
"psrlq $32, %%mm6\n"
"paddw %%mm6,%%mm0\n"
"movq %%mm0,%%mm6\n"
"psrlq $16, %%mm0\n"
"paddw %%mm6,%%mm0\n"
"movd %%mm0,%1\n"
: "+r" (pix), "=r"(tmp)
: "r" ((x86_reg)line_size) , "m" (h)
: "%ecx");
return tmp & 0xFFFF;
}
#undef SUM
 
static int vsad_intra16_mmxext(void *v, uint8_t *pix, uint8_t *dummy,
int line_size, int h)
{
int tmp;
 
av_assert2( (((int)pix) & 7) == 0);
av_assert2((line_size &7) ==0);
 
#define SUM(in0, in1, out0, out1) \
"movq (%0), " #out0 "\n"\
"movq 8(%0), " #out1 "\n"\
"add %2,%0\n"\
"psadbw " #out0 ", " #in0 "\n"\
"psadbw " #out1 ", " #in1 "\n"\
"paddw " #in1 ", " #in0 "\n"\
"paddw " #in0 ", %%mm6\n"
 
__asm__ volatile (
"movl %3,%%ecx\n"
"pxor %%mm6,%%mm6\n"
"pxor %%mm7,%%mm7\n"
"movq (%0),%%mm0\n"
"movq 8(%0),%%mm1\n"
"add %2,%0\n"
"jmp 2f\n"
"1:\n"
 
SUM(%%mm4, %%mm5, %%mm0, %%mm1)
"2:\n"
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
 
"subl $2, %%ecx\n"
"jnz 1b\n"
 
"movd %%mm6,%1\n"
: "+r" (pix), "=r"(tmp)
: "r" ((x86_reg)line_size) , "m" (h)
: "%ecx");
return tmp;
}
#undef SUM
 
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) {
int tmp;
 
av_assert2( (((int)pix1) & 7) == 0);
av_assert2( (((int)pix2) & 7) == 0);
av_assert2((line_size &7) ==0);
 
#define SUM(in0, in1, out0, out1) \
"movq (%0),%%mm2\n"\
"movq (%1)," #out0 "\n"\
"movq 8(%0),%%mm3\n"\
"movq 8(%1)," #out1 "\n"\
"add %3,%0\n"\
"add %3,%1\n"\
"psubb " #out0 ", %%mm2\n"\
"psubb " #out1 ", %%mm3\n"\
"pxor %%mm7, %%mm2\n"\
"pxor %%mm7, %%mm3\n"\
"movq %%mm2, " #out0 "\n"\
"movq %%mm3, " #out1 "\n"\
"psubusb " #in0 ", %%mm2\n"\
"psubusb " #in1 ", %%mm3\n"\
"psubusb " #out0 ", " #in0 "\n"\
"psubusb " #out1 ", " #in1 "\n"\
"por %%mm2, " #in0 "\n"\
"por %%mm3, " #in1 "\n"\
"movq " #in0 ", %%mm2\n"\
"movq " #in1 ", %%mm3\n"\
"punpcklbw %%mm7, " #in0 "\n"\
"punpcklbw %%mm7, " #in1 "\n"\
"punpckhbw %%mm7, %%mm2\n"\
"punpckhbw %%mm7, %%mm3\n"\
"paddw " #in1 ", " #in0 "\n"\
"paddw %%mm3, %%mm2\n"\
"paddw %%mm2, " #in0 "\n"\
"paddw " #in0 ", %%mm6\n"
 
 
__asm__ volatile (
"movl %4,%%ecx\n"
"pxor %%mm6,%%mm6\n"
"pcmpeqw %%mm7,%%mm7\n"
"psllw $15, %%mm7\n"
"packsswb %%mm7, %%mm7\n"
"movq (%0),%%mm0\n"
"movq (%1),%%mm2\n"
"movq 8(%0),%%mm1\n"
"movq 8(%1),%%mm3\n"
"add %3,%0\n"
"add %3,%1\n"
"psubb %%mm2, %%mm0\n"
"psubb %%mm3, %%mm1\n"
"pxor %%mm7, %%mm0\n"
"pxor %%mm7, %%mm1\n"
"jmp 2f\n"
"1:\n"
 
SUM(%%mm4, %%mm5, %%mm0, %%mm1)
"2:\n"
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
 
"subl $2, %%ecx\n"
"jnz 1b\n"
 
"movq %%mm6,%%mm0\n"
"psrlq $32, %%mm6\n"
"paddw %%mm6,%%mm0\n"
"movq %%mm0,%%mm6\n"
"psrlq $16, %%mm0\n"
"paddw %%mm6,%%mm0\n"
"movd %%mm0,%2\n"
: "+r" (pix1), "+r" (pix2), "=r"(tmp)
: "r" ((x86_reg)line_size) , "m" (h)
: "%ecx");
return tmp & 0x7FFF;
}
#undef SUM
 
static int vsad16_mmxext(void *v, uint8_t *pix1, uint8_t *pix2,
int line_size, int h)
{
int tmp;
 
av_assert2( (((int)pix1) & 7) == 0);
av_assert2( (((int)pix2) & 7) == 0);
av_assert2((line_size &7) ==0);
 
#define SUM(in0, in1, out0, out1) \
"movq (%0)," #out0 "\n"\
"movq (%1),%%mm2\n"\
"movq 8(%0)," #out1 "\n"\
"movq 8(%1),%%mm3\n"\
"add %3,%0\n"\
"add %3,%1\n"\
"psubb %%mm2, " #out0 "\n"\
"psubb %%mm3, " #out1 "\n"\
"pxor %%mm7, " #out0 "\n"\
"pxor %%mm7, " #out1 "\n"\
"psadbw " #out0 ", " #in0 "\n"\
"psadbw " #out1 ", " #in1 "\n"\
"paddw " #in1 ", " #in0 "\n"\
"paddw " #in0 ", %%mm6\n"
 
__asm__ volatile (
"movl %4,%%ecx\n"
"pxor %%mm6,%%mm6\n"
"pcmpeqw %%mm7,%%mm7\n"
"psllw $15, %%mm7\n"
"packsswb %%mm7, %%mm7\n"
"movq (%0),%%mm0\n"
"movq (%1),%%mm2\n"
"movq 8(%0),%%mm1\n"
"movq 8(%1),%%mm3\n"
"add %3,%0\n"
"add %3,%1\n"
"psubb %%mm2, %%mm0\n"
"psubb %%mm3, %%mm1\n"
"pxor %%mm7, %%mm0\n"
"pxor %%mm7, %%mm1\n"
"jmp 2f\n"
"1:\n"
 
SUM(%%mm4, %%mm5, %%mm0, %%mm1)
"2:\n"
SUM(%%mm0, %%mm1, %%mm4, %%mm5)
 
"subl $2, %%ecx\n"
"jnz 1b\n"
 
"movd %%mm6,%2\n"
: "+r" (pix1), "+r" (pix2), "=r"(tmp)
: "r" ((x86_reg)line_size) , "m" (h)
: "%ecx");
return tmp;
}
#undef SUM
 
static void diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){
x86_reg i=0;
if(w>=16)
__asm__ volatile(
"1: \n\t"
"movq (%2, %0), %%mm0 \n\t"
"movq (%1, %0), %%mm1 \n\t"
"psubb %%mm0, %%mm1 \n\t"
"movq %%mm1, (%3, %0) \n\t"
"movq 8(%2, %0), %%mm0 \n\t"
"movq 8(%1, %0), %%mm1 \n\t"
"psubb %%mm0, %%mm1 \n\t"
"movq %%mm1, 8(%3, %0) \n\t"
"add $16, %0 \n\t"
"cmp %4, %0 \n\t"
" jb 1b \n\t"
: "+r" (i)
: "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15)
);
for(; i<w; i++)
dst[i+0] = src1[i+0]-src2[i+0];
}
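
/* Scalar reference for the MMX loop below (illustration only): with
 * T = src1[i], L = src2[i-1], LT = src1[i-1], each output byte is
 * dst[i] = src2[i] - mid_pred(L, T, (L + T - LT) & 0xFF),
 * which matches the C expression used for dst[0] after the loop. */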
 
static void sub_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *src1,
const uint8_t *src2, int w,
int *left, int *left_top)
{
x86_reg i=0;
uint8_t l, lt;
 
__asm__ volatile(
"movq (%1, %0), %%mm0 \n\t" // LT
"psllq $8, %%mm0 \n\t"
"1: \n\t"
"movq (%1, %0), %%mm1 \n\t" // T
"movq -1(%2, %0), %%mm2 \n\t" // L
"movq (%2, %0), %%mm3 \n\t" // X
"movq %%mm2, %%mm4 \n\t" // L
"psubb %%mm0, %%mm2 \n\t"
"paddb %%mm1, %%mm2 \n\t" // L + T - LT
"movq %%mm4, %%mm5 \n\t" // L
"pmaxub %%mm1, %%mm4 \n\t" // max(T, L)
"pminub %%mm5, %%mm1 \n\t" // min(T, L)
"pminub %%mm2, %%mm4 \n\t"
"pmaxub %%mm1, %%mm4 \n\t"
"psubb %%mm4, %%mm3 \n\t" // dst - pred
"movq %%mm3, (%3, %0) \n\t"
"add $8, %0 \n\t"
"movq -1(%1, %0), %%mm0 \n\t" // LT
"cmp %4, %0 \n\t"
" jb 1b \n\t"
: "+r" (i)
: "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w)
);
 
l= *left;
lt= *left_top;
 
dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF);
 
*left_top= src1[w-1];
*left = src2[w-1];
}
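
/* Three ways to take per-word absolute values: MMX builds a sign mask with
 * pcmpgtw and computes (a ^ mask) - mask, MMXEXT computes max(a, -a) with
 * pmaxsw, and SSSE3 has a dedicated pabsw instruction. */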
 
#define MMABS_MMX(a,z)\
"pxor " #z ", " #z " \n\t"\
"pcmpgtw " #a ", " #z " \n\t"\
"pxor " #z ", " #a " \n\t"\
"psubw " #z ", " #a " \n\t"
 
#define MMABS_MMXEXT(a, z) \
"pxor " #z ", " #z " \n\t"\
"psubw " #a ", " #z " \n\t"\
"pmaxsw " #z ", " #a " \n\t"
 
#define MMABS_SSSE3(a,z)\
"pabsw " #a ", " #a " \n\t"
 
#define MMABS_SUM(a,z, sum)\
MMABS(a,z)\
"paddusw " #a ", " #sum " \n\t"
 
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can get up to
* about 100k on extreme inputs. But that's very unlikely to occur in natural video,
* and it's even more unlikely to not have any alternative mvs/modes with lower cost. */
#define HSUM_MMX(a, t, dst)\
"movq "#a", "#t" \n\t"\
"psrlq $32, "#a" \n\t"\
"paddusw "#t", "#a" \n\t"\
"movq "#a", "#t" \n\t"\
"psrlq $16, "#a" \n\t"\
"paddusw "#t", "#a" \n\t"\
"movd "#a", "#dst" \n\t"\
 
#define HSUM_MMXEXT(a, t, dst) \
"pshufw $0x0E, "#a", "#t" \n\t"\
"paddusw "#t", "#a" \n\t"\
"pshufw $0x01, "#a", "#t" \n\t"\
"paddusw "#t", "#a" \n\t"\
"movd "#a", "#dst" \n\t"\
 
#define HSUM_SSE2(a, t, dst)\
"movhlps "#a", "#t" \n\t"\
"paddusw "#t", "#a" \n\t"\
"pshuflw $0x0E, "#a", "#t" \n\t"\
"paddusw "#t", "#a" \n\t"\
"pshuflw $0x01, "#a", "#t" \n\t"\
"paddusw "#t", "#a" \n\t"\
"movd "#a", "#dst" \n\t"\
 
#define DCT_SAD4(m,mm,o)\
"mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\
"mov"#m" "#o"+16(%1), "#mm"3 \n\t"\
"mov"#m" "#o"+32(%1), "#mm"4 \n\t"\
"mov"#m" "#o"+48(%1), "#mm"5 \n\t"\
MMABS_SUM(mm##2, mm##6, mm##0)\
MMABS_SUM(mm##3, mm##7, mm##1)\
MMABS_SUM(mm##4, mm##6, mm##0)\
MMABS_SUM(mm##5, mm##7, mm##1)\
 
#define DCT_SAD_MMX\
"pxor %%mm0, %%mm0 \n\t"\
"pxor %%mm1, %%mm1 \n\t"\
DCT_SAD4(q, %%mm, 0)\
DCT_SAD4(q, %%mm, 8)\
DCT_SAD4(q, %%mm, 64)\
DCT_SAD4(q, %%mm, 72)\
"paddusw %%mm1, %%mm0 \n\t"\
HSUM(%%mm0, %%mm1, %0)
 
#define DCT_SAD_SSE2\
"pxor %%xmm0, %%xmm0 \n\t"\
"pxor %%xmm1, %%xmm1 \n\t"\
DCT_SAD4(dqa, %%xmm, 0)\
DCT_SAD4(dqa, %%xmm, 64)\
"paddusw %%xmm1, %%xmm0 \n\t"\
HSUM(%%xmm0, %%xmm1, %0)
 
#define DCT_SAD_FUNC(cpu) \
static int sum_abs_dctelem_##cpu(int16_t *block){\
int sum;\
__asm__ volatile(\
DCT_SAD\
:"=r"(sum)\
:"r"(block)\
);\
return sum&0xFFFF;\
}
 
#define DCT_SAD DCT_SAD_MMX
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst)
#define MMABS(a,z) MMABS_MMX(a,z)
DCT_SAD_FUNC(mmx)
#undef MMABS
#undef HSUM
 
#define HSUM(a,t,dst) HSUM_MMXEXT(a,t,dst)
#define MMABS(a,z) MMABS_MMXEXT(a,z)
DCT_SAD_FUNC(mmxext)
#undef HSUM
#undef DCT_SAD
 
#define DCT_SAD DCT_SAD_SSE2
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst)
DCT_SAD_FUNC(sse2)
#undef MMABS
 
#if HAVE_SSSE3_INLINE
#define MMABS(a,z) MMABS_SSSE3(a,z)
DCT_SAD_FUNC(ssse3)
#undef MMABS
#endif
#undef HSUM
#undef DCT_SAD
 
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){
int sum;
x86_reg i=size;
__asm__ volatile(
"pxor %%mm4, %%mm4 \n"
"1: \n"
"sub $8, %0 \n"
"movq (%2,%0), %%mm2 \n"
"movq (%3,%0,2), %%mm0 \n"
"movq 8(%3,%0,2), %%mm1 \n"
"punpckhbw %%mm2, %%mm3 \n"
"punpcklbw %%mm2, %%mm2 \n"
"psraw $8, %%mm3 \n"
"psraw $8, %%mm2 \n"
"psubw %%mm3, %%mm1 \n"
"psubw %%mm2, %%mm0 \n"
"pmaddwd %%mm1, %%mm1 \n"
"pmaddwd %%mm0, %%mm0 \n"
"paddd %%mm1, %%mm4 \n"
"paddd %%mm0, %%mm4 \n"
"jg 1b \n"
"movq %%mm4, %%mm3 \n"
"psrlq $32, %%mm3 \n"
"paddd %%mm3, %%mm4 \n"
"movd %%mm4, %1 \n"
:"+r"(i), "=r"(sum)
:"r"(pix1), "r"(pix2)
);
return sum;
}
 
#define PHADDD(a, t)\
"movq "#a", "#t" \n\t"\
"psrlq $32, "#a" \n\t"\
"paddd "#t", "#a" \n\t"
/*
pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31]
pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31]
pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30]
*/
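
/* A minimal scalar sketch of the three multiply-high variants described
 * above (reference only; the helper names are illustrative and not part of
 * FFmpeg): */
static inline int16_t mulhw_scalar(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * b) >> 16);          /* pmulhw            */
}
static inline int16_t mulhrw_scalar(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * b + 0x8000) >> 16); /* pmulhrw  (3DNow)  */
}
static inline int16_t mulhrsw_scalar(int16_t a, int16_t b)
{
    return (int16_t)(((int32_t)a * b + 0x4000) >> 15); /* pmulhrsw (SSSE3)  */
}
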
#define PMULHRW(x, y, s, o)\
"pmulhw " #s ", "#x " \n\t"\
"pmulhw " #s ", "#y " \n\t"\
"paddw " #o ", "#x " \n\t"\
"paddw " #o ", "#y " \n\t"\
"psraw $1, "#x " \n\t"\
"psraw $1, "#y " \n\t"
#define DEF(x) x ## _mmx
#define SET_RND MOVQ_WONE
#define SCALE_OFFSET 1
 
#include "dsputil_qns_template.c"
 
#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
 
#define DEF(x) x ## _3dnow
#define SET_RND(x)
#define SCALE_OFFSET 0
#define PMULHRW(x, y, s, o)\
"pmulhrw " #s ", "#x " \n\t"\
"pmulhrw " #s ", "#y " \n\t"
 
#include "dsputil_qns_template.c"
 
#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
 
#if HAVE_SSSE3_INLINE
#undef PHADDD
#define DEF(x) x ## _ssse3
#define SET_RND(x)
#define SCALE_OFFSET -1
#define PHADDD(a, t)\
"pshufw $0x0E, "#a", "#t" \n\t"\
"paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */
#define PMULHRW(x, y, s, o)\
"pmulhrsw " #s ", "#x " \n\t"\
"pmulhrsw " #s ", "#y " \n\t"
 
#include "dsputil_qns_template.c"
 
#undef DEF
#undef SET_RND
#undef SCALE_OFFSET
#undef PMULHRW
#undef PHADDD
#endif /* HAVE_SSSE3_INLINE */
 
#endif /* HAVE_INLINE_ASM */
 
int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h);
 
#define hadamard_func(cpu) \
int ff_hadamard8_diff_##cpu (void *s, uint8_t *src1, uint8_t *src2, \
int stride, int h); \
int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \
int stride, int h);
 
hadamard_func(mmx)
hadamard_func(mmxext)
hadamard_func(sse2)
hadamard_func(ssse3)
 
av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx)
{
int cpu_flags = av_get_cpu_flags();
const int dct_algo = avctx->dct_algo;
 
#if HAVE_YASM
int bit_depth = avctx->bits_per_raw_sample;
 
if (EXTERNAL_MMX(cpu_flags)) {
if (bit_depth <= 8)
c->get_pixels = ff_get_pixels_mmx;
c->diff_pixels = ff_diff_pixels_mmx;
c->pix_sum = ff_pix_sum16_mmx;
 
c->pix_norm1 = ff_pix_norm1_mmx;
}
if (EXTERNAL_SSE2(cpu_flags))
if (bit_depth <= 8)
c->get_pixels = ff_get_pixels_sse2;
#endif /* HAVE_YASM */
 
#if HAVE_INLINE_ASM
if (INLINE_MMX(cpu_flags)) {
if (avctx->bits_per_raw_sample <= 8 &&
(dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
c->fdct = ff_fdct_mmx;
 
c->diff_bytes= diff_bytes_mmx;
c->sum_abs_dctelem= sum_abs_dctelem_mmx;
 
c->sse[0] = sse16_mmx;
c->sse[1] = sse8_mmx;
c->vsad[4]= vsad_intra16_mmx;
 
c->nsse[0] = nsse16_mmx;
c->nsse[1] = nsse8_mmx;
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->vsad[0] = vsad16_mmx;
}
 
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->try_8x8basis= try_8x8basis_mmx;
}
c->add_8x8basis= add_8x8basis_mmx;
 
c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx;
}
 
if (INLINE_MMXEXT(cpu_flags)) {
if (avctx->bits_per_raw_sample <= 8 &&
(dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
c->fdct = ff_fdct_mmxext;
 
c->sum_abs_dctelem = sum_abs_dctelem_mmxext;
c->vsad[4] = vsad_intra16_mmxext;
 
if (!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->vsad[0] = vsad16_mmxext;
}
 
c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_mmxext;
}
 
if (INLINE_SSE2(cpu_flags)) {
if (avctx->bits_per_raw_sample <= 8 &&
(dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX))
c->fdct = ff_fdct_sse2;
 
c->sum_abs_dctelem= sum_abs_dctelem_sse2;
}
 
#if HAVE_SSSE3_INLINE
if (INLINE_SSSE3(cpu_flags)) {
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
c->try_8x8basis = try_8x8basis_ssse3;
}
c->add_8x8basis = add_8x8basis_ssse3;
c->sum_abs_dctelem = sum_abs_dctelem_ssse3;
}
#endif
 
if (INLINE_AMD3DNOW(cpu_flags)) {
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
c->try_8x8basis = try_8x8basis_3dnow;
}
c->add_8x8basis = add_8x8basis_3dnow;
}
#endif /* HAVE_INLINE_ASM */
 
if (EXTERNAL_MMX(cpu_flags)) {
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx;
c->hadamard8_diff[1] = ff_hadamard8_diff_mmx;
}
 
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext;
c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext;
}
 
if (EXTERNAL_SSE2(cpu_flags)) {
c->sse[0] = ff_sse16_sse2;
 
#if HAVE_ALIGNED_STACK
c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2;
c->hadamard8_diff[1] = ff_hadamard8_diff_sse2;
#endif
}
 
if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) {
c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3;
c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3;
}
 
ff_dsputil_init_pix_mmx(c, avctx);
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dwt_yasm.asm
0,0 → 1,306
;******************************************************************************
;* MMX optimized discrete wavelet transform
;* Copyright (c) 2010 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
pw_1: times 8 dw 1
pw_2: times 8 dw 2
pw_8: times 8 dw 8
pw_16: times 8 dw 16
pw_1991: times 4 dw 9,-1
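; pw_1991 holds {9,-1} word pairs: pmaddwd against data interleaved as {x,y}
; computes 9*x - y, the taps used by the dd97/dd137 compose loops below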
 
section .text
 
; %1 -= (%2 + %3 + 2)>>2   (%4 is pw_2)
%macro COMPOSE_53iL0 4
paddw %2, %3
paddw %2, %4
psraw %2, 2
psubw %1, %2
%endm
 
; m1 = %1 + (-m0 + 9*m1 + 9*%2 -%3 + 8)>>4
; if %4 is supplied, %1 is loaded unaligned from there
; m2: clobbered m3: pw_8 m4: pw_1991
%macro COMPOSE_DD97iH0 3-4
paddw m0, %3
paddw m1, %2
psubw m0, m3
mova m2, m1
punpcklwd m1, m0
punpckhwd m2, m0
pmaddwd m1, m4
pmaddwd m2, m4
%if %0 > 3
movu %1, %4
%endif
psrad m1, 4
psrad m2, 4
packssdw m1, m2
paddw m1, %1
%endm
 
%macro COMPOSE_VERTICAL 1
; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
; int width)
cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width
mova m2, [pw_2]
%if ARCH_X86_64
mov widthd, widthd
%endif
.loop:
sub widthq, mmsize/2
mova m1, [b0q+2*widthq]
mova m0, [b1q+2*widthq]
COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2
mova [b1q+2*widthq], m0
jg .loop
REP_RET
 
; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
; int width)
cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width
mova m1, [pw_1]
%if ARCH_X86_64
mov widthd, widthd
%endif
.loop:
sub widthq, mmsize/2
mova m0, [b0q+2*widthq]
paddw m0, [b2q+2*widthq]
paddw m0, m1
psraw m0, 1
paddw m0, [b1q+2*widthq]
mova [b1q+2*widthq], m0
jg .loop
REP_RET
 
; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
; IDWTELEM *b3, IDWTELEM *b4, int width)
cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width
mova m3, [pw_8]
mova m4, [pw_1991]
%if ARCH_X86_64
mov widthd, widthd
%endif
.loop:
sub widthq, mmsize/2
mova m0, [b0q+2*widthq]
mova m1, [b1q+2*widthq]
COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq]
mova [b2q+2*widthq], m1
jg .loop
REP_RET
 
; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2,
; IDWTELEM *b3, IDWTELEM *b4, int width)
cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width
mova m3, [pw_16]
mova m4, [pw_1991]
%if ARCH_X86_64
mov widthd, widthd
%endif
.loop:
sub widthq, mmsize/2
mova m0, [b0q+2*widthq]
mova m1, [b1q+2*widthq]
mova m5, [b2q+2*widthq]
paddw m0, [b4q+2*widthq]
paddw m1, [b3q+2*widthq]
psubw m0, m3
mova m2, m1
punpcklwd m1, m0
punpckhwd m2, m0
pmaddwd m1, m4
pmaddwd m2, m4
psrad m1, 5
psrad m2, 5
packssdw m1, m2
psubw m5, m1
mova [b2q+2*widthq], m5
jg .loop
REP_RET
 
; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width)
cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width
mova m3, [pw_1]
%if ARCH_X86_64
mov widthd, widthd
%endif
.loop:
sub widthq, mmsize/2
mova m1, [b1q+2*widthq]
mova m0, [b0q+2*widthq]
mova m2, m1
paddw m1, m3
psraw m1, 1
psubw m0, m1
mova [b0q+2*widthq], m0
paddw m2, m0
mova [b1q+2*widthq], m2
jg .loop
REP_RET
%endmacro
 
; extend the left and right edges of the tmp array by %1 and %2 respectively
%macro EDGE_EXTENSION 3
mov %3, [tmpq]
%assign %%i 1
%rep %1
mov [tmpq-2*%%i], %3
%assign %%i %%i+1
%endrep
mov %3, [tmpq+2*w2q-2]
%assign %%i 0
%rep %2
mov [tmpq+2*w2q+2*%%i], %3
%assign %%i %%i+1
%endrep
%endmacro
 
 
%macro HAAR_HORIZONTAL 2
; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width)
cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2
mov w2d, wd
xor xq, xq
shr w2d, 1
lea b_w2q, [bq+wq]
mova m3, [pw_1]
.lowpass_loop:
movu m1, [b_w2q + 2*xq]
mova m0, [bq + 2*xq]
paddw m1, m3
psraw m1, 1
psubw m0, m1
mova [tmpq + 2*xq], m0
add xq, mmsize/2
cmp xq, w2q
jl .lowpass_loop
 
xor xq, xq
and w2q, ~(mmsize/2 - 1)
cmp w2q, mmsize/2
jl .end
 
.highpass_loop:
movu m1, [b_w2q + 2*xq]
mova m0, [tmpq + 2*xq]
paddw m1, m0
 
; shift and interleave
%if %2 == 1
paddw m0, m3
paddw m1, m3
psraw m0, 1
psraw m1, 1
%endif
mova m2, m0
punpcklwd m0, m1
punpckhwd m2, m1
mova [bq+4*xq], m0
mova [bq+4*xq+mmsize], m2
 
add xq, mmsize/2
cmp xq, w2q
jl .highpass_loop
.end:
REP_RET
%endmacro
 
 
INIT_XMM
; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width)
cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2
mov w2d, wd
xor xd, xd
shr w2d, 1
lea b_w2q, [bq+wq]
movu m4, [bq+wq]
mova m7, [pw_2]
pslldq m4, 14
.lowpass_loop:
movu m1, [b_w2q + 2*xq]
mova m0, [bq + 2*xq]
mova m2, m1
palignr m1, m4, 14
mova m4, m2
COMPOSE_53iL0 m0, m1, m2, m7
mova [tmpq + 2*xq], m0
add xd, mmsize/2
cmp xd, w2d
jl .lowpass_loop
 
EDGE_EXTENSION 1, 2, xw
; leave the last up to 7 (sse) or 3 (mmx) values for C
xor xd, xd
and w2d, ~(mmsize/2 - 1)
cmp w2d, mmsize/2
jl .end
 
mova m7, [tmpq-mmsize]
mova m0, [tmpq]
mova m5, [pw_1]
mova m3, [pw_8]
mova m4, [pw_1991]
.highpass_loop:
mova m6, m0
palignr m0, m7, 14
mova m7, [tmpq + 2*xq + 16]
mova m1, m7
mova m2, m7
palignr m1, m6, 2
palignr m2, m6, 4
COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq]
mova m0, m7
mova m7, m6
 
; shift and interleave
paddw m6, m5
paddw m1, m5
psraw m6, 1
psraw m1, 1
mova m2, m6
punpcklwd m6, m1
punpckhwd m2, m1
mova [bq+4*xq], m6
mova [bq+4*xq+mmsize], m2
 
add xd, mmsize/2
cmp xd, w2d
jl .highpass_loop
.end:
REP_RET
 
 
%if ARCH_X86_64 == 0
INIT_MMX
COMPOSE_VERTICAL mmx
HAAR_HORIZONTAL mmx, 0
HAAR_HORIZONTAL mmx, 1
%endif
 
;;INIT_XMM
INIT_XMM
COMPOSE_VERTICAL sse2
HAAR_HORIZONTAL sse2, 0
HAAR_HORIZONTAL sse2, 1
/contrib/sdk/sources/ffmpeg/libavcodec/x86/fdct.c
0,0 → 1,594
/*
* MMX optimized forward DCT
* The gcc porting is Copyright (c) 2001 Fabrice Bellard.
* cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
* SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
*
* from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
*
* Intel Application Note AP-922 - fast, precise implementation of DCT
* http://developer.intel.com/vtune/cbts/appnotes.htm
*
* Also of inspiration:
* a page about fdct at http://www.geocities.com/ssavekar/dct.htm
* Skal's fdct at http://skal.planet-d.net/coding/dct.html
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/common.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/dct.h"
 
#if HAVE_MMX_INLINE
 
//////////////////////////////////////////////////////////////////////
//
// constants for the forward DCT
// -----------------------------
//
// Be sure to check that your compiler is aligning all constants to QWORD
// (8-byte) memory boundaries! Otherwise the unaligned memory access will
// severely stall MMX execution.
//
//////////////////////////////////////////////////////////////////////
 
#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
#define SHIFT_FRW_COL BITS_FRW_ACC
#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
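// With BITS_FRW_ACC = 3 these expand to SHIFT_FRW_COL = 3, SHIFT_FRW_ROW = 17
// and RND_FRW_ROW = 1 << 16 = 65536, the rounding bias added before the final
// right shift of every row pass.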
 
#define X8(x) x,x,x,x,x,x,x,x
 
//concatenated table, for forward DCT transformation
DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = {
X8(13036), // tg * (2<<16) + 0.5
X8(27146), // tg * (2<<16) + 0.5
X8(-21746) // tg * (2<<16) + 0.5
};
 
DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = {
X8(23170) //cos * (2<<15) + 0.5
};
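
// The constants above appear to follow Intel's AP-922 tables:
// 13036 ~= tan(1*pi/16) * 2^16, 27146 ~= tan(2*pi/16) * 2^16,
// -21746 ~= tan(3*pi/16) * 2^16 - 2^16 (wrapped to 16 bits),
// and 23170 ~= cos(pi/4) * 2^15.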
 
DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
 
DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
 
static const struct
{
DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
} fdct_r_row_sse2 =
{{
RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
}};
//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
 
DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table
16384, 16384, 22725, 19266,
16384, 16384, 12873, 4520,
21407, 8867, 19266, -4520,
-8867, -21407, -22725, -12873,
16384, -16384, 12873, -22725,
-16384, 16384, 4520, 19266,
8867, -21407, 4520, -12873,
21407, -8867, 19266, -22725,
 
22725, 22725, 31521, 26722,
22725, 22725, 17855, 6270,
29692, 12299, 26722, -6270,
-12299, -29692, -31521, -17855,
22725, -22725, 17855, -31521,
-22725, 22725, 6270, 26722,
12299, -29692, 6270, -17855,
29692, -12299, 26722, -31521,
 
21407, 21407, 29692, 25172,
21407, 21407, 16819, 5906,
27969, 11585, 25172, -5906,
-11585, -27969, -29692, -16819,
21407, -21407, 16819, -29692,
-21407, 21407, 5906, 25172,
11585, -27969, 5906, -16819,
27969, -11585, 25172, -29692,
 
19266, 19266, 26722, 22654,
19266, 19266, 15137, 5315,
25172, 10426, 22654, -5315,
-10426, -25172, -26722, -15137,
19266, -19266, 15137, -26722,
-19266, 19266, 5315, 22654,
10426, -25172, 5315, -15137,
25172, -10426, 22654, -26722,
 
16384, 16384, 22725, 19266,
16384, 16384, 12873, 4520,
21407, 8867, 19266, -4520,
-8867, -21407, -22725, -12873,
16384, -16384, 12873, -22725,
-16384, 16384, 4520, 19266,
8867, -21407, 4520, -12873,
21407, -8867, 19266, -22725,
 
19266, 19266, 26722, 22654,
19266, 19266, 15137, 5315,
25172, 10426, 22654, -5315,
-10426, -25172, -26722, -15137,
19266, -19266, 15137, -26722,
-19266, 19266, 5315, 22654,
10426, -25172, 5315, -15137,
25172, -10426, 22654, -26722,
 
21407, 21407, 29692, 25172,
21407, 21407, 16819, 5906,
27969, 11585, 25172, -5906,
-11585, -27969, -29692, -16819,
21407, -21407, 16819, -29692,
-21407, 21407, 5906, 25172,
11585, -27969, 5906, -16819,
27969, -11585, 25172, -29692,
 
22725, 22725, 31521, 26722,
22725, 22725, 17855, 6270,
29692, 12299, 26722, -6270,
-12299, -29692, -31521, -17855,
22725, -22725, 17855, -31521,
-22725, 22725, 6270, 26722,
12299, -29692, 6270, -17855,
29692, -12299, 26722, -31521,
};
 
static const struct
{
DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
} tab_frw_01234567_sse2 =
{{
//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table
#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
C4, C4, C5, C7, C2, C6, C3, -C7, \
-C4, C4, C7, C3, C6, -C2, C7, -C5, \
C4, -C4, C5, -C1, C2, -C6, C3, -C1,
// c1..c7 * cos(pi/4) * 2^15
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520
TABLE_SSE2
 
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 31521
#define C2 29692
#define C3 26722
#define C4 22725
#define C5 17855
#define C6 12299
#define C7 6270
TABLE_SSE2
 
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 29692
#define C2 27969
#define C3 25172
#define C4 21407
#define C5 16819
#define C6 11585
#define C7 5906
TABLE_SSE2
 
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 26722
#define C2 25172
#define C3 22654
#define C4 19266
#define C5 15137
#define C6 10426
#define C7 5315
TABLE_SSE2
 
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 22725
#define C2 21407
#define C3 19266
#define C4 16384
#define C5 12873
#define C6 8867
#define C7 4520
TABLE_SSE2
 
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 26722
#define C2 25172
#define C3 22654
#define C4 19266
#define C5 15137
#define C6 10426
#define C7 5315
TABLE_SSE2
 
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 29692
#define C2 27969
#define C3 25172
#define C4 21407
#define C5 16819
#define C6 11585
#define C7 5906
TABLE_SSE2
 
#undef C1
#undef C2
#undef C3
#undef C4
#undef C5
#undef C6
#undef C7
#define C1 31521
#define C2 29692
#define C3 26722
#define C4 22725
#define C5 17855
#define C6 12299
#define C7 6270
TABLE_SSE2
}};
 
#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
 
#define FDCT_COL(cpu, mm, mov)\
static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\
{\
__asm__ volatile (\
#mov" 16(%0), %%"#mm"0 \n\t" \
#mov" 96(%0), %%"#mm"1 \n\t" \
#mov" %%"#mm"0, %%"#mm"2 \n\t" \
#mov" 32(%0), %%"#mm"3 \n\t" \
"paddsw %%"#mm"1, %%"#mm"0 \n\t" \
#mov" 80(%0), %%"#mm"4 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
#mov" (%0), %%"#mm"5 \n\t" \
"paddsw %%"#mm"3, %%"#mm"4 \n\t" \
"paddsw 112(%0), %%"#mm"5 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
#mov" %%"#mm"0, %%"#mm"6 \n\t" \
"psubsw %%"#mm"1, %%"#mm"2 \n\t" \
#mov" 16(%1), %%"#mm"1 \n\t" \
"psubsw %%"#mm"4, %%"#mm"0 \n\t" \
#mov" 48(%0), %%"#mm"7 \n\t" \
"pmulhw %%"#mm"0, %%"#mm"1 \n\t" \
"paddsw 64(%0), %%"#mm"7 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
"paddsw %%"#mm"4, %%"#mm"6 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
#mov" %%"#mm"5, %%"#mm"4 \n\t" \
"psubsw %%"#mm"7, %%"#mm"5 \n\t" \
"paddsw %%"#mm"5, %%"#mm"1 \n\t" \
"paddsw %%"#mm"7, %%"#mm"4 \n\t" \
"por (%2), %%"#mm"1 \n\t" \
"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
"pmulhw 16(%1), %%"#mm"5 \n\t" \
#mov" %%"#mm"4, %%"#mm"7 \n\t" \
"psubsw 80(%0), %%"#mm"3 \n\t" \
"psubsw %%"#mm"6, %%"#mm"4 \n\t" \
#mov" %%"#mm"1, 32(%3) \n\t" \
"paddsw %%"#mm"6, %%"#mm"7 \n\t" \
#mov" 48(%0), %%"#mm"1 \n\t" \
"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
"psubsw 64(%0), %%"#mm"1 \n\t" \
#mov" %%"#mm"2, %%"#mm"6 \n\t" \
#mov" %%"#mm"4, 64(%3) \n\t" \
"paddsw %%"#mm"3, %%"#mm"2 \n\t" \
"pmulhw (%4), %%"#mm"2 \n\t" \
"psubsw %%"#mm"3, %%"#mm"6 \n\t" \
"pmulhw (%4), %%"#mm"6 \n\t" \
"psubsw %%"#mm"0, %%"#mm"5 \n\t" \
"por (%2), %%"#mm"5 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
"por (%2), %%"#mm"2 \n\t" \
#mov" %%"#mm"1, %%"#mm"4 \n\t" \
#mov" (%0), %%"#mm"3 \n\t" \
"paddsw %%"#mm"6, %%"#mm"1 \n\t" \
"psubsw 112(%0), %%"#mm"3 \n\t" \
"psubsw %%"#mm"6, %%"#mm"4 \n\t" \
#mov" (%1), %%"#mm"0 \n\t" \
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
#mov" 32(%1), %%"#mm"6 \n\t" \
"pmulhw %%"#mm"1, %%"#mm"0 \n\t" \
#mov" %%"#mm"7, (%3) \n\t" \
"pmulhw %%"#mm"4, %%"#mm"6 \n\t" \
#mov" %%"#mm"5, 96(%3) \n\t" \
#mov" %%"#mm"3, %%"#mm"7 \n\t" \
#mov" 32(%1), %%"#mm"5 \n\t" \
"psubsw %%"#mm"2, %%"#mm"7 \n\t" \
"paddsw %%"#mm"2, %%"#mm"3 \n\t" \
"pmulhw %%"#mm"7, %%"#mm"5 \n\t" \
"paddsw %%"#mm"3, %%"#mm"0 \n\t" \
"paddsw %%"#mm"4, %%"#mm"6 \n\t" \
"pmulhw (%1), %%"#mm"3 \n\t" \
"por (%2), %%"#mm"0 \n\t" \
"paddsw %%"#mm"7, %%"#mm"5 \n\t" \
"psubsw %%"#mm"6, %%"#mm"7 \n\t" \
#mov" %%"#mm"0, 16(%3) \n\t" \
"paddsw %%"#mm"4, %%"#mm"5 \n\t" \
#mov" %%"#mm"7, 48(%3) \n\t" \
"psubsw %%"#mm"1, %%"#mm"3 \n\t" \
#mov" %%"#mm"5, 80(%3) \n\t" \
#mov" %%"#mm"3, 112(%3) \n\t" \
: \
: "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
"r" (out + offset), "r" (ocos_4_16)); \
}
 
FDCT_COL(mmx, mm, movq)
FDCT_COL(sse2, xmm, movdqa)
 
static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out)
{
__asm__ volatile(
#define FDCT_ROW_SSE2_H1(i,t) \
"movq " #i "(%0), %%xmm2 \n\t" \
"movq " #i "+8(%0), %%xmm0 \n\t" \
"movdqa " #t "+32(%1), %%xmm3 \n\t" \
"movdqa " #t "+48(%1), %%xmm7 \n\t" \
"movdqa " #t "(%1), %%xmm4 \n\t" \
"movdqa " #t "+16(%1), %%xmm5 \n\t"
 
#define FDCT_ROW_SSE2_H2(i,t) \
"movq " #i "(%0), %%xmm2 \n\t" \
"movq " #i "+8(%0), %%xmm0 \n\t" \
"movdqa " #t "+32(%1), %%xmm3 \n\t" \
"movdqa " #t "+48(%1), %%xmm7 \n\t"
 
#define FDCT_ROW_SSE2(i) \
"movq %%xmm2, %%xmm1 \n\t" \
"pshuflw $27, %%xmm0, %%xmm0 \n\t" \
"paddsw %%xmm0, %%xmm1 \n\t" \
"psubsw %%xmm0, %%xmm2 \n\t" \
"punpckldq %%xmm2, %%xmm1 \n\t" \
"pshufd $78, %%xmm1, %%xmm2 \n\t" \
"pmaddwd %%xmm2, %%xmm3 \n\t" \
"pmaddwd %%xmm1, %%xmm7 \n\t" \
"pmaddwd %%xmm5, %%xmm2 \n\t" \
"pmaddwd %%xmm4, %%xmm1 \n\t" \
"paddd %%xmm7, %%xmm3 \n\t" \
"paddd %%xmm2, %%xmm1 \n\t" \
"paddd %%xmm6, %%xmm3 \n\t" \
"paddd %%xmm6, %%xmm1 \n\t" \
"psrad %3, %%xmm3 \n\t" \
"psrad %3, %%xmm1 \n\t" \
"packssdw %%xmm3, %%xmm1 \n\t" \
"movdqa %%xmm1, " #i "(%4) \n\t"
 
"movdqa (%2), %%xmm6 \n\t"
FDCT_ROW_SSE2_H1(0,0)
FDCT_ROW_SSE2(0)
FDCT_ROW_SSE2_H2(64,0)
FDCT_ROW_SSE2(64)
 
FDCT_ROW_SSE2_H1(16,64)
FDCT_ROW_SSE2(16)
FDCT_ROW_SSE2_H2(112,64)
FDCT_ROW_SSE2(112)
 
FDCT_ROW_SSE2_H1(32,128)
FDCT_ROW_SSE2(32)
FDCT_ROW_SSE2_H2(96,128)
FDCT_ROW_SSE2(96)
 
FDCT_ROW_SSE2_H1(48,192)
FDCT_ROW_SSE2(48)
FDCT_ROW_SSE2_H2(80,192)
FDCT_ROW_SSE2(80)
:
: "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2),
"r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}
 
static av_always_inline void fdct_row_mmxext(const int16_t *in, int16_t *out,
const int16_t *table)
{
__asm__ volatile (
"pshufw $0x1B, 8(%0), %%mm5 \n\t"
"movq (%0), %%mm0 \n\t"
"movq %%mm0, %%mm1 \n\t"
"paddsw %%mm5, %%mm0 \n\t"
"psubsw %%mm5, %%mm1 \n\t"
"movq %%mm0, %%mm2 \n\t"
"punpckldq %%mm1, %%mm0 \n\t"
"punpckhdq %%mm1, %%mm2 \n\t"
"movq (%1), %%mm1 \n\t"
"movq 8(%1), %%mm3 \n\t"
"movq 16(%1), %%mm4 \n\t"
"movq 24(%1), %%mm5 \n\t"
"movq 32(%1), %%mm6 \n\t"
"movq 40(%1), %%mm7 \n\t"
"pmaddwd %%mm0, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm0, %%mm4 \n\t"
"pmaddwd %%mm2, %%mm5 \n\t"
"pmaddwd %%mm0, %%mm6 \n\t"
"pmaddwd %%mm2, %%mm7 \n\t"
"pmaddwd 48(%1), %%mm0 \n\t"
"pmaddwd 56(%1), %%mm2 \n\t"
"paddd %%mm1, %%mm3 \n\t"
"paddd %%mm4, %%mm5 \n\t"
"paddd %%mm6, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"movq (%2), %%mm0 \n\t"
"paddd %%mm0, %%mm3 \n\t"
"paddd %%mm0, %%mm5 \n\t"
"paddd %%mm0, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
"packssdw %%mm5, %%mm3 \n\t"
"packssdw %%mm2, %%mm7 \n\t"
"movq %%mm3, (%3) \n\t"
"movq %%mm7, 8(%3) \n\t"
:
: "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
}
 
static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table)
{
//FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
__asm__ volatile(
"movd 12(%0), %%mm1 \n\t"
"punpcklwd 8(%0), %%mm1 \n\t"
"movq %%mm1, %%mm2 \n\t"
"psrlq $0x20, %%mm1 \n\t"
"movq 0(%0), %%mm0 \n\t"
"punpcklwd %%mm2, %%mm1 \n\t"
"movq %%mm0, %%mm5 \n\t"
"paddsw %%mm1, %%mm0 \n\t"
"psubsw %%mm1, %%mm5 \n\t"
"movq %%mm0, %%mm2 \n\t"
"punpckldq %%mm5, %%mm0 \n\t"
"punpckhdq %%mm5, %%mm2 \n\t"
"movq 0(%1), %%mm1 \n\t"
"movq 8(%1), %%mm3 \n\t"
"movq 16(%1), %%mm4 \n\t"
"movq 24(%1), %%mm5 \n\t"
"movq 32(%1), %%mm6 \n\t"
"movq 40(%1), %%mm7 \n\t"
"pmaddwd %%mm0, %%mm1 \n\t"
"pmaddwd %%mm2, %%mm3 \n\t"
"pmaddwd %%mm0, %%mm4 \n\t"
"pmaddwd %%mm2, %%mm5 \n\t"
"pmaddwd %%mm0, %%mm6 \n\t"
"pmaddwd %%mm2, %%mm7 \n\t"
"pmaddwd 48(%1), %%mm0 \n\t"
"pmaddwd 56(%1), %%mm2 \n\t"
"paddd %%mm1, %%mm3 \n\t"
"paddd %%mm4, %%mm5 \n\t"
"paddd %%mm6, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"movq (%2), %%mm0 \n\t"
"paddd %%mm0, %%mm3 \n\t"
"paddd %%mm0, %%mm5 \n\t"
"paddd %%mm0, %%mm7 \n\t"
"paddd %%mm0, %%mm2 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
"psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
"packssdw %%mm5, %%mm3 \n\t"
"packssdw %%mm2, %%mm7 \n\t"
"movq %%mm3, 0(%3) \n\t"
"movq %%mm7, 8(%3) \n\t"
:
: "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
}
 
void ff_fdct_mmx(int16_t *block)
{
DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
int16_t * block1= (int16_t*)align_tmp;
const int16_t *table= tab_frw_01234567;
int i;
 
fdct_col_mmx(block, block1, 0);
fdct_col_mmx(block, block1, 4);
 
for(i=8;i>0;i--) {
fdct_row_mmx(block1, block, table);
block1 += 8;
table += 32;
block += 8;
}
}
 
#endif /* HAVE_MMX_INLINE */
 
#if HAVE_MMXEXT_INLINE
 
void ff_fdct_mmxext(int16_t *block)
{
DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
int16_t *block1= (int16_t*)align_tmp;
const int16_t *table= tab_frw_01234567;
int i;
 
fdct_col_mmx(block, block1, 0);
fdct_col_mmx(block, block1, 4);
 
for(i=8;i>0;i--) {
fdct_row_mmxext(block1, block, table);
block1 += 8;
table += 32;
block += 8;
}
}
 
#endif /* HAVE_MMXEXT_INLINE */
 
#if HAVE_SSE2_INLINE
 
void ff_fdct_sse2(int16_t *block)
{
DECLARE_ALIGNED(16, int64_t, align_tmp)[16];
int16_t * const block1= (int16_t*)align_tmp;
 
fdct_col_sse2(block, block1, 0);
fdct_row_sse2(block1, block);
}
 
#endif /* HAVE_SSE2_INLINE */
/contrib/sdk/sources/ffmpeg/libavcodec/x86/fft.asm
0,0 → 1,1092
;******************************************************************************
;* FFT transform with SSE/3DNow optimizations
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2011 Vitor Sessak
;*
;* This algorithm (though not any of the implementation details) is
;* based on libdjbfft by D. J. Bernstein.
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
; These functions are not individually interchangeable with the C versions.
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results
;* in blocks as convenient to the vector size.
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively)
 
%include "libavutil/x86/x86util.asm"
 
%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif
 
SECTION_RODATA 32
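
; This struc only mirrors the leading fields of the C FFTContext defined in
; libavcodec/fft.h; the offsets below must stay in sync with that header.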
 
struc FFTContext
.nbits: resd 1
.reverse: resd 1
.revtab: pointer 1
.tmpbuf: pointer 1
.mdctsize: resd 1
.mdctbits: resd 1
.tcos: pointer 1
.tsin: pointer 1
.fftperm: pointer 1
.fftcalc: pointer 1
.imdctcalc:pointer 1
.imdcthalf:pointer 1
endstruc
 
%define M_SQRT1_2 0.70710678118654752440
%define M_COS_PI_1_8 0.923879532511287
%define M_COS_PI_3_8 0.38268343236509
 
ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8
 
ps_root2: times 8 dd M_SQRT1_2
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0
 
perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31
ps_m1m1m1m1: times 4 dd 1<<31
ps_m1p1: dd 1<<31, 0
 
%assign i 16
%rep 13
cextern cos_ %+ i
%assign i i<<1
%endrep
 
%if ARCH_X86_64
%define pointer dq
%else
%define pointer dd
%endif
 
%macro IF0 1+
%endmacro
%macro IF1 1+
%1
%endmacro
 
SECTION_TEXT
 
%macro T2_3DNOW 4 ; z0, z1, mem0, mem1
mova %1, %3
mova %2, %1
pfadd %1, %4
pfsub %2, %4
%endmacro
 
%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1
mova %5, %3
pfsub %3, %4
pfadd %5, %4 ; {t6,t5}
pxor %3, [ps_m1p1] ; {t8,t7}
mova %6, %1
movd [r0+12], %3
punpckhdq %3, [r0+8]
pfadd %1, %5 ; {r0,i0}
pfsub %6, %5 ; {r2,i2}
mova %4, %2
pfadd %2, %3 ; {r1,i1}
pfsub %4, %3 ; {r3,i3}
SWAP %3, %6
%endmacro
 
; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6}
; %2 = {r1,i1,r3,i3,r5,i5,r7,i7}
; %3, %4, %5 tmp
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3}
; %2 = {r4,r5,r6,r7,i4,i5,i6,i7}
%macro T8_AVX 5
vsubps %5, %1, %2 ; v = %1 - %2
vaddps %3, %1, %2 ; w = %1 + %2
vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1
vpermilps %2, %2, [perm1]
vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6}
vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5}
vsubps %4, %5, %1 ; s = r - q
vaddps %1, %5, %1 ; u = r + q
vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8}
vshufps %5, %4, %1, 0xbb
vshufps %3, %4, %1, 0xee
vperm2f128 %3, %3, %5, 0x13
vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1}
vshufps %2, %1, %4, 0xdd
vshufps %1, %1, %4, 0x88
vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4}
vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7}
vsubps %5, %1, %3
vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8}
vsubps %2, %4, %1 ; %2 = v - w
vaddps %1, %4, %1 ; %1 = v + w
%endmacro
 
; In SSE mode do one fft4 transforms
; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3}
;
; In AVX mode do two fft4 transforms
; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7}
%macro T4_SSE 3
subps %3, %1, %2 ; {t3,t4,-t8,t7}
addps %1, %1, %2 ; {t1,t2,t6,t5}
xorps %3, %3, [ps_p1p1m1p1]
shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8}
shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4}
subps %3, %1, %2 ; {r2,i2,r3,i3}
addps %1, %1, %2 ; {r0,i0,r1,i1}
shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3}
shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3}
%endmacro
 
; In SSE mode do one FFT8
; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7}
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %1={r4,r5,r6,r7} %2={i4,i5,i6,i7}
;
; In AVX mode do two FFT8
; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11}
; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15}
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11}
; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15}
%macro T8_SSE 6
addps %6, %3, %4 ; {t1,t2,t3,t4}
subps %3, %3, %4 ; {r5,i5,r7,i7}
shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7}
mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7}
mulps %4, %4, [ps_root2]
addps %3, %3, %4 ; {t8,t7,ta,t9}
shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta}
shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8}
subps %3, %6, %4 ; {t6,t5,tc,tb}
addps %6, %6, %4 ; {t1,t2,t9,ta}
shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc}
shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb}
subps %3, %1, %6 ; {r4,r5,r6,r7}
addps %1, %1, %6 ; {r0,r1,r2,r3}
subps %4, %2, %5 ; {i4,i5,i6,i7}
addps %2, %2, %5 ; {i0,i1,i2,i3}
%endmacro
 
; scheduled for cpu-bound sizes
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim
IF%1 mova m4, Z(4)
IF%1 mova m5, Z(5)
mova m0, %2 ; wre
mova m1, %3 ; wim
mulps m2, m4, m0 ; r2*wre
IF%1 mova m6, Z2(6)
mulps m3, m5, m1 ; i2*wim
IF%1 mova m7, Z2(7)
mulps m4, m4, m1 ; r2*wim
mulps m5, m5, m0 ; i2*wre
addps m2, m2, m3 ; r2*wre + i2*wim
mulps m3, m1, m7 ; i3*wim
subps m5, m5, m4 ; i2*wre - r2*wim
mulps m1, m1, m6 ; r3*wim
mulps m4, m0, m6 ; r3*wre
mulps m0, m0, m7 ; i3*wre
subps m4, m4, m3 ; r3*wre - i3*wim
mova m3, Z(0)
addps m0, m0, m1 ; i3*wre + r3*wim
subps m1, m4, m2 ; t3
addps m4, m4, m2 ; t5
subps m3, m3, m4 ; r2
addps m4, m4, Z(0) ; r0
mova m6, Z(2)
mova Z(4), m3
mova Z(0), m4
subps m3, m5, m0 ; t4
subps m4, m6, m3 ; r3
addps m3, m3, m6 ; r1
mova Z2(6), m4
mova Z(2), m3
mova m2, Z(3)
addps m3, m5, m0 ; t6
subps m2, m2, m1 ; i3
mova m7, Z(1)
addps m1, m1, Z(3) ; i1
mova Z2(7), m2
mova Z(3), m1
subps m4, m7, m3 ; i2
addps m3, m3, m7 ; i0
mova Z(5), m4
mova Z(1), m3
%endmacro
 
; scheduled to avoid store->load aliasing
%macro PASS_BIG 1 ; (!interleave)
mova m4, Z(4) ; r2
mova m5, Z(5) ; i2
mova m0, [wq] ; wre
mova m1, [wq+o1q] ; wim
mulps m2, m4, m0 ; r2*wre
mova m6, Z2(6) ; r3
mulps m3, m5, m1 ; i2*wim
mova m7, Z2(7) ; i3
mulps m4, m4, m1 ; r2*wim
mulps m5, m5, m0 ; i2*wre
addps m2, m2, m3 ; r2*wre + i2*wim
mulps m3, m1, m7 ; i3*wim
mulps m1, m1, m6 ; r3*wim
subps m5, m5, m4 ; i2*wre - r2*wim
mulps m4, m0, m6 ; r3*wre
mulps m0, m0, m7 ; i3*wre
subps m4, m4, m3 ; r3*wre - i3*wim
mova m3, Z(0)
addps m0, m0, m1 ; i3*wre + r3*wim
subps m1, m4, m2 ; t3
addps m4, m4, m2 ; t5
subps m3, m3, m4 ; r2
addps m4, m4, Z(0) ; r0
mova m6, Z(2)
mova Z(4), m3
mova Z(0), m4
subps m3, m5, m0 ; t4
subps m4, m6, m3 ; r3
addps m3, m3, m6 ; r1
IF%1 mova Z2(6), m4
IF%1 mova Z(2), m3
mova m2, Z(3)
addps m5, m5, m0 ; t6
subps m2, m2, m1 ; i3
mova m7, Z(1)
addps m1, m1, Z(3) ; i1
IF%1 mova Z2(7), m2
IF%1 mova Z(3), m1
subps m6, m7, m5 ; i2
addps m5, m5, m7 ; i0
IF%1 mova Z(5), m6
IF%1 mova Z(1), m5
%if %1==0
INTERL m1, m3, m7, Z, 2
INTERL m2, m4, m0, Z2, 6
 
mova m1, Z(0)
mova m2, Z(4)
 
INTERL m5, m1, m3, Z, 0
INTERL m6, m2, m7, Z, 4
%endif
%endmacro
 
%macro PUNPCK 3
mova %3, %1
punpckldq %1, %2
punpckhdq %3, %2
%endmacro
 
%define Z(x) [r0+mmsize*x]
%define Z2(x) [r0+mmsize*x]
%define ZH(x) [r0+mmsize*x+mmsize/2]
 
INIT_YMM avx
 
%if HAVE_AVX_EXTERNAL
align 16
fft8_avx:
mova m0, Z(0)
mova m1, Z(1)
T8_AVX m0, m1, m2, m3, m4
mova Z(0), m0
mova Z(1), m1
ret
 
 
align 16
fft16_avx:
mova m2, Z(2)
mova m3, Z(3)
T4_SSE m2, m3, m7
 
mova m0, Z(0)
mova m1, Z(1)
T8_AVX m0, m1, m4, m5, m7
 
mova m4, [ps_cos16_1]
mova m5, [ps_cos16_2]
vmulps m6, m2, m4
vmulps m7, m3, m5
vaddps m7, m7, m6
vmulps m2, m2, m5
vmulps m3, m3, m4
vsubps m3, m3, m2
vblendps m2, m7, m3, 0xf0
vperm2f128 m3, m7, m3, 0x21
vaddps m4, m2, m3
vsubps m2, m3, m2
vperm2f128 m2, m2, m2, 0x01
vsubps m3, m1, m2
vaddps m1, m1, m2
vsubps m5, m0, m4
vaddps m0, m0, m4
vextractf128 Z(0), m0, 0
vextractf128 ZH(0), m1, 0
vextractf128 Z(1), m0, 1
vextractf128 ZH(1), m1, 1
vextractf128 Z(2), m5, 0
vextractf128 ZH(2), m3, 0
vextractf128 Z(3), m5, 1
vextractf128 ZH(3), m3, 1
ret
 
align 16
fft32_avx:
call fft16_avx
 
mova m0, Z(4)
mova m1, Z(5)
 
T4_SSE m0, m1, m4
 
mova m2, Z(6)
mova m3, Z(7)
 
T8_SSE m0, m1, m2, m3, m4, m6
; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11}
; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15}
 
vperm2f128 m4, m0, m2, 0x20
vperm2f128 m5, m1, m3, 0x20
vperm2f128 m6, m0, m2, 0x31
vperm2f128 m7, m1, m3, 0x31
 
PASS_SMALL 0, [cos_32], [cos_32+32]
 
ret
 
fft32_interleave_avx:
call fft32_avx
mov r2d, 32
.deint_loop:
mova m2, Z(0)
mova m3, Z(1)
vunpcklps m0, m2, m3
vunpckhps m1, m2, m3
vextractf128 Z(0), m0, 0
vextractf128 ZH(0), m1, 0
vextractf128 Z(1), m0, 1
vextractf128 ZH(1), m1, 1
add r0, mmsize*2
sub r2d, mmsize/4
jg .deint_loop
ret
 
%endif
 
INIT_XMM sse
 
align 16
fft4_avx:
fft4_sse:
mova m0, Z(0)
mova m1, Z(1)
T4_SSE m0, m1, m2
mova Z(0), m0
mova Z(1), m1
ret
 
align 16
fft8_sse:
mova m0, Z(0)
mova m1, Z(1)
T4_SSE m0, m1, m2
mova m2, Z(2)
mova m3, Z(3)
T8_SSE m0, m1, m2, m3, m4, m5
mova Z(0), m0
mova Z(1), m1
mova Z(2), m2
mova Z(3), m3
ret
 
align 16
fft16_sse:
mova m0, Z(0)
mova m1, Z(1)
T4_SSE m0, m1, m2
mova m2, Z(2)
mova m3, Z(3)
T8_SSE m0, m1, m2, m3, m4, m5
mova m4, Z(4)
mova m5, Z(5)
mova Z(0), m0
mova Z(1), m1
mova Z(2), m2
mova Z(3), m3
T4_SSE m4, m5, m6
mova m6, Z2(6)
mova m7, Z2(7)
T4_SSE m6, m7, m0
PASS_SMALL 0, [cos_16], [cos_16+16]
ret
 
 
%macro FFT48_3DNOW 0
align 16
fft4 %+ SUFFIX:
T2_3DNOW m0, m1, Z(0), Z(1)
mova m2, Z(2)
mova m3, Z(3)
T4_3DNOW m0, m1, m2, m3, m4, m5
PUNPCK m0, m1, m4
PUNPCK m2, m3, m5
mova Z(0), m0
mova Z(1), m4
mova Z(2), m2
mova Z(3), m5
ret
 
align 16
fft8 %+ SUFFIX:
T2_3DNOW m0, m1, Z(0), Z(1)
mova m2, Z(2)
mova m3, Z(3)
T4_3DNOW m0, m1, m2, m3, m4, m5
mova Z(0), m0
mova Z(2), m2
T2_3DNOW m4, m5, Z(4), Z(5)
T2_3DNOW m6, m7, Z2(6), Z2(7)
PSWAPD m0, m5
PSWAPD m2, m7
pxor m0, [ps_m1p1]
pxor m2, [ps_m1p1]
pfsub m5, m0
pfadd m7, m2
pfmul m5, [ps_root2]
pfmul m7, [ps_root2]
T4_3DNOW m1, m3, m5, m7, m0, m2
mova Z(5), m5
mova Z2(7), m7
mova m0, Z(0)
mova m2, Z(2)
T4_3DNOW m0, m2, m4, m6, m5, m7
PUNPCK m0, m1, m5
PUNPCK m2, m3, m7
mova Z(0), m0
mova Z(1), m5
mova Z(2), m2
mova Z(3), m7
PUNPCK m4, Z(5), m5
PUNPCK m6, Z2(7), m7
mova Z(4), m4
mova Z(5), m5
mova Z2(6), m6
mova Z2(7), m7
ret
%endmacro
 
%if ARCH_X86_32
INIT_MMX 3dnowext
FFT48_3DNOW
 
INIT_MMX 3dnow
FFT48_3DNOW
%endif
 
%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)]
%define Z2(x) [zcq + o3q + mmsize*(x&1)]
%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2]
%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2]
 
%macro DECL_PASS 2+ ; name, payload
align 16
%1:
DEFINE_ARGS zc, w, n, o1, o3
lea o3q, [nq*3]
lea o1q, [nq*8]
shl o3q, 4
.loop:
%2
add zcq, mmsize*2
add wq, mmsize
sub nd, mmsize/8
jg .loop
rep ret
%endmacro
 
%macro FFT_DISPATCH 2; clobbers 5 GPRs, 8 XMMs
lea r2, [dispatch_tab%1]
mov r2, [r2 + (%2q-2)*gprsize]
%ifdef PIC
lea r3, [$$]
add r2, r3
%endif
call r2
%endmacro ; FFT_DISPATCH
 
INIT_YMM avx
 
%if HAVE_AVX_EXTERNAL
%macro INTERL_AVX 5
vunpckhps %3, %2, %1
vunpcklps %2, %2, %1
vextractf128 %4(%5), %2, 0
vextractf128 %4 %+ H(%5), %3, 0
vextractf128 %4(%5 + 1), %2, 1
vextractf128 %4 %+ H(%5 + 1), %3, 1
%endmacro
 
%define INTERL INTERL_AVX
 
DECL_PASS pass_avx, PASS_BIG 1
DECL_PASS pass_interleave_avx, PASS_BIG 0
 
cglobal fft_calc, 2,5,8
mov r3d, [r0 + FFTContext.nbits]
mov r0, r1
mov r1, r3
FFT_DISPATCH _interleave %+ SUFFIX, r1
REP_RET
 
%endif
 
INIT_XMM sse
 
%macro INTERL_SSE 5
mova %3, %2
unpcklps %2, %1
unpckhps %3, %1
mova %4(%5), %2
mova %4(%5+1), %3
%endmacro
 
%define INTERL INTERL_SSE
 
DECL_PASS pass_sse, PASS_BIG 1
DECL_PASS pass_interleave_sse, PASS_BIG 0
 
%macro FFT_CALC_FUNC 0
cglobal fft_calc, 2,5,8
mov r3d, [r0 + FFTContext.nbits]
PUSH r1
PUSH r3
mov r0, r1
mov r1, r3
FFT_DISPATCH _interleave %+ SUFFIX, r1
POP rcx
POP r4
cmp rcx, 3+(mmsize/16)
jg .end
mov r2, -1
add rcx, 3
shl r2, cl
sub r4, r2
.loop:
%if mmsize == 8
PSWAPD m0, [r4 + r2 + 4]
mova [r4 + r2 + 4], m0
%else
movaps xmm0, [r4 + r2]
movaps xmm1, xmm0
unpcklps xmm0, [r4 + r2 + 16]
unpckhps xmm1, [r4 + r2 + 16]
movaps [r4 + r2], xmm0
movaps [r4 + r2 + 16], xmm1
%endif
add r2, mmsize*2
jl .loop
.end:
%if cpuflag(3dnow)
femms
RET
%else
REP_RET
%endif
%endmacro
 
%if ARCH_X86_32
INIT_MMX 3dnow
FFT_CALC_FUNC
INIT_MMX 3dnowext
FFT_CALC_FUNC
%endif
INIT_XMM sse
FFT_CALC_FUNC
 
cglobal fft_permute, 2,7,1
mov r4, [r0 + FFTContext.revtab]
mov r5, [r0 + FFTContext.tmpbuf]
mov ecx, [r0 + FFTContext.nbits]
mov r2, 1
shl r2, cl
xor r0, r0
%if ARCH_X86_32
mov r1, r1m
%endif
.loop:
movaps xmm0, [r1 + 8*r0]
movzx r6, word [r4 + 2*r0]
movzx r3, word [r4 + 2*r0 + 2]
movlps [r5 + 8*r6], xmm0
movhps [r5 + 8*r3], xmm0
add r0, 2
cmp r0, r2
jl .loop
shl r2, 3
add r1, r2
add r5, r2
neg r2
; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B
.loopcopy:
movaps xmm0, [r5 + r2]
movaps xmm1, [r5 + r2 + 16]
movaps [r1 + r2], xmm0
movaps [r1 + r2 + 16], xmm1
add r2, 32
jl .loopcopy
REP_RET
 
%macro IMDCT_CALC_FUNC 0
cglobal imdct_calc, 3,5,3
mov r3d, [r0 + FFTContext.mdctsize]
mov r4, [r0 + FFTContext.imdcthalf]
add r1, r3
PUSH r3
PUSH r1
%if ARCH_X86_32
push r2
push r1
push r0
%else
sub rsp, 8+32*WIN64 ; allocate win64 shadow space
%endif
call r4
%if ARCH_X86_32
add esp, 12
%else
add rsp, 8+32*WIN64
%endif
POP r1
POP r3
lea r0, [r1 + 2*r3]
mov r2, r3
sub r3, mmsize
neg r2
mova m2, [ps_m1m1m1m1]
.loop:
%if mmsize == 8
PSWAPD m0, [r1 + r3]
PSWAPD m1, [r0 + r2]
pxor m0, m2
%else
mova m0, [r1 + r3]
mova m1, [r0 + r2]
shufps m0, m0, 0x1b
shufps m1, m1, 0x1b
xorps m0, m2
%endif
mova [r0 + r3], m1
mova [r1 + r2], m0
sub r3, mmsize
add r2, mmsize
jl .loop
%if cpuflag(3dnow)
femms
RET
%else
REP_RET
%endif
%endmacro
 
%if ARCH_X86_32
INIT_MMX 3dnow
IMDCT_CALC_FUNC
INIT_MMX 3dnowext
IMDCT_CALC_FUNC
%endif
 
INIT_XMM sse
IMDCT_CALC_FUNC
 
%if ARCH_X86_32
INIT_MMX 3dnow
%define mulps pfmul
%define addps pfadd
%define subps pfsub
%define unpcklps punpckldq
%define unpckhps punpckhdq
DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q]
DECL_PASS pass_interleave_3dnow, PASS_BIG 0
%define pass_3dnowext pass_3dnow
%define pass_interleave_3dnowext pass_interleave_3dnow
%endif
 
%ifdef PIC
%define SECTION_REL - $$
%else
%define SECTION_REL
%endif
 
%macro DECL_FFT 1-2 ; nbits, suffix
%ifidn %0, 1
%xdefine fullsuffix SUFFIX
%else
%xdefine fullsuffix %2 %+ SUFFIX
%endif
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL
%if %1>=5
%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL
%endif
%if %1>=6
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL
%endif
 
%assign n 1<<%1
%rep 17-%1
%assign n2 n/2
%assign n4 n/4
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL
 
align 16
fft %+ n %+ fullsuffix:
call fft %+ n2 %+ SUFFIX
add r0, n*4 - (n&(-2<<%1))
call fft %+ n4 %+ SUFFIX
add r0, n*2 - (n2&(-2<<%1))
call fft %+ n4 %+ SUFFIX
sub r0, n*6 + (n2&(-2<<%1))
lea r1, [cos_ %+ n]
mov r2d, n4/2
jmp pass %+ fullsuffix
 
%assign n n*2
%endrep
%undef n
 
align 8
dispatch_tab %+ fullsuffix: pointer list_of_fft
%endmacro ; DECL_FFT
 
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
DECL_FFT 6
DECL_FFT 6, _interleave
%endif
INIT_XMM sse
DECL_FFT 5
DECL_FFT 5, _interleave
%if ARCH_X86_32
INIT_MMX 3dnow
DECL_FFT 4
DECL_FFT 4, _interleave
INIT_MMX 3dnowext
DECL_FFT 4
DECL_FFT 4, _interleave
%endif
 
INIT_XMM sse
%undef mulps
%undef addps
%undef subps
%undef unpcklps
%undef unpckhps
 
%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8
%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8
PSWAPD m0, [%3+%2*4]
movq m2, [%3+%1*4-8]
movq m3, m0
punpckldq m0, m2
punpckhdq m2, m3
movd m1, [%4+%1*2-4] ; tcos[j]
movd m3, [%4+%2*2] ; tcos[n4-j-1]
punpckldq m1, [%5+%1*2-4] ; tsin[j]
punpckldq m3, [%5+%2*2] ; tsin[n4-j-1]
 
mova m4, m0
PSWAPD m5, m1
pfmul m0, m1
pfmul m4, m5
mova m6, m2
PSWAPD m5, m3
pfmul m2, m3
pfmul m6, m5
%if cpuflag(3dnowext)
pfpnacc m0, m4
pfpnacc m2, m6
%else
SBUTTERFLY dq, 0, 4, 1
SBUTTERFLY dq, 2, 6, 3
pxor m4, m7
pxor m6, m7
pfadd m0, m4
pfadd m2, m6
%endif
%else
movaps xmm0, [%3+%2*4]
movaps xmm1, [%3+%1*4-0x10]
movaps xmm2, xmm0
shufps xmm0, xmm1, 0x88
shufps xmm1, xmm2, 0x77
movlps xmm4, [%4+%2*2]
movlps xmm5, [%5+%2*2+0x0]
movhps xmm4, [%4+%1*2-0x8]
movhps xmm5, [%5+%1*2-0x8]
movaps xmm2, xmm0
movaps xmm3, xmm1
mulps xmm0, xmm5
mulps xmm1, xmm4
mulps xmm2, xmm4
mulps xmm3, xmm5
subps xmm1, xmm0
addps xmm2, xmm3
movaps xmm0, xmm1
unpcklps xmm1, xmm2
unpckhps xmm0, xmm2
%endif
%endmacro
 
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5
mulps m6, %3, [%5+%1]
mulps m7, %2, [%5+%1]
mulps %2, %2, [%6+%1]
mulps %3, %3, [%6+%1]
subps %2, %2, m6
addps %3, %3, m7
%endmacro
 
%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
vmovaps ymm1, [%3+%1*2]
vmovaps ymm0, [%3+%1*2+0x20]
vmovaps ymm3, [%3+%2*2]
vmovaps ymm2, [%3+%2*2+0x20]
 
CMUL %1, ymm0, ymm1, %3, %4, %5
CMUL %2, ymm2, ymm3, %3, %4, %5
vshufps ymm1, ymm1, ymm1, 0x1b
vshufps ymm3, ymm3, ymm3, 0x1b
vperm2f128 ymm1, ymm1, ymm1, 0x01
vperm2f128 ymm3, ymm3, ymm3, 0x01
vunpcklps ymm6, ymm2, ymm1
vunpckhps ymm4, ymm2, ymm1
vunpcklps ymm7, ymm0, ymm3
vunpckhps ymm5, ymm0, ymm3
 
vextractf128 [%3+%1*2], ymm7, 0
vextractf128 [%3+%1*2+0x10], ymm5, 0
vextractf128 [%3+%1*2+0x20], ymm7, 1
vextractf128 [%3+%1*2+0x30], ymm5, 1
 
vextractf128 [%3+%2*2], ymm6, 0
vextractf128 [%3+%2*2+0x10], ymm4, 0
vextractf128 [%3+%2*2+0x20], ymm6, 1
vextractf128 [%3+%2*2+0x30], ymm4, 1
sub %2, 0x20
add %1, 0x20
jl .post
%endmacro
 
%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
movaps xmm1, [%3+%1*2]
movaps xmm0, [%3+%1*2+0x10]
CMUL %1, xmm0, xmm1, %3, %4, %5
movaps xmm5, [%3+%2*2]
movaps xmm4, [%3+%2*2+0x10]
CMUL %2, xmm4, xmm5, %3, %4, %5
shufps xmm1, xmm1, 0x1b
shufps xmm5, xmm5, 0x1b
movaps xmm6, xmm4
unpckhps xmm4, xmm1
unpcklps xmm6, xmm1
movaps xmm2, xmm0
unpcklps xmm0, xmm5
unpckhps xmm2, xmm5
movaps [%3+%2*2], xmm6
movaps [%3+%2*2+0x10], xmm4
movaps [%3+%1*2], xmm0
movaps [%3+%1*2+0x10], xmm2
sub %2, 0x10
add %1, 0x10
jl .post
%endmacro
 
%macro CMUL_3DNOW 6
mova m6, [%1+%2*2]
mova %3, [%1+%2*2+8]
mova %4, m6
mova m7, %3
pfmul m6, [%5+%2]
pfmul %3, [%6+%2]
pfmul %4, [%6+%2]
pfmul m7, [%5+%2]
pfsub %3, m6
pfadd %4, m7
%endmacro
 
%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8
.post:
CMUL_3DNOW %3, %1, m0, m1, %4, %5
CMUL_3DNOW %3, %2, m2, m3, %4, %5
movd [%3+%1*2+ 0], m0
movd [%3+%2*2+12], m1
movd [%3+%2*2+ 0], m2
movd [%3+%1*2+12], m3
psrlq m0, 32
psrlq m1, 32
psrlq m2, 32
psrlq m3, 32
movd [%3+%1*2+ 8], m0
movd [%3+%2*2+ 4], m1
movd [%3+%2*2+ 8], m2
movd [%3+%1*2+ 4], m3
sub %2, 8
add %1, 8
jl .post
%endmacro
 
%macro DECL_IMDCT 1
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input
%if ARCH_X86_64
%define rrevtab r7
%define rtcos r8
%define rtsin r9
%else
%define rrevtab r6
%define rtsin r6
%define rtcos r5
%endif
mov r3d, [r0+FFTContext.mdctsize]
add r2, r3
shr r3, 1
mov rtcos, [r0+FFTContext.tcos]
mov rtsin, [r0+FFTContext.tsin]
add rtcos, r3
add rtsin, r3
%if ARCH_X86_64 == 0
push rtcos
push rtsin
%endif
shr r3, 1
mov rrevtab, [r0+FFTContext.revtab]
add rrevtab, r3
%if ARCH_X86_64 == 0
push rrevtab
%endif
 
%if mmsize == 8
sub r3, 2
%else
sub r3, 4
%endif
%if ARCH_X86_64 || mmsize == 8
xor r4, r4
sub r4, r3
%endif
%if notcpuflag(3dnowext) && mmsize == 8
movd m7, [ps_m1m1m1m1]
%endif
.pre:
%if ARCH_X86_64 == 0
;unspill
%if mmsize != 8
xor r4, r4
sub r4, r3
%endif
mov rtcos, [esp+8]
mov rtsin, [esp+4]
%endif
 
PREROTATER r4, r3, r2, rtcos, rtsin
%if mmsize == 8
mov r6, [esp] ; rrevtab = ptr+n8
movzx r5, word [rrevtab+r4-2] ; rrevtab[j]
movzx r6, word [rrevtab+r3] ; rrevtab[n4-j-1]
mova [r1+r5*8], m0
mova [r1+r6*8], m2
add r4, 2
sub r3, 2
%else
%if ARCH_X86_64
movzx r5, word [rrevtab+r4-4]
movzx r6, word [rrevtab+r4-2]
movzx r10, word [rrevtab+r3]
movzx r11, word [rrevtab+r3+2]
movlps [r1+r5 *8], xmm0
movhps [r1+r6 *8], xmm0
movlps [r1+r10*8], xmm1
movhps [r1+r11*8], xmm1
add r4, 4
%else
mov r6, [esp]
movzx r5, word [r6+r4-4]
movzx r4, word [r6+r4-2]
movlps [r1+r5*8], xmm0
movhps [r1+r4*8], xmm0
movzx r5, word [r6+r3]
movzx r4, word [r6+r3+2]
movlps [r1+r5*8], xmm1
movhps [r1+r4*8], xmm1
%endif
sub r3, 4
%endif
jns .pre
 
mov r5, r0
mov r6, r1
mov r0, r1
mov r1d, [r5+FFTContext.nbits]
 
FFT_DISPATCH SUFFIX, r1
 
mov r0d, [r5+FFTContext.mdctsize]
add r6, r0
shr r0, 1
%if ARCH_X86_64 == 0
%define rtcos r2
%define rtsin r3
mov rtcos, [esp+8]
mov rtsin, [esp+4]
%endif
neg r0
mov r1, -mmsize
sub r1, r0
%1 r0, r1, r6, rtcos, rtsin
%if ARCH_X86_64 == 0
add esp, 12
%endif
%if mmsize == 8
femms
%endif
RET
%endmacro
 
DECL_IMDCT POSROTATESHUF
 
%if ARCH_X86_32
INIT_MMX 3dnow
DECL_IMDCT POSROTATESHUF_3DNOW
 
INIT_MMX 3dnowext
DECL_IMDCT POSROTATESHUF_3DNOW
%endif
 
INIT_YMM avx
 
%if HAVE_AVX_EXTERNAL
DECL_IMDCT POSROTATESHUF_AVX
%endif
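
The DECL_FFT macro above appends each generated fftN entry point to list_of_fft and finally emits that list as dispatch_tab, so the caller can jump straight to the routine for a given transform size. A rough C analogue of that table-dispatch idea, purely as an illustration (cplx, fft4_c, fft8_c, dispatch_tab_c and fft_dispatch_c are stand-in names, not the symbols the assembly emits, and the butterfly bodies are omitted):

#include <stddef.h>

typedef struct { float re, im; } cplx;
typedef void (*fft_fn)(cplx *z);

/* Stand-ins for the hard-coded small transforms; real butterfly code omitted. */
static void fft4_c(cplx *z) { (void)z; /* 4-point butterflies would go here */ }
static void fft8_c(cplx *z) { (void)z; /* 8-point butterflies would go here */ }

/* One entry per size, smallest first, mirroring how the %rep loop keeps
 * appending fftN labels to list_of_fft before it becomes dispatch_tab. */
static const fft_fn dispatch_tab_c[] = { fft4_c, fft8_c /* , fft16_c, ... */ };

static void fft_dispatch_c(cplx *z, int nbits)
{
    /* nbits = log2(length); assuming the table begins at the 4-point
     * transform (nbits == 2), as the full file sets it up. */
    dispatch_tab_c[nbits - 2](z);
}
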
/contrib/sdk/sources/ffmpeg/libavcodec/x86/fft.h
0,0 → 1,38
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#ifndef AVCODEC_X86_FFT_H
#define AVCODEC_X86_FFT_H
 
#include "libavcodec/fft.h"
 
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z);
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z);
void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z);
 
void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input);
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input);
 
#endif /* AVCODEC_X86_FFT_H */
/contrib/sdk/sources/ffmpeg/libavcodec/x86/fft_init.c
0,0 → 1,57
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "fft.h"
 
av_cold void ff_fft_init_x86(FFTContext *s)
{
int cpu_flags = av_get_cpu_flags();
 
#if ARCH_X86_32
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
/* 3DNow! for K6-2/3 */
s->imdct_calc = ff_imdct_calc_3dnow;
s->imdct_half = ff_imdct_half_3dnow;
s->fft_calc = ff_fft_calc_3dnow;
}
if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
/* 3DNowEx for K7 */
s->imdct_calc = ff_imdct_calc_3dnowext;
s->imdct_half = ff_imdct_half_3dnowext;
s->fft_calc = ff_fft_calc_3dnowext;
}
#endif
if (EXTERNAL_SSE(cpu_flags)) {
/* SSE for P3/P4/K8 */
s->imdct_calc = ff_imdct_calc_sse;
s->imdct_half = ff_imdct_half_sse;
s->fft_permute = ff_fft_permute_sse;
s->fft_calc = ff_fft_calc_sse;
s->fft_permutation = FF_FFT_PERM_SWAP_LSBS;
}
if (EXTERNAL_AVX(cpu_flags) && s->nbits >= 5) {
/* AVX for SB */
s->imdct_half = ff_imdct_half_avx;
s->fft_calc = ff_fft_calc_avx;
s->fft_permutation = FF_FFT_PERM_AVX;
}
}
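
ff_fft_init_x86() above only swaps function pointers inside an already-initialized FFTContext; callers keep going through the context. A minimal usage sketch via the public avfft.h wrappers of the same tree (run_fft is a hypothetical helper, error handling kept to the bare minimum):

#include "libavcodec/avfft.h"

/* Forward-transform 1 << nbits complex samples in place. Whatever
 * ff_fft_init_x86() selected (3DNow!/SSE/AVX or the C fallback) is what
 * av_fft_permute()/av_fft_calc() end up calling. */
static int run_fft(FFTComplex *z, int nbits)
{
    FFTContext *ctx = av_fft_init(nbits, 0);   /* 0 = forward transform */
    if (!ctx)
        return -1;
    av_fft_permute(ctx, z);   /* reorder input into the layout fft_calc expects */
    av_fft_calc(ctx, z);      /* in-place FFT via the dispatched pointer */
    av_fft_end(ctx);
    return 0;
}
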
/contrib/sdk/sources/ffmpeg/libavcodec/x86/fmtconvert.asm
0,0 → 1,429
;******************************************************************************
;* x86 optimized Format Conversion Utils
;* Copyright (c) 2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_TEXT
 
%macro CVTPS2PI 2
%if cpuflag(sse)
cvtps2pi %1, %2
%elif cpuflag(3dnow)
pf2id %1, %2
%endif
%endmacro
 
;---------------------------------------------------------------------------------
; void int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul, int len);
;---------------------------------------------------------------------------------
%macro INT32_TO_FLOAT_FMUL_SCALAR 1
%if UNIX64
cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len
%else
cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len
%endif
%if WIN64
SWAP 0, 2
%elif ARCH_X86_32
movss m0, mulm
%endif
SPLATD m0
shl lenq, 2
add srcq, lenq
add dstq, lenq
neg lenq
.loop:
%if cpuflag(sse2)
cvtdq2ps m1, [srcq+lenq ]
cvtdq2ps m2, [srcq+lenq+16]
%else
cvtpi2ps m1, [srcq+lenq ]
cvtpi2ps m3, [srcq+lenq+ 8]
cvtpi2ps m2, [srcq+lenq+16]
cvtpi2ps m4, [srcq+lenq+24]
movlhps m1, m3
movlhps m2, m4
%endif
mulps m1, m0
mulps m2, m0
mova [dstq+lenq ], m1
mova [dstq+lenq+16], m2
add lenq, 32
jl .loop
REP_RET
%endmacro
 
INIT_XMM sse
INT32_TO_FLOAT_FMUL_SCALAR 5
INIT_XMM sse2
INT32_TO_FLOAT_FMUL_SCALAR 3
 
 
;------------------------------------------------------------------------------
; void ff_float_to_int16(int16_t *dst, const float *src, long len);
;------------------------------------------------------------------------------
%macro FLOAT_TO_INT16 1
cglobal float_to_int16, 3, 3, %1, dst, src, len
add lenq, lenq
lea srcq, [srcq+2*lenq]
add dstq, lenq
neg lenq
.loop:
%if cpuflag(sse2)
cvtps2dq m0, [srcq+2*lenq ]
cvtps2dq m1, [srcq+2*lenq+16]
packssdw m0, m1
mova [dstq+lenq], m0
%else
CVTPS2PI m0, [srcq+2*lenq ]
CVTPS2PI m1, [srcq+2*lenq+ 8]
CVTPS2PI m2, [srcq+2*lenq+16]
CVTPS2PI m3, [srcq+2*lenq+24]
packssdw m0, m1
packssdw m2, m3
mova [dstq+lenq ], m0
mova [dstq+lenq+8], m2
%endif
add lenq, 16
js .loop
%if mmsize == 8
emms
%endif
REP_RET
%endmacro
 
INIT_XMM sse2
FLOAT_TO_INT16 2
INIT_MMX sse
FLOAT_TO_INT16 0
INIT_MMX 3dnow
FLOAT_TO_INT16 0
 
;------------------------------------------------------------------------------
; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step);
;------------------------------------------------------------------------------
%macro FLOAT_TO_INT16_STEP 1
cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2
add lenq, lenq
lea srcq, [srcq+2*lenq]
lea step3q, [stepq*3]
neg lenq
.loop:
%if cpuflag(sse2)
cvtps2dq m0, [srcq+2*lenq ]
cvtps2dq m1, [srcq+2*lenq+16]
packssdw m0, m1
movd v1d, m0
psrldq m0, 4
movd v2d, m0
psrldq m0, 4
mov [dstq], v1w
mov [dstq+stepq*4], v2w
shr v1d, 16
shr v2d, 16
mov [dstq+stepq*2], v1w
mov [dstq+step3q*2], v2w
lea dstq, [dstq+stepq*8]
movd v1d, m0
psrldq m0, 4
movd v2d, m0
mov [dstq], v1w
mov [dstq+stepq*4], v2w
shr v1d, 16
shr v2d, 16
mov [dstq+stepq*2], v1w
mov [dstq+step3q*2], v2w
lea dstq, [dstq+stepq*8]
%else
CVTPS2PI m0, [srcq+2*lenq ]
CVTPS2PI m1, [srcq+2*lenq+ 8]
CVTPS2PI m2, [srcq+2*lenq+16]
CVTPS2PI m3, [srcq+2*lenq+24]
packssdw m0, m1
packssdw m2, m3
movd v1d, m0
psrlq m0, 32
movd v2d, m0
mov [dstq], v1w
mov [dstq+stepq*4], v2w
shr v1d, 16
shr v2d, 16
mov [dstq+stepq*2], v1w
mov [dstq+step3q*2], v2w
lea dstq, [dstq+stepq*8]
movd v1d, m2
psrlq m2, 32
movd v2d, m2
mov [dstq], v1w
mov [dstq+stepq*4], v2w
shr v1d, 16
shr v2d, 16
mov [dstq+stepq*2], v1w
mov [dstq+step3q*2], v2w
lea dstq, [dstq+stepq*8]
%endif
add lenq, 16
js .loop
%if mmsize == 8
emms
%endif
REP_RET
%endmacro
 
INIT_XMM sse2
FLOAT_TO_INT16_STEP 2
INIT_MMX sse
FLOAT_TO_INT16_STEP 0
INIT_MMX 3dnow
FLOAT_TO_INT16_STEP 0
 
;-------------------------------------------------------------------------------
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len);
;-------------------------------------------------------------------------------
%macro FLOAT_TO_INT16_INTERLEAVE2 0
cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len
lea lenq, [4*r2q]
mov src1q, [src0q+gprsize]
mov src0q, [src0q]
add dstq, lenq
add src0q, lenq
add src1q, lenq
neg lenq
.loop:
%if cpuflag(sse2)
cvtps2dq m0, [src0q+lenq]
cvtps2dq m1, [src1q+lenq]
packssdw m0, m1
movhlps m1, m0
punpcklwd m0, m1
mova [dstq+lenq], m0
%else
CVTPS2PI m0, [src0q+lenq ]
CVTPS2PI m1, [src0q+lenq+8]
CVTPS2PI m2, [src1q+lenq ]
CVTPS2PI m3, [src1q+lenq+8]
packssdw m0, m1
packssdw m2, m3
mova m1, m0
punpcklwd m0, m2
punpckhwd m1, m2
mova [dstq+lenq ], m0
mova [dstq+lenq+8], m1
%endif
add lenq, 16
js .loop
%if mmsize == 8
emms
%endif
REP_RET
%endmacro
 
INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE2
INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE2
INIT_XMM sse2
FLOAT_TO_INT16_INTERLEAVE2
 
%macro FLOAT_TO_INT16_INTERLEAVE6 0
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len)
cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
mov lend, r2d
%else
%define lend dword r2m
%endif
mov src1q, [srcq+1*gprsize]
mov src2q, [srcq+2*gprsize]
mov src3q, [srcq+3*gprsize]
mov src4q, [srcq+4*gprsize]
mov src5q, [srcq+5*gprsize]
mov srcq, [srcq]
sub src1q, srcq
sub src2q, srcq
sub src3q, srcq
sub src4q, srcq
sub src5q, srcq
.loop:
CVTPS2PI mm0, [srcq]
CVTPS2PI mm1, [srcq+src1q]
CVTPS2PI mm2, [srcq+src2q]
CVTPS2PI mm3, [srcq+src3q]
CVTPS2PI mm4, [srcq+src4q]
CVTPS2PI mm5, [srcq+src5q]
packssdw mm0, mm3
packssdw mm1, mm4
packssdw mm2, mm5
PSWAPD mm3, mm0
punpcklwd mm0, mm1
punpckhwd mm1, mm2
punpcklwd mm2, mm3
PSWAPD mm3, mm0
punpckldq mm0, mm2
punpckhdq mm2, mm1
punpckldq mm1, mm3
movq [dstq ], mm0
movq [dstq+16], mm2
movq [dstq+ 8], mm1
add srcq, 8
add dstq, 24
sub lend, 2
jg .loop
emms
RET
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6
 
INIT_MMX sse
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnow
FLOAT_TO_INT16_INTERLEAVE6
INIT_MMX 3dnowext
FLOAT_TO_INT16_INTERLEAVE6
 
;-----------------------------------------------------------------------------
; void ff_float_interleave6(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
 
%macro FLOAT_INTERLEAVE6 1
cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
mov lend, r2d
%else
%define lend dword r2m
%endif
mov src1q, [srcq+1*gprsize]
mov src2q, [srcq+2*gprsize]
mov src3q, [srcq+3*gprsize]
mov src4q, [srcq+4*gprsize]
mov src5q, [srcq+5*gprsize]
mov srcq, [srcq]
sub src1q, srcq
sub src2q, srcq
sub src3q, srcq
sub src4q, srcq
sub src5q, srcq
.loop:
%if cpuflag(sse)
movaps m0, [srcq]
movaps m1, [srcq+src1q]
movaps m2, [srcq+src2q]
movaps m3, [srcq+src3q]
movaps m4, [srcq+src4q]
movaps m5, [srcq+src5q]
 
SBUTTERFLYPS 0, 1, 6
SBUTTERFLYPS 2, 3, 6
SBUTTERFLYPS 4, 5, 6
 
movaps m6, m4
shufps m4, m0, 0xe4
movlhps m0, m2
movhlps m6, m2
movaps [dstq ], m0
movaps [dstq+16], m4
movaps [dstq+32], m6
 
movaps m6, m5
shufps m5, m1, 0xe4
movlhps m1, m3
movhlps m6, m3
movaps [dstq+48], m1
movaps [dstq+64], m5
movaps [dstq+80], m6
%else ; mmx
movq m0, [srcq]
movq m1, [srcq+src1q]
movq m2, [srcq+src2q]
movq m3, [srcq+src3q]
movq m4, [srcq+src4q]
movq m5, [srcq+src5q]
 
SBUTTERFLY dq, 0, 1, 6
SBUTTERFLY dq, 2, 3, 6
SBUTTERFLY dq, 4, 5, 6
movq [dstq ], m0
movq [dstq+ 8], m2
movq [dstq+16], m4
movq [dstq+24], m1
movq [dstq+32], m3
movq [dstq+40], m5
%endif
add srcq, mmsize
add dstq, mmsize*6
sub lend, mmsize/4
jg .loop
%if mmsize == 8
emms
%endif
REP_RET
%endmacro
 
INIT_MMX mmx
FLOAT_INTERLEAVE6 0
INIT_XMM sse
FLOAT_INTERLEAVE6 7
 
;-----------------------------------------------------------------------------
; void ff_float_interleave2(float *dst, const float **src, unsigned int len);
;-----------------------------------------------------------------------------
 
%macro FLOAT_INTERLEAVE2 1
cglobal float_interleave2, 3, 4, %1, dst, src, len, src1
mov src1q, [srcq+gprsize]
mov srcq, [srcq ]
sub src1q, srcq
.loop:
mova m0, [srcq ]
mova m1, [srcq+src1q ]
mova m3, [srcq +mmsize]
mova m4, [srcq+src1q+mmsize]
 
mova m2, m0
PUNPCKLDQ m0, m1
PUNPCKHDQ m2, m1
 
mova m1, m3
PUNPCKLDQ m3, m4
PUNPCKHDQ m1, m4
 
mova [dstq ], m0
mova [dstq+1*mmsize], m2
mova [dstq+2*mmsize], m3
mova [dstq+3*mmsize], m1
 
add srcq, mmsize*2
add dstq, mmsize*4
sub lend, mmsize/2
jg .loop
%if mmsize == 8
emms
%endif
REP_RET
%endmacro
 
INIT_MMX mmx
%define PUNPCKLDQ punpckldq
%define PUNPCKHDQ punpckhdq
FLOAT_INTERLEAVE2 0
INIT_XMM sse
%define PUNPCKLDQ unpcklps
%define PUNPCKHDQ unpckhps
FLOAT_INTERLEAVE2 5
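
The SIMD routines above have simple scalar meanings; the sketch below spells out the two most-used ones. It is an illustrative reference only (the _ref names are made up here), not the C fallbacks from fmtconvert.c:

#include <math.h>
#include <stdint.h>

/* dst[i] = src[i] * mul -- what INT32_TO_FLOAT_FMUL_SCALAR vectorizes,
 * 8 samples per loop iteration in the SSE/SSE2 versions. */
static void int32_to_float_fmul_scalar_ref(float *dst, const int32_t *src,
                                           float mul, int len)
{
    for (int i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}

/* cvtps2dq + packssdw == round to nearest, then saturate to int16_t. */
static void float_to_int16_ref(int16_t *dst, const float *src, long len)
{
    for (long i = 0; i < len; i++) {
        long v = lrintf(src[i]);
        dst[i] = v < -32768 ? -32768 : v > 32767 ? 32767 : (int16_t)v;
    }
}
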
/contrib/sdk/sources/ffmpeg/libavcodec/x86/fmtconvert_init.c
0,0 → 1,147
/*
* Format Conversion Utils
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/fmtconvert.h"
 
#if HAVE_YASM
 
void ff_int32_to_float_fmul_scalar_sse (float *dst, const int32_t *src, float mul, int len);
void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int32_t *src, float mul, int len);
 
void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len);
void ff_float_to_int16_sse (int16_t *dst, const float *src, long len);
void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len);
 
void ff_float_to_int16_step_3dnow(int16_t *dst, const float *src, long len, long step);
void ff_float_to_int16_step_sse (int16_t *dst, const float *src, long len, long step);
void ff_float_to_int16_step_sse2 (int16_t *dst, const float *src, long len, long step);
 
void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len);
void ff_float_to_int16_interleave2_sse (int16_t *dst, const float **src, long len);
void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len);
 
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len);
void ff_float_to_int16_interleave6_3dnowext(int16_t *dst, const float **src, int len);
 
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse
 
#define FLOAT_TO_INT16_INTERLEAVE(cpu) \
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2*/\
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\
int c;\
for(c=0; c<channels; c++){\
ff_float_to_int16_step_##cpu(dst+c, src[c], len, channels);\
}\
}\
\
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\
if(channels==1)\
ff_float_to_int16_##cpu(dst, src[0], len);\
else if(channels==2){\
ff_float_to_int16_interleave2_##cpu(dst, src, len);\
}else if(channels==6){\
ff_float_to_int16_interleave6_##cpu(dst, src, len);\
}else\
float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\
}
 
FLOAT_TO_INT16_INTERLEAVE(3dnow)
FLOAT_TO_INT16_INTERLEAVE(sse)
FLOAT_TO_INT16_INTERLEAVE(sse2)
 
static void float_to_int16_interleave_3dnowext(int16_t *dst, const float **src,
long len, int channels)
{
if(channels==6)
ff_float_to_int16_interleave6_3dnowext(dst, src, len);
else
float_to_int16_interleave_3dnow(dst, src, len, channels);
}
 
void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len);
 
void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len);
void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len);
 
static void float_interleave_mmx(float *dst, const float **src,
unsigned int len, int channels)
{
if (channels == 2) {
ff_float_interleave2_mmx(dst, src, len);
} else if (channels == 6)
ff_float_interleave6_mmx(dst, src, len);
else
ff_float_interleave_c(dst, src, len, channels);
}
 
static void float_interleave_sse(float *dst, const float **src,
unsigned int len, int channels)
{
if (channels == 2) {
ff_float_interleave2_sse(dst, src, len);
} else if (channels == 6)
ff_float_interleave6_sse(dst, src, len);
else
ff_float_interleave_c(dst, src, len, channels);
}
#endif /* HAVE_YASM */
 
av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx)
{
#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
 
if (EXTERNAL_MMX(cpu_flags)) {
c->float_interleave = float_interleave_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
c->float_to_int16 = ff_float_to_int16_3dnow;
c->float_to_int16_interleave = float_to_int16_interleave_3dnow;
}
}
if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) {
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) {
c->float_to_int16_interleave = float_to_int16_interleave_3dnowext;
}
}
if (EXTERNAL_SSE(cpu_flags)) {
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse;
c->float_to_int16 = ff_float_to_int16_sse;
c->float_to_int16_interleave = float_to_int16_interleave_sse;
c->float_interleave = float_interleave_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2;
c->float_to_int16 = ff_float_to_int16_sse2;
c->float_to_int16_interleave = float_to_int16_interleave_sse2;
}
#endif /* HAVE_YASM */
}
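
The init function above is normally reached through ff_fmt_convert_init(), which fills the generic FmtConvertContext pointers first and then lets the x86 code override them. A minimal caller-side sketch (convert_block is a hypothetical helper, not an FFmpeg function):

#include "libavcodec/fmtconvert.h"

/* Interleave 'channels' planar float buffers into 16-bit PCM using whichever
 * implementation (C, 3DNow!, SSE, SSE2) the init code selected. */
static void convert_block(AVCodecContext *avctx, int16_t *pcm,
                          const float **planar, long nb_samples, int channels)
{
    FmtConvertContext c;

    ff_fmt_convert_init(&c, avctx);   /* also calls ff_fmt_convert_init_x86() */
    c.float_to_int16_interleave(pcm, planar, nb_samples, channels);
}
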
/contrib/sdk/sources/ffmpeg/libavcodec/x86/fpel.asm
0,0 → 1,106
;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION .text
 
INIT_MMX mmxext
; void pixels(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PIXELS48 2
%if %2 == 4
%define OP movh
%else
%define OP mova
%endif
cglobal %1_pixels%2, 4,5
movsxdifnidn r2, r2d
lea r4, [r2*3]
.loop:
OP m0, [r1]
OP m1, [r1+r2]
OP m2, [r1+r2*2]
OP m3, [r1+r4]
lea r1, [r1+r2*4]
%ifidn %1, avg
pavgb m0, [r0]
pavgb m1, [r0+r2]
pavgb m2, [r0+r2*2]
pavgb m3, [r0+r4]
%endif
OP [r0], m0
OP [r0+r2], m1
OP [r0+r2*2], m2
OP [r0+r4], m3
sub r3d, 4
lea r0, [r0+r2*4]
jne .loop
RET
%endmacro
 
PIXELS48 put, 4
PIXELS48 avg, 4
PIXELS48 put, 8
PIXELS48 avg, 8
 
 
INIT_XMM sse2
; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
cglobal put_pixels16, 4,5,4
lea r4, [r2*3]
.loop:
movu m0, [r1]
movu m1, [r1+r2]
movu m2, [r1+r2*2]
movu m3, [r1+r4]
lea r1, [r1+r2*4]
mova [r0], m0
mova [r0+r2], m1
mova [r0+r2*2], m2
mova [r0+r4], m3
sub r3d, 4
lea r0, [r0+r2*4]
jnz .loop
REP_RET
 
; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
cglobal avg_pixels16, 4,5,4
lea r4, [r2*3]
.loop:
movu m0, [r1]
movu m1, [r1+r2]
movu m2, [r1+r2*2]
movu m3, [r1+r4]
lea r1, [r1+r2*4]
pavgb m0, [r0]
pavgb m1, [r0+r2]
pavgb m2, [r0+r2*2]
pavgb m3, [r0+r4]
mova [r0], m0
mova [r0+r2], m1
mova [r0+r2*2], m2
mova [r0+r4], m3
sub r3d, 4
lea r0, [r0+r2*4]
jnz .loop
REP_RET
/contrib/sdk/sources/ffmpeg/libavcodec/x86/fpel_mmx.c
0,0 → 1,139
/*
* MMX-optimized avg/put pixel routines
*
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <stddef.h>
#include <stdint.h>
 
#include "config.h"
#include "dsputil_x86.h"
 
#if HAVE_MMX_INLINE
 
// in case more speed is needed - unrolling would certainly help
void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
JUMPALIGN();
do {
__asm__ volatile(
"movq %0, %%mm0 \n\t"
"movq %1, %%mm1 \n\t"
PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
"movq %%mm2, %0 \n\t"
:"+m"(*block)
:"m"(*pixels)
:"memory");
pixels += line_size;
block += line_size;
}
while (--h);
}
 
void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
JUMPALIGN();
do {
__asm__ volatile(
"movq %0, %%mm0 \n\t"
"movq %1, %%mm1 \n\t"
PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
"movq %%mm2, %0 \n\t"
"movq 8%0, %%mm0 \n\t"
"movq 8%1, %%mm1 \n\t"
PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
"movq %%mm2, 8%0 \n\t"
:"+m"(*block)
:"m"(*pixels)
:"memory");
pixels += line_size;
block += line_size;
}
while (--h);
}
 
void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
__asm__ volatile (
"lea (%3, %3), %%"REG_a" \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1 ), %%mm0 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq %%mm0, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"movq (%1 ), %%mm0 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq %%mm0, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
: "+g"(h), "+r"(pixels), "+r"(block)
: "r"((x86_reg)line_size)
: "%"REG_a, "memory"
);
}
 
void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
__asm__ volatile (
"lea (%3, %3), %%"REG_a" \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1 ), %%mm0 \n\t"
"movq 8(%1 ), %%mm4 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq 8(%1, %3), %%mm5 \n\t"
"movq %%mm0, (%2) \n\t"
"movq %%mm4, 8(%2) \n\t"
"movq %%mm1, (%2, %3) \n\t"
"movq %%mm5, 8(%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"movq (%1 ), %%mm0 \n\t"
"movq 8(%1 ), %%mm4 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq 8(%1, %3), %%mm5 \n\t"
"movq %%mm0, (%2) \n\t"
"movq %%mm4, 8(%2) \n\t"
"movq %%mm1, (%2, %3) \n\t"
"movq %%mm5, 8(%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
: "+g"(h), "+r"(pixels), "+r"(block)
: "r"((x86_reg)line_size)
: "%"REG_a, "memory"
);
}
 
#endif /* HAVE_MMX_INLINE */
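
Both the yasm versions in fpel.asm and the inline-MMX versions above implement the same two block operations; a plain-C reference (the _ref names are stand-ins, not libavcodec symbols) makes the semantics explicit:

#include <stddef.h>
#include <stdint.h>
#include <string.h>

/* Copy a width x h block; the SIMD code moves 8 or 16 bytes per row,
 * four rows per loop iteration. */
static void put_pixels_ref(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h, int width)
{
    for (; h > 0; h--) {
        memcpy(block, pixels, width);
        block  += line_size;
        pixels += line_size;
    }
}

/* Rounded average into the destination, i.e. what pavgb (and the PAVGB_MMX
 * macro used above) computes per byte. */
static void avg_pixels_ref(uint8_t *block, const uint8_t *pixels,
                           ptrdiff_t line_size, int h, int width)
{
    for (; h > 0; h--) {
        for (int x = 0; x < width; x++)
            block[x] = (block[x] + pixels[x] + 1) >> 1;
        block  += line_size;
        pixels += line_size;
    }
}
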
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h263_loopfilter.asm
0,0 → 1,189
;******************************************************************************
;* MMX-optimized H.263 loop filter
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
cextern pb_FC
cextern h263_loop_filter_strength
 
SECTION_TEXT
 
%macro H263_LOOP_FILTER 5
pxor m7, m7
mova m0, [%1]
mova m1, [%1]
mova m2, [%4]
mova m3, [%4]
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
psubw m0, m2
psubw m1, m3
mova m2, [%2]
mova m3, [%2]
mova m4, [%3]
mova m5, [%3]
punpcklbw m2, m7
punpckhbw m3, m7
punpcklbw m4, m7
punpckhbw m5, m7
psubw m4, m2
psubw m5, m3
psllw m4, 2
psllw m5, 2
paddw m4, m0
paddw m5, m1
pxor m6, m6
pcmpgtw m6, m4
pcmpgtw m7, m5
pxor m4, m6
pxor m5, m7
psubw m4, m6
psubw m5, m7
psrlw m4, 3
psrlw m5, 3
packuswb m4, m5
packsswb m6, m7
pxor m7, m7
movd m2, %5
punpcklbw m2, m2
punpcklbw m2, m2
punpcklbw m2, m2
psubusb m2, m4
mova m3, m2
psubusb m3, m4
psubb m2, m3
mova m3, [%2]
mova m4, [%3]
pxor m3, m6
pxor m4, m6
paddusb m3, m2
psubusb m4, m2
pxor m3, m6
pxor m4, m6
paddusb m2, m2
packsswb m0, m1
pcmpgtb m7, m0
pxor m0, m7
psubb m0, m7
mova m1, m0
psubusb m0, m2
psubb m1, m0
pand m1, [pb_FC]
psrlw m1, 2
pxor m1, m7
psubb m1, m7
mova m5, [%1]
mova m6, [%4]
psubb m5, m1
paddb m6, m1
%endmacro
 
INIT_MMX mmx
; void h263_v_loop_filter(uint8_t *src, int stride, int qscale)
cglobal h263_v_loop_filter, 3,5
movsxdifnidn r1, r1d
movsxdifnidn r2, r2d
 
lea r4, [h263_loop_filter_strength]
movzx r3d, BYTE [r4+r2]
movsx r2, r3b
shl r2, 1
 
mov r3, r0
sub r3, r1
mov r4, r3
sub r4, r1
H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d
 
mova [r3], m3
mova [r0], m4
mova [r4], m5
mova [r0+r1], m6
RET
 
%macro TRANSPOSE4X4 2
movd m0, [%1]
movd m1, [%1+r1]
movd m2, [%1+r1*2]
movd m3, [%1+r3]
punpcklbw m0, m1
punpcklbw m2, m3
mova m1, m0
punpcklwd m0, m2
punpckhwd m1, m2
movd [%2+ 0], m0
punpckhdq m0, m0
movd [%2+ 8], m0
movd [%2+16], m1
punpckhdq m1, m1
movd [%2+24], m1
%endmacro
 
 
; void h263_h_loop_filter(uint8_t *src, int stride, int qscale)
INIT_MMX mmx
cglobal h263_h_loop_filter, 3,5,0,32
movsxdifnidn r1, r1d
movsxdifnidn r2, r2d
 
lea r4, [h263_loop_filter_strength]
movzx r3d, BYTE [r4+r2]
movsx r2, r3b
shl r2, 1
 
sub r0, 2
lea r3, [r1*3]
 
TRANSPOSE4X4 r0, rsp
lea r4, [r0+r1*4]
TRANSPOSE4X4 r4, rsp+4
 
H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d
 
mova m1, m5
mova m0, m4
punpcklbw m5, m3
punpcklbw m4, m6
punpckhbw m1, m3
punpckhbw m0, m6
mova m3, m5
mova m6, m1
punpcklwd m5, m4
punpcklwd m1, m0
punpckhwd m3, m4
punpckhwd m6, m0
movd [r0], m5
punpckhdq m5, m5
movd [r0+r1*1], m5
movd [r0+r1*2], m3
punpckhdq m3, m3
movd [r0+r3], m3
movd [r4], m1
punpckhdq m1, m1
movd [r4+r1*1], m1
movd [r4+r1*2], m6
punpckhdq m6, m6
movd [r4+r3], m6
RET
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_chromamc.asm
0,0 → 1,678
;******************************************************************************
;* MMX/SSSE3-optimized functions for H264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>,
;* 2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
rnd_rv40_2d_tbl: times 4 dw 0
times 4 dw 16
times 4 dw 32
times 4 dw 16
times 4 dw 32
times 4 dw 28
times 4 dw 32
times 4 dw 28
times 4 dw 0
times 4 dw 32
times 4 dw 16
times 4 dw 32
times 4 dw 32
times 4 dw 28
times 4 dw 32
times 4 dw 28
rnd_rv40_1d_tbl: times 4 dw 0
times 4 dw 2
times 4 dw 4
times 4 dw 2
times 4 dw 4
times 4 dw 3
times 4 dw 4
times 4 dw 3
times 4 dw 0
times 4 dw 4
times 4 dw 2
times 4 dw 4
times 4 dw 4
times 4 dw 3
times 4 dw 4
times 4 dw 3
 
cextern pw_3
cextern pw_4
cextern pw_8
pw_28: times 8 dw 28
cextern pw_32
cextern pw_64
 
SECTION .text
 
%macro mv0_pixels_mc8 0
lea r4, [r2*2 ]
.next4rows:
movq mm0, [r1 ]
movq mm1, [r1+r2]
add r1, r4
CHROMAMC_AVG mm0, [r0 ]
CHROMAMC_AVG mm1, [r0+r2]
movq [r0 ], mm0
movq [r0+r2], mm1
add r0, r4
movq mm0, [r1 ]
movq mm1, [r1+r2]
add r1, r4
CHROMAMC_AVG mm0, [r0 ]
CHROMAMC_AVG mm1, [r0+r2]
movq [r0 ], mm0
movq [r0+r2], mm1
add r0, r4
sub r3d, 4
jne .next4rows
%endmacro
 
%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
; int stride, int h, int mx, int my)
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
%if ARCH_X86_64
movsxd r2, r2d
%endif
mov r6d, r5d
or r6d, r4d
jne .at_least_one_non_zero
; mx == 0 AND my == 0 - no filter needed
mv0_pixels_mc8
REP_RET
 
.at_least_one_non_zero:
%ifidn %2, rv40
%if ARCH_X86_64
mov r7, r5
and r7, 6 ; &~1 for mx/my=[0,7]
lea r7, [r7*4+r4]
sar r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
mov r0, r5
and r0, 6 ; &~1 for mx/my=[0,7]
lea r0, [r0*4+r4]
sar r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias 0
%define dest_reg r0
%endif
 
test r5d, r5d
mov r6, 1
je .my_is_zero
test r4d, r4d
mov r6, r2 ; dxy = x ? 1 : stride
jne .both_non_zero
.my_is_zero:
; mx == 0 XOR my == 0 - 1 dimensional filter only
or r4d, r5d ; x + y
 
%ifidn %2, rv40
%ifdef PIC
lea r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
mov r5, r0m
%endif
%endif
 
movd m5, r4d
movq m4, [pw_8]
movq m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
punpcklwd m5, m5
punpckldq m5, m5 ; mm5 = B = x
pxor m7, m7
psubw m4, m5 ; mm4 = A = 8-x
 
.next1drow:
movq m0, [r1 ] ; mm0 = src[0..7]
movq m2, [r1+r6] ; mm1 = src[1..8]
 
movq m1, m0
movq m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
pmullw m0, m4 ; [mm0,mm1] = A * src[0..7]
pmullw m1, m4
pmullw m2, m5 ; [mm2,mm3] = B * src[1..8]
pmullw m3, m5
 
paddw m0, m6
paddw m1, m6
paddw m0, m2
paddw m1, m3
psrlw m0, 3
psrlw m1, 3
packuswb m0, m1
CHROMAMC_AVG m0, [dest_reg]
movq [dest_reg], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
 
add dest_reg, r2
add r1, r2
dec r3d
jne .next1drow
REP_RET
 
.both_non_zero: ; general case, bilinear
movd m4, r4d ; x
movd m6, r5d ; y
%ifidn %2, rv40
%ifdef PIC
lea r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
mov r5, r0m
%endif
%endif
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, 16 ; AA and DD
 
punpcklwd m4, m4
punpcklwd m6, m6
punpckldq m4, m4 ; mm4 = x words
punpckldq m6, m6 ; mm6 = y words
movq m5, m4
pmullw m4, m6 ; mm4 = x * y
psllw m5, 3
psllw m6, 3
movq m7, m5
paddw m7, m6
movq [rsp+8], m4 ; DD = x * y
psubw m5, m4 ; mm5 = B = 8x - xy
psubw m6, m4 ; mm6 = C = 8y - xy
paddw m4, [pw_64]
psubw m4, m7 ; mm4 = A = xy - (8x+8y) + 64
pxor m7, m7
movq [rsp ], m4
 
movq m0, [r1 ] ; mm0 = src[0..7]
movq m1, [r1+1] ; mm1 = src[1..8]
.next2drow:
add r1, r2
 
movq m2, m0
movq m3, m1
punpckhbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
pmullw m0, [rsp]
pmullw m2, [rsp]
pmullw m1, m5
pmullw m3, m5
paddw m2, m1 ; mm2 = A * src[0..3] + B * src[1..4]
paddw m3, m0 ; mm3 = A * src[4..7] + B * src[5..8]
 
movq m0, [r1]
movq m1, m0
punpcklbw m0, m7
punpckhbw m1, m7
pmullw m0, m6
pmullw m1, m6
paddw m2, m0
paddw m3, m1 ; [mm2,mm3] += C * src[0..7]
 
movq m1, [r1+1]
movq m0, m1
movq m4, m1
punpcklbw m0, m7
punpckhbw m4, m7
pmullw m0, [rsp+8]
pmullw m4, [rsp+8]
paddw m2, m0
paddw m3, m4 ; [mm2,mm3] += D * src[1..8]
movq m0, [r1]
 
paddw m2, [rnd_2d_%2+rnd_bias*8]
paddw m3, [rnd_2d_%2+rnd_bias*8]
psrlw m2, 6
psrlw m3, 6
packuswb m2, m3
CHROMAMC_AVG m2, [dest_reg]
movq [dest_reg], m2 ; dst[0..7] = ([mm2,mm3] + rnd) >> 6
 
add dest_reg, r2
dec r3d
jne .next2drow
mov rsp, r6 ; restore stack pointer
RET
%endmacro
 
%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
%if ARCH_X86_64
movsxd r2, r2d
%endif
pxor m7, m7
movd m2, r4d ; x
movd m3, r5d ; y
movq m4, [pw_8]
movq m5, [pw_8]
punpcklwd m2, m2
punpcklwd m3, m3
punpcklwd m2, m2
punpcklwd m3, m3
psubw m4, m2
psubw m5, m3
 
%ifidn %2, rv40
%ifdef PIC
lea r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
and r5, 6 ; &~1 for mx/my=[0,7]
lea r5, [r5*4+r4]
sar r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif
 
movd m0, [r1 ]
movd m6, [r1+1]
add r1, r2
punpcklbw m0, m7
punpcklbw m6, m7
pmullw m0, m4
pmullw m6, m2
paddw m6, m0
 
.next2rows:
movd m0, [r1 ]
movd m1, [r1+1]
add r1, r2
punpcklbw m0, m7
punpcklbw m1, m7
pmullw m0, m4
pmullw m1, m2
paddw m1, m0
movq m0, m1
 
pmullw m6, m5
pmullw m1, m3
paddw m6, [rnd_2d_%2+rnd_bias*8]
paddw m1, m6
psrlw m1, 6
packuswb m1, m1
CHROMAMC_AVG4 m1, m6, [r0]
movd [r0], m1
add r0, r2
 
movd m6, [r1 ]
movd m1, [r1+1]
add r1, r2
punpcklbw m6, m7
punpcklbw m1, m7
pmullw m6, m4
pmullw m1, m2
paddw m1, m6
movq m6, m1
pmullw m0, m5
pmullw m1, m3
paddw m0, [rnd_2d_%2+rnd_bias*8]
paddw m1, m0
psrlw m1, 6
packuswb m1, m1
CHROMAMC_AVG4 m1, m0, [r0]
movd [r0], m1
add r0, r2
sub r3d, 2
jnz .next2rows
REP_RET
%endmacro
 
%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
%if ARCH_X86_64
movsxd r2, r2d
%endif
 
mov r6d, r4d
shl r4d, 16
sub r4d, r6d
add r4d, 8
imul r5d, r4d ; x*y<<16 | y*(8-x)
shl r4d, 3
sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y)
 
movd m5, r4d
movd m6, r5d
punpckldq m5, m5 ; mm5 = {A,B,A,B}
punpckldq m6, m6 ; mm6 = {C,D,C,D}
pxor m7, m7
movd m2, [r1]
punpcklbw m2, m7
pshufw m2, m2, 0x94 ; mm2 = src[0,1,1,2]
 
.nextrow:
add r1, r2
movq m1, m2
pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
movd m0, [r1]
punpcklbw m0, m7
pshufw m0, m0, 0x94 ; mm0 = src[0,1,1,2]
movq m2, m0
pmaddwd m0, m6
paddw m1, [rnd_2d_%2]
paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2]
psrlw m1, 6
packssdw m1, m7
packuswb m1, m7
CHROMAMC_AVG4 m1, m3, [r0]
movd r5d, m1
mov [r0], r5w
add r0, r2
sub r3d, 1
jnz .nextrow
REP_RET
%endmacro
 
%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1 pw_3
%define rnd_2d_vc1 pw_28
 
%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
PAVGB %1, %2
%endmacro
%macro COPY_AVG 3
movd %2, %3
PAVGB %1, %2
%endmacro
 
INIT_MMX mmx
%define CHROMAMC_AVG NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1, _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40
 
INIT_MMX mmxext
chroma_mc2_mmx_func put, h264
 
%define CHROMAMC_AVG DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1, _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264
 
INIT_MMX 3dnow
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1, _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
 
%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
%if ARCH_X86_64
movsxd r2, r2d
%endif
mov r6d, r5d
or r6d, r4d
jne .at_least_one_non_zero
; mx == 0 AND my == 0 - no filter needed
mv0_pixels_mc8
REP_RET
 
.at_least_one_non_zero:
test r5d, r5d
je .my_is_zero
test r4d, r4d
je .mx_is_zero
 
; general case, bilinear
mov r6d, r4d
shl r4d, 8
sub r4, r6
mov r6, 8
add r4, 8 ; x*255+8 = x<<8 | (8-x)
sub r6d, r5d
imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)
 
movd m7, r6d
movd m6, r4d
movdqa m5, [rnd_2d_%2]
movq m0, [r1 ]
movq m1, [r1+1]
pshuflw m7, m7, 0
pshuflw m6, m6, 0
punpcklbw m0, m1
movlhps m7, m7
movlhps m6, m6
 
.next2rows:
movq m1, [r1+r2*1 ]
movq m2, [r1+r2*1+1]
movq m3, [r1+r2*2 ]
movq m4, [r1+r2*2+1]
lea r1, [r1+r2*2]
punpcklbw m1, m2
movdqa m2, m1
punpcklbw m3, m4
movdqa m4, m3
pmaddubsw m0, m7
pmaddubsw m1, m6
pmaddubsw m2, m7
pmaddubsw m3, m6
paddw m0, m5
paddw m2, m5
paddw m1, m0
paddw m3, m2
psrlw m1, 6
movdqa m0, m4
psrlw m3, 6
%ifidn %1, avg
movq m2, [r0 ]
movhps m2, [r0+r2]
%endif
packuswb m1, m3
CHROMAMC_AVG m1, m2
movq [r0 ], m1
movhps [r0+r2], m1
sub r3d, 2
lea r0, [r0+r2*2]
jg .next2rows
REP_RET
 
.my_is_zero:
mov r5d, r4d
shl r4d, 8
add r4, 8
sub r4, r5 ; 255*x+8 = x<<8 | (8-x)
movd m7, r4d
movdqa m6, [rnd_1d_%2]
pshuflw m7, m7, 0
movlhps m7, m7
 
.next2xrows:
movq m0, [r1 ]
movq m1, [r1 +1]
movq m2, [r1+r2 ]
movq m3, [r1+r2+1]
punpcklbw m0, m1
punpcklbw m2, m3
pmaddubsw m0, m7
pmaddubsw m2, m7
%ifidn %1, avg
movq m4, [r0 ]
movhps m4, [r0+r2]
%endif
paddw m0, m6
paddw m2, m6
psrlw m0, 3
psrlw m2, 3
packuswb m0, m2
CHROMAMC_AVG m0, m4
movq [r0 ], m0
movhps [r0+r2], m0
sub r3d, 2
lea r0, [r0+r2*2]
lea r1, [r1+r2*2]
jg .next2xrows
REP_RET
 
.mx_is_zero:
mov r4d, r5d
shl r5d, 8
add r5, 8
sub r5, r4 ; 255*y+8 = y<<8 | (8-y)
movd m7, r5d
movdqa m6, [rnd_1d_%2]
pshuflw m7, m7, 0
movlhps m7, m7
 
.next2yrows:
movq m0, [r1 ]
movq m1, [r1+r2 ]
movdqa m2, m1
movq m3, [r1+r2*2]
lea r1, [r1+r2*2]
punpcklbw m0, m1
punpcklbw m2, m3
pmaddubsw m0, m7
pmaddubsw m2, m7
%ifidn %1, avg
movq m4, [r0 ]
movhps m4, [r0+r2]
%endif
paddw m0, m6
paddw m2, m6
psrlw m0, 3
psrlw m2, 3
packuswb m0, m2
CHROMAMC_AVG m0, m4
movq [r0 ], m0
movhps [r0+r2], m0
sub r3d, 2
lea r0, [r0+r2*2]
jg .next2yrows
REP_RET
%endmacro
 
%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
%if ARCH_X86_64
movsxd r2, r2d
%endif
mov r6, r4
shl r4d, 8
sub r4d, r6d
mov r6, 8
add r4d, 8 ; x*255+8
sub r6d, r5d
imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x)
 
movd m7, r6d
movd m6, r4d
movq m5, [pw_32]
movd m0, [r1 ]
pshufw m7, m7, 0
punpcklbw m0, [r1+1]
pshufw m6, m6, 0
 
.next2rows:
movd m1, [r1+r2*1 ]
movd m3, [r1+r2*2 ]
punpcklbw m1, [r1+r2*1+1]
punpcklbw m3, [r1+r2*2+1]
lea r1, [r1+r2*2]
movq m2, m1
movq m4, m3
pmaddubsw m0, m7
pmaddubsw m1, m6
pmaddubsw m2, m7
pmaddubsw m3, m6
paddw m0, m5
paddw m2, m5
paddw m1, m0
paddw m3, m2
psrlw m1, 6
movq m0, m4
psrlw m3, 6
packuswb m1, m1
packuswb m3, m3
CHROMAMC_AVG m1, [r0 ]
CHROMAMC_AVG m3, [r0+r2]
movd [r0 ], m1
movd [r0+r2], m3
sub r3d, 2
lea r0, [r0+r2*2]
jg .next2rows
REP_RET
%endmacro
 
%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1, _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264
 
%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1, _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264
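
The register comments above already name the four bilinear weights (A = (8-x)(8-y), B = x(8-y), C = (8-x)y, D = xy, with A+B+C+D = 64). A scalar sketch of the general 8-wide H.264-rounding "put" case, as an illustration only (put_h264_chroma_mc8_ref is a made-up name, not the libavcodec C fallback):

#include <stdint.h>

/* General bilinear case of put_h264_chroma_mc8 (H.264 rounding). The assembly
 * keeps A/B/C/D broadcast into packed words and produces eight output pixels
 * per row iteration; this reference works one pixel at a time. */
static void put_h264_chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                                    int stride, int h, int mx, int my)
{
    const int A = (8 - mx) * (8 - my);
    const int B =      mx  * (8 - my);
    const int C = (8 - mx) *      my;
    const int D =      mx  *      my;   /* A + B + C + D == 64 */

    for (; h > 0; h--) {
        for (int x = 0; x < 8; x++)
            dst[x] = (A * src[x]          + B * src[x + 1] +
                      C * src[x + stride] + D * src[x + stride + 1] + 32) >> 6;
        dst += stride;
        src += stride;
    }
}

When mx or my is zero, two of the weights vanish and the expression reduces to the one-dimensional filter that the .my_is_zero / .mx_is_zero paths implement with pw_4 and a shift by 3.
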
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_chromamc_10bit.asm
0,0 → 1,271
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
cextern pw_4
cextern pw_8
cextern pw_32
cextern pw_64
 
SECTION .text
 
 
%macro MV0_PIXELS_MC8 0
lea r4, [r2*3 ]
lea r5, [r2*4 ]
.next4rows:
movu m0, [r1 ]
movu m1, [r1+r2 ]
CHROMAMC_AVG m0, [r0 ]
CHROMAMC_AVG m1, [r0+r2 ]
mova [r0 ], m0
mova [r0+r2 ], m1
movu m0, [r1+r2*2]
movu m1, [r1+r4 ]
CHROMAMC_AVG m0, [r0+r2*2]
CHROMAMC_AVG m1, [r0+r4 ]
mova [r0+r2*2], m0
mova [r0+r4 ], m1
add r1, r5
add r0, r5
sub r3d, 4
jne .next4rows
%endmacro
 
;-----------------------------------------------------------------------------
; void put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h, int mx, int my)
;-----------------------------------------------------------------------------
%macro CHROMA_MC8 1
; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
; int stride, int h, int mx, int my)
cglobal %1_h264_chroma_mc8_10, 6,7,8
movsxdifnidn r2, r2d
mov r6d, r5d
or r6d, r4d
jne .at_least_one_non_zero
; mx == 0 AND my == 0 - no filter needed
MV0_PIXELS_MC8
REP_RET
 
.at_least_one_non_zero:
mov r6d, 2
test r5d, r5d
je .x_interpolation
mov r6, r2 ; dxy = x ? 1 : stride
test r4d, r4d
jne .xy_interpolation
.x_interpolation:
; mx == 0 XOR my == 0 - 1 dimensional filter only
or r4d, r5d ; x + y
movd m5, r4d
mova m4, [pw_8]
mova m6, [pw_4] ; mm6 = rnd >> 3
SPLATW m5, m5 ; mm5 = B = x
psubw m4, m5 ; mm4 = A = 8-x
 
.next1drow:
movu m0, [r1 ] ; mm0 = src[0..7]
movu m2, [r1+r6] ; mm2 = src[1..8]
 
pmullw m0, m4 ; mm0 = A * src[0..7]
pmullw m2, m5 ; mm2 = B * src[1..8]
 
paddw m0, m6
paddw m0, m2
psrlw m0, 3
CHROMAMC_AVG m0, [r0]
mova [r0], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3
 
add r0, r2
add r1, r2
dec r3d
jne .next1drow
REP_RET
 
.xy_interpolation: ; general case, bilinear
movd m4, r4m ; x
movd m6, r5m ; y
 
SPLATW m4, m4 ; mm4 = x words
SPLATW m6, m6 ; mm6 = y words
psllw m5, m4, 3 ; mm5 = 8x
pmullw m4, m6 ; mm4 = x * y
psllw m6, 3 ; mm6 = 8y
paddw m1, m5, m6 ; mm1 = 8x+8y
mova m7, m4 ; DD = x * y
psubw m5, m4 ; mm5 = B = 8x - xy
psubw m6, m4 ; mm6 = C = 8y - xy
paddw m4, [pw_64]
psubw m4, m1 ; mm4 = A = xy - (8x+8y) + 64
 
movu m0, [r1 ] ; mm0 = src[0..7]
movu m1, [r1+2] ; mm1 = src[1..8]
.next2drow:
add r1, r2
 
pmullw m2, m0, m4
pmullw m1, m5
paddw m2, m1 ; mm2 = A * src[0..7] + B * src[1..8]
 
movu m0, [r1]
movu m1, [r1+2]
pmullw m3, m0, m6
paddw m2, m3 ; mm2 += C * src[0..7+stride]
pmullw m3, m1, m7
paddw m2, m3 ; mm2 += D * src[1..8+stride]
 
paddw m2, [pw_32]
psrlw m2, 6
CHROMAMC_AVG m2, [r0]
mova [r0], m2 ; dst[0..7] = (mm2 + 32) >> 6
 
add r0, r2
dec r3d
jne .next2drow
REP_RET
%endmacro
 
;-----------------------------------------------------------------------------
; void put/avg_h264_chroma_mc4(pixel *dst, pixel *src, int stride, int h, int mx, int my)
;-----------------------------------------------------------------------------
;TODO: xmm mc4
%macro MC4_OP 2
movq %1, [r1 ]
movq m1, [r1+2]
add r1, r2
pmullw %1, m4
pmullw m1, m2
paddw m1, %1
mova %1, m1
 
pmullw %2, m5
pmullw m1, m3
paddw %2, [pw_32]
paddw m1, %2
psrlw m1, 6
CHROMAMC_AVG m1, %2, [r0]
movq [r0], m1
add r0, r2
%endmacro
 
%macro CHROMA_MC4 1
cglobal %1_h264_chroma_mc4_10, 6,6,7
movsxdifnidn r2, r2d
movd m2, r4m ; x
movd m3, r5m ; y
mova m4, [pw_8]
mova m5, m4
SPLATW m2, m2
SPLATW m3, m3
psubw m4, m2
psubw m5, m3
 
movq m0, [r1 ]
movq m6, [r1+2]
add r1, r2
pmullw m0, m4
pmullw m6, m2
paddw m6, m0
 
.next2rows:
MC4_OP m0, m6
MC4_OP m6, m0
sub r3d, 2
jnz .next2rows
REP_RET
%endmacro
 
;-----------------------------------------------------------------------------
; void put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h, int mx, int my)
;-----------------------------------------------------------------------------
%macro CHROMA_MC2 1
cglobal %1_h264_chroma_mc2_10, 6,7
movsxdifnidn r2, r2d
mov r6d, r4d
shl r4d, 16
sub r4d, r6d
add r4d, 8
imul r5d, r4d ; x*y<<16 | y*(8-x)
shl r4d, 3
sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y)
 
movd m5, r4d
movd m6, r5d
punpckldq m5, m5 ; mm5 = {A,B,A,B}
punpckldq m6, m6 ; mm6 = {C,D,C,D}
pxor m7, m7
pshufw m2, [r1], 0x94 ; mm2 = src[0,1,1,2]
 
.nextrow:
add r1, r2
movq m1, m2
pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2]
pshufw m0, [r1], 0x94 ; mm0 = src[0,1,1,2]
movq m2, m0
pmaddwd m0, m6
paddw m1, [pw_32]
paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2]
psrlw m1, 6
packssdw m1, m7
CHROMAMC_AVG m1, m3, [r0]
movd [r0], m1
add r0, r2
dec r3d
jnz .nextrow
REP_RET
%endmacro
 
%macro NOTHING 2-3
%endmacro
%macro AVG 2-3
%if %0==3
movq %2, %3
%endif
pavgw %1, %2
%endmacro
 
%define CHROMAMC_AVG NOTHING
INIT_XMM sse2
CHROMA_MC8 put
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 put
%endif
INIT_MMX mmxext
CHROMA_MC4 put
CHROMA_MC2 put
 
%define CHROMAMC_AVG AVG
INIT_XMM sse2
CHROMA_MC8 avg
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
CHROMA_MC8 avg
%endif
INIT_MMX mmxext
CHROMA_MC4 avg
CHROMA_MC2 avg
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_deblock.asm
0,0 → 1,1078
;*****************************************************************************
;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt <lorenm@u.washington.edu>
;* Jason Garrett-Glaser <darkshikari@gmail.com>
;* Oskar Arvidsson <oskar@irock.se>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
pb_A1: times 16 db 0xA1
pb_3_1: times 4 db 3, 1
 
SECTION .text
 
cextern pb_0
cextern pb_1
cextern pb_3
 
; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
[base], [base+stride], [base+stride*2], [base3], \
[base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]
 
%define PASS8ROWS(base, base3, stride, stride3, offset) \
PASS8ROWS(base+offset, base3+offset, stride, stride3)
 
; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 11
movh m0, %4
movh m2, %5
movh m1, %6
movh m3, %7
punpckl%1 m0, m2
punpckl%1 m1, m3
mova m2, m0
punpckl%2 m0, m1
punpckh%2 m2, m1
 
movh m4, %8
movh m6, %9
movh m5, %10
movh m7, %11
punpckl%1 m4, m6
punpckl%1 m5, m7
mova m6, m4
punpckl%2 m4, m5
punpckh%2 m6, m5
 
punpckh%3 m1, m0, m4
punpckh%3 m3, m2, m6
punpckl%3 m0, m4
punpckl%3 m2, m6
%endmacro
 
; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
punpckhdq m4, m0, m0
punpckhdq m5, m1, m1
punpckhdq m6, m2, m2
 
punpcklbw m0, m1
punpcklbw m2, m3
punpcklwd m1, m0, m2
punpckhwd m0, m2
movh %1, m1
punpckhdq m1, m1
movh %2, m1
movh %3, m0
punpckhdq m0, m0
movh %4, m0
 
punpckhdq m3, m3
punpcklbw m4, m5
punpcklbw m6, m3
punpcklwd m5, m4, m6
punpckhwd m4, m6
movh %5, m5
punpckhdq m5, m5
movh %6, m5
movh %7, m4
punpckhdq m4, m4
movh %8, m4
%endmacro
 
%macro TRANSPOSE4x8B_LOAD 8
TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro
 
%macro SBUTTERFLY3 4
punpckh%1 %4, %2, %3
punpckl%1 %2, %3
%endmacro
 
; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
RESET_MM_PERMUTATION
movq m0, %1
movq m1, %2
movq m2, %3
movq m3, %4
movq m4, %5
movq m5, %6
movq m6, %7
SBUTTERFLY bw, 0, 1, 7
SBUTTERFLY bw, 2, 3, 7
SBUTTERFLY bw, 4, 5, 7
movq [%9+0x10], m3
SBUTTERFLY3 bw, m6, %8, m7
SBUTTERFLY wd, 0, 2, 3
SBUTTERFLY wd, 4, 6, 3
punpckhdq m0, m4
movq [%9+0x00], m0
SBUTTERFLY3 wd, m1, [%9+0x10], m3
SBUTTERFLY wd, 5, 7, 0
SBUTTERFLY dq, 1, 5, 0
SBUTTERFLY dq, 2, 6, 0
punpckldq m3, m7
movq [%9+0x10], m2
movq [%9+0x20], m6
movq [%9+0x30], m1
movq [%9+0x40], m5
movq [%9+0x50], m3
RESET_MM_PERMUTATION
%endmacro
 
; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
RESET_MM_PERMUTATION
movq m0, %1
movq m1, %2
movq m2, %3
movq m3, %4
movq m4, %5
movq m5, %6
movq m6, %7
SBUTTERFLY bw, 0, 1, 7
SBUTTERFLY bw, 2, 3, 7
SBUTTERFLY bw, 4, 5, 7
SBUTTERFLY3 bw, m6, %8, m7
movq %9, m5
SBUTTERFLY wd, 0, 2, 5
SBUTTERFLY wd, 4, 6, 5
SBUTTERFLY wd, 1, 3, 5
movq %11, m6
movq m6, %9
SBUTTERFLY wd, 6, 7, 5
SBUTTERFLY dq, 0, 4, 5
SBUTTERFLY dq, 1, 6, 5
movq %9, m0
movq %10, m4
movq %13, m1
movq %14, m6
SBUTTERFLY3 dq, m2, %11, m0
SBUTTERFLY dq, 3, 7, 4
movq %11, m2
movq %12, m0
movq %15, m3
movq %16, m7
RESET_MM_PERMUTATION
%endmacro
 
; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
mova %5, %2
mova %4, %1
psubusb %5, %1
psubusb %4, %2
%else
psubusb %5, %2, %1
psubusb %4, %1, %2
%endif
por %4, %5
psubusb %4, %3
%endmacro
 
; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT2 5
%if ARCH_X86_64
psubusb %5, %2, %1
psubusb %4, %1, %2
%else
mova %5, %2
mova %4, %1
psubusb %5, %1
psubusb %4, %2
%endif
psubusb %5, %3
psubusb %4, %3
pcmpeqb %4, %5
%endmacro
 
; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
movd m4, %1
movd m5, %2
SPLATW m4, m4
SPLATW m5, m5
packuswb m4, m4 ; 16x alpha-1
packuswb m5, m5 ; 16x beta-1
%if %0>2
mova %3, m4
%endif
DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
por m7, m4
DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
por m7, m4
pxor m6, m6
pcmpeqb m7, m6
%endmacro
 
; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
pcmpeqb m4, m4
pxor m5, m1, m2 ; p0^q0
pxor m3, m4
pand m5, [pb_1] ; (p0^q0)&1
pavgb m3, m0 ; (p1 - q1 + 256)>>1
pxor m4, m1
pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
pavgb m4, m2 ; (q0 - p0 + 256)>>1
pavgb m3, m5
mova m6, [pb_A1]
paddusb m3, m4 ; d+128+33
psubusb m6, m3
psubusb m3, [pb_A1]
pminub m6, m7
pminub m3, m7
psubusb m1, m6
psubusb m2, m3
paddusb m1, m3
paddusb m2, m6
%endmacro
 
; in: m1=p0 m2=q0
; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
pavgb %6, m1, m2
pavgb %2, %6 ; avg(p2,avg(p0,q0))
pxor %6, %3
pand %6, [pb_1] ; (p2^avg(p0,q0))&1
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1
psubusb %6, %1, %5
paddusb %5, %1
pmaxub %2, %6
pminub %2, %5
mova %4, %2
%endmacro
 
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA 0
cglobal deblock_v_luma_8, 5,5,10
movd m8, [r4] ; tc0
lea r4, [r1*3]
dec r2d ; alpha-1
neg r4
dec r3d ; beta-1
add r4, r0 ; pix-3*stride
 
mova m0, [r4+r1] ; p1
mova m1, [r4+2*r1] ; p0
mova m2, [r0] ; q0
mova m3, [r0+r1] ; q1
LOAD_MASK r2d, r3d
 
punpcklbw m8, m8
punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
pcmpeqb m9, m9
pcmpeqb m9, m8
pandn m9, m7
pand m8, m9
 
movdqa m3, [r4] ; p2
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
pand m6, m9
psubb m7, m8, m6
pand m6, m8
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
 
movdqa m4, [r0+2*r1] ; q2
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
pand m6, m9
pand m8, m6
psubb m7, m6
mova m3, [r0+r1]
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6
 
DEBLOCK_P0_Q0
mova [r4+2*r1], m1
mova [r0], m2
RET
 
;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
movsxd r7, r1d
lea r8, [r7+r7*2]
lea r6, [r0-4]
lea r5, [r0-4+r8]
%if WIN64
%define pix_tmp rsp+0x30 ; shadow space + r4
%else
%define pix_tmp rsp
%endif
 
; transpose 6x16 -> tmp space
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp
lea r6, [r6+r7*8]
lea r5, [r5+r7*8]
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8
 
; vertical filter
; alpha, beta, tc0 are still in r2d, r3d, r4
; don't back up r6, r5, r7, r8 because deblock_v_luma_8 doesn't use them
lea r0, [pix_tmp+0x30]
mov r1d, 0x10
%if WIN64
mov [rsp+0x20], r4
%endif
call deblock_v_luma_8
 
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
add r6, 2
add r5, 2
movq m0, [pix_tmp+0x18]
movq m1, [pix_tmp+0x28]
movq m2, [pix_tmp+0x38]
movq m3, [pix_tmp+0x48]
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
 
shl r7, 3
sub r6, r7
sub r5, r7
shr r7, 3
movq m0, [pix_tmp+0x10]
movq m1, [pix_tmp+0x20]
movq m2, [pix_tmp+0x30]
movq m3, [pix_tmp+0x40]
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8)
 
RET
%endmacro
 
INIT_XMM sse2
DEBLOCK_LUMA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
%endif
 
%else
 
%macro DEBLOCK_LUMA 2
;-----------------------------------------------------------------------------
; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_8, 5,5,8,2*%2
lea r4, [r1*3]
dec r2 ; alpha-1
neg r4
dec r3 ; beta-1
add r4, r0 ; pix-3*stride
 
mova m0, [r4+r1] ; p1
mova m1, [r4+2*r1] ; p0
mova m2, [r0] ; q0
mova m3, [r0+r1] ; q1
LOAD_MASK r2, r3
 
mov r3, r4mp
pcmpeqb m3, m3
movd m4, [r3] ; tc0
punpcklbw m4, m4
punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
mova [esp+%2], m4 ; tc
pcmpgtb m4, m3
mova m3, [r4] ; p2
pand m4, m7
mova [esp], m4 ; mask
 
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
pand m6, m4
pand m4, [esp+%2] ; tc
psubb m7, m4, m6
pand m6, m4
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4
 
mova m4, [r0+2*r1] ; q2
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
pand m6, [esp] ; mask
mova m5, [esp+%2] ; tc
psubb m7, m6
pand m5, m6
mova m3, [r0+r1]
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6
 
DEBLOCK_P0_Q0
mova [r4+2*r1], m1
mova [r0], m2
RET
 
;-----------------------------------------------------------------------------
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 0,5,8,0x60+HAVE_ALIGNED_STACK*12
mov r0, r0mp
mov r3, r1m
lea r4, [r3*3]
sub r0, 4
lea r1, [r0+r4]
%define pix_tmp esp+12*HAVE_ALIGNED_STACK
 
; transpose 6x16 -> tmp space
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp
lea r0, [r0+r3*8]
lea r1, [r1+r3*8]
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8
 
; vertical filter
lea r0, [pix_tmp+0x30]
PUSH dword r4m
PUSH dword r3m
PUSH dword r2m
PUSH dword 16
PUSH dword r0
call deblock_%1_luma_8
%ifidn %1, v8
add dword [esp ], 8 ; pix_tmp+0x38
add dword [esp+16], 2 ; tc0+2
call deblock_%1_luma_8
%endif
ADD esp, 20
 
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter)
mov r0, r0mp
sub r0, 2
 
movq m0, [pix_tmp+0x10]
movq m1, [pix_tmp+0x20]
lea r1, [r0+r4]
movq m2, [pix_tmp+0x30]
movq m3, [pix_tmp+0x40]
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
 
lea r0, [r0+r3*8]
lea r1, [r1+r3*8]
movq m0, [pix_tmp+0x18]
movq m1, [pix_tmp+0x28]
movq m2, [pix_tmp+0x38]
movq m3, [pix_tmp+0x48]
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4)
 
RET
%endmacro ; DEBLOCK_LUMA
 
INIT_MMX mmxext
DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA v, 16
%endif
 
%endif ; ARCH
 
 
 
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
%if ARCH_X86_64
pavgb t0, p2, p1
pavgb t1, p0, q0
%else
mova t0, p2
mova t1, p0
pavgb t0, p1
pavgb t1, q0
%endif
pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
mova t5, t1
%if ARCH_X86_64
paddb t2, p2, p1
paddb t3, p0, q0
%else
mova t2, p2
mova t3, p0
paddb t2, p1
paddb t3, q0
%endif
paddb t2, t3
mova t3, t2
mova t4, t2
psrlw t2, 1
pavgb t2, mpb_0
pxor t2, t0
pand t2, mpb_1
psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;
 
%if ARCH_X86_64
pavgb t1, p2, q1
psubb t2, p2, q1
%else
mova t1, p2
mova t2, p2
pavgb t1, q1
psubb t2, q1
%endif
paddb t3, t3
psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
pand t2, mpb_1
psubb t1, t2
pavgb t1, p1
pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
psrlw t3, 2
pavgb t3, mpb_0
pxor t3, t1
pand t3, mpb_1
psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8
 
pxor t3, p0, q1
pavgb t2, p0, q1
pand t3, mpb_1
psubb t2, t3
pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4
 
pxor t1, t2
pxor t2, p0
pand t1, mask1p
pand t2, mask0
pxor t1, t2
pxor t1, p0
mova %1, t1 ; store p0
 
mova t1, %4 ; p3
paddb t2, t1, p2
pavgb t1, p2
pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
paddb t2, t2
paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
psrlw t2, 2
pavgb t2, mpb_0
pxor t2, t1
pand t2, mpb_1
psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8
 
pxor t0, p1
pxor t1, p2
pand t0, mask1p
pand t1, mask1p
pxor t0, p1
pxor t1, p2
mova %2, t0 ; store p1
mova %3, t1 ; store p2
%endmacro
 
%macro LUMA_INTRA_SWAP_PQ 0
%define q1 m0
%define q0 m1
%define p0 m2
%define p1 m3
%define p2 q2
%define mask1p mask1q
%endmacro
 
%macro DEBLOCK_LUMA_INTRA 1
%define p1 m0
%define p0 m1
%define q0 m2
%define q1 m3
%define t0 m4
%define t1 m5
%define t2 m6
%define t3 m7
%if ARCH_X86_64
%define p2 m8
%define q2 m9
%define t4 m10
%define t5 m11
%define mask0 m12
%define mask1p m13
%if WIN64
%define mask1q [rsp]
%else
%define mask1q [rsp-24]
%endif
%define mpb_0 m14
%define mpb_1 m15
%else
%define spill(x) [esp+16*x]
%define p2 [r4+r1]
%define q2 [r0+2*r1]
%define t4 spill(0)
%define t5 spill(1)
%define mask0 spill(2)
%define mask1p spill(3)
%define mask1q spill(4)
%define mpb_0 [pb_0]
%define mpb_1 [pb_1]
%endif
 
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
%if WIN64
cglobal deblock_%1_luma_intra_8, 4,6,16,0x10
%else
cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
%endif
lea r4, [r1*4]
lea r5, [r1*3] ; 3*stride
dec r2d ; alpha-1
jl .end
neg r4
dec r3d ; beta-1
jl .end
add r4, r0 ; pix-4*stride
mova p1, [r4+2*r1]
mova p0, [r4+r5]
mova q0, [r0]
mova q1, [r0+r1]
%if ARCH_X86_64
pxor mpb_0, mpb_0
mova mpb_1, [pb_1]
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
SWAP 7, 12 ; m12=mask0
pavgb t5, mpb_0
pavgb t5, mpb_1 ; alpha/4+1
movdqa p2, [r4+r1]
movdqa q2, [r0+2*r1]
DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
pand t0, mask0
pand t4, t0
pand t2, t0
mova mask1q, t4
mova mask1p, t2
%else
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
mova m4, t5
mova mask0, m7
pavgb m4, [pb_0]
pavgb m4, [pb_1] ; alpha/4+1
DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
pand m6, mask0
DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
pand m4, m6
mova mask1p, m4
DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
pand m4, m6
mova mask1q, m4
%endif
LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
LUMA_INTRA_SWAP_PQ
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
RET
 
INIT_MMX cpuname
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_8, 4,9,0,0x80
movsxd r7, r1d
lea r8, [r7*3]
lea r6, [r0-4]
lea r5, [r0-4+r8]
%if WIN64
%define pix_tmp rsp+0x20 ; shadow space
%else
%define pix_tmp rsp
%endif
 
; transpose 8x16 -> tmp space
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
lea r6, [r6+r7*8]
lea r5, [r5+r7*8]
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
 
lea r0, [pix_tmp+0x40]
mov r1, 0x10
call deblock_v_luma_intra_8
 
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
lea r5, [r6+r8]
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
shl r7, 3
sub r6, r7
sub r5, r7
shr r7, 3
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
RET
%else
cglobal deblock_h_luma_intra_8, 2,4,8,0x80
lea r3, [r1*3]
sub r0, 4
lea r2, [r0+r3]
%define pix_tmp rsp
 
; transpose 8x16 -> tmp space
TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
lea r0, [r0+r1*8]
lea r2, [r2+r1*8]
TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)
 
lea r0, [pix_tmp+0x40]
PUSH dword r3m
PUSH dword r2m
PUSH dword 16
PUSH r0
call deblock_%1_luma_intra_8
%ifidn %1, v8
add dword [rsp], 8 ; pix_tmp+8
call deblock_%1_luma_intra_8
%endif
ADD esp, 16
 
mov r1, r1m
mov r0, r0mp
lea r3, [r1*3]
sub r0, 4
lea r2, [r0+r3]
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
lea r0, [r0+r1*8]
lea r2, [r2+r1*8]
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA
 
INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
%endif
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA_INTRA v8
%endif
 
INIT_MMX mmxext
 
%macro CHROMA_V_START 0
dec r2d ; alpha-1
dec r3d ; beta-1
mov t5, r0
sub t5, r1
sub t5, r1
%endmacro
 
%macro CHROMA_H_START 0
dec r2d
dec r3d
sub r0, 2
lea t6, [r1*3]
mov t5, r0
add r0, t6
%endmacro
 
%define t5 r5
%define t6 r6
 
;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_8, 5,6
CHROMA_V_START
movq m0, [t5]
movq m1, [t5+r1]
movq m2, [r0]
movq m3, [r0+r1]
call ff_chroma_inter_body_mmxext
movq [t5+r1], m1
movq [r0], m2
RET
 
;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_8, 5,7
%if UNIX64
%define buf0 [rsp-24]
%define buf1 [rsp-16]
%elif WIN64
sub rsp, 16
%define buf0 [rsp]
%define buf1 [rsp+8]
%else
%define buf0 r0m
%define buf1 r2m
%endif
CHROMA_H_START
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
movq buf0, m0
movq buf1, m3
LOAD_MASK r2d, r3d
movd m6, [r4] ; tc0
punpcklbw m6, m6
pand m7, m6
DEBLOCK_P0_Q0
movq m0, buf0
movq m3, buf1
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
%if WIN64
add rsp, 16
%endif
RET
 
ALIGN 16
ff_chroma_inter_body_mmxext:
LOAD_MASK r2d, r3d
movd m6, [r4] ; tc0
punpcklbw m6, m6
pand m7, m6
DEBLOCK_P0_Q0
ret
 
 
 
; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
movq m4, %1
pxor m4, %3
pand m4, [pb_1] ; m4 = (p0^q1)&1
pavgb %1, %3
psubusb %1, m4
pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro
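; (the (p0^q1)&1 correction cancels pavgb's rounding, turning avg(p0,q1) into
; floor((p0+q1)/2), so the second pavgb yields exactly (p0 + q1 + 2*p1 + 2) >> 2.)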
 
%define t5 r4
%define t6 r5
 
;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_8, 4,5
CHROMA_V_START
movq m0, [t5]
movq m1, [t5+r1]
movq m2, [r0]
movq m3, [r0+r1]
call ff_chroma_intra_body_mmxext
movq [t5+r1], m1
movq [r0], m2
RET
 
;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_8, 4,6
CHROMA_H_START
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
call ff_chroma_intra_body_mmxext
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
RET
 
ALIGN 16
ff_chroma_intra_body_mmxext:
LOAD_MASK r2d, r3d
movq m5, m1
movq m6, m2
CHROMA_INTRA_P0 m1, m0, m3
CHROMA_INTRA_P0 m2, m3, m0
psubb m1, m5
psubb m2, m6
pand m1, m7
pand m2, m7
paddb m1, m5
paddb m2, m6
ret
 
;-----------------------------------------------------------------------------
; void h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
; int8_t ref[2][40], int16_t mv[2][40][2],
; int bidir, int edges, int step,
; int mask_mv0, int mask_mv1, int field);
;
; bidir is 0 or 1
; edges is 1 or 4
; step is 1 or 2
; mask_mv0 is 0 or 3
; mask_mv1 is 0 or 1
; field is 0 or 1
;-----------------------------------------------------------------------------
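; Per 4x4 block the iteration below reduces to the usual strength rule for
; non-intra edges: bS = 2 if either side has non-zero coefficients (nnz),
; otherwise 1 if the reference/MV comparison flagged a difference, otherwise 0;
; the byte flags are widened to words and stored into bs[dir].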
%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
; dir, d_idx, mask_dir, bidir
%define edgesd %1
%define stepd %2
%define mask_mvd %3
%define dir %4
%define d_idx %5
%define mask_dir %6
%define bidir %7
xor b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step)
%%.b_idx_loop:
%if mask_dir == 0
pxor m0, m0
%endif
test b_idxd, dword mask_mvd
jnz %%.skip_loop_iter ; if (!(b_idx & mask_mv))
%if bidir == 1
movd m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
punpckldq m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
pshufw m0, [refq+b_idxq+12], 0x44 ; { ref0[b], ref0[b] }
pshufw m1, [refq+b_idxq+52], 0x44 ; { ref1[b], ref1[b] }
pshufw m3, m2, 0x4E ; { ref1[bn], ref0[bn] }
psubb m0, m2 ; { ref0[b] != ref0[bn],
; ref0[b] != ref1[bn] }
psubb m1, m3 ; { ref1[b] != ref1[bn],
; ref1[b] != ref0[bn] }
 
por m0, m1
mova m1, [mvq+b_idxq*4+(d_idx+12)*4]
mova m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
mova m3, m1
mova m4, m2
psubw m1, [mvq+b_idxq*4+12*4]
psubw m2, [mvq+b_idxq*4+12*4+mmsize]
psubw m3, [mvq+b_idxq*4+52*4]
psubw m4, [mvq+b_idxq*4+52*4+mmsize]
packsswb m1, m2
packsswb m3, m4
paddb m1, m6
paddb m3, m6
psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
psubusb m3, m5
packsswb m1, m3
 
por m0, m1
mova m1, [mvq+b_idxq*4+(d_idx+52)*4]
mova m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
mova m3, m1
mova m4, m2
psubw m1, [mvq+b_idxq*4+12*4]
psubw m2, [mvq+b_idxq*4+12*4+mmsize]
psubw m3, [mvq+b_idxq*4+52*4]
psubw m4, [mvq+b_idxq*4+52*4+mmsize]
packsswb m1, m2
packsswb m3, m4
paddb m1, m6
paddb m3, m6
psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
psubusb m3, m5
packsswb m1, m3
 
pshufw m1, m1, 0x4E
por m0, m1
pshufw m1, m0, 0x4E
pminub m0, m1
%else ; bidir == 0
movd m0, [refq+b_idxq+12]
psubb m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]
 
mova m1, [mvq+b_idxq*4+12*4]
mova m2, [mvq+b_idxq*4+12*4+mmsize]
psubw m1, [mvq+b_idxq*4+(d_idx+12)*4]
psubw m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
packsswb m1, m2
paddb m1, m6
psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit
packsswb m1, m1
por m0, m1
%endif ; bidir == 1/0
 
%%.skip_loop_iter:
movd m1, [nnzq+b_idxq+12]
por m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]
 
pminub m1, m7
pminub m0, m7
psllw m1, 1
pxor m2, m2
pmaxub m1, m0
punpcklbw m1, m2
movq [bsq+b_idxq+32*dir], m1
 
add b_idxd, dword stepd
cmp b_idxd, dword edgesd
jl %%.b_idx_loop
%endmacro
 
INIT_MMX mmxext
cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \
step, mask_mv0, mask_mv1, field
%define b_idxq bidirq
%define b_idxd bidird
cmp dword fieldm, 0
mova m7, [pb_1]
mova m5, [pb_3]
je .nofield
mova m5, [pb_3_1]
.nofield:
mova m6, m5
paddb m5, m5
 
shl dword stepd, 3
shl dword edgesd, 3
%if ARCH_X86_32
%define mask_mv0d mask_mv0m
%define mask_mv1d mask_mv1m
%endif
shl dword mask_mv1d, 3
shl dword mask_mv0d, 3
 
cmp dword bidird, 0
jne .bidir
loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 0
loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 0
 
mova m0, [bsq+mmsize*0]
mova m1, [bsq+mmsize*1]
mova m2, [bsq+mmsize*2]
mova m3, [bsq+mmsize*3]
TRANSPOSE4x4W 0, 1, 2, 3, 4
mova [bsq+mmsize*0], m0
mova [bsq+mmsize*1], m1
mova [bsq+mmsize*2], m2
mova [bsq+mmsize*3], m3
RET
 
.bidir:
loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 1
loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 1
 
mova m0, [bsq+mmsize*0]
mova m1, [bsq+mmsize*1]
mova m2, [bsq+mmsize*2]
mova m3, [bsq+mmsize*3]
TRANSPOSE4x4W 0, 1, 2, 3, 4
mova [bsq+mmsize*0], m0
mova [bsq+mmsize*1], m1
mova [bsq+mmsize*2], m2
mova [bsq+mmsize*3], m3
RET
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_deblock_10bit.asm
0,0 → 1,923
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Oskar Arvidsson <oskar@irock.se>
;* Loren Merritt <lorenm@u.washington.edu>
;* Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
pw_pixel_max: times 8 dw ((1 << 10)-1)
 
SECTION .text
 
cextern pw_2
cextern pw_3
cextern pw_4
 
; out: %4 = |%1-%2|-%3
; clobbers: %5
%macro ABS_SUB 5
psubusw %5, %2, %1
psubusw %4, %1, %2
por %4, %5
psubw %4, %3
%endmacro
 
; out: %4 = |%1-%2|<%3
%macro DIFF_LT 5
psubusw %4, %2, %1
psubusw %5, %1, %2
por %5, %4 ; |%1-%2|
pxor %4, %4
psubw %5, %3 ; |%1-%2|-%3
pcmpgtw %4, %5 ; 0 > |%1-%2|-%3
%endmacro
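; (unlike the 8-bit DIFF_GT helpers this returns a full per-word mask:
; %4 = 0xFFFF where |%1-%2| < %3, 0 elsewhere.)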
 
%macro LOAD_AB 4
movd %1, %3
movd %2, %4
SPLATW %1, %1
SPLATW %2, %2
%endmacro
 
; in: %2=tc reg
; out: %1=splatted tc
%macro LOAD_TC 2
movd %1, [%2]
punpcklbw %1, %1
%if mmsize == 8
pshufw %1, %1, 0
%else
pshuflw %1, %1, 01010000b
pshufd %1, %1, 01010000b
%endif
psraw %1, 6
%endmacro
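; (punpcklbw turns each tc0 byte b into the word 257*b; the arithmetic shift by
; 6 then gives 4*b for the small tc0 values used here - matching the <<2 applied
; to alpha/beta in the 10-bit functions - while a 0xFF "skip" entry stays -1.)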
 
; in: %1=p1, %2=p0, %3=q0, %4=q1
; %5=alpha, %6=beta, %7-%9=tmp
; out: %7=mask
%macro LOAD_MASK 9
ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha
ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta
pand %8, %9
ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta
pxor %7, %7
pand %8, %9
pcmpgtw %7, %8
%endmacro
 
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', m2=q0'
%macro DEBLOCK_P0_Q0 7
psubw %3, %4
pxor %7, %7
paddw %3, [pw_4]
psubw %7, %5
psubw %6, %2, %1
psllw %6, 2
paddw %3, %6
psraw %3, 3
mova %6, [pw_pixel_max]
CLIPW %3, %7, %5
pxor %7, %7
paddw %1, %3
psubw %2, %3
CLIPW %1, %7, %6
CLIPW %2, %7, %6
%endmacro
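; i.e. the plain 16-bit form of the standard filter:
;   delta = clip3(-tc, tc, ((q0-p0)*4 + (p1-q1) + 4) >> 3)
;   p0'   = clip(p0 + delta, 0, pixel_max)
;   q0'   = clip(q0 - delta, 0, pixel_max)
; (%5 is expected to hold tc already AND-ed with the filter mask).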
 
; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp
%macro LUMA_Q1 6
pavgw %6, %3, %4 ; (p0+q0+1)>>1
paddw %1, %6
pxor %6, %6
psraw %1, 1
psubw %6, %5
psubw %1, %2
CLIPW %1, %6, %5
paddw %1, %2
%endmacro
 
%macro LUMA_DEBLOCK_ONE 3
DIFF_LT m5, %1, bm, m4, m6
pxor m6, m6
mova %3, m4
pcmpgtw m6, tcm
pand m4, tcm
pandn m6, m7
pand m4, m6
LUMA_Q1 m5, %2, m1, m2, m4, m6
%endmacro
 
%macro LUMA_H_STORE 2
%if mmsize == 8
movq [r0-4], m0
movq [r0+r1-4], m1
movq [r0+r1*2-4], m2
movq [r0+%2-4], m3
%else
movq [r0-4], m0
movhps [r0+r1-4], m0
movq [r0+r1*2-4], m1
movhps [%1-4], m1
movq [%1+r1-4], m2
movhps [%1+r1*2-4], m2
movq [%1+%2-4], m3
movhps [%1+r1*4-4], m3
%endif
%endmacro
 
%macro DEBLOCK_LUMA 0
;-----------------------------------------------------------------------------
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_10, 5,5,8*(mmsize/16)
%assign pad 5*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
%define ms2 [rsp+mmsize*2]
%define am [rsp+mmsize*3]
%define bm [rsp+mmsize*4]
SUB rsp, pad
shl r2d, 2
shl r3d, 2
LOAD_AB m4, m5, r2d, r3d
mov r3, 32/mmsize
mov r2, r0
sub r0, r1
mova am, m4
sub r0, r1
mova bm, m5
sub r0, r1
.loop:
mova m0, [r0+r1]
mova m1, [r0+r1*2]
mova m2, [r2]
mova m3, [r2+r1]
 
LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
LOAD_TC m6, r4
mova tcm, m6
 
mova m5, [r0]
LUMA_DEBLOCK_ONE m1, m0, ms1
mova [r0+r1], m5
 
mova m5, [r2+r1*2]
LUMA_DEBLOCK_ONE m2, m3, ms2
mova [r2+r1], m5
 
pxor m5, m5
mova m6, tcm
pcmpgtw m5, tcm
psubw m6, ms1
pandn m5, m7
psubw m6, ms2
pand m5, m6
DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
mova [r0+r1*2], m1
mova [r2], m2
 
add r0, mmsize
add r2, mmsize
add r4, mmsize/8
dec r3
jg .loop
ADD rsp, pad
RET
 
cglobal deblock_h_luma_10, 5,6,8*(mmsize/16)
%assign pad 7*mmsize+12-(stack_offset&15)
%define tcm [rsp]
%define ms1 [rsp+mmsize]
%define ms2 [rsp+mmsize*2]
%define p1m [rsp+mmsize*3]
%define p2m [rsp+mmsize*4]
%define am [rsp+mmsize*5]
%define bm [rsp+mmsize*6]
SUB rsp, pad
shl r2d, 2
shl r3d, 2
LOAD_AB m4, m5, r2d, r3d
mov r3, r1
mova am, m4
add r3, r1
mov r5, 32/mmsize
mova bm, m5
add r3, r1
%if mmsize == 16
mov r2, r0
add r2, r3
%endif
.loop:
%if mmsize == 8
movq m2, [r0-8] ; y q2 q1 q0
movq m7, [r0+0]
movq m5, [r0+r1-8]
movq m3, [r0+r1+0]
movq m0, [r0+r1*2-8]
movq m6, [r0+r1*2+0]
movq m1, [r0+r3-8]
TRANSPOSE4x4W 2, 5, 0, 1, 4
SWAP 2, 7
movq m7, [r0+r3]
TRANSPOSE4x4W 2, 3, 6, 7, 4
%else
movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
movu m0, [r0+r1-8]
movu m2, [r0+r1*2-8]
movu m3, [r2-8]
TRANSPOSE4x4W 5, 0, 2, 3, 6
mova tcm, m3
 
movu m4, [r2+r1-8]
movu m1, [r2+r1*2-8]
movu m3, [r2+r3-8]
movu m7, [r2+r1*4-8]
TRANSPOSE4x4W 4, 1, 3, 7, 6
 
mova m6, tcm
punpcklqdq m6, m7
punpckhqdq m5, m4
SBUTTERFLY qdq, 0, 1, 7
SBUTTERFLY qdq, 2, 3, 7
%endif
 
mova p2m, m6
LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6
LOAD_TC m6, r4
mova tcm, m6
 
LUMA_DEBLOCK_ONE m1, m0, ms1
mova p1m, m5
 
mova m5, p2m
LUMA_DEBLOCK_ONE m2, m3, ms2
mova p2m, m5
 
pxor m5, m5
mova m6, tcm
pcmpgtw m5, tcm
psubw m6, ms1
pandn m5, m7
psubw m6, ms2
pand m5, m6
DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6
mova m0, p1m
mova m3, p2m
TRANSPOSE4x4W 0, 1, 2, 3, 4
LUMA_H_STORE r2, r3
 
add r4, mmsize/8
lea r0, [r0+r1*(mmsize/2)]
lea r2, [r2+r1*(mmsize/2)]
dec r5
jg .loop
ADD rsp, pad
RET
%endmacro
 
%if ARCH_X86_64
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2
; m12=alpha, m13=beta
; out: m0=p1', m3=q1', m1=p0', m2=q0'
; clobbers: m4, m5, m6, m7, m10, m11, m14
%macro DEBLOCK_LUMA_INTER_SSE2 0
LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6
LOAD_TC m6, r4
DIFF_LT m8, m1, m13, m10, m4
DIFF_LT m9, m2, m13, m11, m4
pand m6, m7
 
mova m14, m6
pxor m4, m4
pcmpgtw m6, m4
pand m6, m14
 
mova m5, m10
pand m5, m6
LUMA_Q1 m8, m0, m1, m2, m5, m4
 
mova m5, m11
pand m5, m6
LUMA_Q1 m9, m3, m1, m2, m5, m4
 
pxor m4, m4
psubw m6, m10
pcmpgtw m4, m14
pandn m4, m7
psubw m6, m11
pand m4, m6
DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6
 
SWAP 0, 8
SWAP 3, 9
%endmacro
 
%macro DEBLOCK_LUMA_64 0
cglobal deblock_v_luma_10, 5,5,15
%define p2 m8
%define p1 m0
%define p0 m1
%define q0 m2
%define q1 m3
%define q2 m9
%define mask0 m7
%define mask1 m10
%define mask2 m11
shl r2d, 2
shl r3d, 2
LOAD_AB m12, m13, r2d, r3d
mov r2, r0
sub r0, r1
sub r0, r1
sub r0, r1
mov r3, 2
.loop:
mova p2, [r0]
mova p1, [r0+r1]
mova p0, [r0+r1*2]
mova q0, [r2]
mova q1, [r2+r1]
mova q2, [r2+r1*2]
DEBLOCK_LUMA_INTER_SSE2
mova [r0+r1], p1
mova [r0+r1*2], p0
mova [r2], q0
mova [r2+r1], q1
add r0, mmsize
add r2, mmsize
add r4, 2
dec r3
jg .loop
REP_RET
 
cglobal deblock_h_luma_10, 5,7,15
shl r2d, 2
shl r3d, 2
LOAD_AB m12, m13, r2d, r3d
mov r2, r1
add r2, r1
add r2, r1
mov r5, r0
add r5, r2
mov r6, 2
.loop:
movu m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x
movu m0, [r0+r1-8]
movu m2, [r0+r1*2-8]
movu m9, [r5-8]
movu m5, [r5+r1-8]
movu m1, [r5+r1*2-8]
movu m3, [r5+r2-8]
movu m7, [r5+r1*4-8]
 
TRANSPOSE4x4W 8, 0, 2, 9, 10
TRANSPOSE4x4W 5, 1, 3, 7, 10
 
punpckhqdq m8, m5
SBUTTERFLY qdq, 0, 1, 10
SBUTTERFLY qdq, 2, 3, 10
punpcklqdq m9, m7
 
DEBLOCK_LUMA_INTER_SSE2
 
TRANSPOSE4x4W 0, 1, 2, 3, 4
LUMA_H_STORE r5, r2
add r4, 2
lea r0, [r0+r1*8]
lea r5, [r5+r1*8]
dec r6
jg .loop
REP_RET
%endmacro
 
INIT_XMM sse2
DEBLOCK_LUMA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_64
%endif
%endif
 
%macro SWAPMOVA 2
%ifid %1
SWAP %1, %2
%else
mova %1, %2
%endif
%endmacro
 
; in: t0-t2: tmp registers
; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0
; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2'
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory
%if ARCH_X86_64
paddw t0, %3, %2
mova t2, %4
paddw t2, %3
%else
mova t0, %3
mova t2, %4
paddw t0, %2
paddw t2, %3
%endif
paddw t0, %1
paddw t2, t2
paddw t0, %5
paddw t2, %9
paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2)
paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4)
 
psrlw t2, 3
psrlw t1, t0, 2
psubw t2, %3
psubw t1, %2
pand t2, %8
pand t1, %8
paddw t2, %3
paddw t1, %2
SWAPMOVA %11, t1
 
psubw t1, t0, %3
paddw t0, t0
psubw t1, %5
psubw t0, %3
paddw t1, %6
paddw t1, %2
paddw t0, %6
psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4
psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3
 
pxor t0, t1
pxor t1, %1
pand t0, %8
pand t1, %7
pxor t0, t1
pxor t0, %1
SWAPMOVA %10, t0
SWAPMOVA %12, t2
%endmacro
 
%macro LUMA_INTRA_INIT 1
%xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15)
%define t0 m4
%define t1 m5
%define t2 m6
%define t3 m7
%assign i 4
%rep %1
CAT_XDEFINE t, i, [rsp+mmsize*(i-4)]
%assign i i+1
%endrep
SUB rsp, pad
%endmacro
 
; in: %1-%3=tmp, %4=p2, %5=q2
%macro LUMA_INTRA_INTER 5
LOAD_AB t0, t1, r2d, r3d
mova %1, t0
LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3
%if ARCH_X86_64
mova %2, t0 ; mask0
psrlw t3, %1, 2
%else
mova t3, %1
mova %2, t0 ; mask0
psrlw t3, 2
%endif
paddw t3, [pw_2] ; alpha/4+2
DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2
pand t2, %2
mova t3, %5 ; q2
mova %1, t2 ; mask1
DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta
pand t2, %1
mova t3, %4 ; p2
mova %3, t2 ; mask1q
DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta
pand t2, %1
mova %1, t2 ; mask1p
%endmacro
 
%macro LUMA_H_INTRA_LOAD 0
%if mmsize == 8
movu t0, [r0-8]
movu t1, [r0+r1-8]
movu m0, [r0+r1*2-8]
movu m1, [r0+r4-8]
TRANSPOSE4x4W 4, 5, 0, 1, 2
mova t4, t0 ; p3
mova t5, t1 ; p2
 
movu m2, [r0]
movu m3, [r0+r1]
movu t0, [r0+r1*2]
movu t1, [r0+r4]
TRANSPOSE4x4W 2, 3, 4, 5, 6
mova t6, t0 ; q2
mova t7, t1 ; q3
%else
movu t0, [r0-8]
movu t1, [r0+r1-8]
movu m0, [r0+r1*2-8]
movu m1, [r0+r5-8]
movu m2, [r4-8]
movu m3, [r4+r1-8]
movu t2, [r4+r1*2-8]
movu t3, [r4+r5-8]
TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5
mova t4, t0 ; p3
mova t5, t1 ; p2
mova t6, t2 ; q2
mova t7, t3 ; q3
%endif
%endmacro
 
; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp
%macro LUMA_H_INTRA_STORE 9
%if mmsize == 8
TRANSPOSE4x4W %1, %2, %3, %4, %9
movq [r0-8], m%1
movq [r0+r1-8], m%2
movq [r0+r1*2-8], m%3
movq [r0+r4-8], m%4
movq m%1, %8
TRANSPOSE4x4W %5, %6, %7, %1, %9
movq [r0], m%5
movq [r0+r1], m%6
movq [r0+r1*2], m%7
movq [r0+r4], m%1
%else
TRANSPOSE2x4x4W %1, %2, %3, %4, %9
movq [r0-8], m%1
movq [r0+r1-8], m%2
movq [r0+r1*2-8], m%3
movq [r0+r5-8], m%4
movhps [r4-8], m%1
movhps [r4+r1-8], m%2
movhps [r4+r1*2-8], m%3
movhps [r4+r5-8], m%4
%ifnum %8
SWAP %1, %8
%else
mova m%1, %8
%endif
TRANSPOSE2x4x4W %5, %6, %7, %1, %9
movq [r0], m%5
movq [r0+r1], m%6
movq [r0+r1*2], m%7
movq [r0+r5], m%1
movhps [r4], m%5
movhps [r4+r1], m%6
movhps [r4+r1*2], m%7
movhps [r4+r5], m%1
%endif
%endmacro
 
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
%macro DEBLOCK_LUMA_INTRA_64 0
cglobal deblock_v_luma_intra_10, 4,7,16
%define t0 m1
%define t1 m2
%define t2 m4
%define p2 m8
%define p1 m9
%define p0 m10
%define q0 m11
%define q1 m12
%define q2 m13
%define aa m5
%define bb m14
lea r4, [r1*4]
lea r5, [r1*3] ; 3*stride
neg r4
add r4, r0 ; pix-4*stride
mov r6, 2
mova m0, [pw_2]
shl r2d, 2
shl r3d, 2
LOAD_AB aa, bb, r2d, r3d
.loop:
mova p2, [r4+r1]
mova p1, [r4+2*r1]
mova p0, [r4+r5]
mova q0, [r0]
mova q1, [r0+r1]
mova q2, [r0+2*r1]
 
LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1
mova t2, aa
psrlw t2, 2
paddw t2, m0 ; alpha/4+2
DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2
DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta
DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta
pand m6, m3
pand m7, m6
pand m6, t1
LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1]
LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1]
add r0, mmsize
add r4, mmsize
dec r6
jg .loop
REP_RET
 
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,16
%define t0 m15
%define t1 m14
%define t2 m2
%define q3 m5
%define q2 m8
%define q1 m9
%define q0 m10
%define p0 m11
%define p1 m12
%define p2 m13
%define p3 m4
%define spill [rsp]
%assign pad 24-(stack_offset&15)
SUB rsp, pad
lea r4, [r1*4]
lea r5, [r1*3] ; 3*stride
add r4, r0 ; pix+4*stride
mov r6, 2
mova m0, [pw_2]
shl r2d, 2
shl r3d, 2
.loop:
movu q3, [r0-8]
movu q2, [r0+r1-8]
movu q1, [r0+r1*2-8]
movu q0, [r0+r5-8]
movu p0, [r4-8]
movu p1, [r4+r1-8]
movu p2, [r4+r1*2-8]
movu p3, [r4+r5-8]
TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1
 
LOAD_AB m1, m2, r2d, r3d
LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1
psrlw m1, 2
paddw m1, m0 ; alpha/4+2
DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2
DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta
DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta
pand m6, m3
pand m7, m6
pand m6, t1
 
mova spill, q3
LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2
LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2
mova m7, spill
 
LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14
 
lea r0, [r0+r1*8]
lea r4, [r4+r1*8]
dec r6
jg .loop
ADD rsp, pad
RET
%endmacro
 
INIT_XMM sse2
DEBLOCK_LUMA_INTRA_64
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA_64
%endif
 
%endif
 
%macro DEBLOCK_LUMA_INTRA 0
;-----------------------------------------------------------------------------
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16)
LUMA_INTRA_INIT 3
lea r4, [r1*4]
lea r5, [r1*3]
neg r4
add r4, r0
mov r6, 32/mmsize
shl r2d, 2
shl r3d, 2
.loop:
mova m0, [r4+r1*2] ; p1
mova m1, [r4+r5] ; p0
mova m2, [r0] ; q0
mova m3, [r0+r1] ; q1
LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2]
LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1]
mova t3, [r0+r1*2] ; q2
LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1]
add r0, mmsize
add r4, mmsize
dec r6
jg .loop
ADD rsp, pad
RET
 
;-----------------------------------------------------------------------------
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16)
LUMA_INTRA_INIT 8
%if mmsize == 8
lea r4, [r1*3]
mov r5, 32/mmsize
%else
lea r4, [r1*4]
lea r5, [r1*3] ; 3*stride
add r4, r0 ; pix+4*stride
mov r6, 32/mmsize
%endif
shl r2d, 2
shl r3d, 2
.loop:
LUMA_H_INTRA_LOAD
LUMA_INTRA_INTER t8, t9, t10, t5, t6
 
LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11
mova t3, t6 ; q2
LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5
 
mova m2, t4
mova m0, t11
mova m1, t5
mova m3, t8
mova m6, t6
 
LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7
 
lea r0, [r0+r1*(mmsize/2)]
%if mmsize == 8
dec r5
%else
lea r4, [r4+r1*(mmsize/2)]
dec r6
%endif
jg .loop
ADD rsp, pad
RET
%endmacro
 
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
INIT_XMM sse2
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
DEBLOCK_LUMA_INTRA
%endif
%endif
 
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp
; out: %1=p0', %2=q0'
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7
mova %6, [pw_2]
paddw %6, %3
paddw %6, %4
paddw %7, %6, %2
paddw %6, %1
paddw %6, %3
paddw %7, %4
psraw %6, 2
psraw %7, 2
psubw %6, %1
psubw %7, %2
pand %6, %5
pand %7, %5
paddw %1, %6
paddw %2, %7
%endmacro
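; i.e. p0' = (2*p1 + q1 + p0 + 2) >> 2 and q0' = (2*q1 + p1 + q0 + 2) >> 2,
; applied only where the mask in %5 is set.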
 
%macro CHROMA_V_LOAD 1
mova m0, [r0] ; p1
mova m1, [r0+r1] ; p0
mova m2, [%1] ; q0
mova m3, [%1+r1] ; q1
%endmacro
 
%macro CHROMA_V_STORE 0
mova [r0+1*r1], m1
mova [r0+2*r1], m2
%endmacro
 
%macro CHROMA_V_LOAD_TC 2
movd %1, [%2]
punpcklbw %1, %1
punpcklwd %1, %1
psraw %1, 6
%endmacro
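; (same trick as LOAD_TC above: each tc0 byte ends up as 4*tc0 in a pair of
; adjacent words, i.e. one tc0 value per two chroma pixels.)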
 
%macro DEBLOCK_CHROMA 0
;-----------------------------------------------------------------------------
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16)
mov r5, r0
sub r0, r1
sub r0, r1
shl r2d, 2
shl r3d, 2
%if mmsize < 16
mov r6, 16/mmsize
.loop:
%endif
CHROMA_V_LOAD r5
LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
pxor m4, m4
CHROMA_V_LOAD_TC m6, r4
psubw m6, [pw_3]
pmaxsw m6, m4
pand m7, m6
DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6
CHROMA_V_STORE
%if mmsize < 16
add r0, mmsize
add r5, mmsize
add r4, mmsize/4
dec r6
jg .loop
REP_RET
%else
RET
%endif
 
;-----------------------------------------------------------------------------
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta )
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16)
mov r4, r0
sub r0, r1
sub r0, r1
shl r2d, 2
shl r3d, 2
%if mmsize < 16
mov r5, 16/mmsize
.loop:
%endif
CHROMA_V_LOAD r4
LOAD_AB m4, m5, r2d, r3d
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6
CHROMA_V_STORE
%if mmsize < 16
add r0, mmsize
add r4, mmsize
dec r5
jg .loop
REP_RET
%else
RET
%endif
%endmacro
 
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_CHROMA
%endif
INIT_XMM sse2
DEBLOCK_CHROMA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_CHROMA
%endif
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_i386.h
0,0 → 1,204
/*
* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
/**
* @file
* H.264 / AVC / MPEG4 part10 codec.
* non-MMX i386-specific optimizations for H.264
* @author Michael Niedermayer <michaelni@gmx.at>
*/
 
#ifndef AVCODEC_X86_H264_I386_H
#define AVCODEC_X86_H264_I386_H
 
#include <stddef.h>
 
#include "libavcodec/cabac.h"
#include "cabac.h"
 
#if HAVE_INLINE_ASM
 
//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet,
//as that would make the optimization work harder)
#if HAVE_7REGS
#define decode_significance decode_significance_x86
static int decode_significance_x86(CABACContext *c, int max_coeff,
uint8_t *significant_coeff_ctx_base,
int *index, x86_reg last_off){
void *end= significant_coeff_ctx_base + max_coeff - 1;
int minusstart= -(intptr_t)significant_coeff_ctx_base;
int minusindex= 4-(intptr_t)index;
int bit;
x86_reg coeff_count;
 
#ifdef BROKEN_RELOCATIONS
void *tables;
 
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
);
#endif
 
__asm__ volatile(
"3: \n\t"
 
BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c11(%6)", "%c12(%6)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%13")
 
"test $1, %4 \n\t"
" jz 4f \n\t"
"add %10, %1 \n\t"
 
BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c11(%6)", "%c12(%6)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%13")
 
"sub %10, %1 \n\t"
"mov %2, %0 \n\t"
"movl %7, %%ecx \n\t"
"add %1, %%"REG_c" \n\t"
"movl %%ecx, (%0) \n\t"
 
"test $1, %4 \n\t"
" jnz 5f \n\t"
 
"add"OPSIZE" $4, %2 \n\t"
 
"4: \n\t"
"add $1, %1 \n\t"
"cmp %8, %1 \n\t"
" jb 3b \n\t"
"mov %2, %0 \n\t"
"movl %7, %%ecx \n\t"
"add %1, %%"REG_c" \n\t"
"movl %%ecx, (%0) \n\t"
"5: \n\t"
"add %9, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index),
"+&r"(c->low), "=&r"(bit), "+&r"(c->range)
: "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end))
TABLES_ARG
: "%"REG_c, "memory"
);
return coeff_count;
}
 
#define decode_significance_8x8 decode_significance_8x8_x86
static int decode_significance_8x8_x86(CABACContext *c,
uint8_t *significant_coeff_ctx_base,
int *index, uint8_t *last_coeff_ctx_base, const uint8_t *sig_off){
int minusindex= 4-(intptr_t)index;
int bit;
x86_reg coeff_count;
x86_reg last=0;
x86_reg state;
 
#ifdef BROKEN_RELOCATIONS
void *tables;
 
__asm__ volatile(
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t"
: "=&r"(tables)
);
#endif
 
__asm__ volatile(
"mov %1, %6 \n\t"
"3: \n\t"
 
"mov %10, %0 \n\t"
"movzbl (%0, %6), %k6 \n\t"
"add %9, %6 \n\t"
 
BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c12(%7)", "%c13(%7)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%15")
 
"mov %1, %k6 \n\t"
"test $1, %4 \n\t"
" jz 4f \n\t"
 
#ifdef BROKEN_RELOCATIONS
"movzbl %c14(%15, %q6), %k6\n\t"
#else
"movzbl "MANGLE(ff_h264_cabac_tables)"+%c14(%k6), %k6\n\t"
#endif
"add %11, %6 \n\t"
 
BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3",
"%5", "%q5", "%k0", "%b0",
"%c12(%7)", "%c13(%7)",
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET),
AV_STRINGIFY(H264_LPS_RANGE_OFFSET),
AV_STRINGIFY(H264_MLPS_STATE_OFFSET),
"%15")
 
"mov %2, %0 \n\t"
"mov %1, %k6 \n\t"
"movl %k6, (%0) \n\t"
 
"test $1, %4 \n\t"
" jnz 5f \n\t"
 
"add"OPSIZE" $4, %2 \n\t"
 
"4: \n\t"
"addl $1, %k6 \n\t"
"mov %k6, %1 \n\t"
"cmpl $63, %k6 \n\t"
" jb 3b \n\t"
"mov %2, %0 \n\t"
"movl %k6, (%0) \n\t"
"5: \n\t"
"addl %8, %k0 \n\t"
"shr $2, %k0 \n\t"
: "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low),
"=&r"(bit), "+&r"(c->range), "=&r"(state)
: "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base),
"m"(sig_off), "m"(last_coeff_ctx_base),
"i"(offsetof(CABACContext, bytestream)),
"i"(offsetof(CABACContext, bytestream_end)),
"i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG
: "%"REG_c, "memory"
);
return coeff_count;
}
#endif /* HAVE_7REGS */
 
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_H264_I386_H */
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_idct.asm
0,0 → 1,1082
;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar <fenrir@via.ecp.fr>
;* Loren Merritt <lorenm@u.washington.edu>
;* Holger Lubitz <hal@duncan.ol.sub.de>
;* Min Chen <chenm001.163.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
db 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
db 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
db 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
db 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
db 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
db 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
db 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
db 4+11*8, 5+11*8, 4+12*8, 5+12*8
db 6+11*8, 7+11*8, 6+12*8, 7+12*8
db 4+13*8, 5+13*8, 4+14*8, 5+14*8
db 6+13*8, 7+13*8, 6+14*8, 7+14*8
%ifdef PIC
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif
 
cextern pw_32
cextern pw_1
 
SECTION .text
 
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT4_ADD 3
; Load dct coeffs
movq m0, [%2]
movq m1, [%2+8]
movq m2, [%2+16]
movq m3, [%2+24]
 
IDCT4_1D w, 0, 1, 2, 3, 4, 5
mova m6, [pw_32]
TRANSPOSE4x4W 0, 1, 2, 3, 4
paddw m0, m6
IDCT4_1D w, 0, 1, 2, 3, 4, 5
pxor m7, m7
movq [%2+ 0], m7
movq [%2+ 8], m7
movq [%2+16], m7
movq [%2+24], m7
 
STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro
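; Standard 4x4 inverse transform: one 1-D pass, a transpose, a second 1-D pass
; (with the +32 rounding folded into the DC row beforehand), then the
; coefficient block is zeroed and the >>6 result is added to dst.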
 
INIT_MMX mmx
; ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8, 3, 3, 0
IDCT4_ADD r0, r1, r2
RET
 
%macro IDCT8_1D 2
mova m0, m1
psraw m1, 1
mova m4, m5
psraw m4, 1
paddw m4, m5
paddw m1, m0
paddw m4, m7
paddw m1, m5
psubw m4, m0
paddw m1, m3
 
psubw m0, m3
psubw m5, m3
psraw m3, 1
paddw m0, m7
psubw m5, m7
psraw m7, 1
psubw m0, m3
psubw m5, m7
 
mova m7, m1
psraw m1, 2
mova m3, m4
psraw m3, 2
paddw m3, m0
psraw m0, 2
paddw m1, m5
psraw m5, 2
psubw m0, m4
psubw m7, m5
 
mova m5, m6
psraw m6, 1
mova m4, m2
psraw m4, 1
paddw m6, m2
psubw m4, m5
 
mova m2, %1
mova m5, %2
SUMSUB_BA w, 5, 2
SUMSUB_BA w, 6, 5
SUMSUB_BA w, 4, 2
SUMSUB_BA w, 7, 6
SUMSUB_BA w, 0, 4
SUMSUB_BA w, 3, 2
SUMSUB_BA w, 1, 5
SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro
 
%macro IDCT8_1D_FULL 1
mova m7, [%1+112]
mova m6, [%1+ 96]
mova m5, [%1+ 80]
mova m3, [%1+ 48]
mova m2, [%1+ 32]
mova m1, [%1+ 16]
IDCT8_1D [%1], [%1+ 64]
%endmacro
 
; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
IDCT8_1D_FULL %1
mova [%1], m7
TRANSPOSE4x4W 0, 1, 2, 3, 7
mova m7, [%1]
mova [%2 ], m0
mova [%2+16], m1
mova [%2+32], m2
mova [%2+48], m3
TRANSPOSE4x4W 4, 5, 6, 7, 3
mova [%2+ 8], m4
mova [%2+24], m5
mova [%2+40], m6
mova [%2+56], m7
%endmacro
 
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3-4
IDCT8_1D_FULL %2
mova [%2 ], m5
mova [%2+16], m6
mova [%2+32], m7
 
pxor m7, m7
%if %0 == 4
movq [%4+ 0], m7
movq [%4+ 8], m7
movq [%4+ 16], m7
movq [%4+ 24], m7
movq [%4+ 32], m7
movq [%4+ 40], m7
movq [%4+ 48], m7
movq [%4+ 56], m7
movq [%4+ 64], m7
movq [%4+ 72], m7
movq [%4+ 80], m7
movq [%4+ 88], m7
movq [%4+ 96], m7
movq [%4+104], m7
movq [%4+112], m7
movq [%4+120], m7
%endif
STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
mova m0, [%2 ]
mova m1, [%2+16]
mova m2, [%2+32]
lea %1, [%1+%3*2]
STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro
 
INIT_MMX mmx
; ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 0
%assign pad 128+4-(stack_offset&7)
SUB rsp, pad
 
add word [r1], 32
IDCT8_ADD_MMX_START r1 , rsp
IDCT8_ADD_MMX_START r1+8, rsp+64
lea r3, [r0+4]
IDCT8_ADD_MMX_END r0 , rsp, r2, r1
IDCT8_ADD_MMX_END r3 , rsp+8, r2
 
ADD rsp, pad
RET
 
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE 4
IDCT8_1D_FULL %2
%if ARCH_X86_64
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
paddw m0, [pw_32]
 
%if ARCH_X86_64 == 0
mova [%2 ], m0
mova [%2+16], m4
IDCT8_1D [%2], [%2+ 16]
mova [%2 ], m6
mova [%2+16], m7
%else
SWAP 0, 8
SWAP 4, 9
IDCT8_1D m8, m9
SWAP 6, 8
SWAP 7, 9
%endif
 
pxor m7, m7
lea %4, [%3*3]
STORE_DIFF m0, m6, m7, [%1 ]
STORE_DIFF m1, m6, m7, [%1+%3 ]
STORE_DIFF m2, m6, m7, [%1+%3*2]
STORE_DIFF m3, m6, m7, [%1+%4 ]
%if ARCH_X86_64 == 0
mova m0, [%2 ]
mova m1, [%2+16]
%else
SWAP 0, 8
SWAP 1, 9
%endif
mova [%2+ 0], m7
mova [%2+ 16], m7
mova [%2+ 32], m7
mova [%2+ 48], m7
mova [%2+ 64], m7
mova [%2+ 80], m7
mova [%2+ 96], m7
mova [%2+112], m7
lea %1, [%1+%3*4]
STORE_DIFF m4, m6, m7, [%1 ]
STORE_DIFF m5, m6, m7, [%1+%3 ]
STORE_DIFF m0, m6, m7, [%1+%3*2]
STORE_DIFF m1, m6, m7, [%1+%4 ]
%endmacro
 
INIT_XMM sse2
; ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
IDCT8_ADD_SSE r0, r1, r2, r3
RET
 
%macro DC_ADD_MMXEXT_INIT 2
add %1, 32
sar %1, 6
movd m0, %1d
lea %1, [%2*3]
pshufw m0, m0, 0
pxor m1, m1
psubw m1, m0
packuswb m0, m0
packuswb m1, m1
%endmacro
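; dc = (dc + 32) >> 6 is splatted into m0 and its negation into m1 as saturated
; bytes; DC_ADD_MMXEXT_OP adds m0 and subtracts m1 with unsigned saturation,
; which applies a DC offset of either sign while clamping to [0,255].
; The lea also leaves 3*stride in %1 for the caller's addressing.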
 
%macro DC_ADD_MMXEXT_OP 4
%1 m2, [%2 ]
%1 m3, [%2+%3 ]
%1 m4, [%2+%3*2]
%1 m5, [%2+%4 ]
paddusb m2, m0
paddusb m3, m0
paddusb m4, m0
paddusb m5, m0
psubusb m2, m1
psubusb m3, m1
psubusb m4, m1
psubusb m5, m1
%1 [%2 ], m2
%1 [%2+%3 ], m3
%1 [%2+%3*2], m4
%1 [%2+%4 ], m5
%endmacro
 
INIT_MMX mmxext
; ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
%if ARCH_X86_64
cglobal h264_idct_dc_add_8, 3, 4, 0
movsx r3, word [r1]
mov dword [r1], 0
DC_ADD_MMXEXT_INIT r3, r2
DC_ADD_MMXEXT_OP movh, r0, r2, r3
RET
 
; ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
movsx r3, word [r1]
mov dword [r1], 0
DC_ADD_MMXEXT_INIT r3, r2
DC_ADD_MMXEXT_OP mova, r0, r2, r3
lea r0, [r0+r2*4]
DC_ADD_MMXEXT_OP mova, r0, r2, r3
RET
%else
; ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8, 2, 3, 0
movsx r2, word [r1]
mov dword [r1], 0
mov r1, r2m
DC_ADD_MMXEXT_INIT r2, r1
DC_ADD_MMXEXT_OP movh, r0, r1, r2
RET
 
; ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
movsx r2, word [r1]
mov dword [r1], 0
mov r1, r2m
DC_ADD_MMXEXT_INIT r2, r1
DC_ADD_MMXEXT_OP mova, r0, r1, r2
lea r0, [r0+r1*4]
DC_ADD_MMXEXT_OP mova, r0, r1, r2
RET
%endif
 
INIT_MMX mmx
; ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock:
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
test r6, r6
jz .skipblock
mov r6d, dword [r1+r5*4]
lea r6, [r0+r6]
IDCT4_ADD r6, r2, r3
.skipblock:
inc r5
add r2, 32
cmp r5, 16
jl .nextblock
REP_RET
 
; ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
%assign pad 128+4-(stack_offset&7)
SUB rsp, pad
 
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock:
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
test r6, r6
jz .skipblock
mov r6d, dword [r1+r5*4]
add r6, r0
add word [r2], 32
IDCT8_ADD_MMX_START r2 , rsp
IDCT8_ADD_MMX_START r2+8, rsp+64
IDCT8_ADD_MMX_END r6 , rsp, r3, r2
mov r6d, dword [r1+r5*4]
lea r6, [r0+r6+4]
IDCT8_ADD_MMX_END r6 , rsp+8, r3
.skipblock:
add r5, 4
add r2, 128
cmp r5, 16
jl .nextblock
ADD rsp, pad
RET
 
INIT_MMX mmxext
; ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock:
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
test r6, r6
jz .skipblock
cmp r6, 1
jnz .no_dc
movsx r6, word [r2]
test r6, r6
jz .no_dc
mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
mov dst2d, dword [r1+r5*4]
lea dst2q, [r0+dst2q]
DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
mov r1, r1m
%endif
inc r5
add r2, 32
cmp r5, 16
jl .nextblock
REP_RET
.no_dc:
mov r6d, dword [r1+r5*4]
add r6, r0
IDCT4_ADD r6, r2, r3
.skipblock:
inc r5
add r2, 32
cmp r5, 16
jl .nextblock
REP_RET
 
INIT_MMX mmx
; ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock:
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
or r6w, word [r2]
test r6, r6
jz .skipblock
mov r6d, dword [r1+r5*4]
add r6, r0
IDCT4_ADD r6, r2, r3
.skipblock:
inc r5
add r2, 32
cmp r5, 16
jl .nextblock
REP_RET
 
INIT_MMX mmxext
; ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock:
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
test r6, r6
jz .try_dc
mov r6d, dword [r1+r5*4]
lea r6, [r0+r6]
IDCT4_ADD r6, r2, r3
inc r5
add r2, 32
cmp r5, 16
jl .nextblock
REP_RET
.try_dc:
movsx r6, word [r2]
test r6, r6
jz .skipblock
mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
mov dst2d, dword [r1+r5*4]
add dst2q, r0
DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
mov r1, r1m
%endif
.skipblock:
inc r5
add r2, 32
cmp r5, 16
jl .nextblock
REP_RET
 
; ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
%assign pad 128+4-(stack_offset&7)
SUB rsp, pad
 
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock:
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
test r6, r6
jz .skipblock
cmp r6, 1
jnz .no_dc
movsx r6, word [r2]
test r6, r6
jz .no_dc
mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
mov dst2d, dword [r1+r5*4]
lea dst2q, [r0+dst2q]
DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
lea dst2q, [dst2q+r3*4]
DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
mov r1, r1m
%endif
add r5, 4
add r2, 128
cmp r5, 16
jl .nextblock
 
ADD rsp, pad
RET
.no_dc:
mov r6d, dword [r1+r5*4]
add r6, r0
add word [r2], 32
IDCT8_ADD_MMX_START r2 , rsp
IDCT8_ADD_MMX_START r2+8, rsp+64
IDCT8_ADD_MMX_END r6 , rsp, r3, r2
mov r6d, dword [r1+r5*4]
lea r6, [r0+r6+4]
IDCT8_ADD_MMX_END r6 , rsp+8, r3
.skipblock:
add r5, 4
add r2, 128
cmp r5, 16
jl .nextblock
 
ADD rsp, pad
RET
 
INIT_XMM sse2
; ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
xor r5, r5
%ifdef PIC
lea picregq, [scan8_mem]
%endif
.nextblock:
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
test r6, r6
jz .skipblock
cmp r6, 1
jnz .no_dc
movsx r6, word [r2]
test r6, r6
jz .no_dc
INIT_MMX cpuname
mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
mov dst2d, dword [r1+r5*4]
add dst2q, r0
DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
lea dst2q, [dst2q+r3*4]
DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
mov r1, r1m
%endif
add r5, 4
add r2, 128
cmp r5, 16
jl .nextblock
REP_RET
.no_dc:
INIT_XMM cpuname
mov dst2d, dword [r1+r5*4]
add dst2q, r0
IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
mov r1, r1m
%endif
.skipblock:
add r5, 4
add r2, 128
cmp r5, 16
jl .nextblock
REP_RET
 
INIT_MMX mmx
h264_idct_add8_mmx_plane:
.nextblock:
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
or r6w, word [r2]
test r6, r6
jz .skipblock
%if ARCH_X86_64
mov r0d, dword [r1+r5*4]
add r0, [dst2q]
%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
add r0, dword [r1+r5*4]
%endif
IDCT4_ADD r0, r2, r3
.skipblock:
inc r5
add r2, 32
test r5, 3
jnz .nextblock
rep ret
 
; ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
; int16_t *block, int stride, const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
mov r5, 16
add r2, 512
%ifdef PIC
lea picregq, [scan8_mem]
%endif
%if ARCH_X86_64
mov dst2q, r0
%endif
call h264_idct_add8_mmx_plane
mov r5, 32
add r2, 384
%if ARCH_X86_64
add dst2q, gprsize
%else
add r0mp, gprsize
%endif
call h264_idct_add8_mmx_plane
RET
 
h264_idct_add8_mmxext_plane:
.nextblock:
movzx r6, byte [scan8+r5]
movzx r6, byte [r4+r6]
test r6, r6
jz .try_dc
%if ARCH_X86_64
mov r0d, dword [r1+r5*4]
add r0, [dst2q]
%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
add r0, dword [r1+r5*4]
%endif
IDCT4_ADD r0, r2, r3
inc r5
add r2, 32
test r5, 3
jnz .nextblock
rep ret
.try_dc:
movsx r6, word [r2]
test r6, r6
jz .skipblock
mov word [r2], 0
DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64
mov r0d, dword [r1+r5*4]
add r0, [dst2q]
%else
mov r0, r1m ; XXX r1m here is actually r0m of the calling func
mov r0, [r0]
add r0, dword [r1+r5*4]
%endif
DC_ADD_MMXEXT_OP movh, r0, r3, r6
.skipblock:
inc r5
add r2, 32
test r5, 3
jnz .nextblock
rep ret
 
INIT_MMX mmxext
; ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
mov r5, 16
add r2, 512
%if ARCH_X86_64
mov dst2q, r0
%endif
%ifdef PIC
lea picregq, [scan8_mem]
%endif
call h264_idct_add8_mmxext_plane
mov r5, 32
add r2, 384
%if ARCH_X86_64
add dst2q, gprsize
%else
add r0mp, gprsize
%endif
call h264_idct_add8_mmxext_plane
RET
 
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmxext:
movd m0, [r2 ] ; 0 0 X D
mov word [r2+ 0], 0
punpcklwd m0, [r2+32] ; x X d D
mov word [r2+32], 0
paddsw m0, [pw_32]
psraw m0, 6
punpcklwd m0, m0 ; d d D D
pxor m1, m1 ; 0 0 0 0
psubw m1, m0 ; -d-d-D-D
packuswb m0, m1 ; -d-d-D-D d d D D
pshufw m1, m0, 0xFA ; -d-d-d-d-D-D-D-D
punpcklwd m0, m0 ; d d d d D D D D
lea r6, [r3*3]
DC_ADD_MMXEXT_OP movq, r0, r3, r6
ret
 
ALIGN 16
INIT_XMM sse2
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
h264_add8x4_idct_sse2:
movq m0, [r2+ 0]
movq m1, [r2+ 8]
movq m2, [r2+16]
movq m3, [r2+24]
movhps m0, [r2+32]
movhps m1, [r2+40]
movhps m2, [r2+48]
movhps m3, [r2+56]
IDCT4_1D w,0,1,2,3,4,5
TRANSPOSE2x4x4W 0,1,2,3,4
paddw m0, [pw_32]
IDCT4_1D w,0,1,2,3,4,5
pxor m7, m7
mova [r2+ 0], m7
mova [r2+16], m7
mova [r2+32], m7
mova [r2+48], m7
STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
lea r0, [r0+r3*2]
STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
ret
 
%macro add16_sse2_cycle 2
movzx r0, word [r4+%2]
test r0, r0
jz .cycle%1end
mov r0d, dword [r1+%1*8]
%if ARCH_X86_64
add r0, r5
%else
add r0, r0m
%endif
call h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
add r2, 64
%endif
%endmacro
 
; ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
%if ARCH_X86_64
mov r5, r0
%endif
; unrolling of the loop leads to an average performance gain of
; 20-25%
add16_sse2_cycle 0, 0xc
add16_sse2_cycle 1, 0x14
add16_sse2_cycle 2, 0xe
add16_sse2_cycle 3, 0x16
add16_sse2_cycle 4, 0x1c
add16_sse2_cycle 5, 0x24
add16_sse2_cycle 6, 0x1e
add16_sse2_cycle 7, 0x26
RET
 
%macro add16intra_sse2_cycle 2
movzx r0, word [r4+%2]
test r0, r0
jz .try%1dc
mov r0d, dword [r1+%1*8]
%if ARCH_X86_64
add r0, r7
%else
add r0, r0m
%endif
call h264_add8x4_idct_sse2
jmp .cycle%1end
.try%1dc:
movsx r0, word [r2 ]
or r0w, word [r2+32]
jz .cycle%1end
mov r0d, dword [r1+%1*8]
%if ARCH_X86_64
add r0, r7
%else
add r0, r0m
%endif
call h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
add r2, 64
%endif
%endmacro
 
; ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
%if ARCH_X86_64
mov r7, r0
%endif
add16intra_sse2_cycle 0, 0xc
add16intra_sse2_cycle 1, 0x14
add16intra_sse2_cycle 2, 0xe
add16intra_sse2_cycle 3, 0x16
add16intra_sse2_cycle 4, 0x1c
add16intra_sse2_cycle 5, 0x24
add16intra_sse2_cycle 6, 0x1e
add16intra_sse2_cycle 7, 0x26
RET
 
%macro add8_sse2_cycle 2
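; %1 = chroma 8x4 pair (0-3), %2 = nnzc offset; the block_offset byte offset
; below evaluates to index 16, 18, 32 or 34, i.e. the first 4x4 block of each
; 8x4 pair in the two chroma planes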
movzx r0, word [r4+%2]
test r0, r0
jz .try%1dc
%if ARCH_X86_64
mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
add r0, [r7]
%else
mov r0, r0m
mov r0, [r0]
add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
call h264_add8x4_idct_sse2
jmp .cycle%1end
.try%1dc:
movsx r0, word [r2 ]
or r0w, word [r2+32]
jz .cycle%1end
%if ARCH_X86_64
mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
add r0, [r7]
%else
mov r0, r0m
mov r0, [r0]
add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
call h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 == 1
add r2, 384+64
%elif %1 < 3
add r2, 64
%endif
%endmacro
 
; ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
; int16_t *block, int stride,
; const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
add r2, 512
%if ARCH_X86_64
mov r7, r0
%endif
add8_sse2_cycle 0, 0x34
add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
add r7, gprsize
%else
add r0mp, gprsize
%endif
add8_sse2_cycle 2, 0x5c
add8_sse2_cycle 3, 0x64
RET
 
;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)
 
%macro WALSH4_1D 5
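; one 4-point Walsh-Hadamard butterfly pass over %1-%4 (%5 = tmp), used for
; the 4x4 luma DC transform below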
SUMSUB_BADC w, %4, %3, %2, %1, %5
SUMSUB_BADC w, %4, %2, %3, %1, %5
SWAP %1, %4, %3
%endmacro
 
%macro DEQUANT_MMX 3
mova m7, [pw_1]
mova m4, %1
punpcklwd %1, m7
punpckhwd m4, m7
mova m5, %2
punpcklwd %2, m7
punpckhwd m5, m7
movd m7, t3d
punpckldq m7, m7
pmaddwd %1, m7
pmaddwd %2, m7
pmaddwd m4, m7
pmaddwd m5, m7
psrad %1, %3
psrad %2, %3
psrad m4, %3
psrad m5, %3
packssdw %1, m4
packssdw %2, m5
%endmacro
 
%macro STORE_WORDS 5-9
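; scatter the packed dequantized DC words in %1 into the output block: one
; int16 per 4x4 block, 32 bytes apart, at the block indices given by %2-%5
; (mmx) or %2-%9 (sse); t2 holds the output pointer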
%if cpuflag(sse)
movd t0d, %1
psrldq %1, 4
movd t1d, %1
psrldq %1, 4
mov [t2+%2*32], t0w
mov [t2+%4*32], t1w
shr t0d, 16
shr t1d, 16
mov [t2+%3*32], t0w
mov [t2+%5*32], t1w
movd t0d, %1
psrldq %1, 4
movd t1d, %1
mov [t2+%6*32], t0w
mov [t2+%8*32], t1w
shr t0d, 16
shr t1d, 16
mov [t2+%7*32], t0w
mov [t2+%9*32], t1w
%else
movd t0d, %1
psrlq %1, 32
movd t1d, %1
mov [t2+%2*32], t0w
mov [t2+%4*32], t1w
shr t0d, 16
shr t1d, 16
mov [t2+%3*32], t0w
mov [t2+%5*32], t1w
%endif
%endmacro
 
%macro DEQUANT_STORE 1
%if cpuflag(sse2)
movd xmm4, t3d
movq xmm5, [pw_1]
pshufd xmm4, xmm4, 0
movq2dq xmm0, m0
movq2dq xmm1, m1
movq2dq xmm2, m2
movq2dq xmm3, m3
punpcklwd xmm0, xmm5
punpcklwd xmm1, xmm5
punpcklwd xmm2, xmm5
punpcklwd xmm3, xmm5
pmaddwd xmm0, xmm4
pmaddwd xmm1, xmm4
pmaddwd xmm2, xmm4
pmaddwd xmm3, xmm4
psrad xmm0, %1
psrad xmm1, %1
psrad xmm2, %1
psrad xmm3, %1
packssdw xmm0, xmm1
packssdw xmm2, xmm3
STORE_WORDS xmm0, 0, 1, 4, 5, 2, 3, 6, 7
STORE_WORDS xmm2, 8, 9, 12, 13, 10, 11, 14, 15
%else
DEQUANT_MMX m0, m1, %1
STORE_WORDS m0, 0, 1, 4, 5
STORE_WORDS m1, 2, 3, 6, 7
 
DEQUANT_MMX m2, m3, %1
STORE_WORDS m2, 8, 9, 12, 13
STORE_WORDS m3, 10, 11, 14, 15
%endif
%endmacro
 
%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
; manually spill XMM registers for Win64 because
; the code here is initialized with INIT_MMX
WIN64_SPILL_XMM %1
movq m3, [r1+24]
movq m2, [r1+16]
movq m1, [r1+ 8]
movq m0, [r1+ 0]
WALSH4_1D 0,1,2,3,4
TRANSPOSE4x4W 0,1,2,3,4
WALSH4_1D 0,1,2,3,4
 
; shift, tmp, output, qmul
%if WIN64
DECLARE_REG_TMP 0,3,1,2
; we can't avoid this, because r0 is the shift register (ecx) on win64
xchg r0, t2
%elif ARCH_X86_64
DECLARE_REG_TMP 3,1,0,2
%else
DECLARE_REG_TMP 1,3,0,2
%endif
 
cmp t3d, 32767
jg .big_qmul
add t3d, 128 << 16
DEQUANT_STORE 8
RET
.big_qmul:
bsr t0d, t3d
add t3d, 128 << 16
mov t1d, 7
cmp t0d, t1d
cmovg t0d, t1d
inc t1d
shr t3d, t0b
sub t1d, t0d
%if cpuflag(sse2)
movd xmm6, t1d
DEQUANT_STORE xmm6
%else
movd m6, t1d
DEQUANT_STORE m6
%endif
RET
%endmacro
 
INIT_MMX mmx
IDCT_DC_DEQUANT 0
INIT_MMX sse2
IDCT_DC_DEQUANT 7
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_idct_10bit.asm
0,0 → 1,589
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
pw_pixel_max: times 8 dw ((1 << 10)-1)
pd_32: times 4 dd 32
 
SECTION .text
 
;-----------------------------------------------------------------------------
; void h264_idct_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro STORE_DIFFx2 6
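; %1/%2 = two rows of 32-bit coefficients, %3 = scratch, %4 = zero (clip min),
; %5 = dst, %6 = stride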
psrad %1, 6
psrad %2, 6
packssdw %1, %2
movq %3, [%5]
movhps %3, [%5+%6]
paddsw %1, %3
CLIPW %1, %4, [pw_pixel_max]
movq [%5], %1
movhps [%5+%6], %1
%endmacro
 
%macro STORE_DIFF16 5
psrad %1, 6
psrad %2, 6
packssdw %1, %2
paddsw %1, [%5]
CLIPW %1, %3, %4
mova [%5], %1
%endmacro
 
;dst, in, stride
%macro IDCT4_ADD_10 3
mova m0, [%2+ 0]
mova m1, [%2+16]
mova m2, [%2+32]
mova m3, [%2+48]
IDCT4_1D d,0,1,2,3,4,5
TRANSPOSE4x4D 0,1,2,3,4
paddd m0, [pd_32]
IDCT4_1D d,0,1,2,3,4,5
pxor m5, m5
mova [%2+ 0], m5
mova [%2+16], m5
mova [%2+32], m5
mova [%2+48], m5
STORE_DIFFx2 m0, m1, m4, m5, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m2, m3, m4, m5, %1, %3
%endmacro
 
%macro IDCT_ADD_10 0
cglobal h264_idct_add_10, 3,3
IDCT4_ADD_10 r0, r1, r2
RET
%endmacro
 
INIT_XMM sse2
IDCT_ADD_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD_10
%endif
 
;-----------------------------------------------------------------------------
; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
%macro ADD4x4IDCT 0
add4x4_idct %+ SUFFIX:
add r5, r0
mova m0, [r2+ 0]
mova m1, [r2+16]
mova m2, [r2+32]
mova m3, [r2+48]
IDCT4_1D d,0,1,2,3,4,5
TRANSPOSE4x4D 0,1,2,3,4
paddd m0, [pd_32]
IDCT4_1D d,0,1,2,3,4,5
pxor m5, m5
mova [r2+ 0], m5
mova [r2+16], m5
mova [r2+32], m5
mova [r2+48], m5
STORE_DIFFx2 m0, m1, m4, m5, r5, r3
lea r5, [r5+r3*2]
STORE_DIFFx2 m2, m3, m4, m5, r5, r3
ret
%endmacro
 
INIT_XMM sse2
ALIGN 16
ADD4x4IDCT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
ALIGN 16
ADD4x4IDCT
%endif
 
%macro ADD16_OP 2
cmp byte [r4+%2], 0
jz .skipblock%1
mov r5d, [r1+%1*4]
call add4x4_idct %+ SUFFIX
.skipblock%1:
%if %1<15
add r2, 64
%endif
%endmacro
 
%macro IDCT_ADD16_10 0
cglobal h264_idct_add16_10, 5,6
ADD16_OP 0, 4+1*8
ADD16_OP 1, 5+1*8
ADD16_OP 2, 4+2*8
ADD16_OP 3, 5+2*8
ADD16_OP 4, 6+1*8
ADD16_OP 5, 7+1*8
ADD16_OP 6, 6+2*8
ADD16_OP 7, 7+2*8
ADD16_OP 8, 4+3*8
ADD16_OP 9, 5+3*8
ADD16_OP 10, 4+4*8
ADD16_OP 11, 5+4*8
ADD16_OP 12, 6+3*8
ADD16_OP 13, 7+3*8
ADD16_OP 14, 6+4*8
ADD16_OP 15, 7+4*8
REP_RET
%endmacro
 
INIT_XMM sse2
IDCT_ADD16_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16_10
%endif
 
;-----------------------------------------------------------------------------
; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT_DC_ADD_OP_10 3
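; %1 = dst, %2 = stride, %3 = 3*stride; expects the splatted DC value in m0
; and pw_pixel_max in m6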
pxor m5, m5
%if avx_enabled
paddw m1, m0, [%1+0 ]
paddw m2, m0, [%1+%2 ]
paddw m3, m0, [%1+%2*2]
paddw m4, m0, [%1+%3 ]
%else
mova m1, [%1+0 ]
mova m2, [%1+%2 ]
mova m3, [%1+%2*2]
mova m4, [%1+%3 ]
paddw m1, m0
paddw m2, m0
paddw m3, m0
paddw m4, m0
%endif
CLIPW m1, m5, m6
CLIPW m2, m5, m6
CLIPW m3, m5, m6
CLIPW m4, m5, m6
mova [%1+0 ], m1
mova [%1+%2 ], m2
mova [%1+%2*2], m3
mova [%1+%3 ], m4
%endmacro
 
INIT_MMX mmxext
cglobal h264_idct_dc_add_10,3,3
movd m0, [r1]
mov dword [r1], 0
paddd m0, [pd_32]
psrad m0, 6
lea r1, [r2*3]
pshufw m0, m0, 0
mova m6, [pw_pixel_max]
IDCT_DC_ADD_OP_10 r0, r2, r1
RET
 
;-----------------------------------------------------------------------------
; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_DC_ADD 0
cglobal h264_idct8_dc_add_10,3,4,7
movd m0, [r1]
mov dword[r1], 0
paddd m0, [pd_32]
psrad m0, 6
lea r1, [r2*3]
SPLATW m0, m0, 0
mova m6, [pw_pixel_max]
IDCT_DC_ADD_OP_10 r0, r2, r1
lea r0, [r0+r2*4]
IDCT_DC_ADD_OP_10 r0, r2, r1
RET
%endmacro
 
INIT_XMM sse2
IDCT8_DC_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_DC_ADD
%endif
 
;-----------------------------------------------------------------------------
; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
%macro AC 1
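; adds the two AC 4x4 blocks of pair %1; placed after the main body, entered
; from ADD16_OP_INTRA and returning to the main flow via .skipadd%1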
.ac%1:
mov r5d, [r1+(%1+0)*4]
call add4x4_idct %+ SUFFIX
mov r5d, [r1+(%1+1)*4]
add r2, 64
call add4x4_idct %+ SUFFIX
add r2, 64
jmp .skipadd%1
%endmacro
 
%assign last_block 16
%macro ADD16_OP_INTRA 2
cmp word [r4+%2], 0
jnz .ac%1
mov r5d, [r2+ 0]
or r5d, [r2+64]
jz .skipblock%1
mov r5d, [r1+(%1+0)*4]
call idct_dc_add %+ SUFFIX
.skipblock%1:
%if %1<last_block-2
add r2, 128
%endif
.skipadd%1:
%endmacro
 
%macro IDCT_ADD16INTRA_10 0
idct_dc_add %+ SUFFIX:
add r5, r0
movq m0, [r2+ 0]
movhps m0, [r2+64]
mov dword [r2+ 0], 0
mov dword [r2+64], 0
paddd m0, [pd_32]
psrad m0, 6
pshufhw m0, m0, 0
pshuflw m0, m0, 0
lea r6, [r3*3]
mova m6, [pw_pixel_max]
IDCT_DC_ADD_OP_10 r5, r3, r6
ret
 
cglobal h264_idct_add16intra_10,5,7,8
ADD16_OP_INTRA 0, 4+1*8
ADD16_OP_INTRA 2, 4+2*8
ADD16_OP_INTRA 4, 6+1*8
ADD16_OP_INTRA 6, 6+2*8
ADD16_OP_INTRA 8, 4+3*8
ADD16_OP_INTRA 10, 4+4*8
ADD16_OP_INTRA 12, 6+3*8
ADD16_OP_INTRA 14, 6+4*8
REP_RET
AC 8
AC 10
AC 12
AC 14
AC 0
AC 2
AC 4
AC 6
%endmacro
 
INIT_XMM sse2
IDCT_ADD16INTRA_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16INTRA_10
%endif
 
%assign last_block 36
;-----------------------------------------------------------------------------
; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
%macro IDCT_ADD8 0
cglobal h264_idct_add8_10,5,8,7
%if ARCH_X86_64
mov r7, r0
%endif
add r2, 1024
mov r0, [r0]
ADD16_OP_INTRA 16, 4+ 6*8
ADD16_OP_INTRA 18, 4+ 7*8
add r2, 1024-128*2
%if ARCH_X86_64
mov r0, [r7+gprsize]
%else
mov r0, r0m
mov r0, [r0+gprsize]
%endif
ADD16_OP_INTRA 32, 4+11*8
ADD16_OP_INTRA 34, 4+12*8
REP_RET
AC 16
AC 18
AC 32
AC 34
 
%endmacro ; IDCT_ADD8
 
INIT_XMM sse2
IDCT_ADD8
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8
%endif
 
;-----------------------------------------------------------------------------
; void h264_idct8_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_1D 2
SWAP 0, 1
psrad m4, m5, 1
psrad m1, m0, 1
paddd m4, m5
paddd m1, m0
paddd m4, m7
paddd m1, m5
psubd m4, m0
paddd m1, m3
 
psubd m0, m3
psubd m5, m3
paddd m0, m7
psubd m5, m7
psrad m3, 1
psrad m7, 1
psubd m0, m3
psubd m5, m7
 
SWAP 1, 7
psrad m1, m7, 2
psrad m3, m4, 2
paddd m3, m0
psrad m0, 2
paddd m1, m5
psrad m5, 2
psubd m0, m4
psubd m7, m5
 
SWAP 5, 6
psrad m4, m2, 1
psrad m6, m5, 1
psubd m4, m5
paddd m6, m2
 
mova m2, %1
mova m5, %2
SUMSUB_BA d, 5, 2
SUMSUB_BA d, 6, 5
SUMSUB_BA d, 4, 2
SUMSUB_BA d, 7, 6
SUMSUB_BA d, 0, 4
SUMSUB_BA d, 3, 2
SUMSUB_BA d, 1, 5
SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro
 
%macro IDCT8_1D_FULL 1
mova m7, [%1+112*2]
mova m6, [%1+ 96*2]
mova m5, [%1+ 80*2]
mova m3, [%1+ 48*2]
mova m2, [%1+ 32*2]
mova m1, [%1+ 16*2]
IDCT8_1D [%1], [%1+ 64*2]
%endmacro
 
; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_SSE_START 2
IDCT8_1D_FULL %1
%if ARCH_X86_64
TRANSPOSE4x4D 0,1,2,3,8
mova [%2 ], m0
TRANSPOSE4x4D 4,5,6,7,8
mova [%2+8*2], m4
%else
mova [%1], m7
TRANSPOSE4x4D 0,1,2,3,7
mova m7, [%1]
mova [%2 ], m0
mova [%2+16*2], m1
mova [%2+32*2], m2
mova [%2+48*2], m3
TRANSPOSE4x4D 4,5,6,7,3
mova [%2+ 8*2], m4
mova [%2+24*2], m5
mova [%2+40*2], m6
mova [%2+56*2], m7
%endif
%endmacro
 
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE_END 3
IDCT8_1D_FULL %2
mova [%2 ], m6
mova [%2+16*2], m7
 
pxor m7, m7
STORE_DIFFx2 m0, m1, m6, m7, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m2, m3, m6, m7, %1, %3
mova m0, [%2 ]
mova m1, [%2+16*2]
lea %1, [%1+%3*2]
STORE_DIFFx2 m4, m5, m6, m7, %1, %3
lea %1, [%1+%3*2]
STORE_DIFFx2 m0, m1, m6, m7, %1, %3
%endmacro
 
%macro IDCT8_ADD 0
cglobal h264_idct8_add_10, 3,4,16
%if UNIX64 == 0
%assign pad 16-gprsize-(stack_offset&15)
sub rsp, pad
call h264_idct8_add1_10 %+ SUFFIX
add rsp, pad
RET
%endif
 
ALIGN 16
; TODO: does not need to use stack
h264_idct8_add1_10 %+ SUFFIX:
%assign pad 256+16-gprsize
sub rsp, pad
add dword [r1], 32
 
%if ARCH_X86_64
IDCT8_ADD_SSE_START r1, rsp
SWAP 1, 9
SWAP 2, 10
SWAP 3, 11
SWAP 5, 13
SWAP 6, 14
SWAP 7, 15
IDCT8_ADD_SSE_START r1+16, rsp+128
PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
IDCT8_1D [rsp], [rsp+128]
SWAP 0, 8
SWAP 1, 9
SWAP 2, 10
SWAP 3, 11
SWAP 4, 12
SWAP 5, 13
SWAP 6, 14
SWAP 7, 15
IDCT8_1D [rsp+16], [rsp+144]
psrad m8, 6
psrad m0, 6
packssdw m8, m0
paddsw m8, [r0]
pxor m0, m0
mova [r1+ 0], m0
mova [r1+ 16], m0
mova [r1+ 32], m0
mova [r1+ 48], m0
mova [r1+ 64], m0
mova [r1+ 80], m0
mova [r1+ 96], m0
mova [r1+112], m0
mova [r1+128], m0
mova [r1+144], m0
mova [r1+160], m0
mova [r1+176], m0
mova [r1+192], m0
mova [r1+208], m0
mova [r1+224], m0
mova [r1+240], m0
CLIPW m8, m0, [pw_pixel_max]
mova [r0], m8
mova m8, [pw_pixel_max]
STORE_DIFF16 m9, m1, m0, m8, r0+r2
lea r0, [r0+r2*2]
STORE_DIFF16 m10, m2, m0, m8, r0
STORE_DIFF16 m11, m3, m0, m8, r0+r2
lea r0, [r0+r2*2]
STORE_DIFF16 m12, m4, m0, m8, r0
STORE_DIFF16 m13, m5, m0, m8, r0+r2
lea r0, [r0+r2*2]
STORE_DIFF16 m14, m6, m0, m8, r0
STORE_DIFF16 m15, m7, m0, m8, r0+r2
%else
IDCT8_ADD_SSE_START r1, rsp
IDCT8_ADD_SSE_START r1+16, rsp+128
lea r3, [r0+8]
IDCT8_ADD_SSE_END r0, rsp, r2
IDCT8_ADD_SSE_END r3, rsp+16, r2
mova [r1+ 0], m7
mova [r1+ 16], m7
mova [r1+ 32], m7
mova [r1+ 48], m7
mova [r1+ 64], m7
mova [r1+ 80], m7
mova [r1+ 96], m7
mova [r1+112], m7
mova [r1+128], m7
mova [r1+144], m7
mova [r1+160], m7
mova [r1+176], m7
mova [r1+192], m7
mova [r1+208], m7
mova [r1+224], m7
mova [r1+240], m7
%endif ; ARCH_X86_64
 
add rsp, pad
ret
%endmacro
 
INIT_XMM sse2
IDCT8_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD
%endif
 
;-----------------------------------------------------------------------------
; h264_idct8_add4(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
%macro IDCT8_ADD4_OP 2
cmp byte [r4+%2], 0
jz .skipblock%1
mov r0d, [r6+%1*4]
add r0, r5
call h264_idct8_add1_10 %+ SUFFIX
.skipblock%1:
%if %1<12
add r1, 256
%endif
%endmacro
 
%macro IDCT8_ADD4 0
cglobal h264_idct8_add4_10, 0,7,16
%assign pad 16-gprsize-(stack_offset&15)
SUB rsp, pad
mov r5, r0mp
mov r6, r1mp
mov r1, r2mp
mov r2d, r3m
movifnidn r4, r4mp
IDCT8_ADD4_OP 0, 4+1*8
IDCT8_ADD4_OP 4, 6+1*8
IDCT8_ADD4_OP 8, 4+3*8
IDCT8_ADD4_OP 12, 6+3*8
ADD rsp, pad
RET
%endmacro ; IDCT8_ADD4
 
INIT_XMM sse2
IDCT8_ADD4
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD4
%endif
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_intrapred.asm
0,0 → 1,2702
;******************************************************************************
;* H.264 intra prediction asm optimizations
;* Copyright (c) 2010 Jason Garrett-Glaser
;* Copyright (c) 2010 Holger Lubitz
;* Copyright (c) 2010 Loren Merritt
;* Copyright (c) 2010 Ronald S. Bultje
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
tm_shuf: times 8 db 0x03, 0x80
pw_ff00: times 8 dw 0xff00
plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1
db 1, 2, 3, 4, 5, 6, 7, 8
plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0
db 1, 2, 3, 4, 0, 0, 0, 0
pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7
pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8
pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1
pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4
 
SECTION .text
 
cextern pb_1
cextern pb_3
cextern pw_4
cextern pw_5
cextern pw_8
cextern pw_16
cextern pw_17
cextern pw_32
 
;-----------------------------------------------------------------------------
; void pred16x16_vertical_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
 
INIT_MMX mmx
cglobal pred16x16_vertical_8, 2,3
sub r0, r1
mov r2, 8
movq mm0, [r0+0]
movq mm1, [r0+8]
.loop:
movq [r0+r1*1+0], mm0
movq [r0+r1*1+8], mm1
movq [r0+r1*2+0], mm0
movq [r0+r1*2+8], mm1
lea r0, [r0+r1*2]
dec r2
jg .loop
REP_RET
 
INIT_XMM sse
cglobal pred16x16_vertical_8, 2,3
sub r0, r1
mov r2, 4
movaps xmm0, [r0]
.loop:
movaps [r0+r1*1], xmm0
movaps [r0+r1*2], xmm0
lea r0, [r0+r1*2]
movaps [r0+r1*1], xmm0
movaps [r0+r1*2], xmm0
lea r0, [r0+r1*2]
dec r2
jg .loop
REP_RET
 
;-----------------------------------------------------------------------------
; void pred16x16_horizontal_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
 
%macro PRED16x16_H 0
cglobal pred16x16_horizontal_8, 2,3
mov r2, 8
%if cpuflag(ssse3)
mova m2, [pb_3]
%endif
.loop:
movd m0, [r0+r1*0-4]
movd m1, [r0+r1*1-4]
 
%if cpuflag(ssse3)
pshufb m0, m2
pshufb m1, m2
%else
punpcklbw m0, m0
punpcklbw m1, m1
SPLATW m0, m0, 3
SPLATW m1, m1, 3
mova [r0+r1*0+8], m0
mova [r0+r1*1+8], m1
%endif
 
mova [r0+r1*0], m0
mova [r0+r1*1], m1
lea r0, [r0+r1*2]
dec r2
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmx
PRED16x16_H
INIT_MMX mmxext
PRED16x16_H
INIT_XMM ssse3
PRED16x16_H
 
;-----------------------------------------------------------------------------
; void pred16x16_dc_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
 
%macro PRED16x16_DC 0
cglobal pred16x16_dc_8, 2,7
mov r4, r0
sub r0, r1
pxor mm0, mm0
pxor mm1, mm1
psadbw mm0, [r0+0]
psadbw mm1, [r0+8]
dec r0
movzx r5d, byte [r0+r1*1]
paddw mm0, mm1
movd r6d, mm0
lea r0, [r0+r1*2]
%rep 7
movzx r2d, byte [r0+r1*0]
movzx r3d, byte [r0+r1*1]
add r5d, r2d
add r6d, r3d
lea r0, [r0+r1*2]
%endrep
movzx r2d, byte [r0+r1*0]
add r5d, r6d
lea r2d, [r2+r5+16]
shr r2d, 5
%if cpuflag(ssse3)
pxor m1, m1
%endif
SPLATB_REG m0, r2, m1
 
%if mmsize==8
mov r3d, 8
.loop:
mova [r4+r1*0+0], m0
mova [r4+r1*0+8], m0
mova [r4+r1*1+0], m0
mova [r4+r1*1+8], m0
%else
mov r3d, 4
.loop:
mova [r4+r1*0], m0
mova [r4+r1*1], m0
lea r4, [r4+r1*2]
mova [r4+r1*0], m0
mova [r4+r1*1], m0
%endif
lea r4, [r4+r1*2]
dec r3d
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC
INIT_XMM ssse3
PRED16x16_DC
 
;-----------------------------------------------------------------------------
; void pred16x16_tm_vp8_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
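; "TM" (TrueMotion) prediction as used by VP8: each pixel is predicted as
; top[x] + left[y] - topleft, clipped to 0..255; the top row is expanded to
; words once and (left[y] - topleft) is splatted per output row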
 
%macro PRED16x16_TM 0
cglobal pred16x16_tm_vp8_8, 2,5
sub r0, r1
pxor mm7, mm7
movq mm0, [r0+0]
movq mm2, [r0+8]
movq mm1, mm0
movq mm3, mm2
punpcklbw mm0, mm7
punpckhbw mm1, mm7
punpcklbw mm2, mm7
punpckhbw mm3, mm7
movzx r3d, byte [r0-1]
mov r4d, 16
.loop:
movzx r2d, byte [r0+r1-1]
sub r2d, r3d
movd mm4, r2d
SPLATW mm4, mm4, 0
movq mm5, mm4
movq mm6, mm4
movq mm7, mm4
paddw mm4, mm0
paddw mm5, mm1
paddw mm6, mm2
paddw mm7, mm3
packuswb mm4, mm5
packuswb mm6, mm7
movq [r0+r1+0], mm4
movq [r0+r1+8], mm6
add r0, r1
dec r4d
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmx
PRED16x16_TM
INIT_MMX mmxext
PRED16x16_TM
 
INIT_XMM sse2
cglobal pred16x16_tm_vp8_8, 2,6,6
sub r0, r1
pxor xmm2, xmm2
movdqa xmm0, [r0]
movdqa xmm1, xmm0
punpcklbw xmm0, xmm2
punpckhbw xmm1, xmm2
movzx r4d, byte [r0-1]
mov r5d, 8
.loop:
movzx r2d, byte [r0+r1*1-1]
movzx r3d, byte [r0+r1*2-1]
sub r2d, r4d
sub r3d, r4d
movd xmm2, r2d
movd xmm4, r3d
pshuflw xmm2, xmm2, 0
pshuflw xmm4, xmm4, 0
punpcklqdq xmm2, xmm2
punpcklqdq xmm4, xmm4
movdqa xmm3, xmm2
movdqa xmm5, xmm4
paddw xmm2, xmm0
paddw xmm3, xmm1
paddw xmm4, xmm0
paddw xmm5, xmm1
packuswb xmm2, xmm3
packuswb xmm4, xmm5
movdqa [r0+r1*1], xmm2
movdqa [r0+r1*2], xmm4
lea r0, [r0+r1*2]
dec r5d
jg .loop
REP_RET
 
;-----------------------------------------------------------------------------
; void pred16x16_plane_*_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
 
%macro H264_PRED16x16_PLANE 1
cglobal pred16x16_plane_%1_8, 2,9,7
mov r2, r1 ; +stride
neg r1 ; -stride
 
movh m0, [r0+r1 -1]
%if mmsize == 8
pxor m4, m4
movh m1, [r0+r1 +3 ]
movh m2, [r0+r1 +8 ]
movh m3, [r0+r1 +12]
punpcklbw m0, m4
punpcklbw m1, m4
punpcklbw m2, m4
punpcklbw m3, m4
pmullw m0, [pw_m8tom1 ]
pmullw m1, [pw_m8tom1+8]
pmullw m2, [pw_1to8 ]
pmullw m3, [pw_1to8 +8]
paddw m0, m2
paddw m1, m3
%else ; mmsize == 16
%if cpuflag(ssse3)
movhps m0, [r0+r1 +8]
pmaddubsw m0, [plane_shuf] ; H coefficients
%else ; sse2
pxor m2, m2
movh m1, [r0+r1 +8]
punpcklbw m0, m2
punpcklbw m1, m2
pmullw m0, [pw_m8tom1]
pmullw m1, [pw_1to8]
paddw m0, m1
%endif
movhlps m1, m0
%endif
paddw m0, m1
%if cpuflag(mmxext)
PSHUFLW m1, m0, 0xE
%elif cpuflag(mmx)
mova m1, m0
psrlq m1, 32
%endif
paddw m0, m1
%if cpuflag(mmxext)
PSHUFLW m1, m0, 0x1
%elif cpuflag(mmx)
mova m1, m0
psrlq m1, 16
%endif
paddw m0, m1 ; sum of H coefficients
 
lea r4, [r0+r2*8-1]
lea r3, [r0+r2*4-1]
add r4, r2
 
%if ARCH_X86_64
%define e_reg r8
%else
%define e_reg r0
%endif
 
movzx e_reg, byte [r3+r2*2 ]
movzx r5, byte [r4+r1 ]
sub r5, e_reg
 
movzx e_reg, byte [r3+r2 ]
movzx r6, byte [r4 ]
sub r6, e_reg
lea r5, [r5+r6*2]
 
movzx e_reg, byte [r3+r1 ]
movzx r6, byte [r4+r2*2 ]
sub r6, e_reg
lea r5, [r5+r6*4]
 
movzx e_reg, byte [r3 ]
%if ARCH_X86_64
movzx r7, byte [r4+r2 ]
sub r7, e_reg
%else
movzx r6, byte [r4+r2 ]
sub r6, e_reg
lea r5, [r5+r6*4]
sub r5, r6
%endif
 
lea e_reg, [r3+r1*4]
lea r3, [r4+r2*4]
 
movzx r4, byte [e_reg+r2 ]
movzx r6, byte [r3 ]
sub r6, r4
%if ARCH_X86_64
lea r6, [r7+r6*2]
lea r5, [r5+r6*2]
add r5, r6
%else
lea r5, [r5+r6*4]
lea r5, [r5+r6*2]
%endif
 
movzx r4, byte [e_reg ]
%if ARCH_X86_64
movzx r7, byte [r3 +r2 ]
sub r7, r4
sub r5, r7
%else
movzx r6, byte [r3 +r2 ]
sub r6, r4
lea r5, [r5+r6*8]
sub r5, r6
%endif
 
movzx r4, byte [e_reg+r1 ]
movzx r6, byte [r3 +r2*2]
sub r6, r4
%if ARCH_X86_64
add r6, r7
%endif
lea r5, [r5+r6*8]
 
movzx r4, byte [e_reg+r2*2]
movzx r6, byte [r3 +r1 ]
sub r6, r4
lea r5, [r5+r6*4]
add r5, r6 ; sum of V coefficients
 
%if ARCH_X86_64 == 0
mov r0, r0m
%endif
 
%ifidn %1, h264
lea r5, [r5*5+32]
sar r5, 6
%elifidn %1, rv40
lea r5, [r5*5]
sar r5, 6
%elifidn %1, svq3
test r5, r5
lea r6, [r5+3]
cmovs r5, r6
sar r5, 2 ; V/4
lea r5, [r5*5] ; 5*(V/4)
test r5, r5
lea r6, [r5+15]
cmovs r5, r6
sar r5, 4 ; (5*(V/4))/16
%endif
 
movzx r4, byte [r0+r1 +15]
movzx r3, byte [r3+r2*2 ]
lea r3, [r3+r4+1]
shl r3, 4
 
movd r1d, m0
movsx r1d, r1w
%ifnidn %1, svq3
%ifidn %1, h264
lea r1d, [r1d*5+32]
%else ; rv40
lea r1d, [r1d*5]
%endif
sar r1d, 6
%else ; svq3
test r1d, r1d
lea r4d, [r1d+3]
cmovs r1d, r4d
sar r1d, 2 ; H/4
lea r1d, [r1d*5] ; 5*(H/4)
test r1d, r1d
lea r4d, [r1d+15]
cmovs r1d, r4d
sar r1d, 4 ; (5*(H/4))/16
%endif
movd m0, r1d
 
add r1d, r5d
add r3d, r1d
shl r1d, 3
sub r3d, r1d ; a
 
movd m1, r5d
movd m3, r3d
SPLATW m0, m0, 0 ; H
SPLATW m1, m1, 0 ; V
SPLATW m3, m3, 0 ; a
%ifidn %1, svq3
SWAP 0, 1
%endif
mova m2, m0
%if mmsize == 8
mova m5, m0
%endif
pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
%if mmsize == 16
psllw m2, 3
%else
psllw m5, 3
psllw m2, 2
mova m6, m5
paddw m6, m2
%endif
paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H
%if mmsize == 8
paddw m5, m0 ; a + {8,9,10,11}*H
paddw m6, m0 ; a + {12,13,14,15}*H
%endif
 
mov r4, 8
.loop:
mova m3, m0 ; b[0..7]
mova m4, m2 ; b[8..15]
psraw m3, 5
psraw m4, 5
packuswb m3, m4
mova [r0], m3
%if mmsize == 8
mova m3, m5 ; b[8..11]
mova m4, m6 ; b[12..15]
psraw m3, 5
psraw m4, 5
packuswb m3, m4
mova [r0+8], m3
%endif
paddw m0, m1
paddw m2, m1
%if mmsize == 8
paddw m5, m1
paddw m6, m1
%endif
 
mova m3, m0 ; b[0..7]
mova m4, m2 ; b[8..15]
psraw m3, 5
psraw m4, 5
packuswb m3, m4
mova [r0+r2], m3
%if mmsize == 8
mova m3, m5 ; b[8..11]
mova m4, m6 ; b[12..15]
psraw m3, 5
psraw m4, 5
packuswb m3, m4
mova [r0+r2+8], m3
%endif
paddw m0, m1
paddw m2, m1
%if mmsize == 8
paddw m5, m1
paddw m6, m1
%endif
 
lea r0, [r0+r2*2]
dec r4
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmx
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_MMX mmxext
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_XMM sse2
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
INIT_XMM ssse3
H264_PRED16x16_PLANE h264
H264_PRED16x16_PLANE rv40
H264_PRED16x16_PLANE svq3
 
;-----------------------------------------------------------------------------
; void pred8x8_plane_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
 
%macro H264_PRED8x8_PLANE 0
cglobal pred8x8_plane_8, 2,9,7
mov r2, r1 ; +stride
neg r1 ; -stride
 
movd m0, [r0+r1 -1]
%if mmsize == 8
pxor m2, m2
movh m1, [r0+r1 +4 ]
punpcklbw m0, m2
punpcklbw m1, m2
pmullw m0, [pw_m4to4]
pmullw m1, [pw_m4to4+8]
%else ; mmsize == 16
%if cpuflag(ssse3)
movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary
pmaddubsw m0, [plane8_shuf] ; H coefficients
%else ; sse2
pxor m2, m2
movd m1, [r0+r1 +4]
punpckldq m0, m1
punpcklbw m0, m2
pmullw m0, [pw_m4to4]
%endif
movhlps m1, m0
%endif
paddw m0, m1
 
%if notcpuflag(ssse3)
%if cpuflag(mmxext)
PSHUFLW m1, m0, 0xE
%elif cpuflag(mmx)
mova m1, m0
psrlq m1, 32
%endif
paddw m0, m1
%endif ; !ssse3
 
%if cpuflag(mmxext)
PSHUFLW m1, m0, 0x1
%elif cpuflag(mmx)
mova m1, m0
psrlq m1, 16
%endif
paddw m0, m1 ; sum of H coefficients
 
lea r4, [r0+r2*4-1]
lea r3, [r0 -1]
add r4, r2
 
%if ARCH_X86_64
%define e_reg r8
%else
%define e_reg r0
%endif
 
movzx e_reg, byte [r3+r2*2 ]
movzx r5, byte [r4+r1 ]
sub r5, e_reg
 
movzx e_reg, byte [r3 ]
%if ARCH_X86_64
movzx r7, byte [r4+r2 ]
sub r7, e_reg
sub r5, r7
%else
movzx r6, byte [r4+r2 ]
sub r6, e_reg
lea r5, [r5+r6*4]
sub r5, r6
%endif
 
movzx e_reg, byte [r3+r1 ]
movzx r6, byte [r4+r2*2 ]
sub r6, e_reg
%if ARCH_X86_64
add r6, r7
%endif
lea r5, [r5+r6*4]
 
movzx e_reg, byte [r3+r2 ]
movzx r6, byte [r4 ]
sub r6, e_reg
lea r6, [r5+r6*2]
 
lea r5, [r6*9+16]
lea r5, [r5+r6*8]
sar r5, 5
 
%if ARCH_X86_64 == 0
mov r0, r0m
%endif
 
movzx r3, byte [r4+r2*2 ]
movzx r4, byte [r0+r1 +7]
lea r3, [r3+r4+1]
shl r3, 4
movd r1d, m0
movsx r1d, r1w
imul r1d, 17
add r1d, 16
sar r1d, 5
movd m0, r1d
add r1d, r5d
sub r3d, r1d
add r1d, r1d
sub r3d, r1d ; a
 
movd m1, r5d
movd m3, r3d
SPLATW m0, m0, 0 ; H
SPLATW m1, m1, 0 ; V
SPLATW m3, m3, 0 ; a
%if mmsize == 8
mova m2, m0
%endif
pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words)
paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H
%if mmsize == 8
psllw m2, 2
paddw m2, m0 ; a + {4,5,6,7}*H
%endif
 
mov r4, 4
ALIGN 16
.loop:
%if mmsize == 16
mova m3, m0 ; b[0..7]
paddw m0, m1
psraw m3, 5
mova m4, m0 ; V+b[0..7]
paddw m0, m1
psraw m4, 5
packuswb m3, m4
movh [r0], m3
movhps [r0+r2], m3
%else ; mmsize == 8
mova m3, m0 ; b[0..3]
mova m4, m2 ; b[4..7]
paddw m0, m1
paddw m2, m1
psraw m3, 5
psraw m4, 5
mova m5, m0 ; V+b[0..3]
mova m6, m2 ; V+b[4..7]
paddw m0, m1
paddw m2, m1
psraw m5, 5
psraw m6, 5
packuswb m3, m4
packuswb m5, m6
mova [r0], m3
mova [r0+r2], m5
%endif
 
lea r0, [r0+r2*2]
dec r4
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmx
H264_PRED8x8_PLANE
INIT_MMX mmxext
H264_PRED8x8_PLANE
INIT_XMM sse2
H264_PRED8x8_PLANE
INIT_XMM ssse3
H264_PRED8x8_PLANE
 
;-----------------------------------------------------------------------------
; void pred8x8_vertical_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
 
INIT_MMX mmx
cglobal pred8x8_vertical_8, 2,2
sub r0, r1
movq mm0, [r0]
%rep 3
movq [r0+r1*1], mm0
movq [r0+r1*2], mm0
lea r0, [r0+r1*2]
%endrep
movq [r0+r1*1], mm0
movq [r0+r1*2], mm0
RET
 
;-----------------------------------------------------------------------------
; void pred8x8_horizontal_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
 
%macro PRED8x8_H 0
cglobal pred8x8_horizontal_8, 2,3
mov r2, 4
%if cpuflag(ssse3)
mova m2, [pb_3]
%endif
.loop:
SPLATB_LOAD m0, r0+r1*0-1, m2
SPLATB_LOAD m1, r0+r1*1-1, m2
mova [r0+r1*0], m0
mova [r0+r1*1], m1
lea r0, [r0+r1*2]
dec r2
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmx
PRED8x8_H
INIT_MMX mmxext
PRED8x8_H
INIT_MMX ssse3
PRED8x8_H
 
;-----------------------------------------------------------------------------
; void pred8x8_top_dc_8_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred8x8_top_dc_8, 2,5
sub r0, r1
movq mm0, [r0]
pxor mm1, mm1
pxor mm2, mm2
lea r2, [r0+r1*2]
punpckhbw mm1, mm0
punpcklbw mm0, mm2
psadbw mm1, mm2 ; s1
lea r3, [r2+r1*2]
psadbw mm0, mm2 ; s0
psrlw mm1, 1
psrlw mm0, 1
pavgw mm1, mm2
lea r4, [r3+r1*2]
pavgw mm0, mm2
pshufw mm1, mm1, 0
pshufw mm0, mm0, 0 ; dc0 (w)
packuswb mm0, mm1 ; dc0,dc1 (b)
movq [r0+r1*1], mm0
movq [r0+r1*2], mm0
lea r0, [r3+r1*2]
movq [r2+r1*1], mm0
movq [r2+r1*2], mm0
movq [r3+r1*1], mm0
movq [r3+r1*2], mm0
movq [r0+r1*1], mm0
movq [r0+r1*2], mm0
RET
 
;-----------------------------------------------------------------------------
; void pred8x8_dc_8_mmxext(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
 
INIT_MMX mmxext
cglobal pred8x8_dc_8, 2,5
sub r0, r1
pxor m7, m7
movd m0, [r0+0]
movd m1, [r0+4]
psadbw m0, m7 ; s0
mov r4, r0
psadbw m1, m7 ; s1
 
movzx r2d, byte [r0+r1*1-1]
movzx r3d, byte [r0+r1*2-1]
lea r0, [r0+r1*2]
add r2d, r3d
movzx r3d, byte [r0+r1*1-1]
add r2d, r3d
movzx r3d, byte [r0+r1*2-1]
add r2d, r3d
lea r0, [r0+r1*2]
movd m2, r2d ; s2
movzx r2d, byte [r0+r1*1-1]
movzx r3d, byte [r0+r1*2-1]
lea r0, [r0+r1*2]
add r2d, r3d
movzx r3d, byte [r0+r1*1-1]
add r2d, r3d
movzx r3d, byte [r0+r1*2-1]
add r2d, r3d
movd m3, r2d ; s3
 
punpcklwd m0, m1
mov r0, r4
punpcklwd m2, m3
punpckldq m0, m2 ; s0, s1, s2, s3
pshufw m3, m0, 11110110b ; s2, s1, s3, s3
lea r2, [r0+r1*2]
pshufw m0, m0, 01110100b ; s0, s1, s3, s1
paddw m0, m3
lea r3, [r2+r1*2]
psrlw m0, 2
pavgw m0, m7 ; s0+s2, s1, s3, s1+s3
lea r4, [r3+r1*2]
packuswb m0, m0
punpcklbw m0, m0
movq m1, m0
punpcklbw m0, m0
punpckhbw m1, m1
movq [r0+r1*1], m0
movq [r0+r1*2], m0
movq [r2+r1*1], m0
movq [r2+r1*2], m0
movq [r3+r1*1], m1
movq [r3+r1*2], m1
movq [r4+r1*1], m1
movq [r4+r1*2], m1
RET
 
;-----------------------------------------------------------------------------
; void pred8x8_dc_rv40_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
 
INIT_MMX mmxext
cglobal pred8x8_dc_rv40_8, 2,7
mov r4, r0
sub r0, r1
pxor mm0, mm0
psadbw mm0, [r0]
dec r0
movzx r5d, byte [r0+r1*1]
movd r6d, mm0
lea r0, [r0+r1*2]
%rep 3
movzx r2d, byte [r0+r1*0]
movzx r3d, byte [r0+r1*1]
add r5d, r2d
add r6d, r3d
lea r0, [r0+r1*2]
%endrep
movzx r2d, byte [r0+r1*0]
add r5d, r6d
lea r2d, [r2+r5+8]
shr r2d, 4
movd mm0, r2d
punpcklbw mm0, mm0
pshufw mm0, mm0, 0
mov r3d, 4
.loop:
movq [r4+r1*0], mm0
movq [r4+r1*1], mm0
lea r4, [r4+r1*2]
dec r3d
jg .loop
REP_RET
 
;-----------------------------------------------------------------------------
; void pred8x8_tm_vp8_8(uint8_t *src, int stride)
;-----------------------------------------------------------------------------
 
%macro PRED8x8_TM 0
cglobal pred8x8_tm_vp8_8, 2,6
sub r0, r1
pxor mm7, mm7
movq mm0, [r0]
movq mm1, mm0
punpcklbw mm0, mm7
punpckhbw mm1, mm7
movzx r4d, byte [r0-1]
mov r5d, 4
.loop:
movzx r2d, byte [r0+r1*1-1]
movzx r3d, byte [r0+r1*2-1]
sub r2d, r4d
sub r3d, r4d
movd mm2, r2d
movd mm4, r3d
SPLATW mm2, mm2, 0
SPLATW mm4, mm4, 0
movq mm3, mm2
movq mm5, mm4
paddw mm2, mm0
paddw mm3, mm1
paddw mm4, mm0
paddw mm5, mm1
packuswb mm2, mm3
packuswb mm4, mm5
movq [r0+r1*1], mm2
movq [r0+r1*2], mm4
lea r0, [r0+r1*2]
dec r5d
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmx
PRED8x8_TM
INIT_MMX mmxext
PRED8x8_TM
 
INIT_XMM sse2
cglobal pred8x8_tm_vp8_8, 2,6,4
sub r0, r1
pxor xmm1, xmm1
movq xmm0, [r0]
punpcklbw xmm0, xmm1
movzx r4d, byte [r0-1]
mov r5d, 4
.loop:
movzx r2d, byte [r0+r1*1-1]
movzx r3d, byte [r0+r1*2-1]
sub r2d, r4d
sub r3d, r4d
movd xmm2, r2d
movd xmm3, r3d
pshuflw xmm2, xmm2, 0
pshuflw xmm3, xmm3, 0
punpcklqdq xmm2, xmm2
punpcklqdq xmm3, xmm3
paddw xmm2, xmm0
paddw xmm3, xmm0
packuswb xmm2, xmm3
movq [r0+r1*1], xmm2
movhps [r0+r1*2], xmm2
lea r0, [r0+r1*2]
dec r5d
jg .loop
REP_RET
 
INIT_XMM ssse3
cglobal pred8x8_tm_vp8_8, 2,3,6
sub r0, r1
movdqa xmm4, [tm_shuf]
pxor xmm1, xmm1
movq xmm0, [r0]
punpcklbw xmm0, xmm1
movd xmm5, [r0-4]
pshufb xmm5, xmm4
mov r2d, 4
.loop:
movd xmm2, [r0+r1*1-4]
movd xmm3, [r0+r1*2-4]
pshufb xmm2, xmm4
pshufb xmm3, xmm4
psubw xmm2, xmm5
psubw xmm3, xmm5
paddw xmm2, xmm0
paddw xmm3, xmm0
packuswb xmm2, xmm3
movq [r0+r1*1], xmm2
movhps [r0+r1*2], xmm2
lea r0, [r0+r1*2]
dec r2d
jg .loop
REP_RET
 
; dest, left, right, src, tmp
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED4x4_LOWPASS 5
mova %5, %2 ; tmp = left
pavgb %2, %3 ; left = (left + right + 1) >> 1
pxor %3, %5 ; right = left ^ right (original values)
mova %1, %4 ; dest = src
pand %3, [pb_1] ; right = (left ^ right) & 1 (the rounding carry)
psubusb %2, %3 ; left = (left + right) >> 1, rounded down
pavgb %1, %2 ; dest = (src + left + 1) >> 1 = (t[n-1] + 2*t[n] + t[n+1] + 2) >> 2
%endmacro
 
;-----------------------------------------------------------------------------
; void pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
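; the .fix_lt_*/.fix_tr_* paths below substitute the nearest available top-row
; pixel when has_topleft/has_topright is 0, before applying the low-pass filter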
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_8, 4,4
sub r0, r3
pxor mm7, mm7
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1 ; top_left
jz .fix_lt_2
test r2, r2 ; top_right
jz .fix_tr_1
jmp .body
.fix_lt_2:
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2 ; top_right
jnz .body
.fix_tr_1:
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
.body:
PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
psadbw mm7, mm0
paddw mm7, [pw_4]
psrlw mm7, 3
pshufw mm7, mm7, 0
packuswb mm7, mm7
%rep 3
movq [r0+r3*1], mm7
movq [r0+r3*2], mm7
lea r0, [r0+r3*2]
%endrep
movq [r0+r3*1], mm7
movq [r0+r3*2], mm7
RET
%endmacro
 
INIT_MMX mmxext
PRED8x8L_TOP_DC
INIT_MMX ssse3
PRED8x8L_TOP_DC
 
;-----------------------------------------------------------------------------
;void pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
 
%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_8, 4,5
sub r0, r3
lea r4, [r0+r3*2]
movq mm0, [r0+r3*1-8]
punpckhbw mm0, [r0+r3*0-8]
movq mm1, [r4+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r4, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r4]
mov r0, r4
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
test r1, r1
jnz .do_left
.fix_lt_1:
movq mm5, mm3
pxor mm5, mm4
psrlq mm5, 56
psllq mm5, 48
pxor mm1, mm5
jmp .do_left
.fix_lt_2:
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .body
.fix_tr_1:
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .body
.do_left:
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq mm4, mm0
movq mm7, mm2
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
psllq mm1, 56
PALIGNR mm7, mm1, 7, mm3
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
.body:
lea r1, [r0+r3*2]
PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
pxor mm0, mm0
pxor mm1, mm1
lea r2, [r1+r3*2]
psadbw mm0, mm7
psadbw mm1, mm6
paddw mm0, [pw_8]
paddw mm0, mm1
lea r4, [r2+r3*2]
psrlw mm0, 4
pshufw mm0, mm0, 0
packuswb mm0, mm0
movq [r0+r3*1], mm0
movq [r0+r3*2], mm0
movq [r1+r3*1], mm0
movq [r1+r3*2], mm0
movq [r2+r3*1], mm0
movq [r2+r3*2], mm0
movq [r4+r3*1], mm0
movq [r4+r3*2], mm0
RET
%endmacro
 
INIT_MMX mmxext
PRED8x8L_DC
INIT_MMX ssse3
PRED8x8L_DC
 
;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_8(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
 
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_8, 4,4
sub r0, r3
lea r2, [r0+r3*2]
movq mm0, [r0+r3*1-8]
test r1, r1
lea r1, [r0+r3]
cmovnz r1, r0
punpckhbw mm0, [r1+r3*0-8]
movq mm1, [r2+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r2, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r1+r3*0-8]
mov r0, r2
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq mm4, mm0
movq mm7, mm2
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
psllq mm1, 56
PALIGNR mm7, mm1, 7, mm3
movq mm3, mm7
lea r1, [r0+r3*2]
movq mm7, mm3
punpckhbw mm3, mm3
punpcklbw mm7, mm7
pshufw mm0, mm3, 0xff
pshufw mm1, mm3, 0xaa
lea r2, [r1+r3*2]
pshufw mm2, mm3, 0x55
pshufw mm3, mm3, 0x00
pshufw mm4, mm7, 0xff
pshufw mm5, mm7, 0xaa
pshufw mm6, mm7, 0x55
pshufw mm7, mm7, 0x00
movq [r0+r3*1], mm0
movq [r0+r3*2], mm1
movq [r1+r3*1], mm2
movq [r1+r3*2], mm3
movq [r2+r3*1], mm4
movq [r2+r3*2], mm5
lea r0, [r2+r3*2]
movq [r0+r3*1], mm6
movq [r0+r3*2], mm7
RET
%endmacro
 
INIT_MMX mmxext
PRED8x8L_HORIZONTAL
INIT_MMX ssse3
PRED8x8L_HORIZONTAL
 
;-----------------------------------------------------------------------------
; void pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
 
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_8, 4,4
sub r0, r3
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1 ; top_left
jz .fix_lt_2
test r2, r2 ; top_right
jz .fix_tr_1
jmp .body
.fix_lt_2:
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2 ; top_right
jnz .body
.fix_tr_1:
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
.body:
PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5
%rep 3
movq [r0+r3*1], mm0
movq [r0+r3*2], mm0
lea r0, [r0+r3*2]
%endrep
movq [r0+r3*1], mm0
movq [r0+r3*2], mm0
RET
%endmacro
 
INIT_MMX mmxext
PRED8x8L_VERTICAL
INIT_MMX ssse3
PRED8x8L_VERTICAL
 
;-----------------------------------------------------------------------------
;void pred8x8l_down_left_8(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
 
INIT_MMX mmxext
cglobal pred8x8l_down_left_8, 4,5
sub r0, r3
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
jmp .do_top
.fix_lt_2:
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .do_top
.fix_tr_1:
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.fix_tr_2:
punpckhbw mm3, mm3
pshufw mm1, mm3, 0xFF
jmp .do_topright
.do_top:
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
movq mm7, mm4
test r2, r2
jz .fix_tr_2
movq mm0, [r0+8]
movq mm5, mm0
movq mm2, mm0
movq mm4, mm0
psrlq mm5, 56
PALIGNR mm2, mm3, 7, mm3
PALIGNR mm5, mm4, 1, mm4
PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
lea r1, [r0+r3*2]
movq mm6, mm1
psrlq mm1, 56
movq mm4, mm1
lea r2, [r1+r3*2]
movq mm2, mm6
PALIGNR mm2, mm7, 1, mm0
movq mm3, mm6
PALIGNR mm3, mm7, 7, mm0
PALIGNR mm4, mm6, 1, mm0
movq mm5, mm7
movq mm1, mm7
movq mm7, mm6
lea r4, [r2+r3*2]
psllq mm1, 8
PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
movq [r4+r3*2], mm1
movq mm2, mm0
psllq mm1, 8
psrlq mm2, 56
psllq mm0, 8
por mm1, mm2
movq [r4+r3*1], mm1
movq mm2, mm0
psllq mm1, 8
psrlq mm2, 56
psllq mm0, 8
por mm1, mm2
movq [r2+r3*2], mm1
movq mm2, mm0
psllq mm1, 8
psrlq mm2, 56
psllq mm0, 8
por mm1, mm2
movq [r2+r3*1], mm1
movq mm2, mm0
psllq mm1, 8
psrlq mm2, 56
psllq mm0, 8
por mm1, mm2
movq [r1+r3*2], mm1
movq mm2, mm0
psllq mm1, 8
psrlq mm2, 56
psllq mm0, 8
por mm1, mm2
movq [r1+r3*1], mm1
movq mm2, mm0
psllq mm1, 8
psrlq mm2, 56
psllq mm0, 8
por mm1, mm2
movq [r0+r3*2], mm1
psllq mm1, 8
psrlq mm0, 56
por mm1, mm0
movq [r0+r3*1], mm1
RET
 
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_8, 4,4
sub r0, r3
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1 ; top_left
jz .fix_lt_2
test r2, r2 ; top_right
jz .fix_tr_1
jmp .do_top
.fix_lt_2:
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2 ; top_right
jnz .do_top
.fix_tr_1:
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.fix_tr_2:
punpckhbw mm3, mm3
pshufw mm1, mm3, 0xFF
jmp .do_topright
.do_top:
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
movq2dq xmm3, mm4
test r2, r2 ; top_right
jz .fix_tr_2
movq mm0, [r0+8]
movq mm5, mm0
movq mm2, mm0
movq mm4, mm0
psrlq mm5, 56
PALIGNR mm2, mm3, 7, mm3
PALIGNR mm5, mm4, 1, mm4
PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
movq2dq xmm4, mm1
psrlq mm1, 56
movq2dq xmm5, mm1
lea r1, [r0+r3*2]
pslldq xmm4, 8
por xmm3, xmm4
movdqa xmm2, xmm3
psrldq xmm2, 1
pslldq xmm5, 15
por xmm2, xmm5
lea r2, [r1+r3*2]
movdqa xmm1, xmm3
pslldq xmm1, 1
INIT_XMM cpuname
PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
psrldq xmm0, 1
movq [r0+r3*1], xmm0
psrldq xmm0, 1
movq [r0+r3*2], xmm0
psrldq xmm0, 1
lea r0, [r2+r3*2]
movq [r1+r3*1], xmm0
psrldq xmm0, 1
movq [r1+r3*2], xmm0
psrldq xmm0, 1
movq [r2+r3*1], xmm0
psrldq xmm0, 1
movq [r2+r3*2], xmm0
psrldq xmm0, 1
movq [r0+r3*1], xmm0
psrldq xmm0, 1
movq [r0+r3*2], xmm0
RET
%endmacro
 
INIT_MMX sse2
PRED8x8L_DOWN_LEFT
INIT_MMX ssse3
PRED8x8L_DOWN_LEFT
 
;-----------------------------------------------------------------------------
;void pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
 
INIT_MMX mmxext
cglobal pred8x8l_down_right_8, 4,5
sub r0, r3
lea r4, [r0+r3*2]
movq mm0, [r0+r3*1-8]
punpckhbw mm0, [r0+r3*0-8]
movq mm1, [r4+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r4, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r4]
mov r0, r4
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
test r1, r1 ; top_left
jz .fix_lt_1
.do_left:
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq mm4, mm0
movq mm7, mm2
movq mm6, mm2
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
psllq mm1, 56
PALIGNR mm7, mm1, 7, mm3
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1 ; top_left
jz .fix_lt_2
test r2, r2 ; top_right
jz .fix_tr_1
.do_top:
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
movq mm5, mm4
jmp .body
.fix_lt_1:
movq mm5, mm3
pxor mm5, mm4
psrlq mm5, 56
psllq mm5, 48
pxor mm1, mm5
jmp .do_left
.fix_lt_2:
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2 ; top_right
jnz .do_top
.fix_tr_1:
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.body:
lea r1, [r0+r3*2]
movq mm1, mm7
movq mm7, mm5
movq mm5, mm6
movq mm2, mm7
lea r2, [r1+r3*2]
PALIGNR mm2, mm6, 1, mm0
movq mm3, mm7
PALIGNR mm3, mm6, 7, mm0
movq mm4, mm7
lea r4, [r2+r3*2]
psrlq mm4, 8
PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6
PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6
movq [r4+r3*2], mm0
movq mm2, mm1
psrlq mm0, 8
psllq mm2, 56
psrlq mm1, 8
por mm0, mm2
movq [r4+r3*1], mm0
movq mm2, mm1
psrlq mm0, 8
psllq mm2, 56
psrlq mm1, 8
por mm0, mm2
movq [r2+r3*2], mm0
movq mm2, mm1
psrlq mm0, 8
psllq mm2, 56
psrlq mm1, 8
por mm0, mm2
movq [r2+r3*1], mm0
movq mm2, mm1
psrlq mm0, 8
psllq mm2, 56
psrlq mm1, 8
por mm0, mm2
movq [r1+r3*2], mm0
movq mm2, mm1
psrlq mm0, 8
psllq mm2, 56
psrlq mm1, 8
por mm0, mm2
movq [r1+r3*1], mm0
movq mm2, mm1
psrlq mm0, 8
psllq mm2, 56
psrlq mm1, 8
por mm0, mm2
movq [r0+r3*2], mm0
psrlq mm0, 8
psllq mm1, 56
por mm0, mm1
movq [r0+r3*1], mm0
RET
 
%macro PRED8x8L_DOWN_RIGHT 0
cglobal pred8x8l_down_right_8, 4,5
sub r0, r3
lea r4, [r0+r3*2]
movq mm0, [r0+r3*1-8]
punpckhbw mm0, [r0+r3*0-8]
movq mm1, [r4+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r4, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r4]
mov r0, r4
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
test r1, r1
jz .fix_lt_1
jmp .do_left
.fix_lt_1:
movq mm5, mm3
pxor mm5, mm4
psrlq mm5, 56
psllq mm5, 48
pxor mm1, mm5
jmp .do_left
.fix_lt_2:
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .do_top
.fix_tr_1:
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.do_left:
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq mm4, mm0
movq mm7, mm2
movq2dq xmm3, mm2
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
psllq mm1, 56
PALIGNR mm7, mm1, 7, mm3
movq2dq xmm1, mm7
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
.do_top:
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
movq2dq xmm4, mm4
lea r1, [r0+r3*2]
movdqa xmm0, xmm3
pslldq xmm4, 8
por xmm3, xmm4
lea r2, [r1+r3*2]
pslldq xmm4, 1
por xmm1, xmm4
psrldq xmm0, 7
pslldq xmm0, 15
psrldq xmm0, 7
por xmm1, xmm0
lea r0, [r2+r3*2]
movdqa xmm2, xmm3
psrldq xmm2, 1
INIT_XMM cpuname
PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4
movdqa xmm1, xmm0
psrldq xmm1, 1
movq [r0+r3*2], xmm0
movq [r0+r3*1], xmm1
psrldq xmm0, 2
psrldq xmm1, 2
movq [r2+r3*2], xmm0
movq [r2+r3*1], xmm1
psrldq xmm0, 2
psrldq xmm1, 2
movq [r1+r3*2], xmm0
movq [r1+r3*1], xmm1
psrldq xmm0, 2
psrldq xmm1, 2
movq [r4+r3*2], xmm0
movq [r4+r3*1], xmm1
RET
%endmacro
 
INIT_MMX sse2
PRED8x8L_DOWN_RIGHT
INIT_MMX ssse3
PRED8x8L_DOWN_RIGHT
 
;-----------------------------------------------------------------------------
; void pred8x8l_vertical_right_8(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
 
INIT_MMX mmxext
cglobal pred8x8l_vertical_right_8, 4,5
sub r0, r3
lea r4, [r0+r3*2]
movq mm0, [r0+r3*1-8]
punpckhbw mm0, [r0+r3*0-8]
movq mm1, [r4+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r4, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r4]
mov r0, r4
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
test r1, r1
jz .fix_lt_1
jmp .do_left
.fix_lt_1:
movq mm5, mm3
pxor mm5, mm4
psrlq mm5, 56
psllq mm5, 48
pxor mm1, mm5
jmp .do_left
.fix_lt_2:
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .do_top
.fix_tr_1:
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.do_left:
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq mm7, mm2
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
.do_top:
PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
lea r1, [r0+r3*2]
movq mm2, mm6
movq mm3, mm6
PALIGNR mm3, mm7, 7, mm0
PALIGNR mm6, mm7, 6, mm1
movq mm4, mm3
pavgb mm3, mm2
lea r2, [r1+r3*2]
PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5
movq [r0+r3*1], mm3
movq [r0+r3*2], mm0
movq mm5, mm0
movq mm6, mm3
movq mm1, mm7
movq mm2, mm1
psllq mm2, 8
movq mm3, mm1
psllq mm3, 16
lea r4, [r2+r3*2]
PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4
PALIGNR mm6, mm0, 7, mm2
movq [r1+r3*1], mm6
psllq mm0, 8
PALIGNR mm5, mm0, 7, mm1
movq [r1+r3*2], mm5
psllq mm0, 8
PALIGNR mm6, mm0, 7, mm2
movq [r2+r3*1], mm6
psllq mm0, 8
PALIGNR mm5, mm0, 7, mm1
movq [r2+r3*2], mm5
psllq mm0, 8
PALIGNR mm6, mm0, 7, mm2
movq [r4+r3*1], mm6
psllq mm0, 8
PALIGNR mm5, mm0, 7, mm1
movq [r4+r3*2], mm5
RET
 
%macro PRED8x8L_VERTICAL_RIGHT 0
cglobal pred8x8l_vertical_right_8, 4,5,7
; manually spill XMM registers for Win64 because
; the code here is initialized with INIT_MMX
WIN64_SPILL_XMM 7
sub r0, r3
lea r4, [r0+r3*2]
movq mm0, [r0+r3*1-8]
punpckhbw mm0, [r0+r3*0-8]
movq mm1, [r4+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r4, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r4]
mov r0, r4
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
test r1, r1
jnz .do_left
.fix_lt_1:
movq mm5, mm3
pxor mm5, mm4
psrlq mm5, 56
psllq mm5, 48
pxor mm1, mm5
jmp .do_left
.fix_lt_2:
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .do_top
.fix_tr_1:
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.do_left:
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq2dq xmm0, mm2
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
.do_top:
PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5
lea r1, [r0+r3*2]
movq2dq xmm4, mm6
pslldq xmm4, 8
por xmm0, xmm4
movdqa xmm6, [pw_ff00]
movdqa xmm1, xmm0
lea r2, [r1+r3*2]
movdqa xmm2, xmm0
movdqa xmm3, xmm0
pslldq xmm0, 1
pslldq xmm1, 2
pavgb xmm2, xmm0
INIT_XMM cpuname
PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5
pandn xmm6, xmm4
movdqa xmm5, xmm4
psrlw xmm4, 8
packuswb xmm6, xmm4
movhlps xmm4, xmm6
movhps [r0+r3*2], xmm5
movhps [r0+r3*1], xmm2
psrldq xmm5, 4
movss xmm5, xmm6
psrldq xmm2, 4
movss xmm2, xmm4
lea r0, [r2+r3*2]
psrldq xmm5, 1
psrldq xmm2, 1
movq [r0+r3*2], xmm5
movq [r0+r3*1], xmm2
psrldq xmm5, 1
psrldq xmm2, 1
movq [r2+r3*2], xmm5
movq [r2+r3*1], xmm2
psrldq xmm5, 1
psrldq xmm2, 1
movq [r1+r3*2], xmm5
movq [r1+r3*1], xmm2
RET
%endmacro
 
INIT_MMX sse2
PRED8x8L_VERTICAL_RIGHT
INIT_MMX ssse3
PRED8x8L_VERTICAL_RIGHT
 
;-----------------------------------------------------------------------------
;void pred8x8l_vertical_left_8(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
 
%macro PRED8x8L_VERTICAL_LEFT 0
cglobal pred8x8l_vertical_left_8, 4,4
sub r0, r3
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
jmp .do_top
.fix_lt_2:
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .do_top
.fix_tr_1:
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.fix_tr_2:
punpckhbw mm3, mm3
pshufw mm1, mm3, 0xFF
jmp .do_topright
.do_top:
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
movq2dq xmm4, mm4
test r2, r2
jz .fix_tr_2
movq mm0, [r0+8]
movq mm5, mm0
movq mm2, mm0
movq mm4, mm0
psrlq mm5, 56
PALIGNR mm2, mm3, 7, mm3
PALIGNR mm5, mm4, 1, mm4
PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
movq2dq xmm3, mm1
lea r1, [r0+r3*2]
pslldq xmm3, 8
por xmm4, xmm3
movdqa xmm2, xmm4
movdqa xmm1, xmm4
movdqa xmm3, xmm4
psrldq xmm2, 1
pslldq xmm1, 1
pavgb xmm3, xmm2
lea r2, [r1+r3*2]
INIT_XMM cpuname
PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5
psrldq xmm0, 1
movq [r0+r3*1], xmm3
movq [r0+r3*2], xmm0
lea r0, [r2+r3*2]
psrldq xmm3, 1
psrldq xmm0, 1
movq [r1+r3*1], xmm3
movq [r1+r3*2], xmm0
psrldq xmm3, 1
psrldq xmm0, 1
movq [r2+r3*1], xmm3
movq [r2+r3*2], xmm0
psrldq xmm3, 1
psrldq xmm0, 1
movq [r0+r3*1], xmm3
movq [r0+r3*2], xmm0
RET
%endmacro
 
INIT_MMX sse2
PRED8x8L_VERTICAL_LEFT
INIT_MMX ssse3
PRED8x8L_VERTICAL_LEFT
 
;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
 
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_8, 4,4
sub r0, r3
lea r2, [r0+r3*2]
movq mm0, [r0+r3*1-8]
test r1, r1
lea r1, [r0+r3]
cmovnz r1, r0
punpckhbw mm0, [r1+r3*0-8]
movq mm1, [r2+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r2, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r1+r3*0-8]
mov r0, r2
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq mm4, mm0
movq mm7, mm2
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
psllq mm1, 56
PALIGNR mm7, mm1, 7, mm3
lea r1, [r0+r3*2]
pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1
psllq mm7, 56 ; l7 .. .. .. .. .. .. ..
movq mm2, mm0
psllw mm0, 8
psrlw mm2, 8
por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0
movq mm3, mm2
movq mm4, mm2
movq mm5, mm2
psrlq mm2, 8
psrlq mm3, 16
lea r2, [r1+r3*2]
por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1
punpckhbw mm7, mm7
por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2
pavgb mm4, mm2
PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6
movq mm5, mm4
punpcklbw mm4, mm1 ; p4 p3 p2 p1
punpckhbw mm5, mm1 ; p8 p7 p6 p5
movq mm6, mm5
movq mm7, mm5
movq mm0, mm5
PALIGNR mm5, mm4, 2, mm1
pshufw mm1, mm6, 11111001b
PALIGNR mm6, mm4, 4, mm2
pshufw mm2, mm7, 11111110b
PALIGNR mm7, mm4, 6, mm3
pshufw mm3, mm0, 11111111b
movq [r0+r3*1], mm4
movq [r0+r3*2], mm5
lea r0, [r2+r3*2]
movq [r1+r3*1], mm6
movq [r1+r3*2], mm7
movq [r2+r3*1], mm0
movq [r2+r3*2], mm1
movq [r0+r3*1], mm2
movq [r0+r3*2], mm3
RET
%endmacro
 
INIT_MMX mmxext
PRED8x8L_HORIZONTAL_UP
INIT_MMX ssse3
PRED8x8L_HORIZONTAL_UP
 
;-----------------------------------------------------------------------------
;void pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
 
INIT_MMX mmxext
cglobal pred8x8l_horizontal_down_8, 4,5
sub r0, r3
lea r4, [r0+r3*2]
movq mm0, [r0+r3*1-8]
punpckhbw mm0, [r0+r3*0-8]
movq mm1, [r4+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r4, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r4]
mov r0, r4
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
test r1, r1
jnz .do_left
.fix_lt_1:
movq mm5, mm3
pxor mm5, mm4
psrlq mm5, 56
psllq mm5, 48
pxor mm1, mm5
jmp .do_left
.fix_lt_2:
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .do_top
.fix_tr_1:
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.do_left:
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq mm4, mm0
movq mm7, mm2
movq mm6, mm2
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
psllq mm1, 56
PALIGNR mm7, mm1, 7, mm3
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
.do_top:
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
movq mm5, mm4
lea r1, [r0+r3*2]
psllq mm7, 56
movq mm2, mm5
movq mm3, mm6
movq mm4, mm2
PALIGNR mm2, mm6, 7, mm5
PALIGNR mm6, mm7, 7, mm0
lea r2, [r1+r3*2]
PALIGNR mm4, mm3, 1, mm7
movq mm5, mm3
pavgb mm3, mm6
PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7
movq mm4, mm2
movq mm1, mm2
lea r4, [r2+r3*2]
psrlq mm4, 16
psrlq mm1, 8
PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5
movq mm7, mm3
punpcklbw mm3, mm0
punpckhbw mm7, mm0
movq mm1, mm7
movq mm0, mm7
movq mm4, mm7
movq [r4+r3*2], mm3
PALIGNR mm7, mm3, 2, mm5
movq [r4+r3*1], mm7
PALIGNR mm1, mm3, 4, mm5
movq [r2+r3*2], mm1
PALIGNR mm0, mm3, 6, mm3
movq [r2+r3*1], mm0
movq mm2, mm6
movq mm3, mm6
movq [r1+r3*2], mm4
PALIGNR mm6, mm4, 2, mm5
movq [r1+r3*1], mm6
PALIGNR mm2, mm4, 4, mm5
movq [r0+r3*2], mm2
PALIGNR mm3, mm4, 6, mm4
movq [r0+r3*1], mm3
RET
 
%macro PRED8x8L_HORIZONTAL_DOWN 0
cglobal pred8x8l_horizontal_down_8, 4,5
sub r0, r3
lea r4, [r0+r3*2]
movq mm0, [r0+r3*1-8]
punpckhbw mm0, [r0+r3*0-8]
movq mm1, [r4+r3*1-8]
punpckhbw mm1, [r0+r3*2-8]
mov r4, r0
punpckhwd mm1, mm0
lea r0, [r0+r3*4]
movq mm2, [r0+r3*1-8]
punpckhbw mm2, [r0+r3*0-8]
lea r0, [r0+r3*2]
movq mm3, [r0+r3*1-8]
punpckhbw mm3, [r0+r3*0-8]
punpckhwd mm3, mm2
punpckhdq mm3, mm1
lea r0, [r0+r3*2]
movq mm0, [r0+r3*0-8]
movq mm1, [r4]
mov r0, r4
movq mm4, mm3
movq mm2, mm3
PALIGNR mm4, mm0, 7, mm0
PALIGNR mm1, mm2, 1, mm2
test r1, r1
jnz .do_left
.fix_lt_1:
movq mm5, mm3
pxor mm5, mm4
psrlq mm5, 56
psllq mm5, 48
pxor mm1, mm5
jmp .do_left
.fix_lt_2:
movq mm5, mm3
pxor mm5, mm2
psllq mm5, 56
psrlq mm5, 56
pxor mm2, mm5
test r2, r2
jnz .do_top
.fix_tr_1:
movq mm5, mm3
pxor mm5, mm1
psrlq mm5, 56
psllq mm5, 56
pxor mm1, mm5
jmp .do_top
.fix_tr_2:
punpckhbw mm3, mm3
pshufw mm1, mm3, 0xFF
jmp .do_topright
.do_left:
movq mm0, mm4
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5
movq2dq xmm0, mm2
pslldq xmm0, 8
movq mm4, mm0
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5
movq2dq xmm2, mm1
pslldq xmm2, 15
psrldq xmm2, 8
por xmm0, xmm2
movq mm0, [r0-8]
movq mm3, [r0]
movq mm1, [r0+8]
movq mm2, mm3
movq mm4, mm3
PALIGNR mm2, mm0, 7, mm0
PALIGNR mm1, mm4, 1, mm4
test r1, r1
jz .fix_lt_2
test r2, r2
jz .fix_tr_1
.do_top:
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5
movq2dq xmm1, mm4
test r2, r2
jz .fix_tr_2
movq mm0, [r0+8]
movq mm5, mm0
movq mm2, mm0
movq mm4, mm0
psrlq mm5, 56
PALIGNR mm2, mm3, 7, mm3
PALIGNR mm5, mm4, 1, mm4
PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4
.do_topright:
movq2dq xmm5, mm1
pslldq xmm5, 8
por xmm1, xmm5
INIT_XMM cpuname
lea r2, [r4+r3*2]
movdqa xmm2, xmm1
movdqa xmm3, xmm1
PALIGNR xmm1, xmm0, 7, xmm4
PALIGNR xmm2, xmm0, 9, xmm5
lea r1, [r2+r3*2]
PALIGNR xmm3, xmm0, 8, xmm0
movdqa xmm4, xmm1
pavgb xmm4, xmm3
lea r0, [r1+r3*2]
PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5
punpcklbw xmm4, xmm0
movhlps xmm0, xmm4
movq [r0+r3*2], xmm4
movq [r2+r3*2], xmm0
psrldq xmm4, 2
psrldq xmm0, 2
movq [r0+r3*1], xmm4
movq [r2+r3*1], xmm0
psrldq xmm4, 2
psrldq xmm0, 2
movq [r1+r3*2], xmm4
movq [r4+r3*2], xmm0
psrldq xmm4, 2
psrldq xmm0, 2
movq [r1+r3*1], xmm4
movq [r4+r3*1], xmm0
RET
%endmacro
 
INIT_MMX sse2
PRED8x8L_HORIZONTAL_DOWN
INIT_MMX ssse3
PRED8x8L_HORIZONTAL_DOWN
 
;-----------------------------------------------------------------------------
; void pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
 
INIT_MMX mmxext
cglobal pred4x4_dc_8, 3,5
pxor mm7, mm7
mov r4, r0
sub r0, r2
movd mm0, [r0]
psadbw mm0, mm7
movzx r1d, byte [r0+r2*1-1]
movd r3d, mm0
add r3d, r1d
movzx r1d, byte [r0+r2*2-1]
lea r0, [r0+r2*2]
add r3d, r1d
movzx r1d, byte [r0+r2*1-1]
add r3d, r1d
movzx r1d, byte [r0+r2*2-1]
add r3d, r1d
add r3d, 4
shr r3d, 3
imul r3d, 0x01010101
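; the imul above broadcasts the 8-bit DC value into all four bytes of r3d,
; so each 4-pixel row below is written with one 32-bit store instead of four
; byte stores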
mov [r4+r2*0], r3d
mov [r0+r2*0], r3d
mov [r0+r2*1], r3d
mov [r0+r2*2], r3d
RET
 
;-----------------------------------------------------------------------------
; void pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
 
%macro PRED4x4_TM 0
cglobal pred4x4_tm_vp8_8, 3,6
sub r0, r2
pxor mm7, mm7
movd mm0, [r0]
punpcklbw mm0, mm7
movzx r4d, byte [r0-1]
mov r5d, 2
.loop:
movzx r1d, byte [r0+r2*1-1]
movzx r3d, byte [r0+r2*2-1]
sub r1d, r4d
sub r3d, r4d
movd mm2, r1d
movd mm4, r3d
%if cpuflag(mmxext)
pshufw mm2, mm2, 0
pshufw mm4, mm4, 0
%else
punpcklwd mm2, mm2
punpcklwd mm4, mm4
punpckldq mm2, mm2
punpckldq mm4, mm4
%endif
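; both paths above broadcast the low 16-bit difference to all four words:
; MMXEXT does it with a single pshufw, plain MMX needs the
; punpcklwd+punpckldq pair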
paddw mm2, mm0
paddw mm4, mm0
packuswb mm2, mm2
packuswb mm4, mm4
movd [r0+r2*1], mm2
movd [r0+r2*2], mm4
lea r0, [r0+r2*2]
dec r5d
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmx
PRED4x4_TM
INIT_MMX mmxext
PRED4x4_TM
 
INIT_XMM ssse3
cglobal pred4x4_tm_vp8_8, 3,3
sub r0, r2
movq mm6, [tm_shuf]
pxor mm1, mm1
movd mm0, [r0]
punpcklbw mm0, mm1
movd mm7, [r0-4]
pshufb mm7, mm6
lea r1, [r0+r2*2]
movd mm2, [r0+r2*1-4]
movd mm3, [r0+r2*2-4]
movd mm4, [r1+r2*1-4]
movd mm5, [r1+r2*2-4]
pshufb mm2, mm6
pshufb mm3, mm6
pshufb mm4, mm6
pshufb mm5, mm6
psubw mm2, mm7
psubw mm3, mm7
psubw mm4, mm7
psubw mm5, mm7
paddw mm2, mm0
paddw mm3, mm0
paddw mm4, mm0
paddw mm5, mm0
packuswb mm2, mm2
packuswb mm3, mm3
packuswb mm4, mm4
packuswb mm5, mm5
movd [r0+r2*1], mm2
movd [r0+r2*2], mm3
movd [r1+r2*1], mm4
movd [r1+r2*2], mm5
RET
 
;-----------------------------------------------------------------------------
; void pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
 
INIT_MMX mmxext
cglobal pred4x4_vertical_vp8_8, 3,3
sub r0, r2
movd m1, [r0-1]
movd m0, [r0]
mova m2, m0 ;t0 t1 t2 t3
punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7
lea r1, [r0+r2*2]
psrlq m0, 8 ;t1 t2 t3 t4
PRED4x4_LOWPASS m3, m1, m0, m2, m4
movd [r0+r2*1], m3
movd [r0+r2*2], m3
movd [r1+r2*1], m3
movd [r1+r2*2], m3
RET
 
;-----------------------------------------------------------------------------
; void pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred4x4_down_left_8, 3,3
sub r0, r2
movq m1, [r0]
punpckldq m1, [r1]
movq m2, m1
movq m3, m1
psllq m1, 8
pxor m2, m1
psrlq m2, 8
pxor m2, m3
PRED4x4_LOWPASS m0, m1, m2, m3, m4
lea r1, [r0+r2*2]
psrlq m0, 8
movd [r0+r2*1], m0
psrlq m0, 8
movd [r0+r2*2], m0
psrlq m0, 8
movd [r1+r2*1], m0
psrlq m0, 8
movd [r1+r2*2], m0
RET
 
;-----------------------------------------------------------------------------
; void pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
 
INIT_MMX mmxext
cglobal pred4x4_vertical_left_8, 3,3
sub r0, r2
movq m1, [r0]
punpckldq m1, [r1]
movq m3, m1
movq m2, m1
psrlq m3, 8
psrlq m2, 16
movq m4, m3
pavgb m4, m1
PRED4x4_LOWPASS m0, m1, m2, m3, m5
lea r1, [r0+r2*2]
movh [r0+r2*1], m4
movh [r0+r2*2], m0
psrlq m4, 8
psrlq m0, 8
movh [r1+r2*1], m4
movh [r1+r2*2], m0
RET
 
;-----------------------------------------------------------------------------
; void pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
 
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_8, 3,3
sub r0, r2
lea r1, [r0+r2*2]
movd m0, [r0+r2*1-4]
punpcklbw m0, [r0+r2*2-4]
movd m1, [r1+r2*1-4]
punpcklbw m1, [r1+r2*2-4]
punpckhwd m0, m1
movq m1, m0
punpckhbw m1, m1
pshufw m1, m1, 0xFF
punpckhdq m0, m1
movq m2, m0
movq m3, m0
movq m7, m0
psrlq m2, 16
psrlq m3, 8
pavgb m7, m3
PRED4x4_LOWPASS m4, m0, m2, m3, m5
punpcklbw m7, m4
movd [r0+r2*1], m7
psrlq m7, 16
movd [r0+r2*2], m7
psrlq m7, 16
movd [r1+r2*1], m7
movd [r1+r2*2], m1
RET
 
;-----------------------------------------------------------------------------
; void pred4x4_horizontal_down_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
 
INIT_MMX mmxext
cglobal pred4x4_horizontal_down_8, 3,3
sub r0, r2
lea r1, [r0+r2*2]
movh m0, [r0-4] ; lt ..
punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. ..
psllq m0, 8 ; t2 t1 t0 lt .. .. .. ..
movd m1, [r1+r2*2-4] ; l3
punpcklbw m1, [r1+r2*1-4] ; l2 l3
movd m2, [r0+r2*2-4] ; l1
punpcklbw m2, [r0+r2*1-4] ; l0 l1
punpckhwd m1, m2 ; l0 l1 l2 l3
punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
movq m0, m1
movq m2, m1
movq m5, m1
psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1
psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2
pavgb m5, m2
PRED4x4_LOWPASS m3, m1, m0, m2, m4
punpcklbw m5, m3
psrlq m3, 32
PALIGNR m3, m5, 6, m4
movh [r1+r2*2], m5
psrlq m5, 16
movh [r1+r2*1], m5
psrlq m5, 16
movh [r0+r2*2], m5
movh [r0+r2*1], m3
RET
 
;-----------------------------------------------------------------------------
; void pred4x4_vertical_right_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
 
INIT_MMX mmxext
cglobal pred4x4_vertical_right_8, 3,3
sub r0, r2
lea r1, [r0+r2*2]
movh m0, [r0] ; ........t3t2t1t0
movq m5, m0
PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt
pavgb m5, m0
PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0
movq m1, m0
PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1
movq m2, m0
PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2
PRED4x4_LOWPASS m3, m1, m0, m2, m4
movq m1, m3
psrlq m3, 16
psllq m1, 48
movh [r0+r2*1], m5
movh [r0+r2*2], m3
PALIGNR m5, m1, 7, m2
psllq m1, 8
movh [r1+r2*1], m5
PALIGNR m3, m1, 7, m1
movh [r1+r2*2], m3
RET
 
;-----------------------------------------------------------------------------
; void pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright, int stride)
;-----------------------------------------------------------------------------
 
INIT_MMX mmxext
cglobal pred4x4_down_right_8, 3,3
sub r0, r2
lea r1, [r0+r2*2]
movq m1, [r1-8]
movq m2, [r0+r2*1-8]
punpckhbw m2, [r0-8]
movh m3, [r0]
punpckhwd m1, m2
PALIGNR m3, m1, 5, m1
movq m1, m3
PALIGNR m3, [r1+r2*1-8], 7, m4
movq m2, m3
PALIGNR m3, [r1+r2*2-8], 7, m4
PRED4x4_LOWPASS m0, m3, m1, m2, m4
movh [r1+r2*2], m0
psrlq m0, 8
movh [r1+r2*1], m0
psrlq m0, 8
movh [r0+r2*2], m0
psrlq m0, 8
movh [r0+r2*1], m0
RET
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm
0,0 → 1,1199
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
cextern pw_16
cextern pw_8
cextern pw_4
cextern pw_2
cextern pw_1
 
pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4
pw_m3: times 8 dw -3
pw_pixel_max: times 8 dw ((1 << 10)-1)
pw_512: times 8 dw 512
pd_17: times 4 dd 17
pd_16: times 4 dd 16
 
SECTION .text
 
; dest, left, right, src
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2
%macro PRED4x4_LOWPASS 4
paddw %2, %3
psrlw %2, 1
pavgw %1, %4, %2
%endmacro
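; rounding note: paddw+psrlw computes (left+right)>>1 and pavgw then adds src
; with +1 rounding, i.e. (((left+right)>>1) + src + 1) >> 1, which matches
; (left + 2*src + right + 2) >> 2 exactly for word inputs; e.g. left=1, src=2,
; right=0 gives ((1+0)>>1)=0, then (0+2+1)>>1 = 1, same as (1+4+0+2)>>2 = 1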
 
;-----------------------------------------------------------------------------
; void pred4x4_down_right(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_DR 0
cglobal pred4x4_down_right_10, 3, 3
sub r0, r2
lea r1, [r0+r2*2]
movhps m1, [r1-8]
movhps m2, [r0+r2*1-8]
movhps m4, [r0-8]
punpckhwd m2, m4
movq m3, [r0]
punpckhdq m1, m2
PALIGNR m3, m1, 10, m1
movhps m4, [r1+r2*1-8]
PALIGNR m0, m3, m4, 14, m4
movhps m4, [r1+r2*2-8]
PALIGNR m2, m0, m4, 14, m4
PRED4x4_LOWPASS m0, m2, m3, m0
movq [r1+r2*2], m0
psrldq m0, 2
movq [r1+r2*1], m0
psrldq m0, 2
movq [r0+r2*2], m0
psrldq m0, 2
movq [r0+r2*1], m0
RET
%endmacro
 
INIT_XMM sse2
PRED4x4_DR
INIT_XMM ssse3
PRED4x4_DR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DR
%endif
 
;-----------------------------------------------------------------------------
; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_VR 0
cglobal pred4x4_vertical_right_10, 3, 3, 6
sub r0, r2
lea r1, [r0+r2*2]
movq m5, [r0] ; ........t3t2t1t0
movhps m1, [r0-8]
PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt
pavgw m5, m0
movhps m1, [r0+r2*1-8]
PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0
movhps m2, [r0+r2*2-8]
PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1
movhps m3, [r1+r2*1-8]
PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2
PRED4x4_LOWPASS m1, m0, m2, m1
pslldq m0, m1, 12
psrldq m1, 4
movq [r0+r2*1], m5
movq [r0+r2*2], m1
PALIGNR m5, m0, 14, m2
pslldq m0, 2
movq [r1+r2*1], m5
PALIGNR m1, m0, 14, m0
movq [r1+r2*2], m1
RET
%endmacro
 
INIT_XMM sse2
PRED4x4_VR
INIT_XMM ssse3
PRED4x4_VR
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VR
%endif
 
;-----------------------------------------------------------------------------
; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_HD 0
cglobal pred4x4_horizontal_down_10, 3, 3
sub r0, r2
lea r1, [r0+r2*2]
movq m0, [r0-8] ; lt ..
movhps m0, [r0]
pslldq m0, 2 ; t2 t1 t0 lt .. .. .. ..
movq m1, [r1+r2*2-8] ; l3
movq m3, [r1+r2*1-8]
punpcklwd m1, m3 ; l2 l3
movq m2, [r0+r2*2-8] ; l1
movq m3, [r0+r2*1-8]
punpcklwd m2, m3 ; l0 l1
punpckhdq m1, m2 ; l0 l1 l2 l3
punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3
psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1
psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2
pavgw m5, m1, m3
PRED4x4_LOWPASS m3, m1, m0, m3
punpcklwd m5, m3
psrldq m3, 8
PALIGNR m3, m5, 12, m4
movq [r1+r2*2], m5
movhps [r0+r2*2], m5
psrldq m5, 4
movq [r1+r2*1], m5
movq [r0+r2*1], m3
RET
%endmacro
 
INIT_XMM sse2
PRED4x4_HD
INIT_XMM ssse3
PRED4x4_HD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_HD
%endif
 
;-----------------------------------------------------------------------------
; void pred4x4_dc(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro HADDD 2 ; sum junk
%if mmsize == 16
movhlps %2, %1
paddd %1, %2
pshuflw %2, %1, 0xE
paddd %1, %2
%else
pshufw %2, %1, 0xE
paddd %1, %2
%endif
%endmacro
 
%macro HADDW 2
pmaddwd %1, [pw_1]
HADDD %1, %2
%endmacro
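; HADDW reduces 4 (mmx) or 8 (xmm) words to a single sum: pmaddwd against pw_1
; adds neighbouring words into dwords, then HADDD folds the upper half onto the
; lower (movhlps / pshufw) until the total ends up in the low dword of %1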
 
INIT_MMX mmxext
cglobal pred4x4_dc_10, 3, 3
sub r0, r2
lea r1, [r0+r2*2]
movq m2, [r0+r2*1-8]
paddw m2, [r0+r2*2-8]
paddw m2, [r1+r2*1-8]
paddw m2, [r1+r2*2-8]
psrlq m2, 48
movq m0, [r0]
HADDW m0, m1
paddw m0, [pw_4]
paddw m0, m2
psrlw m0, 3
SPLATW m0, m0, 0
movq [r0+r2*1], m0
movq [r0+r2*2], m0
movq [r1+r2*1], m0
movq [r1+r2*2], m0
RET
 
;-----------------------------------------------------------------------------
; void pred4x4_down_left(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_DL 0
cglobal pred4x4_down_left_10, 3, 3
sub r0, r2
movq m0, [r0]
movhps m0, [r1]
psrldq m2, m0, 2
pslldq m3, m0, 2
pshufhw m2, m2, 10100100b
PRED4x4_LOWPASS m0, m3, m2, m0
lea r1, [r0+r2*2]
movhps [r1+r2*2], m0
psrldq m0, 2
movq [r0+r2*1], m0
psrldq m0, 2
movq [r0+r2*2], m0
psrldq m0, 2
movq [r1+r2*1], m0
RET
%endmacro
 
INIT_XMM sse2
PRED4x4_DL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_DL
%endif
 
;-----------------------------------------------------------------------------
; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED4x4_VL 0
cglobal pred4x4_vertical_left_10, 3, 3
sub r0, r2
movu m1, [r0]
movhps m1, [r1]
psrldq m0, m1, 2
psrldq m2, m1, 4
pavgw m4, m0, m1
PRED4x4_LOWPASS m0, m1, m2, m0
lea r1, [r0+r2*2]
movq [r0+r2*1], m4
movq [r0+r2*2], m0
psrldq m4, 2
psrldq m0, 2
movq [r1+r2*1], m4
movq [r1+r2*2], m0
RET
%endmacro
 
INIT_XMM sse2
PRED4x4_VL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED4x4_VL
%endif
 
;-----------------------------------------------------------------------------
; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride)
;-----------------------------------------------------------------------------
INIT_MMX mmxext
cglobal pred4x4_horizontal_up_10, 3, 3
sub r0, r2
lea r1, [r0+r2*2]
movq m0, [r0+r2*1-8]
punpckhwd m0, [r0+r2*2-8]
movq m1, [r1+r2*1-8]
punpckhwd m1, [r1+r2*2-8]
punpckhdq m0, m1
pshufw m1, m1, 0xFF
movq [r1+r2*2], m1
movd [r1+r2*1+4], m1
pshufw m2, m0, 11111001b
movq m1, m2
pavgw m2, m0
 
pshufw m5, m0, 11111110b
PRED4x4_LOWPASS m1, m0, m5, m1
movq m6, m2
punpcklwd m6, m1
movq [r0+r2*1], m6
psrlq m2, 16
psrlq m1, 16
punpcklwd m2, m1
movq [r0+r2*2], m2
psrlq m2, 32
movd [r1+r2*1], m2
RET
 
 
 
;-----------------------------------------------------------------------------
; void pred8x8_vertical(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_vertical_10, 2, 2
sub r0, r1
mova m0, [r0]
%rep 3
mova [r0+r1*1], m0
mova [r0+r1*2], m0
lea r0, [r0+r1*2]
%endrep
mova [r0+r1*1], m0
mova [r0+r1*2], m0
RET
 
;-----------------------------------------------------------------------------
; void pred8x8_horizontal(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_horizontal_10, 2, 3
mov r2d, 4
.loop:
movq m0, [r0+r1*0-8]
movq m1, [r0+r1*1-8]
pshuflw m0, m0, 0xff
pshuflw m1, m1, 0xff
punpcklqdq m0, m0
punpcklqdq m1, m1
mova [r0+r1*0], m0
mova [r0+r1*1], m1
lea r0, [r0+r1*2]
dec r2d
jg .loop
REP_RET
 
;-----------------------------------------------------------------------------
; void pred8x8_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro MOV8 2-3
; sort of a hack, but it works
%if mmsize==8
movq [%1+0], %2
movq [%1+8], %3
%else
movdqa [%1], %2
%endif
%endmacro
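; at 10 bits per pixel a row of 8 pixels is 16 bytes, so the MMX version of
; MOV8 needs two movq stores (and two source registers) while the SSE2 version
; is a single 16-byte store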
 
%macro PRED8x8_DC 1
cglobal pred8x8_dc_10, 2, 6
sub r0, r1
pxor m4, m4
movq m0, [r0+0]
movq m1, [r0+8]
%if mmsize==16
punpcklwd m0, m1
movhlps m1, m0
paddw m0, m1
%else
pshufw m2, m0, 00001110b
pshufw m3, m1, 00001110b
paddw m0, m2
paddw m1, m3
punpcklwd m0, m1
%endif
%1 m2, m0, 00001110b
paddw m0, m2
 
lea r5, [r1*3]
lea r4, [r0+r1*4]
movzx r2d, word [r0+r1*1-2]
movzx r3d, word [r0+r1*2-2]
add r2d, r3d
movzx r3d, word [r0+r5*1-2]
add r2d, r3d
movzx r3d, word [r4-2]
add r2d, r3d
movd m2, r2d ; s2
 
movzx r2d, word [r4+r1*1-2]
movzx r3d, word [r4+r1*2-2]
add r2d, r3d
movzx r3d, word [r4+r5*1-2]
add r2d, r3d
movzx r3d, word [r4+r1*4-2]
add r2d, r3d
movd m3, r2d ; s3
 
punpcklwd m2, m3
punpckldq m0, m2 ; s0, s1, s2, s3
%1 m3, m0, 11110110b ; s2, s1, s3, s3
%1 m0, m0, 01110100b ; s0, s1, s3, s1
paddw m0, m3
psrlw m0, 2
pavgw m0, m4 ; s0+s2, s1, s3, s1+s3
%if mmsize==16
punpcklwd m0, m0
pshufd m3, m0, 11111010b
punpckldq m0, m0
SWAP 0,1
%else
pshufw m1, m0, 0x00
pshufw m2, m0, 0x55
pshufw m3, m0, 0xaa
pshufw m4, m0, 0xff
%endif
MOV8 r0+r1*1, m1, m2
MOV8 r0+r1*2, m1, m2
MOV8 r0+r5*1, m1, m2
MOV8 r0+r1*4, m1, m2
MOV8 r4+r1*1, m3, m4
MOV8 r4+r1*2, m3, m4
MOV8 r4+r5*1, m3, m4
MOV8 r4+r1*4, m3, m4
RET
%endmacro
 
INIT_MMX mmxext
PRED8x8_DC pshufw
INIT_XMM sse2
PRED8x8_DC pshuflw
 
;-----------------------------------------------------------------------------
; void pred8x8_top_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_top_dc_10, 2, 4
sub r0, r1
mova m0, [r0]
pshuflw m1, m0, 0x4e
pshufhw m1, m1, 0x4e
paddw m0, m1
pshuflw m1, m0, 0xb1
pshufhw m1, m1, 0xb1
paddw m0, m1
lea r2, [r1*3]
lea r3, [r0+r1*4]
paddw m0, [pw_2]
psrlw m0, 2
mova [r0+r1*1], m0
mova [r0+r1*2], m0
mova [r0+r2*1], m0
mova [r0+r1*4], m0
mova [r3+r1*1], m0
mova [r3+r1*2], m0
mova [r3+r2*1], m0
mova [r3+r1*4], m0
RET
 
;-----------------------------------------------------------------------------
; void pred8x8_plane(pixel *src, int stride)
;-----------------------------------------------------------------------------
INIT_XMM sse2
cglobal pred8x8_plane_10, 2, 7, 7
sub r0, r1
lea r2, [r1*3]
lea r3, [r0+r1*4]
mova m2, [r0]
pmaddwd m2, [pw_m32101234]
HADDD m2, m1
movd m0, [r0-4]
psrld m0, 14
psubw m2, m0 ; H
movd m0, [r3+r1*4-4]
movd m1, [r0+12]
paddw m0, m1
psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7])
movzx r4d, word [r3+r1*1-2] ; src[4*stride-1]
movzx r5d, word [r0+r2*1-2] ; src[2*stride-1]
sub r4d, r5d
movzx r6d, word [r3+r1*2-2] ; src[5*stride-1]
movzx r5d, word [r0+r1*2-2] ; src[1*stride-1]
sub r6d, r5d
lea r4d, [r4+r6*2]
movzx r5d, word [r3+r2*1-2] ; src[6*stride-1]
movzx r6d, word [r0+r1*1-2] ; src[0*stride-1]
sub r5d, r6d
lea r5d, [r5*3]
add r4d, r5d
movzx r6d, word [r3+r1*4-2] ; src[7*stride-1]
movzx r5d, word [r0+r1*0-2] ; src[ -stride-1]
sub r6d, r5d
lea r4d, [r4+r6*4]
movd m3, r4d ; V
punpckldq m2, m3
pmaddwd m2, [pd_17]
paddd m2, [pd_16]
psrad m2, 5 ; b, c
 
mova m3, [pw_pixel_max]
pxor m1, m1
SPLATW m0, m0, 1
SPLATW m4, m2, 2
SPLATW m2, m2, 0
pmullw m2, [pw_m32101234] ; b
pmullw m5, m4, [pw_m3] ; c
paddw m5, [pw_16]
mov r2d, 8
add r0, r1
.loop:
paddsw m6, m2, m5
paddsw m6, m0
psraw m6, 5
CLIPW m6, m1, m3
mova [r0], m6
paddw m5, m4
add r0, r1
dec r2d
jg .loop
REP_RET
 
 
;-----------------------------------------------------------------------------
; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_128_DC 0
cglobal pred8x8l_128_dc_10, 4, 4
mova m0, [pw_512] ; (1<<(BIT_DEPTH-1))
lea r1, [r3*3]
lea r2, [r0+r3*4]
MOV8 r0+r3*0, m0, m0
MOV8 r0+r3*1, m0, m0
MOV8 r0+r3*2, m0, m0
MOV8 r0+r1*1, m0, m0
MOV8 r2+r3*0, m0, m0
MOV8 r2+r3*1, m0, m0
MOV8 r2+r3*2, m0, m0
MOV8 r2+r1*1, m0, m0
RET
%endmacro
 
INIT_MMX mmxext
PRED8x8L_128_DC
INIT_XMM sse2
PRED8x8L_128_DC
 
;-----------------------------------------------------------------------------
; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_TOP_DC 0
cglobal pred8x8l_top_dc_10, 4, 4, 6
sub r0, r3
mova m0, [r0]
shr r1d, 14
shr r2d, 13
neg r1
pslldq m1, m0, 2
psrldq m2, m0, 2
pinsrw m1, [r0+r1], 0
pinsrw m2, [r0+r2+14], 7
lea r1, [r3*3]
lea r2, [r0+r3*4]
PRED4x4_LOWPASS m0, m2, m1, m0
HADDW m0, m1
paddw m0, [pw_4]
psrlw m0, 3
SPLATW m0, m0, 0
mova [r0+r3*1], m0
mova [r0+r3*2], m0
mova [r0+r1*1], m0
mova [r0+r3*4], m0
mova [r2+r3*1], m0
mova [r2+r3*2], m0
mova [r2+r1*1], m0
mova [r2+r3*4], m0
RET
%endmacro
 
INIT_XMM sse2
PRED8x8L_TOP_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_TOP_DC
%endif
 
;-----------------------------------------------------------------------------
; void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
; TODO: see if scalar is faster
%macro PRED8x8L_DC 0
cglobal pred8x8l_dc_10, 4, 6, 6
sub r0, r3
lea r4, [r0+r3*4]
lea r5, [r3*3]
mova m0, [r0+r3*2-16]
punpckhwd m0, [r0+r3*1-16]
mova m1, [r4+r3*0-16]
punpckhwd m1, [r0+r5*1-16]
punpckhdq m1, m0
mova m2, [r4+r3*2-16]
punpckhwd m2, [r4+r3*1-16]
mova m3, [r4+r3*4-16]
punpckhwd m3, [r4+r5*1-16]
punpckhdq m3, m2
punpckhqdq m3, m1
mova m0, [r0]
shr r1d, 14
shr r2d, 13
neg r1
pslldq m1, m0, 2
psrldq m2, m0, 2
pinsrw m1, [r0+r1], 0
pinsrw m2, [r0+r2+14], 7
not r1
and r1, r3
pslldq m4, m3, 2
psrldq m5, m3, 2
pshuflw m4, m4, 11100101b
pinsrw m5, [r0+r1-2], 7
PRED4x4_LOWPASS m3, m4, m5, m3
PRED4x4_LOWPASS m0, m2, m1, m0
paddw m0, m3
HADDW m0, m1
paddw m0, [pw_8]
psrlw m0, 4
SPLATW m0, m0
mova [r0+r3*1], m0
mova [r0+r3*2], m0
mova [r0+r5*1], m0
mova [r0+r3*4], m0
mova [r4+r3*1], m0
mova [r4+r3*2], m0
mova [r4+r5*1], m0
mova [r4+r3*4], m0
RET
%endmacro
 
INIT_XMM sse2
PRED8x8L_DC
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DC
%endif
 
;-----------------------------------------------------------------------------
; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL 0
cglobal pred8x8l_vertical_10, 4, 4, 6
sub r0, r3
mova m0, [r0]
shr r1d, 14
shr r2d, 13
neg r1
pslldq m1, m0, 2
psrldq m2, m0, 2
pinsrw m1, [r0+r1], 0
pinsrw m2, [r0+r2+14], 7
lea r1, [r3*3]
lea r2, [r0+r3*4]
PRED4x4_LOWPASS m0, m2, m1, m0
mova [r0+r3*1], m0
mova [r0+r3*2], m0
mova [r0+r1*1], m0
mova [r0+r3*4], m0
mova [r2+r3*1], m0
mova [r2+r3*2], m0
mova [r2+r1*1], m0
mova [r2+r3*4], m0
RET
%endmacro
 
INIT_XMM sse2
PRED8x8L_VERTICAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL
%endif
 
;-----------------------------------------------------------------------------
; void pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL 0
cglobal pred8x8l_horizontal_10, 4, 4, 5
mova m0, [r0-16]
shr r1d, 14
dec r1
and r1, r3
sub r1, r3
punpckhwd m0, [r0+r1-16]
mova m1, [r0+r3*2-16]
punpckhwd m1, [r0+r3*1-16]
lea r2, [r0+r3*4]
lea r1, [r3*3]
punpckhdq m1, m0
mova m2, [r2+r3*0-16]
punpckhwd m2, [r0+r1-16]
mova m3, [r2+r3*2-16]
punpckhwd m3, [r2+r3*1-16]
punpckhdq m3, m2
punpckhqdq m3, m1
PALIGNR m4, m3, [r2+r1-16], 14, m0
pslldq m0, m4, 2
pshuflw m0, m0, 11100101b
PRED4x4_LOWPASS m4, m3, m0, m4
punpckhwd m3, m4, m4
punpcklwd m4, m4
pshufd m0, m3, 0xff
pshufd m1, m3, 0xaa
pshufd m2, m3, 0x55
pshufd m3, m3, 0x00
mova [r0+r3*0], m0
mova [r0+r3*1], m1
mova [r0+r3*2], m2
mova [r0+r1*1], m3
pshufd m0, m4, 0xff
pshufd m1, m4, 0xaa
pshufd m2, m4, 0x55
pshufd m3, m4, 0x00
mova [r2+r3*0], m0
mova [r2+r3*1], m1
mova [r2+r3*2], m2
mova [r2+r1*1], m3
RET
%endmacro
 
INIT_XMM sse2
PRED8x8L_HORIZONTAL
INIT_XMM ssse3
PRED8x8L_HORIZONTAL
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL
%endif
 
;-----------------------------------------------------------------------------
; void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_LEFT 0
cglobal pred8x8l_down_left_10, 4, 4, 7
sub r0, r3
mova m3, [r0]
shr r1d, 14
neg r1
shr r2d, 13
pslldq m1, m3, 2
psrldq m2, m3, 2
pinsrw m1, [r0+r1], 0
pinsrw m2, [r0+r2+14], 7
PRED4x4_LOWPASS m6, m2, m1, m3
jz .fix_tr ; flags from shr r2d
mova m1, [r0+16]
psrldq m5, m1, 2
PALIGNR m2, m1, m3, 14, m3
pshufhw m5, m5, 10100100b
PRED4x4_LOWPASS m1, m2, m5, m1
.do_topright:
lea r1, [r3*3]
psrldq m5, m1, 14
lea r2, [r0+r3*4]
PALIGNR m2, m1, m6, 2, m0
PALIGNR m3, m1, m6, 14, m0
PALIGNR m5, m1, 2, m0
pslldq m4, m6, 2
PRED4x4_LOWPASS m6, m4, m2, m6
PRED4x4_LOWPASS m1, m3, m5, m1
mova [r2+r3*4], m1
PALIGNR m1, m6, 14, m2
pslldq m6, 2
mova [r2+r1*1], m1
PALIGNR m1, m6, 14, m2
pslldq m6, 2
mova [r2+r3*2], m1
PALIGNR m1, m6, 14, m2
pslldq m6, 2
mova [r2+r3*1], m1
PALIGNR m1, m6, 14, m2
pslldq m6, 2
mova [r0+r3*4], m1
PALIGNR m1, m6, 14, m2
pslldq m6, 2
mova [r0+r1*1], m1
PALIGNR m1, m6, 14, m2
pslldq m6, 2
mova [r0+r3*2], m1
PALIGNR m1, m6, 14, m6
mova [r0+r3*1], m1
RET
.fix_tr:
punpckhwd m3, m3
pshufd m1, m3, 0xFF
jmp .do_topright
%endmacro
 
INIT_XMM sse2
PRED8x8L_DOWN_LEFT
INIT_XMM ssse3
PRED8x8L_DOWN_LEFT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_LEFT
%endif
 
;-----------------------------------------------------------------------------
; void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_DOWN_RIGHT 0
; the standard forbids this mode when has_topleft is false, so no availability check is needed
cglobal pred8x8l_down_right_10, 4, 5, 8
sub r0, r3
lea r4, [r0+r3*4]
lea r1, [r3*3]
mova m0, [r0+r3*1-16]
punpckhwd m0, [r0+r3*0-16]
mova m1, [r0+r1*1-16]
punpckhwd m1, [r0+r3*2-16]
punpckhdq m1, m0
mova m2, [r4+r3*1-16]
punpckhwd m2, [r4+r3*0-16]
mova m3, [r4+r1*1-16]
punpckhwd m3, [r4+r3*2-16]
punpckhdq m3, m2
punpckhqdq m3, m1
mova m0, [r4+r3*4-16]
mova m1, [r0]
PALIGNR m4, m3, m0, 14, m0
PALIGNR m1, m3, 2, m2
pslldq m0, m4, 2
pshuflw m0, m0, 11100101b
PRED4x4_LOWPASS m6, m1, m4, m3
PRED4x4_LOWPASS m4, m3, m0, m4
mova m3, [r0]
shr r2d, 13
pslldq m1, m3, 2
psrldq m2, m3, 2
pinsrw m1, [r0-2], 0
pinsrw m2, [r0+r2+14], 7
PRED4x4_LOWPASS m3, m2, m1, m3
PALIGNR m2, m3, m6, 2, m0
PALIGNR m5, m3, m6, 14, m0
psrldq m7, m3, 2
PRED4x4_LOWPASS m6, m4, m2, m6
PRED4x4_LOWPASS m3, m5, m7, m3
mova [r4+r3*4], m6
PALIGNR m3, m6, 14, m2
pslldq m6, 2
mova [r0+r3*1], m3
PALIGNR m3, m6, 14, m2
pslldq m6, 2
mova [r0+r3*2], m3
PALIGNR m3, m6, 14, m2
pslldq m6, 2
mova [r0+r1*1], m3
PALIGNR m3, m6, 14, m2
pslldq m6, 2
mova [r0+r3*4], m3
PALIGNR m3, m6, 14, m2
pslldq m6, 2
mova [r4+r3*1], m3
PALIGNR m3, m6, 14, m2
pslldq m6, 2
mova [r4+r3*2], m3
PALIGNR m3, m6, 14, m6
mova [r4+r1*1], m3
RET
%endmacro
 
INIT_XMM sse2
PRED8x8L_DOWN_RIGHT
INIT_XMM ssse3
PRED8x8L_DOWN_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_DOWN_RIGHT
%endif
 
;-----------------------------------------------------------------------------
; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_VERTICAL_RIGHT 0
; as with pred8x8l_down_right: has_topleft is always set for this mode, so it is not checked
cglobal pred8x8l_vertical_right_10, 4, 5, 7
sub r0, r3
lea r4, [r0+r3*4]
lea r1, [r3*3]
mova m0, [r0+r3*1-16]
punpckhwd m0, [r0+r3*0-16]
mova m1, [r0+r1*1-16]
punpckhwd m1, [r0+r3*2-16]
punpckhdq m1, m0
mova m2, [r4+r3*1-16]
punpckhwd m2, [r4+r3*0-16]
mova m3, [r4+r1*1-16]
punpckhwd m3, [r4+r3*2-16]
punpckhdq m3, m2
punpckhqdq m3, m1
mova m0, [r4+r3*4-16]
mova m1, [r0]
PALIGNR m4, m3, m0, 14, m0
PALIGNR m1, m3, 2, m2
PRED4x4_LOWPASS m3, m1, m4, m3
mova m2, [r0]
shr r2d, 13
pslldq m1, m2, 2
psrldq m5, m2, 2
pinsrw m1, [r0-2], 0
pinsrw m5, [r0+r2+14], 7
PRED4x4_LOWPASS m2, m5, m1, m2
PALIGNR m6, m2, m3, 12, m1
PALIGNR m5, m2, m3, 14, m0
PRED4x4_LOWPASS m0, m6, m2, m5
pavgw m2, m5
mova [r0+r3*2], m0
mova [r0+r3*1], m2
pslldq m6, m3, 4
pslldq m1, m3, 2
PRED4x4_LOWPASS m1, m3, m6, m1
PALIGNR m2, m1, 14, m4
mova [r0+r1*1], m2
pslldq m1, 2
PALIGNR m0, m1, 14, m3
mova [r0+r3*4], m0
pslldq m1, 2
PALIGNR m2, m1, 14, m4
mova [r4+r3*1], m2
pslldq m1, 2
PALIGNR m0, m1, 14, m3
mova [r4+r3*2], m0
pslldq m1, 2
PALIGNR m2, m1, 14, m4
mova [r4+r1*1], m2
pslldq m1, 2
PALIGNR m0, m1, 14, m1
mova [r4+r3*4], m0
RET
%endmacro
 
INIT_XMM sse2
PRED8x8L_VERTICAL_RIGHT
INIT_XMM ssse3
PRED8x8L_VERTICAL_RIGHT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_VERTICAL_RIGHT
%endif
 
;-----------------------------------------------------------------------------
; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride)
;-----------------------------------------------------------------------------
%macro PRED8x8L_HORIZONTAL_UP 0
cglobal pred8x8l_horizontal_up_10, 4, 4, 6
mova m0, [r0+r3*0-16]
punpckhwd m0, [r0+r3*1-16]
shr r1d, 14
dec r1
and r1, r3
sub r1, r3
mova m4, [r0+r1*1-16]
lea r1, [r3*3]
lea r2, [r0+r3*4]
mova m1, [r0+r3*2-16]
punpckhwd m1, [r0+r1*1-16]
punpckhdq m0, m1
mova m2, [r2+r3*0-16]
punpckhwd m2, [r2+r3*1-16]
mova m3, [r2+r3*2-16]
punpckhwd m3, [r2+r1*1-16]
punpckhdq m2, m3
punpckhqdq m0, m2
PALIGNR m1, m0, m4, 14, m4
psrldq m2, m0, 2
pshufhw m2, m2, 10100100b
PRED4x4_LOWPASS m0, m1, m2, m0
psrldq m1, m0, 2
psrldq m2, m0, 4
pshufhw m1, m1, 10100100b
pshufhw m2, m2, 01010100b
pavgw m4, m0, m1
PRED4x4_LOWPASS m1, m2, m0, m1
punpckhwd m5, m4, m1
punpcklwd m4, m1
mova [r2+r3*0], m5
mova [r0+r3*0], m4
pshufd m0, m5, 11111001b
pshufd m1, m5, 11111110b
pshufd m2, m5, 11111111b
mova [r2+r3*1], m0
mova [r2+r3*2], m1
mova [r2+r1*1], m2
PALIGNR m2, m5, m4, 4, m0
PALIGNR m3, m5, m4, 8, m1
PALIGNR m5, m5, m4, 12, m4
mova [r0+r3*1], m2
mova [r0+r3*2], m3
mova [r0+r1*1], m5
RET
%endmacro
 
INIT_XMM sse2
PRED8x8L_HORIZONTAL_UP
INIT_XMM ssse3
PRED8x8L_HORIZONTAL_UP
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PRED8x8L_HORIZONTAL_UP
%endif
 
 
;-----------------------------------------------------------------------------
; void pred16x16_vertical(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro MOV16 3-5
mova [%1+ 0], %2
mova [%1+mmsize], %3
%if mmsize==8
mova [%1+ 16], %4
mova [%1+ 24], %5
%endif
%endmacro
 
%macro PRED16x16_VERTICAL 0
cglobal pred16x16_vertical_10, 2, 3
sub r0, r1
mov r2d, 8
mova m0, [r0+ 0]
mova m1, [r0+mmsize]
%if mmsize==8
mova m2, [r0+16]
mova m3, [r0+24]
%endif
.loop:
MOV16 r0+r1*1, m0, m1, m2, m3
MOV16 r0+r1*2, m0, m1, m2, m3
lea r0, [r0+r1*2]
dec r2d
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PRED16x16_VERTICAL
INIT_XMM sse2
PRED16x16_VERTICAL
 
;-----------------------------------------------------------------------------
; void pred16x16_horizontal(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_HORIZONTAL 0
cglobal pred16x16_horizontal_10, 2, 3
mov r2d, 8
.vloop:
movd m0, [r0+r1*0-4]
movd m1, [r0+r1*1-4]
SPLATW m0, m0, 1
SPLATW m1, m1, 1
MOV16 r0+r1*0, m0, m0, m0, m0
MOV16 r0+r1*1, m1, m1, m1, m1
lea r0, [r0+r1*2]
dec r2d
jg .vloop
REP_RET
%endmacro
 
INIT_MMX mmxext
PRED16x16_HORIZONTAL
INIT_XMM sse2
PRED16x16_HORIZONTAL
 
;-----------------------------------------------------------------------------
; void pred16x16_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_DC 0
cglobal pred16x16_dc_10, 2, 6
mov r5, r0
sub r0, r1
mova m0, [r0+0]
paddw m0, [r0+mmsize]
%if mmsize==8
paddw m0, [r0+16]
paddw m0, [r0+24]
%endif
HADDW m0, m2
 
lea r0, [r0+r1-2]
movzx r3d, word [r0]
movzx r4d, word [r0+r1]
%rep 7
lea r0, [r0+r1*2]
movzx r2d, word [r0]
add r3d, r2d
movzx r2d, word [r0+r1]
add r4d, r2d
%endrep
lea r3d, [r3+r4+16]
 
movd m1, r3d
paddw m0, m1
psrlw m0, 5
SPLATW m0, m0
mov r3d, 8
.loop:
MOV16 r5+r1*0, m0, m0, m0, m0
MOV16 r5+r1*1, m0, m0, m0, m0
lea r5, [r5+r1*2]
dec r3d
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PRED16x16_DC
INIT_XMM sse2
PRED16x16_DC
 
;-----------------------------------------------------------------------------
; void pred16x16_top_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_TOP_DC 0
cglobal pred16x16_top_dc_10, 2, 3
sub r0, r1
mova m0, [r0+0]
paddw m0, [r0+mmsize]
%if mmsize==8
paddw m0, [r0+16]
paddw m0, [r0+24]
%endif
HADDW m0, m2
 
SPLATW m0, m0
paddw m0, [pw_8]
psrlw m0, 4
mov r2d, 8
.loop:
MOV16 r0+r1*1, m0, m0, m0, m0
MOV16 r0+r1*2, m0, m0, m0, m0
lea r0, [r0+r1*2]
dec r2d
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PRED16x16_TOP_DC
INIT_XMM sse2
PRED16x16_TOP_DC
 
;-----------------------------------------------------------------------------
; void pred16x16_left_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_LEFT_DC 0
cglobal pred16x16_left_dc_10, 2, 6
mov r5, r0
 
sub r0, 2
movzx r3d, word [r0]
movzx r4d, word [r0+r1]
%rep 7
lea r0, [r0+r1*2]
movzx r2d, word [r0]
add r3d, r2d
movzx r2d, word [r0+r1]
add r4d, r2d
%endrep
lea r3d, [r3+r4+8]
shr r3d, 4
 
movd m0, r3d
SPLATW m0, m0
mov r3d, 8
.loop:
MOV16 r5+r1*0, m0, m0, m0, m0
MOV16 r5+r1*1, m0, m0, m0, m0
lea r5, [r5+r1*2]
dec r3d
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PRED16x16_LEFT_DC
INIT_XMM sse2
PRED16x16_LEFT_DC
 
;-----------------------------------------------------------------------------
; void pred16x16_128_dc(pixel *src, int stride)
;-----------------------------------------------------------------------------
%macro PRED16x16_128_DC 0
cglobal pred16x16_128_dc_10, 2,3
mova m0, [pw_512]
mov r2d, 8
.loop:
MOV16 r0+r1*0, m0, m0, m0, m0
MOV16 r0+r1*1, m0, m0, m0, m0
lea r0, [r0+r1*2]
dec r2d
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PRED16x16_128_DC
INIT_XMM sse2
PRED16x16_128_DC
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_intrapred_init.c
0,0 → 1,402
/*
* Copyright (c) 2010 Jason Garrett-Glaser
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/h264pred.h"
 
#define PRED4x4(TYPE, DEPTH, OPT) \
void ff_pred4x4_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
const uint8_t *topright, \
ptrdiff_t stride);
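/* Each PRED4x4() line below only declares the corresponding asm symbol, e.g.
 * PRED4x4(dc, 10, mmxext) expands to
 *   void ff_pred4x4_dc_10_mmxext(uint8_t *src, const uint8_t *topright,
 *                                ptrdiff_t stride);
 * The PRED8x8/PRED8x8L/PRED16x16 macros further down work the same way for the
 * other block sizes. */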
 
PRED4x4(dc, 10, mmxext)
PRED4x4(down_left, 10, sse2)
PRED4x4(down_left, 10, avx)
PRED4x4(down_right, 10, sse2)
PRED4x4(down_right, 10, ssse3)
PRED4x4(down_right, 10, avx)
PRED4x4(vertical_left, 10, sse2)
PRED4x4(vertical_left, 10, avx)
PRED4x4(vertical_right, 10, sse2)
PRED4x4(vertical_right, 10, ssse3)
PRED4x4(vertical_right, 10, avx)
PRED4x4(horizontal_up, 10, mmxext)
PRED4x4(horizontal_down, 10, sse2)
PRED4x4(horizontal_down, 10, ssse3)
PRED4x4(horizontal_down, 10, avx)
 
#define PRED8x8(TYPE, DEPTH, OPT) \
void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
ptrdiff_t stride);
 
PRED8x8(dc, 10, mmxext)
PRED8x8(dc, 10, sse2)
PRED8x8(top_dc, 10, sse2)
PRED8x8(plane, 10, sse2)
PRED8x8(vertical, 10, sse2)
PRED8x8(horizontal, 10, sse2)
 
#define PRED8x8L(TYPE, DEPTH, OPT)\
void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
int has_topleft, \
int has_topright, \
ptrdiff_t stride);
 
PRED8x8L(dc, 10, sse2)
PRED8x8L(dc, 10, avx)
PRED8x8L(128_dc, 10, mmxext)
PRED8x8L(128_dc, 10, sse2)
PRED8x8L(top_dc, 10, sse2)
PRED8x8L(top_dc, 10, avx)
PRED8x8L(vertical, 10, sse2)
PRED8x8L(vertical, 10, avx)
PRED8x8L(horizontal, 10, sse2)
PRED8x8L(horizontal, 10, ssse3)
PRED8x8L(horizontal, 10, avx)
PRED8x8L(down_left, 10, sse2)
PRED8x8L(down_left, 10, ssse3)
PRED8x8L(down_left, 10, avx)
PRED8x8L(down_right, 10, sse2)
PRED8x8L(down_right, 10, ssse3)
PRED8x8L(down_right, 10, avx)
PRED8x8L(vertical_right, 10, sse2)
PRED8x8L(vertical_right, 10, ssse3)
PRED8x8L(vertical_right, 10, avx)
PRED8x8L(horizontal_up, 10, sse2)
PRED8x8L(horizontal_up, 10, ssse3)
PRED8x8L(horizontal_up, 10, avx)
 
#define PRED16x16(TYPE, DEPTH, OPT)\
void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \
ptrdiff_t stride);
 
PRED16x16(dc, 10, mmxext)
PRED16x16(dc, 10, sse2)
PRED16x16(top_dc, 10, mmxext)
PRED16x16(top_dc, 10, sse2)
PRED16x16(128_dc, 10, mmxext)
PRED16x16(128_dc, 10, sse2)
PRED16x16(left_dc, 10, mmxext)
PRED16x16(left_dc, 10, sse2)
PRED16x16(vertical, 10, mmxext)
PRED16x16(vertical, 10, sse2)
PRED16x16(horizontal, 10, mmxext)
PRED16x16(horizontal, 10, sse2)
 
/* 8-bit versions */
PRED16x16(vertical, 8, mmx)
PRED16x16(vertical, 8, sse)
PRED16x16(horizontal, 8, mmx)
PRED16x16(horizontal, 8, mmxext)
PRED16x16(horizontal, 8, ssse3)
PRED16x16(dc, 8, mmxext)
PRED16x16(dc, 8, sse2)
PRED16x16(dc, 8, ssse3)
PRED16x16(plane_h264, 8, mmx)
PRED16x16(plane_h264, 8, mmxext)
PRED16x16(plane_h264, 8, sse2)
PRED16x16(plane_h264, 8, ssse3)
PRED16x16(plane_rv40, 8, mmx)
PRED16x16(plane_rv40, 8, mmxext)
PRED16x16(plane_rv40, 8, sse2)
PRED16x16(plane_rv40, 8, ssse3)
PRED16x16(plane_svq3, 8, mmx)
PRED16x16(plane_svq3, 8, mmxext)
PRED16x16(plane_svq3, 8, sse2)
PRED16x16(plane_svq3, 8, ssse3)
PRED16x16(tm_vp8, 8, mmx)
PRED16x16(tm_vp8, 8, mmxext)
PRED16x16(tm_vp8, 8, sse2)
 
PRED8x8(top_dc, 8, mmxext)
PRED8x8(dc_rv40, 8, mmxext)
PRED8x8(dc, 8, mmxext)
PRED8x8(vertical, 8, mmx)
PRED8x8(horizontal, 8, mmx)
PRED8x8(horizontal, 8, mmxext)
PRED8x8(horizontal, 8, ssse3)
PRED8x8(plane, 8, mmx)
PRED8x8(plane, 8, mmxext)
PRED8x8(plane, 8, sse2)
PRED8x8(plane, 8, ssse3)
PRED8x8(tm_vp8, 8, mmx)
PRED8x8(tm_vp8, 8, mmxext)
PRED8x8(tm_vp8, 8, sse2)
PRED8x8(tm_vp8, 8, ssse3)
 
PRED8x8L(top_dc, 8, mmxext)
PRED8x8L(top_dc, 8, ssse3)
PRED8x8L(dc, 8, mmxext)
PRED8x8L(dc, 8, ssse3)
PRED8x8L(horizontal, 8, mmxext)
PRED8x8L(horizontal, 8, ssse3)
PRED8x8L(vertical, 8, mmxext)
PRED8x8L(vertical, 8, ssse3)
PRED8x8L(down_left, 8, mmxext)
PRED8x8L(down_left, 8, sse2)
PRED8x8L(down_left, 8, ssse3)
PRED8x8L(down_right, 8, mmxext)
PRED8x8L(down_right, 8, sse2)
PRED8x8L(down_right, 8, ssse3)
PRED8x8L(vertical_right, 8, mmxext)
PRED8x8L(vertical_right, 8, sse2)
PRED8x8L(vertical_right, 8, ssse3)
PRED8x8L(vertical_left, 8, sse2)
PRED8x8L(vertical_left, 8, ssse3)
PRED8x8L(horizontal_up, 8, mmxext)
PRED8x8L(horizontal_up, 8, ssse3)
PRED8x8L(horizontal_down, 8, mmxext)
PRED8x8L(horizontal_down, 8, sse2)
PRED8x8L(horizontal_down, 8, ssse3)
 
PRED4x4(dc, 8, mmxext)
PRED4x4(down_left, 8, mmxext)
PRED4x4(down_right, 8, mmxext)
PRED4x4(vertical_left, 8, mmxext)
PRED4x4(vertical_right, 8, mmxext)
PRED4x4(horizontal_up, 8, mmxext)
PRED4x4(horizontal_down, 8, mmxext)
PRED4x4(tm_vp8, 8, mmx)
PRED4x4(tm_vp8, 8, mmxext)
PRED4x4(tm_vp8, 8, ssse3)
PRED4x4(vertical_vp8, 8, mmxext)
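/* ff_h264_pred_init_x86() below is the x86 entry point: it overwrites the C
 * defaults in H264PredContext with the fastest flavour the CPU supports,
 * walking from MMX up to AVX so later checks override earlier ones. A minimal
 * sketch of how it is typically reached (the caller shown here is an
 * assumption, following the usual h264pred.c pattern):
 *
 *   H264PredContext h;
 *   ff_h264_pred_init(&h, AV_CODEC_ID_H264, 10, 1); // dispatches here on x86
 */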
 
av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id,
const int bit_depth,
const int chroma_format_idc)
{
int cpu_flags = av_get_cpu_flags();
 
if (bit_depth == 8) {
if (EXTERNAL_MMX(cpu_flags)) {
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_8_mmx;
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmx;
if (chroma_format_idc == 1) {
h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_mmx;
h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmx;
}
if (codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmx;
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmx;
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmx;
} else {
if (chroma_format_idc == 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmx;
if (codec_id == AV_CODEC_ID_SVQ3) {
if (cpu_flags & AV_CPU_FLAG_CMOV)
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_mmx;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_mmx;
} else {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_mmx;
}
}
}
 
if (EXTERNAL_MMXEXT(cpu_flags)) {
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmxext;
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_mmxext;
if (chroma_format_idc == 1)
h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmxext;
h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_mmxext;
h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_mmxext;
h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_mmxext;
h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_mmxext;
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_mmxext;
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_mmxext;
h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_mmxext;
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_mmxext;
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_mmxext;
h->pred4x4 [DIAG_DOWN_RIGHT_PRED ] = ff_pred4x4_down_right_8_mmxext;
h->pred4x4 [VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_8_mmxext;
h->pred4x4 [HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_8_mmxext;
h->pred4x4 [DC_PRED ] = ff_pred4x4_dc_8_mmxext;
if (codec_id == AV_CODEC_ID_VP8 || codec_id == AV_CODEC_ID_H264) {
h->pred4x4 [DIAG_DOWN_LEFT_PRED] = ff_pred4x4_down_left_8_mmxext;
}
if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
h->pred4x4 [VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_8_mmxext;
}
if (codec_id != AV_CODEC_ID_RV40) {
h->pred4x4 [HOR_UP_PRED ] = ff_pred4x4_horizontal_up_8_mmxext;
}
if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) {
if (chroma_format_idc == 1) {
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmxext;
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_8_mmxext;
}
}
if (codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmxext;
h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_8_mmxext;
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmxext;
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmxext;
h->pred4x4 [VERT_PRED ] = ff_pred4x4_vertical_vp8_8_mmxext;
} else {
if (chroma_format_idc == 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmxext;
if (codec_id == AV_CODEC_ID_SVQ3) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_svq3_8_mmxext;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_rv40_8_mmxext;
} else {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_h264_8_mmxext;
}
}
}
 
if (EXTERNAL_SSE(cpu_flags)) {
h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_sse;
}
 
if (EXTERNAL_SSE2(cpu_flags)) {
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_sse2;
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_sse2;
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_sse2;
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_sse2;
h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_sse2;
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_sse2;
if (codec_id == AV_CODEC_ID_VP8) {
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_sse2;
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_sse2;
} else {
if (chroma_format_idc == 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_sse2;
if (codec_id == AV_CODEC_ID_SVQ3) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_sse2;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_sse2;
} else {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_sse2;
}
}
}
 
if (EXTERNAL_SSSE3(cpu_flags)) {
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_ssse3;
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_ssse3;
if (chroma_format_idc == 1)
h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_ssse3;
h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_ssse3;
h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_ssse3;
h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_ssse3;
h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_ssse3;
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_ssse3;
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_ssse3;
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_ssse3;
h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_ssse3;
h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_ssse3;
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_ssse3;
if (codec_id == AV_CODEC_ID_VP8) {
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_ssse3;
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_ssse3;
} else {
if (chroma_format_idc == 1)
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_ssse3;
if (codec_id == AV_CODEC_ID_SVQ3) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_ssse3;
} else if (codec_id == AV_CODEC_ID_RV40) {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_ssse3;
} else {
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_ssse3;
}
}
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext;
h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext;
 
if (chroma_format_idc == 1)
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext;
 
h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext;
 
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_mmxext;
h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_mmxext;
h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_mmxext;
h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_mmxext;
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_mmxext;
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2;
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_sse2;
h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_sse2;
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_sse2;
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_sse2;
 
if (chroma_format_idc == 1) {
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2;
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2;
h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_10_sse2;
h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2;
h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2;
}
 
h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_sse2;
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_sse2;
h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_sse2;
h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_sse2;
h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_sse2;
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2;
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2;
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_sse2;
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_sse2;
 
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_sse2;
h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_sse2;
h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_sse2;
h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_sse2;
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_sse2;
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3;
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3;
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3;
 
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3;
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3;
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3;
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_ssse3;
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_ssse3;
}
if (EXTERNAL_AVX(cpu_flags)) {
h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx;
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx;
h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_avx;
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_avx;
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_avx;
 
h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_avx;
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_avx;
h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_avx;
h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_avx;
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx;
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx;
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_avx;
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_avx;
}
}
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_qpel.c
0,0 → 1,634
/*
* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
* Copyright (c) 2011 Daniel Kang
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264qpel.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_x86.h"
 
#if HAVE_YASM
void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2,
int dstStride, int src1Stride, int h);
#define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext
#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext
#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext
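/* Only MMXEXT versions of the *_l2 averaging helpers are declared above; the
 * defines let the SSE2 function tables reuse them unchanged rather than
 * providing a separate SSE2 code path. */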
 
PIXELS16(static, ff_avg, , , _mmxext)
PIXELS16(static, ff_put, , , _mmxext)
 
#define DEF_QPEL(OPNAME)\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\
void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(uint8_t *src, int16_t *tmp, int srcStride);\
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(uint8_t *src, int16_t *tmp, int srcStride, int size);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);\
void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);
 
DEF_QPEL(avg)
DEF_QPEL(put)
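 
/* A descriptive note (not in the original source): QPEL_H264 builds C wrappers
 * around the 4-pixel-wide MMXEXT primitives declared above. The 8- and 16-wide
 * variants are tiled from narrower calls, and the 2D (hv) filter is split into
 * a vertical pass into a 16-bit tmp buffer followed by a horizontal pass over
 * that buffer. */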
 
#define QPEL_H264(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
int w=3;\
src -= 2*srcStride+2;\
while(w--){\
ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\
tmp += 4;\
src += 4;\
}\
tmp -= 3*4;\
ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
src -= 2*srcStride;\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
src += 4;\
dst += 4;\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\
int w = (size+8)>>2;\
src -= 2*srcStride+2;\
while(w--){\
ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);\
tmp += 4;\
src += 4;\
}\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\
int w = size>>4;\
do{\
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\
tmp += 8;\
dst += 8;\
}while(w--);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
src += 8*dstStride;\
dst += 8*dstStride;\
src2 += 8*src2Stride;\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\
}\
\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\
}\
\
static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\
{\
ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\
ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\
}\
 
 
#if ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
 
void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);
void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);
 
#else // ARCH_X86_64
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
src += 8*dstStride;\
dst += 8*dstStride;\
src2 += 8*src2Stride;\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\
}
#endif // ARCH_X86_64
 
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
src += 8*srcStride;\
dst += 8*dstStride;\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
 
#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}
 
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp,
uint8_t *src,
int tmpStride,
int srcStride,
int size)
{
int w = (size+8)>>3;
src -= 2*srcStride+2;
while(w--){
ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size);
tmp += 8;
src += 8;
}
}
 
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\
put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\
}\
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\
}\
 
#define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel8_h_lowpass_l2_sse2 ff_avg_h264_qpel8_h_lowpass_l2_mmxext
#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext
#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext
 
#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2
#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2
#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2
#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2
 
#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext
#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext
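 
/* Note: H264_MC expands to the full set of mcXY functions for one block SIZE
 * and CPU flavour, where X and Y are the quarter-pel offsets (0..3). mc00 is a
 * plain copy, mcX0/mc0Y use the horizontal resp. vertical 6-tap lowpass, and
 * the mixed cases combine or average those results. In the hv cases, mc21/mc23
 * average the 2D result (halfHV) with a horizontal lowpass, while mc12/mc32
 * recover the vertically filtered pixels from the 16-bit intermediate (halfV)
 * via pixels*_l2_shift5 and average them with halfHV. */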
 
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\
 
static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_put_pixels16_sse2(dst, src, stride, 16);
}
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels16_sse2(dst, src, stride, 16);
}
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext
 
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\
}\
 
#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\
}\
 
#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\
}\
 
#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\
}\
\
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\
uint8_t * const halfHV= temp;\
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\
av_assert2(((int)temp & 7) == 0);\
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\
}\
 
#define H264_MC_4816(MMX)\
H264_MC(put_, 4, MMX, 8)\
H264_MC(put_, 8, MMX, 8)\
H264_MC(put_, 16,MMX, 8)\
H264_MC(avg_, 4, MMX, 8)\
H264_MC(avg_, 8, MMX, 8)\
H264_MC(avg_, 16,MMX, 8)\
 
#define H264_MC_816(QPEL, XMM)\
QPEL(put_, 8, XMM, 16)\
QPEL(put_, 16,XMM, 16)\
QPEL(avg_, 8, XMM, 16)\
QPEL(avg_, 16,XMM, 16)\
 
QPEL_H264(put_, PUT_OP, mmxext)
QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext)
QPEL_H264_V_XMM(put_, PUT_OP, sse2)
QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2)
QPEL_H264_HV_XMM(put_, PUT_OP, sse2)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2)
QPEL_H264_H_XMM(put_, PUT_OP, ssse3)
QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3)
QPEL_H264_HV_XMM(put_, PUT_OP, ssse3)
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3)
 
H264_MC_4816(mmxext)
H264_MC_816(H264_MC_V, sse2)
H264_MC_816(H264_MC_HV, sse2)
H264_MC_816(H264_MC_H, ssse3)
H264_MC_816(H264_MC_HV, ssse3)
 
 
//10bit
#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \
void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, uint8_t *src, ptrdiff_t stride);
 
#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
 
#define LUMA_MC_816(DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \
LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \
LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT)
 
LUMA_MC_ALL(10, mc00, mmxext)
LUMA_MC_ALL(10, mc10, mmxext)
LUMA_MC_ALL(10, mc20, mmxext)
LUMA_MC_ALL(10, mc30, mmxext)
LUMA_MC_ALL(10, mc01, mmxext)
LUMA_MC_ALL(10, mc11, mmxext)
LUMA_MC_ALL(10, mc21, mmxext)
LUMA_MC_ALL(10, mc31, mmxext)
LUMA_MC_ALL(10, mc02, mmxext)
LUMA_MC_ALL(10, mc12, mmxext)
LUMA_MC_ALL(10, mc22, mmxext)
LUMA_MC_ALL(10, mc32, mmxext)
LUMA_MC_ALL(10, mc03, mmxext)
LUMA_MC_ALL(10, mc13, mmxext)
LUMA_MC_ALL(10, mc23, mmxext)
LUMA_MC_ALL(10, mc33, mmxext)
 
LUMA_MC_816(10, mc00, sse2)
LUMA_MC_816(10, mc10, sse2)
LUMA_MC_816(10, mc10, sse2_cache64)
LUMA_MC_816(10, mc10, ssse3_cache64)
LUMA_MC_816(10, mc20, sse2)
LUMA_MC_816(10, mc20, sse2_cache64)
LUMA_MC_816(10, mc20, ssse3_cache64)
LUMA_MC_816(10, mc30, sse2)
LUMA_MC_816(10, mc30, sse2_cache64)
LUMA_MC_816(10, mc30, ssse3_cache64)
LUMA_MC_816(10, mc01, sse2)
LUMA_MC_816(10, mc11, sse2)
LUMA_MC_816(10, mc21, sse2)
LUMA_MC_816(10, mc31, sse2)
LUMA_MC_816(10, mc02, sse2)
LUMA_MC_816(10, mc12, sse2)
LUMA_MC_816(10, mc22, sse2)
LUMA_MC_816(10, mc32, sse2)
LUMA_MC_816(10, mc03, sse2)
LUMA_MC_816(10, mc13, sse2)
LUMA_MC_816(10, mc23, sse2)
LUMA_MC_816(10, mc33, sse2)
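 
/* Note: 10-bit samples are 2 bytes wide, so the +16 byte offsets below step 8
 * pixels to the right; each 16x16 function is assembled from four 8x8 calls. */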
 
#define QPEL16_OPMC(OP, MC, MMX)\
void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride){\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
src += 8*stride;\
dst += 8*stride;\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\
}
 
#define QPEL16_OP(MC, MMX)\
QPEL16_OPMC(put, MC, MMX)\
QPEL16_OPMC(avg, MC, MMX)
 
#define QPEL16(MMX)\
QPEL16_OP(mc00, MMX)\
QPEL16_OP(mc01, MMX)\
QPEL16_OP(mc02, MMX)\
QPEL16_OP(mc03, MMX)\
QPEL16_OP(mc10, MMX)\
QPEL16_OP(mc11, MMX)\
QPEL16_OP(mc12, MMX)\
QPEL16_OP(mc13, MMX)\
QPEL16_OP(mc20, MMX)\
QPEL16_OP(mc21, MMX)\
QPEL16_OP(mc22, MMX)\
QPEL16_OP(mc23, MMX)\
QPEL16_OP(mc30, MMX)\
QPEL16_OP(mc31, MMX)\
QPEL16_OP(mc32, MMX)\
QPEL16_OP(mc33, MMX)
 
#if ARCH_X86_32 && HAVE_YASM && CONFIG_H264QPEL // ARCH_X86_64 implies SSE2+
QPEL16(mmxext)
#endif
 
#endif /* HAVE_YASM */
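 
/* Note: pixels_tab[i][j]: i selects the block size (0: 16x16, 1: 8x8, 2: 4x4)
 * and j = x + 4*y indexes the 16 quarter-pel positions mcXY (x, y in 0..3). */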
 
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \
do { \
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \
c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \
c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \
c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \
} while (0)
 
#define H264_QPEL_FUNCS(x, y, CPU) \
do { \
c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \
c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \
c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \
c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \
} while (0)
 
#define H264_QPEL_FUNCS_10(x, y, CPU) \
do { \
c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \
c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \
} while (0)
 
av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth)
{
#if HAVE_YASM
int high_bit_depth = bit_depth > 8;
int cpu_flags = av_get_cpu_flags();
 
if (EXTERNAL_MMXEXT(cpu_flags)) {
if (!high_bit_depth) {
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, );
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, );
} else if (bit_depth == 10) {
#if ARCH_X86_32
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_);
#endif
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_);
}
}
 
if (EXTERNAL_SSE2(cpu_flags)) {
if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && !high_bit_depth) {
// these functions are slower than mmx on AMD, but faster on Intel
H264_QPEL_FUNCS(0, 0, sse2);
}
 
if (!high_bit_depth) {
H264_QPEL_FUNCS(0, 1, sse2);
H264_QPEL_FUNCS(0, 2, sse2);
H264_QPEL_FUNCS(0, 3, sse2);
H264_QPEL_FUNCS(1, 1, sse2);
H264_QPEL_FUNCS(1, 2, sse2);
H264_QPEL_FUNCS(1, 3, sse2);
H264_QPEL_FUNCS(2, 1, sse2);
H264_QPEL_FUNCS(2, 2, sse2);
H264_QPEL_FUNCS(2, 3, sse2);
H264_QPEL_FUNCS(3, 1, sse2);
H264_QPEL_FUNCS(3, 2, sse2);
H264_QPEL_FUNCS(3, 3, sse2);
}
 
if (bit_depth == 10) {
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_);
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_);
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_);
H264_QPEL_FUNCS_10(1, 0, sse2_cache64);
H264_QPEL_FUNCS_10(2, 0, sse2_cache64);
H264_QPEL_FUNCS_10(3, 0, sse2_cache64);
}
}
 
if (EXTERNAL_SSSE3(cpu_flags)) {
if (!high_bit_depth) {
H264_QPEL_FUNCS(1, 0, ssse3);
H264_QPEL_FUNCS(1, 1, ssse3);
H264_QPEL_FUNCS(1, 2, ssse3);
H264_QPEL_FUNCS(1, 3, ssse3);
H264_QPEL_FUNCS(2, 0, ssse3);
H264_QPEL_FUNCS(2, 1, ssse3);
H264_QPEL_FUNCS(2, 2, ssse3);
H264_QPEL_FUNCS(2, 3, ssse3);
H264_QPEL_FUNCS(3, 0, ssse3);
H264_QPEL_FUNCS(3, 1, ssse3);
H264_QPEL_FUNCS(3, 2, ssse3);
H264_QPEL_FUNCS(3, 3, ssse3);
}
 
if (bit_depth == 10) {
H264_QPEL_FUNCS_10(1, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(2, 0, ssse3_cache64);
H264_QPEL_FUNCS_10(3, 0, ssse3_cache64);
}
}
 
if (EXTERNAL_AVX(cpu_flags)) {
/* AVX implies 64 byte cache lines without the need to avoid unaligned
* memory accesses that cross the boundary between two cache lines.
* TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid
* having to treat SSE2 functions with such properties as AVX. */
if (bit_depth == 10) {
H264_QPEL_FUNCS_10(1, 0, sse2);
H264_QPEL_FUNCS_10(2, 0, sse2);
H264_QPEL_FUNCS_10(3, 0, sse2);
}
}
#endif
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_qpel_10bit.asm
0,0 → 1,884
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code
;*****************************************************************************
;* Copyright (C) 2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA 32
 
cextern pw_16
cextern pw_1
cextern pb_0
 
pw_pixel_max: times 8 dw ((1 << 10)-1)
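 
; Note: on 10-bit input the vertical 6-tap pass can reach 42*1023 (> 32767), so
; pad20 = 20*1023 is subtracted from the 16-bit intermediates to keep them in
; signed range; depad/depad2/unpad add the bias (plus rounding) back during the
; horizontal pass before the final shift.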
 
pad10: times 8 dw 10*1023
pad20: times 8 dw 20*1023
pad30: times 8 dw 30*1023
depad: times 4 dd 32*20*1023 + 512
depad2: times 8 dw 20*1023 + 16*1022 + 16
unpad: times 8 dw 16*1022/32 ; needs to be mod 16
 
tap1: times 4 dw 1, -5
tap2: times 4 dw 20, 20
tap3: times 4 dw -5, 1
pd_0f: times 4 dd 0xffff
 
SECTION .text
 
 
%macro AVG_MOV 2
pavgw %2, %1
mova %1, %2
%endmacro
 
%macro ADDW 3
%if mmsize == 8
paddw %1, %2
%else
movu %3, %2
paddw %1, %3
%endif
%endmacro
 
%macro FILT_H 4
paddw %1, %4
psubw %1, %2 ; a-b
psraw %1, 2 ; (a-b)/4
psubw %1, %2 ; (a-b)/4-b
paddw %1, %3 ; (a-b)/4-b+c
psraw %1, 2 ; ((a-b)/4-b+c)/4
paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
%endmacro
 
%macro PRELOAD_V 0
lea r3, [r2*3]
sub r1, r3
movu m0, [r1+r2]
movu m1, [r1+r2*2]
add r1, r3
movu m2, [r1]
movu m3, [r1+r2]
movu m4, [r1+r2*2]
add r1, r3
%endmacro
 
%macro FILT_V 8
movu %6, [r1]
paddw %1, %6
mova %7, %2
paddw %7, %5
mova %8, %3
paddw %8, %4
FILT_H %1, %7, %8, [pw_16]
psraw %1, 1
CLIPW %1, [pb_0], [pw_pixel_max]
%endmacro
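 
; Note: MC instantiates a given mcXY macro for both put (OP_MOV = mova) and avg
; (OP_MOV = AVG_MOV, i.e. pavgw with the destination), as a 4-wide MMXEXT and
; an 8-wide SSE2 version; MCAxA_OP below tiles these into the larger block
; sizes.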
 
%macro MC 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2
%1 put, 8
 
%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2
%1 avg, 8
%endmacro
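 
; Note: MCAxA_OP builds the %4-wide function (8 from 4, 16 from 8) by calling
; the %3-wide stub on four tiles: %3*2 bytes to the right (16-bit pixels) and
; %3 rows down. On x86-32 dst/src are reloaded from the stack arguments between
; calls; on x86-64 they are kept in two extra registers.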
 
%macro MCAxA_OP 7
%if ARCH_X86_32
cglobal %1_h264_qpel%4_%2_10, %5,%6,%7
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
mov r0, r0m
mov r1, r1m
add r0, %3*2
add r1, %3*2
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
mov r0, r0m
mov r1, r1m
lea r0, [r0+r2*%3]
lea r1, [r1+r2*%3]
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
mov r0, r0m
mov r1, r1m
lea r0, [r0+r2*%3+%3*2]
lea r1, [r1+r2*%3+%3*2]
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
RET
%else ; ARCH_X86_64
cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7
mov r%6, r0
%assign p1 %6+1
mov r %+ p1, r1
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
lea r0, [r%6+%3*2]
lea r1, [r %+ p1+%3*2]
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
lea r0, [r%6+r2*%3]
lea r1, [r %+ p1+r2*%3]
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
lea r0, [r%6+r2*%3+%3*2]
lea r1, [r %+ p1+r2*%3+%3*2]
%if UNIX64 == 0 ; fall through to function
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
RET
%endif
%endif
%endmacro
 
;cpu, put/avg, mc, 4/8, ...
%macro cglobal_mc 6
%assign i %3*2
%if ARCH_X86_32 || cpuflag(sse2)
MCAxA_OP %1, %2, %3, i, %4,%5,%6
%endif
 
cglobal %1_h264_qpel%3_%2_10, %4,%5,%6
%if UNIX64 == 0 ; no prologue or epilogue for UNIX64
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX
RET
%endif
 
stub_%1_h264_qpel%3_%2_10 %+ SUFFIX:
%endmacro
 
;-----------------------------------------------------------------------------
; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro COPY4 0
movu m0, [r1 ]
OP_MOV [r0 ], m0
movu m0, [r1+r2 ]
OP_MOV [r0+r2 ], m0
movu m0, [r1+r2*2]
OP_MOV [r0+r2*2], m0
movu m0, [r1+r3 ]
OP_MOV [r0+r3 ], m0
%endmacro
 
%macro MC00 1
INIT_MMX mmxext
cglobal_mc %1, mc00, 4, 3,4,0
lea r3, [r2*3]
COPY4
ret
 
INIT_XMM sse2
cglobal %1_h264_qpel8_mc00_10, 3,4
lea r3, [r2*3]
COPY4
lea r0, [r0+r2*4]
lea r1, [r1+r2*4]
COPY4
RET
 
cglobal %1_h264_qpel16_mc00_10, 3,4
mov r3d, 8
.loop:
movu m0, [r1 ]
movu m1, [r1 +16]
OP_MOV [r0 ], m0
OP_MOV [r0 +16], m1
movu m0, [r1+r2 ]
movu m1, [r1+r2+16]
OP_MOV [r0+r2 ], m0
OP_MOV [r0+r2+16], m1
lea r0, [r0+r2*2]
lea r1, [r1+r2*2]
dec r3d
jg .loop
REP_RET
%endmacro
 
%define OP_MOV mova
MC00 put
 
%define OP_MOV AVG_MOV
MC00 avg
 
;-----------------------------------------------------------------------------
; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC_CACHE 1
%define OP_MOV mova
INIT_MMX mmxext
%1 put, 4
INIT_XMM sse2, cache64
%1 put, 8
INIT_XMM ssse3, cache64
%1 put, 8
INIT_XMM sse2
%1 put, 8
 
%define OP_MOV AVG_MOV
INIT_MMX mmxext
%1 avg, 4
INIT_XMM sse2, cache64
%1 avg, 8
INIT_XMM ssse3, cache64
%1 avg, 8
INIT_XMM sse2
%1 avg, 8
%endmacro
 
%macro MC20 2
cglobal_mc %1, mc20, %2, 3,4,9
mov r3d, %2
mova m1, [pw_pixel_max]
%if num_mmregs > 8
mova m8, [pw_16]
%define p16 m8
%else
%define p16 [pw_16]
%endif
.nextrow:
%if %0 == 4
movu m2, [r1-4]
movu m3, [r1-2]
movu m4, [r1+0]
ADDW m2, [r1+6], m5
ADDW m3, [r1+4], m5
ADDW m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
movu m2, [r1-4]
movu m0, [r1+6]
mova m6, m0
psrldq m0, 6
 
paddw m6, m2
PALIGNR m3, m0, m2, 2, m5
PALIGNR m7, m0, m2, 8, m5
paddw m3, m7
PALIGNR m4, m0, m2, 4, m5
PALIGNR m7, m0, m2, 6, m5
paddw m4, m7
SWAP 2, 6
%else
movu m2, [r1-4]
movu m6, [r1+4]
PALIGNR m3, m6, m2, 2, m5
paddw m3, m6
PALIGNR m4, m6, m2, 4, m5
PALIGNR m7, m6, m2, 6, m5
paddw m4, m7
paddw m2, [r1+6]
%endif
%endif
 
FILT_H m2, m3, m4, p16
psraw m2, 1
pxor m0, m0
CLIPW m2, m0, m1
OP_MOV [r0], m2
add r0, r2
add r1, r2
dec r3d
jg .nextrow
rep ret
%endmacro
 
MC_CACHE MC20
 
;-----------------------------------------------------------------------------
; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC30 2
cglobal_mc %1, mc30, %2, 3,5,9
lea r4, [r1+2]
jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body
%endmacro
 
MC_CACHE MC30
 
;-----------------------------------------------------------------------------
; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC10 2
cglobal_mc %1, mc10, %2, 3,5,9
mov r4, r1
.body:
mov r3d, %2
mova m1, [pw_pixel_max]
%if num_mmregs > 8
mova m8, [pw_16]
%define p16 m8
%else
%define p16 [pw_16]
%endif
.nextrow:
%if %0 == 4
movu m2, [r1-4]
movu m3, [r1-2]
movu m4, [r1+0]
ADDW m2, [r1+6], m5
ADDW m3, [r1+4], m5
ADDW m4, [r1+2], m5
%else ; movu is slow on these processors
%if mmsize==16
movu m2, [r1-4]
movu m0, [r1+6]
mova m6, m0
psrldq m0, 6
 
paddw m6, m2
PALIGNR m3, m0, m2, 2, m5
PALIGNR m7, m0, m2, 8, m5
paddw m3, m7
PALIGNR m4, m0, m2, 4, m5
PALIGNR m7, m0, m2, 6, m5
paddw m4, m7
SWAP 2, 6
%else
movu m2, [r1-4]
movu m6, [r1+4]
PALIGNR m3, m6, m2, 2, m5
paddw m3, m6
PALIGNR m4, m6, m2, 4, m5
PALIGNR m7, m6, m2, 6, m5
paddw m4, m7
paddw m2, [r1+6]
%endif
%endif
 
FILT_H m2, m3, m4, p16
psraw m2, 1
pxor m0, m0
CLIPW m2, m0, m1
movu m3, [r4]
pavgw m2, m3
OP_MOV [r0], m2
add r0, r2
add r1, r2
add r4, r2
dec r3d
jg .nextrow
rep ret
%endmacro
 
MC_CACHE MC10
 
;-----------------------------------------------------------------------------
; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro V_FILT 10
v_filt%9_%10_10
add r4, r2
.no_addr4:
FILT_V m0, m1, m2, m3, m4, m5, m6, m7
add r1, r2
add r0, r2
ret
%endmacro
 
INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 4
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
 
INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
 
%macro MC02 2
cglobal_mc %1, mc02, %2, 3,4,8
PRELOAD_V
 
sub r0, r2
%assign j 0
%rep %2
%assign i (j % 6)
call v_filt%2_ %+ i %+ _10.no_addr4
OP_MOV [r0], m0
SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
ret
%endmacro
 
MC MC02
 
;-----------------------------------------------------------------------------
; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC01 2
cglobal_mc %1, mc01, %2, 3,5,8
mov r4, r1
.body:
PRELOAD_V
 
sub r4, r2
sub r0, r2
%assign j 0
%rep %2
%assign i (j % 6)
call v_filt%2_ %+ i %+ _10
movu m7, [r4]
pavgw m0, m7
OP_MOV [r0], m0
SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
ret
%endmacro
 
MC MC01
 
;-----------------------------------------------------------------------------
; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC03 2
cglobal_mc %1, mc03, %2, 3,5,8
lea r4, [r1+r2]
jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body
%endmacro
 
MC MC03
 
;-----------------------------------------------------------------------------
; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro H_FILT_AVG 2-3
h_filt%1_%2_10:
;FILT_H with fewer registers and averaged with the FILT_V result
;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are to be used in the next iteration
;unfortunately I need three registers, so m5 will have to be re-read from memory
movu m5, [r4-4]
ADDW m5, [r4+6], m7
movu m6, [r4-2]
ADDW m6, [r4+4], m7
paddw m5, [pw_16]
psubw m5, m6 ; a-b
psraw m5, 2 ; (a-b)/4
psubw m5, m6 ; (a-b)/4-b
movu m6, [r4+0]
ADDW m6, [r4+2], m7
paddw m5, m6 ; (a-b)/4-b+c
psraw m5, 2 ; ((a-b)/4-b+c)/4
paddw m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16
psraw m5, 1
CLIPW m5, [pb_0], [pw_pixel_max]
;avg FILT_V, FILT_H
pavgw m0, m5
%if %0!=4
movu m5, [r1+r5]
%endif
ret
%endmacro
 
INIT_MMX mmxext
RESET_MM_PERMUTATION
%assign i 0
%rep 3
H_FILT_AVG 4, i
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
H_FILT_AVG 4, i, 0
 
INIT_XMM sse2
RESET_MM_PERMUTATION
%assign i 0
%rep 6
%if i==1
H_FILT_AVG 8, i, 0
%else
H_FILT_AVG 8, i
%endif
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
 
%macro MC11 2
; this REALLY needs x86_64
cglobal_mc %1, mc11, %2, 3,6,8
mov r4, r1
.body:
PRELOAD_V
 
sub r0, r2
sub r4, r2
mov r5, r2
neg r5
%assign j 0
%rep %2
%assign i (j % 6)
call v_filt%2_ %+ i %+ _10
call h_filt%2_ %+ i %+ _10
%if %2==8 && i==1
movu m5, [r1+r5]
%endif
OP_MOV [r0], m0
SWAP 0,1,2,3,4,5
%assign j j+1
%endrep
ret
%endmacro
 
MC MC11
 
;-----------------------------------------------------------------------------
; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC31 2
cglobal_mc %1, mc31, %2, 3,6,8
mov r4, r1
add r1, 2
jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro
 
MC MC31
 
;-----------------------------------------------------------------------------
; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC13 2
cglobal_mc %1, mc13, %2, 3,7,12
lea r4, [r1+r2]
jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro
 
MC MC13
 
;-----------------------------------------------------------------------------
; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC33 2
cglobal_mc %1, mc33, %2, 3,6,8
lea r4, [r1+r2]
add r1, 2
jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body
%endmacro
 
MC MC33
 
;-----------------------------------------------------------------------------
; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro FILT_H2 3
psubw %1, %2 ; a-b
psubw %2, %3 ; b-c
psllw %2, 2
psubw %1, %2 ; a-5*b+4*c
psllw %3, 4
paddw %1, %3 ; a-5*b+20*c
%endmacro
 
%macro FILT_VNRD 8
movu %6, [r1]
paddw %1, %6
mova %7, %2
paddw %7, %5
mova %8, %3
paddw %8, %4
FILT_H2 %1, %7, %8
%endmacro
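 
; Note: put_hv{4,8}_10 runs the vertical 6-tap pass over COUNT strips of mmsize
; bytes, storing pad20-biased 16-bit rows into the caller's stack buffer with a
; row stride of mmsize*3 bytes; h{4,8}_loop_op below then runs the horizontal
; pass over that buffer.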
 
%macro HV 1
%if mmsize==16
%define PAD 12
%define COUNT 2
%else
%define PAD 4
%define COUNT 3
%endif
put_hv%1_10:
neg r2 ; This actually saves instructions
lea r1, [r1+r2*2-mmsize+PAD]
lea r4, [rsp+PAD+gprsize]
mov r3d, COUNT
.v_loop:
movu m0, [r1]
sub r1, r2
movu m1, [r1]
sub r1, r2
movu m2, [r1]
sub r1, r2
movu m3, [r1]
sub r1, r2
movu m4, [r1]
sub r1, r2
%assign i 0
%rep %1-1
FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
psubw m0, [pad20]
movu [r4+i*mmsize*3], m0
sub r1, r2
SWAP 0,1,2,3,4,5
%assign i i+1
%endrep
FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7
psubw m0, [pad20]
movu [r4+i*mmsize*3], m0
add r4, mmsize
lea r1, [r1+r2*8+mmsize]
%if %1==8
lea r1, [r1+r2*4]
%endif
dec r3d
jg .v_loop
neg r2
ret
%endmacro
 
INIT_MMX mmxext
HV 4
INIT_XMM sse2
HV 8
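 
; Note: h{4,8}_loop_op is the horizontal pass over one row of biased
; intermediates using pmaddwd with the tap pairs (1,-5), (20,20), (-5,1); the
; depad bias and rounding are added, the 32-bit sums shifted right by 10, and
; even/odd results re-packed into words before clipping.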
 
%macro H_LOOP 1
%if num_mmregs > 8
%define s1 m8
%define s2 m9
%define s3 m10
%define d1 m11
%else
%define s1 [tap1]
%define s2 [tap2]
%define s3 [tap3]
%define d1 [depad]
%endif
h%1_loop_op:
movu m1, [r1+mmsize-4]
movu m2, [r1+mmsize-2]
mova m3, [r1+mmsize+0]
movu m4, [r1+mmsize+2]
movu m5, [r1+mmsize+4]
movu m6, [r1+mmsize+6]
%if num_mmregs > 8
pmaddwd m1, s1
pmaddwd m2, s1
pmaddwd m3, s2
pmaddwd m4, s2
pmaddwd m5, s3
pmaddwd m6, s3
paddd m1, d1
paddd m2, d1
%else
mova m0, s1
pmaddwd m1, m0
pmaddwd m2, m0
mova m0, s2
pmaddwd m3, m0
pmaddwd m4, m0
mova m0, s3
pmaddwd m5, m0
pmaddwd m6, m0
mova m0, d1
paddd m1, m0
paddd m2, m0
%endif
paddd m3, m5
paddd m4, m6
paddd m1, m3
paddd m2, m4
psrad m1, 10
psrad m2, 10
pslld m2, 16
pand m1, [pd_0f]
por m1, m2
%if num_mmregs <= 8
pxor m0, m0
%endif
CLIPW m1, m0, m7
add r1, mmsize*3
ret
%endmacro
 
INIT_MMX mmxext
H_LOOP 4
INIT_XMM sse2
H_LOOP 8
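 
; Note: mc22 (and mc12/mc32/mc21 below) align the stack and reserve a scratch
; buffer for the 16-bit intermediates (r6 preserves the original rsp), run
; put_hv%2_10 into it, then produce one output row per h%2_loop_op call.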
 
%macro MC22 2
cglobal_mc %1, mc22, %2, 3,7,12
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
 
call put_hv%2_10
 
mov r3d, %2
mova m7, [pw_pixel_max]
%if num_mmregs > 8
pxor m0, m0
mova m8, [tap1]
mova m9, [tap2]
mova m10, [tap3]
mova m11, [depad]
%endif
mov r1, rsp
.h_loop:
call h%2_loop_op
 
OP_MOV [r0], m1
add r0, r2
dec r3d
jg .h_loop
 
mov rsp, r6 ; restore stack pointer
ret
%endmacro
 
MC MC22
 
;-----------------------------------------------------------------------------
; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC12 2
cglobal_mc %1, mc12, %2, 3,7,12
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
 
call put_hv%2_10
 
xor r4d, r4d
.body:
mov r3d, %2
pxor m0, m0
mova m7, [pw_pixel_max]
%if num_mmregs > 8
mova m8, [tap1]
mova m9, [tap2]
mova m10, [tap3]
mova m11, [depad]
%endif
mov r1, rsp
.h_loop:
call h%2_loop_op
 
movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc
paddw m3, [depad2]
psrlw m3, 5
psubw m3, [unpad]
CLIPW m3, m0, m7
pavgw m1, m3
 
OP_MOV [r0], m1
add r0, r2
dec r3d
jg .h_loop
 
mov rsp, r6 ; restore stack pointer
ret
%endmacro
 
MC MC12
 
;-----------------------------------------------------------------------------
; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC32 2
cglobal_mc %1, mc32, %2, 3,7,12
%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
sub rsp, PAD
 
call put_hv%2_10
 
mov r4d, 2 ; sizeof(pixel)
jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro
 
MC MC32
 
;-----------------------------------------------------------------------------
; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro H_NRD 1
put_h%1_10:
add rsp, gprsize
mov r3d, %1
xor r4d, r4d
mova m6, [pad20]
.nextrow:
movu m2, [r5-4]
movu m3, [r5-2]
movu m4, [r5+0]
ADDW m2, [r5+6], m5
ADDW m3, [r5+4], m5
ADDW m4, [r5+2], m5
 
FILT_H2 m2, m3, m4
psubw m2, m6
mova [rsp+r4], m2
add r4d, mmsize*3
add r5, r2
dec r3d
jg .nextrow
sub rsp, gprsize
ret
%endmacro
 
INIT_MMX mmxext
H_NRD 4
INIT_XMM sse2
H_NRD 8
 
%macro MC21 2
cglobal_mc %1, mc21, %2, 3,7,12
mov r5, r1
.body:
%define PAD mmsize*8*3*2 ; SIZE*16*4*sizeof(pixel)
mov r6, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
 
sub rsp, PAD
call put_h%2_10
 
sub rsp, PAD
call put_hv%2_10
 
mov r4d, PAD-mmsize ; H buffer
jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body
%endmacro
 
MC MC21
 
;-----------------------------------------------------------------------------
; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride)
;-----------------------------------------------------------------------------
%macro MC23 2
cglobal_mc %1, mc23, %2, 3,7,12
lea r5, [r1+r2]
jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body
%endmacro
 
MC MC23
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_qpel_8bit.asm
0,0 → 1,862
;*****************************************************************************
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2012 Daniel Kang
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA 32
 
cextern pw_16
cextern pw_5
cextern pb_0
 
SECTION .text
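 
; Note: op_put/op_puth store the result; op_avg/op_avgh first average it with
; the current destination (pavgb). The *h variants move only the low half of
; the register. The lowpass macros below select one via op_%1 / op_%1h token
; pasting.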
 
 
%macro op_avgh 3
movh %3, %2
pavgb %1, %3
movh %2, %1
%endmacro
 
%macro op_avg 2-3
pavgb %1, %2
mova %2, %1
%endmacro
 
%macro op_puth 2-3
movh %2, %1
%endmacro
 
%macro op_put 2-3
mova %2, %1
%endmacro
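 
; Note: the 8-bit horizontal 6-tap filter: with a = src[-2]+src[3],
; b = src[-1]+src[2] and c = src[0]+src[1], the output is
; (a - 5*b + 20*c + 16) >> 5, computed here as ((4*c - b)*5 + a + 16) >> 5
; after unpacking the bytes to words.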
 
%macro QPEL4_H_LOWPASS_OP 1
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
pxor m7, m7
mova m4, [pw_5]
mova m5, [pw_16]
mov r4d, 4
.loop:
movh m1, [r1-1]
movh m2, [r1+0]
movh m3, [r1+1]
movh m0, [r1+2]
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m0, m7
paddw m1, m0
paddw m2, m3
movh m0, [r1-2]
movh m3, [r1+3]
punpcklbw m0, m7
punpcklbw m3, m7
paddw m0, m3
psllw m2, 2
psubw m2, m1
pmullw m2, m4
paddw m0, m5
paddw m0, m2
psraw m0, 5
packuswb m0, m0
op_%1h m0, [r0], m6
add r0, r2
add r1, r3
dec r4d
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg
 
%macro QPEL8_H_LOWPASS_OP 1
cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
mov r4d, 8
pxor m7, m7
mova m6, [pw_5]
.loop:
mova m0, [r1]
mova m2, [r1+1]
mova m1, m0
mova m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
paddw m0, m2
paddw m1, m3
psllw m0, 2
psllw m1, 2
mova m2, [r1-1]
mova m4, [r1+2]
mova m3, m2
mova m5, m4
punpcklbw m2, m7
punpckhbw m3, m7
punpcklbw m4, m7
punpckhbw m5, m7
paddw m2, m4
paddw m5, m3
psubw m0, m2
psubw m1, m5
pmullw m0, m6
pmullw m1, m6
movd m2, [r1-2]
movd m5, [r1+7]
punpcklbw m2, m7
punpcklbw m5, m7
paddw m2, m3
paddw m4, m5
mova m5, [pw_16]
paddw m2, m5
paddw m4, m5
paddw m0, m2
paddw m1, m4
psraw m0, 5
psraw m1, 5
packuswb m0, m1
op_%1 m0, [r0], m4
add r0, r2
add r1, r3
dec r4d
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
QPEL8_H_LOWPASS_OP put
QPEL8_H_LOWPASS_OP avg
 
%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
mov r4d, 8
pxor m7, m7
mova m6, [pw_5]
.loop:
movu m1, [r1-2]
mova m0, m1
punpckhbw m1, m7
punpcklbw m0, m7
mova m2, m1
mova m3, m1
mova m4, m1
mova m5, m1
palignr m4, m0, 2
palignr m3, m0, 4
palignr m2, m0, 6
palignr m1, m0, 8
palignr m5, m0, 10
paddw m0, m5
paddw m2, m3
paddw m1, m4
psllw m2, 2
psubw m2, m1
paddw m0, [pw_16]
pmullw m2, m6
paddw m2, m0
psraw m2, 5
packuswb m2, m2
op_%1h m2, [r0], m4
add r1, r3
add r0, r2
dec r4d
jne .loop
REP_RET
%endmacro
 
INIT_XMM ssse3
QPEL8_H_LOWPASS_OP_XMM put
QPEL8_H_LOWPASS_OP_XMM avg
 
 
%macro QPEL4_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
pxor m7, m7
mova m4, [pw_5]
mova m5, [pw_16]
mov r5d, 4
.loop:
movh m1, [r1-1]
movh m2, [r1+0]
movh m3, [r1+1]
movh m0, [r1+2]
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m0, m7
paddw m1, m0
paddw m2, m3
movh m0, [r1-2]
movh m3, [r1+3]
punpcklbw m0, m7
punpcklbw m3, m7
paddw m0, m3
psllw m2, 2
psubw m2, m1
pmullw m2, m4
paddw m0, m5
paddw m0, m2
movh m3, [r2]
psraw m0, 5
packuswb m0, m0
pavgb m0, m3
op_%1h m0, [r0], m6
add r0, r3
add r1, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
QPEL4_H_LOWPASS_L2_OP put
QPEL4_H_LOWPASS_L2_OP avg
 
 
%macro QPEL8_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, srcStride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mov r5d, 8
pxor m7, m7
mova m6, [pw_5]
.loop:
mova m0, [r1]
mova m2, [r1+1]
mova m1, m0
mova m3, m2
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
paddw m0, m2
paddw m1, m3
psllw m0, 2
psllw m1, 2
mova m2, [r1-1]
mova m4, [r1+2]
mova m3, m2
mova m5, m4
punpcklbw m2, m7
punpckhbw m3, m7
punpcklbw m4, m7
punpckhbw m5, m7
paddw m2, m4
paddw m5, m3
psubw m0, m2
psubw m1, m5
pmullw m0, m6
pmullw m1, m6
movd m2, [r1-2]
movd m5, [r1+7]
punpcklbw m2, m7
punpcklbw m5, m7
paddw m2, m3
paddw m4, m5
mova m5, [pw_16]
paddw m2, m5
paddw m4, m5
paddw m0, m2
paddw m1, m4
psraw m0, 5
psraw m1, 5
mova m4, [r2]
packuswb m0, m1
pavgb m0, m4
op_%1 m0, [r0], m4
add r0, r3
add r1, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg
 
 
%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mov r5d, 8
pxor m7, m7
mova m6, [pw_5]
.loop:
lddqu m1, [r1-2]
mova m0, m1
punpckhbw m1, m7
punpcklbw m0, m7
mova m2, m1
mova m3, m1
mova m4, m1
mova m5, m1
palignr m4, m0, 2
palignr m3, m0, 4
palignr m2, m0, 6
palignr m1, m0, 8
palignr m5, m0, 10
paddw m0, m5
paddw m2, m3
paddw m1, m4
psllw m2, 2
movh m3, [r2]
psubw m2, m1
paddw m0, [pw_16]
pmullw m2, m6
paddw m2, m0
psraw m2, 5
packuswb m2, m2
pavgb m2, m3
op_%1h m2, [r0], m4
add r1, r3
add r0, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro
 
INIT_XMM ssse3
QPEL8_H_LOWPASS_L2_OP_XMM put
QPEL8_H_LOWPASS_L2_OP_XMM avg
 
 
; All functions that call this are required to have function arguments of
; dst, src, dstStride, srcStride
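; Each invocation produces one output row from the six source rows held in
; m0..m5 (m5 is loaded here) and then SWAPs the registers so the window slides
; down by one row.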
%macro FILT_V 1
mova m6, m2
movh m5, [r1]
paddw m6, m3
psllw m6, 2
psubw m6, m1
psubw m6, m4
punpcklbw m5, m7
pmullw m6, [pw_5]
paddw m0, [pw_16]
add r1, r3
paddw m0, m5
paddw m6, m0
psraw m6, 5
packuswb m6, m6
op_%1h m6, [r0], m0 ; 1
add r0, r2
SWAP 0, 1, 2, 3, 4, 5
%endmacro
 
%macro QPEL4_V_LOWPASS_OP 1
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
sub r1, r3
sub r1, r3
pxor m7, m7
movh m0, [r1]
movh m1, [r1+r3]
lea r1, [r1+2*r3]
movh m2, [r1]
movh m3, [r1+r3]
lea r1, [r1+2*r3]
movh m4, [r1]
add r1, r3
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
RET
%endmacro
 
INIT_MMX mmxext
QPEL4_V_LOWPASS_OP put
QPEL4_V_LOWPASS_OP avg
 
 
 
%macro QPEL8OR16_V_LOWPASS_OP 1
%if cpuflag(sse2)
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
sub r1, r3
sub r1, r3
%else
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
%endif
pxor m7, m7
movh m0, [r1]
movh m1, [r1+r3]
lea r1, [r1+2*r3]
movh m2, [r1]
movh m3, [r1+r3]
lea r1, [r1+2*r3]
movh m4, [r1]
add r1, r3
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
cmp r4d, 16
jne .end
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
FILT_V %1
.end:
REP_RET
%endmacro
 
INIT_MMX mmxext
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg
 
INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg
 
 
; All functions that use this are required to have args:
; src, tmp, srcSize
%macro FILT_HV 1 ; offset
mova m6, m2
movh m5, [r0]
paddw m6, m3
psllw m6, 2
paddw m0, [pw_16]
psubw m6, m1
psubw m6, m4
punpcklbw m5, m7
pmullw m6, [pw_5]
paddw m0, m5
add r0, r2
paddw m6, m0
mova [r1+%1], m6
SWAP 0, 1, 2, 3, 4, 5
%endmacro
 
%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
movsxdifnidn r2, r2d
pxor m7, m7
movh m0, [r0]
movh m1, [r0+r2]
lea r0, [r0+2*r2]
movh m2, [r0]
movh m3, [r0+r2]
lea r0, [r0+2*r2]
movh m4, [r0]
add r0, r2
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_HV 0*24
FILT_HV 1*24
FILT_HV 2*24
FILT_HV 3*24
RET
 
cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
movsxdifnidn r2, r2d
mov r3d, 4
.loop:
mova m0, [r0]
paddw m0, [r0+10]
mova m1, [r0+2]
paddw m1, [r0+8]
mova m2, [r0+4]
paddw m2, [r0+6]
psubw m0, m1
psraw m0, 2
psubw m0, m1
paddsw m0, m2
psraw m0, 2
paddw m0, m2
psraw m0, 6
packuswb m0, m0
op_%1h m0, [r1], m7
add r0, 24
add r1, r2
dec r3d
jnz .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
QPEL4_HV1_LOWPASS_OP put
QPEL4_HV1_LOWPASS_OP avg
 
%macro QPEL8OR16_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
movsxdifnidn r2, r2d
pxor m7, m7
movh m0, [r0]
movh m1, [r0+r2]
lea r0, [r0+2*r2]
movh m2, [r0]
movh m3, [r0+r2]
lea r0, [r0+2*r2]
movh m4, [r0]
add r0, r2
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
FILT_HV 0*48
FILT_HV 1*48
FILT_HV 2*48
FILT_HV 3*48
FILT_HV 4*48
FILT_HV 5*48
FILT_HV 6*48
FILT_HV 7*48
cmp r3d, 16
jne .end
FILT_HV 8*48
FILT_HV 9*48
FILT_HV 10*48
FILT_HV 11*48
FILT_HV 12*48
FILT_HV 13*48
FILT_HV 14*48
FILT_HV 15*48
.end:
REP_RET
%endmacro
 
INIT_MMX mmxext
QPEL8OR16_HV1_LOWPASS_OP put
QPEL8OR16_HV1_LOWPASS_OP avg
 
INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put
 
 
 
%macro QPEL8OR16_HV2_LOWPASS_OP 1
; unused is to match ssse3 and mmxext args
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
movsxdifnidn r2, r2d
.loop:
mova m0, [r1]
mova m3, [r1+8]
mova m1, [r1+2]
mova m4, [r1+10]
paddw m0, m4
paddw m1, m3
paddw m3, [r1+18]
paddw m4, [r1+16]
mova m2, [r1+4]
mova m5, [r1+12]
paddw m2, [r1+6]
paddw m5, [r1+14]
psubw m0, m1
psubw m3, m4
psraw m0, 2
psraw m3, 2
psubw m0, m1
psubw m3, m4
paddsw m0, m2
paddsw m3, m5
psraw m0, 2
psraw m3, 2
paddw m0, m2
paddw m3, m5
psraw m0, 6
psraw m3, 6
packuswb m0, m3
op_%1 m0, [r0], m7
add r1, 48
add r0, r2
dec r4d
jne .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg
 
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
cmp r4d, 16
je .op16
.loop8:
mova m1, [r1+16]
mova m0, [r1]
mova m2, m1
mova m3, m1
mova m4, m1
mova m5, m1
palignr m5, m0, 10
palignr m4, m0, 8
palignr m3, m0, 6
palignr m2, m0, 4
palignr m1, m0, 2
paddw m0, m5
paddw m1, m4
paddw m2, m3
psubw m0, m1
psraw m0, 2
psubw m0, m1
paddw m0, m2
psraw m0, 2
paddw m0, m2
psraw m0, 6
packuswb m0, m0
op_%1h m0, [r0], m7
add r1, 48
add r0, r2
dec r4d
jne .loop8
jmp .done
.op16:
mova m4, [r1+32]
mova m5, [r1+16]
mova m7, [r1]
mova m3, m4
mova m2, m4
mova m1, m4
mova m0, m4
palignr m0, m5, 10
palignr m1, m5, 8
palignr m2, m5, 6
palignr m3, m5, 4
palignr m4, m5, 2
paddw m0, m5
paddw m1, m4
paddw m2, m3
mova m6, m5
mova m4, m5
mova m3, m5
palignr m4, m7, 8
palignr m6, m7, 2
palignr m3, m7, 10
paddw m4, m6
mova m6, m5
palignr m5, m7, 6
palignr m6, m7, 4
paddw m3, m7
paddw m5, m6
psubw m0, m1
psubw m3, m4
psraw m0, 2
psraw m3, 2
psubw m0, m1
psubw m3, m4
paddw m0, m2
paddw m3, m5
psraw m0, 2
psraw m3, 2
paddw m0, m2
paddw m3, m5
psraw m0, 6
psraw m3, 6
packuswb m3, m0
op_%1 m3, [r0], m7
add r1, 48
add r0, r2
dec r4d
jne .op16
.done:
REP_RET
%endmacro
 
INIT_XMM ssse3
QPEL8OR16_HV2_LOWPASS_OP_XMM put
QPEL8OR16_HV2_LOWPASS_OP_XMM avg
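 
; Note: pixels{4,8}_l2_shift5 normalize the 16-bit hv intermediates (row stride
; 24/48 bytes) with a >> 5, pack to 8 bits and average with the src8 plane; the
; C wrappers use this for the mc12/mc32 cases.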
 
 
%macro PIXELS4_L2_SHIFT5 1
cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mova m0, [r1]
mova m1, [r1+24]
psraw m0, 5
psraw m1, 5
packuswb m0, m0
packuswb m1, m1
pavgb m0, [r2]
pavgb m1, [r2+r4]
op_%1h m0, [r0], m4
op_%1h m1, [r0+r3], m5
lea r2, [r2+r4*2]
lea r0, [r0+r3*2]
mova m0, [r1+48]
mova m1, [r1+72]
psraw m0, 5
psraw m1, 5
packuswb m0, m0
packuswb m1, m1
pavgb m0, [r2]
pavgb m1, [r2+r4]
op_%1h m0, [r0], m4
op_%1h m1, [r0+r3], m5
RET
%endmacro
 
INIT_MMX mmxext
PIXELS4_L2_SHIFT5 put
PIXELS4_L2_SHIFT5 avg
 
 
%macro PIXELS8_L2_SHIFT5 1
cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
.loop:
mova m0, [r1]
mova m1, [r1+8]
mova m2, [r1+48]
mova m3, [r1+48+8]
psraw m0, 5
psraw m1, 5
psraw m2, 5
psraw m3, 5
packuswb m0, m1
packuswb m2, m3
pavgb m0, [r2]
pavgb m2, [r2+r4]
op_%1 m0, [r0], m4
op_%1 m2, [r0+r3], m5
lea r2, [r2+2*r4]
add r1, 48*2
lea r0, [r0+2*r3]
sub r5d, 2
jne .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PIXELS8_L2_SHIFT5 put
PIXELS8_L2_SHIFT5 avg
 
 
%if ARCH_X86_64
%macro QPEL16_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
mov r5d, 16
pxor m15, m15
mova m14, [pw_5]
mova m13, [pw_16]
.loop:
lddqu m1, [r1+6]
lddqu m7, [r1-2]
mova m0, m1
punpckhbw m1, m15
punpcklbw m0, m15
punpcklbw m7, m15
mova m2, m1
mova m6, m0
mova m3, m1
mova m8, m0
mova m4, m1
mova m9, m0
mova m12, m0
mova m11, m1
palignr m11, m0, 10
palignr m12, m7, 10
palignr m4, m0, 2
palignr m9, m7, 2
palignr m3, m0, 4
palignr m8, m7, 4
palignr m2, m0, 6
palignr m6, m7, 6
paddw m11, m0
palignr m1, m0, 8
palignr m0, m7, 8
paddw m7, m12
paddw m2, m3
paddw m6, m8
paddw m1, m4
paddw m0, m9
psllw m2, 2
psllw m6, 2
psubw m2, m1
psubw m6, m0
paddw m11, m13
paddw m7, m13
pmullw m2, m14
pmullw m6, m14
lddqu m3, [r2]
paddw m2, m11
paddw m6, m7
psraw m2, 5
psraw m6, 5
packuswb m6, m2
pavgb m6, m3
op_%1 m6, [r0], m11
add r1, r3
add r0, r3
add r2, r4
dec r5d
jg .loop
REP_RET
%endmacro
 
INIT_XMM ssse3
QPEL16_H_LOWPASS_L2_OP put
QPEL16_H_LOWPASS_L2_OP avg
%endif
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_weight.asm
0,0 → 1,317
;*****************************************************************************
;* SSE2-optimized weighted prediction code
;*****************************************************************************
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION .text
 
;-----------------------------------------------------------------------------
; biweight pred:
;
; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride,
; int height, int log2_denom, int weightd,
; int weights, int offset);
; and
; void h264_weight_16_sse2(uint8_t *dst, int stride, int height,
; int log2_denom, int weight, int offset);
;-----------------------------------------------------------------------------
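 
; Note: WEIGHT_SETUP broadcasts weight into m3, log2_denom into m6 and
; m5 = ((2*offset + 1) << log2_denom) >> 1, so that WEIGHT_OP's
; (pixel*weight + m5) >> log2_denom equals
; ((pixel*weight + (1 << (log2_denom - 1))) >> log2_denom) + offset.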
 
%macro WEIGHT_SETUP 0
add r5, r5
inc r5
movd m3, r4d
movd m5, r5d
movd m6, r3d
pslld m5, m6
psrld m5, 1
%if mmsize == 16
pshuflw m3, m3, 0
pshuflw m5, m5, 0
punpcklqdq m3, m3
punpcklqdq m5, m5
%else
pshufw m3, m3, 0
pshufw m5, m5, 0
%endif
pxor m7, m7
%endmacro
 
%macro WEIGHT_OP 2
movh m0, [r0+%1]
movh m1, [r0+%2]
punpcklbw m0, m7
punpcklbw m1, m7
pmullw m0, m3
pmullw m1, m3
paddsw m0, m5
paddsw m1, m5
psraw m0, m6
psraw m1, m6
packuswb m0, m1
%endmacro
 
INIT_MMX mmxext
cglobal h264_weight_16, 6, 6, 0
WEIGHT_SETUP
.nextrow:
WEIGHT_OP 0, 4
mova [r0 ], m0
WEIGHT_OP 8, 12
mova [r0+8], m0
add r0, r1
dec r2d
jnz .nextrow
REP_RET
 
%macro WEIGHT_FUNC_MM 2
cglobal h264_weight_%1, 6, 6, %2
WEIGHT_SETUP
.nextrow:
WEIGHT_OP 0, mmsize/2
mova [r0], m0
add r0, r1
dec r2d
jnz .nextrow
REP_RET
%endmacro
 
INIT_MMX mmxext
WEIGHT_FUNC_MM 8, 0
INIT_XMM sse2
WEIGHT_FUNC_MM 16, 8
 
%macro WEIGHT_FUNC_HALF_MM 2
cglobal h264_weight_%1, 6, 6, %2
WEIGHT_SETUP
sar r2d, 1
lea r3, [r1*2]
.nextrow:
WEIGHT_OP 0, r1
movh [r0], m0
%if mmsize == 16
movhps [r0+r1], m0
%else
psrlq m0, 32
movh [r0+r1], m0
%endif
add r0, r3
dec r2d
jnz .nextrow
REP_RET
%endmacro
 
INIT_MMX mmxext
WEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM 8, 8
 
%macro BIWEIGHT_SETUP 0
%if ARCH_X86_64
%define off_regd r7d
%else
%define off_regd r3d
%endif
mov off_regd, r7m
add off_regd, 1
or off_regd, 1
add r4, 1
cmp r5, 128
jne .normal
sar r5, 1
sar r6, 1
sar off_regd, 1
sub r4, 1
.normal:
%if cpuflag(ssse3)
movd m4, r5d
movd m0, r6d
%else
movd m3, r5d
movd m4, r6d
%endif
movd m5, off_regd
movd m6, r4d
pslld m5, m6
psrld m5, 1
%if cpuflag(ssse3)
punpcklbw m4, m0
pshuflw m4, m4, 0
pshuflw m5, m5, 0
punpcklqdq m4, m4
punpcklqdq m5, m5
 
%else
%if mmsize == 16
pshuflw m3, m3, 0
pshuflw m4, m4, 0
pshuflw m5, m5, 0
punpcklqdq m3, m3
punpcklqdq m4, m4
punpcklqdq m5, m5
%else
pshufw m3, m3, 0
pshufw m4, m4, 0
pshufw m5, m5, 0
%endif
pxor m7, m7
%endif
%endmacro
 
%macro BIWEIGHT_STEPA 3
movh m%1, [r0+%3]
movh m%2, [r1+%3]
punpcklbw m%1, m7
punpcklbw m%2, m7
pmullw m%1, m3
pmullw m%2, m4
paddsw m%1, m%2
%endmacro
 
%macro BIWEIGHT_STEPB 0
paddsw m0, m5
paddsw m1, m5
psraw m0, m6
psraw m1, m6
packuswb m0, m1
%endmacro
 
INIT_MMX mmxext
cglobal h264_biweight_16, 7, 8, 0
BIWEIGHT_SETUP
movifnidn r3d, r3m
.nextrow:
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, 4
BIWEIGHT_STEPB
mova [r0], m0
BIWEIGHT_STEPA 0, 1, 8
BIWEIGHT_STEPA 1, 2, 12
BIWEIGHT_STEPB
mova [r0+8], m0
add r0, r2
add r1, r2
dec r3d
jnz .nextrow
REP_RET
 
%macro BIWEIGHT_FUNC_MM 2
cglobal h264_biweight_%1, 7, 8, %2
BIWEIGHT_SETUP
movifnidn r3d, r3m
.nextrow:
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, mmsize/2
BIWEIGHT_STEPB
mova [r0], m0
add r0, r2
add r1, r2
dec r3d
jnz .nextrow
REP_RET
%endmacro
 
INIT_MMX mmxext
BIWEIGHT_FUNC_MM 8, 0
INIT_XMM sse2
BIWEIGHT_FUNC_MM 16, 8
 
%macro BIWEIGHT_FUNC_HALF_MM 2
cglobal h264_biweight_%1, 7, 8, %2
BIWEIGHT_SETUP
movifnidn r3d, r3m
sar r3, 1
lea r4, [r2*2]
.nextrow:
BIWEIGHT_STEPA 0, 1, 0
BIWEIGHT_STEPA 1, 2, r2
BIWEIGHT_STEPB
movh [r0], m0
%if mmsize == 16
movhps [r0+r2], m0
%else
psrlq m0, 32
movh [r0+r2], m0
%endif
add r0, r4
add r1, r4
dec r3d
jnz .nextrow
REP_RET
%endmacro
 
INIT_MMX mmxext
BIWEIGHT_FUNC_HALF_MM 4, 0
INIT_XMM sse2
BIWEIGHT_FUNC_HALF_MM 8, 8
 
%macro BIWEIGHT_SSSE3_OP 0
pmaddubsw m0, m4
pmaddubsw m2, m4
paddsw m0, m5
paddsw m2, m5
psraw m0, m6
psraw m2, m6
packuswb m0, m2
%endmacro
 
INIT_XMM ssse3
cglobal h264_biweight_16, 7, 8, 8
BIWEIGHT_SETUP
movifnidn r3d, r3m
 
.nextrow:
movh m0, [r0]
movh m2, [r0+8]
movh m3, [r1+8]
punpcklbw m0, [r1]
punpcklbw m2, m3
BIWEIGHT_SSSE3_OP
mova [r0], m0
add r0, r2
add r1, r2
dec r3d
jnz .nextrow
REP_RET
 
INIT_XMM ssse3
cglobal h264_biweight_8, 7, 8, 8
BIWEIGHT_SETUP
movifnidn r3d, r3m
sar r3, 1
lea r4, [r2*2]
 
.nextrow:
movh m0, [r0]
movh m1, [r1]
movh m2, [r0+r2]
movh m3, [r1+r2]
punpcklbw m0, m1
punpcklbw m2, m3
BIWEIGHT_SSSE3_OP
movh [r0], m0
movhps [r0+r2], m0
add r0, r4
add r1, r4
dec r3d
jnz .nextrow
REP_RET
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_weight_10bit.asm
0,0 → 1,282
;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA 32
 
pw_pixel_max: times 8 dw ((1 << 10)-1)
sq_1: dq 1
dq 0
 
cextern pw_1
 
SECTION .text
 
;-----------------------------------------------------------------------------
; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom,
; int weight, int offset);
;-----------------------------------------------------------------------------
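;-----------------------------------------------------------------------------
; Reference sketch (illustrative, not part of FFmpeg): decoding the packed
; constants built in WEIGHT_SETUP below, each output sample is effectively
;
;     v   = (2*pix*weight + (((offset << 3) + 1) << log2_denom)) >> (log2_denom + 1);
;     out = av_clip(v, 0, 1023);   /* pw_pixel_max clamp */
;
; i.e. (pix*weight + rounding) >> log2_denom plus the offset scaled to the
; 10-bit range; pairing each pixel word with 1<<log2_denom lets a single
; pmaddwd do the multiply and the bias add together.
;-----------------------------------------------------------------------------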
%macro WEIGHT_PROLOGUE 0
.prologue:
PROLOGUE 0,6,8
movifnidn r0, r0mp
movifnidn r1d, r1m
movifnidn r2d, r2m
movifnidn r4d, r4m
movifnidn r5d, r5m
%endmacro
 
%macro WEIGHT_SETUP 0
mova m0, [pw_1]
movd m2, r3m
pslld m0, m2 ; 1<<log2_denom
SPLATW m0, m0
shl r5, 19 ; *8, move to upper half of dword
lea r5, [r5+r4*2+0x10000]
movd m3, r5d ; weight<<1 | 1+(offset<<(3))
pshufd m3, m3, 0
mova m4, [pw_pixel_max]
paddw m2, [sq_1] ; log2_denom+1
%if notcpuflag(sse4)
pxor m7, m7
%endif
%endmacro
 
%macro WEIGHT_OP 1-2
%if %0==1
mova m5, [r0+%1]
punpckhwd m6, m5, m0
punpcklwd m5, m0
%else
movq m5, [r0+%1]
movq m6, [r0+%2]
punpcklwd m5, m0
punpcklwd m6, m0
%endif
pmaddwd m5, m3
pmaddwd m6, m3
psrad m5, m2
psrad m6, m2
%if cpuflag(sse4)
packusdw m5, m6
pminsw m5, m4
%else
packssdw m5, m6
CLIPW m5, m7, m4
%endif
%endmacro
 
%macro WEIGHT_FUNC_DBL 0
cglobal h264_weight_16_10
WEIGHT_PROLOGUE
WEIGHT_SETUP
.nextrow:
WEIGHT_OP 0
mova [r0 ], m5
WEIGHT_OP 16
mova [r0+16], m5
add r0, r1
dec r2d
jnz .nextrow
REP_RET
%endmacro
 
INIT_XMM sse2
WEIGHT_FUNC_DBL
INIT_XMM sse4
WEIGHT_FUNC_DBL
 
 
%macro WEIGHT_FUNC_MM 0
cglobal h264_weight_8_10
WEIGHT_PROLOGUE
WEIGHT_SETUP
.nextrow:
WEIGHT_OP 0
mova [r0], m5
add r0, r1
dec r2d
jnz .nextrow
REP_RET
%endmacro
 
INIT_XMM sse2
WEIGHT_FUNC_MM
INIT_XMM sse4
WEIGHT_FUNC_MM
 
 
%macro WEIGHT_FUNC_HALF_MM 0
cglobal h264_weight_4_10
WEIGHT_PROLOGUE
sar r2d, 1
WEIGHT_SETUP
lea r3, [r1*2]
.nextrow:
WEIGHT_OP 0, r1
movh [r0], m5
movhps [r0+r1], m5
add r0, r3
dec r2d
jnz .nextrow
REP_RET
%endmacro
 
INIT_XMM sse2
WEIGHT_FUNC_HALF_MM
INIT_XMM sse4
WEIGHT_FUNC_HALF_MM
 
 
;-----------------------------------------------------------------------------
; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height,
; int log2_denom, int weightd, int weights, int offset);
;-----------------------------------------------------------------------------
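;-----------------------------------------------------------------------------
; Reference sketch (illustrative, not part of FFmpeg): per the constants built
; in BIWEIGHT_SETUP below, each output sample is effectively
;
;     v   = (dst*weightd + src*weights + (((offset << 2) + 1) << log2_denom)) >> (log2_denom + 1);
;     out = av_clip(v, 0, 1023);   /* pw_pixel_max clamp */
;
; where the punpcklwd/pmaddwd pairing evaluates dst*weightd + src*weights in a
; single multiply-add per sample.
;-----------------------------------------------------------------------------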
%if ARCH_X86_32
DECLARE_REG_TMP 3
%else
DECLARE_REG_TMP 7
%endif
 
%macro BIWEIGHT_PROLOGUE 0
.prologue:
PROLOGUE 0,8,8
movifnidn r0, r0mp
movifnidn r1, r1mp
movifnidn r2d, r2m
movifnidn r5d, r5m
movifnidn r6d, r6m
movifnidn t0d, r7m
%endmacro
 
%macro BIWEIGHT_SETUP 0
lea t0, [t0*4+1] ; (offset<<2)+1
or t0, 1
shl r6, 16
or r5, r6
movd m4, r5d ; weightd | weights
movd m5, t0d ; ((offset<<2)+1)|1
movd m6, r4m ; log2_denom
pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom
paddd m6, [sq_1]
pshufd m4, m4, 0
pshufd m5, m5, 0
mova m3, [pw_pixel_max]
movifnidn r3d, r3m
%if notcpuflag(sse4)
pxor m7, m7
%endif
%endmacro
 
%macro BIWEIGHT 1-2
%if %0==1
mova m0, [r0+%1]
mova m1, [r1+%1]
punpckhwd m2, m0, m1
punpcklwd m0, m1
%else
movq m0, [r0+%1]
movq m1, [r1+%1]
punpcklwd m0, m1
movq m2, [r0+%2]
movq m1, [r1+%2]
punpcklwd m2, m1
%endif
pmaddwd m0, m4
pmaddwd m2, m4
paddd m0, m5
paddd m2, m5
psrad m0, m6
psrad m2, m6
%if cpuflag(sse4)
packusdw m0, m2
pminsw m0, m3
%else
packssdw m0, m2
CLIPW m0, m7, m3
%endif
%endmacro
 
%macro BIWEIGHT_FUNC_DBL 0
cglobal h264_biweight_16_10
BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP
.nextrow:
BIWEIGHT 0
mova [r0 ], m0
BIWEIGHT 16
mova [r0+16], m0
add r0, r2
add r1, r2
dec r3d
jnz .nextrow
REP_RET
%endmacro
 
INIT_XMM sse2
BIWEIGHT_FUNC_DBL
INIT_XMM sse4
BIWEIGHT_FUNC_DBL
 
%macro BIWEIGHT_FUNC 0
cglobal h264_biweight_8_10
BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP
.nextrow:
BIWEIGHT 0
mova [r0], m0
add r0, r2
add r1, r2
dec r3d
jnz .nextrow
REP_RET
%endmacro
 
INIT_XMM sse2
BIWEIGHT_FUNC
INIT_XMM sse4
BIWEIGHT_FUNC
 
%macro BIWEIGHT_FUNC_HALF 0
cglobal h264_biweight_4_10
BIWEIGHT_PROLOGUE
BIWEIGHT_SETUP
sar r3d, 1
lea r4, [r2*2]
.nextrow:
BIWEIGHT 0, r2
movh [r0 ], m0
movhps [r0+r2], m0
add r0, r4
add r1, r4
dec r3d
jnz .nextrow
REP_RET
%endmacro
 
INIT_XMM sse2
BIWEIGHT_FUNC_HALF
INIT_XMM sse4
BIWEIGHT_FUNC_HALF
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264chroma_init.c
0,0 → 1,119
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <stdint.h>
 
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264chroma.h"
 
void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
 
void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_mmxext (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
 
void ff_put_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
 
void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
 
void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
 
#define CHROMA_MC(OP, NUM, DEPTH, OPT) \
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, uint8_t *src, \
int stride, int h, int x, int y);
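 
/* Illustrative expansion (added note, not in the original source): e.g.
 * CHROMA_MC(put, 4, 10, mmxext) declares
 *     void ff_put_h264_chroma_mc4_10_mmxext(uint8_t *dst, uint8_t *src,
 *                                           int stride, int h, int x, int y);
 * which is the prototype assigned into put_h264_chroma_pixels_tab[1] below. */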
 
CHROMA_MC(put, 2, 10, mmxext)
CHROMA_MC(avg, 2, 10, mmxext)
CHROMA_MC(put, 4, 10, mmxext)
CHROMA_MC(avg, 4, 10, mmxext)
CHROMA_MC(put, 8, 10, sse2)
CHROMA_MC(avg, 8, 10, sse2)
CHROMA_MC(put, 8, 10, avx)
CHROMA_MC(avg, 8, 10, avx)
 
av_cold void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth)
{
#if HAVE_YASM
int high_bit_depth = bit_depth > 8;
int cpu_flags = av_get_cpu_flags();
 
if (EXTERNAL_MMX(cpu_flags) && !high_bit_depth) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx;
}
 
if (EXTERNAL_AMD3DNOW(cpu_flags) && !high_bit_depth) {
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow;
}
 
if (EXTERNAL_MMXEXT(cpu_flags) && !high_bit_depth) {
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext;
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext;
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext;
}
 
if (EXTERNAL_MMXEXT(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext;
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext;
}
 
if (EXTERNAL_SSE2(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2;
}
 
if (EXTERNAL_SSSE3(cpu_flags) && !high_bit_depth) {
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3;
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3;
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3;
}
 
if (EXTERNAL_AVX(cpu_flags) && bit_depth > 8 && bit_depth <= 10) {
// AVX implies !cache64.
// TODO: Port cache(32|64) detection from x264.
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx;
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx;
}
#endif
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264dsp_init.c
0,0 → 1,371
/*
* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/h264dsp.h"
#include "dsputil_x86.h"
 
/***********************************/
/* IDCT */
#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \
int16_t *block, \
int stride);
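 
/* Illustrative expansion (added note, not in the original source): e.g.
 * IDCT_ADD_FUNC(8_dc, 10, sse2) declares
 *     void ff_h264_idct8_dc_add_10_sse2(uint8_t *dst, int16_t *block, int stride);
 * matching the names assigned into the H264DSPContext below. */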
 
IDCT_ADD_FUNC(, 8, mmx)
IDCT_ADD_FUNC(, 10, sse2)
IDCT_ADD_FUNC(_dc, 8, mmxext)
IDCT_ADD_FUNC(_dc, 10, mmxext)
IDCT_ADD_FUNC(8_dc, 8, mmxext)
IDCT_ADD_FUNC(8_dc, 10, sse2)
IDCT_ADD_FUNC(8, 8, mmx)
IDCT_ADD_FUNC(8, 8, sse2)
IDCT_ADD_FUNC(8, 10, sse2)
IDCT_ADD_FUNC(, 10, avx)
IDCT_ADD_FUNC(8_dc, 10, avx)
IDCT_ADD_FUNC(8, 10, avx)
 
 
#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
(uint8_t *dst, const int *block_offset, \
int16_t *block, int stride, const uint8_t nnzc[6 * 8]);
 
IDCT_ADD_REP_FUNC(8, 4, 8, mmx)
IDCT_ADD_REP_FUNC(8, 4, 8, mmxext)
IDCT_ADD_REP_FUNC(8, 4, 8, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, sse2)
IDCT_ADD_REP_FUNC(8, 4, 10, avx)
IDCT_ADD_REP_FUNC(, 16, 8, mmx)
IDCT_ADD_REP_FUNC(, 16, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16, 8, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx)
IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext)
IDCT_ADD_REP_FUNC(, 16intra, 8, sse2)
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2)
IDCT_ADD_REP_FUNC(, 16, 10, avx)
IDCT_ADD_REP_FUNC(, 16intra, 10, avx)
 
 
#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \
(uint8_t **dst, const int *block_offset, \
int16_t *block, int stride, const uint8_t nnzc[6 * 8]);
 
IDCT_ADD_REP_FUNC2(, 8, 8, mmx)
IDCT_ADD_REP_FUNC2(, 8, 8, mmxext)
IDCT_ADD_REP_FUNC2(, 8, 8, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, sse2)
IDCT_ADD_REP_FUNC2(, 8, 10, avx)
 
void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul);
void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul);
 
/***********************************/
/* deblocking */
 
void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40],
int8_t ref[2][40],
int16_t mv[2][40][2],
int bidir, int edges, int step,
int mask_mv0, int mask_mv1, int field);
 
#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
int stride, \
int alpha, \
int beta, \
int8_t *tc0);
#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \
int stride, \
int alpha, \
int beta);
 
#define LF_FUNCS(type, depth) \
LF_FUNC(h, chroma, depth, mmxext) \
LF_IFUNC(h, chroma_intra, depth, mmxext) \
LF_FUNC(v, chroma, depth, mmxext) \
LF_IFUNC(v, chroma_intra, depth, mmxext) \
LF_FUNC(h, luma, depth, mmxext) \
LF_IFUNC(h, luma_intra, depth, mmxext) \
LF_FUNC(h, luma, depth, sse2) \
LF_IFUNC(h, luma_intra, depth, sse2) \
LF_FUNC(v, luma, depth, sse2) \
LF_IFUNC(v, luma_intra, depth, sse2) \
LF_FUNC(h, chroma, depth, sse2) \
LF_IFUNC(h, chroma_intra, depth, sse2) \
LF_FUNC(v, chroma, depth, sse2) \
LF_IFUNC(v, chroma_intra, depth, sse2) \
LF_FUNC(h, luma, depth, avx) \
LF_IFUNC(h, luma_intra, depth, avx) \
LF_FUNC(v, luma, depth, avx) \
LF_IFUNC(v, luma_intra, depth, avx) \
LF_FUNC(h, chroma, depth, avx) \
LF_IFUNC(h, chroma_intra, depth, avx) \
LF_FUNC(v, chroma, depth, avx) \
LF_IFUNC(v, chroma_intra, depth, avx)
 
LF_FUNCS(uint8_t, 8)
LF_FUNCS(uint16_t, 10)
 
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
LF_FUNC(v8, luma, 8, mmxext)
static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha,
int beta, int8_t *tc0)
{
if ((tc0[0] & tc0[1]) >= 0)
ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0);
if ((tc0[2] & tc0[3]) >= 0)
ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2);
}
LF_IFUNC(v8, luma_intra, 8, mmxext)
static void deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride,
int alpha, int beta)
{
ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta);
ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta);
}
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
 
LF_FUNC(v, luma, 10, mmxext)
LF_IFUNC(v, luma_intra, 10, mmxext)
 
/***********************************/
/* weighted prediction */
 
#define H264_WEIGHT(W, OPT) \
void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, int stride, \
int height, int log2_denom, \
int weight, int offset);
 
#define H264_BIWEIGHT(W, OPT) \
void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \
int stride, int height, \
int log2_denom, int weightd, \
int weights, int offset);
 
#define H264_BIWEIGHT_MMX(W) \
H264_WEIGHT(W, mmxext) \
H264_BIWEIGHT(W, mmxext)
 
#define H264_BIWEIGHT_MMX_SSE(W) \
H264_BIWEIGHT_MMX(W) \
H264_WEIGHT(W, sse2) \
H264_BIWEIGHT(W, sse2) \
H264_BIWEIGHT(W, ssse3)
 
H264_BIWEIGHT_MMX_SSE(16)
H264_BIWEIGHT_MMX_SSE(8)
H264_BIWEIGHT_MMX(4)
 
#define H264_WEIGHT_10(W, DEPTH, OPT) \
void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
int stride, \
int height, \
int log2_denom, \
int weight, \
int offset);
 
#define H264_BIWEIGHT_10(W, DEPTH, OPT) \
void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \
uint8_t *src, \
int stride, \
int height, \
int log2_denom, \
int weightd, \
int weights, \
int offset);
 
#define H264_BIWEIGHT_10_SSE(W, DEPTH) \
H264_WEIGHT_10(W, DEPTH, sse2) \
H264_WEIGHT_10(W, DEPTH, sse4) \
H264_BIWEIGHT_10(W, DEPTH, sse2) \
H264_BIWEIGHT_10(W, DEPTH, sse4)
 
H264_BIWEIGHT_10_SSE(16, 10)
H264_BIWEIGHT_10_SSE(8, 10)
H264_BIWEIGHT_10_SSE(4, 10)
 
av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth,
const int chroma_format_idc)
{
#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
 
if (chroma_format_idc == 1 && EXTERNAL_MMXEXT(cpu_flags))
c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmxext;
 
if (bit_depth == 8) {
if (EXTERNAL_MMX(cpu_flags)) {
c->h264_idct_dc_add =
c->h264_idct_add = ff_h264_idct_add_8_mmx;
c->h264_idct8_dc_add =
c->h264_idct8_add = ff_h264_idct8_add_8_mmx;
 
c->h264_idct_add16 = ff_h264_idct_add16_8_mmx;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx;
if (chroma_format_idc == 1)
c->h264_idct_add8 = ff_h264_idct_add8_8_mmx;
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx;
if (cpu_flags & AV_CPU_FLAG_CMOV)
c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmxext;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext;
c->h264_idct_add16 = ff_h264_idct_add16_8_mmxext;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmxext;
if (chroma_format_idc == 1)
c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext;
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext;
 
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmxext;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext;
if (chroma_format_idc == 1) {
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext;
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext;
}
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL
c->h264_v_loop_filter_luma = deblock_v_luma_8_mmxext;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmxext;
c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext;
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext;
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext;
 
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->h264_idct8_add = ff_h264_idct8_add_8_sse2;
 
c->h264_idct_add16 = ff_h264_idct_add16_8_sse2;
c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2;
if (chroma_format_idc == 1)
c->h264_idct_add8 = ff_h264_idct_add8_8_sse2;
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2;
c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2;
 
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2;
 
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2;
 
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3;
}
if (EXTERNAL_AVX(cpu_flags)) {
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx;
}
} else if (bit_depth == 10) {
if (EXTERNAL_MMXEXT(cpu_flags)) {
#if ARCH_X86_32
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext;
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext;
#endif /* ARCH_X86_32 */
c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->h264_idct_add = ff_h264_idct_add_10_sse2;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2;
 
c->h264_idct_add16 = ff_h264_idct_add16_10_sse2;
if (chroma_format_idc == 1)
c->h264_idct_add8 = ff_h264_idct_add8_10_sse2;
c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2;
#if HAVE_ALIGNED_STACK
c->h264_idct8_add = ff_h264_idct8_add_10_sse2;
c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2;
#endif /* HAVE_ALIGNED_STACK */
 
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2;
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2;
 
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2;
 
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2;
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2;
#endif /* HAVE_ALIGNED_STACK */
}
if (EXTERNAL_SSE4(cpu_flags)) {
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4;
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4;
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4;
 
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4;
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4;
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4;
}
if (EXTERNAL_AVX(cpu_flags)) {
c->h264_idct_dc_add =
c->h264_idct_add = ff_h264_idct_add_10_avx;
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx;
 
c->h264_idct_add16 = ff_h264_idct_add16_10_avx;
if (chroma_format_idc == 1)
c->h264_idct_add8 = ff_h264_idct_add8_10_avx;
c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx;
#if HAVE_ALIGNED_STACK
c->h264_idct8_add = ff_h264_idct8_add_10_avx;
c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx;
#endif /* HAVE_ALIGNED_STACK */
 
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_avx;
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx;
#if HAVE_ALIGNED_STACK
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx;
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx;
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx;
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx;
#endif /* HAVE_ALIGNED_STACK */
}
}
#endif
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/hpeldsp.asm
0,0 → 1,461
;******************************************************************************
;*
;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org>
;* Copyright (c) Nick Kurshev <nickols_k@mail.ru>
;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz>
;* Copyright (c) 2013 Daniel Kang
;*
;* MMX optimized hpel functions
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
cextern pb_1
 
SECTION_TEXT
 
; put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
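; Reference sketch (illustrative, not part of FFmpeg): the *_x2 variants store
; the rounded average of each source pixel and its right-hand neighbour, i.e.
; per row of 8 bytes
;     for (i = 0; i < 8; i++)
;         block[i] = (pixels[i] + pixels[i + 1] + 1) >> 1;   /* PAVGB rounding */
; the loop below simply unrolls this over four rows per iteration.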
%macro PUT_PIXELS8_X2 0
cglobal put_pixels8_x2, 4,5
lea r4, [r2*2]
.loop:
mova m0, [r1]
mova m1, [r1+r2]
PAVGB m0, [r1+1]
PAVGB m1, [r1+r2+1]
mova [r0], m0
mova [r0+r2], m1
add r1, r4
add r0, r4
mova m0, [r1]
mova m1, [r1+r2]
PAVGB m0, [r1+1]
PAVGB m1, [r1+r2+1]
add r1, r4
mova [r0], m0
mova [r0+r2], m1
add r0, r4
sub r3d, 4
jne .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PUT_PIXELS8_X2
INIT_MMX 3dnow
PUT_PIXELS8_X2
 
 
; put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS_16 0
cglobal put_pixels16_x2, 4,5
lea r4, [r2*2]
.loop:
mova m0, [r1]
mova m1, [r1+r2]
mova m2, [r1+8]
mova m3, [r1+r2+8]
PAVGB m0, [r1+1]
PAVGB m1, [r1+r2+1]
PAVGB m2, [r1+9]
PAVGB m3, [r1+r2+9]
mova [r0], m0
mova [r0+r2], m1
mova [r0+8], m2
mova [r0+r2+8], m3
add r1, r4
add r0, r4
mova m0, [r1]
mova m1, [r1+r2]
mova m2, [r1+8]
mova m3, [r1+r2+8]
PAVGB m0, [r1+1]
PAVGB m1, [r1+r2+1]
PAVGB m2, [r1+9]
PAVGB m3, [r1+r2+9]
add r1, r4
mova [r0], m0
mova [r0+r2], m1
mova [r0+8], m2
mova [r0+r2+8], m3
add r0, r4
sub r3d, 4
jne .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PUT_PIXELS_16
INIT_MMX 3dnow
PUT_PIXELS_16
 
 
; put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_NO_RND_PIXELS8_X2 0
cglobal put_no_rnd_pixels8_x2, 4,5
mova m6, [pb_1]
lea r4, [r2*2]
.loop:
mova m0, [r1]
mova m2, [r1+r2]
mova m1, [r1+1]
mova m3, [r1+r2+1]
add r1, r4
psubusb m0, m6
psubusb m2, m6
PAVGB m0, m1
PAVGB m2, m3
mova [r0], m0
mova [r0+r2], m2
mova m0, [r1]
mova m1, [r1+1]
mova m2, [r1+r2]
mova m3, [r1+r2+1]
add r0, r4
add r1, r4
psubusb m0, m6
psubusb m2, m6
PAVGB m0, m1
PAVGB m2, m3
mova [r0], m0
mova [r0+r2], m2
add r0, r4
sub r3d, 4
jne .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2
 
 
; put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_NO_RND_PIXELS8_X2_EXACT 0
cglobal put_no_rnd_pixels8_x2_exact, 4,5
lea r4, [r2*3]
pcmpeqb m6, m6
.loop:
mova m0, [r1]
mova m2, [r1+r2]
mova m1, [r1+1]
mova m3, [r1+r2+1]
pxor m0, m6
pxor m2, m6
pxor m1, m6
pxor m3, m6
PAVGB m0, m1
PAVGB m2, m3
pxor m0, m6
pxor m2, m6
mova [r0], m0
mova [r0+r2], m2
mova m0, [r1+r2*2]
mova m1, [r1+r2*2+1]
mova m2, [r1+r4]
mova m3, [r1+r4+1]
pxor m0, m6
pxor m1, m6
pxor m2, m6
pxor m3, m6
PAVGB m0, m1
PAVGB m2, m3
pxor m0, m6
pxor m2, m6
mova [r0+r2*2], m0
mova [r0+r4], m2
lea r1, [r1+r2*4]
lea r0, [r0+r2*4]
sub r3d, 4
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PUT_NO_RND_PIXELS8_X2_EXACT
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_X2_EXACT
 
 
; put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_PIXELS8_Y2 0
cglobal put_pixels8_y2, 4,5
lea r4, [r2*2]
mova m0, [r1]
sub r0, r2
.loop:
mova m1, [r1+r2]
mova m2, [r1+r4]
add r1, r4
PAVGB m0, m1
PAVGB m1, m2
mova [r0+r2], m0
mova [r0+r4], m1
mova m1, [r1+r2]
mova m0, [r1+r4]
add r0, r4
add r1, r4
PAVGB m2, m1
PAVGB m1, m0
mova [r0+r2], m2
mova [r0+r4], m1
add r0, r4
sub r3d, 4
jne .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PUT_PIXELS8_Y2
INIT_MMX 3dnow
PUT_PIXELS8_Y2
 
 
; put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_NO_RND_PIXELS8_Y2 0
cglobal put_no_rnd_pixels8_y2, 4,5
mova m6, [pb_1]
lea r4, [r2+r2]
mova m0, [r1]
sub r0, r2
.loop:
mova m1, [r1+r2]
mova m2, [r1+r4]
add r1, r4
psubusb m1, m6
PAVGB m0, m1
PAVGB m1, m2
mova [r0+r2], m0
mova [r0+r4], m1
mova m1, [r1+r2]
mova m0, [r1+r4]
add r0, r4
add r1, r4
psubusb m1, m6
PAVGB m2, m1
PAVGB m1, m0
mova [r0+r2], m2
mova [r0+r4], m1
add r0, r4
sub r3d, 4
jne .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2
 
 
; put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0
cglobal put_no_rnd_pixels8_y2_exact, 4,5
lea r4, [r2*3]
mova m0, [r1]
pcmpeqb m6, m6
add r1, r2
pxor m0, m6
.loop:
mova m1, [r1]
mova m2, [r1+r2]
pxor m1, m6
pxor m2, m6
PAVGB m0, m1
PAVGB m1, m2
pxor m0, m6
pxor m1, m6
mova [r0], m0
mova [r0+r2], m1
mova m1, [r1+r2*2]
mova m0, [r1+r4]
pxor m1, m6
pxor m0, m6
PAVGB m2, m1
PAVGB m1, m0
pxor m2, m6
pxor m1, m6
mova [r0+r2*2], m2
mova [r0+r4], m1
lea r1, [r1+r2*4]
lea r0, [r0+r2*4]
sub r3d, 4
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PUT_NO_RND_PIXELS8_Y2_EXACT
INIT_MMX 3dnow
PUT_NO_RND_PIXELS8_Y2_EXACT
 
 
; avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8 0
cglobal avg_pixels8, 4,5
lea r4, [r2*2]
.loop:
mova m0, [r0]
mova m1, [r0+r2]
PAVGB m0, [r1]
PAVGB m1, [r1+r2]
mova [r0], m0
mova [r0+r2], m1
add r1, r4
add r0, r4
mova m0, [r0]
mova m1, [r0+r2]
PAVGB m0, [r1]
PAVGB m1, [r1+r2]
add r1, r4
mova [r0], m0
mova [r0+r2], m1
add r0, r4
sub r3d, 4
jne .loop
REP_RET
%endmacro
 
INIT_MMX 3dnow
AVG_PIXELS8
 
 
; avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_X2 0
cglobal avg_pixels8_x2, 4,5
lea r4, [r2*2]
.loop:
mova m0, [r1]
mova m2, [r1+r2]
PAVGB m0, [r1+1]
PAVGB m2, [r1+r2+1]
PAVGB m0, [r0]
PAVGB m2, [r0+r2]
add r1, r4
mova [r0], m0
mova [r0+r2], m2
mova m0, [r1]
mova m2, [r1+r2]
PAVGB m0, [r1+1]
PAVGB m2, [r1+r2+1]
add r0, r4
add r1, r4
PAVGB m0, [r0]
PAVGB m2, [r0+r2]
mova [r0], m0
mova [r0+r2], m2
add r0, r4
sub r3d, 4
jne .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
AVG_PIXELS8_X2
INIT_MMX 3dnow
AVG_PIXELS8_X2
 
 
; avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_Y2 0
cglobal avg_pixels8_y2, 4,5
lea r4, [r2*2]
mova m0, [r1]
sub r0, r2
.loop:
mova m1, [r1+r2]
mova m2, [r1+r4]
add r1, r4
PAVGB m0, m1
PAVGB m1, m2
mova m3, [r0+r2]
mova m4, [r0+r4]
PAVGB m0, m3
PAVGB m1, m4
mova [r0+r2], m0
mova [r0+r4], m1
mova m1, [r1+r2]
mova m0, [r1+r4]
PAVGB m2, m1
PAVGB m1, m0
add r0, r4
add r1, r4
mova m3, [r0+r2]
mova m4, [r0+r4]
PAVGB m2, m3
PAVGB m1, m4
mova [r0+r2], m2
mova [r0+r4], m1
add r0, r4
sub r3d, 4
jne .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
AVG_PIXELS8_Y2
INIT_MMX 3dnow
AVG_PIXELS8_Y2
 
 
; avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
%macro AVG_PIXELS8_XY2 0
cglobal avg_pixels8_xy2, 4,5
mova m6, [pb_1]
lea r4, [r2*2]
mova m0, [r1]
pavgb m0, [r1+1]
.loop:
mova m2, [r1+r4]
mova m1, [r1+r2]
psubusb m2, m6
pavgb m1, [r1+r2+1]
pavgb m2, [r1+r4+1]
add r1, r4
pavgb m0, m1
pavgb m1, m2
pavgb m0, [r0]
pavgb m1, [r0+r2]
mova [r0], m0
mova [r0+r2], m1
mova m1, [r1+r2]
mova m0, [r1+r4]
pavgb m1, [r1+r2+1]
pavgb m0, [r1+r4+1]
add r0, r4
add r1, r4
pavgb m2, m1
pavgb m1, m0
pavgb m2, [r0]
pavgb m1, [r0+r2]
mova [r0], m2
mova [r0+r2], m1
add r0, r4
sub r3d, 4
jne .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
AVG_PIXELS8_XY2
INIT_MMX 3dnow
AVG_PIXELS8_XY2
/contrib/sdk/sources/ffmpeg/libavcodec/x86/hpeldsp_init.c
0,0 → 1,269
/*
* MMX optimized DSP utils
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
*/
 
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/hpeldsp.h"
#include "dsputil_x86.h"
 
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block,
const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h);
 
#define avg_pixels8_mmx ff_avg_pixels8_mmx
#define avg_pixels8_x2_mmx ff_avg_pixels8_x2_mmx
#define avg_pixels16_mmx ff_avg_pixels16_mmx
#define avg_pixels8_xy2_mmx ff_avg_pixels8_xy2_mmx
#define avg_pixels16_xy2_mmx ff_avg_pixels16_xy2_mmx
#define put_pixels8_mmx ff_put_pixels8_mmx
#define put_pixels16_mmx ff_put_pixels16_mmx
#define put_pixels8_xy2_mmx ff_put_pixels8_xy2_mmx
#define put_pixels16_xy2_mmx ff_put_pixels16_xy2_mmx
#define avg_no_rnd_pixels16_mmx ff_avg_pixels16_mmx
#define put_no_rnd_pixels8_mmx ff_put_pixels8_mmx
#define put_no_rnd_pixels16_mmx ff_put_pixels16_mmx
 
#if HAVE_INLINE_ASM
 
/***********************************/
/* MMX no rounding */
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx
#define SET_RND MOVQ_WONE
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e)
#define STATIC static
 
#include "rnd_template.c"
#include "hpeldsp_rnd_template.c"
 
#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
#undef STATIC
 
PIXELS16(static, avg_no_rnd, , _y2, _mmx)
PIXELS16(static, put_no_rnd, , _y2, _mmx)
 
PIXELS16(static, avg_no_rnd, , _xy2, _mmx)
PIXELS16(static, put_no_rnd, , _xy2, _mmx)
 
/***********************************/
/* MMX rounding */
 
#define DEF(x, y) x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
 
#include "hpeldsp_rnd_template.c"
 
#undef DEF
#undef SET_RND
#undef PAVGBP
#undef PAVGB
 
PIXELS16(static, avg, , _y2, _mmx)
PIXELS16(static, put, , _y2, _mmx)
 
#endif /* HAVE_INLINE_ASM */
 
 
#if HAVE_YASM
 
#define HPELDSP_AVG_PIXELS16(CPUEXT) \
PIXELS16(static, put_no_rnd, ff_, _x2, CPUEXT) \
PIXELS16(static, put, ff_, _y2, CPUEXT) \
PIXELS16(static, put_no_rnd, ff_, _y2, CPUEXT) \
PIXELS16(static, avg, ff_, , CPUEXT) \
PIXELS16(static, avg, ff_, _x2, CPUEXT) \
PIXELS16(static, avg, ff_, _y2, CPUEXT) \
PIXELS16(static, avg, ff_, _xy2, CPUEXT)
 
HPELDSP_AVG_PIXELS16(_3dnow)
HPELDSP_AVG_PIXELS16(_mmxext)
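 
/* Illustrative note (an assumption based on the PIXELS16 macro from
 * dsputil_x86.h): each HPELDSP_AVG_PIXELS16(...) invocation above is expected
 * to expand into static 16-pixel wrappers that call the matching 8-pixel
 * routine twice, e.g.
 *
 *     static void avg_pixels16_y2_mmxext(uint8_t *block, const uint8_t *pixels,
 *                                        ptrdiff_t line_size, int h)
 *     {
 *         ff_avg_pixels8_y2_mmxext(block,     pixels,     line_size, h);
 *         ff_avg_pixels8_y2_mmxext(block + 8, pixels + 8, line_size, h);
 *     }
 *
 * These wrappers are what hpeldsp_init_mmxext()/hpeldsp_init_3dnow() install
 * into the 16-pixel slots of the function tables. */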
 
#endif /* HAVE_YASM */
 
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \
do { \
c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \
c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \
c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \
c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \
} while (0)
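 
/* Illustrative expansion (added note, not in the original source):
 * SET_HPEL_FUNCS(put, [0], 16, mmx) assigns
 *     c->put_pixels_tab[0][0] = put_pixels16_mmx;
 *     c->put_pixels_tab[0][1] = put_pixels16_x2_mmx;
 *     c->put_pixels_tab[0][2] = put_pixels16_y2_mmx;
 *     c->put_pixels_tab[0][3] = put_pixels16_xy2_mmx;
 * i.e. the [copy, x2, y2, xy2] half-pel variants for 16-pixel-wide blocks. */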
 
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int cpu_flags)
{
#if HAVE_MMX_INLINE
SET_HPEL_FUNCS(put, [0], 16, mmx);
SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx);
SET_HPEL_FUNCS(avg, [0], 16, mmx);
SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx);
SET_HPEL_FUNCS(put, [1], 8, mmx);
SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx);
SET_HPEL_FUNCS(avg, [1], 8, mmx);
#endif /* HAVE_MMX_INLINE */
}
 
static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags)
{
#if HAVE_MMXEXT_EXTERNAL
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext;
c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext;
 
c->avg_pixels_tab[0][0] = avg_pixels16_mmxext;
c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext;
c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext;
 
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext;
 
c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext;
 
if (!(flags & CODEC_FLAG_BITEXACT)) {
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext;
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext;
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext;
 
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext;
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext;
}
 
if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext;
}
#endif /* HAVE_MMXEXT_EXTERNAL */
}
 
static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags)
{
#if HAVE_AMD3DNOW_EXTERNAL
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow;
c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow;
 
c->avg_pixels_tab[0][0] = avg_pixels16_3dnow;
c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow;
c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow;
 
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow;
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow;
 
c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow;
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow;
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow;
 
if (!(flags & CODEC_FLAG_BITEXACT)){
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow;
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow;
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow;
 
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow;
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow;
}
 
if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) {
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow;
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow;
}
#endif /* HAVE_AMD3DNOW_EXTERNAL */
}
 
static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags)
{
#if HAVE_SSE2_EXTERNAL
if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) {
// these functions are slower than mmx on AMD, but faster on Intel
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2;
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2;
}
#endif /* HAVE_SSE2_EXTERNAL */
}
 
void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
 
if (INLINE_MMX(cpu_flags))
hpeldsp_init_mmx(c, flags, cpu_flags);
 
if (EXTERNAL_MMXEXT(cpu_flags))
hpeldsp_init_mmxext(c, flags, cpu_flags);
 
if (EXTERNAL_AMD3DNOW(cpu_flags))
hpeldsp_init_3dnow(c, flags, cpu_flags);
 
if (EXTERNAL_SSE2(cpu_flags))
hpeldsp_init_sse2(c, flags, cpu_flags);
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/hpeldsp_mmx.c
0,0 → 1,52
/*
* MMX-optimized avg/put pixel routines
*
* Copyright (c) 2001 Fabrice Bellard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <stddef.h>
#include <stdint.h>
 
#include "config.h"
#include "dsputil_x86.h"
 
#if HAVE_MMX_INLINE
 
void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
JUMPALIGN();
do {
__asm__ volatile(
"movq %1, %%mm0 \n\t"
"movq 1%1, %%mm1 \n\t"
"movq %0, %%mm3 \n\t"
PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6)
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
"movq %%mm0, %0 \n\t"
:"+m"(*block)
:"m"(*pixels)
:"memory");
pixels += line_size;
block += line_size;
} while (--h);
}
 
#endif /* HAVE_MMX_INLINE */
/contrib/sdk/sources/ffmpeg/libavcodec/x86/hpeldsp_rnd_template.c
0,0 → 1,198
/*
* DSP utils mmx functions are compiled twice for rnd/no_rnd
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
* and improved by Zdenek Kabelac <kabi@users.sf.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
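 
/* Reference sketch (illustrative, not part of FFmpeg): whichever PAVGB/PAVGBP
 * definitions are in effect when this template is included select between a
 * rounded and a truncated per-byte average; an assumed scalar model is
 *
 *     static inline int avg2(int a, int b, int rnd)   // rnd is 1 or 0
 *     {
 *         return (a + b + rnd) >> 1;
 *     }
 *
 * with the "no_rnd" build of this file corresponding to rnd == 0 and the
 * normal build to rnd == 1. */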
 
// put_pixels
static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"REG_a" \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:REG_a, "memory");
}
 
static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"REG_a" \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"movq 8(%1), %%mm0 \n\t"
"movq 9(%1), %%mm1 \n\t"
"movq 8(%1, %3), %%mm2 \n\t"
"movq 9(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, 8(%2) \n\t"
"movq %%mm5, 8(%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"movq 1(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"movq 8(%1), %%mm0 \n\t"
"movq 9(%1), %%mm1 \n\t"
"movq 8(%1, %3), %%mm2 \n\t"
"movq 9(%1, %3), %%mm3 \n\t"
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, 8(%2) \n\t"
"movq %%mm5, 8(%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:REG_a, "memory");
}
 
static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"REG_a" \n\t"
"movq (%1), %%mm0 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"REG_a"),%%mm2 \n\t"
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"REG_a"),%%mm0 \n\t"
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
"movq %%mm4, (%2) \n\t"
"movq %%mm5, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:REG_a, "memory");
}
 
static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
JUMPALIGN();
do {
__asm__ volatile(
"movq %1, %%mm0 \n\t"
"movq 1%1, %%mm1 \n\t"
"movq %0, %%mm3 \n\t"
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
"movq %%mm0, %0 \n\t"
"movq 8%1, %%mm0 \n\t"
"movq 9%1, %%mm1 \n\t"
"movq 8%0, %%mm3 \n\t"
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6)
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6)
"movq %%mm0, 8%0 \n\t"
:"+m"(*block)
:"m"(*pixels)
:"memory");
pixels += line_size;
block += line_size;
} while (--h);
}
 
static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h)
{
MOVQ_BFE(mm6);
__asm__ volatile(
"lea (%3, %3), %%"REG_a" \n\t"
"movq (%1), %%mm0 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm2 \n\t"
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5)
"movq (%2), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6)
"movq (%2, %3), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
"movq %%mm0, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
 
"movq (%1, %3), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5)
"movq (%2), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6)
"movq (%2, %3), %%mm3 \n\t"
PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6)
"movq %%mm2, (%2) \n\t"
"movq %%mm1, (%2, %3) \n\t"
"add %%"REG_a", %1 \n\t"
"add %%"REG_a", %2 \n\t"
 
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels), "+D"(block)
:"r"((x86_reg)line_size)
:REG_a, "memory");
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/idct_mmx_xvid.c
0,0 → 1,562
/*
* XVID MPEG-4 VIDEO CODEC
* - MMX and XMM forward discrete cosine transform -
*
* Copyright(C) 2001 Peter Ross <pross@xvid.org>
*
* Originally provided by Intel at AP-922
* http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
* (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm)
* but in a limited edition.
* New macro implements a column part for precise iDCT
* The routine precision now satisfies IEEE standard 1180-1990.
*
* Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru>
* Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org>
*
* http://www.elecard.com/peter/idct.html
* http://www.linuxvideo.org/mpeg2dec/
*
* These examples contain code fragments for first stage iDCT 8x8
* (for rows) and first stage DCT 8x8 (for columns)
*
* conversion to gcc syntax by Michael Niedermayer
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with FFmpeg; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <inttypes.h>
 
#include "config.h"
#include "libavcodec/avcodec.h"
#include "libavutil/mem.h"
#include "dsputil_x86.h"
#include "idct_xvid.h"
 
#if HAVE_MMX_INLINE
 
//=============================================================================
// Macros and other preprocessor constants
//=============================================================================
 
#define BITS_INV_ACC 5 // 4 or 5 for IEEE
#define SHIFT_INV_ROW (16 - BITS_INV_ACC) //11
#define SHIFT_INV_COL (1 + BITS_INV_ACC) //6
#define RND_INV_ROW (1024 * (6 - BITS_INV_ACC))
#define RND_INV_COL (16 * (BITS_INV_ACC - 3))
#define RND_INV_CORR (RND_INV_COL - 1)
 
#define BITS_FRW_ACC 3 // 2 or 3 for accuracy
#define SHIFT_FRW_COL BITS_FRW_ACC
#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17)
#define RND_FRW_ROW (262144*(BITS_FRW_ACC - 1))
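 
// Added note (derived from the definitions above): with BITS_INV_ACC = 5 the
// inverse-transform constants evaluate to SHIFT_INV_ROW = 11, SHIFT_INV_COL = 6,
// RND_INV_ROW = 1024, RND_INV_COL = 32 and RND_INV_CORR = 31; with
// BITS_FRW_ACC = 3, SHIFT_FRW_COL = 3, SHIFT_FRW_ROW = 20 and RND_FRW_ROW = 524288.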
 
 
//-----------------------------------------------------------------------------
// Various memory constants (trigonometric values or rounding values)
//-----------------------------------------------------------------------------
 
 
DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4*4] = {
13036,13036,13036,13036, // tg * (2<<16) + 0.5
27146,27146,27146,27146, // tg * (2<<16) + 0.5
-21746,-21746,-21746,-21746, // tg * (2<<16) + 0.5
23170,23170,23170,23170}; // cos * (2<<15) + 0.5
 
DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2*8] = {
65536,65536,
3597,3597,
2260,2260,
1203,1203,
0,0,
120,120,
512,512,
512,512};
 
//-----------------------------------------------------------------------------
//
// The first stage iDCT 8x8 - inverse DCTs of rows
//
//-----------------------------------------------------------------------------
// The 8-point inverse DCT direct algorithm
//-----------------------------------------------------------------------------
//
// static const short w[32] = {
// FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16),
// FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16),
// FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16),
// FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16),
// FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16),
// FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16),
// FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16),
// FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) };
//
// #define DCT_8_INV_ROW(x, y)
// {
// int a0, a1, a2, a3, b0, b1, b2, b3;
//
// a0 = x[0] * w[ 0] + x[2] * w[ 1] + x[4] * w[ 2] + x[6] * w[ 3];
// a1 = x[0] * w[ 4] + x[2] * w[ 5] + x[4] * w[ 6] + x[6] * w[ 7];
// a2 = x[0] * w[ 8] + x[2] * w[ 9] + x[4] * w[10] + x[6] * w[11];
// a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15];
// b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19];
// b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23];
// b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27];
// b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31];
//
// y[0] = SHIFT_ROUND ( a0 + b0 );
// y[1] = SHIFT_ROUND ( a1 + b1 );
// y[2] = SHIFT_ROUND ( a2 + b2 );
// y[3] = SHIFT_ROUND ( a3 + b3 );
// y[4] = SHIFT_ROUND ( a3 - b3 );
// y[5] = SHIFT_ROUND ( a2 - b2 );
// y[6] = SHIFT_ROUND ( a1 - b1 );
// y[7] = SHIFT_ROUND ( a0 - b0 );
// }
//
//-----------------------------------------------------------------------------
//
// In this implementation the outputs of the iDCT-1D are multiplied
// for rows 0,4 - by cos_4_16,
// for rows 1,7 - by cos_1_16,
// for rows 2,6 - by cos_2_16,
// for rows 3,5 - by cos_3_16
// and are shifted to the left for better accuracy
//
// For the constants used,
// FIX(float_const) = (short) (float_const * (1<<15) + 0.5)
//
//-----------------------------------------------------------------------------
 
//-----------------------------------------------------------------------------
// Tables for mmx processors
//-----------------------------------------------------------------------------
 
// Table for rows 0,4 - constants are multiplied by cos_4_16
DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx)[32*4] = {
16384,16384,16384,-16384, // movq-> w06 w04 w02 w00
21407,8867,8867,-21407, // w07 w05 w03 w01
16384,-16384,16384,16384, // w14 w12 w10 w08
-8867,21407,-21407,-8867, // w15 w13 w11 w09
22725,12873,19266,-22725, // w22 w20 w18 w16
19266,4520,-4520,-12873, // w23 w21 w19 w17
12873,4520,4520,19266, // w30 w28 w26 w24
-22725,19266,-12873,-22725, // w31 w29 w27 w25
// Table for rows 1,7 - constants are multiplied by cos_1_16
22725,22725,22725,-22725, // movq-> w06 w04 w02 w00
29692,12299,12299,-29692, // w07 w05 w03 w01
22725,-22725,22725,22725, // w14 w12 w10 w08
-12299,29692,-29692,-12299, // w15 w13 w11 w09
31521,17855,26722,-31521, // w22 w20 w18 w16
26722,6270,-6270,-17855, // w23 w21 w19 w17
17855,6270,6270,26722, // w30 w28 w26 w24
-31521,26722,-17855,-31521, // w31 w29 w27 w25
// Table for rows 2,6 - constants are multiplied by cos_2_16
21407,21407,21407,-21407, // movq-> w06 w04 w02 w00
27969,11585,11585,-27969, // w07 w05 w03 w01
21407,-21407,21407,21407, // w14 w12 w10 w08
-11585,27969,-27969,-11585, // w15 w13 w11 w09
29692,16819,25172,-29692, // w22 w20 w18 w16
25172,5906,-5906,-16819, // w23 w21 w19 w17
16819,5906,5906,25172, // w30 w28 w26 w24
-29692,25172,-16819,-29692, // w31 w29 w27 w25
// Table for rows 3,5 - constants are multiplied by cos_3_16
19266,19266,19266,-19266, // movq-> w06 w04 w02 w00
25172,10426,10426,-25172, // w07 w05 w03 w01
19266,-19266,19266,19266, // w14 w12 w10 w08
-10426,25172,-25172,-10426, // w15 w13 w11 w09
26722,15137,22654,-26722, // w22 w20 w18 w16
22654,5315,-5315,-15137, // w23 w21 w19 w17
15137,5315,5315,22654, // w30 w28 w26 w24
-26722,22654,-15137,-26722, // w31 w29 w27 w25
};
//-----------------------------------------------------------------------------
// Tables for xmm processors
//-----------------------------------------------------------------------------
 
// %3 for rows 0,4 - constants are multiplied by cos_4_16
DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm)[32*4] = {
16384,21407,16384,8867, // movq-> w05 w04 w01 w00
16384,8867,-16384,-21407, // w07 w06 w03 w02
16384,-8867,16384,-21407, // w13 w12 w09 w08
-16384,21407,16384,-8867, // w15 w14 w11 w10
22725,19266,19266,-4520, // w21 w20 w17 w16
12873,4520,-22725,-12873, // w23 w22 w19 w18
12873,-22725,4520,-12873, // w29 w28 w25 w24
4520,19266,19266,-22725, // w31 w30 w27 w26
// %3 for rows 1,7 - constants are multiplied by cos_1_16
22725,29692,22725,12299, // movq-> w05 w04 w01 w00
22725,12299,-22725,-29692, // w07 w06 w03 w02
22725,-12299,22725,-29692, // w13 w12 w09 w08
-22725,29692,22725,-12299, // w15 w14 w11 w10
31521,26722,26722,-6270, // w21 w20 w17 w16
17855,6270,-31521,-17855, // w23 w22 w19 w18
17855,-31521,6270,-17855, // w29 w28 w25 w24
6270,26722,26722,-31521, // w31 w30 w27 w26
// %3 for rows 2,6 - constants are multiplied by cos_2_16
21407,27969,21407,11585, // movq-> w05 w04 w01 w00
21407,11585,-21407,-27969, // w07 w06 w03 w02
21407,-11585,21407,-27969, // w13 w12 w09 w08
-21407,27969,21407,-11585, // w15 w14 w11 w10
29692,25172,25172,-5906, // w21 w20 w17 w16
16819,5906,-29692,-16819, // w23 w22 w19 w18
16819,-29692,5906,-16819, // w29 w28 w25 w24
5906,25172,25172,-29692, // w31 w30 w27 w26
// %3 for rows 3,5 - constants are multiplied by cos_3_16
19266,25172,19266,10426, // movq-> w05 w04 w01 w00
19266,10426,-19266,-25172, // w07 w06 w03 w02
19266,-10426,19266,-25172, // w13 w12 w09 w08
-19266,25172,19266,-10426, // w15 w14 w11 w10
26722,22654,22654,-5315, // w21 w20 w17 w16
15137,5315,-26722,-15137, // w23 w22 w19 w18
15137,-26722,5315,-15137, // w29 w28 w25 w24
5315,22654,22654,-26722, // w31 w30 w27 w26
};
//=============================================================================
// Helper macros for the code
//=============================================================================
 
//-----------------------------------------------------------------------------
// DCT_8_INV_ROW_MMX( INP, OUT, TABLE, ROUNDER )
//-----------------------------------------------------------------------------
 
#define DCT_8_INV_ROW_MMX(A1,A2,A3,A4)\
"movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\
"movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\
"movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\
"movq " #A3 ",%%mm3 \n\t"/* 3 ; w06 w04 w02 w00*/\
"punpcklwd %%mm1,%%mm0 \n\t"/* x5 x1 x4 x0*/\
"movq %%mm0,%%mm5 \n\t"/* 5 ; x5 x1 x4 x0*/\
"punpckldq %%mm0,%%mm0 \n\t"/* x4 x0 x4 x0*/\
"movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w05 w03 w01*/\
"punpckhwd %%mm1,%%mm2 \n\t"/* 1 ; x7 x3 x6 x2*/\
"pmaddwd %%mm0,%%mm3 \n\t"/* x4*w06+x0*w04 x4*w02+x0*w00*/\
"movq %%mm2,%%mm6 \n\t"/* 6 ; x7 x3 x6 x2*/\
"movq 32+" #A3 ",%%mm1 \n\t"/* 1 ; w22 w20 w18 w16*/\
"punpckldq %%mm2,%%mm2 \n\t"/* x6 x2 x6 x2*/\
"pmaddwd %%mm2,%%mm4 \n\t"/* x6*w07+x2*w05 x6*w03+x2*w01*/\
"punpckhdq %%mm5,%%mm5 \n\t"/* x5 x1 x5 x1*/\
"pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x4*w14+x0*w12 x4*w10+x0*w08*/\
"punpckhdq %%mm6,%%mm6 \n\t"/* x7 x3 x7 x3*/\
"movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w21 w19 w17*/\
"pmaddwd %%mm5,%%mm1 \n\t"/* x5*w22+x1*w20 x5*w18+x1*w16*/\
"paddd " #A4 ",%%mm3 \n\t"/* +%4*/\
"pmaddwd %%mm6,%%mm7 \n\t"/* x7*w23+x3*w21 x7*w19+x3*w17*/\
"pmaddwd 24+" #A3 ",%%mm2 \n\t"/* x6*w15+x2*w13 x6*w11+x2*w09*/\
"paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\
"pmaddwd 48+" #A3 ",%%mm5 \n\t"/* x5*w30+x1*w28 x5*w26+x1*w24*/\
"movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\
"pmaddwd 56+" #A3 ",%%mm6 \n\t"/* x7*w31+x3*w29 x7*w27+x3*w25*/\
"paddd %%mm7,%%mm1 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\
"paddd " #A4 ",%%mm0 \n\t"/* +%4*/\
"psubd %%mm1,%%mm3 \n\t"/* a1-b1 a0-b0*/\
"psrad $11,%%mm3 \n\t"/* y6=a1-b1 y7=a0-b0*/\
"paddd %%mm4,%%mm1 \n\t"/* 4 ; a1+b1 a0+b0*/\
"paddd %%mm2,%%mm0 \n\t"/* 2 ; a3=sum(even3) a2=sum(even2)*/\
"psrad $11,%%mm1 \n\t"/* y1=a1+b1 y0=a0+b0*/\
"paddd %%mm6,%%mm5 \n\t"/* 6 ; b3=sum(odd3) b2=sum(odd2)*/\
"movq %%mm0,%%mm4 \n\t"/* 4 ; a3 a2*/\
"paddd %%mm5,%%mm0 \n\t"/* a3+b3 a2+b2*/\
"psubd %%mm5,%%mm4 \n\t"/* 5 ; a3-b3 a2-b2*/\
"psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\
"psrad $11,%%mm4 \n\t"/* y4=a3-b3 y5=a2-b2*/\
"packssdw %%mm0,%%mm1 \n\t"/* 0 ; y3 y2 y1 y0*/\
"packssdw %%mm3,%%mm4 \n\t"/* 3 ; y6 y7 y4 y5*/\
"movq %%mm4,%%mm7 \n\t"/* 7 ; y6 y7 y4 y5*/\
"psrld $16,%%mm4 \n\t"/* 0 y6 0 y4*/\
"pslld $16,%%mm7 \n\t"/* y7 0 y5 0*/\
"movq %%mm1," #A2 " \n\t"/* 1 ; save y3 y2 y1 y0*/\
"por %%mm4,%%mm7 \n\t"/* 4 ; y7 y6 y5 y4*/\
"movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\
 
 
//-----------------------------------------------------------------------------
// DCT_8_INV_ROW_XMM( INP, OUT, TABLE, ROUNDER )
//-----------------------------------------------------------------------------
 
#define DCT_8_INV_ROW_XMM(A1,A2,A3,A4)\
"movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\
"movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\
"movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\
"movq " #A3 ",%%mm3 \n\t"/* 3 ; w05 w04 w01 w00*/\
"pshufw $0x88,%%mm0,%%mm0 \n\t"/* x2 x0 x2 x0*/\
"movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w06 w03 w02*/\
"movq %%mm1,%%mm5 \n\t"/* 5 ; x7 x6 x5 x4*/\
"pmaddwd %%mm0,%%mm3 \n\t"/* x2*w05+x0*w04 x2*w01+x0*w00*/\
"movq 32+" #A3 ",%%mm6 \n\t"/* 6 ; w21 w20 w17 w16*/\
"pshufw $0x88,%%mm1,%%mm1 \n\t"/* x6 x4 x6 x4*/\
"pmaddwd %%mm1,%%mm4 \n\t"/* x6*w07+x4*w06 x6*w03+x4*w02*/\
"movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w22 w19 w18*/\
"pshufw $0xdd,%%mm2,%%mm2 \n\t"/* x3 x1 x3 x1*/\
"pmaddwd %%mm2,%%mm6 \n\t"/* x3*w21+x1*w20 x3*w17+x1*w16*/\
"pshufw $0xdd,%%mm5,%%mm5 \n\t"/* x7 x5 x7 x5*/\
"pmaddwd %%mm5,%%mm7 \n\t"/* x7*w23+x5*w22 x7*w19+x5*w18*/\
"paddd " #A4 ",%%mm3 \n\t"/* +%4*/\
"pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x2*w13+x0*w12 x2*w09+x0*w08*/\
"paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\
"pmaddwd 24+" #A3 ",%%mm1 \n\t"/* x6*w15+x4*w14 x6*w11+x4*w10*/\
"movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\
"pmaddwd 48+" #A3 ",%%mm2 \n\t"/* x3*w29+x1*w28 x3*w25+x1*w24*/\
"paddd %%mm7,%%mm6 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\
"pmaddwd 56+" #A3 ",%%mm5 \n\t"/* x7*w31+x5*w30 x7*w27+x5*w26*/\
"paddd %%mm6,%%mm3 \n\t"/* a1+b1 a0+b0*/\
"paddd " #A4 ",%%mm0 \n\t"/* +%4*/\
"psrad $11,%%mm3 \n\t"/* y1=a1+b1 y0=a0+b0*/\
"paddd %%mm1,%%mm0 \n\t"/* 1 ; a3=sum(even3) a2=sum(even2)*/\
"psubd %%mm6,%%mm4 \n\t"/* 6 ; a1-b1 a0-b0*/\
"movq %%mm0,%%mm7 \n\t"/* 7 ; a3 a2*/\
"paddd %%mm5,%%mm2 \n\t"/* 5 ; b3=sum(odd3) b2=sum(odd2)*/\
"paddd %%mm2,%%mm0 \n\t"/* a3+b3 a2+b2*/\
"psrad $11,%%mm4 \n\t"/* y6=a1-b1 y7=a0-b0*/\
"psubd %%mm2,%%mm7 \n\t"/* 2 ; a3-b3 a2-b2*/\
"psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\
"psrad $11,%%mm7 \n\t"/* y4=a3-b3 y5=a2-b2*/\
"packssdw %%mm0,%%mm3 \n\t"/* 0 ; y3 y2 y1 y0*/\
"packssdw %%mm4,%%mm7 \n\t"/* 4 ; y6 y7 y4 y5*/\
"movq %%mm3, " #A2 " \n\t"/* 3 ; save y3 y2 y1 y0*/\
"pshufw $0xb1,%%mm7,%%mm7 \n\t"/* y7 y6 y5 y4*/\
"movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\
 
 
//-----------------------------------------------------------------------------
//
// The first stage DCT 8x8 - forward DCTs of columns
//
// The inputs are multiplied
// for rows 0,4 - by cos_4_16,
// for rows 1,7 - by cos_1_16,
// for rows 2,6 - by cos_2_16,
// for rows 3,5 - by cos_3_16
// and are shifted to the left for better accuracy
//
//-----------------------------------------------------------------------------
//
// The 8-point scaled forward DCT algorithm (26a8m)
//
//-----------------------------------------------------------------------------
//
// #define DCT_8_FRW_COL(x, y)
//{
// short t0, t1, t2, t3, t4, t5, t6, t7;
// short tp03, tm03, tp12, tm12, tp65, tm65;
// short tp465, tm465, tp765, tm765;
//
// t0 = LEFT_SHIFT ( x[0] + x[7] );
// t1 = LEFT_SHIFT ( x[1] + x[6] );
// t2 = LEFT_SHIFT ( x[2] + x[5] );
// t3 = LEFT_SHIFT ( x[3] + x[4] );
// t4 = LEFT_SHIFT ( x[3] - x[4] );
// t5 = LEFT_SHIFT ( x[2] - x[5] );
// t6 = LEFT_SHIFT ( x[1] - x[6] );
// t7 = LEFT_SHIFT ( x[0] - x[7] );
//
// tp03 = t0 + t3;
// tm03 = t0 - t3;
// tp12 = t1 + t2;
// tm12 = t1 - t2;
//
// y[0] = tp03 + tp12;
// y[4] = tp03 - tp12;
//
// y[2] = tm03 + tm12 * tg_2_16;
// y[6] = tm03 * tg_2_16 - tm12;
//
// tp65 =(t6 +t5 )*cos_4_16;
// tm65 =(t6 -t5 )*cos_4_16;
//
// tp765 = t7 + tp65;
// tm765 = t7 - tp65;
// tp465 = t4 + tm65;
// tm465 = t4 - tm65;
//
// y[1] = tp765 + tp465 * tg_1_16;
// y[7] = tp765 * tg_1_16 - tp465;
// y[5] = tm765 * tg_3_16 + tm465;
// y[3] = tm765 - tm465 * tg_3_16;
//}
//
//-----------------------------------------------------------------------------
 
//-----------------------------------------------------------------------------
// DCT_8_INV_COL( INP, OUT )
//-----------------------------------------------------------------------------
 
#define DCT_8_INV_COL(A1,A2)\
"movq 2*8(%3),%%mm0\n\t"\
"movq 16*3+" #A1 ",%%mm3\n\t"\
"movq %%mm0,%%mm1 \n\t"/* tg_3_16*/\
"movq 16*5+" #A1 ",%%mm5\n\t"\
"pmulhw %%mm3,%%mm0 \n\t"/* x3*(tg_3_16-1)*/\
"movq (%3),%%mm4\n\t"\
"pmulhw %%mm5,%%mm1 \n\t"/* x5*(tg_3_16-1)*/\
"movq 16*7+" #A1 ",%%mm7\n\t"\
"movq %%mm4,%%mm2 \n\t"/* tg_1_16*/\
"movq 16*1+" #A1 ",%%mm6\n\t"\
"pmulhw %%mm7,%%mm4 \n\t"/* x7*tg_1_16*/\
"paddsw %%mm3,%%mm0 \n\t"/* x3*tg_3_16*/\
"pmulhw %%mm6,%%mm2 \n\t"/* x1*tg_1_16*/\
"paddsw %%mm3,%%mm1 \n\t"/* x3+x5*(tg_3_16-1)*/\
"psubsw %%mm5,%%mm0 \n\t"/* x3*tg_3_16-x5 = tm35*/\
"movq 3*8(%3),%%mm3\n\t"\
"paddsw %%mm5,%%mm1 \n\t"/* x3+x5*tg_3_16 = tp35*/\
"paddsw %%mm6,%%mm4 \n\t"/* x1+tg_1_16*x7 = tp17*/\
"psubsw %%mm7,%%mm2 \n\t"/* x1*tg_1_16-x7 = tm17*/\
"movq %%mm4,%%mm5 \n\t"/* tp17*/\
"movq %%mm2,%%mm6 \n\t"/* tm17*/\
"paddsw %%mm1,%%mm5 \n\t"/* tp17+tp35 = b0*/\
"psubsw %%mm0,%%mm6 \n\t"/* tm17-tm35 = b3*/\
"psubsw %%mm1,%%mm4 \n\t"/* tp17-tp35 = t1*/\
"paddsw %%mm0,%%mm2 \n\t"/* tm17+tm35 = t2*/\
"movq 1*8(%3),%%mm7\n\t"\
"movq %%mm4,%%mm1 \n\t"/* t1*/\
"movq %%mm5,3*16 +" #A2 "\n\t"/* save b0*/\
"paddsw %%mm2,%%mm1 \n\t"/* t1+t2*/\
"movq %%mm6,5*16 +" #A2 "\n\t"/* save b3*/\
"psubsw %%mm2,%%mm4 \n\t"/* t1-t2*/\
"movq 2*16+" #A1 ",%%mm5\n\t"\
"movq %%mm7,%%mm0 \n\t"/* tg_2_16*/\
"movq 6*16+" #A1 ",%%mm6\n\t"\
"pmulhw %%mm5,%%mm0 \n\t"/* x2*tg_2_16*/\
"pmulhw %%mm6,%%mm7 \n\t"/* x6*tg_2_16*/\
"pmulhw %%mm3,%%mm1 \n\t"/* ocos_4_16*(t1+t2) = b1/2*/\
"movq 0*16+" #A1 ",%%mm2\n\t"\
"pmulhw %%mm3,%%mm4 \n\t"/* ocos_4_16*(t1-t2) = b2/2*/\
"psubsw %%mm6,%%mm0 \n\t"/* t2*tg_2_16-x6 = tm26*/\
"movq %%mm2,%%mm3 \n\t"/* x0*/\
"movq 4*16+" #A1 ",%%mm6\n\t"\
"paddsw %%mm5,%%mm7 \n\t"/* x2+x6*tg_2_16 = tp26*/\
"paddsw %%mm6,%%mm2 \n\t"/* x0+x4 = tp04*/\
"psubsw %%mm6,%%mm3 \n\t"/* x0-x4 = tm04*/\
"movq %%mm2,%%mm5 \n\t"/* tp04*/\
"movq %%mm3,%%mm6 \n\t"/* tm04*/\
"psubsw %%mm7,%%mm2 \n\t"/* tp04-tp26 = a3*/\
"paddsw %%mm0,%%mm3 \n\t"/* tm04+tm26 = a1*/\
"paddsw %%mm1,%%mm1 \n\t"/* b1*/\
"paddsw %%mm4,%%mm4 \n\t"/* b2*/\
"paddsw %%mm7,%%mm5 \n\t"/* tp04+tp26 = a0*/\
"psubsw %%mm0,%%mm6 \n\t"/* tm04-tm26 = a2*/\
"movq %%mm3,%%mm7 \n\t"/* a1*/\
"movq %%mm6,%%mm0 \n\t"/* a2*/\
"paddsw %%mm1,%%mm3 \n\t"/* a1+b1*/\
"paddsw %%mm4,%%mm6 \n\t"/* a2+b2*/\
"psraw $6,%%mm3 \n\t"/* dst1*/\
"psubsw %%mm1,%%mm7 \n\t"/* a1-b1*/\
"psraw $6,%%mm6 \n\t"/* dst2*/\
"psubsw %%mm4,%%mm0 \n\t"/* a2-b2*/\
"movq 3*16+" #A2 ",%%mm1 \n\t"/* load b0*/\
"psraw $6,%%mm7 \n\t"/* dst6*/\
"movq %%mm5,%%mm4 \n\t"/* a0*/\
"psraw $6,%%mm0 \n\t"/* dst5*/\
"movq %%mm3,1*16+" #A2 "\n\t"\
"paddsw %%mm1,%%mm5 \n\t"/* a0+b0*/\
"movq %%mm6,2*16+" #A2 "\n\t"\
"psubsw %%mm1,%%mm4 \n\t"/* a0-b0*/\
"movq 5*16+" #A2 ",%%mm3 \n\t"/* load b3*/\
"psraw $6,%%mm5 \n\t"/* dst0*/\
"movq %%mm2,%%mm6 \n\t"/* a3*/\
"psraw $6,%%mm4 \n\t"/* dst7*/\
"movq %%mm0,5*16+" #A2 "\n\t"\
"paddsw %%mm3,%%mm2 \n\t"/* a3+b3*/\
"movq %%mm7,6*16+" #A2 "\n\t"\
"psubsw %%mm3,%%mm6 \n\t"/* a3-b3*/\
"movq %%mm5,0*16+" #A2 "\n\t"\
"psraw $6,%%mm2 \n\t"/* dst3*/\
"movq %%mm4,7*16+" #A2 "\n\t"\
"psraw $6,%%mm6 \n\t"/* dst4*/\
"movq %%mm2,3*16+" #A2 "\n\t"\
"movq %%mm6,4*16+" #A2 "\n\t"
 
//=============================================================================
// Code
//=============================================================================
 
//-----------------------------------------------------------------------------
// void idct_mmx(int16_t block[64]);
//-----------------------------------------------------------------------------
 
 
void ff_idct_xvid_mmx(short *block){
__asm__ volatile(
//# Process each row
DCT_8_INV_ROW_MMX(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
DCT_8_INV_ROW_MMX(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
DCT_8_INV_ROW_MMX(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
DCT_8_INV_ROW_MMX(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
DCT_8_INV_ROW_MMX(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
DCT_8_INV_ROW_MMX(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
DCT_8_INV_ROW_MMX(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
DCT_8_INV_ROW_MMX(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))
 
//# Process the columns (4 at a time)
DCT_8_INV_COL(0(%0), 0(%0))
DCT_8_INV_COL(8(%0), 8(%0))
:: "r"(block), "r"(rounder_0), "r"(tab_i_04_mmx), "r"(tg_1_16));
}
 
void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, int16_t *block)
{
ff_idct_xvid_mmx(block);
ff_put_pixels_clamped_mmx(block, dest, line_size);
}
 
void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, int16_t *block)
{
ff_idct_xvid_mmx(block);
ff_add_pixels_clamped_mmx(block, dest, line_size);
}
 
#endif /* HAVE_MMX_INLINE */
 
#if HAVE_MMXEXT_INLINE
 
//-----------------------------------------------------------------------------
// void idct_xmm(int16_t block[64]);
//-----------------------------------------------------------------------------
 
 
void ff_idct_xvid_mmxext(short *block)
{
__asm__ volatile(
//# Process each row
DCT_8_INV_ROW_XMM(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1))
DCT_8_INV_ROW_XMM(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1))
DCT_8_INV_ROW_XMM(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1))
DCT_8_INV_ROW_XMM(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1))
DCT_8_INV_ROW_XMM(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1))
DCT_8_INV_ROW_XMM(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1))
DCT_8_INV_ROW_XMM(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1))
DCT_8_INV_ROW_XMM(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1))
 
//# Process the columns (4 at a time)
DCT_8_INV_COL(0(%0), 0(%0))
DCT_8_INV_COL(8(%0), 8(%0))
:: "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16));
}
 
void ff_idct_xvid_mmxext_put(uint8_t *dest, int line_size, int16_t *block)
{
ff_idct_xvid_mmxext(block);
ff_put_pixels_clamped_mmx(block, dest, line_size);
}
 
void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, int16_t *block)
{
ff_idct_xvid_mmxext(block);
ff_add_pixels_clamped_mmx(block, dest, line_size);
}
 
#endif /* HAVE_MMXEXT_INLINE */
/contrib/sdk/sources/ffmpeg/libavcodec/x86/idct_sse2_xvid.c
0,0 → 1,407
/*
* XVID MPEG-4 VIDEO CODEC
* - SSE2 inverse discrete cosine transform -
*
* Copyright(C) 2003 Pascal Massimino <skal@planet-d.net>
*
* Conversion to gcc syntax with modifications
* by Alexander Strange <astrange@ithinksw.com>
*
* Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid.
*
* This file is part of FFmpeg.
*
* Vertical pass is an implementation of the scheme:
* Loeffler C., Ligtenberg A., and Moschytz C.S.:
* Practical Fast 1D DCT Algorithm with Eleven Multiplications,
* Proc. ICASSP 1989, 988-991.
*
* Horizontal pass is a double 4x4 vector/matrix multiplication,
* (see also Intel's Application Note 922:
* http://developer.intel.com/vtune/cbts/strmsimd/922down.htm
* Copyright (C) 1999 Intel Corporation)
*
* More details at http://skal.planet-d.net/coding/dct.html
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public License
* along with FFmpeg; if not, write to the Free Software Foundation,
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "idct_xvid.h"
#include "dsputil_x86.h"
 
#if HAVE_SSE2_INLINE
 
/**
* @file
* @brief SSE2 idct compatible with xvidmmx
*/
 
#define X8(x) x,x,x,x,x,x,x,x
 
#define ROW_SHIFT 11
#define COL_SHIFT 6
 
DECLARE_ASM_CONST(16, int16_t, tan1)[] = {X8(13036)}; // tan( pi/16)
DECLARE_ASM_CONST(16, int16_t, tan2)[] = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1
DECLARE_ASM_CONST(16, int16_t, tan3)[] = {X8(43790)}; // tan(3pi/16)-1
DECLARE_ASM_CONST(16, int16_t, sqrt2)[]= {X8(23170)}; // 0.5/sqrt(2)
DECLARE_ASM_CONST(8, uint8_t, m127)[] = {X8(127)};
 
DECLARE_ASM_CONST(16, int16_t, iTab1)[] = {
0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d,
0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61,
0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7,
0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b
};
 
DECLARE_ASM_CONST(16, int16_t, iTab2)[] = {
0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5,
0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04,
0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41,
0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df
};
 
DECLARE_ASM_CONST(16, int16_t, iTab3)[] = {
0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf,
0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf,
0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d,
0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04
};
 
DECLARE_ASM_CONST(16, int16_t, iTab4)[] = {
0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746,
0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac,
0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df,
0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e
};
 
DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = {
65536, 65536, 65536, 65536,
3597, 3597, 3597, 3597,
2260, 2260, 2260, 2260,
1203, 1203, 1203, 1203,
120, 120, 120, 120,
512, 512, 512, 512
};
 
// Temporary storage before the column pass
#define ROW1 "%%xmm6"
#define ROW3 "%%xmm4"
#define ROW5 "%%xmm5"
#define ROW7 "%%xmm7"
 
#define CLEAR_ODD(r) "pxor "r","r" \n\t"
#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t"
 
#if ARCH_X86_64
 
# define ROW0 "%%xmm8"
# define REG0 ROW0
# define ROW2 "%%xmm9"
# define REG2 ROW2
# define ROW4 "%%xmm10"
# define REG4 ROW4
# define ROW6 "%%xmm11"
# define REG6 ROW6
# define CLEAR_EVEN(r) CLEAR_ODD(r)
# define PUT_EVEN(dst) PUT_ODD(dst)
# define XMMS "%%xmm12"
# define MOV_32_ONLY "#"
# define SREG2 REG2
# define TAN3 "%%xmm13"
# define TAN1 "%%xmm14"
 
#else
 
# define ROW0 "(%0)"
# define REG0 "%%xmm4"
# define ROW2 "2*16(%0)"
# define REG2 "%%xmm4"
# define ROW4 "4*16(%0)"
# define REG4 "%%xmm6"
# define ROW6 "6*16(%0)"
# define REG6 "%%xmm6"
# define CLEAR_EVEN(r)
# define PUT_EVEN(dst) \
"pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \
"movdqa %%xmm2, "dst" \n\t"
# define XMMS "%%xmm2"
# define MOV_32_ONLY "movdqa "
# define SREG2 "%%xmm7"
# define TAN3 "%%xmm0"
# define TAN1 "%%xmm2"
 
#endif
 
#define ROUND(x) "paddd "MANGLE(x)
 
#define JZ(reg, to) \
"testl "reg","reg" \n\t" \
"jz "to" \n\t"
 
#define JNZ(reg, to) \
"testl "reg","reg" \n\t" \
"jnz "to" \n\t"
 
#define TEST_ONE_ROW(src, reg, clear) \
clear \
"movq "src", %%mm1 \n\t" \
"por 8+"src", %%mm1 \n\t" \
"paddusb %%mm0, %%mm1 \n\t" \
"pmovmskb %%mm1, "reg" \n\t"
 
#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \
clear1 \
clear2 \
"movq "row1", %%mm1 \n\t" \
"por 8+"row1", %%mm1 \n\t" \
"movq "row2", %%mm2 \n\t" \
"por 8+"row2", %%mm2 \n\t" \
"paddusb %%mm0, %%mm1 \n\t" \
"paddusb %%mm0, %%mm2 \n\t" \
"pmovmskb %%mm1, "reg1" \n\t" \
"pmovmskb %%mm2, "reg2" \n\t"
 
///IDCT pass on rows.
#define iMTX_MULT(src, table, rounder, put) \
"movdqa "src", %%xmm3 \n\t" \
"movdqa %%xmm3, %%xmm0 \n\t" \
"pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \
"punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \
"pmaddwd "table", %%xmm0 \n\t" \
"pmaddwd 16+"table", %%xmm1 \n\t" \
"pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \
"punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \
"pmaddwd 32+"table", %%xmm2 \n\t" \
"pmaddwd 48+"table", %%xmm3 \n\t" \
"paddd %%xmm1, %%xmm0 \n\t" \
"paddd %%xmm3, %%xmm2 \n\t" \
rounder", %%xmm0 \n\t" \
"movdqa %%xmm2, %%xmm3 \n\t" \
"paddd %%xmm0, %%xmm2 \n\t" \
"psubd %%xmm3, %%xmm0 \n\t" \
"psrad $11, %%xmm2 \n\t" \
"psrad $11, %%xmm0 \n\t" \
"packssdw %%xmm0, %%xmm2 \n\t" \
put \
"1: \n\t"
 
#define iLLM_HEAD \
"movdqa "MANGLE(tan3)", "TAN3" \n\t" \
"movdqa "MANGLE(tan1)", "TAN1" \n\t" \
 
///IDCT pass on columns.
#define iLLM_PASS(dct) \
"movdqa "TAN3", %%xmm1 \n\t" \
"movdqa "TAN1", %%xmm3 \n\t" \
"pmulhw %%xmm4, "TAN3" \n\t" \
"pmulhw %%xmm5, %%xmm1 \n\t" \
"paddsw %%xmm4, "TAN3" \n\t" \
"paddsw %%xmm5, %%xmm1 \n\t" \
"psubsw %%xmm5, "TAN3" \n\t" \
"paddsw %%xmm4, %%xmm1 \n\t" \
"pmulhw %%xmm7, %%xmm3 \n\t" \
"pmulhw %%xmm6, "TAN1" \n\t" \
"paddsw %%xmm6, %%xmm3 \n\t" \
"psubsw %%xmm7, "TAN1" \n\t" \
"movdqa %%xmm3, %%xmm7 \n\t" \
"movdqa "TAN1", %%xmm6 \n\t" \
"psubsw %%xmm1, %%xmm3 \n\t" \
"psubsw "TAN3", "TAN1" \n\t" \
"paddsw %%xmm7, %%xmm1 \n\t" \
"paddsw %%xmm6, "TAN3" \n\t" \
"movdqa %%xmm3, %%xmm6 \n\t" \
"psubsw "TAN3", %%xmm3 \n\t" \
"paddsw %%xmm6, "TAN3" \n\t" \
"movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
"pmulhw %%xmm4, %%xmm3 \n\t" \
"pmulhw %%xmm4, "TAN3" \n\t" \
"paddsw "TAN3", "TAN3" \n\t" \
"paddsw %%xmm3, %%xmm3 \n\t" \
"movdqa "MANGLE(tan2)", %%xmm7 \n\t" \
MOV_32_ONLY ROW2", "REG2" \n\t" \
MOV_32_ONLY ROW6", "REG6" \n\t" \
"movdqa %%xmm7, %%xmm5 \n\t" \
"pmulhw "REG6", %%xmm7 \n\t" \
"pmulhw "REG2", %%xmm5 \n\t" \
"paddsw "REG2", %%xmm7 \n\t" \
"psubsw "REG6", %%xmm5 \n\t" \
MOV_32_ONLY ROW0", "REG0" \n\t" \
MOV_32_ONLY ROW4", "REG4" \n\t" \
MOV_32_ONLY" "TAN1", (%0) \n\t" \
"movdqa "REG0", "XMMS" \n\t" \
"psubsw "REG4", "REG0" \n\t" \
"paddsw "XMMS", "REG4" \n\t" \
"movdqa "REG4", "XMMS" \n\t" \
"psubsw %%xmm7, "REG4" \n\t" \
"paddsw "XMMS", %%xmm7 \n\t" \
"movdqa "REG0", "XMMS" \n\t" \
"psubsw %%xmm5, "REG0" \n\t" \
"paddsw "XMMS", %%xmm5 \n\t" \
"movdqa %%xmm5, "XMMS" \n\t" \
"psubsw "TAN3", %%xmm5 \n\t" \
"paddsw "XMMS", "TAN3" \n\t" \
"movdqa "REG0", "XMMS" \n\t" \
"psubsw %%xmm3, "REG0" \n\t" \
"paddsw "XMMS", %%xmm3 \n\t" \
MOV_32_ONLY" (%0), "TAN1" \n\t" \
"psraw $6, %%xmm5 \n\t" \
"psraw $6, "REG0" \n\t" \
"psraw $6, "TAN3" \n\t" \
"psraw $6, %%xmm3 \n\t" \
"movdqa "TAN3", 1*16("dct") \n\t" \
"movdqa %%xmm3, 2*16("dct") \n\t" \
"movdqa "REG0", 5*16("dct") \n\t" \
"movdqa %%xmm5, 6*16("dct") \n\t" \
"movdqa %%xmm7, %%xmm0 \n\t" \
"movdqa "REG4", %%xmm4 \n\t" \
"psubsw %%xmm1, %%xmm7 \n\t" \
"psubsw "TAN1", "REG4" \n\t" \
"paddsw %%xmm0, %%xmm1 \n\t" \
"paddsw %%xmm4, "TAN1" \n\t" \
"psraw $6, %%xmm1 \n\t" \
"psraw $6, %%xmm7 \n\t" \
"psraw $6, "TAN1" \n\t" \
"psraw $6, "REG4" \n\t" \
"movdqa %%xmm1, ("dct") \n\t" \
"movdqa "TAN1", 3*16("dct") \n\t" \
"movdqa "REG4", 4*16("dct") \n\t" \
"movdqa %%xmm7, 7*16("dct") \n\t"
 
///IDCT pass on columns, assuming rows 4-7 are zero.
#define iLLM_PASS_SPARSE(dct) \
"pmulhw %%xmm4, "TAN3" \n\t" \
"paddsw %%xmm4, "TAN3" \n\t" \
"movdqa %%xmm6, %%xmm3 \n\t" \
"pmulhw %%xmm6, "TAN1" \n\t" \
"movdqa %%xmm4, %%xmm1 \n\t" \
"psubsw %%xmm1, %%xmm3 \n\t" \
"paddsw %%xmm6, %%xmm1 \n\t" \
"movdqa "TAN1", %%xmm6 \n\t" \
"psubsw "TAN3", "TAN1" \n\t" \
"paddsw %%xmm6, "TAN3" \n\t" \
"movdqa %%xmm3, %%xmm6 \n\t" \
"psubsw "TAN3", %%xmm3 \n\t" \
"paddsw %%xmm6, "TAN3" \n\t" \
"movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \
"pmulhw %%xmm4, %%xmm3 \n\t" \
"pmulhw %%xmm4, "TAN3" \n\t" \
"paddsw "TAN3", "TAN3" \n\t" \
"paddsw %%xmm3, %%xmm3 \n\t" \
"movdqa "MANGLE(tan2)", %%xmm5 \n\t" \
MOV_32_ONLY ROW2", "SREG2" \n\t" \
"pmulhw "SREG2", %%xmm5 \n\t" \
MOV_32_ONLY ROW0", "REG0" \n\t" \
"movdqa "REG0", %%xmm6 \n\t" \
"psubsw "SREG2", %%xmm6 \n\t" \
"paddsw "REG0", "SREG2" \n\t" \
MOV_32_ONLY" "TAN1", (%0) \n\t" \
"movdqa "REG0", "XMMS" \n\t" \
"psubsw %%xmm5, "REG0" \n\t" \
"paddsw "XMMS", %%xmm5 \n\t" \
"movdqa %%xmm5, "XMMS" \n\t" \
"psubsw "TAN3", %%xmm5 \n\t" \
"paddsw "XMMS", "TAN3" \n\t" \
"movdqa "REG0", "XMMS" \n\t" \
"psubsw %%xmm3, "REG0" \n\t" \
"paddsw "XMMS", %%xmm3 \n\t" \
MOV_32_ONLY" (%0), "TAN1" \n\t" \
"psraw $6, %%xmm5 \n\t" \
"psraw $6, "REG0" \n\t" \
"psraw $6, "TAN3" \n\t" \
"psraw $6, %%xmm3 \n\t" \
"movdqa "TAN3", 1*16("dct") \n\t" \
"movdqa %%xmm3, 2*16("dct") \n\t" \
"movdqa "REG0", 5*16("dct") \n\t" \
"movdqa %%xmm5, 6*16("dct") \n\t" \
"movdqa "SREG2", %%xmm0 \n\t" \
"movdqa %%xmm6, %%xmm4 \n\t" \
"psubsw %%xmm1, "SREG2" \n\t" \
"psubsw "TAN1", %%xmm6 \n\t" \
"paddsw %%xmm0, %%xmm1 \n\t" \
"paddsw %%xmm4, "TAN1" \n\t" \
"psraw $6, %%xmm1 \n\t" \
"psraw $6, "SREG2" \n\t" \
"psraw $6, "TAN1" \n\t" \
"psraw $6, %%xmm6 \n\t" \
"movdqa %%xmm1, ("dct") \n\t" \
"movdqa "TAN1", 3*16("dct") \n\t" \
"movdqa %%xmm6, 4*16("dct") \n\t" \
"movdqa "SREG2", 7*16("dct") \n\t"
 
inline void ff_idct_xvid_sse2(short *block)
{
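/*
 * Rows 3-7 are first tested for being all zero: a zero row needs no row
 * transform at all, and when rows 4-7 are all zero the cheaper
 * iLLM_PASS_SPARSE column pass is used instead of the full iLLM_PASS.
 */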
__asm__ volatile(
"movq "MANGLE(m127)", %%mm0 \n\t"
iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0))
iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1))
iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2))
 
TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4))
JZ("%%eax", "1f")
iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3))
 
TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6))
TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7))
iLLM_HEAD
".p2align 4 \n\t"
JNZ("%%ecx", "2f")
JNZ("%%eax", "3f")
JNZ("%%edx", "4f")
JNZ("%%esi", "5f")
iLLM_PASS_SPARSE("%0")
"jmp 6f \n\t"
"2: \n\t"
iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4))
"3: \n\t"
iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5))
JZ("%%edx", "1f")
"4: \n\t"
iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6))
JZ("%%esi", "1f")
"5: \n\t"
iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7))
#if ARCH_X86_32
iLLM_HEAD
#endif
iLLM_PASS("%0")
"6: \n\t"
: "+r"(block)
:
: XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" ,
"%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" ,)
#if ARCH_X86_64
XMM_CLOBBERS("%xmm8" , "%xmm9" , "%xmm10", "%xmm11",
"%xmm12", "%xmm13", "%xmm14",)
#endif
"%eax", "%ecx", "%edx", "%esi", "memory"
);
}
 
void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block)
{
ff_idct_xvid_sse2(block);
ff_put_pixels_clamped_mmx(block, dest, line_size);
}
 
void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block)
{
ff_idct_xvid_sse2(block);
ff_add_pixels_clamped_mmx(block, dest, line_size);
}
 
#endif /* HAVE_SSE2_INLINE */
/contrib/sdk/sources/ffmpeg/libavcodec/x86/idct_xvid.h
0,0 → 1,43
/*
* XVID MPEG-4 VIDEO CODEC
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
/**
* @file
* header for Xvid IDCT functions
*/
 
#ifndef AVCODEC_X86_IDCT_XVID_H
#define AVCODEC_X86_IDCT_XVID_H
 
#include <stdint.h>
 
void ff_idct_xvid_mmx(short *block);
void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, int16_t *block);
void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, int16_t *block);
 
void ff_idct_xvid_mmxext(short *block);
void ff_idct_xvid_mmxext_put(uint8_t *dest, int line_size, int16_t *block);
void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, int16_t *block);
 
void ff_idct_xvid_sse2(short *block);
void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block);
void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block);
 
#endif /* AVCODEC_X86_IDCT_XVID_H */
/contrib/sdk/sources/ffmpeg/libavcodec/x86/imdct36.asm
0,0 → 1,724
;******************************************************************************
;* 36 point SSE-optimized IMDCT transform
;* Copyright (c) 2011 Vitor Sessak
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
align 16
ps_mask: dd 0, ~0, ~0, ~0
ps_mask2: dd 0, ~0, 0, ~0
ps_mask3: dd 0, 0, 0, ~0
ps_mask4: dd 0, ~0, 0, 0
 
ps_val1: dd -0.5, -0.5, -0.8660254038, -0.8660254038
ps_val2: dd 1.0, 1.0, 0.8660254038, 0.8660254038
ps_val3: dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433
ps_val4: dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038
ps_val5: dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530
ps_val6: dd 0.5, 0.5, -0.6427876097, -0.6427876097
ps_val7: dd 1.0, 1.0, -0.6427876097, -0.6427876097
 
ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000
ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000
 
ps_cosh: dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461
dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349
dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896
dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991
dd 1.0, 0.70710678118654752439, 0.0, 0.0
 
ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461
dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349
dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896
dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991
dd 1.0, 0.70710678118654752439, 0.0, 0.0
 
costabs: times 4 dd 0.98480773
times 4 dd 0.93969262
times 4 dd 0.86602539
times 4 dd -0.76604444
times 4 dd -0.64278764
times 4 dd 0.50000000
times 4 dd -0.50000000
times 4 dd -0.34202015
times 4 dd -0.17364818
times 4 dd 0.50190992
times 4 dd 0.51763808
times 4 dd 0.55168896
times 4 dd 0.61038726
times 4 dd 0.70710677
times 4 dd 0.87172341
times 4 dd 1.18310082
times 4 dd 1.93185163
times 4 dd 5.73685646
 
%define SBLIMIT 32
SECTION_TEXT
 
%macro PSHUFD 3
%if cpuflag(sse2) && notcpuflag(avx)
pshufd %1, %2, %3
%else
shufps %1, %2, %2, %3
%endif
%endmacro
 
; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x3,x4,y1,y2}
%macro BUILDINVHIGHLOW 3
%if cpuflag(avx)
shufps %1, %2, %3, 0x4e
%else
movlhps %1, %3
movhlps %1, %2
%endif
%endmacro
 
; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4}
; output %1={x4,y1,y2,y3}
%macro ROTLEFT 3
%if cpuflag(ssse3)
palignr %1, %3, %2, 12
%else
BUILDINVHIGHLOW %1, %2, %3
shufps %1, %1, %3, 0x99
%endif
%endmacro
 
%macro INVERTHL 2
%if cpuflag(sse2)
PSHUFD %1, %2, 0x4e
%else
movhlps %1, %2
movlhps %1, %2
%endif
%endmacro
 
%macro BUTTERF 3
INVERTHL %2, %1
xorps %1, [ps_p1p1m1m1]
addps %1, %2
%if cpuflag(sse3)
mulps %1, %1, [ps_cosh_sse3 + %3]
PSHUFD %2, %1, 0xb1
addsubps %1, %1, %2
%else
mulps %1, [ps_cosh + %3]
PSHUFD %2, %1, 0xb1
xorps %1, [ps_p1m1p1m1]
addps %1, %2
%endif
%endmacro
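; STORE scatters the four floats of %1 to %3, %3 + %4, %3 + 2*%4 and %3 + 3*%4;
; LOAD gathers into %1 the first float stored at each of those four addresses.
; In both macros %2 is only used as a scratch register.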
 
%macro STORE 4
movhlps %2, %1
movss [%3 ], %1
movss [%3 + 2*%4], %2
shufps %1, %1, 0xb1
movss [%3 + %4], %1
movhlps %2, %1
movss [%3 + 3*%4], %2
%endmacro
 
%macro LOAD 4
movlps %1, [%3 ]
movhps %1, [%3 + %4]
movlps %2, [%3 + 2*%4]
movhps %2, [%3 + 3*%4]
shufps %1, %2, 0x88
%endmacro
 
%macro LOADA64 2
%if cpuflag(avx)
movu %1, [%2]
%else
movlps %1, [%2]
movhps %1, [%2 + 8]
%endif
%endmacro
 
%macro DEFINE_IMDCT 0
cglobal imdct36_float, 4,4,9, out, buf, in, win
 
; for(i=17;i>=1;i--) in[i] += in[i-1];
LOADA64 m0, inq
LOADA64 m1, inq + 16
 
ROTLEFT m5, m0, m1
 
PSHUFD m6, m0, 0x93
andps m6, m6, [ps_mask]
addps m0, m0, m6
 
LOADA64 m2, inq + 32
 
ROTLEFT m7, m1, m2
 
addps m1, m1, m5
LOADA64 m3, inq + 48
 
ROTLEFT m5, m2, m3
 
xorps m4, m4, m4
movlps m4, [inq+64]
BUILDINVHIGHLOW m6, m3, m4
shufps m6, m6, m4, 0xa9
 
addps m4, m4, m6
addps m2, m2, m7
addps m3, m3, m5
 
; for(i=17;i>=3;i-=2) in[i] += in[i-2];
movlhps m5, m5, m0
andps m5, m5, [ps_mask3]
 
BUILDINVHIGHLOW m7, m0, m1
andps m7, m7, [ps_mask2]
 
addps m0, m0, m5
 
BUILDINVHIGHLOW m6, m1, m2
andps m6, m6, [ps_mask2]
 
addps m1, m1, m7
 
BUILDINVHIGHLOW m7, m2, m3
andps m7, m7, [ps_mask2]
 
addps m2, m2, m6
 
movhlps m6, m6, m3
andps m6, m6, [ps_mask4]
 
addps m3, m3, m7
addps m4, m4, m6
 
; Populate tmp[]
movlhps m6, m1, m5 ; zero out high values
subps m6, m6, m4
 
subps m5, m0, m3
 
%if ARCH_X86_64
SWAP m5, m8
%endif
 
mulps m7, m2, [ps_val1]
 
%if ARCH_X86_64
mulps m5, m8, [ps_val2]
%else
mulps m5, m5, [ps_val2]
%endif
addps m7, m7, m5
 
mulps m5, m6, [ps_val1]
subps m7, m7, m5
 
%if ARCH_X86_64
SWAP m5, m8
%else
subps m5, m0, m3
%endif
 
subps m5, m5, m6
addps m5, m5, m2
 
shufps m6, m4, m3, 0xe4
subps m6, m6, m2
mulps m6, m6, [ps_val3]
 
addps m4, m4, m1
mulps m4, m4, [ps_val4]
 
shufps m1, m1, m0, 0xe4
addps m1, m1, m2
mulps m1, m1, [ps_val5]
 
mulps m3, m3, [ps_val6]
mulps m0, m0, [ps_val7]
addps m0, m0, m3
 
xorps m2, m1, [ps_p1p1m1m1]
subps m2, m2, m4
addps m2, m2, m0
 
addps m3, m4, m0
subps m3, m3, m6
xorps m3, m3, [ps_p1p1m1m1]
 
shufps m0, m0, m4, 0xe4
subps m0, m0, m1
addps m0, m0, m6
 
BUILDINVHIGHLOW m4, m2, m3
shufps m3, m3, m2, 0x4e
 
; we have tmp = {SwAPLH(m0), SwAPLH(m7), m3, m4, m5}
 
BUTTERF m0, m1, 0
BUTTERF m7, m2, 16
BUTTERF m3, m6, 32
BUTTERF m4, m1, 48
 
mulps m5, m5, [ps_cosh + 64]
PSHUFD m1, m5, 0xe1
xorps m5, m5, [ps_p1m1p1m1]
addps m5, m5, m1
 
; permutes:
; m0 0 1 2 3 => 2 6 10 14 m1
; m7 4 5 6 7 => 3 7 11 15 m2
; m3 8 9 10 11 => 17 13 9 5 m3
; m4 12 13 14 15 => 16 12 8 4 m5
; m5 16 17 xx xx => 0 1 xx xx m0
 
unpckhps m1, m0, m7
unpckhps m6, m3, m4
movhlps m2, m6, m1
movlhps m1, m1, m6
 
unpcklps m5, m5, m4
unpcklps m3, m3, m7
movhlps m4, m3, m5
movlhps m5, m5, m3
SWAP m4, m3
; permutation done
 
PSHUFD m6, m2, 0xb1
movss m4, [bufq + 4*68]
movss m7, [bufq + 4*64]
unpcklps m7, m7, m4
mulps m6, m6, [winq + 16*4]
addps m6, m6, m7
movss [outq + 64*SBLIMIT], m6
shufps m6, m6, m6, 0xb1
movss [outq + 68*SBLIMIT], m6
 
mulps m6, m3, [winq + 4*4]
LOAD m4, m7, bufq + 4*16, 16
addps m6, m6, m4
STORE m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT
 
shufps m4, m0, m3, 0xb5
mulps m4, m4, [winq + 8*4]
LOAD m7, m6, bufq + 4*32, 16
addps m4, m4, m7
STORE m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT
 
shufps m3, m3, m2, 0xb1
mulps m3, m3, [winq + 12*4]
LOAD m7, m6, bufq + 4*48, 16
addps m3, m3, m7
STORE m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT
 
mulps m2, m2, [winq]
LOAD m6, m7, bufq, 16
addps m2, m2, m6
STORE m2, m7, outq, 4*SBLIMIT
 
mulps m4, m1, [winq + 20*4]
STORE m4, m7, bufq, 16
 
mulps m3, m5, [winq + 24*4]
STORE m3, m7, bufq + 4*16, 16
 
shufps m0, m0, m5, 0xb0
mulps m0, m0, [winq + 28*4]
STORE m0, m7, bufq + 4*32, 16
 
shufps m5, m5, m1, 0xb1
mulps m5, m5, [winq + 32*4]
STORE m5, m7, bufq + 4*48, 16
 
shufps m1, m1, m1, 0xb1
mulps m1, m1, [winq + 36*4]
movss [bufq + 4*64], m1
shufps m1, m1, 0xb1
movss [bufq + 4*68], m1
RET
%endmacro
 
INIT_XMM sse
DEFINE_IMDCT
 
INIT_XMM sse2
DEFINE_IMDCT
 
INIT_XMM sse3
DEFINE_IMDCT
 
INIT_XMM ssse3
DEFINE_IMDCT
 
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_IMDCT
%endif
 
INIT_XMM sse
 
%if ARCH_X86_64
%define SPILL SWAP
%define UNSPILL SWAP
%define SPILLED(x) m %+ x
%else
%define SPILLED(x) [tmpq+(x-8)*16 + 32*4]
%macro SPILL 2 ; xmm#, mempos
movaps SPILLED(%2), m%1
%endmacro
%macro UNSPILL 2
movaps m%1, SPILLED(%2)
%endmacro
%endif
 
%macro DEFINE_FOUR_IMDCT 0
cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp
movlps m0, [inq+64]
movhps m0, [inq+64 + 72]
movlps m3, [inq+64 + 2*72]
movhps m3, [inq+64 + 3*72]
 
shufps m5, m0, m3, 0xdd
shufps m0, m0, m3, 0x88
 
mova m1, [inq+48]
movu m6, [inq+48 + 72]
mova m7, [inq+48 + 2*72]
movu m3, [inq+48 + 3*72]
 
TRANSPOSE4x4PS 1, 6, 7, 3, 4
 
addps m4, m6, m7
mova [tmpq+4*28], m4
 
addps m7, m3
addps m6, m1
addps m3, m0
addps m0, m5
addps m0, m7
addps m7, m6
mova [tmpq+4*12], m7
SPILL 3, 12
 
mova m4, [inq+32]
movu m5, [inq+32 + 72]
mova m2, [inq+32 + 2*72]
movu m7, [inq+32 + 3*72]
 
TRANSPOSE4x4PS 4, 5, 2, 7, 3
 
addps m1, m7
SPILL 1, 11
 
addps m3, m5, m2
SPILL 3, 13
 
addps m7, m2
addps m5, m4
addps m6, m7
mova [tmpq], m6
addps m7, m5
mova [tmpq+4*16], m7
 
mova m2, [inq+16]
movu m7, [inq+16 + 72]
mova m1, [inq+16 + 2*72]
movu m6, [inq+16 + 3*72]
 
TRANSPOSE4x4PS 2, 7, 1, 6, 3
 
addps m4, m6
addps m6, m1
addps m1, m7
addps m7, m2
addps m5, m6
SPILL 5, 15
addps m6, m7
mulps m6, [costabs + 16*2]
mova [tmpq+4*8], m6
SPILL 1, 10
SPILL 0, 14
 
mova m1, [inq]
movu m6, [inq + 72]
mova m3, [inq + 2*72]
movu m5, [inq + 3*72]
 
TRANSPOSE4x4PS 1, 6, 3, 5, 0
 
addps m2, m5
addps m5, m3
addps m7, m5
addps m3, m6
addps m6, m1
SPILL 7, 8
addps m5, m6
SPILL 6, 9
addps m6, m4, SPILLED(12)
subps m6, m2
UNSPILL 7, 11
SPILL 5, 11
subps m5, m1, m7
mulps m7, [costabs + 16*5]
addps m7, m1
mulps m0, m6, [costabs + 16*6]
addps m0, m5
mova [tmpq+4*24], m0
addps m6, m5
mova [tmpq+4*4], m6
addps m6, m4, m2
mulps m6, [costabs + 16*1]
subps m4, SPILLED(12)
mulps m4, [costabs + 16*8]
addps m2, SPILLED(12)
mulps m2, [costabs + 16*3]
subps m5, m7, m6
subps m5, m2
addps m6, m7
addps m6, m4
addps m7, m2
subps m7, m4
mova [tmpq+4*20], m7
mova m2, [tmpq+4*28]
mova [tmpq+4*28], m5
UNSPILL 7, 13
subps m5, m7, m2
mulps m5, [costabs + 16*7]
UNSPILL 1, 10
mulps m1, [costabs + 16*2]
addps m4, m3, m2
mulps m4, [costabs + 16*4]
addps m2, m7
addps m7, m3
mulps m7, [costabs]
subps m3, m2
mulps m3, [costabs + 16*2]
addps m2, m7, m5
addps m2, m1
SPILL 2, 10
addps m7, m4
subps m7, m1
SPILL 7, 12
subps m5, m4
subps m5, m1
UNSPILL 0, 14
SPILL 5, 13
addps m1, m0, SPILLED(15)
subps m1, SPILLED(8)
mova m4, [costabs + 16*5]
mulps m4, [tmpq]
UNSPILL 2, 9
addps m4, m2
subps m2, [tmpq]
mulps m5, m1, [costabs + 16*6]
addps m5, m2
SPILL 5, 9
addps m2, m1
SPILL 2, 14
UNSPILL 5, 15
subps m7, m5, m0
addps m5, SPILLED(8)
mulps m5, [costabs + 16*1]
mulps m7, [costabs + 16*8]
addps m0, SPILLED(8)
mulps m0, [costabs + 16*3]
subps m2, m4, m5
subps m2, m0
SPILL 2, 15
addps m5, m4
addps m5, m7
addps m4, m0
subps m4, m7
SPILL 4, 8
mova m7, [tmpq+4*16]
mova m2, [tmpq+4*12]
addps m0, m7, m2
subps m0, SPILLED(11)
mulps m0, [costabs + 16*2]
addps m4, m7, SPILLED(11)
mulps m4, [costabs]
subps m7, m2
mulps m7, [costabs + 16*7]
addps m2, SPILLED(11)
mulps m2, [costabs + 16*4]
addps m1, m7, [tmpq+4*8]
addps m1, m4
addps m4, m2
subps m4, [tmpq+4*8]
SPILL 4, 11
subps m7, m2
subps m7, [tmpq+4*8]
addps m4, m6, SPILLED(10)
subps m6, SPILLED(10)
addps m2, m5, m1
mulps m2, [costabs + 16*9]
subps m5, m1
mulps m5, [costabs + 16*17]
subps m1, m4, m2
addps m4, m2
mulps m2, m1, [winq+4*36]
addps m2, [bufq+4*36]
mova [outq+1152], m2
mulps m1, [winq+4*32]
addps m1, [bufq+4*32]
mova [outq+1024], m1
mulps m1, m4, [winq+4*116]
mova [bufq+4*36], m1
mulps m4, [winq+4*112]
mova [bufq+4*32], m4
addps m2, m6, m5
subps m6, m5
mulps m1, m6, [winq+4*68]
addps m1, [bufq+4*68]
mova [outq+2176], m1
mulps m6, [winq]
addps m6, [bufq]
mova [outq], m6
mulps m1, m2, [winq+4*148]
mova [bufq+4*68], m1
mulps m2, [winq+4*80]
mova [bufq], m2
addps m5, m3, [tmpq+4*24]
mova m2, [tmpq+4*24]
subps m2, m3
mova m1, SPILLED(9)
subps m1, m0
mulps m1, [costabs + 16*10]
addps m0, SPILLED(9)
mulps m0, [costabs + 16*16]
addps m6, m5, m1
subps m5, m1
mulps m3, m5, [winq+4*40]
addps m3, [bufq+4*40]
mova [outq+1280], m3
mulps m5, [winq+4*28]
addps m5, [bufq+4*28]
mova [outq+896], m5
mulps m1, m6, [winq+4*120]
mova [bufq+4*40], m1
mulps m6, [winq+4*108]
mova [bufq+4*28], m6
addps m1, m2, m0
subps m2, m0
mulps m5, m2, [winq+4*64]
addps m5, [bufq+4*64]
mova [outq+2048], m5
mulps m2, [winq+4*4]
addps m2, [bufq+4*4]
mova [outq+128], m2
mulps m0, m1, [winq+4*144]
mova [bufq+4*64], m0
mulps m1, [winq+4*84]
mova [bufq+4*4], m1
mova m1, [tmpq+4*28]
mova m5, m1
addps m1, SPILLED(13)
subps m5, SPILLED(13)
UNSPILL 3, 15
addps m2, m7, m3
mulps m2, [costabs + 16*11]
subps m3, m7
mulps m3, [costabs + 16*15]
addps m0, m2, m1
subps m1, m2
SWAP m0, m2
mulps m6, m1, [winq+4*44]
addps m6, [bufq+4*44]
mova [outq+1408], m6
mulps m1, [winq+4*24]
addps m1, [bufq+4*24]
mova [outq+768], m1
mulps m0, m2, [winq+4*124]
mova [bufq+4*44], m0
mulps m2, [winq+4*104]
mova [bufq+4*24], m2
addps m0, m5, m3
subps m5, m3
mulps m1, m5, [winq+4*60]
addps m1, [bufq+4*60]
mova [outq+1920], m1
mulps m5, [winq+4*8]
addps m5, [bufq+4*8]
mova [outq+256], m5
mulps m1, m0, [winq+4*140]
mova [bufq+4*60], m1
mulps m0, [winq+4*88]
mova [bufq+4*8], m0
mova m1, [tmpq+4*20]
addps m1, SPILLED(12)
mova m2, [tmpq+4*20]
subps m2, SPILLED(12)
UNSPILL 7, 8
subps m0, m7, SPILLED(11)
addps m7, SPILLED(11)
mulps m4, m7, [costabs + 16*12]
mulps m0, [costabs + 16*14]
addps m5, m1, m4
subps m1, m4
mulps m7, m1, [winq+4*48]
addps m7, [bufq+4*48]
mova [outq+1536], m7
mulps m1, [winq+4*20]
addps m1, [bufq+4*20]
mova [outq+640], m1
mulps m1, m5, [winq+4*128]
mova [bufq+4*48], m1
mulps m5, [winq+4*100]
mova [bufq+4*20], m5
addps m6, m2, m0
subps m2, m0
mulps m1, m2, [winq+4*56]
addps m1, [bufq+4*56]
mova [outq+1792], m1
mulps m2, [winq+4*12]
addps m2, [bufq+4*12]
mova [outq+384], m2
mulps m0, m6, [winq+4*136]
mova [bufq+4*56], m0
mulps m6, [winq+4*92]
mova [bufq+4*12], m6
UNSPILL 0, 14
mulps m0, [costabs + 16*13]
mova m3, [tmpq+4*4]
addps m2, m0, m3
subps m3, m0
mulps m0, m3, [winq+4*52]
addps m0, [bufq+4*52]
mova [outq+1664], m0
mulps m3, [winq+4*16]
addps m3, [bufq+4*16]
mova [outq+512], m3
mulps m0, m2, [winq+4*132]
mova [bufq+4*52], m0
mulps m2, [winq+4*96]
mova [bufq+4*16], m2
RET
%endmacro
 
INIT_XMM sse
DEFINE_FOUR_IMDCT
 
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEFINE_FOUR_IMDCT
%endif
/contrib/sdk/sources/ffmpeg/libavcodec/x86/lpc.c
0,0 → 1,159
/*
* MMX optimized LPC DSP utils
* Copyright (c) 2007 Loren Merritt
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/lpc.h"
 
DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 };
DECLARE_ASM_CONST(16, double, pd_2)[2] = { 2.0, 2.0 };
 
#if HAVE_SSE2_INLINE
 
static void lpc_apply_welch_window_sse2(const int32_t *data, int len,
double *w_data)
{
double c = 2.0 / (len-1.0);
int n2 = len>>1;
x86_reg i = -n2*sizeof(int32_t);
x86_reg j = n2*sizeof(int32_t);
__asm__ volatile(
"movsd %4, %%xmm7 \n\t"
"movapd "MANGLE(pd_1)", %%xmm6 \n\t"
"movapd "MANGLE(pd_2)", %%xmm5 \n\t"
"movlhps %%xmm7, %%xmm7 \n\t"
"subpd %%xmm5, %%xmm7 \n\t"
"addsd %%xmm6, %%xmm7 \n\t"
"test $1, %5 \n\t"
"jz 2f \n\t"
#define WELCH(MOVPD, offset)\
"1: \n\t"\
"movapd %%xmm7, %%xmm1 \n\t"\
"mulpd %%xmm1, %%xmm1 \n\t"\
"movapd %%xmm6, %%xmm0 \n\t"\
"subpd %%xmm1, %%xmm0 \n\t"\
"pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\
"cvtpi2pd (%3,%0), %%xmm2 \n\t"\
"cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\
"mulpd %%xmm0, %%xmm2 \n\t"\
"mulpd %%xmm1, %%xmm3 \n\t"\
"movapd %%xmm2, (%2,%0,2) \n\t"\
MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\
"subpd %%xmm5, %%xmm7 \n\t"\
"sub $8, %1 \n\t"\
"add $8, %0 \n\t"\
"jl 1b \n\t"\
 
WELCH("movupd", -1)
"jmp 3f \n\t"
"2: \n\t"
WELCH("movapd", -2)
"3: \n\t"
:"+&r"(i), "+&r"(j)
:"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm5", "%xmm6", "%xmm7")
);
#undef WELCH
}
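/*
 * Simplified scalar sketch of the windowing computed above for even len (the
 * assembly also handles odd len via the movupd/-1 variant of WELCH): each pair
 * of samples mirrored around the centre gets the Welch weight 1 - w*w, with w
 * stepping by one from c - n2 at the centre to c - 1 at the edges. The
 * function name is illustrative only and is not referenced by this file.
 */
static void lpc_apply_welch_window_sketch(const int32_t *data, int len,
                                          double *w_data)
{
    int i, n2 = len >> 1;
    double c = 2.0 / (len - 1.0);

    for (i = 0; i < n2; i++) {
        double w = c - n2 + i;
        w = 1.0 - w * w;
        w_data[n2 - 1 - i] = data[n2 - 1 - i] * w;
        w_data[n2 + i]     = data[n2 + i]     * w;
    }
}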
 
static void lpc_compute_autocorr_sse2(const double *data, int len, int lag,
double *autoc)
{
int j;
 
if((x86_reg)data & 15)
data++;
 
for(j=0; j<lag; j+=2){
x86_reg i = -len*sizeof(double);
if(j == lag-2) {
__asm__ volatile(
"movsd "MANGLE(pd_1)", %%xmm0 \n\t"
"movsd "MANGLE(pd_1)", %%xmm1 \n\t"
"movsd "MANGLE(pd_1)", %%xmm2 \n\t"
"1: \n\t"
"movapd (%2,%0), %%xmm3 \n\t"
"movupd -8(%3,%0), %%xmm4 \n\t"
"movapd (%3,%0), %%xmm5 \n\t"
"mulpd %%xmm3, %%xmm4 \n\t"
"mulpd %%xmm3, %%xmm5 \n\t"
"mulpd -16(%3,%0), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm1 \n\t"
"addpd %%xmm5, %%xmm0 \n\t"
"addpd %%xmm3, %%xmm2 \n\t"
"add $16, %0 \n\t"
"jl 1b \n\t"
"movhlps %%xmm0, %%xmm3 \n\t"
"movhlps %%xmm1, %%xmm4 \n\t"
"movhlps %%xmm2, %%xmm5 \n\t"
"addsd %%xmm3, %%xmm0 \n\t"
"addsd %%xmm4, %%xmm1 \n\t"
"addsd %%xmm5, %%xmm2 \n\t"
"movsd %%xmm0, (%1) \n\t"
"movsd %%xmm1, 8(%1) \n\t"
"movsd %%xmm2, 16(%1) \n\t"
:"+&r"(i)
:"r"(autoc+j), "r"(data+len), "r"(data+len-j)
:"memory"
);
} else {
__asm__ volatile(
"movsd "MANGLE(pd_1)", %%xmm0 \n\t"
"movsd "MANGLE(pd_1)", %%xmm1 \n\t"
"1: \n\t"
"movapd (%3,%0), %%xmm3 \n\t"
"movupd -8(%4,%0), %%xmm4 \n\t"
"mulpd %%xmm3, %%xmm4 \n\t"
"mulpd (%4,%0), %%xmm3 \n\t"
"addpd %%xmm4, %%xmm1 \n\t"
"addpd %%xmm3, %%xmm0 \n\t"
"add $16, %0 \n\t"
"jl 1b \n\t"
"movhlps %%xmm0, %%xmm3 \n\t"
"movhlps %%xmm1, %%xmm4 \n\t"
"addsd %%xmm3, %%xmm0 \n\t"
"addsd %%xmm4, %%xmm1 \n\t"
"movsd %%xmm0, %1 \n\t"
"movsd %%xmm1, %2 \n\t"
:"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1])
:"r"(data+len), "r"(data+len-j)
);
}
}
}
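/*
 * Simplified scalar sketch of what the loops above accumulate: lags are
 * processed in pairs and each sum starts from 1.0 (the pd_1 constant). The
 * alignment adjustment and the three-sum tail case of the assembly are
 * omitted, and the function name is illustrative only.
 */
static void lpc_compute_autocorr_sketch(const double *data, int len, int lag,
                                        double *autoc)
{
    int i, j;

    for (j = 0; j < lag; j += 2) {
        double sum0 = 1.0, sum1 = 1.0;
        for (i = j; i < len; i++)
            sum0 += data[i] * data[i - j];
        for (i = j + 1; i < len; i++)
            sum1 += data[i] * data[i - j - 1];
        autoc[j]     = sum0;
        autoc[j + 1] = sum1;
    }
}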
 
#endif /* HAVE_SSE2_INLINE */
 
av_cold void ff_lpc_init_x86(LPCContext *c)
{
#if HAVE_SSE2_INLINE
int cpu_flags = av_get_cpu_flags();
 
if (HAVE_SSE2_INLINE && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
c->lpc_apply_welch_window = lpc_apply_welch_window_sse2;
c->lpc_compute_autocorr = lpc_compute_autocorr_sse2;
}
#endif /* HAVE_SSE2_INLINE */
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/mathops.h
0,0 → 1,128
/*
* simple math operations
* Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#ifndef AVCODEC_X86_MATHOPS_H
#define AVCODEC_X86_MATHOPS_H
 
#include "config.h"
#include "libavutil/common.h"
 
#if HAVE_INLINE_ASM
 
#if ARCH_X86_32
 
#define MULL MULL
static av_always_inline av_const int MULL(int a, int b, unsigned shift)
{
int rt, dummy;
__asm__ (
"imull %3 \n\t"
"shrdl %4, %%edx, %%eax \n\t"
:"=a"(rt), "=d"(dummy)
:"a"(a), "rm"(b), "ci"((uint8_t)shift)
);
return rt;
}
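/* Illustrative note: the imull/shrdl pair above is equivalent to
 * (int)(((int64_t)a * b) >> shift), using the full 64-bit product in edx:eax. */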
 
#define MULH MULH
static av_always_inline av_const int MULH(int a, int b)
{
int rt, dummy;
__asm__ (
"imull %3"
:"=d"(rt), "=a"(dummy)
:"a"(a), "rm"(b)
);
return rt;
}
 
#define MUL64 MUL64
static av_always_inline av_const int64_t MUL64(int a, int b)
{
int64_t rt;
__asm__ (
"imull %2"
:"=A"(rt)
:"a"(a), "rm"(b)
);
return rt;
}
 
#endif /* ARCH_X86_32 */
 
#if HAVE_I686
/* median of 3 */
#define mid_pred mid_pred
static inline av_const int mid_pred(int a, int b, int c)
{
int i=b;
__asm__ (
"cmp %2, %1 \n\t"
"cmovg %1, %0 \n\t"
"cmovg %2, %1 \n\t"
"cmp %3, %1 \n\t"
"cmovl %3, %1 \n\t"
"cmp %1, %0 \n\t"
"cmovg %1, %0 \n\t"
:"+&r"(i), "+&r"(a)
:"r"(b), "r"(c)
);
return i;
}
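/* Illustrative note: the cmov sequence above returns the median of a, b and c,
 * i.e. the same value as FFMAX(FFMIN(a, b), FFMIN(FFMAX(a, b), c)). */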
 
#define COPY3_IF_LT(x, y, a, b, c, d)\
__asm__ volatile(\
"cmpl %0, %3 \n\t"\
"cmovl %3, %0 \n\t"\
"cmovl %4, %1 \n\t"\
"cmovl %5, %2 \n\t"\
: "+&r" (x), "+&r" (a), "+r" (c)\
: "r" (y), "r" (b), "r" (d)\
);
#endif /* HAVE_I686 */
 
#define MASK_ABS(mask, level) \
__asm__ ("cltd \n\t" \
"xorl %1, %0 \n\t" \
"subl %1, %0 \n\t" \
: "+a"(level), "=&d"(mask))
 
// avoid +32 for shift optimization (gcc should do that ...)
#define NEG_SSR32 NEG_SSR32
static inline int32_t NEG_SSR32( int32_t a, int8_t s){
__asm__ ("sarl %1, %0\n\t"
: "+r" (a)
: "ic" ((uint8_t)(-s))
);
return a;
}
 
#define NEG_USR32 NEG_USR32
static inline uint32_t NEG_USR32(uint32_t a, int8_t s){
__asm__ ("shrl %1, %0\n\t"
: "+r" (a)
: "ic" ((uint8_t)(-s))
);
return a;
}
 
#endif /* HAVE_INLINE_ASM */
#endif /* AVCODEC_X86_MATHOPS_H */
/contrib/sdk/sources/ffmpeg/libavcodec/x86/mlpdsp.c
0,0 → 1,186
/*
* MLP DSP functions x86-optimized
* Copyright (c) 2009 Ramiro Polla
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mlpdsp.h"
#include "libavcodec/mlp.h"
 
#if HAVE_7REGS && HAVE_INLINE_ASM
 
extern char ff_mlp_firorder_8;
extern char ff_mlp_firorder_7;
extern char ff_mlp_firorder_6;
extern char ff_mlp_firorder_5;
extern char ff_mlp_firorder_4;
extern char ff_mlp_firorder_3;
extern char ff_mlp_firorder_2;
extern char ff_mlp_firorder_1;
extern char ff_mlp_firorder_0;
 
extern char ff_mlp_iirorder_4;
extern char ff_mlp_iirorder_3;
extern char ff_mlp_iirorder_2;
extern char ff_mlp_iirorder_1;
extern char ff_mlp_iirorder_0;
 
static const void *firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1,
&ff_mlp_firorder_2, &ff_mlp_firorder_3,
&ff_mlp_firorder_4, &ff_mlp_firorder_5,
&ff_mlp_firorder_6, &ff_mlp_firorder_7,
&ff_mlp_firorder_8 };
static const void *iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1,
&ff_mlp_iirorder_2, &ff_mlp_iirorder_3,
&ff_mlp_iirorder_4 };
 
#if ARCH_X86_64
 
#define MLPMUL(label, offset, offs, offc) \
LABEL_MANGLE(label)": \n\t" \
"movslq "offset"+"offs"(%0), %%rax\n\t" \
"movslq "offset"+"offc"(%1), %%rdx\n\t" \
"imul %%rdx, %%rax\n\t" \
"add %%rax, %%rsi\n\t"
 
#define FIRMULREG(label, offset, firc)\
LABEL_MANGLE(label)": \n\t" \
"movslq "#offset"(%0), %%rax\n\t" \
"imul %"#firc", %%rax\n\t" \
"add %%rax, %%rsi\n\t"
 
#define CLEAR_ACCUM \
"xor %%rsi, %%rsi\n\t"
 
#define SHIFT_ACCUM \
"shr %%cl, %%rsi\n\t"
 
#define ACCUM "%%rdx"
#define RESULT "%%rsi"
#define RESULT32 "%%esi"
 
#else /* if ARCH_X86_32 */
 
#define MLPMUL(label, offset, offs, offc) \
LABEL_MANGLE(label)": \n\t" \
"mov "offset"+"offs"(%0), %%eax\n\t" \
"imull "offset"+"offc"(%1) \n\t" \
"add %%eax , %%esi\n\t" \
"adc %%edx , %%ecx\n\t"
 
#define FIRMULREG(label, offset, firc) \
MLPMUL(label, #offset, "0", "0")
 
#define CLEAR_ACCUM \
"xor %%esi, %%esi\n\t" \
"xor %%ecx, %%ecx\n\t"
 
#define SHIFT_ACCUM \
"mov %%ecx, %%edx\n\t" \
"mov %%esi, %%eax\n\t" \
"movzbl %7 , %%ecx\n\t" \
"shrd %%cl, %%edx, %%eax\n\t" \
 
#define ACCUM "%%edx"
#define RESULT "%%eax"
#define RESULT32 "%%eax"
 
#endif /* !ARCH_X86_64 */
 
#define BINC AV_STRINGIFY(4* MAX_CHANNELS)
#define IOFFS AV_STRINGIFY(4*(MAX_FIR_ORDER + MAX_BLOCKSIZE))
#define IOFFC AV_STRINGIFY(4* MAX_FIR_ORDER)
 
#define FIRMUL(label, offset) MLPMUL(label, #offset, "0", "0")
#define IIRMUL(label, offset) MLPMUL(label, #offset, IOFFS, IOFFC)
 
static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff,
int firorder, int iirorder,
unsigned int filter_shift, int32_t mask,
int blocksize, int32_t *sample_buffer)
{
const void *firjump = firtable[firorder];
const void *iirjump = iirtable[iirorder];
 
blocksize = -blocksize;
 
__asm__ volatile(
"1: \n\t"
CLEAR_ACCUM
"jmp *%5 \n\t"
FIRMUL (ff_mlp_firorder_8, 0x1c )
FIRMUL (ff_mlp_firorder_7, 0x18 )
FIRMUL (ff_mlp_firorder_6, 0x14 )
FIRMUL (ff_mlp_firorder_5, 0x10 )
FIRMUL (ff_mlp_firorder_4, 0x0c )
FIRMULREG(ff_mlp_firorder_3, 0x08,10)
FIRMULREG(ff_mlp_firorder_2, 0x04, 9)
FIRMULREG(ff_mlp_firorder_1, 0x00, 8)
LABEL_MANGLE(ff_mlp_firorder_0)":\n\t"
"jmp *%6 \n\t"
IIRMUL (ff_mlp_iirorder_4, 0x0c )
IIRMUL (ff_mlp_iirorder_3, 0x08 )
IIRMUL (ff_mlp_iirorder_2, 0x04 )
IIRMUL (ff_mlp_iirorder_1, 0x00 )
LABEL_MANGLE(ff_mlp_iirorder_0)":\n\t"
SHIFT_ACCUM
"mov "RESULT" ,"ACCUM" \n\t"
"add (%2) ,"RESULT" \n\t"
"and %4 ,"RESULT" \n\t"
"sub $4 , %0 \n\t"
"mov "RESULT32", (%0) \n\t"
"mov "RESULT32", (%2) \n\t"
"add $"BINC" , %2 \n\t"
"sub "ACCUM" ,"RESULT" \n\t"
"mov "RESULT32","IOFFS"(%0) \n\t"
"incl %3 \n\t"
"js 1b \n\t"
: /* 0*/"+r"(state),
/* 1*/"+r"(coeff),
/* 2*/"+r"(sample_buffer),
#if ARCH_X86_64
/* 3*/"+r"(blocksize)
: /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump),
/* 6*/"r"(iirjump) , /* 7*/"c"(filter_shift)
, /* 8*/"r"((int64_t)coeff[0])
, /* 9*/"r"((int64_t)coeff[1])
, /*10*/"r"((int64_t)coeff[2])
: "rax", "rdx", "rsi"
#else /* ARCH_X86_32 */
/* 3*/"+m"(blocksize)
: /* 4*/"m"( mask), /* 5*/"m"(firjump),
/* 6*/"m"(iirjump) , /* 7*/"m"(filter_shift)
: "eax", "edx", "esi", "ecx"
#endif /* !ARCH_X86_64 */
);
}
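/*
 * Rough scalar sketch of the per-sample work done by the assembly above,
 * assuming the usual MLP state layout (FIR history at state[], IIR history
 * IOFFS bytes further on, IIR coefficients IOFFC bytes after the FIR ones).
 * It is illustrative only and is not referenced by this file.
 */
static void mlp_filter_channel_sketch(int32_t *state, const int32_t *coeff,
                                      int firorder, int iirorder,
                                      unsigned int filter_shift, int32_t mask,
                                      int blocksize, int32_t *sample_buffer)
{
    int32_t *firbuf = state;
    int32_t *iirbuf = state + MAX_BLOCKSIZE + MAX_FIR_ORDER;
    int i, o;

    for (i = 0; i < blocksize; i++) {
        int64_t accum = 0;
        int32_t result;

        for (o = 0; o < firorder; o++)
            accum += (int64_t) firbuf[o] * coeff[o];
        for (o = 0; o < iirorder; o++)
            accum += (int64_t) iirbuf[o] * coeff[MAX_FIR_ORDER + o];

        accum >>= filter_shift;
        result  = (accum + *sample_buffer) & mask;

        *--firbuf = result;
        *--iirbuf = result - (int32_t) accum;

        *sample_buffer = result;
        sample_buffer += MAX_CHANNELS;
    }
}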
 
#endif /* HAVE_7REGS && HAVE_INLINE_ASM */
 
av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c)
{
#if HAVE_7REGS && HAVE_INLINE_ASM
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags))
c->mlp_filter_channel = mlp_filter_channel_x86;
#endif
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/motion_est.c
0,0 → 1,474
/*
* MMX optimized motion estimation
* Copyright (c) 2001 Fabrice Bellard
* Copyright (c) 2002-2004 Michael Niedermayer
*
* mostly by Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "dsputil_x86.h"
 
#if HAVE_INLINE_ASM
 
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3]={
0x0000000000000000ULL,
0x0001000100010001ULL,
0x0002000200020002ULL,
};
 
DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL;
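 
/* round_tab[] holds the packed-word rounding constants used by the averaged
 * SAD variants below: +1 before a >>1 (two-pixel) average and +2 before a
 * >>2 (four-pixel) average.  bone (0x01 in every byte) is subtracted in
 * sad8_4_mmxext() to compensate for the bias of two chained pavgb averages. */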
 
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
x86_reg len= -(x86_reg)stride*h;
__asm__ volatile(
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq (%2, %%"REG_a"), %%mm2 \n\t"
"movq (%2, %%"REG_a"), %%mm4 \n\t"
"add %3, %%"REG_a" \n\t"
"psubusb %%mm0, %%mm2 \n\t"
"psubusb %%mm4, %%mm0 \n\t"
"movq (%1, %%"REG_a"), %%mm1 \n\t"
"movq (%2, %%"REG_a"), %%mm3 \n\t"
"movq (%2, %%"REG_a"), %%mm5 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm5, %%mm1 \n\t"
"por %%mm2, %%mm0 \n\t"
"por %%mm1, %%mm3 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm3, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm3 \n\t"
"punpckhbw %%mm7, %%mm2 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm3, %%mm2 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"add %3, %%"REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride)
);
}
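 
/* Scalar form of the 8-wide SAD accumulated above (sketch):
 *
 *     for (y = 0; y < h; y++)
 *         for (x = 0; x < 8; x++)
 *             sad += FFABS(blk1[y * stride + x] - blk2[y * stride + x]);
 *
 * The sad8_* helpers only gather packed partial sums in %mm6 (zeroed by the
 * PIX_SAD() wrappers); sum_mmx()/sum_mmxext() below do the final horizontal
 * reduction. */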
 
static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2,
int stride, int h)
{
__asm__ volatile(
".p2align 4 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"psadbw (%2), %%mm0 \n\t"
"psadbw (%2, %3), %%mm1 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"paddw %%mm1, %%mm6 \n\t"
"lea (%1,%3,2), %1 \n\t"
"lea (%2,%3,2), %2 \n\t"
"sub $2, %0 \n\t"
" jg 1b \n\t"
: "+r" (h), "+r" (blk1), "+r" (blk2)
: "r" ((x86_reg)stride)
);
}
 
static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)
{
int ret;
__asm__ volatile(
"pxor %%xmm2, %%xmm2 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movdqu (%1), %%xmm0 \n\t"
"movdqu (%1, %4), %%xmm1 \n\t"
"psadbw (%2), %%xmm0 \n\t"
"psadbw (%2, %4), %%xmm1 \n\t"
"paddw %%xmm0, %%xmm2 \n\t"
"paddw %%xmm1, %%xmm2 \n\t"
"lea (%1,%4,2), %1 \n\t"
"lea (%2,%4,2), %2 \n\t"
"sub $2, %0 \n\t"
" jg 1b \n\t"
"movhlps %%xmm2, %%xmm0 \n\t"
"paddw %%xmm0, %%xmm2 \n\t"
"movd %%xmm2, %3 \n\t"
: "+r" (h), "+r" (blk1), "+r" (blk2), "=r"(ret)
: "r" ((x86_reg)stride)
);
return ret;
}
 
static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2,
int stride, int h)
{
__asm__ volatile(
".p2align 4 \n\t"
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq (%1, %3), %%mm1 \n\t"
"pavgb 1(%1), %%mm0 \n\t"
"pavgb 1(%1, %3), %%mm1 \n\t"
"psadbw (%2), %%mm0 \n\t"
"psadbw (%2, %3), %%mm1 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"paddw %%mm1, %%mm6 \n\t"
"lea (%1,%3,2), %1 \n\t"
"lea (%2,%3,2), %2 \n\t"
"sub $2, %0 \n\t"
" jg 1b \n\t"
: "+r" (h), "+r" (blk1), "+r" (blk2)
: "r" ((x86_reg)stride)
);
}
 
static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2,
int stride, int h)
{
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"add %3, %1 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%1), %%mm1 \n\t"
"movq (%1, %3), %%mm2 \n\t"
"pavgb %%mm1, %%mm0 \n\t"
"pavgb %%mm2, %%mm1 \n\t"
"psadbw (%2), %%mm0 \n\t"
"psadbw (%2, %3), %%mm1 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"paddw %%mm1, %%mm6 \n\t"
"movq %%mm2, %%mm0 \n\t"
"lea (%1,%3,2), %1 \n\t"
"lea (%2,%3,2), %2 \n\t"
"sub $2, %0 \n\t"
" jg 1b \n\t"
: "+r" (h), "+r" (blk1), "+r" (blk2)
: "r" ((x86_reg)stride)
);
}
 
static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2,
int stride, int h)
{
__asm__ volatile(
"movq "MANGLE(bone)", %%mm5 \n\t"
"movq (%1), %%mm0 \n\t"
"pavgb 1(%1), %%mm0 \n\t"
"add %3, %1 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%1), %%mm1 \n\t"
"movq (%1,%3), %%mm2 \n\t"
"pavgb 1(%1), %%mm1 \n\t"
"pavgb 1(%1,%3), %%mm2 \n\t"
"psubusb %%mm5, %%mm1 \n\t"
"pavgb %%mm1, %%mm0 \n\t"
"pavgb %%mm2, %%mm1 \n\t"
"psadbw (%2), %%mm0 \n\t"
"psadbw (%2,%3), %%mm1 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"paddw %%mm1, %%mm6 \n\t"
"movq %%mm2, %%mm0 \n\t"
"lea (%1,%3,2), %1 \n\t"
"lea (%2,%3,2), %2 \n\t"
"sub $2, %0 \n\t"
" jg 1b \n\t"
: "+r" (h), "+r" (blk1), "+r" (blk2)
: "r" ((x86_reg)stride)
);
}
 
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h)
{
x86_reg len= -(x86_reg)stride*h;
__asm__ volatile(
".p2align 4 \n\t"
"1: \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq (%2, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm2 \n\t"
"movq (%2, %%"REG_a"), %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddw %%mm0, %%mm1 \n\t"
"paddw %%mm2, %%mm3 \n\t"
"movq (%3, %%"REG_a"), %%mm4 \n\t"
"movq (%3, %%"REG_a"), %%mm2 \n\t"
"paddw %%mm5, %%mm1 \n\t"
"paddw %%mm5, %%mm3 \n\t"
"psrlw $1, %%mm1 \n\t"
"psrlw $1, %%mm3 \n\t"
"packuswb %%mm3, %%mm1 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm2, %%mm1 \n\t"
"por %%mm4, %%mm1 \n\t"
"movq %%mm1, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"add %4, %%"REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride)
);
}
 
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
x86_reg len= -(x86_reg)stride*h;
__asm__ volatile(
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm3, %%mm1 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%2, %%"REG_a"), %%mm2 \n\t"
"movq 1(%2, %%"REG_a"), %%mm4 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddw %%mm4, %%mm2 \n\t"
"paddw %%mm5, %%mm3 \n\t"
"movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm3, %%mm1 \n\t"
"paddw %%mm5, %%mm0 \n\t"
"paddw %%mm5, %%mm1 \n\t"
"movq (%3, %%"REG_a"), %%mm4 \n\t"
"movq (%3, %%"REG_a"), %%mm5 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"psubusb %%mm0, %%mm4 \n\t"
"psubusb %%mm5, %%mm0 \n\t"
"por %%mm4, %%mm0 \n\t"
"movq %%mm0, %%mm4 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpckhbw %%mm7, %%mm4 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"paddw %%mm4, %%mm6 \n\t"
"movq %%mm2, %%mm0 \n\t"
"movq %%mm3, %%mm1 \n\t"
"add %4, %%"REG_a" \n\t"
" js 1b \n\t"
: "+a" (len)
: "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride)
);
}
 
static inline int sum_mmx(void)
{
int ret;
__asm__ volatile(
"movq %%mm6, %%mm0 \n\t"
"psrlq $32, %%mm6 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"movq %%mm6, %%mm0 \n\t"
"psrlq $16, %%mm6 \n\t"
"paddw %%mm0, %%mm6 \n\t"
"movd %%mm6, %0 \n\t"
: "=r" (ret)
);
return ret&0xFFFF;
}
 
static inline int sum_mmxext(void)
{
int ret;
__asm__ volatile(
"movd %%mm6, %0 \n\t"
: "=r" (ret)
);
return ret;
}
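 
/* sum_mmx() folds the four packed 16-bit partial sums in %mm6 by shifting and
 * adding; with psadbw (mmxext) the accumulator already holds a single scalar
 * sum in its low word, so sum_mmxext() can read it out with a plain movd. */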
 
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
sad8_2_mmx(blk1, blk1+1, blk2, stride, h);
}
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h)
{
sad8_2_mmx(blk1, blk1+stride, blk2, stride, h);
}
 
 
#define PIX_SAD(suf)\
static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
av_assert2(h==8);\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t":);\
\
sad8_1_ ## suf(blk1, blk2, stride, 8);\
\
return sum_ ## suf();\
}\
static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
av_assert2(h==8);\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"movq %0, %%mm5 \n\t"\
:: "m"(round_tab[1]) \
);\
\
sad8_x2a_ ## suf(blk1, blk2, stride, 8);\
\
return sum_ ## suf();\
}\
\
static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
av_assert2(h==8);\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"movq %0, %%mm5 \n\t"\
:: "m"(round_tab[1]) \
);\
\
sad8_y2a_ ## suf(blk1, blk2, stride, 8);\
\
return sum_ ## suf();\
}\
\
static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
av_assert2(h==8);\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
::);\
\
sad8_4_ ## suf(blk1, blk2, stride, 8);\
\
return sum_ ## suf();\
}\
\
static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t":);\
\
sad8_1_ ## suf(blk1 , blk2 , stride, h);\
sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
\
return sum_ ## suf();\
}\
static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"movq %0, %%mm5 \n\t"\
:: "m"(round_tab[1]) \
);\
\
sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\
sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\
\
return sum_ ## suf();\
}\
static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
"movq %0, %%mm5 \n\t"\
:: "m"(round_tab[1]) \
);\
\
sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\
sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\
\
return sum_ ## suf();\
}\
static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\
{\
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
"pxor %%mm6, %%mm6 \n\t"\
::);\
\
sad8_4_ ## suf(blk1 , blk2 , stride, h);\
sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
\
return sum_ ## suf();\
}\
 
PIX_SAD(mmx)
PIX_SAD(mmxext)
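 
/* PIX_SAD() instantiates, for one instruction set, the comparators for the
 * integer position (sad8/sad16) and the averaged half-pel positions
 * (x2: horizontal, y2: vertical, xy2: diagonal).  ff_dsputil_init_pix_mmx()
 * below wires them into c->pix_abs[][] and c->sad[]. */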
 
#endif /* HAVE_INLINE_ASM */
 
av_cold void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx)
{
#if HAVE_INLINE_ASM
int cpu_flags = av_get_cpu_flags();
 
if (INLINE_MMX(cpu_flags)) {
c->pix_abs[0][0] = sad16_mmx;
c->pix_abs[0][1] = sad16_x2_mmx;
c->pix_abs[0][2] = sad16_y2_mmx;
c->pix_abs[0][3] = sad16_xy2_mmx;
c->pix_abs[1][0] = sad8_mmx;
c->pix_abs[1][1] = sad8_x2_mmx;
c->pix_abs[1][2] = sad8_y2_mmx;
c->pix_abs[1][3] = sad8_xy2_mmx;
 
c->sad[0]= sad16_mmx;
c->sad[1]= sad8_mmx;
}
if (INLINE_MMXEXT(cpu_flags)) {
c->pix_abs[0][0] = sad16_mmxext;
c->pix_abs[1][0] = sad8_mmxext;
 
c->sad[0] = sad16_mmxext;
c->sad[1] = sad8_mmxext;
 
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
c->pix_abs[0][1] = sad16_x2_mmxext;
c->pix_abs[0][2] = sad16_y2_mmxext;
c->pix_abs[0][3] = sad16_xy2_mmxext;
c->pix_abs[1][1] = sad8_x2_mmxext;
c->pix_abs[1][2] = sad8_y2_mmxext;
c->pix_abs[1][3] = sad8_xy2_mmxext;
}
}
if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
c->sad[0]= sad16_sse2;
}
#endif /* HAVE_INLINE_ASM */
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/mpeg4qpel.asm
0,0 → 1,560
;******************************************************************************
;* mpeg4 qpel
;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at>
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
cextern pb_1
cextern pw_3
cextern pw_15
cextern pw_16
cextern pw_20
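 
; pw_20 and pw_3 are the tap weights of the MPEG-4 8-tap half-pel lowpass
; filter (-1, 3, -6, 20, 20, -6, 3, -1); pw_16/pw_15 are the rounding terms
; added before the final >>5 (15 is used by the no_rnd variants).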
 
 
SECTION_TEXT
 
; put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PUT_NO_RND_PIXELS8_L2 0
cglobal put_no_rnd_pixels8_l2, 6,6
movsxdifnidn r4, r4d
movsxdifnidn r3, r3d
pcmpeqb m6, m6
test r5d, 1
je .loop
mova m0, [r1]
mova m1, [r2]
add r1, r4
add r2, 8
pxor m0, m6
pxor m1, m6
PAVGB m0, m1
pxor m0, m6
mova [r0], m0
add r0, r3
dec r5d
.loop:
mova m0, [r1]
add r1, r4
mova m1, [r1]
add r1, r4
mova m2, [r2]
mova m3, [r2+8]
pxor m0, m6
pxor m1, m6
pxor m2, m6
pxor m3, m6
PAVGB m0, m2
PAVGB m1, m3
pxor m0, m6
pxor m1, m6
mova [r0], m0
add r0, r3
mova [r0], m1
add r0, r3
mova m0, [r1]
add r1, r4
mova m1, [r1]
add r1, r4
mova m2, [r2+16]
mova m3, [r2+24]
pxor m0, m6
pxor m1, m6
pxor m2, m6
pxor m3, m6
PAVGB m0, m2
PAVGB m1, m3
pxor m0, m6
pxor m1, m6
mova [r0], m0
add r0, r3
mova [r0], m1
add r0, r3
add r2, 32
sub r5d, 4
jne .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PUT_NO_RND_PIXELS8_L2
 
 
; put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PUT_NO_RND_PIXELS16_l2 0
cglobal put_no_rnd_pixels16_l2, 6,6
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
pcmpeqb m6, m6
test r5d, 1
je .loop
mova m0, [r1]
mova m1, [r1+8]
mova m2, [r2]
mova m3, [r2+8]
pxor m0, m6
pxor m1, m6
pxor m2, m6
pxor m3, m6
PAVGB m0, m2
PAVGB m1, m3
pxor m0, m6
pxor m1, m6
add r1, r4
add r2, 16
mova [r0], m0
mova [r0+8], m1
add r0, r3
dec r5d
.loop:
mova m0, [r1]
mova m1, [r1+8]
add r1, r4
mova m2, [r2]
mova m3, [r2+8]
pxor m0, m6
pxor m1, m6
pxor m2, m6
pxor m3, m6
PAVGB m0, m2
PAVGB m1, m3
pxor m0, m6
pxor m1, m6
mova [r0], m0
mova [r0+8], m1
add r0, r3
mova m0, [r1]
mova m1, [r1+8]
add r1, r4
mova m2, [r2+16]
mova m3, [r2+24]
pxor m0, m6
pxor m1, m6
pxor m2, m6
pxor m3, m6
PAVGB m0, m2
PAVGB m1, m3
pxor m0, m6
pxor m1, m6
mova [r0], m0
mova [r0+8], m1
add r0, r3
add r2, 32
sub r5d, 2
jne .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PUT_NO_RND_PIXELS16_l2
INIT_MMX 3dnow
PUT_NO_RND_PIXELS16_l2
 
%macro MPEG4_QPEL16_H_LOWPASS 1
cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
pxor m7, m7
.loop:
mova m0, [r1]
mova m1, m0
mova m2, m0
punpcklbw m0, m7
punpckhbw m1, m7
pshufw m5, m0, 0x90
pshufw m6, m0, 0x41
mova m3, m2
mova m4, m2
psllq m2, 8
psllq m3, 16
psllq m4, 24
punpckhbw m2, m7
punpckhbw m3, m7
punpckhbw m4, m7
paddw m5, m3
paddw m6, m2
paddw m5, m5
psubw m6, m5
pshufw m5, m0, 6
pmullw m6, [pw_3]
paddw m0, m4
paddw m5, m1
pmullw m0, [pw_20]
psubw m0, m5
paddw m6, [PW_ROUND]
paddw m0, m6
psraw m0, 5
mova [rsp+8], m0
mova m0, [r1+5]
mova m5, m0
mova m6, m0
psrlq m0, 8
psrlq m5, 16
punpcklbw m0, m7
punpcklbw m5, m7
paddw m2, m0
paddw m3, m5
paddw m2, m2
psubw m3, m2
mova m2, m6
psrlq m6, 24
punpcklbw m2, m7
punpcklbw m6, m7
pmullw m3, [pw_3]
paddw m1, m2
paddw m4, m6
pmullw m1, [pw_20]
psubw m3, m4
paddw m1, [PW_ROUND]
paddw m3, m1
psraw m3, 5
mova m1, [rsp+8]
packuswb m1, m3
OP_MOV [r0], m1, m4
mova m1, [r1+9]
mova m4, m1
mova m3, m1
psrlq m1, 8
psrlq m4, 16
punpcklbw m1, m7
punpcklbw m4, m7
paddw m5, m1
paddw m0, m4
paddw m5, m5
psubw m0, m5
mova m5, m3
psrlq m3, 24
pmullw m0, [pw_3]
punpcklbw m3, m7
paddw m2, m3
psubw m0, m2
mova m2, m5
punpcklbw m2, m7
punpckhbw m5, m7
paddw m6, m2
pmullw m6, [pw_20]
paddw m0, [PW_ROUND]
paddw m0, m6
psraw m0, 5
paddw m3, m5
pshufw m6, m5, 0xf9
paddw m6, m4
pshufw m4, m5, 0xbe
pshufw m5, m5, 0x6f
paddw m4, m1
paddw m5, m2
paddw m6, m6
psubw m4, m6
pmullw m3, [pw_20]
pmullw m4, [pw_3]
psubw m3, m5
paddw m4, [PW_ROUND]
paddw m4, m3
psraw m4, 5
packuswb m0, m4
OP_MOV [r0+8], m0, m4
add r1, r3
add r0, r2
dec r4d
jne .loop
REP_RET
%endmacro
 
%macro PUT_OP 2-3
mova %1, %2
%endmacro
 
%macro AVG_OP 2-3
mova %3, %1
pavgb %2, %3
mova %1, %2
%endmacro
 
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OP
MPEG4_QPEL16_H_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OP
MPEG4_QPEL16_H_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OP
MPEG4_QPEL16_H_LOWPASS put_no_rnd
 
 
 
%macro MPEG4_QPEL8_H_LOWPASS 1
cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
pxor m7, m7
.loop:
mova m0, [r1]
mova m1, m0
mova m2, m0
punpcklbw m0, m7
punpckhbw m1, m7
pshufw m5, m0, 0x90
pshufw m6, m0, 0x41
mova m3, m2
mova m4, m2
psllq m2, 8
psllq m3, 16
psllq m4, 24
punpckhbw m2, m7
punpckhbw m3, m7
punpckhbw m4, m7
paddw m5, m3
paddw m6, m2
paddw m5, m5
psubw m6, m5
pshufw m5, m0, 0x6
pmullw m6, [pw_3]
paddw m0, m4
paddw m5, m1
pmullw m0, [pw_20]
psubw m0, m5
paddw m6, [PW_ROUND]
paddw m0, m6
psraw m0, 5
movh m5, [r1+5]
punpcklbw m5, m7
pshufw m6, m5, 0xf9
paddw m1, m5
paddw m2, m6
pshufw m6, m5, 0xbe
pshufw m5, m5, 0x6f
paddw m3, m6
paddw m4, m5
paddw m2, m2
psubw m3, m2
pmullw m1, [pw_20]
pmullw m3, [pw_3]
psubw m3, m4
paddw m1, [PW_ROUND]
paddw m3, m1
psraw m3, 5
packuswb m0, m3
OP_MOV [r0], m0, m4
add r1, r3
add r0, r2
dec r4d
jne .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OP
MPEG4_QPEL8_H_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OP
MPEG4_QPEL8_H_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OP
MPEG4_QPEL8_H_LOWPASS put_no_rnd
 
 
 
%macro QPEL_V_LOW 5
paddw m0, m1
mova m4, [pw_20]
pmullw m4, m0
mova m0, %4
mova m5, %1
paddw m5, m0
psubw m4, m5
mova m5, %2
mova m6, %3
paddw m5, m3
paddw m6, m2
paddw m6, m6
psubw m5, m6
pmullw m5, [pw_3]
paddw m4, [PW_ROUND]
paddw m5, m4
psraw m5, 5
packuswb m5, m5
OP_MOV %5, m5, m7
SWAP 0,1,2,3
%endmacro
 
%macro MPEG4_QPEL16_V_LOWPASS 1
cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
 
mov r4d, 17
mov r5, rsp
pxor m7, m7
.looph:
mova m0, [r1]
mova m1, [r1]
mova m2, [r1+8]
mova m3, [r1+8]
punpcklbw m0, m7
punpckhbw m1, m7
punpcklbw m2, m7
punpckhbw m3, m7
mova [r5], m0
mova [r5+0x88], m1
mova [r5+0x110], m2
mova [r5+0x198], m3
add r5, 8
add r1, r3
dec r4d
jne .looph
 
 
; NOTE: r1 CHANGES VALUES: r1 -> 4 - 14*dstStride
mov r4d, 4
mov r1, 4
neg r2
lea r1, [r1+r2*8]
lea r1, [r1+r2*4]
lea r1, [r1+r2*2]
neg r2
mov r5, rsp
.loopv:
pxor m7, m7
mova m0, [r5+ 0x0]
mova m1, [r5+ 0x8]
mova m2, [r5+0x10]
mova m3, [r5+0x18]
QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0]
QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0]
QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0]
QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0]
QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0]
QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2]
 
add r5, 0x88
add r0, r1
dec r4d
jne .loopv
REP_RET
%endmacro
 
%macro PUT_OPH 2-3
movh %1, %2
%endmacro
 
%macro AVG_OPH 2-3
movh %3, %1
pavgb %2, %3
movh %1, %2
%endmacro
 
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OPH
MPEG4_QPEL16_V_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OPH
MPEG4_QPEL16_V_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OPH
MPEG4_QPEL16_V_LOWPASS put_no_rnd
 
 
 
%macro MPEG4_QPEL8_V_LOWPASS 1
cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288
movsxdifnidn r2, r2d
movsxdifnidn r3, r3d
 
mov r4d, 9
mov r5, rsp
pxor m7, m7
.looph:
mova m0, [r1]
mova m1, [r1]
punpcklbw m0, m7
punpckhbw m1, m7
mova [r5], m0
mova [r5+0x48], m1
add r5, 8
add r1, r3
dec r4d
jne .looph
 
 
; NOTE: r1 CHANGES VALUES: r1 -> 4 - 6*dstStride
mov r4d, 2
mov r1, 4
neg r2
lea r1, [r1+r2*4]
lea r1, [r1+r2*2]
neg r2
mov r5, rsp
.loopv:
pxor m7, m7
mova m0, [r5+ 0x0]
mova m1, [r5+ 0x8]
mova m2, [r5+0x10]
mova m3, [r5+0x18]
QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0]
QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0]
QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0]
QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2]
lea r0, [r0+r2*2]
QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0]
QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2]
 
add r5, 0x48
add r0, r1
dec r4d
jne .loopv
REP_RET
%endmacro
 
INIT_MMX mmxext
%define PW_ROUND pw_16
%define OP_MOV PUT_OPH
MPEG4_QPEL8_V_LOWPASS put
%define PW_ROUND pw_16
%define OP_MOV AVG_OPH
MPEG4_QPEL8_V_LOWPASS avg
%define PW_ROUND pw_15
%define OP_MOV PUT_OPH
MPEG4_QPEL8_V_LOWPASS put_no_rnd
/contrib/sdk/sources/ffmpeg/libavcodec/x86/mpegaudiodsp.c
0,0 → 1,277
/*
* MMX optimized MP3 decoding functions
* Copyright (c) 2010 Vitor Sessak
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/internal.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/mpegaudiodsp.h"
 
#define DECL(CPU)\
static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);
 
DECL(sse)
DECL(sse2)
DECL(sse3)
DECL(ssse3)
DECL(avx)
 
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
float *tmpbuf);
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
float *tmpbuf);
 
DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
 
#if HAVE_SSE2_INLINE
 
#define MACS(rt, ra, rb) rt+=(ra)*(rb)
#define MLSS(rt, ra, rb) rt-=(ra)*(rb)
 
#define SUM8(op, sum, w, p) \
{ \
op(sum, (w)[0 * 64], (p)[0 * 64]); \
op(sum, (w)[1 * 64], (p)[1 * 64]); \
op(sum, (w)[2 * 64], (p)[2 * 64]); \
op(sum, (w)[3 * 64], (p)[3 * 64]); \
op(sum, (w)[4 * 64], (p)[4 * 64]); \
op(sum, (w)[5 * 64], (p)[5 * 64]); \
op(sum, (w)[6 * 64], (p)[6 * 64]); \
op(sum, (w)[7 * 64], (p)[7 * 64]); \
}
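 
/* MACS/MLSS are scalar multiply-accumulate/-subtract helpers; SUM8 unrolls
 * the eight polyphase window taps, spaced 64 floats apart in both the window
 * and the sample buffer.  apply_window_mp3() uses them for the handful of
 * terms the vectorised apply_window() pass does not cover. */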
 
static void apply_window(const float *buf, const float *win1,
const float *win2, float *sum1, float *sum2, int len)
{
x86_reg count = - 4*len;
const float *win1a = win1+len;
const float *win2a = win2+len;
const float *bufa = buf+len;
float *sum1a = sum1+len;
float *sum2a = sum2+len;
 
 
#define MULT(a, b) \
"movaps " #a "(%1,%0), %%xmm1 \n\t" \
"movaps " #a "(%3,%0), %%xmm2 \n\t" \
"mulps %%xmm2, %%xmm1 \n\t" \
"subps %%xmm1, %%xmm0 \n\t" \
"mulps " #b "(%2,%0), %%xmm2 \n\t" \
"subps %%xmm2, %%xmm4 \n\t" \
 
__asm__ volatile(
"1: \n\t"
"xorps %%xmm0, %%xmm0 \n\t"
"xorps %%xmm4, %%xmm4 \n\t"
 
MULT( 0, 0)
MULT( 256, 64)
MULT( 512, 128)
MULT( 768, 192)
MULT(1024, 256)
MULT(1280, 320)
MULT(1536, 384)
MULT(1792, 448)
 
"movaps %%xmm0, (%4,%0) \n\t"
"movaps %%xmm4, (%5,%0) \n\t"
"add $16, %0 \n\t"
"jl 1b \n\t"
:"+&r"(count)
:"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
);
 
#undef MULT
}
 
static void apply_window_mp3(float *in, float *win, int *unused, float *out,
int incr)
{
LOCAL_ALIGNED_16(float, suma, [17]);
LOCAL_ALIGNED_16(float, sumb, [17]);
LOCAL_ALIGNED_16(float, sumc, [17]);
LOCAL_ALIGNED_16(float, sumd, [17]);
 
float sum;
 
/* copy to avoid wrap */
__asm__ volatile(
"movaps 0(%0), %%xmm0 \n\t" \
"movaps 16(%0), %%xmm1 \n\t" \
"movaps 32(%0), %%xmm2 \n\t" \
"movaps 48(%0), %%xmm3 \n\t" \
"movaps %%xmm0, 0(%1) \n\t" \
"movaps %%xmm1, 16(%1) \n\t" \
"movaps %%xmm2, 32(%1) \n\t" \
"movaps %%xmm3, 48(%1) \n\t" \
"movaps 64(%0), %%xmm0 \n\t" \
"movaps 80(%0), %%xmm1 \n\t" \
"movaps 96(%0), %%xmm2 \n\t" \
"movaps 112(%0), %%xmm3 \n\t" \
"movaps %%xmm0, 64(%1) \n\t" \
"movaps %%xmm1, 80(%1) \n\t" \
"movaps %%xmm2, 96(%1) \n\t" \
"movaps %%xmm3, 112(%1) \n\t"
::"r"(in), "r"(in+512)
:"memory"
);
 
apply_window(in + 16, win , win + 512, suma, sumc, 16);
apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
 
SUM8(MACS, suma[0], win + 32, in + 48);
 
sumc[ 0] = 0;
sumb[16] = 0;
sumd[16] = 0;
 
#define SUMS(suma, sumb, sumc, sumd, out1, out2) \
"movups " #sumd "(%4), %%xmm0 \n\t" \
"shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
"subps " #suma "(%1), %%xmm0 \n\t" \
"movaps %%xmm0," #out1 "(%0) \n\t" \
\
"movups " #sumc "(%3), %%xmm0 \n\t" \
"shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
"addps " #sumb "(%2), %%xmm0 \n\t" \
"movaps %%xmm0," #out2 "(%0) \n\t"
 
if (incr == 1) {
__asm__ volatile(
SUMS( 0, 48, 4, 52, 0, 112)
SUMS(16, 32, 20, 36, 16, 96)
SUMS(32, 16, 36, 20, 32, 80)
SUMS(48, 0, 52, 4, 48, 64)
 
:"+&r"(out)
:"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
:"memory"
);
out += 16*incr;
} else {
int j;
float *out2 = out + 32 * incr;
out[0 ] = -suma[ 0];
out += incr;
out2 -= incr;
for(j=1;j<16;j++) {
*out = -suma[ j] + sumd[16-j];
*out2 = sumb[16-j] + sumc[ j];
out += incr;
out2 -= incr;
}
}
 
sum = 0;
SUM8(MLSS, sum, win + 16 + 32, in + 32);
*out = sum;
}
 
#endif /* HAVE_SSE2_INLINE */
 
#if HAVE_YASM
#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \
int count, int switch_point, int block_type) \
{ \
int align_end = count - (count & 3); \
int j; \
for (j = 0; j < align_end; j+= 4) { \
LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \
float *win = mdct_win_sse[switch_point && j < 4][block_type]; \
/* apply window & overlap with previous buffer */ \
\
/* select window */ \
ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \
in += 4*18; \
buf += 4*18; \
out += 4; \
} \
for (; j < count; j++) { \
/* apply window & overlap with previous buffer */ \
\
/* select window */ \
int win_idx = (switch_point && j < 2) ? 0 : block_type; \
float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \
\
ff_imdct36_float_ ## CPU1(out, buf, in, win); \
\
in += 18; \
buf++; \
out++; \
} \
}
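 
/* The SIMD path consumes blocks four at a time through
 * ff_four_imdct36_float_*() using the interleaved windows prepared in
 * mdct_win_sse[][] (filled in ff_mpadsp_init_x86() below); any remainder is
 * handled one block at a time by ff_imdct36_float_*(). */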
 
#if HAVE_SSE
DECL_IMDCT_BLOCKS(sse,sse)
DECL_IMDCT_BLOCKS(sse2,sse)
DECL_IMDCT_BLOCKS(sse3,sse)
DECL_IMDCT_BLOCKS(ssse3,sse)
#endif
#if HAVE_AVX_EXTERNAL
DECL_IMDCT_BLOCKS(avx,avx)
#endif
#endif /* HAVE_YASM */
 
av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
 
int i, j;
for (j = 0; j < 4; j++) {
for (i = 0; i < 40; i ++) {
mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
}
}
 
#if HAVE_SSE2_INLINE
if (cpu_flags & AV_CPU_FLAG_SSE2) {
s->apply_window_float = apply_window_mp3;
}
#endif /* HAVE_SSE2_INLINE */
 
#if HAVE_YASM
if (EXTERNAL_SSE(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse;
}
if (EXTERNAL_SSE2(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse2;
}
if (EXTERNAL_SSE3(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_sse3;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_ssse3;
}
if (EXTERNAL_AVX(cpu_flags)) {
s->imdct36_blocks_float = imdct36_blocks_avx;
}
#endif /* HAVE_YASM */
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/mpegvideo.c
0,0 → 1,577
/*
* Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru>
* h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_x86.h"
 
#if HAVE_MMX_INLINE
 
static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg level, qmul, qadd, nCoeffs;
 
qmul = qscale << 1;
 
av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
 
if (!s->h263_aic) {
if (n < 4)
level = block[0] * s->y_dc_scale;
else
level = block[0] * s->c_dc_scale;
qadd = (qscale - 1) | 1;
}else{
qadd = 0;
level= block[0];
}
if(s->ac_pred)
nCoeffs=63;
else
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
 
__asm__ volatile(
"movd %1, %%mm6 \n\t" //qmul
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"movd %2, %%mm5 \n\t" //qadd
"pxor %%mm7, %%mm7 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"psubw %%mm5, %%mm7 \n\t"
"pxor %%mm4, %%mm4 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %3), %%mm0 \n\t"
"movq 8(%0, %3), %%mm1 \n\t"
 
"pmullw %%mm6, %%mm0 \n\t"
"pmullw %%mm6, %%mm1 \n\t"
 
"movq (%0, %3), %%mm2 \n\t"
"movq 8(%0, %3), %%mm3 \n\t"
 
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
 
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
 
"paddw %%mm7, %%mm0 \n\t"
"paddw %%mm7, %%mm1 \n\t"
 
"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"
 
"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
 
"pandn %%mm2, %%mm0 \n\t"
"pandn %%mm3, %%mm1 \n\t"
 
"movq %%mm0, (%0, %3) \n\t"
"movq %%mm1, 8(%0, %3) \n\t"
 
"add $16, %3 \n\t"
"jng 1b \n\t"
::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
: "memory"
);
block[0]= level;
}
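 
/* Per coefficient (up to nCoeffs) the loop above computes, in scalar terms:
 *
 *     if      (block[i] > 0) block[i] = block[i] * qmul + qadd;
 *     else if (block[i] < 0) block[i] = block[i] * qmul - qadd;
 *     // zero coefficients stay zero
 *
 * with qmul = 2*qscale and qadd = (qscale - 1) | 1, i.e. the H.263/MPEG-4
 * inverse quantisation; the DC coefficient is restored separately.  The
 * inter variant below runs the same loop without the special DC handling. */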
 
 
static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg qmul, qadd, nCoeffs;
 
qmul = qscale << 1;
qadd = (qscale - 1) | 1;
 
av_assert2(s->block_last_index[n]>=0 || s->h263_aic);
 
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ];
 
__asm__ volatile(
"movd %1, %%mm6 \n\t" //qmul
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"movd %2, %%mm5 \n\t" //qadd
"pxor %%mm7, %%mm7 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"packssdw %%mm5, %%mm5 \n\t"
"psubw %%mm5, %%mm7 \n\t"
"pxor %%mm4, %%mm4 \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %3), %%mm0 \n\t"
"movq 8(%0, %3), %%mm1 \n\t"
 
"pmullw %%mm6, %%mm0 \n\t"
"pmullw %%mm6, %%mm1 \n\t"
 
"movq (%0, %3), %%mm2 \n\t"
"movq 8(%0, %3), %%mm3 \n\t"
 
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
 
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
 
"paddw %%mm7, %%mm0 \n\t"
"paddw %%mm7, %%mm1 \n\t"
 
"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"
 
"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0
 
"pandn %%mm2, %%mm0 \n\t"
"pandn %%mm3, %%mm1 \n\t"
 
"movq %%mm0, (%0, %3) \n\t"
"movq %%mm1, 8(%0, %3) \n\t"
 
"add $16, %3 \n\t"
"jng 1b \n\t"
::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
: "memory"
);
}
 
static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
int block0;
 
av_assert2(s->block_last_index[n]>=0);
 
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
 
if (n < 4)
block0 = block[0] * s->y_dc_scale;
else
block0 = block[0] * s->c_dc_scale;
/* XXX: only mpeg1 */
quant_matrix = s->intra_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"REG_a"), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $3, %%mm0 \n\t"
"psraw $3, %%mm1 \n\t"
"psubw %%mm7, %%mm0 \n\t"
"psubw %%mm7, %%mm1 \n\t"
"por %%mm7, %%mm0 \n\t"
"por %%mm7, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%"REG_a") \n\t"
"movq %%mm5, 8(%0, %%"REG_a") \n\t"
 
"add $16, %%"REG_a" \n\t"
"js 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
: "%"REG_a, "memory"
);
block[0]= block0;
}
 
static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
 
av_assert2(s->block_last_index[n]>=0);
 
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1;
 
quant_matrix = s->inter_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"REG_a"), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
"paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1
"paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1
"pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
"pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $4, %%mm0 \n\t"
"psraw $4, %%mm1 \n\t"
"psubw %%mm7, %%mm0 \n\t"
"psubw %%mm7, %%mm1 \n\t"
"por %%mm7, %%mm0 \n\t"
"por %%mm7, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%"REG_a") \n\t"
"movq %%mm5, 8(%0, %%"REG_a") \n\t"
 
"add $16, %%"REG_a" \n\t"
"js 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
: "%"REG_a, "memory"
);
}
 
static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
int block0;
 
av_assert2(s->block_last_index[n]>=0);
 
if(s->alternate_scan) nCoeffs= 63; //FIXME
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
 
if (n < 4)
block0 = block[0] * s->y_dc_scale;
else
block0 = block[0] * s->c_dc_scale;
quant_matrix = s->intra_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlw $15, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"REG_a"), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psraw $3, %%mm0 \n\t"
"psraw $3, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"movq %%mm4, (%0, %%"REG_a") \n\t"
"movq %%mm5, 8(%0, %%"REG_a") \n\t"
 
"add $16, %%"REG_a" \n\t"
"jng 1b \n\t"
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
: "%"REG_a, "memory"
);
block[0]= block0;
// Note: we do not do mismatch control for intra blocks, as errors cannot accumulate there
}
 
static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
int16_t *block, int n, int qscale)
{
x86_reg nCoeffs;
const uint16_t *quant_matrix;
 
av_assert2(s->block_last_index[n]>=0);
 
if(s->alternate_scan) nCoeffs= 63; //FIXME
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ];
 
quant_matrix = s->inter_matrix;
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"psrlq $48, %%mm7 \n\t"
"movd %2, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"packssdw %%mm6, %%mm6 \n\t"
"mov %3, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
"movq (%0, %%"REG_a"), %%mm0 \n\t"
"movq 8(%0, %%"REG_a"), %%mm1 \n\t"
"movq (%1, %%"REG_a"), %%mm4 \n\t"
"movq 8(%1, %%"REG_a"), %%mm5 \n\t"
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i]
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i]
"pxor %%mm2, %%mm2 \n\t"
"pxor %%mm3, %%mm3 \n\t"
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t" // abs(block[i])
"psubw %%mm3, %%mm1 \n\t" // abs(block[i])
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q
"paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q
"paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q
"pxor %%mm4, %%mm4 \n\t"
"pxor %%mm5, %%mm5 \n\t" // FIXME slow
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
"psrlw $4, %%mm0 \n\t"
"psrlw $4, %%mm1 \n\t"
"pxor %%mm2, %%mm0 \n\t"
"pxor %%mm3, %%mm1 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm3, %%mm1 \n\t"
"pandn %%mm0, %%mm4 \n\t"
"pandn %%mm1, %%mm5 \n\t"
"pxor %%mm4, %%mm7 \n\t"
"pxor %%mm5, %%mm7 \n\t"
"movq %%mm4, (%0, %%"REG_a") \n\t"
"movq %%mm5, 8(%0, %%"REG_a") \n\t"
 
"add $16, %%"REG_a" \n\t"
"jng 1b \n\t"
"movd 124(%0, %3), %%mm0 \n\t"
"movq %%mm7, %%mm6 \n\t"
"psrlq $32, %%mm7 \n\t"
"pxor %%mm6, %%mm7 \n\t"
"movq %%mm7, %%mm6 \n\t"
"psrlq $16, %%mm7 \n\t"
"pxor %%mm6, %%mm7 \n\t"
"pslld $31, %%mm7 \n\t"
"psrlq $15, %%mm7 \n\t"
"pxor %%mm7, %%mm0 \n\t"
"movd %%mm0, 124(%0, %3) \n\t"
 
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
: "%"REG_a, "memory"
);
}
 
static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){
const int intra= s->mb_intra;
int *sum= s->dct_error_sum[intra];
uint16_t *offset= s->dct_offset[intra];
 
s->dct_count[intra]++;
 
__asm__ volatile(
"pxor %%mm7, %%mm7 \n\t"
"1: \n\t"
"pxor %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"movq (%0), %%mm2 \n\t"
"movq 8(%0), %%mm3 \n\t"
"pcmpgtw %%mm2, %%mm0 \n\t"
"pcmpgtw %%mm3, %%mm1 \n\t"
"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"
"psubw %%mm0, %%mm2 \n\t"
"psubw %%mm1, %%mm3 \n\t"
"movq %%mm2, %%mm4 \n\t"
"movq %%mm3, %%mm5 \n\t"
"psubusw (%2), %%mm2 \n\t"
"psubusw 8(%2), %%mm3 \n\t"
"pxor %%mm0, %%mm2 \n\t"
"pxor %%mm1, %%mm3 \n\t"
"psubw %%mm0, %%mm2 \n\t"
"psubw %%mm1, %%mm3 \n\t"
"movq %%mm2, (%0) \n\t"
"movq %%mm3, 8(%0) \n\t"
"movq %%mm4, %%mm2 \n\t"
"movq %%mm5, %%mm3 \n\t"
"punpcklwd %%mm7, %%mm4 \n\t"
"punpckhwd %%mm7, %%mm2 \n\t"
"punpcklwd %%mm7, %%mm5 \n\t"
"punpckhwd %%mm7, %%mm3 \n\t"
"paddd (%1), %%mm4 \n\t"
"paddd 8(%1), %%mm2 \n\t"
"paddd 16(%1), %%mm5 \n\t"
"paddd 24(%1), %%mm3 \n\t"
"movq %%mm4, (%1) \n\t"
"movq %%mm2, 8(%1) \n\t"
"movq %%mm5, 16(%1) \n\t"
"movq %%mm3, 24(%1) \n\t"
"add $16, %0 \n\t"
"add $32, %1 \n\t"
"add $16, %2 \n\t"
"cmp %3, %0 \n\t"
" jb 1b \n\t"
: "+r" (block), "+r" (sum), "+r" (offset)
: "r"(block+64)
);
}
 
static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){
const int intra= s->mb_intra;
int *sum= s->dct_error_sum[intra];
uint16_t *offset= s->dct_offset[intra];
 
s->dct_count[intra]++;
 
__asm__ volatile(
"pxor %%xmm7, %%xmm7 \n\t"
"1: \n\t"
"pxor %%xmm0, %%xmm0 \n\t"
"pxor %%xmm1, %%xmm1 \n\t"
"movdqa (%0), %%xmm2 \n\t"
"movdqa 16(%0), %%xmm3 \n\t"
"pcmpgtw %%xmm2, %%xmm0 \n\t"
"pcmpgtw %%xmm3, %%xmm1 \n\t"
"pxor %%xmm0, %%xmm2 \n\t"
"pxor %%xmm1, %%xmm3 \n\t"
"psubw %%xmm0, %%xmm2 \n\t"
"psubw %%xmm1, %%xmm3 \n\t"
"movdqa %%xmm2, %%xmm4 \n\t"
"movdqa %%xmm3, %%xmm5 \n\t"
"psubusw (%2), %%xmm2 \n\t"
"psubusw 16(%2), %%xmm3 \n\t"
"pxor %%xmm0, %%xmm2 \n\t"
"pxor %%xmm1, %%xmm3 \n\t"
"psubw %%xmm0, %%xmm2 \n\t"
"psubw %%xmm1, %%xmm3 \n\t"
"movdqa %%xmm2, (%0) \n\t"
"movdqa %%xmm3, 16(%0) \n\t"
"movdqa %%xmm4, %%xmm6 \n\t"
"movdqa %%xmm5, %%xmm0 \n\t"
"punpcklwd %%xmm7, %%xmm4 \n\t"
"punpckhwd %%xmm7, %%xmm6 \n\t"
"punpcklwd %%xmm7, %%xmm5 \n\t"
"punpckhwd %%xmm7, %%xmm0 \n\t"
"paddd (%1), %%xmm4 \n\t"
"paddd 16(%1), %%xmm6 \n\t"
"paddd 32(%1), %%xmm5 \n\t"
"paddd 48(%1), %%xmm0 \n\t"
"movdqa %%xmm4, (%1) \n\t"
"movdqa %%xmm6, 16(%1) \n\t"
"movdqa %%xmm5, 32(%1) \n\t"
"movdqa %%xmm0, 48(%1) \n\t"
"add $32, %0 \n\t"
"add $64, %1 \n\t"
"add $32, %2 \n\t"
"cmp %3, %0 \n\t"
" jb 1b \n\t"
: "+r" (block), "+r" (sum), "+r" (offset)
: "r"(block+64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}
 
#endif /* HAVE_MMX_INLINE */
 
av_cold void ff_MPV_common_init_x86(MpegEncContext *s)
{
#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
 
if (INLINE_MMX(cpu_flags)) {
s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx;
s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx;
s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx;
s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx;
if(!(s->flags & CODEC_FLAG_BITEXACT))
s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx;
s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx;
s->denoise_dct = denoise_dct_mmx;
}
if (INLINE_SSE2(cpu_flags)) {
s->denoise_dct = denoise_dct_sse2;
}
#endif /* HAVE_MMX_INLINE */
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/mpegvideoenc.c
0,0 → 1,107
/*
* The simplest mpeg encoder (well, it was the simplest!)
* Copyright (c) 2000,2001 Fabrice Bellard
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dct.h"
#include "libavcodec/mpegvideo.h"
#include "dsputil_x86.h"
 
extern uint16_t ff_inv_zigzag_direct16[64];
 
#if HAVE_MMX_INLINE
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_SSSE3 0
#define RENAME(a) a ## _MMX
#define RENAMEl(a) a ## _mmx
#include "mpegvideoenc_template.c"
#endif /* HAVE_MMX_INLINE */
 
#if HAVE_MMXEXT_INLINE
#undef COMPILE_TEMPLATE_SSSE3
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_MMXEXT
#define COMPILE_TEMPLATE_MMXEXT 1
#define COMPILE_TEMPLATE_SSE2 0
#define COMPILE_TEMPLATE_SSSE3 0
#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _MMXEXT
#define RENAMEl(a) a ## _mmxext
#include "mpegvideoenc_template.c"
#endif /* HAVE_MMXEXT_INLINE */
 
#if HAVE_SSE2_INLINE
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_SSSE3
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 1
#define COMPILE_TEMPLATE_SSSE3 0
#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _SSE2
#define RENAMEl(a) a ## _sse2
#include "mpegvideoenc_template.c"
#endif /* HAVE_SSE2_INLINE */
 
#if HAVE_SSSE3_INLINE
#undef COMPILE_TEMPLATE_MMXEXT
#undef COMPILE_TEMPLATE_SSE2
#undef COMPILE_TEMPLATE_SSSE3
#define COMPILE_TEMPLATE_MMXEXT 0
#define COMPILE_TEMPLATE_SSE2 1
#define COMPILE_TEMPLATE_SSSE3 1
#undef RENAME
#undef RENAMEl
#define RENAME(a) a ## _SSSE3
#define RENAMEl(a) a ## _sse2
#include "mpegvideoenc_template.c"
#endif /* HAVE_SSSE3_INLINE */
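 
/* mpegvideoenc_template.c is included once per instruction set above; each
 * pass redefines RENAME()/RENAMEl() and the COMPILE_TEMPLATE_* switches so
 * that the same template expands to dct_quantize_MMX, _MMXEXT, _SSE2 and
 * _SSSE3, selected at runtime in ff_dct_encode_init_x86() below. */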
 
av_cold void ff_dct_encode_init_x86(MpegEncContext *s)
{
const int dct_algo = s->avctx->dct_algo;
 
if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) {
#if HAVE_MMX_INLINE
int cpu_flags = av_get_cpu_flags();
if (INLINE_MMX(cpu_flags))
s->dct_quantize = dct_quantize_MMX;
#endif
#if HAVE_MMXEXT_INLINE
if (INLINE_MMXEXT(cpu_flags))
s->dct_quantize = dct_quantize_MMXEXT;
#endif
#if HAVE_SSE2_INLINE
if (INLINE_SSE2(cpu_flags))
s->dct_quantize = dct_quantize_SSE2;
#endif
#if HAVE_SSSE3_INLINE
if (INLINE_SSSE3(cpu_flags))
s->dct_quantize = dct_quantize_SSSE3;
#endif
}
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/mpegvideoenc_template.c
0,0 → 1,364
/*
* MPEG video MMX templates
*
* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#undef MMREG_WIDTH
#undef MM
#undef MOVQ
#undef SPREADW
#undef PMAXW
#undef PMAX
#undef SAVE_SIGN
#undef RESTORE_SIGN
 
#if COMPILE_TEMPLATE_SSE2
#define MMREG_WIDTH "16"
#define MM "%%xmm"
#define MOVQ "movdqa"
#define SPREADW(a) \
"pshuflw $0, "a", "a" \n\t"\
"punpcklwd "a", "a" \n\t"
#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
#define PMAX(a,b) \
"movhlps "a", "b" \n\t"\
PMAXW(b, a)\
"pshuflw $0x0E, "a", "b" \n\t"\
PMAXW(b, a)\
"pshuflw $0x01, "a", "b" \n\t"\
PMAXW(b, a)
#else
#define MMREG_WIDTH "8"
#define MM "%%mm"
#define MOVQ "movq"
#if COMPILE_TEMPLATE_MMXEXT
#define SPREADW(a) "pshufw $0, "a", "a" \n\t"
#define PMAXW(a,b) "pmaxsw "a", "b" \n\t"
#define PMAX(a,b) \
"pshufw $0x0E, "a", "b" \n\t"\
PMAXW(b, a)\
"pshufw $0x01, "a", "b" \n\t"\
PMAXW(b, a)
#else
#define SPREADW(a) \
"punpcklwd "a", "a" \n\t"\
"punpcklwd "a", "a" \n\t"
#define PMAXW(a,b) \
"psubusw "a", "b" \n\t"\
"paddw "a", "b" \n\t"
#define PMAX(a,b) \
"movq "a", "b" \n\t"\
"psrlq $32, "a" \n\t"\
PMAXW(b, a)\
"movq "a", "b" \n\t"\
"psrlq $16, "a" \n\t"\
PMAXW(b, a)
 
#endif
#endif
 
#if COMPILE_TEMPLATE_SSSE3
#define SAVE_SIGN(a,b) \
"movdqa "b", "a" \n\t"\
"pabsw "b", "b" \n\t"
#define RESTORE_SIGN(a,b) \
"psignw "a", "b" \n\t"
#else
#define SAVE_SIGN(a,b) \
"pxor "a", "a" \n\t"\
"pcmpgtw "b", "a" \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */\
"pxor "a", "b" \n\t"\
"psubw "a", "b" \n\t" /* ABS(block[i]) */
#define RESTORE_SIGN(a,b) \
"pxor "a", "b" \n\t"\
"psubw "a", "b" \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
#endif
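 
/* SAVE_SIGN turns block[i] into |block[i]| while keeping a sign mask in its
 * first argument (pabsw on SSSE3, otherwise the pcmpgtw/pxor/psubw
 * two's-complement trick); RESTORE_SIGN reapplies that sign to the quantised
 * value (psignw on SSSE3, pxor/psubw otherwise). */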
 
static int RENAME(dct_quantize)(MpegEncContext *s,
int16_t *block, int n,
int qscale, int *overflow)
{
x86_reg last_non_zero_p1;
int level=0, q; //=0 is because gcc says uninitialized ...
const uint16_t *qmat, *bias;
LOCAL_ALIGNED_16(int16_t, temp_block, [64]);
 
av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly?
 
//s->fdct (block);
RENAMEl(ff_fdct) (block); //cannot be anything else ...
 
if(s->dct_error_sum)
s->denoise_dct(s, block);
 
if (s->mb_intra) {
int dummy;
if (n < 4){
q = s->y_dc_scale;
bias = s->q_intra_matrix16[qscale][1];
qmat = s->q_intra_matrix16[qscale][0];
}else{
q = s->c_dc_scale;
bias = s->q_chroma_intra_matrix16[qscale][1];
qmat = s->q_chroma_intra_matrix16[qscale][0];
}
/* note: block[0] is assumed to be positive */
if (!s->h263_aic) {
__asm__ volatile (
"mul %%ecx \n\t"
: "=d" (level), "=a"(dummy)
: "a" ((block[0]>>2) + q), "c" (ff_inverse[q<<1])
);
} else
/* For AIC we skip quant/dequant of INTRADC */
level = (block[0] + 4)>>3;
 
block[0]=0; //avoid fake overflow
// temp_block[0] = (block[0] + (q >> 1)) / q;
last_non_zero_p1 = 1;
} else {
last_non_zero_p1 = 0;
bias = s->q_inter_matrix16[qscale][1];
qmat = s->q_inter_matrix16[qscale][0];
}
 
if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){
 
__asm__ volatile(
"movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1
SPREADW(MM"3")
"pxor "MM"7, "MM"7 \n\t" // 0
"pxor "MM"4, "MM"4 \n\t" // 0
MOVQ" (%2), "MM"5 \n\t" // qmat[0]
"pxor "MM"6, "MM"6 \n\t"
"psubw (%3), "MM"6 \n\t" // -bias[0]
"mov $-128, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i]
SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
"psubusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0]
"pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16
"por "MM"0, "MM"4 \n\t"
RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
MOVQ" "MM"0, (%5, %%"REG_a") \n\t"
"pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
MOVQ" (%4, %%"REG_a"), "MM"1 \n\t"
MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0
"pandn "MM"1, "MM"0 \n\t"
PMAXW(MM"0", MM"3")
"add $"MMREG_WIDTH", %%"REG_a" \n\t"
" js 1b \n\t"
PMAX(MM"3", MM"0")
"movd "MM"3, %%"REG_a" \n\t"
"movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat), "r" (bias),
"r" (ff_inv_zigzag_direct16+64), "r" (temp_block+64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}else{ // FMT_H263
__asm__ volatile(
"movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1
SPREADW(MM"3")
"pxor "MM"7, "MM"7 \n\t" // 0
"pxor "MM"4, "MM"4 \n\t" // 0
"mov $-128, %%"REG_a" \n\t"
".p2align 4 \n\t"
"1: \n\t"
MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i]
SAVE_SIGN(MM"1", MM"0") // ABS(block[i])
MOVQ" (%3, %%"REG_a"), "MM"6 \n\t" // bias[0]
"paddusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0]
MOVQ" (%2, %%"REG_a"), "MM"5 \n\t" // qmat[i]
"pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16
"por "MM"0, "MM"4 \n\t"
RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i])
MOVQ" "MM"0, (%5, %%"REG_a") \n\t"
"pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00
MOVQ" (%4, %%"REG_a"), "MM"1 \n\t"
MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0
"pandn "MM"1, "MM"0 \n\t"
PMAXW(MM"0", MM"3")
"add $"MMREG_WIDTH", %%"REG_a" \n\t"
" js 1b \n\t"
PMAX(MM"3", MM"0")
"movd "MM"3, %%"REG_a" \n\t"
"movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1
: "+a" (last_non_zero_p1)
: "r" (block+64), "r" (qmat+64), "r" (bias+64),
"r" (ff_inv_zigzag_direct16+64), "r" (temp_block+64)
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
"%xmm4", "%xmm5", "%xmm6", "%xmm7")
);
}
__asm__ volatile(
"movd %1, "MM"1 \n\t" // max_qcoeff
SPREADW(MM"1")
"psubusw "MM"1, "MM"4 \n\t"
"packuswb "MM"4, "MM"4 \n\t"
#if COMPILE_TEMPLATE_SSE2
"packuswb "MM"4, "MM"4 \n\t"
#endif
"movd "MM"4, %0 \n\t" // *overflow
: "=g" (*overflow)
: "g" (s->max_qcoeff)
);
 
if(s->mb_intra) block[0]= level;
else block[0]= temp_block[0];
 
if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){
if(last_non_zero_p1 <= 1) goto end;
block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08];
block[0x20] = temp_block[0x10];
if(last_non_zero_p1 <= 4) goto end;
block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02];
block[0x09] = temp_block[0x03];
if(last_non_zero_p1 <= 7) goto end;
block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11];
block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20];
if(last_non_zero_p1 <= 11) goto end;
block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12];
block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04];
block[0x0C] = temp_block[0x05];
if(last_non_zero_p1 <= 16) goto end;
block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13];
block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21];
block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30];
block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22];
if(last_non_zero_p1 <= 24) goto end;
block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14];
block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06];
block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E];
block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C];
if(last_non_zero_p1 <= 32) goto end;
block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A];
block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38];
block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32];
block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24];
if(last_non_zero_p1 <= 40) goto end;
block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16];
block[0x1D] = temp_block[0x0F]; block[0x2D] = temp_block[0x17];
block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25];
block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33];
if(last_non_zero_p1 <= 48) goto end;
block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D];
block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E];
if(last_non_zero_p1 <= 56) goto end;
block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C];
block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36];
block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37];
block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
}else if(s->dsp.idct_permutation_type == FF_LIBMPEG2_IDCT_PERM){
if(last_non_zero_p1 <= 1) goto end;
block[0x04] = temp_block[0x01];
block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
if(last_non_zero_p1 <= 4) goto end;
block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02];
block[0x05] = temp_block[0x03];
if(last_non_zero_p1 <= 7) goto end;
block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11];
block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
if(last_non_zero_p1 <= 11) goto end;
block[0x1C] = temp_block[0x19];
block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B];
block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05];
if(last_non_zero_p1 <= 16) goto end;
block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13];
block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21];
block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22];
if(last_non_zero_p1 <= 24) goto end;
block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14];
block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06];
block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E];
block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C];
if(last_non_zero_p1 <= 32) goto end;
block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A];
block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38];
block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32];
block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24];
if(last_non_zero_p1 <= 40) goto end;
block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16];
block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25];
block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33];
if(last_non_zero_p1 <= 48) goto end;
block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B];
block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D];
block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E];
if(last_non_zero_p1 <= 56) goto end;
block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C];
block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36];
block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
}else{
if(last_non_zero_p1 <= 1) goto end;
block[0x01] = temp_block[0x01];
block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10];
if(last_non_zero_p1 <= 4) goto end;
block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02];
block[0x03] = temp_block[0x03];
if(last_non_zero_p1 <= 7) goto end;
block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11];
block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20];
if(last_non_zero_p1 <= 11) goto end;
block[0x19] = temp_block[0x19];
block[0x12] = temp_block[0x12]; block[0x0B] = temp_block[0x0B];
block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05];
if(last_non_zero_p1 <= 16) goto end;
block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13];
block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21];
block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30];
block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22];
if(last_non_zero_p1 <= 24) goto end;
block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14];
block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06];
block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E];
block[0x15] = temp_block[0x15]; block[0x1C] = temp_block[0x1C];
if(last_non_zero_p1 <= 32) goto end;
block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A];
block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38];
block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32];
block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24];
if(last_non_zero_p1 <= 40) goto end;
block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16];
block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17];
block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25];
block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33];
if(last_non_zero_p1 <= 48) goto end;
block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B];
block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D];
block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F];
block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E];
if(last_non_zero_p1 <= 56) goto end;
block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C];
block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36];
block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37];
block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F];
}
end:
return last_non_zero_p1 - 1;
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/pngdsp.asm
0,0 → 1,173
;******************************************************************************
;* x86 optimizations for PNG decoding
;*
;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
cextern pw_255
 
SECTION_TEXT
 
; %1 = nr. of xmm registers used
%macro ADD_BYTES_FN 1
cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i
%if ARCH_X86_64
movsxd waq, wad
%endif
xor iq, iq
 
; vector loop
mov wq, waq
and waq, ~(mmsize*2-1)
jmp .end_v
.loop_v:
mova m0, [src1q+iq]
mova m1, [src1q+iq+mmsize]
paddb m0, [src2q+iq]
paddb m1, [src2q+iq+mmsize]
mova [dstq+iq ], m0
mova [dstq+iq+mmsize], m1
add iq, mmsize*2
.end_v:
cmp iq, waq
jl .loop_v
 
%if mmsize == 16
; vector loop
mov waq, wq
and waq, ~7
jmp .end_l
.loop_l:
movq mm0, [src1q+iq]
paddb mm0, [src2q+iq]
movq [dstq+iq ], mm0
add iq, 8
.end_l:
cmp iq, waq
jl .loop_l
%endif
 
; scalar loop for leftover
jmp .end_s
.loop_s:
mov wab, [src1q+iq]
add wab, [src2q+iq]
mov [dstq+iq], wab
inc iq
.end_s:
cmp iq, wq
jl .loop_s
REP_RET
%endmacro
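; What add_bytes_l2 computes, as a rough scalar C sketch (wrapping byte adds):
;     for (i = 0; i < w; i++)
;         dst[i] = src1[i] + src2[i];
; The vector loop above handles 2*mmsize bytes per iteration; an 8-byte MMX loop
; (SSE2 build only) and a final 1-byte loop mop up the remainder.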
 
%if ARCH_X86_32
INIT_MMX mmx
ADD_BYTES_FN 0
%endif
 
INIT_XMM sse2
ADD_BYTES_FN 2
 
%macro ADD_PAETH_PRED_FN 1
cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr
%if ARCH_X86_64
movsxd bppq, bppd
movsxd wq, wd
%endif
lea endq, [dstq+wq-(mmsize/2-1)]
sub topq, dstq
sub srcq, dstq
sub dstq, bppq
pxor m7, m7
 
PUSH dstq
lea cntrq, [bppq-1]
shr cntrq, 2 + mmsize/16
.bpp_loop:
lea dstq, [dstq+cntrq*(mmsize/2)]
movh m0, [dstq]
movh m1, [topq+dstq]
punpcklbw m0, m7
punpcklbw m1, m7
add dstq, bppq
.loop:
mova m2, m1
movh m1, [topq+dstq]
mova m3, m2
punpcklbw m1, m7
mova m4, m2
psubw m3, m1
psubw m4, m0
mova m5, m3
paddw m5, m4
%if cpuflag(ssse3)
pabsw m3, m3
pabsw m4, m4
pabsw m5, m5
%else ; !cpuflag(ssse3)
psubw m7, m5
pmaxsw m5, m7
pxor m6, m6
pxor m7, m7
psubw m6, m3
psubw m7, m4
pmaxsw m3, m6
pmaxsw m4, m7
pxor m7, m7
%endif ; cpuflag(ssse3)
mova m6, m4
pminsw m6, m5
pcmpgtw m3, m6
pcmpgtw m4, m5
mova m6, m4
pand m4, m3
pandn m6, m3
pandn m3, m0
movh m0, [srcq+dstq]
pand m6, m1
pand m2, m4
punpcklbw m0, m7
paddw m0, m6
paddw m3, m2
paddw m0, m3
pand m0, [pw_255]
mova m3, m0
packuswb m3, m3
movh [dstq], m3
add dstq, bppq
cmp dstq, endq
jle .loop
 
mov dstq, [rsp]
dec cntrq
jge .bpp_loop
POP dstq
RET
%endmacro
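; The predictor implemented above is the standard PNG Paeth filter; per byte,
; with a = left (dst[-bpp]), b = above (top) and c = above-left, a scalar C
; sketch looks like:
;     p  = a + b - c;
;     pa = abs(p - a); pb = abs(p - b); pc = abs(p - c);
;     pred = (pa <= pb && pa <= pc) ? a : (pb <= pc) ? b : c;
;     dst[i] = src[i] + pred;                         /* modulo 256 */
; The SIMD version widens to words, selects the predictor with compares and
; masks instead of branches, and masks the sum with pw_255 before packing back.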
 
INIT_MMX mmxext
ADD_PAETH_PRED_FN 0
 
INIT_MMX ssse3
ADD_PAETH_PRED_FN 0
/contrib/sdk/sources/ffmpeg/libavcodec/x86/pngdsp_init.c
0,0 → 1,50
/*
* x86 PNG optimizations.
* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/attributes.h"
#include "libavutil/common.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/pngdsp.h"
 
void ff_add_png_paeth_prediction_mmxext(uint8_t *dst, uint8_t *src,
uint8_t *top, int w, int bpp);
void ff_add_png_paeth_prediction_ssse3(uint8_t *dst, uint8_t *src,
uint8_t *top, int w, int bpp);
void ff_add_bytes_l2_mmx (uint8_t *dst, uint8_t *src1,
uint8_t *src2, int w);
void ff_add_bytes_l2_sse2(uint8_t *dst, uint8_t *src1,
uint8_t *src2, int w);
 
av_cold void ff_pngdsp_init_x86(PNGDSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
 
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags))
dsp->add_bytes_l2 = ff_add_bytes_l2_mmx;
#endif
if (EXTERNAL_MMXEXT(cpu_flags))
dsp->add_paeth_prediction = ff_add_png_paeth_prediction_mmxext;
if (EXTERNAL_SSE2(cpu_flags))
dsp->add_bytes_l2 = ff_add_bytes_l2_sse2;
if (EXTERNAL_SSSE3(cpu_flags))
dsp->add_paeth_prediction = ff_add_png_paeth_prediction_ssse3;
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/proresdsp.asm
0,0 → 1,326
;******************************************************************************
;* x86-SIMD-optimized IDCT for prores
;* this is identical to "simple" IDCT written by Michael Niedermayer
;* except for the clip range
;*
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1
%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1
%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2
%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1
%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1
%define W6sh2 8867 ; W6 = 35468 = 8867<<2
%define W7sh2 4520 ; W7 = 18081 = 4520<<2 + 1
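; These appear to be the usual "simple IDCT" cosine constants, i.e.
; W_i ~= round(2^16 * sqrt(2) * cos(i*pi/16)), kept here pre-shifted right by
; two (hence the "sh2" suffix); the comments give the full-precision values.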
 
%if ARCH_X86_64
 
SECTION_RODATA
 
w4_plus_w2: times 4 dw W4sh2, +W2sh2
w4_min_w2: times 4 dw W4sh2, -W2sh2
w4_plus_w6: times 4 dw W4sh2, +W6sh2
w4_min_w6: times 4 dw W4sh2, -W6sh2
w1_plus_w3: times 4 dw W1sh2, +W3sh2
w3_min_w1: times 4 dw W3sh2, -W1sh2
w7_plus_w3: times 4 dw W7sh2, +W3sh2
w3_min_w7: times 4 dw W3sh2, -W7sh2
w1_plus_w5: times 4 dw W1sh2, +W5sh2
w5_min_w1: times 4 dw W5sh2, -W1sh2
w5_plus_w7: times 4 dw W5sh2, +W7sh2
w7_min_w5: times 4 dw W7sh2, -W5sh2
pw_88: times 8 dw 0x2008
 
cextern pw_1
cextern pw_4
cextern pw_512
cextern pw_1019
 
section .text align=16
 
; interleave data while maintaining source
; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave
%macro SBUTTERFLY3 5
punpckl%1 m%2, m%4, m%5
punpckh%1 m%3, m%4, m%5
%endmacro
 
; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift
; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6
; %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3)
%macro SUMSUB_SHPK 7
psubd %3, %1, %5 ; { a0 - b0 }[0-3]
psubd %4, %2, %6 ; { a0 - b0 }[4-7]
paddd %1, %5 ; { a0 + b0 }[0-3]
paddd %2, %6 ; { a0 + b0 }[4-7]
psrad %1, %7
psrad %2, %7
psrad %3, %7
psrad %4, %7
packssdw %1, %2 ; row[0]
packssdw %3, %4 ; row[7]
%endmacro
 
; %1 = row or col (for rounding variable)
; %2 = number of bits to shift at the end
%macro IDCT_1D 2
; a0 = (W4 * row[0]) + (1 << (15 - 1));
; a1 = a0;
; a2 = a0;
; a3 = a0;
; a0 += W2 * row[2];
; a1 += W6 * row[2];
; a2 -= W6 * row[2];
; a3 -= W2 * row[2];
%ifidn %1, col
paddw m10,[pw_88]
%endif
%ifidn %1, row
paddw m10,[pw_1]
%endif
SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7]
pmaddwd m2, m0, [w4_plus_w6]
pmaddwd m3, m1, [w4_plus_w6]
pmaddwd m4, m0, [w4_min_w6]
pmaddwd m5, m1, [w4_min_w6]
pmaddwd m6, m0, [w4_min_w2]
pmaddwd m7, m1, [w4_min_w2]
pmaddwd m0, [w4_plus_w2]
pmaddwd m1, [w4_plus_w2]
 
; a0: -1*row[0]-1*row[2]
; a1: -1*row[0]
; a2: -1*row[0]
; a3: -1*row[0]+1*row[2]
 
; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4]
; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6]
; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6]
; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4]
SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7]
pmaddwd m10, m8, [w4_plus_w6]
pmaddwd m11, m9, [w4_plus_w6]
paddd m0, m10 ; a0[0-3]
paddd m1, m11 ; a0[4-7]
pmaddwd m10, m8, [w4_min_w6]
pmaddwd m11, m9, [w4_min_w6]
paddd m6, m10 ; a3[0-3]
paddd m7, m11 ; a3[4-7]
pmaddwd m10, m8, [w4_min_w2]
pmaddwd m11, m9, [w4_min_w2]
pmaddwd m8, [w4_plus_w2]
pmaddwd m9, [w4_plus_w2]
psubd m4, m10 ; a2[0-3] intermediate
psubd m5, m11 ; a2[4-7] intermediate
psubd m2, m8 ; a1[0-3] intermediate
psubd m3, m9 ; a1[4-7] intermediate
 
; load/store
mova [r2+ 0], m0
mova [r2+ 32], m2
mova [r2+ 64], m4
mova [r2+ 96], m6
mova m10,[r2+ 16] ; { row[1] }[0-7]
mova m8, [r2+ 48] ; { row[3] }[0-7]
mova m13,[r2+ 80] ; { row[5] }[0-7]
mova m14,[r2+112] ; { row[7] }[0-7]
mova [r2+ 16], m1
mova [r2+ 48], m3
mova [r2+ 80], m5
mova [r2+112], m7
%ifidn %1, row
pmullw m10,[r3+ 16]
pmullw m8, [r3+ 48]
pmullw m13,[r3+ 80]
pmullw m14,[r3+112]
%endif
 
; b0 = MUL(W1, row[1]);
; MAC(b0, W3, row[3]);
; b1 = MUL(W3, row[1]);
; MAC(b1, -W7, row[3]);
; b2 = MUL(W5, row[1]);
; MAC(b2, -W1, row[3]);
; b3 = MUL(W7, row[1]);
; MAC(b3, -W5, row[3]);
SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7]
pmaddwd m2, m0, [w3_min_w7]
pmaddwd m3, m1, [w3_min_w7]
pmaddwd m4, m0, [w5_min_w1]
pmaddwd m5, m1, [w5_min_w1]
pmaddwd m6, m0, [w7_min_w5]
pmaddwd m7, m1, [w7_min_w5]
pmaddwd m0, [w1_plus_w3]
pmaddwd m1, [w1_plus_w3]
 
; b0: +1*row[1]+2*row[3]
; b1: +2*row[1]-1*row[3]
; b2: -1*row[1]-1*row[3]
; b3: +1*row[1]+1*row[3]
 
; MAC(b0, W5, row[5]);
; MAC(b0, W7, row[7]);
; MAC(b1, -W1, row[5]);
; MAC(b1, -W5, row[7]);
; MAC(b2, W7, row[5]);
; MAC(b2, W3, row[7]);
; MAC(b3, W3, row[5]);
; MAC(b3, -W1, row[7]);
SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7]
 
; b0: -1*row[5]+1*row[7]
; b1: -1*row[5]+1*row[7]
; b2: +1*row[5]+2*row[7]
; b3: +2*row[5]-1*row[7]
 
pmaddwd m10, m8, [w1_plus_w5]
pmaddwd m11, m9, [w1_plus_w5]
pmaddwd m12, m8, [w5_plus_w7]
pmaddwd m13, m9, [w5_plus_w7]
psubd m2, m10 ; b1[0-3]
psubd m3, m11 ; b1[4-7]
paddd m0, m12 ; b0[0-3]
paddd m1, m13 ; b0[4-7]
pmaddwd m12, m8, [w7_plus_w3]
pmaddwd m13, m9, [w7_plus_w3]
pmaddwd m8, [w3_min_w1]
pmaddwd m9, [w3_min_w1]
paddd m4, m12 ; b2[0-3]
paddd m5, m13 ; b2[4-7]
paddd m6, m8 ; b3[0-3]
paddd m7, m9 ; b3[4-7]
 
; row[0] = (a0 + b0) >> 15;
; row[7] = (a0 - b0) >> 15;
; row[1] = (a1 + b1) >> 15;
; row[6] = (a1 - b1) >> 15;
; row[2] = (a2 + b2) >> 15;
; row[5] = (a2 - b2) >> 15;
; row[3] = (a3 + b3) >> 15;
; row[4] = (a3 - b3) >> 15;
mova m8, [r2+ 0] ; a0[0-3]
mova m9, [r2+16] ; a0[4-7]
SUMSUB_SHPK m8, m9, m10, m11, m0, m1, %2
mova m0, [r2+32] ; a1[0-3]
mova m1, [r2+48] ; a1[4-7]
SUMSUB_SHPK m0, m1, m9, m11, m2, m3, %2
mova m1, [r2+64] ; a2[0-3]
mova m2, [r2+80] ; a2[4-7]
SUMSUB_SHPK m1, m2, m11, m3, m4, m5, %2
mova m2, [r2+96] ; a3[0-3]
mova m3, [r2+112] ; a3[4-7]
SUMSUB_SHPK m2, m3, m4, m5, m6, m7, %2
%endmacro
 
; void prores_idct_put_10_<opt>(uint8_t *pixels, int stride,
; int16_t *block, const int16_t *qmat);
%macro idct_put_fn 1
cglobal prores_idct_put_10, 4, 4, %1
movsxd r1, r1d
pxor m15, m15 ; zero
 
; for (i = 0; i < 8; i++)
; idctRowCondDC(block + i*8);
mova m10,[r2+ 0] ; { row[0] }[0-7]
mova m8, [r2+32] ; { row[2] }[0-7]
mova m13,[r2+64] ; { row[4] }[0-7]
mova m12,[r2+96] ; { row[6] }[0-7]
 
pmullw m10,[r3+ 0]
pmullw m8, [r3+32]
pmullw m13,[r3+64]
pmullw m12,[r3+96]
 
IDCT_1D row, 15
 
; transpose for second part of IDCT
TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3
mova [r2+ 16], m0
mova [r2+ 48], m2
mova [r2+ 80], m11
mova [r2+112], m10
SWAP 8, 10
SWAP 1, 8
SWAP 4, 13
SWAP 9, 12
 
; for (i = 0; i < 8; i++)
; idctSparseColAdd(dest + i, line_size, block + i);
IDCT_1D col, 18
 
; clip/store
mova m3, [pw_4]
mova m5, [pw_1019]
pmaxsw m8, m3
pmaxsw m0, m3
pmaxsw m1, m3
pmaxsw m2, m3
pmaxsw m4, m3
pmaxsw m11, m3
pmaxsw m9, m3
pmaxsw m10, m3
pminsw m8, m5
pminsw m0, m5
pminsw m1, m5
pminsw m2, m5
pminsw m4, m5
pminsw m11, m5
pminsw m9, m5
pminsw m10, m5
 
lea r2, [r1*3]
mova [r0 ], m8
mova [r0+r1 ], m0
mova [r0+r1*2], m1
mova [r0+r2 ], m2
lea r0, [r0+r1*4]
mova [r0 ], m4
mova [r0+r1 ], m11
mova [r0+r1*2], m9
mova [r0+r2 ], m10
RET
%endmacro
 
%macro SIGNEXTEND 2-3
%if cpuflag(sse4) ; dstlow, dsthigh
movhlps %2, %1
pmovsxwd %1, %1
pmovsxwd %2, %2
%elif cpuflag(sse2) ; dstlow, dsthigh, tmp
pxor %3, %3
pcmpgtw %3, %1
mova %2, %1
punpcklwd %1, %3
punpckhwd %2, %3
%endif
%endmacro
 
INIT_XMM sse2
idct_put_fn 16
INIT_XMM sse4
idct_put_fn 16
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
idct_put_fn 16
%endif
 
%endif
/contrib/sdk/sources/ffmpeg/libavcodec/x86/proresdsp_init.c
0,0 → 1,58
/*
* Apple ProRes compatible decoder
*
* Copyright (c) 2010-2011 Maxim Poliakovski
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/attributes.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/proresdsp.h"
 
void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize,
int16_t *block, const int16_t *qmat);
void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize,
int16_t *block, const int16_t *qmat);
void ff_prores_idct_put_10_avx (uint16_t *dst, int linesize,
int16_t *block, const int16_t *qmat);
 
av_cold void ff_proresdsp_x86_init(ProresDSPContext *dsp, AVCodecContext *avctx)
{
#if ARCH_X86_64
int cpu_flags = av_get_cpu_flags();
 
if(avctx->flags & CODEC_FLAG_BITEXACT)
return;
 
if (EXTERNAL_SSE2(cpu_flags)) {
dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
dsp->idct_put = ff_prores_idct_put_10_sse2;
}
 
if (EXTERNAL_SSE4(cpu_flags)) {
dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
dsp->idct_put = ff_prores_idct_put_10_sse4;
}
 
if (EXTERNAL_AVX(cpu_flags)) {
dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM;
dsp->idct_put = ff_prores_idct_put_10_avx;
}
#endif /* ARCH_X86_64 */
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/qpel.asm
0,0 → 1,176
;******************************************************************************
;* MMX optimized DSP utils
;* Copyright (c) 2008 Loren Merritt
;* Copyright (c) 2003-2013 Michael Niedermayer
;* Copyright (c) 2013 Daniel Kang
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION .text
 
%macro op_avgh 3
movh %3, %2
pavgb %1, %3
movh %2, %1
%endmacro
 
%macro op_avg 2
pavgb %1, %2
mova %2, %1
%endmacro
 
%macro op_puth 2-3
movh %2, %1
%endmacro
 
%macro op_put 2
mova %2, %1
%endmacro
 
; void pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
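; Roughly, the *_pixels4_l2 functions below do (scalar C sketch, where
; rnd_avg(a, b) = (a + b + 1) >> 1, as performed by pavgb):
;     for (j = 0; j < h; j++) {
;         for (i = 0; i < 4; i++)
;             dst[i] = rnd_avg(src1[i], src2[i]);
;         dst += dstStride; src1 += src1Stride; src2 += 4;   /* src2 stride == block width */
;     }
; The avg_ variants additionally average that result with the bytes already in
; dst; the 8- and 16-pixel versions only change the width and the src2 stride.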
%macro PIXELS4_L2 1
%define OP op_%1h
cglobal %1_pixels4_l2, 6,6
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
test r5d, 1
je .loop
movd m0, [r1]
movd m1, [r2]
add r1, r4
add r2, 4
pavgb m0, m1
OP m0, [r0], m3
add r0, r3
dec r5d
.loop:
mova m0, [r1]
mova m1, [r1+r4]
lea r1, [r1+2*r4]
pavgb m0, [r2]
pavgb m1, [r2+4]
OP m0, [r0], m3
OP m1, [r0+r3], m3
lea r0, [r0+2*r3]
mova m0, [r1]
mova m1, [r1+r4]
lea r1, [r1+2*r4]
pavgb m0, [r2+8]
pavgb m1, [r2+12]
OP m0, [r0], m3
OP m1, [r0+r3], m3
lea r0, [r0+2*r3]
add r2, 16
sub r5d, 4
jne .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PIXELS4_L2 put
PIXELS4_L2 avg
 
; void pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PIXELS8_L2 1
%define OP op_%1
cglobal %1_pixels8_l2, 6,6
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
test r5d, 1
je .loop
mova m0, [r1]
mova m1, [r2]
add r1, r4
add r2, 8
pavgb m0, m1
OP m0, [r0]
add r0, r3
dec r5d
.loop:
mova m0, [r1]
mova m1, [r1+r4]
lea r1, [r1+2*r4]
pavgb m0, [r2]
pavgb m1, [r2+8]
OP m0, [r0]
OP m1, [r0+r3]
lea r0, [r0+2*r3]
mova m0, [r1]
mova m1, [r1+r4]
lea r1, [r1+2*r4]
pavgb m0, [r2+16]
pavgb m1, [r2+24]
OP m0, [r0]
OP m1, [r0+r3]
lea r0, [r0+2*r3]
add r2, 32
sub r5d, 4
jne .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PIXELS8_L2 put
PIXELS8_L2 avg
 
; void pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h)
%macro PIXELS16_L2 1
%define OP op_%1
cglobal %1_pixels16_l2, 6,6
movsxdifnidn r3, r3d
movsxdifnidn r4, r4d
test r5d, 1
je .loop
mova m0, [r1]
mova m1, [r1+8]
pavgb m0, [r2]
pavgb m1, [r2+8]
add r1, r4
add r2, 16
OP m0, [r0]
OP m1, [r0+8]
add r0, r3
dec r5d
.loop:
mova m0, [r1]
mova m1, [r1+8]
add r1, r4
pavgb m0, [r2]
pavgb m1, [r2+8]
OP m0, [r0]
OP m1, [r0+8]
add r0, r3
mova m0, [r1]
mova m1, [r1+8]
add r1, r4
pavgb m0, [r2+16]
pavgb m1, [r2+24]
OP m0, [r0]
OP m1, [r0+8]
add r0, r3
add r2, 32
sub r5d, 2
jne .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PIXELS16_L2 put
PIXELS16_L2 avg
/contrib/sdk/sources/ffmpeg/libavcodec/x86/rnd_mmx.c
0,0 → 1,35
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "config.h"
#include "dsputil_x86.h"
 
#if HAVE_INLINE_ASM
 
#define DEF(x, y) ff_ ## x ## _ ## y ## _mmx
#define SET_RND MOVQ_WTWO
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f)
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e)
#define STATIC
 
#include "rnd_template.c"
 
PIXELS16(, ff_avg, , _xy2, _mmx)
PIXELS16(, ff_put, , _xy2, _mmx)
 
#endif /* HAVE_INLINE_ASM */
/contrib/sdk/sources/ffmpeg/libavcodec/x86/rnd_template.c
0,0 → 1,173
/*
* DSP utils mmx functions are compiled twice for rnd/no_rnd
* Copyright (c) 2000, 2001 Fabrice Bellard
* Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at>
*
* MMX optimization by Nick Kurshev <nickols_k@mail.ru>
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at>
* and improved by Zdenek Kabelac <kabi@users.sf.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <stddef.h>
#include <stdint.h>
 
// put_pixels
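/* Both kernels in this template do 2D half-pel averaging; a scalar sketch of
 * the put variant (R is the rounder loaded by SET_RND: 2 for rnd, 1 for no_rnd):
 *
 *     for (j = 0; j < h; j++) {
 *         for (i = 0; i < 8; i++)
 *             block[i] = (pixels[i] + pixels[i + 1] +
 *                         pixels[i + line_size] + pixels[i + line_size + 1] + R) >> 2;
 *         block += line_size; pixels += line_size;
 *     }
 *
 * The avg variant further averages this result with the existing contents of block. */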
STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
MOVQ_ZERO(mm7);
SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm4 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
"xor %%"REG_a", %%"REG_a" \n\t"
"add %3, %1 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddusw %%mm2, %%mm0 \n\t"
"paddusw %%mm3, %%mm1 \n\t"
"paddusw %%mm6, %%mm4 \n\t"
"paddusw %%mm6, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm5 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
"movq %%mm4, (%2, %%"REG_a") \n\t"
"add %3, %%"REG_a" \n\t"
 
"movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
"movq 1(%1, %%"REG_a"), %%mm4 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm2, %%mm4 \n\t"
"paddusw %%mm3, %%mm5 \n\t"
"paddusw %%mm6, %%mm0 \n\t"
"paddusw %%mm6, %%mm1 \n\t"
"paddusw %%mm4, %%mm0 \n\t"
"paddusw %%mm5, %%mm1 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"movq %%mm0, (%2, %%"REG_a") \n\t"
"add %3, %%"REG_a" \n\t"
 
"subl $2, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels)
:"D"(block), "r"((x86_reg)line_size)
:REG_a, "memory");
}
 
// avg_pixels
// this routine is 'slightly' suboptimal but mostly unused
STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels,
ptrdiff_t line_size, int h)
{
MOVQ_ZERO(mm7);
SET_RND(mm6); // =2 for rnd and =1 for no_rnd version
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"movq 1(%1), %%mm4 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
"xor %%"REG_a", %%"REG_a" \n\t"
"add %3, %1 \n\t"
".p2align 3 \n\t"
"1: \n\t"
"movq (%1, %%"REG_a"), %%mm0 \n\t"
"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
"movq %%mm0, %%mm1 \n\t"
"movq %%mm2, %%mm3 \n\t"
"punpcklbw %%mm7, %%mm0 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpckhbw %%mm7, %%mm1 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"paddusw %%mm2, %%mm0 \n\t"
"paddusw %%mm3, %%mm1 \n\t"
"paddusw %%mm6, %%mm4 \n\t"
"paddusw %%mm6, %%mm5 \n\t"
"paddusw %%mm0, %%mm4 \n\t"
"paddusw %%mm1, %%mm5 \n\t"
"psrlw $2, %%mm4 \n\t"
"psrlw $2, %%mm5 \n\t"
"movq (%2, %%"REG_a"), %%mm3 \n\t"
"packuswb %%mm5, %%mm4 \n\t"
"pcmpeqd %%mm2, %%mm2 \n\t"
"paddb %%mm2, %%mm2 \n\t"
PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2)
"movq %%mm5, (%2, %%"REG_a") \n\t"
"add %3, %%"REG_a" \n\t"
 
"movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3
"movq 1(%1, %%"REG_a"), %%mm4 \n\t"
"movq %%mm2, %%mm3 \n\t"
"movq %%mm4, %%mm5 \n\t"
"punpcklbw %%mm7, %%mm2 \n\t"
"punpcklbw %%mm7, %%mm4 \n\t"
"punpckhbw %%mm7, %%mm3 \n\t"
"punpckhbw %%mm7, %%mm5 \n\t"
"paddusw %%mm2, %%mm4 \n\t"
"paddusw %%mm3, %%mm5 \n\t"
"paddusw %%mm6, %%mm0 \n\t"
"paddusw %%mm6, %%mm1 \n\t"
"paddusw %%mm4, %%mm0 \n\t"
"paddusw %%mm5, %%mm1 \n\t"
"psrlw $2, %%mm0 \n\t"
"psrlw $2, %%mm1 \n\t"
"movq (%2, %%"REG_a"), %%mm3 \n\t"
"packuswb %%mm1, %%mm0 \n\t"
"pcmpeqd %%mm2, %%mm2 \n\t"
"paddb %%mm2, %%mm2 \n\t"
PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2)
"movq %%mm1, (%2, %%"REG_a") \n\t"
"add %3, %%"REG_a" \n\t"
 
"subl $2, %0 \n\t"
"jnz 1b \n\t"
:"+g"(h), "+S"(pixels)
:"D"(block), "r"((x86_reg)line_size)
:REG_a, "memory");
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/rv34dsp.asm
0,0 → 1,196
;******************************************************************************
;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
pw_row_coeffs: times 4 dw 13
times 4 dw 17
times 4 dw 7
pd_512: times 2 dd 0x200
pw_col_coeffs: dw 13, 13, 13, -13
dw 17, 7, 7, -17
dw 13, -13, 13, 13
dw -7, 17, -17, -7
 
SECTION .text
 
%macro IDCT_DC_NOROUND 1
imul %1, 13*13*3
sar %1, 11
%endmacro
 
%macro IDCT_DC_ROUND 1
imul %1, 13*13
add %1, 0x200
sar %1, 10
%endmacro
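; Scalar equivalents of the two DC scalings above (dc is the raw coefficient):
;     rounded:     dc = (dc * 13 * 13 + 0x200) >> 10;
;     no rounding: dc = (dc * 13 * 13 * 3)     >> 11;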
 
%macro rv34_idct 1
cglobal rv34_idct_%1, 1, 2, 0
movsx r1, word [r0]
IDCT_DC r1
movd m0, r1d
pshufw m0, m0, 0
movq [r0+ 0], m0
movq [r0+ 8], m0
movq [r0+16], m0
movq [r0+24], m0
REP_RET
%endmacro
 
INIT_MMX mmxext
%define IDCT_DC IDCT_DC_ROUND
rv34_idct dc
%define IDCT_DC IDCT_DC_NOROUND
rv34_idct dc_noround
 
; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc);
INIT_MMX mmx
cglobal rv34_idct_dc_add, 3, 3
; calculate DC
IDCT_DC_ROUND r2
pxor m1, m1
movd m0, r2d
psubw m1, m0
packuswb m0, m0
packuswb m1, m1
punpcklbw m0, m0
punpcklbw m1, m1
punpcklwd m0, m0
punpcklwd m1, m1
 
; add DC
lea r2, [r0+r1*2]
movh m2, [r0]
movh m3, [r0+r1]
movh m4, [r2]
movh m5, [r2+r1]
paddusb m2, m0
paddusb m3, m0
paddusb m4, m0
paddusb m5, m0
psubusb m2, m1
psubusb m3, m1
psubusb m4, m1
psubusb m5, m1
movh [r0], m2
movh [r0+r1], m3
movh [r2], m4
movh [r2+r1], m5
RET
 
; Load coeffs and perform row transform
; Output: coeffs in mm[0467], rounder in mm5
%macro ROW_TRANSFORM 1
pxor mm7, mm7
mova mm0, [%1+ 0*8]
mova mm1, [%1+ 1*8]
mova mm2, [%1+ 2*8]
mova mm3, [%1+ 3*8]
mova [%1+ 0*8], mm7
mova [%1+ 1*8], mm7
mova [%1+ 2*8], mm7
mova [%1+ 3*8], mm7
mova mm4, mm0
mova mm6, [pw_row_coeffs+ 0]
paddsw mm0, mm2 ; b0 + b2
psubsw mm4, mm2 ; b0 - b2
pmullw mm0, mm6 ; *13 = z0
pmullw mm4, mm6 ; *13 = z1
mova mm5, mm1
pmullw mm1, [pw_row_coeffs+ 8] ; b1*17
pmullw mm5, [pw_row_coeffs+16] ; b1* 7
mova mm7, mm3
pmullw mm3, [pw_row_coeffs+ 8] ; b3*17
pmullw mm7, [pw_row_coeffs+16] ; b3* 7
paddsw mm1, mm7 ; z3 = b1*17 + b3* 7
psubsw mm5, mm3 ; z2 = b1* 7 - b3*17
mova mm7, mm0
mova mm6, mm4
paddsw mm0, mm1 ; z0 + z3
psubsw mm7, mm1 ; z0 - z3
paddsw mm4, mm5 ; z1 + z2
psubsw mm6, mm5 ; z1 - z2
mova mm5, [pd_512] ; 0x200
%endmacro
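; One 1D pass of the RV34 transform, as sketched by the comments above
; (b0..b3 are the four inputs of a row or column):
;     z0 = 13*(b0 + b2);      z1 = 13*(b0 - b2);
;     z3 = 17*b1 +  7*b3;     z2 =  7*b1 - 17*b3;
;     y0 = z0 + z3;  y1 = z1 + z2;  y2 = z1 - z2;  y3 = z0 - z3;
; The column pass below then adds the 0x200 rounder and shifts right by 10
; before adding the result to the destination pixels.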
 
; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
%macro COL_TRANSFORM 4
pshufw mm3, %2, 0xDD ; col. 1,3,1,3
pshufw %2, %2, 0x88 ; col. 0,2,0,2
pmaddwd %2, %3 ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1
pmaddwd mm3, %4 ; 17*c1+ 7*c3 | 7*c1-17*c3 = z3 | z2
paddd %2, mm5
pshufw mm1, %2, 01001110b ; z1 | z0
pshufw mm2, mm3, 01001110b ; z2 | z3
paddd %2, mm3 ; z0+z3 | z1+z2
psubd mm1, mm2 ; z1-z2 | z0-z3
movd mm3, %1
psrad %2, 10
pxor mm2, mm2
psrad mm1, 10
punpcklbw mm3, mm2
packssdw %2, mm1
paddw %2, mm3
packuswb %2, %2
movd %1, %2
%endmacro
INIT_MMX mmxext
cglobal rv34_idct_add, 3,3,0, d, s, b
ROW_TRANSFORM bq
COL_TRANSFORM [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8]
mova mm0, [pw_col_coeffs+ 0]
COL_TRANSFORM [dq+sq], mm4, mm0, [pw_col_coeffs+ 8]
mova mm4, [pw_col_coeffs+ 8]
lea dq, [dq + 2*sq]
COL_TRANSFORM [dq], mm6, mm0, mm4
COL_TRANSFORM [dq+sq], mm7, mm0, mm4
ret
 
; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc);
INIT_XMM sse4
cglobal rv34_idct_dc_add, 3, 3, 6
; load data
IDCT_DC_ROUND r2
pxor m1, m1
 
; calculate DC
movd m0, r2d
lea r2, [r0+r1*2]
movd m2, [r0]
movd m3, [r0+r1]
pshuflw m0, m0, 0
movd m4, [r2]
movd m5, [r2+r1]
punpcklqdq m0, m0
punpckldq m2, m3
punpckldq m4, m5
punpcklbw m2, m1
punpcklbw m4, m1
paddw m2, m0
paddw m4, m0
packuswb m2, m4
movd [r0], m2
pextrd [r0+r1], m2, 1
pextrd [r2], m2, 2
pextrd [r2+r1], m2, 3
RET
/contrib/sdk/sources/ffmpeg/libavcodec/x86/rv34dsp_init.c
0,0 → 1,45
/*
* RV30/40 MMX/SSE2 optimizations
* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/rv34dsp.h"
 
void ff_rv34_idct_dc_mmxext(int16_t *block);
void ff_rv34_idct_dc_noround_mmxext(int16_t *block);
void ff_rv34_idct_dc_add_mmx(uint8_t *dst, ptrdiff_t stride, int dc);
void ff_rv34_idct_dc_add_sse4(uint8_t *dst, ptrdiff_t stride, int dc);
void ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block);
 
av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c)
{
int cpu_flags = av_get_cpu_flags();
 
if (EXTERNAL_MMX(cpu_flags))
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx;
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmxext;
c->rv34_idct_add = ff_rv34_idct_add_mmxext;
}
if (EXTERNAL_SSE4(cpu_flags))
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4;
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/rv40dsp.asm
0,0 → 1,505
;******************************************************************************
;* MMX/SSE2-optimized functions for the RV40 decoder
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
align 16
pw_1024: times 8 dw 1 << (16 - 6) ; pw_1024
 
sixtap_filter_hb_m: times 8 db 1, -5
times 8 db 52, 20
; multiplied by 2 to have the same shift
times 8 db 2, -10
times 8 db 40, 40
; back to normal
times 8 db 1, -5
times 8 db 20, 52
 
sixtap_filter_v_m: times 8 dw 1
times 8 dw -5
times 8 dw 52
times 8 dw 20
; multiplied by 2 to have the same shift
times 8 dw 2
times 8 dw -10
times 8 dw 40
times 8 dw 40
; back to normal
times 8 dw 1
times 8 dw -5
times 8 dw 20
times 8 dw 52
 
%ifdef PIC
%define sixtap_filter_hw picregq
%define sixtap_filter_hb picregq
%define sixtap_filter_v picregq
%define npicregs 1
%else
%define sixtap_filter_hw sixtap_filter_hw_m
%define sixtap_filter_hb sixtap_filter_hb_m
%define sixtap_filter_v sixtap_filter_v_m
%define npicregs 0
%endif
 
filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11
 
cextern pw_32
cextern pw_16
cextern pw_512
 
SECTION .text
 
;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride,
; uint8_t *src, int srcstride,
; int len, int m);
;----------------------------------------------------------------------
%macro LOAD 2
%if WIN64
movsxd %1q, %1d
%endif
%ifdef PIC
add %1q, picregq
%else
add %1q, %2
%endif
%endmacro
 
%macro STORE 3
%ifidn %3, avg
movh %2, [dstq]
%endif
packuswb %1, %1
%ifidn %3, avg
%if cpuflag(3dnow)
pavgusb %1, %2
%else
pavgb %1, %2
%endif
%endif
movh [dstq], %1
%endmacro
 
%macro FILTER_V 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
lea picregq, [sixtap_filter_v_m]
%endif
pxor m7, m7
LOAD my, sixtap_filter_v
 
; read 5 lines
sub srcq, srcstrideq
sub srcq, srcstrideq
movh m0, [srcq]
movh m1, [srcq+srcstrideq]
movh m2, [srcq+srcstrideq*2]
lea srcq, [srcq+srcstrideq*2]
add srcq, srcstrideq
movh m3, [srcq]
movh m4, [srcq+srcstrideq]
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
 
%ifdef m8
mova m8, [myq+ 0]
mova m9, [myq+16]
mova m10, [myq+32]
mova m11, [myq+48]
%define COEFF05 m8
%define COEFF14 m9
%define COEFF2 m10
%define COEFF3 m11
%else
%define COEFF05 [myq+ 0]
%define COEFF14 [myq+16]
%define COEFF2 [myq+32]
%define COEFF3 [myq+48]
%endif
.nextrow:
mova m6, m1
movh m5, [srcq+2*srcstrideq] ; read new row
paddw m6, m4
punpcklbw m5, m7
pmullw m6, COEFF14
paddw m0, m5
pmullw m0, COEFF05
paddw m6, m0
mova m0, m1
paddw m6, [pw_32]
mova m1, m2
pmullw m2, COEFF2
paddw m6, m2
mova m2, m3
pmullw m3, COEFF3
paddw m6, m3
 
; round/clip/store
mova m3, m4
psraw m6, 6
mova m4, m5
STORE m6, m5, %1
 
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
REP_RET
%endmacro
 
%macro FILTER_H 1
cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
lea picregq, [sixtap_filter_v_m]
%endif
pxor m7, m7
LOAD mx, sixtap_filter_v
mova m6, [pw_32]
%ifdef m8
mova m8, [mxq+ 0]
mova m9, [mxq+16]
mova m10, [mxq+32]
mova m11, [mxq+48]
%define COEFF05 m8
%define COEFF14 m9
%define COEFF2 m10
%define COEFF3 m11
%else
%define COEFF05 [mxq+ 0]
%define COEFF14 [mxq+16]
%define COEFF2 [mxq+32]
%define COEFF3 [mxq+48]
%endif
.nextrow:
movq m0, [srcq-2]
movq m5, [srcq+3]
movq m1, [srcq-1]
movq m4, [srcq+2]
punpcklbw m0, m7
punpcklbw m5, m7
punpcklbw m1, m7
punpcklbw m4, m7
movq m2, [srcq-0]
movq m3, [srcq+1]
paddw m0, m5
paddw m1, m4
punpcklbw m2, m7
punpcklbw m3, m7
pmullw m0, COEFF05
pmullw m1, COEFF14
pmullw m2, COEFF2
pmullw m3, COEFF3
paddw m0, m6
paddw m1, m2
paddw m0, m3
paddw m0, m1
psraw m0, 6
STORE m0, m1, %1
 
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
REP_RET
%endmacro
 
%if ARCH_X86_32
INIT_MMX mmx
FILTER_V put
FILTER_H put
 
INIT_MMX mmxext
FILTER_V avg
FILTER_H avg
 
INIT_MMX 3dnow
FILTER_V avg
FILTER_H avg
%endif
 
INIT_XMM sse2
FILTER_H put
FILTER_H avg
FILTER_V put
FILTER_V avg
 
%macro FILTER_SSSE3 1
cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg
%ifdef PIC
lea picregq, [sixtap_filter_hb_m]
%endif
 
; read 5 lines
sub srcq, srcstrideq
LOAD my, sixtap_filter_hb
sub srcq, srcstrideq
movh m0, [srcq]
movh m1, [srcq+srcstrideq]
movh m2, [srcq+srcstrideq*2]
lea srcq, [srcq+srcstrideq*2]
add srcq, srcstrideq
mova m5, [myq]
movh m3, [srcq]
movh m4, [srcq+srcstrideq]
lea srcq, [srcq+2*srcstrideq]
 
.nextrow:
mova m6, m2
punpcklbw m0, m1
punpcklbw m6, m3
pmaddubsw m0, m5
pmaddubsw m6, [myq+16]
movh m7, [srcq] ; read new row
paddw m6, m0
mova m0, m1
mova m1, m2
mova m2, m3
mova m3, m4
mova m4, m7
punpcklbw m7, m3
pmaddubsw m7, m5
paddw m6, m7
pmulhrsw m6, [pw_512]
STORE m6, m7, %1
 
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
REP_RET
 
cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg
%ifdef PIC
lea picregq, [sixtap_filter_hb_m]
%endif
mova m3, [filter_h6_shuf2]
mova m4, [filter_h6_shuf3]
LOAD mx, sixtap_filter_hb
mova m5, [mxq] ; set up 6tap filter in bytes
mova m6, [mxq+16]
mova m7, [filter_h6_shuf1]
 
.nextrow:
movu m0, [srcq-2]
mova m1, m0
mova m2, m0
pshufb m0, m7
pshufb m1, m3
pshufb m2, m4
pmaddubsw m0, m5
pmaddubsw m1, m6
pmaddubsw m2, m5
paddw m0, m1
paddw m0, m2
pmulhrsw m0, [pw_512]
STORE m0, m1, %1
 
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
REP_RET
%endmacro
 
INIT_XMM ssse3
FILTER_SSSE3 put
FILTER_SSSE3 avg
 
; %1=5bits weights?, %2=dst %3=src1 %4=src2 %5=stride if sse2
%macro RV40_WCORE 4-5
movh m4, [%3 + r6 + 0]
movh m5, [%4 + r6 + 0]
%if %0 == 4
%define OFFSET r6 + mmsize / 2
%else
; 8x8 block and sse2, stride was provided
%define OFFSET r6
add r6, r5
%endif
movh m6, [%3 + OFFSET]
movh m7, [%4 + OFFSET]
 
%if %1 == 0
; 14bits weights
punpcklbw m4, m0
punpcklbw m5, m0
punpcklbw m6, m0
punpcklbw m7, m0
 
psllw m4, 7
psllw m5, 7
psllw m6, 7
psllw m7, 7
pmulhw m4, m3
pmulhw m5, m2
pmulhw m6, m3
pmulhw m7, m2
 
paddw m4, m5
paddw m6, m7
%else
; 5bits weights
%if cpuflag(ssse3)
punpcklbw m4, m5
punpcklbw m6, m7
 
pmaddubsw m4, m3
pmaddubsw m6, m3
%else
punpcklbw m4, m0
punpcklbw m5, m0
punpcklbw m6, m0
punpcklbw m7, m0
 
pmullw m4, m3
pmullw m5, m2
pmullw m6, m3
pmullw m7, m2
paddw m4, m5
paddw m6, m7
%endif
 
%endif
 
; bias and shift down
%if cpuflag(ssse3)
pmulhrsw m4, m1
pmulhrsw m6, m1
%else
paddw m4, m1
paddw m6, m1
psrlw m4, 5
psrlw m6, 5
%endif
 
packuswb m4, m6
%if %0 == 5
; Only called for 8x8 blocks and sse2
sub r6, r5
movh [%2 + r6], m4
add r6, r5
movhps [%2 + r6], m4
%else
mova [%2 + r6], m4
%endif
%endmacro
 
 
%macro MAIN_LOOP 2
%if mmsize == 8
RV40_WCORE %2, r0, r1, r2
%if %1 == 16
RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8
%endif
 
; Prepare for next loop
add r6, r5
%else
%ifidn %1, 8
RV40_WCORE %2, r0, r1, r2, r5
; Prepare 2 next lines
add r6, r5
%else
RV40_WCORE %2, r0, r1, r2
; Prepare single next line
add r6, r5
%endif
%endif
 
%endmacro
 
; rv40_weight_func_%1(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, int stride)
; %1=size %2=num of xmm regs
; The weights are FP0.14 notation of fractions depending on pts.
; For timebases without rounding error (i.e. PAL), the fractions
; can be simplified, and several operations can be avoided.
; Therefore, we check here whether they are multiples of 2^9 for
; those simplifications to occur.
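; As a rough scalar sketch of the two paths (per byte, with saturation on
; store; which weight pairs with which source is set up by the caller):
;     full 14-bit weights ("rnd" variants):
;         dst[i] = clip_uint8(((src1[i]*w1 >> 9) + (src2[i]*w2 >> 9) + 16) >> 5);
;     weights that are multiples of 512 ("nornd" variants, with the weights
;     presumably already reduced by 9 bits by the caller):
;         dst[i] = clip_uint8((src1[i]*w1 + src2[i]*w2 + 16) >> 5);
; The ssse3 build replaces the add-16/shift-by-5 rounding with pmulhrsw by pw_1024.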
%macro RV40_WEIGHT 3
cglobal rv40_weight_func_%1_%2, 6, 7, 8
%if cpuflag(ssse3)
mova m1, [pw_1024]
%else
mova m1, [pw_16]
%endif
pxor m0, m0
; Set loop counter and increments
mov r6, r5
shl r6, %3
add r0, r6
add r1, r6
add r2, r6
neg r6
 
movd m2, r3d
movd m3, r4d
%ifidn %1,rnd
%define RND 0
SPLATW m2, m2
%else
%define RND 1
%if cpuflag(ssse3)
punpcklbw m3, m2
%else
SPLATW m2, m2
%endif
%endif
SPLATW m3, m3
 
.loop:
MAIN_LOOP %2, RND
jnz .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
RV40_WEIGHT rnd, 8, 3
RV40_WEIGHT rnd, 16, 4
RV40_WEIGHT nornd, 8, 3
RV40_WEIGHT nornd, 16, 4
 
INIT_XMM sse2
RV40_WEIGHT rnd, 8, 3
RV40_WEIGHT rnd, 16, 4
RV40_WEIGHT nornd, 8, 3
RV40_WEIGHT nornd, 16, 4
 
INIT_XMM ssse3
RV40_WEIGHT rnd, 8, 3
RV40_WEIGHT rnd, 16, 4
RV40_WEIGHT nornd, 8, 3
RV40_WEIGHT nornd, 16, 4
/contrib/sdk/sources/ffmpeg/libavcodec/x86/rv40dsp_init.c
0,0 → 1,270
/*
* RV40 decoder motion compensation functions x86-optimised
* Copyright (c) 2008 Konstantin Shishkov
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
/**
* @file
* RV40 decoder motion compensation functions x86-optimised
* 2,0 and 0,2 have h264 equivalents.
* 3,3 is bugged in the rv40 format and maps to _xy2 version
*/
 
#include "libavcodec/rv34dsp.h"
#include "libavutil/attributes.h"
#include "libavutil/mem.h"
#include "libavutil/x86/cpu.h"
#include "dsputil_x86.h"
 
#if HAVE_YASM
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc8_3dnow(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
 
void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
 
#define DECLARE_WEIGHT(opt) \
void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride); \
void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \
int w1, int w2, ptrdiff_t stride);
DECLARE_WEIGHT(mmxext)
DECLARE_WEIGHT(sse2)
DECLARE_WEIGHT(ssse3)
 
/** @{ */
/**
* Define one qpel function.
* LOOPSIZE must be already set to the number of pixels processed per
* iteration in the inner loop of the called functions.
* COFF(x) must be already defined so as to provide the offset into any
* array of coeffs used by the called function for the qpel position x.
*/
#define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT) \
static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst, \
uint8_t *src, \
ptrdiff_t stride) \
{ \
int i; \
if (PH && PV) { \
DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)]; \
uint8_t *tmpptr = tmp + SIZE * 2; \
src -= stride * 2; \
\
for (i = 0; i < SIZE; i += LOOPSIZE) \
ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride, \
SIZE + 5, HCOFF(PH)); \
for (i = 0; i < SIZE; i += LOOPSIZE) \
ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i, \
SIZE, SIZE, VCOFF(PV)); \
} else if (PV) { \
for (i = 0; i < SIZE; i += LOOPSIZE) \
ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i, \
stride, SIZE, VCOFF(PV)); \
} else { \
for (i = 0; i < SIZE; i += LOOPSIZE) \
ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i, \
stride, SIZE, HCOFF(PH)); \
} \
};
 
/** Declare functions for sizes 8 and 16 and given operations
* and qpel position. */
#define QPEL_FUNCS_DECL(OP, PH, PV, OPT) \
QPEL_FUNC_DECL(OP, 8, PH, PV, OPT) \
QPEL_FUNC_DECL(OP, 16, PH, PV, OPT)
 
/** Declare all functions for all sizes and qpel positions */
#define QPEL_MC_DECL(OP, OPT) \
void ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
const uint8_t *src, \
ptrdiff_t srcStride, \
int len, int m); \
void ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dstStride, \
const uint8_t *src, \
ptrdiff_t srcStride, \
int len, int m); \
QPEL_FUNCS_DECL(OP, 0, 1, OPT) \
QPEL_FUNCS_DECL(OP, 0, 3, OPT) \
QPEL_FUNCS_DECL(OP, 1, 0, OPT) \
QPEL_FUNCS_DECL(OP, 1, 1, OPT) \
QPEL_FUNCS_DECL(OP, 1, 2, OPT) \
QPEL_FUNCS_DECL(OP, 1, 3, OPT) \
QPEL_FUNCS_DECL(OP, 2, 1, OPT) \
QPEL_FUNCS_DECL(OP, 2, 2, OPT) \
QPEL_FUNCS_DECL(OP, 2, 3, OPT) \
QPEL_FUNCS_DECL(OP, 3, 0, OPT) \
QPEL_FUNCS_DECL(OP, 3, 1, OPT) \
QPEL_FUNCS_DECL(OP, 3, 2, OPT)
/** @} */
 
#define LOOPSIZE 8
#define HCOFF(x) (32 * (x - 1))
#define VCOFF(x) (32 * (x - 1))
QPEL_MC_DECL(put_, _ssse3)
QPEL_MC_DECL(avg_, _ssse3)
 
#undef LOOPSIZE
#undef HCOFF
#undef VCOFF
#define LOOPSIZE 8
#define HCOFF(x) (64 * (x - 1))
#define VCOFF(x) (64 * (x - 1))
QPEL_MC_DECL(put_, _sse2)
QPEL_MC_DECL(avg_, _sse2)
 
#if ARCH_X86_32
#undef LOOPSIZE
#undef HCOFF
#undef VCOFF
#define LOOPSIZE 4
#define HCOFF(x) (64 * (x - 1))
#define VCOFF(x) (64 * (x - 1))
 
QPEL_MC_DECL(put_, _mmx)
 
#define ff_put_rv40_qpel_h_mmxext ff_put_rv40_qpel_h_mmx
#define ff_put_rv40_qpel_v_mmxext ff_put_rv40_qpel_v_mmx
QPEL_MC_DECL(avg_, _mmxext)
 
#define ff_put_rv40_qpel_h_3dnow ff_put_rv40_qpel_h_mmx
#define ff_put_rv40_qpel_v_3dnow ff_put_rv40_qpel_v_mmx
QPEL_MC_DECL(avg_, _3dnow)
#endif
 
/** @{ */
/** Set one function */
#define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT) \
c-> OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] = OP ## rv40_qpel ##SIZE ## _mc ##PH ##PV ##OPT;
 
/** Set functions put and avg for sizes 8 and 16 and a given qpel position */
#define QPEL_FUNCS_SET(OP, PH, PV, OPT) \
QPEL_FUNC_SET(OP, 8, PH, PV, OPT) \
QPEL_FUNC_SET(OP, 16, PH, PV, OPT)
 
/** Set all functions for all sizes and qpel positions */
#define QPEL_MC_SET(OP, OPT) \
QPEL_FUNCS_SET (OP, 0, 1, OPT) \
QPEL_FUNCS_SET (OP, 0, 3, OPT) \
QPEL_FUNCS_SET (OP, 1, 0, OPT) \
QPEL_FUNCS_SET (OP, 1, 1, OPT) \
QPEL_FUNCS_SET (OP, 1, 2, OPT) \
QPEL_FUNCS_SET (OP, 1, 3, OPT) \
QPEL_FUNCS_SET (OP, 2, 1, OPT) \
QPEL_FUNCS_SET (OP, 2, 2, OPT) \
QPEL_FUNCS_SET (OP, 2, 3, OPT) \
QPEL_FUNCS_SET (OP, 3, 0, OPT) \
QPEL_FUNCS_SET (OP, 3, 1, OPT) \
QPEL_FUNCS_SET (OP, 3, 2, OPT)
/** @} */
 
#endif /* HAVE_YASM */
 
#if HAVE_MMX_INLINE
static void put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_put_pixels8_xy2_mmx(dst, src, stride, 8);
}
static void put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_put_pixels16_xy2_mmx(dst, src, stride, 16);
}
static void avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels8_xy2_mmx(dst, src, stride, 8);
}
static void avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src,
ptrdiff_t stride)
{
ff_avg_pixels16_xy2_mmx(dst, src, stride, 16);
}
#endif /* HAVE_MMX_INLINE */
 
av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c)
{
int cpu_flags = av_get_cpu_flags();
 
#if HAVE_MMX_INLINE
if (INLINE_MMX(cpu_flags)) {
c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_mmx;
c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_mmx;
c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_mmx;
c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_mmx;
}
#endif /* HAVE_MMX_INLINE */
 
#if HAVE_YASM
if (EXTERNAL_MMX(cpu_flags)) {
c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx;
c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx;
#if ARCH_X86_32
QPEL_MC_SET(put_, _mmx)
#endif
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow;
#if ARCH_X86_32
QPEL_MC_SET(avg_, _3dnow)
#endif
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext;
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext;
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmxext;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmxext;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmxext;
#if ARCH_X86_32
QPEL_MC_SET(avg_, _mmxext)
#endif
}
if (EXTERNAL_SSE2(cpu_flags)) {
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2;
QPEL_MC_SET(put_, _sse2)
QPEL_MC_SET(avg_, _sse2)
}
if (EXTERNAL_SSSE3(cpu_flags)) {
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3;
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3;
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3;
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3;
QPEL_MC_SET(put_, _ssse3)
QPEL_MC_SET(avg_, _ssse3)
}
#endif /* HAVE_YASM */
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/sbrdsp.asm
0,0 → 1,425
;******************************************************************************
;* AAC Spectral Band Replication decoding functions
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
; mask equivalent for multiply by -1.0 1.0
ps_mask times 2 dd 1<<31, 0
ps_mask2 times 2 dd 0, 1<<31
ps_neg times 4 dd 1<<31
ps_noise0 times 2 dd 1.0, 0.0
ps_noise2 times 2 dd -1.0, 0.0
ps_noise13 dd 0.0, 1.0, 0.0, -1.0
dd 0.0, -1.0, 0.0, 1.0
dd 0.0, 1.0, 0.0, -1.0
cextern sbr_noise_table
 
SECTION_TEXT
 
INIT_XMM sse
cglobal sbr_sum_square, 2, 3, 6
mov r2, r1
xorps m0, m0
xorps m1, m1
sar r2, 3
jz .prepare
.loop:
movu m2, [r0 + 0]
movu m3, [r0 + 16]
movu m4, [r0 + 32]
movu m5, [r0 + 48]
mulps m2, m2
mulps m3, m3
mulps m4, m4
mulps m5, m5
addps m0, m2
addps m1, m3
addps m0, m4
addps m1, m5
add r0, 64
dec r2
jnz .loop
.prepare:
and r1, 7
sar r1, 1
jz .end
; len is a multiple of 2, thus there are at least 4 elements to process
.endloop:
movu m2, [r0]
add r0, 16
mulps m2, m2
dec r1
addps m0, m2
jnz .endloop
.end:
addps m0, m1
movhlps m2, m0
addps m0, m2
movss m1, m0
shufps m0, m0, 1
addss m0, m1
%if ARCH_X86_64 == 0
movss r0m, m0
fld dword r0m
%endif
RET
 
%define STEP 40*4*2
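; void ff_sbr_hf_g_filt(float (*Y)[2], const float (*X_high)[40][2],
;                       const float *g_filt, int m_max, intptr_t ixh)
; Y[m] = X_high[m][ixh] * g_filt[m]; STEP is the byte stride between
; consecutive X_high[m] rows (40 complex floats = 320 bytes).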
cglobal sbr_hf_g_filt, 5, 6, 5
lea r1, [r1 + 8*r4] ; offset by ixh elements into X_high
mov r5, r3
and r3, 0xFC
lea r2, [r2 + r3*4]
lea r0, [r0 + r3*8]
neg r3
jz .loop1
.loop4:
movlps m0, [r2 + 4*r3 + 0]
movlps m1, [r2 + 4*r3 + 8]
movlps m2, [r1 + 0*STEP]
movlps m3, [r1 + 2*STEP]
movhps m2, [r1 + 1*STEP]
movhps m3, [r1 + 3*STEP]
unpcklps m0, m0
unpcklps m1, m1
mulps m0, m2
mulps m1, m3
movu [r0 + 8*r3 + 0], m0
movu [r0 + 8*r3 + 16], m1
add r1, 4*STEP
add r3, 4
jnz .loop4
and r5, 3 ; number of single element loops
jz .end
.loop1: ; elements 0 and 1 can be computed at the same time
movss m0, [r2]
movlps m2, [r1]
unpcklps m0, m0
mulps m2, m0
movlps [r0], m2
add r0, 8
add r2, 4
add r1, STEP
dec r5
jnz .loop1
.end:
RET
 
; static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2],
; const float alpha0[2], const float alpha1[2],
; float bw, int start, int end)
;
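; In the scalar reference (sbrdsp.c) each output is, roughly,
;   X_high[i] = X_low[i] + (alpha0*bw) (.) X_low[i-1] + (alpha1*bw^2) (.) X_low[i-2]
; with (.) a complex multiply; the code keeps the bw-scaled coefficients (and
; sign-flipped copies) in m1-m4 and indexes both buffers relative to 'end'
; with a negative offset that counts up to zero.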
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E
; load alpha factors
%define bw m0
%if ARCH_X86_64 == 0 || WIN64
movss bw, BWm
%endif
movlps m2, [alpha1q]
movlps m1, [alpha0q]
shufps bw, bw, 0
mulps m2, bw ; (a1[0] a1[1])*bw
mulps m1, bw ; (a0[0] a0[1])*bw = (a2 a3)
mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1)
mova m3, m1
mova m4, m2
 
; Set pointers
%if ARCH_X86_64 == 0 || WIN64
; start and end 6th and 7th args on stack
mov r2d, Sm
mov r3d, Em
%define start r2q
%define end r3q
%else
; BW does not actually occupy a register, so shift by 1
%define start BWq
%define end Sq
%endif
sub start, end ; neg num of loops
lea X_highq, [X_highq + end*2*4]
lea X_lowq, [X_lowq + end*2*4 - 2*2*4]
shl start, 3 ; offset from num loops
 
mova m0, [X_lowq + start]
shufps m3, m3, q1111
shufps m4, m4, q1111
xorps m3, [ps_mask]
shufps m1, m1, q0000
shufps m2, m2, q0000
xorps m4, [ps_mask]
.loop2:
movu m7, [X_lowq + start + 8] ; BbCc
mova m6, m0
mova m5, m7
shufps m0, m0, q2301 ; aAbB
shufps m7, m7, q2301 ; bBcC
mulps m0, m4
mulps m7, m3
mulps m6, m2
mulps m5, m1
addps m7, m0
mova m0, [X_lowq + start +16] ; CcDd
addps m7, m0
addps m6, m5
addps m7, m6
mova [X_highq + start], m7
add start, 16
jnz .loop2
RET
 
cglobal sbr_sum64x5, 1,2,4,z
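; z[k] += z[k+64] + z[k+128] + z[k+192] + z[k+256] for k = 0..63; each
; iteration below sums 8 floats from each of the five blocks (offsets are in
; bytes, so +256 is 64 floats).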
lea r1q, [zq+ 256]
.loop:
mova m0, [zq+ 0]
mova m2, [zq+ 16]
mova m1, [zq+ 256]
mova m3, [zq+ 272]
addps m0, [zq+ 512]
addps m2, [zq+ 528]
addps m1, [zq+ 768]
addps m3, [zq+ 784]
addps m0, [zq+1024]
addps m2, [zq+1040]
addps m0, m1
addps m2, m3
mova [zq], m0
mova [zq+16], m2
add zq, 32
cmp zq, r1q
jne .loop
REP_RET
 
INIT_XMM sse
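; void ff_sbr_qmf_post_shuffle(float W[32][2], const float *z)
; Scalar behaviour, roughly: W[k][0] = -z[63 - k], W[k][1] = z[k]; the loop
; reads z from both ends, negates and reverses the tail with ps_neg + shufps,
; then interleaves the two streams into W.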
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z
lea r2q, [zq + (64-4)*4]
mova m3, [ps_neg]
.loop:
mova m1, [zq]
xorps m0, m3, [r2q]
shufps m0, m0, m0, q0123
unpcklps m2, m0, m1
unpckhps m0, m0, m1
mova [Wq + 0], m2
mova [Wq + 16], m0
add Wq, 32
sub r2q, 16
add zq, 16
cmp zq, r2q
jl .loop
REP_RET
 
INIT_XMM sse
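; void ff_sbr_neg_odd_64(float *z)
; Flips the sign of every odd-indexed float in a 64-float buffer by XORing
; the sign bit (ps_mask2 covers lanes 1 and 3 of each group of four).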
cglobal sbr_neg_odd_64, 1,2,4,z
lea r1q, [zq+256]
.loop:
mova m0, [zq+ 0]
mova m1, [zq+16]
mova m2, [zq+32]
mova m3, [zq+48]
xorps m0, [ps_mask2]
xorps m1, [ps_mask2]
xorps m2, [ps_mask2]
xorps m3, [ps_mask2]
mova [zq+ 0], m0
mova [zq+16], m1
mova [zq+32], m2
mova [zq+48], m3
add zq, 64
cmp zq, r1q
jne .loop
REP_RET
 
; sbr_qmf_deint_bfly(float *v, const float *src0, const float *src1)
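; Scalar reference, roughly: v[i] = src0[i] - src1[63 - i] and
; v[127 - i] = src0[i] + src1[63 - i] for i = 0..63. The SSE and SSE2
; instances below differ only in how the element reversal is done
; (shufps vs. pshufd).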
%macro SBR_QMF_DEINT_BFLY 0
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c
mov cq, 64*4-2*mmsize
lea vrevq, [vq + 64*4]
.loop:
mova m0, [src0q+cq]
mova m1, [src1q]
mova m4, [src0q+cq+mmsize]
mova m5, [src1q+mmsize]
%if cpuflag(sse2)
pshufd m2, m0, q0123
pshufd m3, m1, q0123
pshufd m6, m4, q0123
pshufd m7, m5, q0123
%else
shufps m2, m0, m0, q0123
shufps m3, m1, m1, q0123
shufps m6, m4, m4, q0123
shufps m7, m5, m5, q0123
%endif
addps m5, m2
subps m0, m7
addps m1, m6
subps m4, m3
mova [vrevq], m1
mova [vrevq+mmsize], m5
mova [vq+cq], m0
mova [vq+cq+mmsize], m4
add src1q, 2*mmsize
add vrevq, 2*mmsize
sub cq, 2*mmsize
jge .loop
REP_RET
%endmacro
 
INIT_XMM sse
SBR_QMF_DEINT_BFLY
 
INIT_XMM sse2
SBR_QMF_DEINT_BFLY
 
INIT_XMM sse2
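; void ff_sbr_qmf_pre_shuffle(float *z)
; Roughly: interleaves negated, reversed samples from the top of z[0..63]
; with samples from the bottom half into z[64..127], then copies z[0..1]
; over z[64..65] (the final movq below).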
cglobal sbr_qmf_pre_shuffle, 1,4,6,z
%define OFFSET (32*4-2*mmsize)
mov r3q, OFFSET
lea r1q, [zq + (32+1)*4]
lea r2q, [zq + 64*4]
mova m5, [ps_neg]
.loop:
movu m0, [r1q]
movu m2, [r1q + mmsize]
movu m1, [zq + r3q + 4 + mmsize]
movu m3, [zq + r3q + 4]
 
pxor m2, m5
pxor m0, m5
pshufd m2, m2, q0123
pshufd m0, m0, q0123
SBUTTERFLY dq, 2, 3, 4
SBUTTERFLY dq, 0, 1, 4
mova [r2q + 2*r3q + 0*mmsize], m2
mova [r2q + 2*r3q + 1*mmsize], m3
mova [r2q + 2*r3q + 2*mmsize], m0
mova [r2q + 2*r3q + 3*mmsize], m1
add r1q, 2*mmsize
sub r3q, 2*mmsize
jge .loop
movq m2, [zq]
movq [r2q], m2
REP_RET
 
%ifdef PIC
%define NREGS 1
%if UNIX64
%define NOISE_TABLE r6q ; r5q is m_max
%else
%define NOISE_TABLE r5q
%endif
%else
%define NREGS 0
%define NOISE_TABLE sbr_noise_table
%endif
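; When assembling position-independent code the absolute address of
; sbr_noise_table cannot be encoded directly, so one extra GPR (NREGS) is
; reserved and LOAD_NST materialises the table address with lea; otherwise
; the symbol is referenced as an immediate displacement.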
 
%macro LOAD_NST 1
%ifdef PIC
lea NOISE_TABLE, [%1]
mova m0, [kxq + NOISE_TABLE]
%else
mova m0, [kxq + %1]
%endif
%endmacro
 
INIT_XMM sse2
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
mova m0, [ps_noise0]
jmp apply_noise_main
 
; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
and kxq, 1
shl kxq, 4
LOAD_NST ps_noise13
jmp apply_noise_main
 
; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
mova m0, [ps_noise2]
jmp apply_noise_main
 
; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m,
; const float *q_filt, int noise,
; int kx, int m_max)
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max
and kxq, 1
shl kxq, 4
LOAD_NST ps_noise13+16
 
apply_noise_main:
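; Shared tail for the four variants above. For each m (stepping through a
; negative byte offset in 'count' that counts up to zero):
;   Y[m] += s_m[m] ? s_m[m] * phi_sign : q_filt[m] * sbr_noise_table[noise]
; with m0 holding the phase/sign pattern selected by the caller and the noise
; index wrapped modulo 512 entries (the 'and noiseq, 0x1ff<<3').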
%if ARCH_X86_64 == 0 || WIN64
mov kxd, m_maxm
%define count kxq
%else
%define count m_maxq
%endif
dec noiseq
shl count, 2
%ifdef PIC
lea NOISE_TABLE, [sbr_noise_table]
%endif
lea Yq, [Yq + 2*count]
add s_mq, count
add q_filtq, count
shl noiseq, 3
pxor m5, m5
neg count
.loop:
mova m1, [q_filtq + count]
movu m3, [noiseq + NOISE_TABLE + 1*mmsize]
movu m4, [noiseq + NOISE_TABLE + 2*mmsize]
add noiseq, 2*mmsize
and noiseq, 0x1ff<<3
punpckhdq m2, m1, m1
punpckldq m1, m1
mulps m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise]
mulps m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise]
mova m3, [s_mq + count]
; TODO: replace by a vpermd in AVX2
punpckhdq m4, m3, m3
punpckldq m3, m3
pcmpeqd m6, m3, m5 ; m6 == 0
pcmpeqd m7, m4, m5 ; m7 == 0
mulps m3, m0 ; s_m[m] * phi_sign
mulps m4, m0 ; s_m[m] * phi_sign
pand m1, m6
pand m2, m7
movu m6, [Yq + 2*count]
movu m7, [Yq + 2*count + mmsize]
addps m3, m1
addps m4, m2
addps m6, m3
addps m7, m4
movu [Yq + 2*count], m6
movu [Yq + 2*count + mmsize], m7
add count, mmsize
jl .loop
RET
/contrib/sdk/sources/ffmpeg/libavcodec/x86/sbrdsp_init.c
0,0 → 1,76
/*
* AAC Spectral Band Replication decoding functions
* Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/sbrdsp.h"
 
float ff_sbr_sum_square_sse(float (*x)[2], int n);
void ff_sbr_sum64x5_sse(float *z);
void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2],
const float *g_filt, int m_max, intptr_t ixh);
void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2],
const float alpha0[2], const float alpha1[2],
float bw, int start, int end);
void ff_sbr_neg_odd_64_sse(float *z);
void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z);
void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1);
void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1);
void ff_sbr_qmf_pre_shuffle_sse2(float *z);
 
void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m,
const float *q_filt, int noise,
int kx, int m_max);
 
av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s)
{
int cpu_flags = av_get_cpu_flags();
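/* Function pointers are assigned in increasing ISA order, so a later block
 * overrides an earlier one when the CPU supports it (e.g. the SSE2
 * qmf_deint_bfly replaces the SSE version installed just above). */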
 
if (EXTERNAL_SSE(cpu_flags)) {
s->neg_odd_64 = ff_sbr_neg_odd_64_sse;
s->sum_square = ff_sbr_sum_square_sse;
s->sum64x5 = ff_sbr_sum64x5_sse;
s->hf_g_filt = ff_sbr_hf_g_filt_sse;
s->hf_gen = ff_sbr_hf_gen_sse;
s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse;
s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse;
}
 
if (EXTERNAL_SSE2(cpu_flags)) {
s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse2;
s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_sse2;
s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2;
s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2;
s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2;
s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2;
}
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/simple_idct.c
0,0 → 1,1167
/*
* Simple IDCT MMX
*
* Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
#include "libavcodec/simple_idct.h"
#include "libavutil/mem.h"
#include "dsputil_x86.h"
 
#if HAVE_INLINE_ASM
 
/*
23170.475006
22725.260826
21406.727617
19265.545870
16384.000000
12872.826198
8866.956905
4520.335430
*/
#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5
#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5
 
#define ROW_SHIFT 11
#define COL_SHIFT 20 // 6
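/* Each constant above is cos(i*M_PI/16)*sqrt(2) scaled by 2^14 and rounded,
 * matching the float values listed before them; e.g. C1 = 22725 comes from
 * 0.980785 * 1.414214 * 16384 = 22725.26. C4 is kept one below the exact
 * 16384, presumably to keep the pmaddwd sums inside 32 bits. The row pass
 * shifts its results right by ROW_SHIFT and the column pass by COL_SHIFT. */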
 
DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
 
DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,
// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,
// the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :)
// 0, 0, 0, 0,
// 0, 0, 0, 0,
 
C4, C4, C4, C4,
C4, -C4, C4, -C4,
 
C2, C6, C2, C6,
C6, -C2, C6, -C2,
 
C1, C3, C1, C3,
C5, C7, C5, C7,
 
C3, -C7, C3, -C7,
-C1, -C5, -C1, -C5,
 
C5, -C1, C5, -C1,
C7, C3, C7, C3,
 
C7, -C5, C7, -C5,
C3, -C1, C3, -C1
};
 
static inline void idct(int16_t *block)
{
LOCAL_ALIGNED_8(int64_t, align_tmp, [16]);
int16_t * const temp= (int16_t*)align_tmp;
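/* Two-pass 8x8 IDCT: a row pass (shift ROW_SHIFT) writes intermediate values
 * into temp, then a column pass (shift COL_SHIFT) writes the final samples
 * back into block. The DC_COND_ROW_IDCT and Z_COND_IDCT macros test whether
 * a row is DC-only or entirely zero and branch to the stripped-down column
 * IDCTs at labels 1-7, which skip arithmetic on the known-zero inputs. */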
 
__asm__ volatile(
#if 0 //Alternative, simpler variant
 
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
#rounder ", %%mm0 \n\t"\
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
"paddd %%mm0, %%mm0 \n\t" \
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
"movq %%mm7, " #dst " \n\t"\
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
"movq %%mm2, 24+" #dst " \n\t"\
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm0 \n\t"\
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
"movq %%mm2, 8+" #dst " \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
"movq %%mm4, 16+" #dst " \n\t"\
 
#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"movd %%mm7, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"movd %%mm2, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"movd %%mm4, 112+" #dst " \n\t"\
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movd %%mm2, 32+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"\
 
 
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
"movq "MANGLE(wm1010)", %%mm4 \n\t"\
"pand %%mm0, %%mm4 \n\t"\
"por %%mm1, %%mm4 \n\t"\
"por %%mm2, %%mm4 \n\t"\
"por %%mm3, %%mm4 \n\t"\
"packssdw %%mm4,%%mm4 \n\t"\
"movd %%mm4, %%eax \n\t"\
"orl %%eax, %%eax \n\t"\
"jz 1f \n\t"\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
#rounder ", %%mm0 \n\t"\
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
"paddd %%mm0, %%mm0 \n\t" \
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
"movq %%mm7, " #dst " \n\t"\
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
"movq %%mm2, 24+" #dst " \n\t"\
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm0 \n\t"\
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
"movq %%mm2, 8+" #dst " \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
"movq %%mm4, 16+" #dst " \n\t"\
"jmp 2f \n\t"\
"1: \n\t"\
"pslld $16, %%mm0 \n\t"\
"#paddd "MANGLE(d40000)", %%mm0 \n\t"\
"psrad $13, %%mm0 \n\t"\
"packssdw %%mm0, %%mm0 \n\t"\
"movq %%mm0, " #dst " \n\t"\
"movq %%mm0, 8+" #dst " \n\t"\
"movq %%mm0, 16+" #dst " \n\t"\
"movq %%mm0, 24+" #dst " \n\t"\
"2: \n\t"
 
 
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
 
DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
 
 
//IDCT( src0, src4, src1, src5, dst, shift)
COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
 
#else
 
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
"movq "MANGLE(wm1010)", %%mm4 \n\t"\
"pand %%mm0, %%mm4 \n\t"\
"por %%mm1, %%mm4 \n\t"\
"por %%mm2, %%mm4 \n\t"\
"por %%mm3, %%mm4 \n\t"\
"packssdw %%mm4,%%mm4 \n\t"\
"movd %%mm4, %%eax \n\t"\
"orl %%eax, %%eax \n\t"\
"jz 1f \n\t"\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
#rounder ", %%mm0 \n\t"\
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
"paddd %%mm0, %%mm0 \n\t" \
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
"movq %%mm7, " #dst " \n\t"\
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
"movq %%mm2, 24+" #dst " \n\t"\
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm0 \n\t"\
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
"movq %%mm2, 8+" #dst " \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
"movq %%mm4, 16+" #dst " \n\t"\
"jmp 2f \n\t"\
"1: \n\t"\
"pslld $16, %%mm0 \n\t"\
"paddd "MANGLE(d40000)", %%mm0 \n\t"\
"psrad $13, %%mm0 \n\t"\
"packssdw %%mm0, %%mm0 \n\t"\
"movq %%mm0, " #dst " \n\t"\
"movq %%mm0, 8+" #dst " \n\t"\
"movq %%mm0, 16+" #dst " \n\t"\
"movq %%mm0, 24+" #dst " \n\t"\
"2: \n\t"
 
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
"movq %%mm0, %%mm4 \n\t"\
"por %%mm1, %%mm4 \n\t"\
"por %%mm2, %%mm4 \n\t"\
"por %%mm3, %%mm4 \n\t"\
"packssdw %%mm4,%%mm4 \n\t"\
"movd %%mm4, %%eax \n\t"\
"orl %%eax, %%eax \n\t"\
"jz " #bt " \n\t"\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
#rounder ", %%mm0 \n\t"\
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
"paddd %%mm0, %%mm0 \n\t" \
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
"movq %%mm7, " #dst " \n\t"\
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
"movq %%mm2, 24+" #dst " \n\t"\
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm0 \n\t"\
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
"movq %%mm2, 8+" #dst " \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
"movq %%mm4, 16+" #dst " \n\t"\
 
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
#rounder ", %%mm4 \n\t"\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
#rounder ", %%mm0 \n\t"\
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\
"paddd %%mm0, %%mm0 \n\t" \
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\
"movq %%mm7, " #dst " \n\t"\
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
"movq %%mm2, 24+" #dst " \n\t"\
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm0 \n\t"\
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\
"movq %%mm2, 8+" #dst " \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\
"movq %%mm4, 16+" #dst " \n\t"\
 
//IDCT( src0, src4, src1, src5, dst, rounder, shift)
DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11)
Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
 
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"movd %%mm7, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"movd %%mm2, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"movd %%mm4, 112+" #dst " \n\t"\
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movd %%mm2, 32+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
 
 
//IDCT( src0, src4, src1, src5, dst, shift)
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
"jmp 9f \n\t"
 
"# .p2align 4 \n\t"\
"4: \n\t"
Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
 
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
"movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
"paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
"paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
"movd %%mm1, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"movd %%mm2, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"movd %%mm4, 112+" #dst " \n\t"\
"movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
"paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movd %%mm2, 32+" #dst " \n\t"\
"packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm1, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
 
//IDCT( src0, src4, src1, src5, dst, shift)
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
"jmp 9f \n\t"
 
"# .p2align 4 \n\t"\
"6: \n\t"
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
 
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
"movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
"paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
"paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\
"movd %%mm1, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"movd %%mm2, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"movd %%mm4, 112+" #dst " \n\t"\
"movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
"paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm1 \n\t" /* A3 a3 */\
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movd %%mm2, 32+" #dst " \n\t"\
"packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm1, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
 
 
//IDCT( src0, src4, src1, src5, dst, shift)
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
"jmp 9f \n\t"
 
"# .p2align 4 \n\t"\
"2: \n\t"
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
 
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"movd %%mm7, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\
"movd %%mm2, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"movd %%mm4, 112+" #dst " \n\t"\
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm2 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movd %%mm2, 32+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
 
//IDCT( src0, src4, src1, src5, dst, shift)
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
"jmp 9f \n\t"
 
"# .p2align 4 \n\t"\
"3: \n\t"
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 64(%2), %%mm3 \n\t"\
"pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm1 \n\t" /* A1 a1 */\
"paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"movd %%mm7, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\
"movd %%mm1, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"movd %%mm4, 112+" #dst " \n\t"\
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
"pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
"pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
"movq %%mm5, %%mm1 \n\t" /* A2 a2 */\
"paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm1 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
"paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movd %%mm1, 32+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm6, 48+" #dst " \n\t"\
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
 
 
//IDCT( src0, src4, src1, src5, dst, shift)
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
"jmp 9f \n\t"
 
"# .p2align 4 \n\t"\
"5: \n\t"
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
"movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
"paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\
"paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\
"psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\
"paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\
"paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\
"psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\
"psrad $" #shift ", %%mm4 \n\t"\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm3 \n\t"\
"packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\
"movq %%mm4, " #dst " \n\t"\
"psrad $" #shift ", %%mm0 \n\t"\
"packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\
"movq %%mm0, 16+" #dst " \n\t"\
"movq %%mm0, 96+" #dst " \n\t"\
"movq %%mm4, 112+" #dst " \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"psrad $" #shift ", %%mm6 \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movq %%mm5, 32+" #dst " \n\t"\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movq %%mm6, 48+" #dst " \n\t"\
"movq %%mm6, 64+" #dst " \n\t"\
"movq %%mm5, 80+" #dst " \n\t"
 
 
//IDCT( src0, src4, src1, src5, dst, shift)
IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
"jmp 9f \n\t"
 
 
"# .p2align 4 \n\t"\
"1: \n\t"
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\
"movq 64(%2), %%mm1 \n\t"\
"pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"psrad $" #shift ", %%mm7 \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"movq %%mm0, %%mm3 \n\t" /* A1 a1 */\
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\
"psrad $" #shift ", %%mm0 \n\t"\
"psrad $" #shift ", %%mm3 \n\t"\
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\
"movd %%mm7, " #dst " \n\t"\
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\
"movd %%mm0, 16+" #dst " \n\t"\
"packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\
"movd %%mm3, 96+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\
"movd %%mm4, 112+" #dst " \n\t"\
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\
"pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\
"pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\
"movq %%mm5, %%mm3 \n\t" /* A2 a2 */\
"paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\
"psrad $" #shift ", %%mm3 \n\t"\
"psrad $" #shift ", %%mm5 \n\t"\
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\
"paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\
"psrad $" #shift ", %%mm6 \n\t"\
"packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\
"movd %%mm3, 32+" #dst " \n\t"\
"psrad $" #shift ", %%mm4 \n\t"\
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\
"movd %%mm6, 48+" #dst " \n\t"\
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\
"movd %%mm4, 64+" #dst " \n\t"\
"movd %%mm5, 80+" #dst " \n\t"
 
 
//IDCT( src0, src4, src1, src5, dst, shift)
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
"jmp 9f \n\t"
 
 
"# .p2align 4 \n\t"
"7: \n\t"
#undef IDCT
#define IDCT(src0, src4, src1, src5, dst, shift) \
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"psrad $" #shift ", %%mm4 \n\t"\
"psrad $" #shift ", %%mm0 \n\t"\
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\
"movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\
"psrad $" #shift ", %%mm1 \n\t"\
"packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\
"movq %%mm4, " #dst " \n\t"\
"psrad $" #shift ", %%mm2 \n\t"\
"packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\
"movq %%mm0, 16+" #dst " \n\t"\
"movq %%mm0, 96+" #dst " \n\t"\
"movq %%mm4, 112+" #dst " \n\t"\
"movq %%mm0, 32+" #dst " \n\t"\
"movq %%mm4, 48+" #dst " \n\t"\
"movq %%mm4, 64+" #dst " \n\t"\
"movq %%mm0, 80+" #dst " \n\t"
 
//IDCT( src0, src4, src1, src5, dst, shift)
IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20)
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20)
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20)
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
 
 
#endif
 
/*
Input
00 40 04 44 20 60 24 64
10 30 14 34 50 70 54 74
01 41 03 43 21 61 23 63
11 31 13 33 51 71 53 73
02 42 06 46 22 62 26 66
12 32 16 36 52 72 56 76
05 45 07 47 25 65 27 67
15 35 17 37 55 75 57 77
 
Temp
00 04 10 14 20 24 30 34
40 44 50 54 60 64 70 74
01 03 11 13 21 23 31 33
41 43 51 53 61 63 71 73
02 06 12 16 22 26 32 36
42 46 52 56 62 66 72 76
05 07 15 17 25 27 35 37
45 47 55 57 65 67 75 77
*/
 
"9: \n\t"
:: "r" (block), "r" (temp), "r" (coeffs)
: "%eax"
);
}
 
void ff_simple_idct_mmx(int16_t *block)
{
idct(block);
}
 
//FIXME merge add/put into the idct
 
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block)
{
idct(block);
ff_put_pixels_clamped_mmx(block, dest, line_size);
}
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block)
{
idct(block);
ff_add_pixels_clamped_mmx(block, dest, line_size);
}
 
#endif /* HAVE_INLINE_ASM */
/contrib/sdk/sources/ffmpeg/libavcodec/x86/snowdsp.c
0,0 → 1,902
/*
* MMX and SSE2 optimized snow DSP utils
* Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/snow.h"
#include "libavcodec/snow_dwt.h"
#include "dsputil_x86.h"
 
#if HAVE_INLINE_ASM
 
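/*
 * Descriptive note: the routine below performs the inverse horizontal lifting
 * of Snow's integer 9/7 wavelet on one line. The low-pass half of the line is
 * in b[0..w2-1] and the high-pass half in b[w2..width-1]; four lifting passes
 * (using the W_AM..W_DS constants) undo the forward transform, and the final
 * block interleaves the two halves back into pixel order via temp. The SSE2
 * loops process 16 IDWTELEM values per iteration, while the *_lead_out
 * helpers finish the scalar tail.
 */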
static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){
const int w2= (width+1)>>1;
const int w_l= (width>>1);
const int w_r= w2 - 1;
int i;
 
{ // Lift 0
IDWTELEM * const ref = b + w2 - 1;
IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice
// (the first time erroneously), we allow the SSE2 code to run an extra pass.
// The savings in code and time are well worth having to store this value and
// calculate b[0] correctly afterwards.
 
i = 0;
__asm__ volatile(
"pcmpeqd %%xmm7, %%xmm7 \n\t"
"pcmpeqd %%xmm3, %%xmm3 \n\t"
"psllw $1, %%xmm3 \n\t"
"paddw %%xmm7, %%xmm3 \n\t"
"psllw $13, %%xmm3 \n\t"
::);
for(; i<w_l-15; i+=16){
__asm__ volatile(
"movdqu (%1), %%xmm1 \n\t"
"movdqu 16(%1), %%xmm5 \n\t"
"movdqu 2(%1), %%xmm2 \n\t"
"movdqu 18(%1), %%xmm6 \n\t"
"paddw %%xmm1, %%xmm2 \n\t"
"paddw %%xmm5, %%xmm6 \n\t"
"paddw %%xmm7, %%xmm2 \n\t"
"paddw %%xmm7, %%xmm6 \n\t"
"pmulhw %%xmm3, %%xmm2 \n\t"
"pmulhw %%xmm3, %%xmm6 \n\t"
"paddw (%0), %%xmm2 \n\t"
"paddw 16(%0), %%xmm6 \n\t"
"movdqa %%xmm2, (%0) \n\t"
"movdqa %%xmm6, 16(%0) \n\t"
:: "r"(&b[i]), "r"(&ref[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
}
 
{ // Lift 1
IDWTELEM * const dst = b+w2;
 
i = 0;
for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){
dst[i] = dst[i] - (b[i] + b[i + 1]);
}
for(; i<w_r-15; i+=16){
__asm__ volatile(
"movdqu (%1), %%xmm1 \n\t"
"movdqu 16(%1), %%xmm5 \n\t"
"movdqu 2(%1), %%xmm2 \n\t"
"movdqu 18(%1), %%xmm6 \n\t"
"paddw %%xmm1, %%xmm2 \n\t"
"paddw %%xmm5, %%xmm6 \n\t"
"movdqa (%0), %%xmm0 \n\t"
"movdqa 16(%0), %%xmm4 \n\t"
"psubw %%xmm2, %%xmm0 \n\t"
"psubw %%xmm6, %%xmm4 \n\t"
"movdqa %%xmm0, (%0) \n\t"
"movdqa %%xmm4, 16(%0) \n\t"
:: "r"(&dst[i]), "r"(&b[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
}
 
{ // Lift 2
IDWTELEM * const ref = b+w2 - 1;
IDWTELEM b_0 = b[0];
 
i = 0;
__asm__ volatile(
"psllw $15, %%xmm7 \n\t"
"pcmpeqw %%xmm6, %%xmm6 \n\t"
"psrlw $13, %%xmm6 \n\t"
"paddw %%xmm7, %%xmm6 \n\t"
::);
for(; i<w_l-15; i+=16){
__asm__ volatile(
"movdqu (%1), %%xmm0 \n\t"
"movdqu 16(%1), %%xmm4 \n\t"
"movdqu 2(%1), %%xmm1 \n\t"
"movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts
"paddw %%xmm6, %%xmm0 \n\t"
"paddw %%xmm6, %%xmm4 \n\t"
"paddw %%xmm7, %%xmm1 \n\t"
"paddw %%xmm7, %%xmm5 \n\t"
"pavgw %%xmm1, %%xmm0 \n\t"
"pavgw %%xmm5, %%xmm4 \n\t"
"psubw %%xmm7, %%xmm0 \n\t"
"psubw %%xmm7, %%xmm4 \n\t"
"psraw $1, %%xmm0 \n\t"
"psraw $1, %%xmm4 \n\t"
"movdqa (%0), %%xmm1 \n\t"
"movdqa 16(%0), %%xmm5 \n\t"
"paddw %%xmm1, %%xmm0 \n\t"
"paddw %%xmm5, %%xmm4 \n\t"
"psraw $2, %%xmm0 \n\t"
"psraw $2, %%xmm4 \n\t"
"paddw %%xmm1, %%xmm0 \n\t"
"paddw %%xmm5, %%xmm4 \n\t"
"movdqa %%xmm0, (%0) \n\t"
"movdqa %%xmm4, 16(%0) \n\t"
:: "r"(&b[i]), "r"(&ref[i])
: "memory"
);
}
snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS);
}
 
{ // Lift 3
IDWTELEM * const src = b+w2;
 
i = 0;
for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){
temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS);
}
for(; i<w_r-7; i+=8){
__asm__ volatile(
"movdqu 2(%1), %%xmm2 \n\t"
"movdqu 18(%1), %%xmm6 \n\t"
"paddw (%1), %%xmm2 \n\t"
"paddw 16(%1), %%xmm6 \n\t"
"movdqu (%0), %%xmm0 \n\t"
"movdqu 16(%0), %%xmm4 \n\t"
"paddw %%xmm2, %%xmm0 \n\t"
"paddw %%xmm6, %%xmm4 \n\t"
"psraw $1, %%xmm2 \n\t"
"psraw $1, %%xmm6 \n\t"
"paddw %%xmm0, %%xmm2 \n\t"
"paddw %%xmm4, %%xmm6 \n\t"
"movdqa %%xmm2, (%2) \n\t"
"movdqa %%xmm6, 16(%2) \n\t"
:: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
}
 
{
snow_interleave_line_header(&i, width, b, temp);
 
for (; (i & 0x3E) != 0x3E; i-=2){
b[i+1] = temp[i>>1];
b[i] = b[i>>1];
}
for (i-=62; i>=0; i-=64){
__asm__ volatile(
"movdqa (%1), %%xmm0 \n\t"
"movdqa 16(%1), %%xmm2 \n\t"
"movdqa 32(%1), %%xmm4 \n\t"
"movdqa 48(%1), %%xmm6 \n\t"
"movdqa (%1), %%xmm1 \n\t"
"movdqa 16(%1), %%xmm3 \n\t"
"movdqa 32(%1), %%xmm5 \n\t"
"movdqa 48(%1), %%xmm7 \n\t"
"punpcklwd (%2), %%xmm0 \n\t"
"punpcklwd 16(%2), %%xmm2 \n\t"
"punpcklwd 32(%2), %%xmm4 \n\t"
"punpcklwd 48(%2), %%xmm6 \n\t"
"movdqa %%xmm0, (%0) \n\t"
"movdqa %%xmm2, 32(%0) \n\t"
"movdqa %%xmm4, 64(%0) \n\t"
"movdqa %%xmm6, 96(%0) \n\t"
"punpckhwd (%2), %%xmm1 \n\t"
"punpckhwd 16(%2), %%xmm3 \n\t"
"punpckhwd 32(%2), %%xmm5 \n\t"
"punpckhwd 48(%2), %%xmm7 \n\t"
"movdqa %%xmm1, 16(%0) \n\t"
"movdqa %%xmm3, 48(%0) \n\t"
"movdqa %%xmm5, 80(%0) \n\t"
"movdqa %%xmm7, 112(%0) \n\t"
:: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1])
: "memory"
);
}
}
}
 
static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){
const int w2= (width+1)>>1;
const int w_l= (width>>1);
const int w_r= w2 - 1;
int i;
 
{ // Lift 0
IDWTELEM * const ref = b + w2 - 1;
 
i = 1;
b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS);
__asm__ volatile(
"pcmpeqw %%mm7, %%mm7 \n\t"
"pcmpeqw %%mm3, %%mm3 \n\t"
"psllw $1, %%mm3 \n\t"
"paddw %%mm7, %%mm3 \n\t"
"psllw $13, %%mm3 \n\t"
::);
for(; i<w_l-7; i+=8){
__asm__ volatile(
"movq (%1), %%mm2 \n\t"
"movq 8(%1), %%mm6 \n\t"
"paddw 2(%1), %%mm2 \n\t"
"paddw 10(%1), %%mm6 \n\t"
"paddw %%mm7, %%mm2 \n\t"
"paddw %%mm7, %%mm6 \n\t"
"pmulhw %%mm3, %%mm2 \n\t"
"pmulhw %%mm3, %%mm6 \n\t"
"paddw (%0), %%mm2 \n\t"
"paddw 8(%0), %%mm6 \n\t"
"movq %%mm2, (%0) \n\t"
"movq %%mm6, 8(%0) \n\t"
:: "r"(&b[i]), "r"(&ref[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS);
}
 
{ // Lift 1
IDWTELEM * const dst = b+w2;
 
i = 0;
for(; i<w_r-7; i+=8){
__asm__ volatile(
"movq (%1), %%mm2 \n\t"
"movq 8(%1), %%mm6 \n\t"
"paddw 2(%1), %%mm2 \n\t"
"paddw 10(%1), %%mm6 \n\t"
"movq (%0), %%mm0 \n\t"
"movq 8(%0), %%mm4 \n\t"
"psubw %%mm2, %%mm0 \n\t"
"psubw %%mm6, %%mm4 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm4, 8(%0) \n\t"
:: "r"(&dst[i]), "r"(&b[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS);
}
 
{ // Lift 2
IDWTELEM * const ref = b+w2 - 1;
 
i = 1;
b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS);
__asm__ volatile(
"psllw $15, %%mm7 \n\t"
"pcmpeqw %%mm6, %%mm6 \n\t"
"psrlw $13, %%mm6 \n\t"
"paddw %%mm7, %%mm6 \n\t"
::);
for(; i<w_l-7; i+=8){
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm4 \n\t"
"movq 2(%1), %%mm1 \n\t"
"movq 10(%1), %%mm5 \n\t"
"paddw %%mm6, %%mm0 \n\t"
"paddw %%mm6, %%mm4 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"paddw %%mm7, %%mm5 \n\t"
"pavgw %%mm1, %%mm0 \n\t"
"pavgw %%mm5, %%mm4 \n\t"
"psubw %%mm7, %%mm0 \n\t"
"psubw %%mm7, %%mm4 \n\t"
"psraw $1, %%mm0 \n\t"
"psraw $1, %%mm4 \n\t"
"movq (%0), %%mm1 \n\t"
"movq 8(%0), %%mm5 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm5, %%mm4 \n\t"
"psraw $2, %%mm0 \n\t"
"psraw $2, %%mm4 \n\t"
"paddw %%mm1, %%mm0 \n\t"
"paddw %%mm5, %%mm4 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm4, 8(%0) \n\t"
:: "r"(&b[i]), "r"(&ref[i])
: "memory"
);
}
snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l);
}
 
{ // Lift 3
IDWTELEM * const src = b+w2;
i = 0;
 
for(; i<w_r-7; i+=8){
__asm__ volatile(
"movq 2(%1), %%mm2 \n\t"
"movq 10(%1), %%mm6 \n\t"
"paddw (%1), %%mm2 \n\t"
"paddw 8(%1), %%mm6 \n\t"
"movq (%0), %%mm0 \n\t"
"movq 8(%0), %%mm4 \n\t"
"paddw %%mm2, %%mm0 \n\t"
"paddw %%mm6, %%mm4 \n\t"
"psraw $1, %%mm2 \n\t"
"psraw $1, %%mm6 \n\t"
"paddw %%mm0, %%mm2 \n\t"
"paddw %%mm4, %%mm6 \n\t"
"movq %%mm2, (%2) \n\t"
"movq %%mm6, 8(%2) \n\t"
:: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i])
: "memory"
);
}
snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS);
}
 
{
snow_interleave_line_header(&i, width, b, temp);
 
for (; (i & 0x1E) != 0x1E; i-=2){
b[i+1] = temp[i>>1];
b[i] = b[i>>1];
}
for (i-=30; i>=0; i-=32){
__asm__ volatile(
"movq (%1), %%mm0 \n\t"
"movq 8(%1), %%mm2 \n\t"
"movq 16(%1), %%mm4 \n\t"
"movq 24(%1), %%mm6 \n\t"
"movq (%1), %%mm1 \n\t"
"movq 8(%1), %%mm3 \n\t"
"movq 16(%1), %%mm5 \n\t"
"movq 24(%1), %%mm7 \n\t"
"punpcklwd (%2), %%mm0 \n\t"
"punpcklwd 8(%2), %%mm2 \n\t"
"punpcklwd 16(%2), %%mm4 \n\t"
"punpcklwd 24(%2), %%mm6 \n\t"
"movq %%mm0, (%0) \n\t"
"movq %%mm2, 16(%0) \n\t"
"movq %%mm4, 32(%0) \n\t"
"movq %%mm6, 48(%0) \n\t"
"punpckhwd (%2), %%mm1 \n\t"
"punpckhwd 8(%2), %%mm3 \n\t"
"punpckhwd 16(%2), %%mm5 \n\t"
"punpckhwd 24(%2), %%mm7 \n\t"
"movq %%mm1, 8(%0) \n\t"
"movq %%mm3, 24(%0) \n\t"
"movq %%mm5, 40(%0) \n\t"
"movq %%mm7, 56(%0) \n\t"
:: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1])
: "memory"
);
}
}
}
 
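/*
 * Descriptive note: the vertical compose helpers below apply all four 9/7
 * lifting steps across six adjacent lines (b0..b5) in a single pass over the
 * row. REG_d holds a byte index (IDWTELEM is 16-bit, so twice the element
 * count) that is stepped backwards by 64 bytes (SSE2) or 32 bytes (MMX) per
 * iteration; the scalar while() loops in the functions handle the unaligned
 * tail first. HAVE_7REGS is required because the asm needs the six line
 * pointers plus the index register.
 */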
#if HAVE_7REGS
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
""op" ("r",%%"REG_d"), %%"t0" \n\t"\
""op" 16("r",%%"REG_d"), %%"t1" \n\t"\
""op" 32("r",%%"REG_d"), %%"t2" \n\t"\
""op" 48("r",%%"REG_d"), %%"t3" \n\t"
 
#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)
 
#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)
 
#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
"psubw %%"s0", %%"t0" \n\t"\
"psubw %%"s1", %%"t1" \n\t"\
"psubw %%"s2", %%"t2" \n\t"\
"psubw %%"s3", %%"t3" \n\t"
 
#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
"movdqa %%"s0", ("w",%%"REG_d") \n\t"\
"movdqa %%"s1", 16("w",%%"REG_d") \n\t"\
"movdqa %%"s2", 32("w",%%"REG_d") \n\t"\
"movdqa %%"s3", 48("w",%%"REG_d") \n\t"
 
#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
"psraw $"n", %%"t0" \n\t"\
"psraw $"n", %%"t1" \n\t"\
"psraw $"n", %%"t2" \n\t"\
"psraw $"n", %%"t3" \n\t"
 
#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
"paddw %%"s0", %%"t0" \n\t"\
"paddw %%"s1", %%"t1" \n\t"\
"paddw %%"s2", %%"t2" \n\t"\
"paddw %%"s3", %%"t3" \n\t"
 
#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
"pmulhw %%"s0", %%"t0" \n\t"\
"pmulhw %%"s1", %%"t1" \n\t"\
"pmulhw %%"s2", %%"t2" \n\t"\
"pmulhw %%"s3", %%"t3" \n\t"
 
#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
"movdqa %%"s0", %%"t0" \n\t"\
"movdqa %%"s1", %%"t1" \n\t"\
"movdqa %%"s2", %%"t2" \n\t"\
"movdqa %%"s3", %%"t3" \n\t"
 
static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
x86_reg i = width;
 
while(i & 0x1F)
{
i--;
b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
}
i+=i;
 
__asm__ volatile (
"jmp 2f \n\t"
"1: \n\t"
snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6")
 
 
"pcmpeqw %%xmm0, %%xmm0 \n\t"
"pcmpeqw %%xmm2, %%xmm2 \n\t"
"paddw %%xmm2, %%xmm2 \n\t"
"paddw %%xmm0, %%xmm2 \n\t"
"psllw $13, %%xmm2 \n\t"
snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6")
 
"pcmpeqw %%xmm7, %%xmm7 \n\t"
"pcmpeqw %%xmm5, %%xmm5 \n\t"
"psllw $15, %%xmm7 \n\t"
"psrlw $13, %%xmm5 \n\t"
"paddw %%xmm7, %%xmm5 \n\t"
snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6")
"movq (%2,%%"REG_d"), %%xmm1 \n\t"
"movq 8(%2,%%"REG_d"), %%xmm3 \n\t"
"paddw %%xmm7, %%xmm1 \n\t"
"paddw %%xmm7, %%xmm3 \n\t"
"pavgw %%xmm1, %%xmm0 \n\t"
"pavgw %%xmm3, %%xmm2 \n\t"
"movq 16(%2,%%"REG_d"), %%xmm1 \n\t"
"movq 24(%2,%%"REG_d"), %%xmm3 \n\t"
"paddw %%xmm7, %%xmm1 \n\t"
"paddw %%xmm7, %%xmm3 \n\t"
"pavgw %%xmm1, %%xmm4 \n\t"
"pavgw %%xmm3, %%xmm6 \n\t"
snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
 
snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7")
snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6")
snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6")
 
"2: \n\t"
"sub $64, %%"REG_d" \n\t"
"jge 1b \n\t"
:"+d"(i)
:"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
}
 
#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
""op" ("r",%%"REG_d"), %%"t0" \n\t"\
""op" 8("r",%%"REG_d"), %%"t1" \n\t"\
""op" 16("r",%%"REG_d"), %%"t2" \n\t"\
""op" 24("r",%%"REG_d"), %%"t3" \n\t"
 
#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)
 
#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)
 
#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
"movq %%"s0", ("w",%%"REG_d") \n\t"\
"movq %%"s1", 8("w",%%"REG_d") \n\t"\
"movq %%"s2", 16("w",%%"REG_d") \n\t"\
"movq %%"s3", 24("w",%%"REG_d") \n\t"
 
#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
"movq %%"s0", %%"t0" \n\t"\
"movq %%"s1", %%"t1" \n\t"\
"movq %%"s2", %%"t2" \n\t"\
"movq %%"s3", %%"t3" \n\t"
 
 
static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){
x86_reg i = width;
while(i & 15)
{
i--;
b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS;
b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS;
b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS;
b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS;
}
i+=i;
__asm__ volatile(
"jmp 2f \n\t"
"1: \n\t"
 
snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7")
snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7")
"pcmpeqw %%mm0, %%mm0 \n\t"
"pcmpeqw %%mm2, %%mm2 \n\t"
"paddw %%mm2, %%mm2 \n\t"
"paddw %%mm0, %%mm2 \n\t"
"psllw $13, %%mm2 \n\t"
snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7")
snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7")
snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7")
snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7")
snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7")
snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6")
"pcmpeqw %%mm7, %%mm7 \n\t"
"pcmpeqw %%mm5, %%mm5 \n\t"
"psllw $15, %%mm7 \n\t"
"psrlw $13, %%mm5 \n\t"
"paddw %%mm7, %%mm5 \n\t"
snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6")
"movq (%2,%%"REG_d"), %%mm1 \n\t"
"movq 8(%2,%%"REG_d"), %%mm3 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"paddw %%mm7, %%mm3 \n\t"
"pavgw %%mm1, %%mm0 \n\t"
"pavgw %%mm3, %%mm2 \n\t"
"movq 16(%2,%%"REG_d"), %%mm1 \n\t"
"movq 24(%2,%%"REG_d"), %%mm3 \n\t"
"paddw %%mm7, %%mm1 \n\t"
"paddw %%mm7, %%mm3 \n\t"
"pavgw %%mm1, %%mm4 \n\t"
"pavgw %%mm3, %%mm6 \n\t"
snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6")
snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
 
snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7")
snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6")
snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6")
snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6")
 
"2: \n\t"
"sub $32, %%"REG_d" \n\t"
"jge 1b \n\t"
:"+d"(i)
:"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5));
}
#endif //HAVE_7REGS
 
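/*
 * Descriptive note: inner_add_yblock is the OBMC output stage. Each macro
 * iteration weights four overlapping predicted blocks (pointers in the
 * block[] array, kept in REG_a) by the obmc window (REG_S), adds the weighted
 * sums to the corresponding slice-buffer line, then rounds, shifts the
 * FRAC_BITS fractional precision away and stores the clamped 8-bit result to
 * dst8. The *_end macros advance the block pointers, the slice-buffer line
 * and dst8 to the next line(s).
 */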
#define snow_inner_add_yblock_sse2_header \
IDWTELEM * * dst_array = sb->line + src_y;\
x86_reg tmp;\
__asm__ volatile(\
"mov %7, %%"REG_c" \n\t"\
"mov %6, %2 \n\t"\
"mov %4, %%"REG_S" \n\t"\
"pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
"pcmpeqd %%xmm3, %%xmm3 \n\t"\
"psllw $15, %%xmm3 \n\t"\
"psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
"1: \n\t"\
"mov %1, %%"REG_D" \n\t"\
"mov (%%"REG_D"), %%"REG_D" \n\t"\
"add %3, %%"REG_D" \n\t"
 
#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
"mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
"movq (%%"REG_d"), %%"out_reg1" \n\t"\
"movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
"punpcklbw %%xmm7, %%"out_reg1" \n\t"\
"punpcklbw %%xmm7, %%"out_reg2" \n\t"\
"movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
"movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"punpcklbw %%xmm7, %%xmm4 \n\t"\
"pmullw %%xmm0, %%"out_reg1" \n\t"\
"pmullw %%xmm4, %%"out_reg2" \n\t"
 
#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
"mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
"movq (%%"REG_d"), %%"out_reg1" \n\t"\
"movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
"punpcklbw %%xmm7, %%"out_reg1" \n\t"\
"punpcklbw %%xmm7, %%"out_reg2" \n\t"\
"movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
"movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
"punpcklbw %%xmm7, %%xmm0 \n\t"\
"punpcklbw %%xmm7, %%xmm4 \n\t"\
"pmullw %%xmm0, %%"out_reg1" \n\t"\
"pmullw %%xmm4, %%"out_reg2" \n\t"
 
#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
"paddusw %%xmm2, %%xmm1 \n\t"\
"paddusw %%xmm6, %%xmm5 \n\t"
 
#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
"paddusw %%xmm2, %%xmm1 \n\t"\
"paddusw %%xmm6, %%xmm5 \n\t"
 
#define snow_inner_add_yblock_sse2_end_common1\
"add $32, %%"REG_S" \n\t"\
"add %%"REG_c", %0 \n\t"\
"add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
"add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
"add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
"add %%"REG_c", (%%"REG_a") \n\t"
 
#define snow_inner_add_yblock_sse2_end_common2\
"jnz 1b \n\t"\
:"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
:\
"rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
"%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
 
#define snow_inner_add_yblock_sse2_end_8\
"sal $1, %%"REG_c" \n\t"\
"add"OPSIZE" $"PTR_SIZE"*2, %1 \n\t"\
snow_inner_add_yblock_sse2_end_common1\
"sar $1, %%"REG_c" \n\t"\
"sub $2, %2 \n\t"\
snow_inner_add_yblock_sse2_end_common2
 
#define snow_inner_add_yblock_sse2_end_16\
"add"OPSIZE" $"PTR_SIZE"*1, %1 \n\t"\
snow_inner_add_yblock_sse2_end_common1\
"dec %2 \n\t"\
snow_inner_add_yblock_sse2_end_common2
 
static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_8("2", "8")
snow_inner_add_yblock_sse2_accum_8("1", "128")
snow_inner_add_yblock_sse2_accum_8("0", "136")
 
"mov %0, %%"REG_d" \n\t"
"movdqa (%%"REG_D"), %%xmm0 \n\t"
"movdqa %%xmm1, %%xmm2 \n\t"
 
"punpckhwd %%xmm7, %%xmm1 \n\t"
"punpcklwd %%xmm7, %%xmm2 \n\t"
"paddd %%xmm2, %%xmm0 \n\t"
"movdqa 16(%%"REG_D"), %%xmm2 \n\t"
"paddd %%xmm1, %%xmm2 \n\t"
"paddd %%xmm3, %%xmm0 \n\t"
"paddd %%xmm3, %%xmm2 \n\t"
 
"mov %1, %%"REG_D" \n\t"
"mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t"
"add %3, %%"REG_D" \n\t"
 
"movdqa (%%"REG_D"), %%xmm4 \n\t"
"movdqa %%xmm5, %%xmm6 \n\t"
"punpckhwd %%xmm7, %%xmm5 \n\t"
"punpcklwd %%xmm7, %%xmm6 \n\t"
"paddd %%xmm6, %%xmm4 \n\t"
"movdqa 16(%%"REG_D"), %%xmm6 \n\t"
"paddd %%xmm5, %%xmm6 \n\t"
"paddd %%xmm3, %%xmm4 \n\t"
"paddd %%xmm3, %%xmm6 \n\t"
 
"psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */
"psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */
"packssdw %%xmm2, %%xmm0 \n\t"
"packuswb %%xmm7, %%xmm0 \n\t"
"movq %%xmm0, (%%"REG_d") \n\t"
 
"psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */
"psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */
"packssdw %%xmm6, %%xmm4 \n\t"
"packuswb %%xmm7, %%xmm4 \n\t"
"movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t"
snow_inner_add_yblock_sse2_end_8
}
 
static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_sse2_header
snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0")
snow_inner_add_yblock_sse2_accum_16("2", "16")
snow_inner_add_yblock_sse2_accum_16("1", "512")
snow_inner_add_yblock_sse2_accum_16("0", "528")
 
"mov %0, %%"REG_d" \n\t"
"psrlw $4, %%xmm1 \n\t"
"psrlw $4, %%xmm5 \n\t"
"paddw (%%"REG_D"), %%xmm1 \n\t"
"paddw 16(%%"REG_D"), %%xmm5 \n\t"
"paddw %%xmm3, %%xmm1 \n\t"
"paddw %%xmm3, %%xmm5 \n\t"
"psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */
"psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */
"packuswb %%xmm5, %%xmm1 \n\t"
 
"movdqu %%xmm1, (%%"REG_d") \n\t"
 
snow_inner_add_yblock_sse2_end_16
}
 
#define snow_inner_add_yblock_mmx_header \
IDWTELEM * * dst_array = sb->line + src_y;\
x86_reg tmp;\
__asm__ volatile(\
"mov %7, %%"REG_c" \n\t"\
"mov %6, %2 \n\t"\
"mov %4, %%"REG_S" \n\t"\
"pxor %%mm7, %%mm7 \n\t" /* 0 */\
"pcmpeqd %%mm3, %%mm3 \n\t"\
"psllw $15, %%mm3 \n\t"\
"psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\
"1: \n\t"\
"mov %1, %%"REG_D" \n\t"\
"mov (%%"REG_D"), %%"REG_D" \n\t"\
"add %3, %%"REG_D" \n\t"
 
#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\
"mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
"movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\
"movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\
"punpcklbw %%mm7, %%"out_reg1" \n\t"\
"punpcklbw %%mm7, %%"out_reg2" \n\t"\
"movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\
"movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\
"punpcklbw %%mm7, %%mm0 \n\t"\
"punpcklbw %%mm7, %%mm4 \n\t"\
"pmullw %%mm0, %%"out_reg1" \n\t"\
"pmullw %%mm4, %%"out_reg2" \n\t"
 
#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \
snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\
"paddusw %%mm2, %%mm1 \n\t"\
"paddusw %%mm6, %%mm5 \n\t"
 
#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\
"mov %0, %%"REG_d" \n\t"\
"psrlw $4, %%mm1 \n\t"\
"psrlw $4, %%mm5 \n\t"\
"paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\
"paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\
"paddw %%mm3, %%mm1 \n\t"\
"paddw %%mm3, %%mm5 \n\t"\
"psraw $4, %%mm1 \n\t"\
"psraw $4, %%mm5 \n\t"\
"packuswb %%mm5, %%mm1 \n\t"\
"movq %%mm1, "write_offset"(%%"REG_d") \n\t"
 
#define snow_inner_add_yblock_mmx_end(s_step)\
"add $"s_step", %%"REG_S" \n\t"\
"add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
"add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
"add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
"add %%"REG_c", (%%"REG_a") \n\t"\
"add"OPSIZE " $"PTR_SIZE"*1, %1 \n\t"\
"add %%"REG_c", %0 \n\t"\
"dec %2 \n\t"\
"jnz 1b \n\t"\
:"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
:\
"rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
"%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"");
 
static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "8", "0")
snow_inner_add_yblock_mmx_accum("1", "128", "0")
snow_inner_add_yblock_mmx_accum("0", "136", "0")
snow_inner_add_yblock_mmx_mix("0", "0")
snow_inner_add_yblock_mmx_end("16")
}
 
static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h,
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){
snow_inner_add_yblock_mmx_header
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0")
snow_inner_add_yblock_mmx_accum("2", "16", "0")
snow_inner_add_yblock_mmx_accum("1", "512", "0")
snow_inner_add_yblock_mmx_accum("0", "528", "0")
snow_inner_add_yblock_mmx_mix("0", "0")
 
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8")
snow_inner_add_yblock_mmx_accum("2", "24", "8")
snow_inner_add_yblock_mmx_accum("1", "520", "8")
snow_inner_add_yblock_mmx_accum("0", "536", "8")
snow_inner_add_yblock_mmx_mix("16", "8")
snow_inner_add_yblock_mmx_end("32")
}
 
static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
 
if (b_w == 16)
inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
else if (b_w == 8 && obmc_stride == 16) {
if (!(b_h & 1))
inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
else
inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
} else
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}
 
static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h,
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){
if (b_w == 16)
inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
else if (b_w == 8 && obmc_stride == 16)
inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
else
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8);
}
 
#endif /* HAVE_INLINE_ASM */
 
void ff_dwt_init_x86(SnowDWTContext *c)
{
#if HAVE_INLINE_ASM
int mm_flags = av_get_cpu_flags();
 
if (mm_flags & AV_CPU_FLAG_MMX) {
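/* The "& 0" below makes the SSE2 condition always false, so the SSE2
 * horizontal/vertical compose and inner_add_yblock are currently disabled
 * and only the MMX/MMXEXT versions get installed. */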
if(mm_flags & AV_CPU_FLAG_SSE2 & 0){
c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2;
#if HAVE_7REGS
c->vertical_compose97i = ff_snow_vertical_compose97i_sse2;
#endif
c->inner_add_yblock = ff_snow_inner_add_yblock_sse2;
}
else{
if (mm_flags & AV_CPU_FLAG_MMXEXT) {
c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx;
#if HAVE_7REGS
c->vertical_compose97i = ff_snow_vertical_compose97i_mmx;
#endif
}
c->inner_add_yblock = ff_snow_inner_add_yblock_mmx;
}
}
#endif /* HAVE_INLINE_ASM */
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/v210-init.c
0,0 → 1,48
/*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/cpu.h"
#include "libavcodec/v210dec.h"
 
extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
 
extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width);
 
av_cold void v210_x86_init(V210DecContext *s)
{
int cpu_flags = av_get_cpu_flags();
 
#if HAVE_YASM
if (s->aligned_input) {
if (cpu_flags & AV_CPU_FLAG_SSSE3)
s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3;
 
if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
s->unpack_frame = ff_v210_planar_unpack_aligned_avx;
}
else {
if (cpu_flags & AV_CPU_FLAG_SSSE3)
s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3;
 
if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX)
s->unpack_frame = ff_v210_planar_unpack_unaligned_avx;
}
#endif
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/v210.asm
0,0 → 1,88
;******************************************************************************
;* V210 SIMD unpack
;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu>
;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
v210_mask: times 4 dd 0x3ff
v210_mult: dw 64,4,64,4,64,4,64,4
v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1
v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1
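; Every 16 input bytes hold four 32-bit words, each packing three 10-bit
; components (6 luma plus 3 Cb and 3 Cr samples in total). The pmullw/psrlw
; pair in the loop isolates the low and high component of each word,
; psrld/pand isolates the middle one, and the shuffle tables regroup the
; results into six consecutive luma samples and the u0..u2 / v0..v2 halves
; written with movq/movhps.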
 
SECTION .text
 
%macro v210_planar_unpack 2
 
; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width)
cglobal v210_planar_unpack_%1_%2, 5, 5, 7
movsxdifnidn r4, r4d
lea r1, [r1+2*r4]
add r2, r4
add r3, r4
neg r4
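; The output pointers were advanced to their line ends above and r4 now
; counts up from -width towards zero, so negative indexing plus a single
; "add r4, 6 / jl .loop" both advances the outputs and terminates the loop.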
 
mova m3, [v210_mult]
mova m4, [v210_mask]
mova m5, [v210_luma_shuf]
mova m6, [v210_chroma_shuf]
.loop
%ifidn %1, unaligned
movu m0, [r0]
%else
mova m0, [r0]
%endif
 
pmullw m1, m0, m3
psrld m0, 10
psrlw m1, 6 ; u0 v0 y1 y2 v1 u2 y4 y5
pand m0, m4 ; y0 __ u1 __ y3 __ v2 __
 
shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __
pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __
movu [r1+2*r4], m2
 
shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __
pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __
movq [r2+r4], m1
movhps [r3+r4], m1
 
add r0, mmsize
add r4, 6
jl .loop
 
REP_RET
%endmacro
 
INIT_XMM
v210_planar_unpack unaligned, ssse3
%if HAVE_AVX_EXTERNAL
INIT_AVX
v210_planar_unpack unaligned, avx
%endif
 
INIT_XMM
v210_planar_unpack aligned, ssse3
%if HAVE_AVX_EXTERNAL
INIT_AVX
v210_planar_unpack aligned, avx
%endif
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vc1dsp.asm
0,0 → 1,317
;******************************************************************************
;* VC1 deblocking optimizations
;* Copyright (c) 2009 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
cextern pw_4
cextern pw_5
 
section .text
 
; dst_low, dst_high (src), zero
; zero-extends one vector from 8 to 16 bits
%macro UNPACK_8TO16 4
mova m%2, m%3
punpckh%1 m%3, m%4
punpckl%1 m%2, m%4
%endmacro
 
%macro STORE_4_WORDS 6
%if cpuflag(sse4)
pextrw %1, %5, %6+0
pextrw %2, %5, %6+1
pextrw %3, %5, %6+2
pextrw %4, %5, %6+3
%else
movd %6d, %5
%if mmsize==16
psrldq %5, 4
%else
psrlq %5, 32
%endif
mov %1, %6w
shr %6, 16
mov %2, %6w
movd %6d, %5
mov %3, %6w
shr %6, 16
mov %4, %6w
%endif
%endmacro
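; STORE_4_WORDS writes four 16-bit lanes of %5 to the four (possibly
; unaligned) memory operands %1-%4: directly with pextrw on SSE4, where %6 is
; the starting lane index, otherwise by extracting pairs of lanes into the
; scratch GPR %6 and storing 16 bits at a time.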
 
; in: p1 p0 q0 q1, clobbers p0
; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
%macro VC1_LOOP_FILTER_A0 4
psubw %1, %4
psubw %2, %3
paddw %1, %1
pmullw %2, [pw_5]
psubw %1, %2
paddw %1, [pw_4]
psraw %1, 3
%endmacro
 
; in: p0 q0 a0 a1 a2
; m0 m1 m7 m6 m5
; %1: size
; out: m0=p0' m1=q0'
%macro VC1_FILTER 1
PABSW m4, m7
PABSW m3, m6
PABSW m2, m5
mova m6, m4
pminsw m3, m2
pcmpgtw m6, m3 ; if (a2 < a0 || a1 < a0)
psubw m3, m4
pmullw m3, [pw_5] ; 5*(a3 - a0)
PABSW m2, m3
psraw m2, 3 ; abs(d/8)
pxor m7, m3 ; d_sign ^= a0_sign
 
pxor m5, m5
movd m3, r2d
%if %1 > 4
punpcklbw m3, m3
%endif
punpcklbw m3, m5
pcmpgtw m3, m4 ; if (a0 < pq)
pand m6, m3
 
mova m3, m0
psubw m3, m1
PABSW m4, m3
psraw m4, 1
pxor m3, m7 ; d_sign ^ clip_sign
psraw m3, 15
pminsw m2, m4 ; min(d, clip)
pcmpgtw m4, m5
pand m6, m4 ; filt3 (C return value)
 
; each set of 4 pixels is not filtered if the 3rd is not
%if mmsize==16
pshuflw m4, m6, 0xaa
%if %1 > 4
pshufhw m4, m4, 0xaa
%endif
%else
pshufw m4, m6, 0xaa
%endif
pandn m3, m4
pand m2, m6
pand m3, m2 ; d final
 
psraw m7, 15
pxor m3, m7
psubw m3, m7
psubw m0, m3
paddw m1, m3
packuswb m0, m0
packuswb m1, m1
%endmacro
 
; 1st param: size of filter
; 2nd param: mov suffix equivalent to the filter size
%macro VC1_V_LOOP_FILTER 2
pxor m5, m5
mov%2 m6, [r4]
mov%2 m4, [r4+r1]
mov%2 m7, [r4+2*r1]
mov%2 m0, [r4+r3]
punpcklbw m6, m5
punpcklbw m4, m5
punpcklbw m7, m5
punpcklbw m0, m5
 
VC1_LOOP_FILTER_A0 m6, m4, m7, m0
mov%2 m1, [r0]
mov%2 m2, [r0+r1]
punpcklbw m1, m5
punpcklbw m2, m5
mova m4, m0
VC1_LOOP_FILTER_A0 m7, m4, m1, m2
mov%2 m3, [r0+2*r1]
mov%2 m4, [r0+r3]
punpcklbw m3, m5
punpcklbw m4, m5
mova m5, m1
VC1_LOOP_FILTER_A0 m5, m2, m3, m4
 
VC1_FILTER %1
mov%2 [r4+r3], m0
mov%2 [r0], m1
%endmacro
 
; 1st param: size of filter
; NOTE: with this filter size, that many 8-bit values occupy half a register, as assumed by UNPACK_8TO16
; 2nd (optional) param: temp register to use for storing words
%macro VC1_H_LOOP_FILTER 1-2
%if %1 == 4
movq m0, [r0 -4]
movq m1, [r0+ r1-4]
movq m2, [r0+2*r1-4]
movq m3, [r0+ r3-4]
TRANSPOSE4x4B 0, 1, 2, 3, 4
%else
movq m0, [r0 -4]
movq m4, [r0+ r1-4]
movq m1, [r0+2*r1-4]
movq m5, [r0+ r3-4]
movq m2, [r4 -4]
movq m6, [r4+ r1-4]
movq m3, [r4+2*r1-4]
movq m7, [r4+ r3-4]
punpcklbw m0, m4
punpcklbw m1, m5
punpcklbw m2, m6
punpcklbw m3, m7
TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif
pxor m5, m5
 
UNPACK_8TO16 bw, 6, 0, 5
UNPACK_8TO16 bw, 7, 1, 5
VC1_LOOP_FILTER_A0 m6, m0, m7, m1
UNPACK_8TO16 bw, 4, 2, 5
mova m0, m1 ; m0 = p0
VC1_LOOP_FILTER_A0 m7, m1, m4, m2
UNPACK_8TO16 bw, 1, 3, 5
mova m5, m4
VC1_LOOP_FILTER_A0 m5, m2, m1, m3
SWAP 1, 4 ; m1 = q0
 
VC1_FILTER %1
punpcklbw m0, m1
%if %0 > 1
STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
%if %1 > 4
psrldq m0, 4
STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
%endif
%else
STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
%endif
%endmacro
 
 
%macro START_V_FILTER 0
mov r4, r0
lea r3, [4*r1]
sub r4, r3
lea r3, [r1+2*r1]
imul r2, 0x01010101
%endmacro
 
%macro START_H_FILTER 1
lea r3, [r1+2*r1]
%if %1 > 4
lea r4, [r0+4*r1]
%endif
imul r2, 0x01010101
%endmacro
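; In both START_* macros the imul by 0x01010101 replicates the pq byte into
; every byte of r2, so VC1_FILTER can broadcast it to packed words with a
; simple movd + punpcklbw.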
 
%macro VC1_LF 0
cglobal vc1_v_loop_filter_internal
VC1_V_LOOP_FILTER 4, d
ret
 
cglobal vc1_h_loop_filter_internal
VC1_H_LOOP_FILTER 4, r4
ret
 
; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4, 3,5,0
START_V_FILTER
call vc1_v_loop_filter_internal
RET
 
; void ff_vc1_h_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4, 3,5,0
START_H_FILTER 4
call vc1_h_loop_filter_internal
RET
 
; void ff_vc1_v_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,0
START_V_FILTER
call vc1_v_loop_filter_internal
add r4, 4
add r0, 4
call vc1_v_loop_filter_internal
RET
 
; void ff_vc1_h_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,5,0
START_H_FILTER 4
call vc1_h_loop_filter_internal
lea r0, [r0+4*r1]
call vc1_h_loop_filter_internal
RET
%endmacro
 
INIT_MMX mmxext
VC1_LF
 
INIT_XMM sse2
; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,8
START_V_FILTER
VC1_V_LOOP_FILTER 8, q
RET
 
; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,6,8
START_H_FILTER 8
VC1_H_LOOP_FILTER 8, r5
RET
 
INIT_MMX ssse3
; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter4, 3,5,0
START_V_FILTER
VC1_V_LOOP_FILTER 4, d
RET
 
; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter4, 3,5,0
START_H_FILTER 4
VC1_H_LOOP_FILTER 4, r4
RET
 
INIT_XMM ssse3
; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_v_loop_filter8, 3,5,8
START_V_FILTER
VC1_V_LOOP_FILTER 8, q
RET
 
; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,6,8
START_H_FILTER 8
VC1_H_LOOP_FILTER 8, r5
RET
 
INIT_XMM sse4
; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
cglobal vc1_h_loop_filter8, 3,5,8
START_H_FILTER 8
VC1_H_LOOP_FILTER 8
RET
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vc1dsp.h
0,0 → 1,29
/*
* VC-1 and WMV3 decoder - X86 DSP init functions
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#ifndef AVCODEC_X86_VC1DSP_H
#define AVCODEC_X86_VC1DSP_H
 
#include "libavcodec/vc1dsp.h"
 
void ff_vc1dsp_init_mmx(VC1DSPContext *dsp);
void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp);
 
#endif /* AVCODEC_X86_VC1DSP_H */
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vc1dsp_init.c
0,0 → 1,131
/*
* VC-1 and WMV3 - DSP functions MMX-optimized
* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
 
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vc1dsp.h"
#include "dsputil_x86.h"
#include "vc1dsp.h"
#include "config.h"
 
#define LOOP_FILTER(EXT) \
void ff_vc1_v_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_v_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
void ff_vc1_h_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \
\
static void vc1_v_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
{ \
ff_vc1_v_loop_filter8_ ## EXT(src, stride, pq); \
ff_vc1_v_loop_filter8_ ## EXT(src+8, stride, pq); \
} \
\
static void vc1_h_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \
{ \
ff_vc1_h_loop_filter8_ ## EXT(src, stride, pq); \
ff_vc1_h_loop_filter8_ ## EXT(src+8*stride, stride, pq); \
}
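/* The 16-pixel loop filters are assembled from two 8-pixel calls: the
 * vertical filter covers the left and right 8 columns, the horizontal
 * filter the upper and lower 8 rows. */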
 
#if HAVE_YASM
LOOP_FILTER(mmxext)
LOOP_FILTER(sse2)
LOOP_FILTER(ssse3)
 
void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq);
 
static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq)
{
ff_vc1_h_loop_filter8_sse4(src, stride, pq);
ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq);
}
 
static void avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride, int rnd)
{
ff_avg_pixels8_mmxext(dst, src, stride, 8);
}
#endif /* HAVE_YASM */
 
void ff_put_vc1_chroma_mc8_nornd_mmx (uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_mmxext(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_3dnow(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src,
int stride, int h, int x, int y);
 
 
av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp)
{
int cpu_flags = av_get_cpu_flags();
 
if (INLINE_MMX(cpu_flags))
ff_vc1dsp_init_mmx(dsp);
 
if (INLINE_MMXEXT(cpu_flags))
ff_vc1dsp_init_mmxext(dsp);
 
#define ASSIGN_LF(EXT) \
dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_ ## EXT; \
dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_ ## EXT; \
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_ ## EXT; \
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_ ## EXT; \
dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_ ## EXT; \
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT
 
#if HAVE_YASM
if (EXTERNAL_MMX(cpu_flags)) {
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow;
}
if (EXTERNAL_MMXEXT(cpu_flags)) {
ASSIGN_LF(mmxext);
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext;
 
dsp->avg_vc1_mspel_pixels_tab[0] = avg_vc1_mspel_mc00_mmxext;
}
if (EXTERNAL_SSE2(cpu_flags)) {
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2;
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse2;
dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2;
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2;
}
if (EXTERNAL_SSSE3(cpu_flags)) {
ASSIGN_LF(ssse3);
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_ssse3;
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_ssse3;
}
if (EXTERNAL_SSE4(cpu_flags)) {
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse4;
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse4;
}
#endif /* HAVE_YASM */
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vc1dsp_mmx.c
0,0 → 1,757
/*
* VC-1 and WMV3 - DSP functions MMX-optimized
* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr>
*
* Permission is hereby granted, free of charge, to any person
* obtaining a copy of this software and associated documentation
* files (the "Software"), to deal in the Software without
* restriction, including without limitation the rights to use,
* copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following
* conditions:
*
* The above copyright notice and this permission notice shall be
* included in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY,
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
* OTHER DEALINGS IN THE SOFTWARE.
*/
 
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vc1dsp.h"
#include "constants.h"
#include "dsputil_x86.h"
#include "vc1dsp.h"
 
#if HAVE_INLINE_ASM
 
#define OP_PUT(S,D)
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t"
 
/** Add the rounder from mm7 to mm3 and mm4, then shift both right by SHIFT */
#define NORMALIZE_MMX(SHIFT) \
"paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \
"paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \
"psraw "SHIFT", %%mm3 \n\t" \
"psraw "SHIFT", %%mm4 \n\t"
 
#define TRANSFER_DO_PACK(OP) \
"packuswb %%mm4, %%mm3 \n\t" \
OP((%2), %%mm3) \
"movq %%mm3, (%2) \n\t"
 
#define TRANSFER_DONT_PACK(OP) \
OP(0(%2), %%mm3) \
OP(8(%2), %%mm4) \
"movq %%mm3, 0(%2) \n\t" \
"movq %%mm4, 8(%2) \n\t"
 
/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */
#define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t"
#define DONT_UNPACK(reg)
 
/** Load the rounder (32-r or 8-r) and broadcast it to all four words of mm7 */
#define LOAD_ROUNDER_MMX(ROUND) \
"movd "ROUND", %%mm7 \n\t" \
"punpcklwd %%mm7, %%mm7 \n\t" \
"punpckldq %%mm7, %%mm7 \n\t"
 
#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \
"paddw %%mm"#R2", %%mm"#R1" \n\t" \
"movd (%0,%3), %%mm"#R0" \n\t" \
"pmullw %%mm6, %%mm"#R1" \n\t" \
"punpcklbw %%mm0, %%mm"#R0" \n\t" \
"movd (%0,%2), %%mm"#R3" \n\t" \
"psubw %%mm"#R0", %%mm"#R1" \n\t" \
"punpcklbw %%mm0, %%mm"#R3" \n\t" \
"paddw %%mm7, %%mm"#R1" \n\t" \
"psubw %%mm"#R3", %%mm"#R1" \n\t" \
"psraw %4, %%mm"#R1" \n\t" \
"movq %%mm"#R1", "#OFF"(%1) \n\t" \
"add %2, %0 \n\t"
 
/** Sacrificing mm6 allows pipelining loads from src */
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst,
const uint8_t *src, x86_reg stride,
int rnd, int64_t shift)
{
__asm__ volatile(
"mov $3, %%"REG_c" \n\t"
LOAD_ROUNDER_MMX("%5")
"movq "MANGLE(ff_pw_9)", %%mm6 \n\t"
"1: \n\t"
"movd (%0), %%mm2 \n\t"
"add %2, %0 \n\t"
"movd (%0), %%mm3 \n\t"
"punpcklbw %%mm0, %%mm2 \n\t"
"punpcklbw %%mm0, %%mm3 \n\t"
SHIFT2_LINE( 0, 1, 2, 3, 4)
SHIFT2_LINE( 24, 2, 3, 4, 1)
SHIFT2_LINE( 48, 3, 4, 1, 2)
SHIFT2_LINE( 72, 4, 1, 2, 3)
SHIFT2_LINE( 96, 1, 2, 3, 4)
SHIFT2_LINE(120, 2, 3, 4, 1)
SHIFT2_LINE(144, 3, 4, 1, 2)
SHIFT2_LINE(168, 4, 1, 2, 3)
"sub %6, %0 \n\t"
"add $8, %1 \n\t"
"dec %%"REG_c" \n\t"
"jnz 1b \n\t"
: "+r"(src), "+r"(dst)
: "r"(stride), "r"(-2*stride),
"m"(shift), "m"(rnd), "r"(9*stride-4)
: "%"REG_c, "memory"
);
}
 
/**
* The data is already unpacked, so some operations can be performed directly
* from memory.
*/
#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\
const int16_t *src, int rnd)\
{\
int h = 8;\
\
src -= 1;\
rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\
__asm__ volatile(\
LOAD_ROUNDER_MMX("%4")\
"movq "MANGLE(ff_pw_128)", %%mm6\n\t"\
"movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\
"1: \n\t"\
"movq 2*0+0(%1), %%mm1 \n\t"\
"movq 2*0+8(%1), %%mm2 \n\t"\
"movq 2*1+0(%1), %%mm3 \n\t"\
"movq 2*1+8(%1), %%mm4 \n\t"\
"paddw 2*3+0(%1), %%mm1 \n\t"\
"paddw 2*3+8(%1), %%mm2 \n\t"\
"paddw 2*2+0(%1), %%mm3 \n\t"\
"paddw 2*2+8(%1), %%mm4 \n\t"\
"pmullw %%mm5, %%mm3 \n\t"\
"pmullw %%mm5, %%mm4 \n\t"\
"psubw %%mm1, %%mm3 \n\t"\
"psubw %%mm2, %%mm4 \n\t"\
NORMALIZE_MMX("$7")\
/* Remove bias */\
"paddw %%mm6, %%mm3 \n\t"\
"paddw %%mm6, %%mm4 \n\t"\
TRANSFER_DO_PACK(OP)\
"add $24, %1 \n\t"\
"add %3, %2 \n\t"\
"decl %0 \n\t"\
"jnz 1b \n\t"\
: "+r"(h), "+r" (src), "+r" (dst)\
: "r"(stride), "m"(rnd)\
: "memory"\
);\
}
 
VC1_HOR_16b_SHIFT2(OP_PUT, put_)
VC1_HOR_16b_SHIFT2(OP_AVG, avg_)
 
 
/**
* Purely vertical or horizontal 1/2 shift interpolation.
* Sacrifice mm6 for the *9 factor.
*/
#define VC1_SHIFT2(OP, OPNAME)\
static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\
x86_reg stride, int rnd, x86_reg offset)\
{\
rnd = 8-rnd;\
__asm__ volatile(\
"mov $8, %%"REG_c" \n\t"\
LOAD_ROUNDER_MMX("%5")\
"movq "MANGLE(ff_pw_9)", %%mm6\n\t"\
"1: \n\t"\
"movd 0(%0 ), %%mm3 \n\t"\
"movd 4(%0 ), %%mm4 \n\t"\
"movd 0(%0,%2), %%mm1 \n\t"\
"movd 4(%0,%2), %%mm2 \n\t"\
"add %2, %0 \n\t"\
"punpcklbw %%mm0, %%mm3 \n\t"\
"punpcklbw %%mm0, %%mm4 \n\t"\
"punpcklbw %%mm0, %%mm1 \n\t"\
"punpcklbw %%mm0, %%mm2 \n\t"\
"paddw %%mm1, %%mm3 \n\t"\
"paddw %%mm2, %%mm4 \n\t"\
"movd 0(%0,%3), %%mm1 \n\t"\
"movd 4(%0,%3), %%mm2 \n\t"\
"pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\
"pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\
"punpcklbw %%mm0, %%mm1 \n\t"\
"punpcklbw %%mm0, %%mm2 \n\t"\
"psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\
"psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\
"movd 0(%0,%2), %%mm1 \n\t"\
"movd 4(%0,%2), %%mm2 \n\t"\
"punpcklbw %%mm0, %%mm1 \n\t"\
"punpcklbw %%mm0, %%mm2 \n\t"\
"psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\
"psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\
NORMALIZE_MMX("$4")\
"packuswb %%mm4, %%mm3 \n\t"\
OP((%1), %%mm3)\
"movq %%mm3, (%1) \n\t"\
"add %6, %0 \n\t"\
"add %4, %1 \n\t"\
"dec %%"REG_c" \n\t"\
"jnz 1b \n\t"\
: "+r"(src), "+r"(dst)\
: "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\
"g"(stride-offset)\
: "%"REG_c, "memory"\
);\
}
 
VC1_SHIFT2(OP_PUT, put_)
VC1_SHIFT2(OP_AVG, avg_)
 
/**
* Core of the 1/4 and 3/4 shift bicubic interpolation.
*
* @param UNPACK Macro unpacking arguments from 8 to 16bits (can be empty).
* @param MOVQ "movd 1", or "movq 2" if the data read is already unpacked.
* @param A1 Address of 1st tap (beware of unpacked/packed).
* @param A2 Address of 2nd tap
* @param A3 Address of 3rd tap
* @param A4 Address of 4th tap
*/
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \
MOVQ "*0+"A1", %%mm1 \n\t" \
MOVQ "*4+"A1", %%mm2 \n\t" \
UNPACK("%%mm1") \
UNPACK("%%mm2") \
"pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \
"pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \
MOVQ "*0+"A2", %%mm3 \n\t" \
MOVQ "*4+"A2", %%mm4 \n\t" \
UNPACK("%%mm3") \
UNPACK("%%mm4") \
"pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
"pmullw %%mm6, %%mm4 \n\t" /* *18 */ \
"psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \
"psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \
MOVQ "*0+"A4", %%mm1 \n\t" \
MOVQ "*4+"A4", %%mm2 \n\t" \
UNPACK("%%mm1") \
UNPACK("%%mm2") \
"psllw $2, %%mm1 \n\t" /* 4* */ \
"psllw $2, %%mm2 \n\t" /* 4* */ \
"psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \
"psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \
MOVQ "*0+"A3", %%mm1 \n\t" \
MOVQ "*4+"A3", %%mm2 \n\t" \
UNPACK("%%mm1") \
UNPACK("%%mm2") \
"pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
"pmullw %%mm5, %%mm2 \n\t" /* *53 */ \
"paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \
"paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */
 
/**
* Macro to build the vertical 16-bit version of vc1_put_shift[13].
* Here, offset=src_stride. Parameters passed A1 to A4 must use
* %3 (src_stride) and %4 (3*src_stride).
*
* @param NAME Either 1 or 3
* @see MSPEL_FILTER13_CORE for information on A1->A4
*/
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \
static void \
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \
x86_reg src_stride, \
int rnd, int64_t shift) \
{ \
int h = 8; \
src -= src_stride; \
__asm__ volatile( \
LOAD_ROUNDER_MMX("%5") \
"movq "MANGLE(ff_pw_53)", %%mm5\n\t" \
"movq "MANGLE(ff_pw_18)", %%mm6\n\t" \
".p2align 3 \n\t" \
"1: \n\t" \
MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
NORMALIZE_MMX("%6") \
TRANSFER_DONT_PACK(OP_PUT) \
/* Last 3 (in fact 4) bytes on the line */ \
"movd 8+"A1", %%mm1 \n\t" \
DO_UNPACK("%%mm1") \
"movq %%mm1, %%mm3 \n\t" \
"paddw %%mm1, %%mm1 \n\t" \
"paddw %%mm3, %%mm1 \n\t" /* 3* */ \
"movd 8+"A2", %%mm3 \n\t" \
DO_UNPACK("%%mm3") \
"pmullw %%mm6, %%mm3 \n\t" /* *18 */ \
"psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \
"movd 8+"A3", %%mm1 \n\t" \
DO_UNPACK("%%mm1") \
"pmullw %%mm5, %%mm1 \n\t" /* *53 */ \
"paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \
"movd 8+"A4", %%mm1 \n\t" \
DO_UNPACK("%%mm1") \
"psllw $2, %%mm1 \n\t" /* 4* */ \
"psubw %%mm1, %%mm3 \n\t" \
"paddw %%mm7, %%mm3 \n\t" \
"psraw %6, %%mm3 \n\t" \
"movq %%mm3, 16(%2) \n\t" \
"add %3, %1 \n\t" \
"add $24, %2 \n\t" \
"decl %0 \n\t" \
"jnz 1b \n\t" \
: "+r"(h), "+r" (src), "+r" (dst) \
: "r"(src_stride), "r"(3*src_stride), \
"m"(rnd), "m"(shift) \
: "memory" \
); \
}
 
/**
* Macro to build the horizontal 16-bit version of vc1_put_shift[13].
* Here the source is 16-bit data, so the parameters passed as A1 to A4 should be simple constant offsets.
*
* @param NAME Either 1 or 3
* @see MSPEL_FILTER13_CORE for information on A1->A4
*/
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \
const int16_t *src, int rnd) \
{ \
int h = 8; \
src -= 1; \
rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \
__asm__ volatile( \
LOAD_ROUNDER_MMX("%4") \
"movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
"movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
".p2align 3 \n\t" \
"1: \n\t" \
MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \
NORMALIZE_MMX("$7") \
/* Remove bias */ \
"paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \
"paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \
TRANSFER_DO_PACK(OP) \
"add $24, %1 \n\t" \
"add %3, %2 \n\t" \
"decl %0 \n\t" \
"jnz 1b \n\t" \
: "+r"(h), "+r" (src), "+r" (dst) \
: "r"(stride), "m"(rnd) \
: "memory" \
); \
}
 
/**
* Macro to build the 8-bit, any-direction version of vc1_put_shift[13].
* Here, offset=src_stride. Parameters passed A1 to A4 must use
* %3 (offset) and %4 (3*offset).
*
* @param NAME Either 1 or 3
* @see MSPEL_FILTER13_CORE for information on A1->A4
*/
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \
static void \
OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \
x86_reg stride, int rnd, x86_reg offset) \
{ \
int h = 8; \
src -= offset; \
rnd = 32-rnd; \
__asm__ volatile ( \
LOAD_ROUNDER_MMX("%6") \
"movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \
"movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \
".p2align 3 \n\t" \
"1: \n\t" \
MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \
NORMALIZE_MMX("$6") \
TRANSFER_DO_PACK(OP) \
"add %5, %1 \n\t" \
"add %5, %2 \n\t" \
"decl %0 \n\t" \
"jnz 1b \n\t" \
: "+r"(h), "+r" (src), "+r" (dst) \
: "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \
: "memory" \
); \
}
 
/** 1/4 shift bicubic interpolation */
MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_)
MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )")
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_)
 
/** 3/4 shift bicubic interpolation */
MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_)
MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_)
MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )")
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_)
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_)
 
typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift);
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd);
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset);
 
/**
* Interpolate fractional pel values by applying proper vertical then
* horizontal filter.
*
* @param dst Destination buffer for interpolated pels.
* @param src Source buffer.
* @param stride Stride for both src and dst buffers.
* @param hmode Horizontal filter (expressed in quarter pixels shift).
* @param vmode Vertical filter (expressed in quarter pixels shift).
* @param rnd Rounding bias.
*/
#define VC1_MSPEL_MC(OP)\
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\
int hmode, int vmode, int rnd)\
{\
static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\
{ NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\
static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\
{ NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\
static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\
{ NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\
\
__asm__ volatile(\
"pxor %%mm0, %%mm0 \n\t"\
::: "memory"\
);\
\
if (vmode) { /* Vertical filter to apply */\
if (hmode) { /* Horizontal filter to apply, output to tmp */\
static const int shift_value[] = { 0, 5, 1, 5 };\
int shift = (shift_value[hmode]+shift_value[vmode])>>1;\
int r;\
DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\
\
r = (1<<(shift-1)) + rnd-1;\
vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\
\
vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\
return;\
}\
else { /* No horizontal filter, output 8 lines to dst */\
vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\
return;\
}\
}\
\
/* Horizontal mode with no vertical mode */\
vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\
}
 
VC1_MSPEL_MC(put_)
VC1_MSPEL_MC(avg_)
 
/** Macro to ease bicubic filter interpolation functions declarations */
#define DECLARE_FUNCTION(a, b) \
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride, \
int rnd) \
{ \
put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}\
static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \
const uint8_t *src, \
ptrdiff_t stride, \
int rnd) \
{ \
avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \
}
 
DECLARE_FUNCTION(0, 1)
DECLARE_FUNCTION(0, 2)
DECLARE_FUNCTION(0, 3)
 
DECLARE_FUNCTION(1, 0)
DECLARE_FUNCTION(1, 1)
DECLARE_FUNCTION(1, 2)
DECLARE_FUNCTION(1, 3)
 
DECLARE_FUNCTION(2, 0)
DECLARE_FUNCTION(2, 1)
DECLARE_FUNCTION(2, 2)
DECLARE_FUNCTION(2, 3)
 
DECLARE_FUNCTION(3, 0)
DECLARE_FUNCTION(3, 1)
DECLARE_FUNCTION(3, 2)
DECLARE_FUNCTION(3, 3)
 
static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize,
int16_t *block)
{
int dc = block[0];
dc = (17 * dc + 4) >> 3;
dc = (17 * dc + 64) >> 7;
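/* Worked example (added note, values verifiable by hand): for block[0] == 64
 * the two stages give (17*64 + 4) >> 3 = 136 and then (17*136 + 64) >> 7 = 18,
 * i.e. roughly dc * 289/1024 -- the DC gain of applying the 4-point "17"
 * transform once per dimension with the usual >>3 and >>7 normalizations. */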
__asm__ volatile(
"movd %0, %%mm0 \n\t"
"pshufw $0, %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"psubw %%mm0, %%mm1 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
::"r"(dc)
);
__asm__ volatile(
"movd %0, %%mm2 \n\t"
"movd %1, %%mm3 \n\t"
"movd %2, %%mm4 \n\t"
"movd %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movd %%mm2, %0 \n\t"
"movd %%mm3, %1 \n\t"
"movd %%mm4, %2 \n\t"
"movd %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dest+0*linesize)),
"+m"(*(uint32_t*)(dest+1*linesize)),
"+m"(*(uint32_t*)(dest+2*linesize)),
"+m"(*(uint32_t*)(dest+3*linesize))
);
}
 
static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize,
int16_t *block)
{
int dc = block[0];
dc = (17 * dc + 4) >> 3;
dc = (12 * dc + 64) >> 7;
__asm__ volatile(
"movd %0, %%mm0 \n\t"
"pshufw $0, %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"psubw %%mm0, %%mm1 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
::"r"(dc)
);
__asm__ volatile(
"movd %0, %%mm2 \n\t"
"movd %1, %%mm3 \n\t"
"movd %2, %%mm4 \n\t"
"movd %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movd %%mm2, %0 \n\t"
"movd %%mm3, %1 \n\t"
"movd %%mm4, %2 \n\t"
"movd %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dest+0*linesize)),
"+m"(*(uint32_t*)(dest+1*linesize)),
"+m"(*(uint32_t*)(dest+2*linesize)),
"+m"(*(uint32_t*)(dest+3*linesize))
);
dest += 4*linesize;
__asm__ volatile(
"movd %0, %%mm2 \n\t"
"movd %1, %%mm3 \n\t"
"movd %2, %%mm4 \n\t"
"movd %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movd %%mm2, %0 \n\t"
"movd %%mm3, %1 \n\t"
"movd %%mm4, %2 \n\t"
"movd %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dest+0*linesize)),
"+m"(*(uint32_t*)(dest+1*linesize)),
"+m"(*(uint32_t*)(dest+2*linesize)),
"+m"(*(uint32_t*)(dest+3*linesize))
);
}
 
static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize,
int16_t *block)
{
int dc = block[0];
dc = ( 3 * dc + 1) >> 1;
dc = (17 * dc + 64) >> 7;
__asm__ volatile(
"movd %0, %%mm0 \n\t"
"pshufw $0, %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"psubw %%mm0, %%mm1 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
::"r"(dc)
);
__asm__ volatile(
"movq %0, %%mm2 \n\t"
"movq %1, %%mm3 \n\t"
"movq %2, %%mm4 \n\t"
"movq %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movq %%mm2, %0 \n\t"
"movq %%mm3, %1 \n\t"
"movq %%mm4, %2 \n\t"
"movq %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dest+0*linesize)),
"+m"(*(uint32_t*)(dest+1*linesize)),
"+m"(*(uint32_t*)(dest+2*linesize)),
"+m"(*(uint32_t*)(dest+3*linesize))
);
}
 
static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize,
int16_t *block)
{
int dc = block[0];
dc = (3 * dc + 1) >> 1;
dc = (3 * dc + 16) >> 5;
__asm__ volatile(
"movd %0, %%mm0 \n\t"
"pshufw $0, %%mm0, %%mm0 \n\t"
"pxor %%mm1, %%mm1 \n\t"
"psubw %%mm0, %%mm1 \n\t"
"packuswb %%mm0, %%mm0 \n\t"
"packuswb %%mm1, %%mm1 \n\t"
::"r"(dc)
);
__asm__ volatile(
"movq %0, %%mm2 \n\t"
"movq %1, %%mm3 \n\t"
"movq %2, %%mm4 \n\t"
"movq %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movq %%mm2, %0 \n\t"
"movq %%mm3, %1 \n\t"
"movq %%mm4, %2 \n\t"
"movq %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dest+0*linesize)),
"+m"(*(uint32_t*)(dest+1*linesize)),
"+m"(*(uint32_t*)(dest+2*linesize)),
"+m"(*(uint32_t*)(dest+3*linesize))
);
dest += 4*linesize;
__asm__ volatile(
"movq %0, %%mm2 \n\t"
"movq %1, %%mm3 \n\t"
"movq %2, %%mm4 \n\t"
"movq %3, %%mm5 \n\t"
"paddusb %%mm0, %%mm2 \n\t"
"paddusb %%mm0, %%mm3 \n\t"
"paddusb %%mm0, %%mm4 \n\t"
"paddusb %%mm0, %%mm5 \n\t"
"psubusb %%mm1, %%mm2 \n\t"
"psubusb %%mm1, %%mm3 \n\t"
"psubusb %%mm1, %%mm4 \n\t"
"psubusb %%mm1, %%mm5 \n\t"
"movq %%mm2, %0 \n\t"
"movq %%mm3, %1 \n\t"
"movq %%mm4, %2 \n\t"
"movq %%mm5, %3 \n\t"
:"+m"(*(uint32_t*)(dest+0*linesize)),
"+m"(*(uint32_t*)(dest+1*linesize)),
"+m"(*(uint32_t*)(dest+2*linesize)),
"+m"(*(uint32_t*)(dest+3*linesize))
);
}
 
static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src,
ptrdiff_t stride, int rnd)
{
ff_put_pixels8_mmx(dst, src, stride, 8);
}
 
av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp)
{
dsp->put_vc1_mspel_pixels_tab[ 0] = put_vc1_mspel_mc00_mmx;
dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx;
dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx;
dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx;
 
dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx;
dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx;
dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx;
dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx;
 
dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx;
dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx;
dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx;
dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx;
 
dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx;
dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx;
dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx;
dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx;
}
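 
/* Layout note (derived from the assignments above, added for clarity): the
 * mspel table is indexed as put_vc1_mspel_pixels_tab[hmode + 4*vmode], so for
 * example mc12 (hmode = 1, vmode = 2) lands at index 1 + 4*2 = 9. */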
 
av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp)
{
dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext;
dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext;
dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext;
 
dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext;
dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext;
dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext;
dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext;
 
dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext;
dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext;
dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext;
dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext;
 
dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext;
dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext;
dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext;
dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext;
 
dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext;
dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext;
dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext;
dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext;
}
#endif /* HAVE_INLINE_ASM */
/contrib/sdk/sources/ffmpeg/libavcodec/x86/videodsp.asm
0,0 → 1,444
;******************************************************************************
;* Core video DSP functions
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION .text
 
; slow vertical extension loop function. Works with variable-width, and
; does per-line reading/writing of source data
 
%macro V_COPY_ROW 2 ; type (top/body/bottom), h
.%1_y_loop: ; do {
mov wq, r7mp ; initialize w (r7mp = wmp)
.%1_x_loop: ; do {
movu m0, [srcq+wq] ; m0 = read($mmsize)
movu [dstq+wq], m0 ; write(m0, $mmsize)
add wq, mmsize ; w -= $mmsize
cmp wq, -mmsize ; } while (w > $mmsize);
jl .%1_x_loop
movu m0, [srcq-mmsize] ; m0 = read($mmsize)
movu [dstq-mmsize], m0 ; write(m0, $mmsize)
%ifidn %1, body ; if ($type == body) {
add srcq, src_strideq ; src += src_stride
%endif ; }
add dstq, dst_strideq ; dst += dst_stride
dec %2 ; } while (--$h);
jnz .%1_y_loop
%endmacro
 
%macro vvar_fn 0
; .----. <- zero
; | | <- top is copied from first line in body of source
; |----| <- start_y
; | | <- body is copied verbatim (line-by-line) from source
; |----| <- end_y
; | | <- bottom is copied from last line in body of source
; '----' <- bh
%if ARCH_X86_64
cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \
start_y, end_y, bh, w
%else ; x86-32
cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w
%define src_strideq r3mp
%define dst_strideq r1mp
mov srcq, r2mp
mov start_yq, r4mp
mov end_yq, r5mp
mov bhq, r6mp
%endif
sub bhq, end_yq ; bh -= end_y
sub end_yq, start_yq ; end_y -= start_y
add srcq, r7mp ; (r7mp = wmp)
add dstq, r7mp ; (r7mp = wmp)
neg r7mp ; (r7mp = wmp)
test start_yq, start_yq ; if (start_q) {
jz .body
V_COPY_ROW top, start_yq ; v_copy_row(top, start_yq)
.body: ; }
V_COPY_ROW body, end_yq ; v_copy_row(body, end_yq)
test bhq, bhq ; if (bh) {
jz .end
sub srcq, src_strideq ; src -= src_stride
V_COPY_ROW bottom, bhq ; v_copy_row(bottom, bh)
.end: ; }
RET
%endmacro
 
%if ARCH_X86_32
INIT_MMX mmx
vvar_fn
%endif
 
INIT_XMM sse
vvar_fn
 
%macro hvar_fn 0
cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w
lea dstq, [dstq+n_wordsq*2]
neg n_wordsq
lea start_xq, [start_xq+n_wordsq*2]
.y_loop: ; do {
; FIXME also write a ssse3 version using pshufb
movzx wd, byte [dstq+start_xq] ; w = read(1)
imul wd, 0x01010101 ; w *= 0x01010101
movd m0, wd
mov wq, n_wordsq ; initialize w
%if cpuflag(sse2)
pshufd m0, m0, q0000 ; splat
%else ; mmx
punpckldq m0, m0 ; splat
%endif ; mmx/sse
.x_loop: ; do {
movu [dstq+wq*2], m0 ; write($reg, $mmsize)
add wq, mmsize/2 ; w -= $mmsize/2
cmp wq, -mmsize/2 ; } while (w > $mmsize/2)
jl .x_loop
movu [dstq-mmsize], m0 ; write($reg, $mmsize)
add dstq, dst_strideq ; dst += dst_stride
dec hq ; } while (h--)
jnz .y_loop
RET
%endmacro
 
%if ARCH_X86_32
INIT_MMX mmx
hvar_fn
%endif
 
INIT_XMM sse2
hvar_fn
 
; macro to read/write a horizontal number of pixels (%2) to/from registers
; on sse, - fills xmm0-15 for consecutive sets of 16 pixels
; - if (%2 & 8) fills 8 bytes into xmm$next
; - if (%2 & 4) fills 4 bytes into xmm$next
; - if (%2 & 3) fills 1, 2 or 4 bytes in eax
; on mmx, - fills mm0-7 for consecutive sets of 8 pixels
; - if (%2 & 4) fills 4 bytes into mm$next
; - if (%2 & 3) fills 1, 2 or 4 bytes in eax
; writing data out is in the same way
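; Added illustration (not part of the original comments): for a fixed width of
; 22 bytes with mmsize == 16, READ_NUM_BYTES expands to roughly
;     movu xmm0, [srcq]       ; bytes 0..15
;     movq mm0,  [srcq+14]    ; bytes 14..21, overlapping tail load
; i.e. widths that are not a multiple of the vector size are covered by one
; overlapping load from the end of the row rather than a scalar tail loop.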
%macro READ_NUM_BYTES 2
%assign %%off 0 ; offset in source buffer
%assign %%mmx_idx 0 ; mmx register index
%assign %%xmm_idx 0 ; xmm register index
 
%rep %2/mmsize
%if mmsize == 16
movu xmm %+ %%xmm_idx, [srcq+%%off]
%assign %%xmm_idx %%xmm_idx+1
%else ; mmx
movu mm %+ %%mmx_idx, [srcq+%%off]
%assign %%mmx_idx %%mmx_idx+1
%endif
%assign %%off %%off+mmsize
%endrep ; %2/mmsize
 
%if mmsize == 16
%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
movu xmm %+ %%xmm_idx, [srcq+%2-16]
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
movq mm %+ %%mmx_idx, [srcq+%%off]
%assign %%mmx_idx %%mmx_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8
%endif
 
%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
movq mm %+ %%mmx_idx, [srcq+%2-8]
%assign %%off %2
%else
movd mm %+ %%mmx_idx, [srcq+%%off]
%assign %%off %%off+4
%endif
%assign %%mmx_idx %%mmx_idx+1
%endif ; (%2-%%off) >= 4
 
%if (%2-%%off) >= 1
%if %2 >= 4
movd mm %+ %%mmx_idx, [srcq+%2-4]
%elif (%2-%%off) == 1
mov valb, [srcq+%2-1]
%elif (%2-%%off) == 2
mov valw, [srcq+%2-2]
%elifidn %1, body
mov vald, [srcq+%2-3]
%else
movd mm %+ %%mmx_idx, [srcq+%2-3]
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; READ_NUM_BYTES
 
%macro WRITE_NUM_BYTES 2
%assign %%off 0 ; offset in destination buffer
%assign %%mmx_idx 0 ; mmx register index
%assign %%xmm_idx 0 ; xmm register index
 
%rep %2/mmsize
%if mmsize == 16
movu [dstq+%%off], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%else ; mmx
movu [dstq+%%off], mm %+ %%mmx_idx
%assign %%mmx_idx %%mmx_idx+1
%endif
%assign %%off %%off+mmsize
%endrep ; %2/mmsize
 
%if mmsize == 16
%if (%2-%%off) >= 8
%if %2 > 16 && (%2-%%off) > 8
movu [dstq+%2-16], xmm %+ %%xmm_idx
%assign %%xmm_idx %%xmm_idx+1
%assign %%off %2
%else
movq [dstq+%%off], mm %+ %%mmx_idx
%assign %%mmx_idx %%mmx_idx+1
%assign %%off %%off+8
%endif
%endif ; (%2-%%off) >= 8
%endif
 
%if (%2-%%off) >= 4
%if %2 > 8 && (%2-%%off) > 4
movq [dstq+%2-8], mm %+ %%mmx_idx
%assign %%off %2
%else
movd [dstq+%%off], mm %+ %%mmx_idx
%assign %%off %%off+4
%endif
%assign %%mmx_idx %%mmx_idx+1
%endif ; (%2-%%off) >= 4
 
%if (%2-%%off) >= 1
%if %2 >= 4
movd [dstq+%2-4], mm %+ %%mmx_idx
%elif (%2-%%off) == 1
mov [dstq+%2-1], valb
%elif (%2-%%off) == 2
mov [dstq+%2-2], valw
%elifidn %1, body
mov [dstq+%2-3], valw
shr vald, 16
mov [dstq+%2-1], valb
%else
movd vald, mm %+ %%mmx_idx
mov [dstq+%2-3], valw
shr vald, 16
mov [dstq+%2-1], valb
%endif
%endif ; (%2-%%off) >= 1
%endmacro ; WRITE_NUM_BYTES
 
; vertical top/bottom extend and body copy fast loops
; these are function pointers to set-width line copy functions, i.e.
; they read a fixed number of pixels into set registers, and write
; those out into the destination buffer
%macro VERTICAL_EXTEND 2
%assign %%n %1
%rep 1+%2-%1
%if %%n <= 3
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \
start_y, end_y, val, bh
mov bhq, r6mp ; r6mp = bhmp
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh
mov dstq, r0mp
mov srcq, r2mp
mov start_yq, r4mp
mov end_yq, r5mp
mov bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%else
%if ARCH_X86_64
cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \
start_y, end_y, bh
%else ; x86-32
cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh
mov srcq, r2mp
mov start_yq, r4mp
mov end_yq, r5mp
mov bhq, r6mp
%define dst_strideq r1mp
%define src_strideq r3mp
%endif ; x86-64/32
%endif
; FIXME move this to c wrapper?
sub bhq, end_yq ; bh -= end_y
sub end_yq, start_yq ; end_y -= start_y
 
; extend pixels above body
test start_yq, start_yq ; if (start_y) {
jz .body_loop
READ_NUM_BYTES top, %%n ; $variable_regs = read($n)
.top_loop: ; do {
WRITE_NUM_BYTES top, %%n ; write($variable_regs, $n)
add dstq, dst_strideq ; dst += linesize
dec start_yq ; } while (--start_y)
jnz .top_loop ; }
 
; copy body pixels
.body_loop: ; do {
READ_NUM_BYTES body, %%n ; $variable_regs = read($n)
WRITE_NUM_BYTES body, %%n ; write($variable_regs, $n)
add dstq, dst_strideq ; dst += dst_stride
add srcq, src_strideq ; src += src_stride
dec end_yq ; } while (--end_y)
jnz .body_loop
 
; copy bottom pixels
test bhq, bhq ; if (block_h) {
jz .end
sub srcq, src_strideq ; src -= linesize
READ_NUM_BYTES bottom, %%n ; $variable_regs = read($n)
.bottom_loop: ; do {
WRITE_NUM_BYTES bottom, %%n ; write($variable_regs, $n)
add dstq, dst_strideq ; dst += linesize
dec bhq ; } while (--bh)
jnz .bottom_loop ; }
 
.end:
RET
%assign %%n %%n+1
%endrep ; 1+%2-%1
%endmacro ; VERTICAL_EXTEND
 
INIT_MMX mmx
VERTICAL_EXTEND 1, 15
%if ARCH_X86_32
VERTICAL_EXTEND 16, 22
%endif
 
INIT_XMM sse
VERTICAL_EXTEND 16, 22
 
; left/right (horizontal) fast extend functions
; these are essentially identical to the vertical extend ones above,
; just left/right separated because number of pixels to extend is
; obviously not the same on both sides.
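 
; Added worked example: READ_V_PIXEL broadcasts one edge byte by multiplying
; it with 0x01010101, e.g. 0xAB * 0x01010101 = 0xABABABAB; for widths of 8 or
; more the dword is then splatted across the full register (punpckldq on mmx,
; pshufd on sse2) so the write loop can store it mmsize bytes at a time.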
 
%macro READ_V_PIXEL 2
movzx vald, byte %2
imul vald, 0x01010101
%if %1 >= 8
movd m0, vald
%if mmsize == 16
pshufd m0, m0, q0000
%else
punpckldq m0, m0
%endif ; mmsize == 16
%endif ; %1 >= 8
%endmacro ; READ_V_PIXEL
 
%macro WRITE_V_PIXEL 2
%assign %%off 0
 
%if %1 >= 8
 
%rep %1/mmsize
movu [%2+%%off], m0
%assign %%off %%off+mmsize
%endrep ; %1/mmsize
 
%if mmsize == 16
%if %1-%%off >= 8
%if %1 > 16 && %1-%%off > 8
movu [%2+%1-16], m0
%assign %%off %1
%else
movq [%2+%%off], m0
%assign %%off %%off+8
%endif
%endif ; %1-%%off >= 8
%endif ; mmsize == 16
 
%if %1-%%off >= 4
%if %1 > 8 && %1-%%off > 4
movq [%2+%1-8], m0
%assign %%off %1
%else
movd [%2+%%off], m0
%assign %%off %%off+4
%endif
%endif ; %1-%%off >= 4
 
%else ; %1 < 8
 
%rep %1/4
mov [%2+%%off], vald
%assign %%off %%off+4
%endrep ; %1/4
 
%endif ; %1 >=/< 8
 
%if %1-%%off == 2
mov [%2+%%off], valw
%endif ; %1-%%off == 2
%endmacro ; WRITE_V_PIXEL
 
%macro H_EXTEND 2
%assign %%n %1
%rep 1+(%2-%1)/2
cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val
.loop_y: ; do {
READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = read($n)
WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n)
add dstq, dst_strideq ; dst += dst_stride
dec bhq ; } while (--bh)
jnz .loop_y
RET
%assign %%n %%n+2
%endrep ; 1+(%2-%1)/2
%endmacro ; H_EXTEND
 
INIT_MMX mmx
H_EXTEND 2, 14
%if ARCH_X86_32
H_EXTEND 16, 22
%endif
 
INIT_XMM sse2
H_EXTEND 16, 22
 
%macro PREFETCH_FN 1
cglobal prefetch, 3, 3, 0, buf, stride, h
.loop:
%1 [bufq]
add bufq, strideq
dec hd
jg .loop
REP_RET
%endmacro
 
INIT_MMX mmxext
PREFETCH_FN prefetcht0
%if ARCH_X86_32
INIT_MMX 3dnow
PREFETCH_FN prefetch
%endif
/contrib/sdk/sources/ffmpeg/libavcodec/x86/videodsp_init.c
0,0 → 1,266
/*
* Copyright (C) 2002-2012 Michael Niedermayer
* Copyright (C) 2012 Ronald S. Bultje
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/avassert.h"
#include "libavutil/common.h"
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/videodsp.h"
 
#if HAVE_YASM
typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride,
const uint8_t *src, x86_reg src_stride,
x86_reg start_y, x86_reg end_y, x86_reg bh);
typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride,
const uint8_t *src, x86_reg src_stride,
x86_reg start_y, x86_reg end_y, x86_reg bh,
x86_reg w);
 
extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix16_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix17_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix18_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix19_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx;
#if ARCH_X86_32
static emu_edge_vfix_func *vfixtbl_mmx[22] = {
&ff_emu_edge_vfix1_mmx, &ff_emu_edge_vfix2_mmx, &ff_emu_edge_vfix3_mmx,
&ff_emu_edge_vfix4_mmx, &ff_emu_edge_vfix5_mmx, &ff_emu_edge_vfix6_mmx,
&ff_emu_edge_vfix7_mmx, &ff_emu_edge_vfix8_mmx, &ff_emu_edge_vfix9_mmx,
&ff_emu_edge_vfix10_mmx, &ff_emu_edge_vfix11_mmx, &ff_emu_edge_vfix12_mmx,
&ff_emu_edge_vfix13_mmx, &ff_emu_edge_vfix14_mmx, &ff_emu_edge_vfix15_mmx,
&ff_emu_edge_vfix16_mmx, &ff_emu_edge_vfix17_mmx, &ff_emu_edge_vfix18_mmx,
&ff_emu_edge_vfix19_mmx, &ff_emu_edge_vfix20_mmx, &ff_emu_edge_vfix21_mmx,
&ff_emu_edge_vfix22_mmx
};
#endif
extern emu_edge_vvar_func ff_emu_edge_vvar_mmx;
extern emu_edge_vfix_func ff_emu_edge_vfix16_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix17_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix18_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix19_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix20_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix21_sse;
extern emu_edge_vfix_func ff_emu_edge_vfix22_sse;
static emu_edge_vfix_func *vfixtbl_sse[22] = {
ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, ff_emu_edge_vfix3_mmx,
ff_emu_edge_vfix4_mmx, ff_emu_edge_vfix5_mmx, ff_emu_edge_vfix6_mmx,
ff_emu_edge_vfix7_mmx, ff_emu_edge_vfix8_mmx, ff_emu_edge_vfix9_mmx,
ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx,
ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx,
ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse,
ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse,
ff_emu_edge_vfix22_sse
};
extern emu_edge_vvar_func ff_emu_edge_vvar_sse;
 
typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride,
x86_reg start_x, x86_reg bh);
typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride,
x86_reg start_x, x86_reg n_words, x86_reg bh);
 
extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix16_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx;
#if ARCH_X86_32
static emu_edge_hfix_func *hfixtbl_mmx[11] = {
ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx,
ff_emu_edge_hfix20_mmx, ff_emu_edge_hfix22_mmx
};
#endif
extern emu_edge_hvar_func ff_emu_edge_hvar_mmx;
extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2;
extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2;
static emu_edge_hfix_func *hfixtbl_sse2[11] = {
ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx,
ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx,
ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2,
ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2
};
extern emu_edge_hvar_func ff_emu_edge_hvar_sse2;
 
static av_always_inline void emulated_edge_mc(uint8_t *dst, ptrdiff_t dst_stride,
const uint8_t *src, ptrdiff_t src_stride,
x86_reg block_w, x86_reg block_h,
x86_reg src_x, x86_reg src_y,
x86_reg w, x86_reg h,
emu_edge_vfix_func **vfix_tbl,
emu_edge_vvar_func *v_extend_var,
emu_edge_hfix_func **hfix_tbl,
emu_edge_hvar_func *h_extend_var)
{
x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p;
 
if(!w || !h)
return;
 
if (src_y >= h) {
src -= src_y*src_stride;
src_y_add = h - 1;
src_y = h - 1;
} else if (src_y <= -block_h) {
src -= src_y*src_stride;
src_y_add = 1 - block_h;
src_y = 1 - block_h;
}
if (src_x >= w) {
src += w - 1 - src_x;
src_x = w - 1;
} else if (src_x <= -block_w) {
src += 1 - block_w - src_x;
src_x = 1 - block_w;
}
 
start_y = FFMAX(0, -src_y);
start_x = FFMAX(0, -src_x);
end_y = FFMIN(block_h, h-src_y);
end_x = FFMIN(block_w, w-src_x);
av_assert2(start_x < end_x && block_w > 0);
av_assert2(start_y < end_y && block_h > 0);
 
// fill in the to-be-copied part plus all above/below
src += (src_y_add + start_y) * src_stride + start_x;
w = end_x - start_x;
if (w <= 22) {
vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride,
start_y, end_y, block_h);
} else {
v_extend_var(dst + start_x, dst_stride, src, src_stride,
start_y, end_y, block_h, w);
}
 
// fill left
if (start_x) {
if (start_x <= 22) {
hfix_tbl[(start_x - 1) >> 1](dst, dst_stride, start_x, block_h);
} else {
h_extend_var(dst, dst_stride,
start_x, (start_x + 1) >> 1, block_h);
}
}
 
// fill right
p = block_w - end_x;
if (p) {
if (p <= 22) {
hfix_tbl[(p - 1) >> 1](dst + end_x - (p & 1), dst_stride,
-!(p & 1), block_h);
} else {
h_extend_var(dst + end_x - (p & 1), dst_stride,
-!(p & 1), (p + 1) >> 1, block_h);
}
}
}
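 
/* Illustrative usage sketch (added note; the buffer and size names are
 * hypothetical, not from this file): a decoder whose motion vector points
 * partly outside the picture would typically do something like
 *
 *     ctx->emulated_edge_mc(edge_buf, linesize,
 *                           ref + src_y * linesize + src_x, linesize,
 *                           17, 17, src_x, src_y, pic_width, pic_height);
 *
 * and then run motion compensation from edge_buf instead of the reference
 * frame, with the out-of-frame samples replicated from the nearest edge. */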
 
#if ARCH_X86_32
static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, ptrdiff_t buf_stride,
const uint8_t *src, ptrdiff_t src_stride,
int block_w, int block_h,
int src_x, int src_y, int w, int h)
{
emulated_edge_mc(buf, buf_stride, src, src_stride, block_w, block_h,
src_x, src_y, w, h, vfixtbl_mmx, &ff_emu_edge_vvar_mmx,
hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
}
 
static av_noinline void emulated_edge_mc_sse(uint8_t *buf, ptrdiff_t buf_stride,
const uint8_t *src, ptrdiff_t src_stride,
int block_w, int block_h,
int src_x, int src_y, int w, int h)
{
emulated_edge_mc(buf, buf_stride, src, src_stride, block_w, block_h,
src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
hfixtbl_mmx, &ff_emu_edge_hvar_mmx);
}
#endif
 
static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, ptrdiff_t buf_stride,
const uint8_t *src, ptrdiff_t src_stride,
int block_w, int block_h,
int src_x, int src_y, int w, int h)
{
emulated_edge_mc(buf, buf_stride, src, src_stride, block_w, block_h, src_x,
src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse,
hfixtbl_sse2, &ff_emu_edge_hvar_sse2);
}
#endif /* HAVE_YASM */
 
void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h);
void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h);
 
av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc)
{
#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
 
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags) && bpc <= 8) {
ctx->emulated_edge_mc = emulated_edge_mc_mmx;
}
if (EXTERNAL_AMD3DNOW(cpu_flags)) {
ctx->prefetch = ff_prefetch_3dnow;
}
#endif /* ARCH_X86_32 */
if (EXTERNAL_MMXEXT(cpu_flags)) {
ctx->prefetch = ff_prefetch_mmxext;
}
#if ARCH_X86_32
if (EXTERNAL_SSE(cpu_flags) && bpc <= 8) {
ctx->emulated_edge_mc = emulated_edge_mc_sse;
}
#endif /* ARCH_X86_32 */
if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) {
ctx->emulated_edge_mc = emulated_edge_mc_sse2;
}
#endif /* HAVE_YASM */
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vorbisdsp.asm
0,0 → 1,83
;******************************************************************************
;* Vorbis x86 optimizations
;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
pdw_80000000: times 4 dd 0x80000000
 
SECTION .text
 
%if ARCH_X86_32
INIT_MMX 3dnow
cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size
pxor m7, m7
lea magq, [magq+block_sizeq*4]
lea angq, [angq+block_sizeq*4]
neg block_sizeq
.loop:
mova m0, [magq+block_sizeq*4]
mova m1, [angq+block_sizeq*4]
mova m2, m0
mova m3, m1
pfcmpge m2, m7 ; m <= 0.0
pfcmpge m3, m7 ; a <= 0.0
pslld m2, 31 ; keep only the sign bit
pxor m1, m2
mova m4, m3
pand m3, m1
pandn m4, m1
pfadd m3, m0 ; a = m + ((a < 0) & (a ^ sign(m)))
pfsub m0, m4 ; m = m + ((a > 0) & (a ^ sign(m)))
mova [angq+block_sizeq*4], m3
mova [magq+block_sizeq*4], m0
add block_sizeq, 2
jl .loop
femms
RET
%endif
 
INIT_XMM sse
cglobal vorbis_inverse_coupling, 3, 4, 6, mag, ang, block_size, cntr
mova m5, [pdw_80000000]
xor cntrq, cntrq
align 16
.loop:
mova m0, [magq+cntrq*4]
mova m1, [angq+cntrq*4]
xorps m2, m2
xorps m3, m3
cmpleps m2, m0 ; m <= 0.0
cmpleps m3, m1 ; a <= 0.0
andps m2, m5 ; keep only the sign bit
xorps m1, m2
mova m4, m3
andps m3, m1
andnps m4, m1
addps m3, m0 ; a = m + ((a < 0) & (a ^ sign(m)))
subps m0, m4 ; m = m + ((a > 0) & (a ^ sign(m)))
mova [angq+cntrq*4], m3
mova [magq+cntrq*4], m0
add cntrq, 4
cmp cntrq, block_sizeq
jl .loop
RET
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vorbisdsp_init.c
0,0 → 1,44
/*
* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "config.h"
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vorbisdsp.h"
 
void ff_vorbis_inverse_coupling_3dnow(float *mag, float *ang,
intptr_t blocksize);
void ff_vorbis_inverse_coupling_sse(float *mag, float *ang,
intptr_t blocksize);
 
av_cold void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp)
{
#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
 
#if ARCH_X86_32
if (EXTERNAL_AMD3DNOW(cpu_flags))
dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_3dnow;
#endif /* ARCH_X86_32 */
if (EXTERNAL_SSE(cpu_flags))
dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_sse;
#endif /* HAVE_YASM */
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vp3dsp.asm
0,0 → 1,709
;******************************************************************************
;* MMX/SSE2-optimized functions for the VP3 decoder
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
; MMX-optimized functions cribbed from the original VP3 source code.
 
SECTION_RODATA
 
vp3_idct_data: times 8 dw 64277
times 8 dw 60547
times 8 dw 54491
times 8 dw 46341
times 8 dw 36410
times 8 dw 25080
times 8 dw 12785
 
pb_7: times 8 db 0x07
pb_1F: times 8 db 0x1f
pb_81: times 8 db 0x81
 
cextern pb_1
cextern pb_3
cextern pb_80
 
cextern pw_8
 
SECTION .text
 
; this is off by one or two for some cases when filter_limit is greater than 63
; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1
; out: p1 in mm4, p2 in mm3
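; added note: my reading of the code below (hedged, not from the original
; comments) is that it approximates the reference filter
;     d = bounding_values[(p0 - p3 + 3*(p2 - p1) + 4) >> 3]
;     p1 += d, p2 -= d
; using byte averages (pavgb), which is where the off-by-one-or-two behaviour
; for large filter limits mentioned above comes from.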
%macro VP3_LOOP_FILTER 0
movq m7, m6
pand m6, [pb_7] ; p0&7
psrlw m7, 3
pand m7, [pb_1F] ; p0>>3
movq m3, m2 ; p2
pxor m2, m4
pand m2, [pb_1] ; (p2^p1)&1
movq m5, m2
paddb m2, m2
paddb m2, m5 ; 3*(p2^p1)&1
paddb m2, m6 ; extra bits lost in shifts
pcmpeqb m0, m0
pxor m1, m0 ; 255 - p3
pavgb m1, m2 ; (256 - p3 + extrabits) >> 1
pxor m0, m4 ; 255 - p1
pavgb m0, m3 ; (256 + p2-p1) >> 1
paddb m1, [pb_3]
pavgb m1, m0 ; 128+2+( p2-p1 - p3) >> 2
pavgb m1, m0 ; 128+1+(3*(p2-p1) - p3) >> 3
paddusb m7, m1 ; d+128+1
movq m6, [pb_81]
psubusb m6, m7
psubusb m7, [pb_81]
 
movq m5, [r2+516] ; flim
pminub m6, m5
pminub m7, m5
movq m0, m6
movq m1, m7
paddb m6, m6
paddb m7, m7
pminub m6, m5
pminub m7, m5
psubb m6, m0
psubb m7, m1
paddusb m4, m7
psubusb m4, m6
psubusb m3, m7
paddusb m3, m6
%endmacro
 
%macro STORE_4_WORDS 1
movd r2d, %1
mov [r0 -1], r2w
psrlq %1, 32
shr r2, 16
mov [r0+r1 -1], r2w
movd r2d, %1
mov [r0+r1*2-1], r2w
shr r2, 16
mov [r0+r3 -1], r2w
%endmacro
 
INIT_MMX mmxext
cglobal vp3_v_loop_filter, 3, 4
%if ARCH_X86_64
movsxd r1, r1d
%endif
mov r3, r1
neg r1
movq m6, [r0+r1*2]
movq m4, [r0+r1 ]
movq m2, [r0 ]
movq m1, [r0+r3 ]
 
VP3_LOOP_FILTER
 
movq [r0+r1], m4
movq [r0 ], m3
RET
 
cglobal vp3_h_loop_filter, 3, 4
%if ARCH_X86_64
movsxd r1, r1d
%endif
lea r3, [r1*3]
 
movd m6, [r0 -2]
movd m4, [r0+r1 -2]
movd m2, [r0+r1*2-2]
movd m1, [r0+r3 -2]
lea r0, [r0+r1*4 ]
punpcklbw m6, [r0 -2]
punpcklbw m4, [r0+r1 -2]
punpcklbw m2, [r0+r1*2-2]
punpcklbw m1, [r0+r3 -2]
sub r0, r3
sub r0, r1
 
TRANSPOSE4x4B 6, 4, 2, 1, 0
VP3_LOOP_FILTER
SBUTTERFLY bw, 4, 3, 5
 
STORE_4_WORDS m4
lea r0, [r0+r1*4 ]
STORE_4_WORDS m3
RET
 
; from original comments: The Macro does IDct on 4 1-D Dcts
%macro BeginIDCT 0
movq m2, I(3)
movq m6, C(3)
movq m4, m2
movq m7, J(5)
pmulhw m4, m6 ; r4 = c3*i3 - i3
movq m1, C(5)
pmulhw m6, m7 ; r6 = c3*i5 - i5
movq m5, m1
pmulhw m1, m2 ; r1 = c5*i3 - i3
movq m3, I(1)
pmulhw m5, m7 ; r5 = c5*i5 - i5
movq m0, C(1)
paddw m4, m2 ; r4 = c3*i3
paddw m6, m7 ; r6 = c3*i5
paddw m2, m1 ; r2 = c5*i3
movq m1, J(7)
paddw m7, m5 ; r7 = c5*i5
movq m5, m0 ; r5 = c1
pmulhw m0, m3 ; r0 = c1*i1 - i1
paddsw m4, m7 ; r4 = C = c3*i3 + c5*i5
pmulhw m5, m1 ; r5 = c1*i7 - i7
movq m7, C(7)
psubsw m6, m2 ; r6 = D = c3*i5 - c5*i3
paddw m0, m3 ; r0 = c1*i1
pmulhw m3, m7 ; r3 = c7*i1
movq m2, I(2)
pmulhw m7, m1 ; r7 = c7*i7
paddw m5, m1 ; r5 = c1*i7
movq m1, m2 ; r1 = i2
pmulhw m2, C(2) ; r2 = c2*i2 - i2
psubsw m3, m5 ; r3 = B = c7*i1 - c1*i7
movq m5, J(6)
paddsw m0, m7 ; r0 = A = c1*i1 + c7*i7
movq m7, m5 ; r7 = i6
psubsw m0, m4 ; r0 = A - C
pmulhw m5, C(2) ; r5 = c2*i6 - i6
paddw m2, m1 ; r2 = c2*i2
pmulhw m1, C(6) ; r1 = c6*i2
paddsw m4, m4 ; r4 = C + C
paddsw m4, m0 ; r4 = C. = A + C
psubsw m3, m6 ; r3 = B - D
paddw m5, m7 ; r5 = c2*i6
paddsw m6, m6 ; r6 = D + D
pmulhw m7, C(6) ; r7 = c6*i6
paddsw m6, m3 ; r6 = D. = B + D
movq I(1), m4 ; save C. at I(1)
psubsw m1, m5 ; r1 = H = c6*i2 - c2*i6
movq m4, C(4)
movq m5, m3 ; r5 = B - D
pmulhw m3, m4 ; r3 = (c4 - 1) * (B - D)
paddsw m7, m2 ; r7 = G = c2*i2 + c6*i6
movq I(2), m6 ; save D. at I(2)
movq m2, m0 ; r2 = A - C
movq m6, I(0)
pmulhw m0, m4 ; r0 = (c4 - 1) * (A - C)
paddw m5, m3 ; r5 = B. = c4 * (B - D)
movq m3, J(4)
psubsw m5, m1 ; r5 = B.. = B. - H
paddw m2, m0 ; r0 = A. = c4 * (A - C)
psubsw m6, m3 ; r6 = i0 - i4
movq m0, m6
pmulhw m6, m4 ; r6 = (c4 - 1) * (i0 - i4)
paddsw m3, m3 ; r3 = i4 + i4
paddsw m1, m1 ; r1 = H + H
paddsw m3, m0 ; r3 = i0 + i4
paddsw m1, m5 ; r1 = H. = B + H
pmulhw m4, m3 ; r4 = (c4 - 1) * (i0 + i4)
paddsw m6, m0 ; r6 = F = c4 * (i0 - i4)
psubsw m6, m2 ; r6 = F. = F - A.
paddsw m2, m2 ; r2 = A. + A.
movq m0, I(1) ; r0 = C.
paddsw m2, m6 ; r2 = A.. = F + A.
paddw m4, m3 ; r4 = E = c4 * (i0 + i4)
psubsw m2, m1 ; r2 = R2 = A.. - H.
%endmacro
 
; RowIDCT gets ready to transpose
%macro RowIDCT 0
BeginIDCT
movq m3, I(2) ; r3 = D.
psubsw m4, m7 ; r4 = E. = E - G
paddsw m1, m1 ; r1 = H. + H.
paddsw m7, m7 ; r7 = G + G
paddsw m1, m2 ; r1 = R1 = A.. + H.
paddsw m7, m4 ; r7 = G. = E + G
psubsw m4, m3 ; r4 = R4 = E. - D.
paddsw m3, m3
psubsw m6, m5 ; r6 = R6 = F. - B..
paddsw m5, m5
paddsw m3, m4 ; r3 = R3 = E. + D.
paddsw m5, m6 ; r5 = R5 = F. + B..
psubsw m7, m0 ; r7 = R7 = G. - C.
paddsw m0, m0
movq I(1), m1 ; save R1
paddsw m0, m7 ; r0 = R0 = G. + C.
%endmacro
 
; Column IDCT normalizes and stores final results
%macro ColumnIDCT 0
BeginIDCT
paddsw m2, OC_8 ; adjust R2 (and R1) for shift
paddsw m1, m1 ; r1 = H. + H.
paddsw m1, m2 ; r1 = R1 = A.. + H.
psraw m2, 4 ; r2 = NR2
psubsw m4, m7 ; r4 = E. = E - G
psraw m1, 4 ; r1 = NR1
movq m3, I(2) ; r3 = D.
paddsw m7, m7 ; r7 = G + G
movq I(2), m2 ; store NR2 at I2
paddsw m7, m4 ; r7 = G. = E + G
movq I(1), m1 ; store NR1 at I1
psubsw m4, m3 ; r4 = R4 = E. - D.
paddsw m4, OC_8 ; adjust R4 (and R3) for shift
paddsw m3, m3 ; r3 = D. + D.
paddsw m3, m4 ; r3 = R3 = E. + D.
psraw m4, 4 ; r4 = NR4
psubsw m6, m5 ; r6 = R6 = F. - B..
psraw m3, 4 ; r3 = NR3
paddsw m6, OC_8 ; adjust R6 (and R5) for shift
paddsw m5, m5 ; r5 = B.. + B..
paddsw m5, m6 ; r5 = R5 = F. + B..
psraw m6, 4 ; r6 = NR6
movq J(4), m4 ; store NR4 at J4
psraw m5, 4 ; r5 = NR5
movq I(3), m3 ; store NR3 at I3
psubsw m7, m0 ; r7 = R7 = G. - C.
paddsw m7, OC_8 ; adjust R7 (and R0) for shift
paddsw m0, m0 ; r0 = C. + C.
paddsw m0, m7 ; r0 = R0 = G. + C.
psraw m7, 4 ; r7 = NR7
movq J(6), m6 ; store NR6 at J6
psraw m0, 4 ; r0 = NR0
movq J(5), m5 ; store NR5 at J5
movq J(7), m7 ; store NR7 at J7
movq I(0), m0 ; store NR0 at I0
%endmacro
 
; Following macro does two 4x4 transposes in place.
;
; At entry (we assume):
;
; r0 = a3 a2 a1 a0
; I(1) = b3 b2 b1 b0
; r2 = c3 c2 c1 c0
; r3 = d3 d2 d1 d0
;
; r4 = e3 e2 e1 e0
; r5 = f3 f2 f1 f0
; r6 = g3 g2 g1 g0
; r7 = h3 h2 h1 h0
;
; At exit, we have:
;
; I(0) = d0 c0 b0 a0
; I(1) = d1 c1 b1 a1
; I(2) = d2 c2 b2 a2
; I(3) = d3 c3 b3 a3
;
; J(4) = h0 g0 f0 e0
; J(5) = h1 g1 f1 e1
; J(6) = h2 g2 f2 e2
; J(7) = h3 g3 f3 e3
;
; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3.
; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7.
;
; Since r1 is free at entry, we calculate the Js first.
%macro Transpose 0
movq m1, m4 ; r1 = e3 e2 e1 e0
punpcklwd m4, m5 ; r4 = f1 e1 f0 e0
movq I(0), m0 ; save a3 a2 a1 a0
punpckhwd m1, m5 ; r1 = f3 e3 f2 e2
movq m0, m6 ; r0 = g3 g2 g1 g0
punpcklwd m6, m7 ; r6 = h1 g1 h0 g0
movq m5, m4 ; r5 = f1 e1 f0 e0
punpckldq m4, m6 ; r4 = h0 g0 f0 e0 = R4
punpckhdq m5, m6 ; r5 = h1 g1 f1 e1 = R5
movq m6, m1 ; r6 = f3 e3 f2 e2
movq J(4), m4
punpckhwd m0, m7 ; r0 = h3 g3 h2 g2
movq J(5), m5
punpckhdq m6, m0 ; r6 = h3 g3 f3 e3 = R7
movq m4, I(0) ; r4 = a3 a2 a1 a0
punpckldq m1, m0 ; r1 = h2 g2 f2 e2 = R6
movq m5, I(1) ; r5 = b3 b2 b1 b0
movq m0, m4 ; r0 = a3 a2 a1 a0
movq J(7), m6
punpcklwd m0, m5 ; r0 = b1 a1 b0 a0
movq J(6), m1
punpckhwd m4, m5 ; r4 = b3 a3 b2 a2
movq m5, m2 ; r5 = c3 c2 c1 c0
punpcklwd m2, m3 ; r2 = d1 c1 d0 c0
movq m1, m0 ; r1 = b1 a1 b0 a0
punpckldq m0, m2 ; r0 = d0 c0 b0 a0 = R0
punpckhdq m1, m2 ; r1 = d1 c1 b1 a1 = R1
movq m2, m4 ; r2 = b3 a3 b2 a2
movq I(0), m0
punpckhwd m5, m3 ; r5 = d3 c3 d2 c2
movq I(1), m1
punpckhdq m4, m5 ; r4 = d3 c3 b3 a3 = R3
punpckldq m2, m5 ; r2 = d2 c2 b2 a2 = R2
movq I(3), m4
movq I(2), m2
%endmacro
 
%macro VP3_1D_IDCT_SSE2 0
movdqa m2, I(3) ; xmm2 = i3
movdqa m6, C(3) ; xmm6 = c3
movdqa m4, m2 ; xmm4 = i3
movdqa m7, I(5) ; xmm7 = i5
pmulhw m4, m6 ; xmm4 = c3 * i3 - i3
movdqa m1, C(5) ; xmm1 = c5
pmulhw m6, m7 ; xmm6 = c3 * i5 - i5
movdqa m5, m1 ; xmm5 = c5
pmulhw m1, m2 ; xmm1 = c5 * i3 - i3
movdqa m3, I(1) ; xmm3 = i1
pmulhw m5, m7 ; xmm5 = c5 * i5 - i5
movdqa m0, C(1) ; xmm0 = c1
paddw m4, m2 ; xmm4 = c3 * i3
paddw m6, m7 ; xmm6 = c3 * i5
paddw m2, m1 ; xmm2 = c5 * i3
movdqa m1, I(7) ; xmm1 = i7
paddw m7, m5 ; xmm7 = c5 * i5
movdqa m5, m0 ; xmm5 = c1
pmulhw m0, m3 ; xmm0 = c1 * i1 - i1
paddsw m4, m7 ; xmm4 = c3 * i3 + c5 * i5 = C
pmulhw m5, m1 ; xmm5 = c1 * i7 - i7
movdqa m7, C(7) ; xmm7 = c7
psubsw m6, m2 ; xmm6 = c3 * i5 - c5 * i3 = D
paddw m0, m3 ; xmm0 = c1 * i1
pmulhw m3, m7 ; xmm3 = c7 * i1
movdqa m2, I(2) ; xmm2 = i2
pmulhw m7, m1 ; xmm7 = c7 * i7
paddw m5, m1 ; xmm5 = c1 * i7
movdqa m1, m2 ; xmm1 = i2
pmulhw m2, C(2) ; xmm2 = i2 * c2 -i2
psubsw m3, m5 ; xmm3 = c7 * i1 - c1 * i7 = B
movdqa m5, I(6) ; xmm5 = i6
paddsw m0, m7 ; xmm0 = c1 * i1 + c7 * i7 = A
movdqa m7, m5 ; xmm7 = i6
psubsw m0, m4 ; xmm0 = A - C
pmulhw m5, C(2) ; xmm5 = c2 * i6 - i6
paddw m2, m1 ; xmm2 = i2 * c2
pmulhw m1, C(6) ; xmm1 = c6 * i2
paddsw m4, m4 ; xmm4 = C + C
paddsw m4, m0 ; xmm4 = A + C = C.
psubsw m3, m6 ; xmm3 = B - D
paddw m5, m7 ; xmm5 = c2 * i6
paddsw m6, m6 ; xmm6 = D + D
pmulhw m7, C(6) ; xmm7 = c6 * i6
paddsw m6, m3 ; xmm6 = B + D = D.
movdqa I(1), m4 ; Save C. at I(1)
psubsw m1, m5 ; xmm1 = c6 * i2 - c2 * i6 = H
movdqa m4, C(4) ; xmm4 = C4
movdqa m5, m3 ; xmm5 = B - D
pmulhw m3, m4 ; xmm3 = ( c4 -1 ) * ( B - D )
paddsw m7, m2 ; xmm7 = c2 * i2 + c6 * i6 = G
movdqa I(2), m6 ; save D. at I(2)
movdqa m2, m0 ; xmm2 = A - C
movdqa m6, I(0) ; xmm6 = i0
pmulhw m0, m4 ; xmm0 = ( c4 - 1 ) * ( A - C ) = A.
paddw m5, m3 ; xmm5 = c4 * ( B - D ) = B.
movdqa m3, I(4) ; xmm3 = i4
psubsw m5, m1 ; xmm5 = B. - H = B..
paddw m2, m0 ; xmm2 = c4 * ( A - C) = A.
psubsw m6, m3 ; xmm6 = i0 - i4
movdqa m0, m6 ; xmm0 = i0 - i4
pmulhw m6, m4 ; xmm6 = (c4 - 1) * (i0 - i4) = F
paddsw m3, m3 ; xmm3 = i4 + i4
paddsw m1, m1 ; xmm1 = H + H
paddsw m3, m0 ; xmm3 = i0 + i4
paddsw m1, m5 ; xmm1 = B. + H = H.
pmulhw m4, m3 ; xmm4 = ( c4 - 1 ) * ( i0 + i4 )
paddw m6, m0 ; xmm6 = c4 * ( i0 - i4 )
psubsw m6, m2 ; xmm6 = F - A. = F.
paddsw m2, m2 ; xmm2 = A. + A.
movdqa m0, I(1) ; Load C. from I(1)
paddsw m2, m6 ; xmm2 = F + A. = A..
paddw m4, m3 ; xmm4 = c4 * ( i0 + i4 ) = E
psubsw m2, m1 ; xmm2 = A.. - H. = R2
ADD(m2) ; Adjust R2 and R1 before shifting
paddsw m1, m1 ; xmm1 = H. + H.
paddsw m1, m2 ; xmm1 = A.. + H. = R1
SHIFT(m2) ; xmm2 = op2
psubsw m4, m7 ; xmm4 = E - G = E.
SHIFT(m1) ; xmm1 = op1
movdqa m3, I(2) ; Load D. from I(2)
paddsw m7, m7 ; xmm7 = G + G
paddsw m7, m4 ; xmm7 = E + G = G.
psubsw m4, m3 ; xmm4 = E. - D. = R4
ADD(m4) ; Adjust R4 and R3 before shifting
paddsw m3, m3 ; xmm3 = D. + D.
paddsw m3, m4 ; xmm3 = E. + D. = R3
SHIFT(m4) ; xmm4 = op4
psubsw m6, m5 ; xmm6 = F. - B..= R6
SHIFT(m3) ; xmm3 = op3
ADD(m6) ; Adjust R6 and R5 before shifting
paddsw m5, m5 ; xmm5 = B.. + B..
paddsw m5, m6 ; xmm5 = F. + B.. = R5
SHIFT(m6) ; xmm6 = op6
SHIFT(m5) ; xmm5 = op5
psubsw m7, m0 ; xmm7 = G. - C. = R7
ADD(m7) ; Adjust R7 and R0 before shifting
paddsw m0, m0 ; xmm0 = C. + C.
paddsw m0, m7 ; xmm0 = G. + C.
SHIFT(m7) ; xmm7 = op7
SHIFT(m0) ; xmm0 = op0
%endmacro
 
%macro PUT_BLOCK 8
movdqa O(0), m%1
movdqa O(1), m%2
movdqa O(2), m%3
movdqa O(3), m%4
movdqa O(4), m%5
movdqa O(5), m%6
movdqa O(6), m%7
movdqa O(7), m%8
%endmacro
 
%macro VP3_IDCT 1
%if mmsize == 16
%define I(x) [%1+16*x]
%define O(x) [%1+16*x]
%define C(x) [vp3_idct_data+16*(x-1)]
%define SHIFT(x)
%define ADD(x)
VP3_1D_IDCT_SSE2
%if ARCH_X86_64
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16]
%endif
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
 
%define SHIFT(x) psraw x, 4
%define ADD(x) paddsw x, [pw_8]
VP3_1D_IDCT_SSE2
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7
%else ; mmsize == 8
; eax = quantized input
; ebx = dequantizer matrix
; ecx = IDCT constants
; M(I) = ecx + MaskOffset(0) + I * 8
; C(I) = ecx + CosineOffset(32) + (I-1) * 8
; edx = output
; r0..r7 = mm0..mm7
%define OC_8 [pw_8]
%define C(x) [vp3_idct_data+16*(x-1)]
 
; at this point, function has completed dequantization + dezigzag +
; partial transposition; now do the idct itself
%define I(x) [%1+16*x]
%define J(x) [%1+16*x]
RowIDCT
Transpose
 
%define I(x) [%1+16*x+8]
%define J(x) [%1+16*x+8]
RowIDCT
Transpose
 
%define I(x) [%1+16* x]
%define J(x) [%1+16*(x-4)+8]
ColumnIDCT
 
%define I(x) [%1+16* x +64]
%define J(x) [%1+16*(x-4)+72]
ColumnIDCT
%endif ; mmsize == 16/8
%endmacro
 
%macro vp3_idct_funcs 0
cglobal vp3_idct_put, 3, 4, 9
VP3_IDCT r2
 
movsxdifnidn r1, r1d
mova m4, [pb_80]
lea r3, [r1*3]
%assign %%i 0
%rep 16/mmsize
mova m0, [r2+mmsize*0+%%i]
mova m1, [r2+mmsize*2+%%i]
mova m2, [r2+mmsize*4+%%i]
mova m3, [r2+mmsize*6+%%i]
%if mmsize == 8
packsswb m0, [r2+mmsize*8+%%i]
packsswb m1, [r2+mmsize*10+%%i]
packsswb m2, [r2+mmsize*12+%%i]
packsswb m3, [r2+mmsize*14+%%i]
%else
packsswb m0, [r2+mmsize*1+%%i]
packsswb m1, [r2+mmsize*3+%%i]
packsswb m2, [r2+mmsize*5+%%i]
packsswb m3, [r2+mmsize*7+%%i]
%endif
paddb m0, m4
paddb m1, m4
paddb m2, m4
paddb m3, m4
movq [r0 ], m0
%if mmsize == 8
movq [r0+r1 ], m1
movq [r0+r1*2], m2
movq [r0+r3 ], m3
%else
movhps [r0+r1 ], m0
movq [r0+r1*2], m1
movhps [r0+r3 ], m1
%endif
%if %%i == 0
lea r0, [r0+r1*4]
%endif
%if mmsize == 16
movq [r0 ], m2
movhps [r0+r1 ], m2
movq [r0+r1*2], m3
movhps [r0+r3 ], m3
%endif
%assign %%i %%i+8
%endrep
 
pxor m0, m0
%assign %%offset 0
%rep 128/mmsize
mova [r2+%%offset], m0
%assign %%offset %%offset+mmsize
%endrep
RET
 
cglobal vp3_idct_add, 3, 4, 9
VP3_IDCT r2
 
movsxdifnidn r1, r1d
lea r3, [r1*3]
pxor m4, m4
%if mmsize == 16
%assign %%i 0
%rep 2
movq m0, [r0]
movq m1, [r0+r1]
movq m2, [r0+r1*2]
movq m3, [r0+r3]
punpcklbw m0, m4
punpcklbw m1, m4
punpcklbw m2, m4
punpcklbw m3, m4
paddsw m0, [r2+ 0+%%i]
paddsw m1, [r2+16+%%i]
paddsw m2, [r2+32+%%i]
paddsw m3, [r2+48+%%i]
packuswb m0, m1
packuswb m2, m3
movq [r0 ], m0
movhps [r0+r1 ], m0
movq [r0+r1*2], m2
movhps [r0+r3 ], m2
%if %%i == 0
lea r0, [r0+r1*4]
%endif
%assign %%i %%i+64
%endrep
%else
%assign %%i 0
%rep 2
movq m0, [r0]
movq m1, [r0+r1]
movq m2, [r0+r1*2]
movq m3, [r0+r3]
movq m5, m0
movq m6, m1
movq m7, m2
punpcklbw m0, m4
punpcklbw m1, m4
punpcklbw m2, m4
punpckhbw m5, m4
punpckhbw m6, m4
punpckhbw m7, m4
paddsw m0, [r2+ 0+%%i]
paddsw m1, [r2+16+%%i]
paddsw m2, [r2+32+%%i]
paddsw m5, [r2+64+%%i]
paddsw m6, [r2+80+%%i]
paddsw m7, [r2+96+%%i]
packuswb m0, m5
movq m5, m3
punpcklbw m3, m4
punpckhbw m5, m4
packuswb m1, m6
paddsw m3, [r2+48+%%i]
paddsw m5, [r2+112+%%i]
packuswb m2, m7
packuswb m3, m5
movq [r0 ], m0
movq [r0+r1 ], m1
movq [r0+r1*2], m2
movq [r0+r3 ], m3
%if %%i == 0
lea r0, [r0+r1*4]
%endif
%assign %%i %%i+8
%endrep
%endif
%assign %%i 0
%rep 128/mmsize
mova [r2+%%i], m4
%assign %%i %%i+mmsize
%endrep
RET
%endmacro
 
%if ARCH_X86_32
INIT_MMX mmx
vp3_idct_funcs
%endif
 
INIT_XMM sse2
vp3_idct_funcs
 
%macro DC_ADD 0
movq m2, [r0 ]
movq m3, [r0+r1 ]
paddusb m2, m0
movq m4, [r0+r1*2]
paddusb m3, m0
movq m5, [r0+r2 ]
paddusb m4, m0
paddusb m5, m0
psubusb m2, m1
psubusb m3, m1
movq [r0 ], m2
psubusb m4, m1
movq [r0+r1 ], m3
psubusb m5, m1
movq [r0+r1*2], m4
movq [r0+r2 ], m5
%endmacro
 
INIT_MMX mmxext
cglobal vp3_idct_dc_add, 3, 4
%if ARCH_X86_64
movsxd r1, r1d
%endif
movsx r3, word [r2]
mov word [r2], 0
lea r2, [r1*3]
add r3, 15
sar r3, 5
movd m0, r3d
pshufw m0, m0, 0x0
pxor m1, m1
psubw m1, m0
packuswb m0, m0
packuswb m1, m1
DC_ADD
lea r0, [r0+r1*4]
DC_ADD
RET
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vp3dsp_init.c
0,0 → 1,128
/*
* Copyright (c) 2009 David Conrad <lessen42@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include <stdint.h>
 
#include "libavutil/attributes.h"
#include "libavutil/cpu.h"
#include "libavutil/x86/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/avcodec.h"
#include "libavcodec/dsputil.h"
#include "libavcodec/vp3dsp.h"
#include "config.h"
 
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block);
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block);
 
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, int16_t *block);
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, int16_t *block);
 
void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, int line_size,
int16_t *block);
 
void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride,
int *bounding_values);
void ff_vp3_h_loop_filter_mmxext(uint8_t *src, int stride,
int *bounding_values);
 
#if HAVE_MMX_INLINE
 
#define MOVQ_BFE(regd) \
__asm__ volatile ( \
"pcmpeqd %%"#regd", %%"#regd" \n\t" \
"paddb %%"#regd", %%"#regd" \n\t" ::)
 
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \
"movq "#rega", "#regr" \n\t" \
"movq "#regc", "#regp" \n\t" \
"pand "#regb", "#regr" \n\t" \
"pand "#regd", "#regp" \n\t" \
"pxor "#rega", "#regb" \n\t" \
"pxor "#regc", "#regd" \n\t" \
"pand %%mm6, "#regb" \n\t" \
"pand %%mm6, "#regd" \n\t" \
"psrlq $1, "#regb" \n\t" \
"psrlq $1, "#regd" \n\t" \
"paddb "#regb", "#regr" \n\t" \
"paddb "#regd", "#regp" \n\t"
 
static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, ptrdiff_t stride, int h)
{
// START_TIMER
MOVQ_BFE(mm6);
__asm__ volatile(
"1: \n\t"
"movq (%1), %%mm0 \n\t"
"movq (%2), %%mm1 \n\t"
"movq (%1,%4), %%mm2 \n\t"
"movq (%2,%4), %%mm3 \n\t"
PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%3) \n\t"
"movq %%mm5, (%3,%4) \n\t"
 
"movq (%1,%4,2), %%mm0 \n\t"
"movq (%2,%4,2), %%mm1 \n\t"
"movq (%1,%5), %%mm2 \n\t"
"movq (%2,%5), %%mm3 \n\t"
"lea (%1,%4,4), %1 \n\t"
"lea (%2,%4,4), %2 \n\t"
PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5)
"movq %%mm4, (%3,%4,2) \n\t"
"movq %%mm5, (%3,%5) \n\t"
"lea (%3,%4,4), %3 \n\t"
"subl $4, %0 \n\t"
"jnz 1b \n\t"
:"+r"(h), "+r"(a), "+r"(b), "+r"(dst)
:"r"((x86_reg)stride), "r"((x86_reg)3L*stride)
:"memory");
// STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx")
}
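 
/* Added identity note: PAVGBP_MMX_NO_RND above computes, per byte,
 * (a & b) + (((a ^ b) & 0xFE) >> 1), which equals the truncating average
 * (a + b) >> 1. MOVQ_BFE leaves 0xFE in every byte of mm6, so clearing the
 * low bit before the 64-bit psrlq keeps bits from leaking across byte lanes. */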
#endif /* HAVE_MMX_INLINE */
 
av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags)
{
int cpu_flags = av_get_cpu_flags();
 
#if HAVE_MMX_INLINE
c->put_no_rnd_pixels_l2 = put_vp_no_rnd_pixels8_l2_mmx;
#endif /* HAVE_MMX_INLINE */
 
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
c->idct_put = ff_vp3_idct_put_mmx;
c->idct_add = ff_vp3_idct_add_mmx;
}
#endif
 
if (EXTERNAL_MMXEXT(cpu_flags)) {
c->idct_dc_add = ff_vp3_idct_dc_add_mmxext;
 
if (!(flags & CODEC_FLAG_BITEXACT)) {
c->v_loop_filter = ff_vp3_v_loop_filter_mmxext;
c->h_loop_filter = ff_vp3_h_loop_filter_mmxext;
}
}
 
if (EXTERNAL_SSE2(cpu_flags)) {
c->idct_put = ff_vp3_idct_put_sse2;
c->idct_add = ff_vp3_idct_add_sse2;
}
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vp56_arith.h
0,0 → 1,54
/**
* VP5 and VP6 compatible video decoder (arith decoder)
*
* Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org>
* Copyright (C) 2010 Eli Friedman
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#ifndef AVCODEC_X86_VP56_ARITH_H
#define AVCODEC_X86_VP56_ARITH_H
 
#if HAVE_INLINE_ASM && HAVE_FAST_CMOV
#define vp56_rac_get_prob vp56_rac_get_prob
static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob)
{
unsigned int code_word = vp56_rac_renorm(c);
unsigned int high = c->high;
unsigned int low = 1 + (((high - 1) * prob) >> 8);
unsigned int low_shift = low << 16;
int bit = 0;
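/* Hedged reading of the branchless sequence below (added comment, not from
 * the original source): it is equivalent to
 *
 *     if (code_word >= low_shift) { high -= low; code_word -= low_shift; bit = 1; }
 *     else                        { high  = low;                                  }
 *
 * with setae producing the bit and the two cmovb undoing the speculative
 * subtractions on the "else" path. */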
 
__asm__(
"subl %4, %1 \n\t"
"subl %3, %2 \n\t"
"leal (%2, %3), %3 \n\t"
"setae %b0 \n\t"
"cmovb %4, %1 \n\t"
"cmovb %3, %2 \n\t"
: "+q"(bit), "+r"(high), "+r"(code_word), "+r"(low_shift)
: "r"(low)
);
 
c->high = high;
c->code_word = code_word;
return bit;
}
#endif
 
#endif /* AVCODEC_X86_VP56_ARITH_H */
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vp6dsp.asm
0,0 → 1,170
;******************************************************************************
;* MMX/SSE2-optimized functions for the VP6 decoder
;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
cextern pw_64
 
SECTION .text
 
%macro DIAG4 6
%if mmsize == 8
movq m0, [%1+%2]
movq m1, [%1+%3]
movq m3, m0
movq m4, m1
punpcklbw m0, m7
punpcklbw m1, m7
punpckhbw m3, m7
punpckhbw m4, m7
pmullw m0, [rsp+8*11] ; src[x-8 ] * biweight [0]
pmullw m1, [rsp+8*12] ; src[x ] * biweight [1]
pmullw m3, [rsp+8*11] ; src[x-8 ] * biweight [0]
pmullw m4, [rsp+8*12] ; src[x ] * biweight [1]
paddw m0, m1
paddw m3, m4
movq m1, [%1+%4]
movq m2, [%1+%5]
movq m4, m1
movq m5, m2
punpcklbw m1, m7
punpcklbw m2, m7
punpckhbw m4, m7
punpckhbw m5, m7
pmullw m1, [rsp+8*13] ; src[x+8 ] * biweight [2]
pmullw m2, [rsp+8*14] ; src[x+16] * biweight [3]
pmullw m4, [rsp+8*13] ; src[x+8 ] * biweight [2]
pmullw m5, [rsp+8*14] ; src[x+16] * biweight [3]
paddw m1, m2
paddw m4, m5
paddsw m0, m1
paddsw m3, m4
paddsw m0, m6 ; Add 64
paddsw m3, m6 ; Add 64
psraw m0, 7
psraw m3, 7
packuswb m0, m3
movq [%6], m0
%else ; mmsize == 16
movq m0, [%1+%2]
movq m1, [%1+%3]
punpcklbw m0, m7
punpcklbw m1, m7
pmullw m0, m4 ; src[x-8 ] * biweight [0]
pmullw m1, m5 ; src[x ] * biweight [1]
paddw m0, m1
movq m1, [%1+%4]
movq m2, [%1+%5]
punpcklbw m1, m7
punpcklbw m2, m7
pmullw m1, m6 ; src[x+8 ] * biweight [2]
pmullw m2, m3 ; src[x+16] * biweight [3]
paddw m1, m2
paddsw m0, m1
paddsw m0, [pw_64] ; Add 64
psraw m0, 7
packuswb m0, m0
movq [%6], m0
%endif ; mmsize == 8/16
%endmacro
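; For reference, each output byte of DIAG4 is the weighted sum its inline
; comments describe; as a scalar C sketch (bw[] = the four biweights, names
; illustrative only; the vertical pass is shown, offsets taken in the
; 8-byte-stride intermediate buffer -- the horizontal pass uses -1, 0, 1, 2):
;
;     int t = src[x-8] * bw[0] + src[x]    * bw[1] +
;             src[x+8] * bw[2] + src[x+16] * bw[3];
;     dst[x] = av_clip_uint8((t + 64) >> 7);
;
; i.e. a 4-tap filter with 7-bit weights, rounded by 64 and shifted right by
; 7 before clipping back to 8 bits.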
 
%macro SPLAT4REGS 0
%if mmsize == 8
movq m5, m3
punpcklwd m3, m3
movq m4, m3
punpckldq m3, m3
punpckhdq m4, m4
punpckhwd m5, m5
movq m2, m5
punpckhdq m2, m2
punpckldq m5, m5
movq [rsp+8*11], m3
movq [rsp+8*12], m4
movq [rsp+8*13], m5
movq [rsp+8*14], m2
%else ; mmsize == 16
pshuflw m4, m3, 0x0
pshuflw m5, m3, 0x55
pshuflw m6, m3, 0xAA
pshuflw m3, m3, 0xFF
punpcklqdq m4, m4
punpcklqdq m5, m5
punpcklqdq m6, m6
punpcklqdq m3, m3
%endif ; mmsize == 8/16
%endmacro
 
%macro vp6_filter_diag4 0
; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride,
; const int16_t h_weights[4], const int16_t v_weights[4])
cglobal vp6_filter_diag4, 5, 7, 8
mov r5, rsp ; backup stack pointer
and rsp, ~(mmsize-1) ; align stack
%if mmsize == 16
sub rsp, 8*11
%else
sub rsp, 8*15
movq m6, [pw_64]
%endif
%if ARCH_X86_64
movsxd r2, r2d
%endif
 
sub r1, r2
 
pxor m7, m7
movq m3, [r3]
SPLAT4REGS
 
mov r3, rsp
mov r6, 11
.nextrow:
DIAG4 r1, -1, 0, 1, 2, r3
add r3, 8
add r1, r2
dec r6
jnz .nextrow
 
movq m3, [r4]
SPLAT4REGS
 
lea r3, [rsp+8]
mov r6, 8
.nextcol:
DIAG4 r3, -8, 0, 8, 16, r0
add r3, 8
add r0, r2
dec r6
jnz .nextcol
 
mov rsp, r5 ; restore stack pointer
RET
%endmacro
 
%if ARCH_X86_32
INIT_MMX mmx
vp6_filter_diag4
%endif
 
INIT_XMM sse2
vp6_filter_diag4
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vp6dsp_init.c
0,0 → 1,45
/*
* VP6 MMX/SSE2 optimizations
* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com>
* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/cpu.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vp56dsp.h"
 
void ff_vp6_filter_diag4_mmx(uint8_t *dst, uint8_t *src, int stride,
const int16_t *h_weights,const int16_t *v_weights);
void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, int stride,
const int16_t *h_weights,const int16_t *v_weights);
 
av_cold void ff_vp6dsp_init_x86(VP56DSPContext* c, enum AVCodecID codec)
{
int cpu_flags = av_get_cpu_flags();
 
#if ARCH_X86_32
if (EXTERNAL_MMX(cpu_flags)) {
c->vp6_filter_diag4 = ff_vp6_filter_diag4_mmx;
}
#endif
if (EXTERNAL_SSE2(cpu_flags)) {
c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2;
}
}
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vp8dsp.asm
0,0 → 1,2780
;******************************************************************************
;* VP8 MMXEXT optimizations
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
fourtap_filter_hw_m: times 4 dw -6, 123
times 4 dw 12, -1
times 4 dw -9, 93
times 4 dw 50, -6
times 4 dw -6, 50
times 4 dw 93, -9
times 4 dw -1, 12
times 4 dw 123, -6
 
sixtap_filter_hw_m: times 4 dw 2, -11
times 4 dw 108, 36
times 4 dw -8, 1
times 4 dw 3, -16
times 4 dw 77, 77
times 4 dw -16, 3
times 4 dw 1, -8
times 4 dw 36, 108
times 4 dw -11, 2
 
fourtap_filter_hb_m: times 8 db -6, 123
times 8 db 12, -1
times 8 db -9, 93
times 8 db 50, -6
times 8 db -6, 50
times 8 db 93, -9
times 8 db -1, 12
times 8 db 123, -6
 
sixtap_filter_hb_m: times 8 db 2, 1
times 8 db -11, 108
times 8 db 36, -8
times 8 db 3, 3
times 8 db -16, 77
times 8 db 77, -16
times 8 db 1, 2
times 8 db -8, 36
times 8 db 108, -11
 
fourtap_filter_v_m: times 8 dw -6
times 8 dw 123
times 8 dw 12
times 8 dw -1
times 8 dw -9
times 8 dw 93
times 8 dw 50
times 8 dw -6
times 8 dw -6
times 8 dw 50
times 8 dw 93
times 8 dw -9
times 8 dw -1
times 8 dw 12
times 8 dw 123
times 8 dw -6
 
sixtap_filter_v_m: times 8 dw 2
times 8 dw -11
times 8 dw 108
times 8 dw 36
times 8 dw -8
times 8 dw 1
times 8 dw 3
times 8 dw -16
times 8 dw 77
times 8 dw 77
times 8 dw -16
times 8 dw 3
times 8 dw 1
times 8 dw -8
times 8 dw 36
times 8 dw 108
times 8 dw -11
times 8 dw 2
 
bilinear_filter_vw_m: times 8 dw 1
times 8 dw 2
times 8 dw 3
times 8 dw 4
times 8 dw 5
times 8 dw 6
times 8 dw 7
 
bilinear_filter_vb_m: times 8 db 7, 1
times 8 db 6, 2
times 8 db 5, 3
times 8 db 4, 4
times 8 db 3, 5
times 8 db 2, 6
times 8 db 1, 7
 
%ifdef PIC
%define fourtap_filter_hw picregq
%define sixtap_filter_hw picregq
%define fourtap_filter_hb picregq
%define sixtap_filter_hb picregq
%define fourtap_filter_v picregq
%define sixtap_filter_v picregq
%define bilinear_filter_vw picregq
%define bilinear_filter_vb picregq
%define npicregs 1
%else
%define fourtap_filter_hw fourtap_filter_hw_m
%define sixtap_filter_hw sixtap_filter_hw_m
%define fourtap_filter_hb fourtap_filter_hb_m
%define sixtap_filter_hb sixtap_filter_hb_m
%define fourtap_filter_v fourtap_filter_v_m
%define sixtap_filter_v sixtap_filter_v_m
%define bilinear_filter_vw bilinear_filter_vw_m
%define bilinear_filter_vb bilinear_filter_vb_m
%define npicregs 0
%endif
 
filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8
filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10
 
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11
 
pw_27: times 8 dw 27
pw_63: times 8 dw 63
pw_256: times 8 dw 256
pw_20091: times 4 dw 20091
pw_17734: times 4 dw 17734
 
pb_4: times 16 db 4
pb_F8: times 16 db 0xF8
pb_FE: times 16 db 0xFE
pb_27_63: times 8 db 27, 63
pb_18_63: times 8 db 18, 63
pb_9_63: times 8 db 9, 63
 
cextern pb_1
cextern pw_3
cextern pb_3
cextern pw_4
cextern pw_9
cextern pw_18
cextern pw_64
cextern pb_80
 
SECTION .text
 
;-----------------------------------------------------------------------------
; subpel MC functions:
;
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride,
; uint8_t *src, int srcstride,
; int height, int mx, int my);
;-----------------------------------------------------------------------------
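; As a rough scalar sketch of what every variant below computes (the 6-tap
; horizontal case is shown; filter[] is one row of the sixtap_filter_* tables
; above, variable names are illustrative only):
;
;     int t = 0, i;
;     for (i = 0; i < 6; i++)
;         t += src[x + i - 2] * filter[i];
;     dst[x] = av_clip_uint8((t + 64) >> 7);
;
; The 4-tap variants read src[x-1]..src[x+2], the vertical variants step by
; srcstride instead of 1, and the SSSE3 paths fold the "+64 >> 7" into a
; single pmulhrsw with pw_256.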
 
%macro FILTER_SSSE3 1
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg
lea mxd, [mxq*3]
mova m3, [filter_h6_shuf2]
mova m4, [filter_h6_shuf3]
%ifdef PIC
lea picregq, [sixtap_filter_hb_m]
%endif
mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes
mova m6, [sixtap_filter_hb+mxq*8-32]
mova m7, [sixtap_filter_hb+mxq*8-16]
 
.nextrow:
movu m0, [srcq-2]
mova m1, m0
mova m2, m0
%if mmsize == 8
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the
; shuffle with a memory operand
punpcklbw m0, [srcq+3]
%else
pshufb m0, [filter_h6_shuf1]
%endif
pshufb m1, m3
pshufb m2, m4
pmaddubsw m0, m5
pmaddubsw m1, m6
pmaddubsw m2, m7
paddsw m0, m1
paddsw m0, m2
pmulhrsw m0, [pw_256]
packuswb m0, m0
movh [dstq], m0 ; store
 
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
REP_RET
 
cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
shl mxd, 4
mova m2, [pw_256]
mova m3, [filter_h2_shuf]
mova m4, [filter_h4_shuf]
%ifdef PIC
lea picregq, [fourtap_filter_hb_m]
%endif
mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes
mova m6, [fourtap_filter_hb+mxq]
 
.nextrow:
movu m0, [srcq-1]
mova m1, m0
pshufb m0, m3
pshufb m1, m4
pmaddubsw m0, m5
pmaddubsw m1, m6
paddsw m0, m1
pmulhrsw m0, m2
packuswb m0, m0
movh [dstq], m0 ; store
 
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
REP_RET
 
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
shl myd, 4
%ifdef PIC
lea picregq, [fourtap_filter_hb_m]
%endif
mova m5, [fourtap_filter_hb+myq-16]
mova m6, [fourtap_filter_hb+myq]
mova m7, [pw_256]
 
; read 3 lines
sub srcq, srcstrideq
movh m0, [srcq]
movh m1, [srcq+ srcstrideq]
movh m2, [srcq+2*srcstrideq]
add srcq, srcstrideq
 
.nextrow:
movh m3, [srcq+2*srcstrideq] ; read new row
mova m4, m0
mova m0, m1
punpcklbw m4, m1
mova m1, m2
punpcklbw m2, m3
pmaddubsw m4, m5
pmaddubsw m2, m6
paddsw m4, m2
mova m2, m3
pmulhrsw m4, m7
packuswb m4, m4
movh [dstq], m4
 
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
REP_RET
 
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
lea myd, [myq*3]
%ifdef PIC
lea picregq, [sixtap_filter_hb_m]
%endif
lea myq, [sixtap_filter_hb+myq*8]
 
; read 5 lines
sub srcq, srcstrideq
sub srcq, srcstrideq
movh m0, [srcq]
movh m1, [srcq+srcstrideq]
movh m2, [srcq+srcstrideq*2]
lea srcq, [srcq+srcstrideq*2]
add srcq, srcstrideq
movh m3, [srcq]
movh m4, [srcq+srcstrideq]
 
.nextrow:
movh m5, [srcq+2*srcstrideq] ; read new row
mova m6, m0
punpcklbw m6, m5
mova m0, m1
punpcklbw m1, m2
mova m7, m3
punpcklbw m7, m4
pmaddubsw m6, [myq-48]
pmaddubsw m1, [myq-32]
pmaddubsw m7, [myq-16]
paddsw m6, m1
paddsw m6, m7
mova m1, m2
mova m2, m3
pmulhrsw m6, [pw_256]
mova m3, m4
packuswb m6, m6
mova m4, m5
movh [dstq], m6
 
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
REP_RET
%endmacro
 
INIT_MMX ssse3
FILTER_SSSE3 4
INIT_XMM ssse3
FILTER_SSSE3 8
 
; 4x4 block, H-only 4-tap filter
INIT_MMX mmxext
cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
shl mxd, 4
%ifdef PIC
lea picregq, [fourtap_filter_hw_m]
%endif
movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words
movq mm5, [fourtap_filter_hw+mxq]
movq mm7, [pw_64]
pxor mm6, mm6
 
.nextrow:
movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels
 
; first set of 2 pixels
movq mm2, mm1 ; byte ABCD..
punpcklbw mm1, mm6 ; byte->word ABCD
pshufw mm0, mm2, 9 ; byte CDEF..
punpcklbw mm0, mm6 ; byte->word CDEF
pshufw mm3, mm1, 0x94 ; word ABBC
pshufw mm1, mm0, 0x94 ; word CDDE
pmaddwd mm3, mm4 ; multiply 2px with F0/F1
movq mm0, mm1 ; backup for second set of pixels
pmaddwd mm1, mm5 ; multiply 2px with F2/F3
paddd mm3, mm1 ; finish 1st 2px
 
; second set of 2 pixels, use backup of above
punpckhbw mm2, mm6 ; byte->word EFGH
pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1
pshufw mm1, mm2, 0x94 ; word EFFG
pmaddwd mm1, mm5 ; multiply 2px with F2/F3
paddd mm0, mm1 ; finish 2nd 2px
 
; merge two sets of 2 pixels into one set of 4, round/clip/store
packssdw mm3, mm0 ; merge dword->word (4px)
paddsw mm3, mm7 ; rounding
psraw mm3, 7
packuswb mm3, mm6 ; clip and word->bytes
movd [dstq], mm3 ; store
 
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
REP_RET
 
; 4x4 block, H-only 6-tap filter
INIT_MMX mmxext
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg
lea mxd, [mxq*3]
%ifdef PIC
lea picregq, [sixtap_filter_hw_m]
%endif
movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
movq mm5, [sixtap_filter_hw+mxq*8-32]
movq mm6, [sixtap_filter_hw+mxq*8-16]
movq mm7, [pw_64]
pxor mm3, mm3
 
.nextrow:
movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels
 
; first set of 2 pixels
movq mm2, mm1 ; byte ABCD..
punpcklbw mm1, mm3 ; byte->word ABCD
pshufw mm0, mm2, 0x9 ; byte CDEF..
punpckhbw mm2, mm3 ; byte->word EFGH
punpcklbw mm0, mm3 ; byte->word CDEF
pshufw mm1, mm1, 0x94 ; word ABBC
pshufw mm2, mm2, 0x94 ; word EFFG
pmaddwd mm1, mm4 ; multiply 2px with F0/F1
pshufw mm3, mm0, 0x94 ; word CDDE
movq mm0, mm3 ; backup for second set of pixels
pmaddwd mm3, mm5 ; multiply 2px with F2/F3
paddd mm1, mm3 ; add to 1st 2px cache
movq mm3, mm2 ; backup for second set of pixels
pmaddwd mm2, mm6 ; multiply 2px with F4/F5
paddd mm1, mm2 ; finish 1st 2px
 
; second set of 2 pixels, use backup of above
movd mm2, [srcq+3] ; byte FGHI (prevent overreads)
pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1
pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3
paddd mm0, mm3 ; add to 2nd 2px cache
pxor mm3, mm3
punpcklbw mm2, mm3 ; byte->word FGHI
pshufw mm2, mm2, 0xE9 ; word GHHI
pmaddwd mm2, mm6 ; multiply 2px with F4/F5
paddd mm0, mm2 ; finish 2nd 2px
 
; merge two sets of 2 pixels into one set of 4, round/clip/store
packssdw mm1, mm0 ; merge dword->word (4px)
paddsw mm1, mm7 ; rounding
psraw mm1, 7
packuswb mm1, mm3 ; clip and word->bytes
movd [dstq], mm1 ; store
 
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
REP_RET
 
INIT_XMM sse2
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg
shl mxd, 5
%ifdef PIC
lea picregq, [fourtap_filter_v_m]
%endif
lea mxq, [fourtap_filter_v+mxq-32]
pxor m7, m7
mova m4, [pw_64]
mova m5, [mxq+ 0]
mova m6, [mxq+16]
%ifdef m8
mova m8, [mxq+32]
mova m9, [mxq+48]
%endif
.nextrow:
movq m0, [srcq-1]
movq m1, [srcq-0]
movq m2, [srcq+1]
movq m3, [srcq+2]
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
pmullw m0, m5
pmullw m1, m6
%ifdef m8
pmullw m2, m8
pmullw m3, m9
%else
pmullw m2, [mxq+32]
pmullw m3, [mxq+48]
%endif
paddsw m0, m1
paddsw m2, m3
paddsw m0, m2
paddsw m0, m4
psraw m0, 7
packuswb m0, m7
movh [dstq], m0 ; store
 
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
REP_RET
 
INIT_XMM sse2
cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg
lea mxd, [mxq*3]
shl mxd, 4
%ifdef PIC
lea picregq, [sixtap_filter_v_m]
%endif
lea mxq, [sixtap_filter_v+mxq-96]
pxor m7, m7
mova m6, [pw_64]
%ifdef m8
mova m8, [mxq+ 0]
mova m9, [mxq+16]
mova m10, [mxq+32]
mova m11, [mxq+48]
mova m12, [mxq+64]
mova m13, [mxq+80]
%endif
.nextrow:
movq m0, [srcq-2]
movq m1, [srcq-1]
movq m2, [srcq-0]
movq m3, [srcq+1]
movq m4, [srcq+2]
movq m5, [srcq+3]
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
punpcklbw m5, m7
%ifdef m8
pmullw m0, m8
pmullw m1, m9
pmullw m2, m10
pmullw m3, m11
pmullw m4, m12
pmullw m5, m13
%else
pmullw m0, [mxq+ 0]
pmullw m1, [mxq+16]
pmullw m2, [mxq+32]
pmullw m3, [mxq+48]
pmullw m4, [mxq+64]
pmullw m5, [mxq+80]
%endif
paddsw m1, m4
paddsw m0, m5
paddsw m1, m2
paddsw m0, m3
paddsw m0, m1
paddsw m0, m6
psraw m0, 7
packuswb m0, m7
movh [dstq], m0 ; store
 
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
REP_RET
 
%macro FILTER_V 1
; 4x4 block, V-only 4-tap filter
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
shl myd, 5
%ifdef PIC
lea picregq, [fourtap_filter_v_m]
%endif
lea myq, [fourtap_filter_v+myq-32]
mova m6, [pw_64]
pxor m7, m7
mova m5, [myq+48]
 
; read 3 lines
sub srcq, srcstrideq
movh m0, [srcq]
movh m1, [srcq+ srcstrideq]
movh m2, [srcq+2*srcstrideq]
add srcq, srcstrideq
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
 
.nextrow:
; first calculate negative taps (to prevent losing positive overflows)
movh m4, [srcq+2*srcstrideq] ; read new row
punpcklbw m4, m7
mova m3, m4
pmullw m0, [myq+0]
pmullw m4, m5
paddsw m4, m0
 
; then calculate positive taps
mova m0, m1
pmullw m1, [myq+16]
paddsw m4, m1
mova m1, m2
pmullw m2, [myq+32]
paddsw m4, m2
mova m2, m3
 
; round/clip/store
paddsw m4, m6
psraw m4, 7
packuswb m4, m7
movh [dstq], m4
 
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
REP_RET
 
 
; 4x4 block, V-only 6-tap filter
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my
shl myd, 4
lea myq, [myq*3]
%ifdef PIC
lea picregq, [sixtap_filter_v_m]
%endif
lea myq, [sixtap_filter_v+myq-96]
pxor m7, m7
 
; read 5 lines
sub srcq, srcstrideq
sub srcq, srcstrideq
movh m0, [srcq]
movh m1, [srcq+srcstrideq]
movh m2, [srcq+srcstrideq*2]
lea srcq, [srcq+srcstrideq*2]
add srcq, srcstrideq
movh m3, [srcq]
movh m4, [srcq+srcstrideq]
punpcklbw m0, m7
punpcklbw m1, m7
punpcklbw m2, m7
punpcklbw m3, m7
punpcklbw m4, m7
 
.nextrow:
; first calculate negative taps (to prevent losing positive overflows)
mova m5, m1
pmullw m5, [myq+16]
mova m6, m4
pmullw m6, [myq+64]
paddsw m6, m5
 
; then calculate positive taps
movh m5, [srcq+2*srcstrideq] ; read new row
punpcklbw m5, m7
pmullw m0, [myq+0]
paddsw m6, m0
mova m0, m1
mova m1, m2
pmullw m2, [myq+32]
paddsw m6, m2
mova m2, m3
pmullw m3, [myq+48]
paddsw m6, m3
mova m3, m4
mova m4, m5
pmullw m5, [myq+80]
paddsw m6, m5
 
; round/clip/store
paddsw m6, [pw_64]
psraw m6, 7
packuswb m6, m7
movh [dstq], m6
 
; go to next line
add dstq, dststrideq
add srcq, srcstrideq
dec heightd ; next row
jg .nextrow
REP_RET
%endmacro
 
INIT_MMX mmxext
FILTER_V 4
INIT_XMM sse2
FILTER_V 8
 
%macro FILTER_BILINEAR 1
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my
shl myd, 4
%ifdef PIC
lea picregq, [bilinear_filter_vw_m]
%endif
pxor m6, m6
mova m5, [bilinear_filter_vw+myq-1*16]
neg myq
mova m4, [bilinear_filter_vw+myq+7*16]
.nextrow:
movh m0, [srcq+srcstrideq*0]
movh m1, [srcq+srcstrideq*1]
movh m3, [srcq+srcstrideq*2]
punpcklbw m0, m6
punpcklbw m1, m6
punpcklbw m3, m6
mova m2, m1
pmullw m0, m4
pmullw m1, m5
pmullw m2, m4
pmullw m3, m5
paddsw m0, m1
paddsw m2, m3
psraw m0, 2
psraw m2, 2
pavgw m0, m6
pavgw m2, m6
%if mmsize == 8
packuswb m0, m0
packuswb m2, m2
movh [dstq+dststrideq*0], m0
movh [dstq+dststrideq*1], m2
%else
packuswb m0, m2
movh [dstq+dststrideq*0], m0
movhps [dstq+dststrideq*1], m0
%endif
 
lea dstq, [dstq+dststrideq*2]
lea srcq, [srcq+srcstrideq*2]
sub heightd, 2
jg .nextrow
REP_RET
 
cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg
shl mxd, 4
%ifdef PIC
lea picregq, [bilinear_filter_vw_m]
%endif
pxor m6, m6
mova m5, [bilinear_filter_vw+mxq-1*16]
neg mxq
mova m4, [bilinear_filter_vw+mxq+7*16]
.nextrow:
movh m0, [srcq+srcstrideq*0+0]
movh m1, [srcq+srcstrideq*0+1]
movh m2, [srcq+srcstrideq*1+0]
movh m3, [srcq+srcstrideq*1+1]
punpcklbw m0, m6
punpcklbw m1, m6
punpcklbw m2, m6
punpcklbw m3, m6
pmullw m0, m4
pmullw m1, m5
pmullw m2, m4
pmullw m3, m5
paddsw m0, m1
paddsw m2, m3
psraw m0, 2
psraw m2, 2
pavgw m0, m6
pavgw m2, m6
%if mmsize == 8
packuswb m0, m0
packuswb m2, m2
movh [dstq+dststrideq*0], m0
movh [dstq+dststrideq*1], m2
%else
packuswb m0, m2
movh [dstq+dststrideq*0], m0
movhps [dstq+dststrideq*1], m0
%endif
 
lea dstq, [dstq+dststrideq*2]
lea srcq, [srcq+srcstrideq*2]
sub heightd, 2
jg .nextrow
REP_RET
%endmacro
 
INIT_MMX mmxext
FILTER_BILINEAR 4
INIT_XMM sse2
FILTER_BILINEAR 8
 
%macro FILTER_BILINEAR_SSSE3 1
cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my
shl myd, 4
%ifdef PIC
lea picregq, [bilinear_filter_vb_m]
%endif
pxor m4, m4
mova m3, [bilinear_filter_vb+myq-16]
.nextrow:
movh m0, [srcq+srcstrideq*0]
movh m1, [srcq+srcstrideq*1]
movh m2, [srcq+srcstrideq*2]
punpcklbw m0, m1
punpcklbw m1, m2
pmaddubsw m0, m3
pmaddubsw m1, m3
psraw m0, 2
psraw m1, 2
pavgw m0, m4
pavgw m1, m4
%if mmsize==8
packuswb m0, m0
packuswb m1, m1
movh [dstq+dststrideq*0], m0
movh [dstq+dststrideq*1], m1
%else
packuswb m0, m1
movh [dstq+dststrideq*0], m0
movhps [dstq+dststrideq*1], m0
%endif
 
lea dstq, [dstq+dststrideq*2]
lea srcq, [srcq+srcstrideq*2]
sub heightd, 2
jg .nextrow
REP_RET
 
cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg
shl mxd, 4
%ifdef PIC
lea picregq, [bilinear_filter_vb_m]
%endif
pxor m4, m4
mova m2, [filter_h2_shuf]
mova m3, [bilinear_filter_vb+mxq-16]
.nextrow:
movu m0, [srcq+srcstrideq*0]
movu m1, [srcq+srcstrideq*1]
pshufb m0, m2
pshufb m1, m2
pmaddubsw m0, m3
pmaddubsw m1, m3
psraw m0, 2
psraw m1, 2
pavgw m0, m4
pavgw m1, m4
%if mmsize==8
packuswb m0, m0
packuswb m1, m1
movh [dstq+dststrideq*0], m0
movh [dstq+dststrideq*1], m1
%else
packuswb m0, m1
movh [dstq+dststrideq*0], m0
movhps [dstq+dststrideq*1], m0
%endif
 
lea dstq, [dstq+dststrideq*2]
lea srcq, [srcq+srcstrideq*2]
sub heightd, 2
jg .nextrow
REP_RET
%endmacro
 
INIT_MMX ssse3
FILTER_BILINEAR_SSSE3 4
INIT_XMM ssse3
FILTER_BILINEAR_SSSE3 8
 
INIT_MMX mmx
cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
movq mm0, [srcq+srcstrideq*0]
movq mm1, [srcq+srcstrideq*1]
lea srcq, [srcq+srcstrideq*2]
movq [dstq+dststrideq*0], mm0
movq [dstq+dststrideq*1], mm1
lea dstq, [dstq+dststrideq*2]
sub heightd, 2
jg .nextrow
REP_RET
 
%if ARCH_X86_32
INIT_MMX mmx
cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height
.nextrow:
movq mm0, [srcq+srcstrideq*0+0]
movq mm1, [srcq+srcstrideq*0+8]
movq mm2, [srcq+srcstrideq*1+0]
movq mm3, [srcq+srcstrideq*1+8]
lea srcq, [srcq+srcstrideq*2]
movq [dstq+dststrideq*0+0], mm0
movq [dstq+dststrideq*0+8], mm1
movq [dstq+dststrideq*1+0], mm2
movq [dstq+dststrideq*1+8], mm3
lea dstq, [dstq+dststrideq*2]
sub heightd, 2
jg .nextrow
REP_RET
%endif
 
INIT_XMM sse
cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height
.nextrow:
movups xmm0, [srcq+srcstrideq*0]
movups xmm1, [srcq+srcstrideq*1]
lea srcq, [srcq+srcstrideq*2]
movaps [dstq+dststrideq*0], xmm0
movaps [dstq+dststrideq*1], xmm1
lea dstq, [dstq+dststrideq*2]
sub heightd, 2
jg .nextrow
REP_RET
 
;-----------------------------------------------------------------------------
; void vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
;-----------------------------------------------------------------------------
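; Scalar sketch of the DC-only add performed by both versions below
; (names are illustrative; av_clip_uint8() is the libavutil helper):
;
;     int dc = (block[0] + 4) >> 3;
;     block[0] = 0;
;     for (y = 0; y < 4; y++)
;         for (x = 0; x < 4; x++)
;             dst[y*stride + x] = av_clip_uint8(dst[y*stride + x] + dc);
;
; The MMX path packs dc and -dc into bytes and uses saturating paddusb/
; psubusb so it never has to widen the destination pixels to words.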
 
%macro ADD_DC 4
%4 m2, [dst1q+%3]
%4 m3, [dst1q+strideq+%3]
%4 m4, [dst2q+%3]
%4 m5, [dst2q+strideq+%3]
paddusb m2, %1
paddusb m3, %1
paddusb m4, %1
paddusb m5, %1
psubusb m2, %2
psubusb m3, %2
psubusb m4, %2
psubusb m5, %2
%4 [dst1q+%3], m2
%4 [dst1q+strideq+%3], m3
%4 [dst2q+%3], m4
%4 [dst2q+strideq+%3], m5
%endmacro
 
INIT_MMX mmx
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride
; load data
movd m0, [blockq]
 
; calculate DC
paddw m0, [pw_4]
pxor m1, m1
psraw m0, 3
movd [blockq], m1
psubw m1, m0
packuswb m0, m0
packuswb m1, m1
punpcklbw m0, m0
punpcklbw m1, m1
punpcklwd m0, m0
punpcklwd m1, m1
 
; add DC
DEFINE_ARGS dst1, dst2, stride
lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m1, 0, movh
RET
 
INIT_XMM sse4
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride
; load data
movd m0, [blockq]
pxor m1, m1
 
; calculate DC
paddw m0, [pw_4]
movd [blockq], m1
DEFINE_ARGS dst1, dst2, stride
lea dst2q, [dst1q+strideq*2]
movd m2, [dst1q]
movd m3, [dst1q+strideq]
movd m4, [dst2q]
movd m5, [dst2q+strideq]
psraw m0, 3
pshuflw m0, m0, 0
punpcklqdq m0, m0
punpckldq m2, m3
punpckldq m4, m5
punpcklbw m2, m1
punpcklbw m4, m1
paddw m2, m0
paddw m4, m0
packuswb m2, m4
movd [dst1q], m2
pextrd [dst1q+strideq], m2, 1
pextrd [dst2q], m2, 2
pextrd [dst2q+strideq], m2, 3
RET
 
;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
;-----------------------------------------------------------------------------
 
%if ARCH_X86_32
INIT_MMX mmx
cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride
; load data
movd m0, [blockq+32*0] ; A
movd m1, [blockq+32*2] ; C
punpcklwd m0, [blockq+32*1] ; A B
punpcklwd m1, [blockq+32*3] ; C D
punpckldq m0, m1 ; A B C D
pxor m6, m6
 
; calculate DC
paddw m0, [pw_4]
movd [blockq+32*0], m6
movd [blockq+32*1], m6
movd [blockq+32*2], m6
movd [blockq+32*3], m6
psraw m0, 3
psubw m6, m0
packuswb m0, m0
packuswb m6, m6
punpcklbw m0, m0 ; AABBCCDD
punpcklbw m6, m6 ; AABBCCDD
movq m1, m0
movq m7, m6
punpcklbw m0, m0 ; AAAABBBB
punpckhbw m1, m1 ; CCCCDDDD
punpcklbw m6, m6 ; AAAABBBB
punpckhbw m7, m7 ; CCCCDDDD
 
; add DC
DEFINE_ARGS dst1, dst2, stride
lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m6, 0, mova
ADD_DC m1, m7, 8, mova
RET
%endif
 
INIT_XMM sse2
cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride
; load data
movd m0, [blockq+32*0] ; A
movd m1, [blockq+32*2] ; C
punpcklwd m0, [blockq+32*1] ; A B
punpcklwd m1, [blockq+32*3] ; C D
punpckldq m0, m1 ; A B C D
pxor m1, m1
 
; calculate DC
paddw m0, [pw_4]
movd [blockq+32*0], m1
movd [blockq+32*1], m1
movd [blockq+32*2], m1
movd [blockq+32*3], m1
psraw m0, 3
psubw m1, m0
packuswb m0, m0
packuswb m1, m1
punpcklbw m0, m0
punpcklbw m1, m1
punpcklbw m0, m0
punpcklbw m1, m1
 
; add DC
DEFINE_ARGS dst1, dst2, stride
lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m1, 0, mova
RET
 
;-----------------------------------------------------------------------------
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], int stride);
;-----------------------------------------------------------------------------
 
INIT_MMX mmx
cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride
; load data
movd m0, [blockq+32*0] ; A
movd m1, [blockq+32*2] ; C
punpcklwd m0, [blockq+32*1] ; A B
punpcklwd m1, [blockq+32*3] ; C D
punpckldq m0, m1 ; A B C D
pxor m6, m6
 
; calculate DC
paddw m0, [pw_4]
movd [blockq+32*0], m6
movd [blockq+32*1], m6
movd [blockq+32*2], m6
movd [blockq+32*3], m6
psraw m0, 3
psubw m6, m0
packuswb m0, m0
packuswb m6, m6
punpcklbw m0, m0 ; AABBCCDD
punpcklbw m6, m6 ; AABBCCDD
movq m1, m0
movq m7, m6
punpcklbw m0, m0 ; AAAABBBB
punpckhbw m1, m1 ; CCCCDDDD
punpcklbw m6, m6 ; AAAABBBB
punpckhbw m7, m7 ; CCCCDDDD
 
; add DC
DEFINE_ARGS dst1, dst2, stride
lea dst2q, [dst1q+strideq*2]
ADD_DC m0, m6, 0, mova
lea dst1q, [dst1q+strideq*4]
lea dst2q, [dst2q+strideq*4]
ADD_DC m1, m7, 0, mova
RET
 
;-----------------------------------------------------------------------------
; void vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], int stride);
;-----------------------------------------------------------------------------
 
; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2)
; this macro assumes that m6/m7 have words for 20091/17734 loaded
%macro VP8_MULTIPLY_SUMSUB 4
mova %3, %1
mova %4, %2
pmulhw %3, m6 ;20091(1)
pmulhw %4, m6 ;20091(2)
paddw %3, %1
paddw %4, %2
paddw %1, %1
paddw %2, %2
pmulhw %1, m7 ;35468(1)
pmulhw %2, m7 ;35468(2)
psubw %1, %4
paddw %2, %3
%endmacro
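; Fixed-point identities behind the constants (sketch): with signed pmulhw,
;     mul_20091(x) = x + ((x * 20091) >> 16)     ~ x * sqrt(2)*cos(pi/8)
;     mul_35468(x) = ((2*x) * 17734) >> 16       ~ x * sqrt(2)*sin(pi/8)
; 35468 does not fit in a signed 16-bit multiplier, so the macro doubles the
; input and multiplies by 17734 (= 35468/2), which is why pw_17734 is what
; gets loaded into m7 even though the comments speak of 35468.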
 
; calculate x0=%1+%3; x1=%1-%3
; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4)
; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3)
; %5/%6 are temporary registers
; we assume m6/m7 have constant words 20091/17734 loaded in them
%macro VP8_IDCT_TRANSFORM4x4_1D 6
SUMSUB_BA w, %3, %1, %5 ;t0, t1
VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3
SUMSUB_BA w, %4, %3, %5 ;tmp0, tmp3
SUMSUB_BA w, %2, %1, %5 ;tmp1, tmp2
SWAP %4, %1
SWAP %4, %3
%endmacro
 
%macro VP8_IDCT_ADD 0
cglobal vp8_idct_add, 3, 3, 0, dst, block, stride
; load block data
movq m0, [blockq+ 0]
movq m1, [blockq+ 8]
movq m2, [blockq+16]
movq m3, [blockq+24]
movq m6, [pw_20091]
movq m7, [pw_17734]
%if cpuflag(sse)
xorps xmm0, xmm0
movaps [blockq+ 0], xmm0
movaps [blockq+16], xmm0
%else
pxor m4, m4
movq [blockq+ 0], m4
movq [blockq+ 8], m4
movq [blockq+16], m4
movq [blockq+24], m4
%endif
 
; actual IDCT
VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
TRANSPOSE4x4W 0, 1, 2, 3, 4
paddw m0, [pw_4]
VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5
TRANSPOSE4x4W 0, 1, 2, 3, 4
 
; store
pxor m4, m4
DEFINE_ARGS dst1, dst2, stride
lea dst2q, [dst1q+2*strideq]
STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq
STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq
 
RET
%endmacro
 
%if ARCH_X86_32
INIT_MMX mmx
VP8_IDCT_ADD
%endif
INIT_MMX sse
VP8_IDCT_ADD
 
;-----------------------------------------------------------------------------
; void vp8_luma_dc_wht_<opt>(int16_t block[4][4][16], int16_t dc[16])
;-----------------------------------------------------------------------------
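; Rough scalar picture of what follows: dc[] holds the 4x4 second-order luma
; DC block; it gets a 4x4 inverse Walsh-Hadamard transform (one HADAMARD4_1D
; pass per dimension around a transpose), each result is rounded as
;
;     block[i][0] = (wht_out[i] + 3) >> 3;      // i = 0..15, names illustrative
;
; and SCATTER_WHT stores the 16 values into coefficient 0 of the 16 per-block
; arrays.  The +3 bias is added to the first row only (paddw m0, [pw_3]):
; that row contributes with weight +1 to every output of the second pass, so
; the bias reaches all 16 results.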
 
%macro SCATTER_WHT 3
movd dc1d, m%1
movd dc2d, m%2
mov [blockq+2*16*(0+%3)], dc1w
mov [blockq+2*16*(1+%3)], dc2w
shr dc1d, 16
shr dc2d, 16
psrlq m%1, 32
psrlq m%2, 32
mov [blockq+2*16*(4+%3)], dc1w
mov [blockq+2*16*(5+%3)], dc2w
movd dc1d, m%1
movd dc2d, m%2
mov [blockq+2*16*(8+%3)], dc1w
mov [blockq+2*16*(9+%3)], dc2w
shr dc1d, 16
shr dc2d, 16
mov [blockq+2*16*(12+%3)], dc1w
mov [blockq+2*16*(13+%3)], dc2w
%endmacro
 
%macro HADAMARD4_1D 4
SUMSUB_BADC w, %2, %1, %4, %3
SUMSUB_BADC w, %4, %2, %3, %1
SWAP %1, %4, %3
%endmacro
 
%macro VP8_DC_WHT 0
cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2
movq m0, [dc1q]
movq m1, [dc1q+8]
movq m2, [dc1q+16]
movq m3, [dc1q+24]
%if cpuflag(sse)
xorps xmm0, xmm0
movaps [dc1q+ 0], xmm0
movaps [dc1q+16], xmm0
%else
pxor m4, m4
movq [dc1q+ 0], m4
movq [dc1q+ 8], m4
movq [dc1q+16], m4
movq [dc1q+24], m4
%endif
HADAMARD4_1D 0, 1, 2, 3
TRANSPOSE4x4W 0, 1, 2, 3, 4
paddw m0, [pw_3]
HADAMARD4_1D 0, 1, 2, 3
psraw m0, 3
psraw m1, 3
psraw m2, 3
psraw m3, 3
SCATTER_WHT 0, 1, 0
SCATTER_WHT 2, 3, 2
RET
%endmacro
 
%if ARCH_X86_32
INIT_MMX mmx
VP8_DC_WHT
%endif
INIT_MMX sse
VP8_DC_WHT
 
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim);
;-----------------------------------------------------------------------------
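; Scalar sketch of the simple loop filter applied below to each p1,p0|q0,q1
; column (av_clip_int8/av_clip_uint8 are the libavutil helpers):
;
;     if (2*FFABS(p0 - q0) + FFABS(p1 - q1)/2 <= flim) {
;         int a  = av_clip_int8(av_clip_int8(p1 - q1) + 3*(q0 - p0));
;         int f1 = av_clip_int8(a + 4) >> 3;
;         int f2 = av_clip_int8(a + 3) >> 3;
;         q0 = av_clip_uint8(q0 - f1);
;         p0 = av_clip_uint8(p0 + f2);
;     }
;
; The SIMD code below evaluates the mask and both deltas for 8/16 columns at
; once, splitting f1/f2 into positive and negative halves so it can stay in
; unsigned saturating byte arithmetic.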
 
; macro called with 7 mm register indexes as argument, and 4 regular registers
;
; first 4 mm registers will carry the transposed pixel data
; the other three are scratchspace (one would be sufficient, but this allows
; for more spreading/pipelining and thus faster execution on out-of-order CPUs)
;
; first two regular registers are buf+4*stride and buf+5*stride
; third is -stride, fourth is +stride
%macro READ_8x4_INTERLEAVED 11
; interleave 8 (A-H) rows of 4 pixels each
movd m%1, [%8+%10*4] ; A0-3
movd m%5, [%9+%10*4] ; B0-3
movd m%2, [%8+%10*2] ; C0-3
movd m%6, [%8+%10] ; D0-3
movd m%3, [%8] ; E0-3
movd m%7, [%9] ; F0-3
movd m%4, [%9+%11] ; G0-3
punpcklbw m%1, m%5 ; A/B interleaved
movd m%5, [%9+%11*2] ; H0-3
punpcklbw m%2, m%6 ; C/D interleaved
punpcklbw m%3, m%7 ; E/F interleaved
punpcklbw m%4, m%5 ; G/H interleaved
%endmacro
 
; macro called with 7 mm register indexes as argument, and 5 regular registers
; first 11 mean the same as READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it
; will be set to second regular register + 8*stride at the end
%macro READ_16x4_INTERLEAVED 12
; transpose 16 (A-P) rows of 4 pixels each
lea %12, [r0+8*r2]
 
; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M
movd m%1, [%8+%10*4] ; A0-3
movd m%3, [%12+%10*4] ; I0-3
movd m%2, [%8+%10*2] ; C0-3
movd m%4, [%12+%10*2] ; K0-3
movd m%6, [%8+%10] ; D0-3
movd m%5, [%12+%10] ; L0-3
movd m%7, [%12] ; M0-3
add %12, %11
punpcklbw m%1, m%3 ; A/I
movd m%3, [%8] ; E0-3
punpcklbw m%2, m%4 ; C/K
punpcklbw m%6, m%5 ; D/L
punpcklbw m%3, m%7 ; E/M
punpcklbw m%2, m%6 ; C/D/K/L interleaved
 
; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P
movd m%5, [%9+%10*4] ; B0-3
movd m%4, [%12+%10*4] ; J0-3
movd m%7, [%9] ; F0-3
movd m%6, [%12] ; N0-3
punpcklbw m%5, m%4 ; B/J
punpcklbw m%7, m%6 ; F/N
punpcklbw m%1, m%5 ; A/B/I/J interleaved
punpcklbw m%3, m%7 ; E/F/M/N interleaved
movd m%4, [%9+%11] ; G0-3
movd m%6, [%12+%11] ; O0-3
movd m%5, [%9+%11*2] ; H0-3
movd m%7, [%12+%11*2] ; P0-3
punpcklbw m%4, m%6 ; G/O
punpcklbw m%5, m%7 ; H/P
punpcklbw m%4, m%5 ; G/H/O/P interleaved
%endmacro
 
; write 4 mm registers of 2 dwords each
; first four arguments are mm register indexes containing source data
; last four are registers containing buf+4*stride, buf+5*stride,
; -stride and +stride
%macro WRITE_4x2D 8
; write out (2 dwords per register)
movd [%5+%7*4], m%1
movd [%5+%7*2], m%2
movd [%5], m%3
movd [%6+%8], m%4
punpckhdq m%1, m%1
punpckhdq m%2, m%2
punpckhdq m%3, m%3
punpckhdq m%4, m%4
movd [%6+%7*4], m%1
movd [%5+%7], m%2
movd [%6], m%3
movd [%6+%8*2], m%4
%endmacro
 
; write 4 xmm registers of 4 dwords each
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride
; we add 1*stride to the third regular register in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the
; same memory region), or 8 if they cover two separate buffers (third one points to
; a different memory region than the first two), allowing for more optimal code for
; the 16-width case
%macro WRITE_4x4D 10
; write out (4 dwords per register), start with dwords zero
movd [%5+%8*4], m%1
movd [%5], m%2
movd [%7+%8*4], m%3
movd [%7], m%4
 
; store dwords 1
psrldq m%1, 4
psrldq m%2, 4
psrldq m%3, 4
psrldq m%4, 4
movd [%6+%8*4], m%1
movd [%6], m%2
%if %10 == 16
movd [%6+%9*4], m%3
%endif
movd [%7+%9], m%4
 
; write dwords 2
psrldq m%1, 4
psrldq m%2, 4
%if %10 == 8
movd [%5+%8*2], m%1
movd %5d, m%3
%endif
psrldq m%3, 4
psrldq m%4, 4
%if %10 == 16
movd [%5+%8*2], m%1
%endif
movd [%6+%9], m%2
movd [%7+%8*2], m%3
movd [%7+%9*2], m%4
add %7, %9
 
; store dwords 3
psrldq m%1, 4
psrldq m%2, 4
psrldq m%3, 4
psrldq m%4, 4
%if %10 == 8
mov [%7+%8*4], %5d
movd [%6+%8*2], m%1
%else
movd [%5+%8], m%1
%endif
movd [%6+%9*2], m%2
movd [%7+%8*2], m%3
movd [%7+%9*2], m%4
%endmacro
 
; write 4 or 8 words in the mmx/xmm registers as 8 lines
; 1 and 2 are the registers to write, this can be the same (for SSE2)
; for pre-SSE4:
; 3 is a general-purpose register that we will clobber
; for SSE4:
; 3 is a pointer to the destination's 5th line
; 4 is a pointer to the destination's 4th line
; 5/6 is -stride and +stride
%macro WRITE_2x4W 6
movd %3d, %1
punpckhdq %1, %1
mov [%4+%5*4], %3w
shr %3, 16
add %4, %6
mov [%4+%5*4], %3w
 
movd %3d, %1
add %4, %5
mov [%4+%5*2], %3w
shr %3, 16
mov [%4+%5 ], %3w
 
movd %3d, %2
punpckhdq %2, %2
mov [%4 ], %3w
shr %3, 16
mov [%4+%6 ], %3w
 
movd %3d, %2
add %4, %6
mov [%4+%6 ], %3w
shr %3, 16
mov [%4+%6*2], %3w
add %4, %5
%endmacro
 
%macro WRITE_8W 5
%if cpuflag(sse4)
pextrw [%3+%4*4], %1, 0
pextrw [%2+%4*4], %1, 1
pextrw [%3+%4*2], %1, 2
pextrw [%3+%4 ], %1, 3
pextrw [%3 ], %1, 4
pextrw [%2 ], %1, 5
pextrw [%2+%5 ], %1, 6
pextrw [%2+%5*2], %1, 7
%else
movd %2d, %1
psrldq %1, 4
mov [%3+%4*4], %2w
shr %2, 16
add %3, %5
mov [%3+%4*4], %2w
 
movd %2d, %1
psrldq %1, 4
add %3, %4
mov [%3+%4*2], %2w
shr %2, 16
mov [%3+%4 ], %2w
 
movd %2d, %1
psrldq %1, 4
mov [%3 ], %2w
shr %2, 16
mov [%3+%5 ], %2w
 
movd %2d, %1
add %3, %5
mov [%3+%5 ], %2w
shr %2, 16
mov [%3+%5*2], %2w
%endif
%endmacro
 
%macro SIMPLE_LOOPFILTER 2
cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr
%if mmsize == 8 ; mmx/mmxext
mov cntrq, 2
%endif
%if cpuflag(ssse3)
pxor m0, m0
%endif
SPLATB_REG m7, flim, m0 ; splat "flim" into register
 
; set up indexes to address 4 rows
%if mmsize == 8
DEFINE_ARGS dst1, mstride, stride, cntr, dst2
%else
DEFINE_ARGS dst1, mstride, stride, dst3, dst2
%endif
mov strideq, mstrideq
neg mstrideq
%ifidn %1, h
lea dst1q, [dst1q+4*strideq-2]
%endif
 
%if mmsize == 8 ; mmx / mmxext
.next8px:
%endif
%ifidn %1, v
; read 4 half/full rows of pixels
mova m0, [dst1q+mstrideq*2] ; p1
mova m1, [dst1q+mstrideq] ; p0
mova m2, [dst1q] ; q0
mova m3, [dst1q+ strideq] ; q1
%else ; h
lea dst2q, [dst1q+ strideq]
 
%if mmsize == 8 ; mmx/mmxext
READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq
%else ; sse2
READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q
%endif
TRANSPOSE4x4W 0, 1, 2, 3, 4
%endif
 
; simple_limit
mova m5, m2 ; m5=backup of q0
mova m6, m1 ; m6=backup of p0
psubusb m1, m2 ; p0-q0
psubusb m2, m6 ; q0-p0
por m1, m2 ; FFABS(p0-q0)
paddusb m1, m1 ; m1=FFABS(p0-q0)*2
 
mova m4, m3
mova m2, m0
psubusb m3, m0 ; q1-p1
psubusb m0, m4 ; p1-q1
por m3, m0 ; FFABS(p1-q1)
mova m0, [pb_80]
pxor m2, m0
pxor m4, m0
psubsb m2, m4 ; m2=p1-q1 (signed) backup for below
pand m3, [pb_FE]
psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed
paddusb m3, m1
psubusb m3, m7
pxor m1, m1
pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0)
 
; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask)
mova m4, m5
pxor m5, m0
pxor m0, m6
psubsb m5, m0 ; q0-p0 (signed)
paddsb m2, m5
paddsb m2, m5
paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0)
pand m2, m3 ; apply filter mask (m3)
 
mova m3, [pb_F8]
mova m1, m2
paddsb m2, [pb_4] ; f1<<3=a+4
paddsb m1, [pb_3] ; f2<<3=a+3
pand m2, m3
pand m1, m3 ; cache f2<<3
 
pxor m0, m0
pxor m3, m3
pcmpgtb m0, m2 ; which values are <0?
psubb m3, m2 ; -f1<<3
psrlq m2, 3 ; +f1
psrlq m3, 3 ; -f1
pand m3, m0
pandn m0, m2
psubusb m4, m0
paddusb m4, m3 ; q0-f1
 
pxor m0, m0
pxor m3, m3
pcmpgtb m0, m1 ; which values are <0?
psubb m3, m1 ; -f2<<3
psrlq m1, 3 ; +f2
psrlq m3, 3 ; -f2
pand m3, m0
pandn m0, m1
paddusb m6, m0
psubusb m6, m3 ; p0+f2
 
; store
%ifidn %1, v
mova [dst1q], m4
mova [dst1q+mstrideq], m6
%else ; h
inc dst1q
SBUTTERFLY bw, 6, 4, 0
 
%if mmsize == 16 ; sse2
%if cpuflag(sse4)
inc dst2q
%endif
WRITE_8W m6, dst2q, dst1q, mstrideq, strideq
lea dst2q, [dst3q+mstrideq+1]
%if cpuflag(sse4)
inc dst3q
%endif
WRITE_8W m4, dst3q, dst2q, mstrideq, strideq
%else ; mmx/mmxext
WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq
%endif
%endif
 
%if mmsize == 8 ; mmx/mmxext
; next 8 pixels
%ifidn %1, v
add dst1q, 8 ; advance 8 cols = pixels
%else ; h
lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines
%endif
dec cntrq
jg .next8px
REP_RET
%else ; sse2
RET
%endif
%endmacro
 
%if ARCH_X86_32
INIT_MMX mmx
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
INIT_MMX mmxext
SIMPLE_LOOPFILTER v, 4
SIMPLE_LOOPFILTER h, 5
%endif
 
INIT_XMM sse2
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM ssse3
SIMPLE_LOOPFILTER v, 3
SIMPLE_LOOPFILTER h, 5
INIT_XMM sse4
SIMPLE_LOOPFILTER h, 5
 
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
; int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
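; Scalar sketch of the "inner" edge filter below.  A column is filtered when
; normal_limit holds (2*|p0-q0| + |p1-q1|/2 <= E and every neighbouring
; difference |p3-p2| ... |q1-q0| <= I); hev ("high edge variance") is
; |p1-p0| > hev_thr or |q1-q0| > hev_thr.  Then, roughly:
;
;     a  = av_clip_int8(3*(q0 - p0) + (hev ? av_clip_int8(p1 - q1) : 0));
;     f1 = av_clip_int8(a + 4) >> 3;   q0 = av_clip_uint8(q0 - f1);
;     f2 = av_clip_int8(a + 3) >> 3;   p0 = av_clip_uint8(p0 + f2);
;     if (!hev) {
;         a  = (f1 + 1) >> 1;
;         q1 = av_clip_uint8(q1 - a);  p1 = av_clip_uint8(p1 + a);
;     }
;
; (f1 + 1) >> 1 is what the mmxext/sse2 paths compute with pavgb further down.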
 
%macro INNER_LOOPFILTER 2
%define stack_size 0
%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%ifidn %1, v ; [3]=hev() result
%define stack_size mmsize * -4
%else ; h ; extra storage space for transposes
%define stack_size mmsize * -5
%endif
%endif
 
%if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr
%endif
 
%if cpuflag(ssse3)
pxor m7, m7
%endif
 
%ifndef m8
; splat function arguments
SPLATB_REG m0, flimEq, m7 ; E
SPLATB_REG m1, flimIq, m7 ; I
SPLATB_REG m2, hevthrq, m7 ; hev_thresh
 
%define m_flimE [rsp]
%define m_flimI [rsp+mmsize]
%define m_hevthr [rsp+mmsize*2]
%define m_maskres [rsp+mmsize*3]
%define m_p0backup [rsp+mmsize*3]
%define m_q0backup [rsp+mmsize*4]
 
mova m_flimE, m0
mova m_flimI, m1
mova m_hevthr, m2
%else
%define m_flimE m9
%define m_flimI m10
%define m_hevthr m11
%define m_maskres m12
%define m_p0backup m12
%define m_q0backup m8
 
; splat function arguments
SPLATB_REG m_flimE, flimEq, m7 ; E
SPLATB_REG m_flimI, flimIq, m7 ; I
SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
%endif
 
%if %2 == 8 ; chroma
DEFINE_ARGS dst1, dst8, mstride, stride, dst2
%elif mmsize == 8
DEFINE_ARGS dst1, mstride, stride, dst2, cntr
mov cntrq, 2
%else
DEFINE_ARGS dst1, mstride, stride, dst2, dst8
%endif
mov strideq, mstrideq
neg mstrideq
%ifidn %1, h
lea dst1q, [dst1q+strideq*4-4]
%if %2 == 8 ; chroma
lea dst8q, [dst8q+strideq*4-4]
%endif
%endif
 
%if mmsize == 8
.next8px:
%endif
; read
lea dst2q, [dst1q+strideq]
%ifidn %1, v
%if %2 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
movrow m0, [dst1q+mstrideq*4] ; p3
movrow m1, [dst2q+mstrideq*4] ; p2
movrow m2, [dst1q+mstrideq*2] ; p1
movrow m5, [dst2q] ; q1
movrow m6, [dst2q+ strideq*1] ; q2
movrow m7, [dst2q+ strideq*2] ; q3
%if mmsize == 16 && %2 == 8
movhps m0, [dst8q+mstrideq*4]
movhps m2, [dst8q+mstrideq*2]
add dst8q, strideq
movhps m1, [dst8q+mstrideq*4]
movhps m5, [dst8q]
movhps m6, [dst8q+ strideq ]
movhps m7, [dst8q+ strideq*2]
add dst8q, mstrideq
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
; read 8 rows of 8px each
movu m0, [dst1q+mstrideq*4]
movu m1, [dst2q+mstrideq*4]
movu m2, [dst1q+mstrideq*2]
movu m3, [dst1q+mstrideq ]
movu m4, [dst1q]
movu m5, [dst2q]
movu m6, [dst2q+ strideq ]
 
; 8x8 transpose
TRANSPOSE4x4B 0, 1, 2, 3, 7
mova m_q0backup, m1
movu m7, [dst2q+ strideq*2]
TRANSPOSE4x4B 4, 5, 6, 7, 1
SBUTTERFLY dq, 0, 4, 1 ; p3/p2
SBUTTERFLY dq, 2, 6, 1 ; q0/q1
SBUTTERFLY dq, 3, 7, 1 ; q2/q3
mova m1, m_q0backup
mova m_q0backup, m2 ; store q0
SBUTTERFLY dq, 1, 5, 2 ; p1/p0
mova m_p0backup, m5 ; store p0
SWAP 1, 4
SWAP 2, 4
SWAP 6, 3
SWAP 5, 3
%else ; sse2 (h)
%if %2 == 16
lea dst8q, [dst1q+ strideq*8]
%endif
 
; read 16 rows of 8px each, interleave
movh m0, [dst1q+mstrideq*4]
movh m1, [dst8q+mstrideq*4]
movh m2, [dst1q+mstrideq*2]
movh m5, [dst8q+mstrideq*2]
movh m3, [dst1q+mstrideq ]
movh m6, [dst8q+mstrideq ]
movh m4, [dst1q]
movh m7, [dst8q]
punpcklbw m0, m1 ; A/I
punpcklbw m2, m5 ; C/K
punpcklbw m3, m6 ; D/L
punpcklbw m4, m7 ; E/M
 
add dst8q, strideq
movh m1, [dst2q+mstrideq*4]
movh m6, [dst8q+mstrideq*4]
movh m5, [dst2q]
movh m7, [dst8q]
punpcklbw m1, m6 ; B/J
punpcklbw m5, m7 ; F/N
movh m6, [dst2q+ strideq ]
movh m7, [dst8q+ strideq ]
punpcklbw m6, m7 ; G/O
 
; 8x16 transpose
TRANSPOSE4x4B 0, 1, 2, 3, 7
%ifdef m8
SWAP 1, 8
%else
mova m_q0backup, m1
%endif
movh m7, [dst2q+ strideq*2]
movh m1, [dst8q+ strideq*2]
punpcklbw m7, m1 ; H/P
TRANSPOSE4x4B 4, 5, 6, 7, 1
SBUTTERFLY dq, 0, 4, 1 ; p3/p2
SBUTTERFLY dq, 2, 6, 1 ; q0/q1
SBUTTERFLY dq, 3, 7, 1 ; q2/q3
%ifdef m8
SWAP 1, 8
SWAP 2, 8
%else
mova m1, m_q0backup
mova m_q0backup, m2 ; store q0
%endif
SBUTTERFLY dq, 1, 5, 2 ; p1/p0
%ifdef m12
SWAP 5, 12
%else
mova m_p0backup, m5 ; store p0
%endif
SWAP 1, 4
SWAP 2, 4
SWAP 6, 3
SWAP 5, 3
%endif
 
; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
mova m4, m1
SWAP 4, 1
psubusb m4, m0 ; p2-p3
psubusb m0, m1 ; p3-p2
por m0, m4 ; abs(p3-p2)
 
mova m4, m2
SWAP 4, 2
psubusb m4, m1 ; p1-p2
psubusb m1, m2 ; p2-p1
por m1, m4 ; abs(p2-p1)
 
mova m4, m6
SWAP 4, 6
psubusb m4, m7 ; q2-q3
psubusb m7, m6 ; q3-q2
por m7, m4 ; abs(q3-q2)
 
mova m4, m5
SWAP 4, 5
psubusb m4, m6 ; q1-q2
psubusb m6, m5 ; q2-q1
por m6, m4 ; abs(q2-q1)
 
%if notcpuflag(mmxext)
mova m4, m_flimI
pxor m3, m3
psubusb m0, m4
psubusb m1, m4
psubusb m7, m4
psubusb m6, m4
pcmpeqb m0, m3 ; abs(p3-p2) <= I
pcmpeqb m1, m3 ; abs(p2-p1) <= I
pcmpeqb m7, m3 ; abs(q3-q2) <= I
pcmpeqb m6, m3 ; abs(q2-q1) <= I
pand m0, m1
pand m7, m6
pand m0, m7
%else ; mmxext/sse2
pmaxub m0, m1
pmaxub m6, m7
pmaxub m0, m6
%endif
 
; normal_limit and high_edge_variance for p1-p0, q1-q0
SWAP 7, 3 ; now m7 is zero
%ifidn %1, v
movrow m3, [dst1q+mstrideq ] ; p0
%if mmsize == 16 && %2 == 8
movhps m3, [dst8q+mstrideq ]
%endif
%elifdef m12
SWAP 3, 12
%else
mova m3, m_p0backup
%endif
 
mova m1, m2
SWAP 1, 2
mova m6, m3
SWAP 3, 6
psubusb m1, m3 ; p1-p0
psubusb m6, m2 ; p0-p1
por m1, m6 ; abs(p1-p0)
%if notcpuflag(mmxext)
mova m6, m1
psubusb m1, m4
psubusb m6, m_hevthr
pcmpeqb m1, m7 ; abs(p1-p0) <= I
pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
pand m0, m1
mova m_maskres, m6
%else ; mmxext/sse2
pmaxub m0, m1 ; max_I
SWAP 1, 4 ; max_hev_thresh
%endif
 
SWAP 6, 4 ; now m6 is I
%ifidn %1, v
movrow m4, [dst1q] ; q0
%if mmsize == 16 && %2 == 8
movhps m4, [dst8q]
%endif
%elifdef m8
SWAP 4, 8
%else
mova m4, m_q0backup
%endif
mova m1, m4
SWAP 1, 4
mova m7, m5
SWAP 7, 5
psubusb m1, m5 ; q0-q1
psubusb m7, m4 ; q1-q0
por m1, m7 ; abs(q1-q0)
%if notcpuflag(mmxext)
mova m7, m1
psubusb m1, m6
psubusb m7, m_hevthr
pxor m6, m6
pcmpeqb m1, m6 ; abs(q1-q0) <= I
pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
mova m6, m_maskres
pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
pand m6, m7
%else ; mmxext/sse2
pxor m7, m7
pmaxub m0, m1
pmaxub m6, m1
psubusb m0, m_flimI
psubusb m6, m_hevthr
pcmpeqb m0, m7 ; max(abs(..)) <= I
pcmpeqb m6, m7 ; !(max(abs..) > thresh)
%endif
%ifdef m12
SWAP 6, 12
%else
mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif
 
; simple_limit
mova m1, m3
SWAP 1, 3
mova m6, m4 ; keep copies of p0/q0 around for later use
SWAP 6, 4
psubusb m1, m4 ; p0-q0
psubusb m6, m3 ; q0-p0
por m1, m6 ; abs(q0-p0)
paddusb m1, m1 ; m1=2*abs(q0-p0)
 
mova m7, m2
SWAP 7, 2
mova m6, m5
SWAP 6, 5
psubusb m7, m5 ; p1-q1
psubusb m6, m2 ; q1-p1
por m7, m6 ; abs(q1-p1)
pxor m6, m6
pand m7, [pb_FE]
psrlq m7, 1 ; abs(q1-p1)/2
paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
psubusb m7, m_flimE
pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
pand m0, m7 ; normal_limit result
 
; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
mova m8, [pb_80]
%define m_pb_80 m8
%else ; x86-32 or mmx/mmxext
%define m_pb_80 [pb_80]
%endif
mova m1, m4
mova m7, m3
pxor m1, m_pb_80
pxor m7, m_pb_80
psubsb m1, m7 ; (signed) q0-p0
mova m6, m2
mova m7, m5
pxor m6, m_pb_80
pxor m7, m_pb_80
psubsb m6, m7 ; (signed) p1-q1
mova m7, m_maskres
pandn m7, m6
paddsb m7, m1
paddsb m7, m1
paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1)
 
pand m7, m0
mova m1, [pb_F8]
mova m6, m7
paddsb m7, [pb_3]
paddsb m6, [pb_4]
pand m7, m1
pand m6, m1
 
pxor m1, m1
pxor m0, m0
pcmpgtb m1, m7
psubb m0, m7
psrlq m7, 3 ; +f2
psrlq m0, 3 ; -f2
pand m0, m1
pandn m1, m7
psubusb m3, m0
paddusb m3, m1 ; p0+f2
 
pxor m1, m1
pxor m0, m0
pcmpgtb m0, m6
psubb m1, m6
psrlq m6, 3 ; +f1
psrlq m1, 3 ; -f1
pand m1, m0
pandn m0, m6
psubusb m4, m0
paddusb m4, m1 ; q0-f1
 
%ifdef m12
SWAP 6, 12
%else
mova m6, m_maskres
%endif
%if notcpuflag(mmxext)
mova m7, [pb_1]
%else ; mmxext/sse2
pxor m7, m7
%endif
pand m0, m6
pand m1, m6
%if notcpuflag(mmxext)
paddusb m0, m7
pand m1, [pb_FE]
pandn m7, m0
psrlq m1, 1
psrlq m7, 1
SWAP 0, 7
%else ; mmxext/sse2
psubusb m1, [pb_1]
pavgb m0, m7 ; a
pavgb m1, m7 ; -a
%endif
psubusb m5, m0
psubusb m2, m1
paddusb m5, m1 ; q1-a
paddusb m2, m0 ; p1+a
 
; store
%ifidn %1, v
movrow [dst1q+mstrideq*2], m2
movrow [dst1q+mstrideq ], m3
movrow [dst1q], m4
movrow [dst1q+ strideq ], m5
%if mmsize == 16 && %2 == 8
movhps [dst8q+mstrideq*2], m2
movhps [dst8q+mstrideq ], m3
movhps [dst8q], m4
movhps [dst8q+ strideq ], m5
%endif
%else ; h
add dst1q, 2
add dst2q, 2
 
; 4x8/16 transpose
TRANSPOSE4x4B 2, 3, 4, 5, 6
 
%if mmsize == 8 ; mmx/mmxext (h)
WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq
%else ; sse2 (h)
lea dst8q, [dst8q+mstrideq +2]
WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2
%endif
%endif
 
%if mmsize == 8
%if %2 == 8 ; chroma
%ifidn %1, h
sub dst1q, 2
%endif
cmp dst1q, dst8q
mov dst1q, dst8q
jnz .next8px
%else
%ifidn %1, h
lea dst1q, [dst1q+ strideq*8-2]
%else ; v
add dst1q, 8
%endif
dec cntrq
jg .next8px
%endif
REP_RET
%else ; mmsize == 16
RET
%endif
%endmacro
 
%if ARCH_X86_32
INIT_MMX mmx
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
 
INIT_MMX mmxext
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
%endif
 
INIT_XMM sse2
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
 
INIT_XMM ssse3
INNER_LOOPFILTER v, 16
INNER_LOOPFILTER h, 16
INNER_LOOPFILTER v, 8
INNER_LOOPFILTER h, 8
 
;-----------------------------------------------------------------------------
; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride,
; int flimE, int flimI, int hev_thr);
;-----------------------------------------------------------------------------
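; Scalar sketch of the macroblock-edge filter below.  Columns that pass
; normal_limit but have high edge variance only get the narrow f1/f2 update
; on p0/q0 (as in the filter above); the remaining columns get the wider
; update:
;
;     w  = av_clip_int8(av_clip_int8(p1 - q1) + 3*(q0 - p0));
;     a0 = (27*w + 63) >> 7;   p0 += a0;   q0 -= a0;
;     a1 = (18*w + 63) >> 7;   p1 += a1;   q1 -= a1;
;     a2 = ( 9*w + 63) >> 7;   p2 += a2;   q2 -= a2;
;
; (all results clipped back to 8 bits) -- which is where the pw_27/pw_18/pw_9
; words and the pb_27_63/pb_18_63/pb_9_63 pairs for the SSSE3 pmaddubsw path
; come from.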
 
%macro MBEDGE_LOOPFILTER 2
%define stack_size 0
%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr
%if mmsize == 16 ; [3]=hev() result
; [4]=filter tmp result
; [5]/[6] = p2/q2 backup
; [7]=lim_res sign result
%define stack_size mmsize * -7
%else ; 8 ; extra storage space for transposes
%define stack_size mmsize * -8
%endif
%endif
 
%if %2 == 8 ; chroma
cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr
%else ; luma
cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr
%endif
 
%if cpuflag(ssse3)
pxor m7, m7
%endif
 
%ifndef m8
; splat function arguments
SPLATB_REG m0, flimEq, m7 ; E
SPLATB_REG m1, flimIq, m7 ; I
SPLATB_REG m2, hevthrq, m7 ; hev_thresh
 
%define m_flimE [rsp]
%define m_flimI [rsp+mmsize]
%define m_hevthr [rsp+mmsize*2]
%define m_maskres [rsp+mmsize*3]
%define m_limres [rsp+mmsize*4]
%define m_p0backup [rsp+mmsize*3]
%define m_q0backup [rsp+mmsize*4]
%define m_p2backup [rsp+mmsize*5]
%define m_q2backup [rsp+mmsize*6]
%if mmsize == 16
%define m_limsign [rsp]
%else
%define m_limsign [rsp+mmsize*7]
%endif
 
mova m_flimE, m0
mova m_flimI, m1
mova m_hevthr, m2
%else ; sse2 on x86-64
%define m_flimE m9
%define m_flimI m10
%define m_hevthr m11
%define m_maskres m12
%define m_limres m8
%define m_p0backup m12
%define m_q0backup m8
%define m_p2backup m13
%define m_q2backup m14
%define m_limsign m9
 
; splat function arguments
SPLATB_REG m_flimE, flimEq, m7 ; E
SPLATB_REG m_flimI, flimIq, m7 ; I
SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh
%endif
 
%if %2 == 8 ; chroma
DEFINE_ARGS dst1, dst8, mstride, stride, dst2
%elif mmsize == 8
DEFINE_ARGS dst1, mstride, stride, dst2, cntr
mov cntrq, 2
%else
DEFINE_ARGS dst1, mstride, stride, dst2, dst8
%endif
mov strideq, mstrideq
neg mstrideq
%ifidn %1, h
lea dst1q, [dst1q+strideq*4-4]
%if %2 == 8 ; chroma
lea dst8q, [dst8q+strideq*4-4]
%endif
%endif
 
%if mmsize == 8
.next8px:
%endif
; read
lea dst2q, [dst1q+ strideq ]
%ifidn %1, v
%if %2 == 8 && mmsize == 16
%define movrow movh
%else
%define movrow mova
%endif
movrow m0, [dst1q+mstrideq*4] ; p3
movrow m1, [dst2q+mstrideq*4] ; p2
movrow m2, [dst1q+mstrideq*2] ; p1
movrow m5, [dst2q] ; q1
movrow m6, [dst2q+ strideq ] ; q2
movrow m7, [dst2q+ strideq*2] ; q3
%if mmsize == 16 && %2 == 8
movhps m0, [dst8q+mstrideq*4]
movhps m2, [dst8q+mstrideq*2]
add dst8q, strideq
movhps m1, [dst8q+mstrideq*4]
movhps m5, [dst8q]
movhps m6, [dst8q+ strideq ]
movhps m7, [dst8q+ strideq*2]
add dst8q, mstrideq
%endif
%elif mmsize == 8 ; mmx/mmxext (h)
; read 8 rows of 8px each
movu m0, [dst1q+mstrideq*4]
movu m1, [dst2q+mstrideq*4]
movu m2, [dst1q+mstrideq*2]
movu m3, [dst1q+mstrideq ]
movu m4, [dst1q]
movu m5, [dst2q]
movu m6, [dst2q+ strideq ]
 
; 8x8 transpose
TRANSPOSE4x4B 0, 1, 2, 3, 7
mova m_q0backup, m1
movu m7, [dst2q+ strideq*2]
TRANSPOSE4x4B 4, 5, 6, 7, 1
SBUTTERFLY dq, 0, 4, 1 ; p3/p2
SBUTTERFLY dq, 2, 6, 1 ; q0/q1
SBUTTERFLY dq, 3, 7, 1 ; q2/q3
mova m1, m_q0backup
mova m_q0backup, m2 ; store q0
SBUTTERFLY dq, 1, 5, 2 ; p1/p0
mova m_p0backup, m5 ; store p0
SWAP 1, 4
SWAP 2, 4
SWAP 6, 3
SWAP 5, 3
%else ; sse2 (h)
%if %2 == 16
lea dst8q, [dst1q+ strideq*8 ]
%endif
 
; read 16 rows of 8px each, interleave
movh m0, [dst1q+mstrideq*4]
movh m1, [dst8q+mstrideq*4]
movh m2, [dst1q+mstrideq*2]
movh m5, [dst8q+mstrideq*2]
movh m3, [dst1q+mstrideq ]
movh m6, [dst8q+mstrideq ]
movh m4, [dst1q]
movh m7, [dst8q]
punpcklbw m0, m1 ; A/I
punpcklbw m2, m5 ; C/K
punpcklbw m3, m6 ; D/L
punpcklbw m4, m7 ; E/M
 
add dst8q, strideq
movh m1, [dst2q+mstrideq*4]
movh m6, [dst8q+mstrideq*4]
movh m5, [dst2q]
movh m7, [dst8q]
punpcklbw m1, m6 ; B/J
punpcklbw m5, m7 ; F/N
movh m6, [dst2q+ strideq ]
movh m7, [dst8q+ strideq ]
punpcklbw m6, m7 ; G/O
 
; 8x16 transpose
TRANSPOSE4x4B 0, 1, 2, 3, 7
%ifdef m8
SWAP 1, 8
%else
mova m_q0backup, m1
%endif
movh m7, [dst2q+ strideq*2]
movh m1, [dst8q+ strideq*2]
punpcklbw m7, m1 ; H/P
TRANSPOSE4x4B 4, 5, 6, 7, 1
SBUTTERFLY dq, 0, 4, 1 ; p3/p2
SBUTTERFLY dq, 2, 6, 1 ; q0/q1
SBUTTERFLY dq, 3, 7, 1 ; q2/q3
%ifdef m8
SWAP 1, 8
SWAP 2, 8
%else
mova m1, m_q0backup
mova m_q0backup, m2 ; store q0
%endif
SBUTTERFLY dq, 1, 5, 2 ; p1/p0
%ifdef m12
SWAP 5, 12
%else
mova m_p0backup, m5 ; store p0
%endif
SWAP 1, 4
SWAP 2, 4
SWAP 6, 3
SWAP 5, 3
%endif
 
; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1
mova m4, m1
SWAP 4, 1
psubusb m4, m0 ; p2-p3
psubusb m0, m1 ; p3-p2
por m0, m4 ; abs(p3-p2)
 
mova m4, m2
SWAP 4, 2
psubusb m4, m1 ; p1-p2
mova m_p2backup, m1
psubusb m1, m2 ; p2-p1
por m1, m4 ; abs(p2-p1)
 
mova m4, m6
SWAP 4, 6
psubusb m4, m7 ; q2-q3
psubusb m7, m6 ; q3-q2
por m7, m4 ; abs(q3-q2)
 
mova m4, m5
SWAP 4, 5
psubusb m4, m6 ; q1-q2
mova m_q2backup, m6
psubusb m6, m5 ; q2-q1
por m6, m4 ; abs(q2-q1)
 
%if notcpuflag(mmxext)
mova m4, m_flimI
pxor m3, m3
psubusb m0, m4
psubusb m1, m4
psubusb m7, m4
psubusb m6, m4
pcmpeqb m0, m3 ; abs(p3-p2) <= I
pcmpeqb m1, m3 ; abs(p2-p1) <= I
pcmpeqb m7, m3 ; abs(q3-q2) <= I
pcmpeqb m6, m3 ; abs(q2-q1) <= I
pand m0, m1
pand m7, m6
pand m0, m7
%else ; mmxext/sse2
pmaxub m0, m1
pmaxub m6, m7
pmaxub m0, m6
%endif
 
; normal_limit and high_edge_variance for p1-p0, q1-q0
SWAP 7, 3 ; now m7 is zero
%ifidn %1, v
movrow m3, [dst1q+mstrideq ] ; p0
%if mmsize == 16 && %2 == 8
movhps m3, [dst8q+mstrideq ]
%endif
%elifdef m12
SWAP 3, 12
%else
mova m3, m_p0backup
%endif
 
mova m1, m2
SWAP 1, 2
mova m6, m3
SWAP 3, 6
psubusb m1, m3 ; p1-p0
psubusb m6, m2 ; p0-p1
por m1, m6 ; abs(p1-p0)
%if notcpuflag(mmxext)
mova m6, m1
psubusb m1, m4
psubusb m6, m_hevthr
pcmpeqb m1, m7 ; abs(p1-p0) <= I
pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh
pand m0, m1
mova m_maskres, m6
%else ; mmxext/sse2
pmaxub m0, m1 ; max_I
SWAP 1, 4 ; max_hev_thresh
%endif
 
SWAP 6, 4 ; now m6 is I
%ifidn %1, v
movrow m4, [dst1q] ; q0
%if mmsize == 16 && %2 == 8
movhps m4, [dst8q]
%endif
%elifdef m8
SWAP 4, 8
%else
mova m4, m_q0backup
%endif
mova m1, m4
SWAP 1, 4
mova m7, m5
SWAP 7, 5
psubusb m1, m5 ; q0-q1
psubusb m7, m4 ; q1-q0
por m1, m7 ; abs(q1-q0)
%if notcpuflag(mmxext)
mova m7, m1
psubusb m1, m6
psubusb m7, m_hevthr
pxor m6, m6
pcmpeqb m1, m6 ; abs(q1-q0) <= I
pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh
mova m6, m_maskres
pand m0, m1 ; abs([pq][321]-[pq][210]) <= I
pand m6, m7
%else ; mmxext/sse2
pxor m7, m7
pmaxub m0, m1
pmaxub m6, m1
psubusb m0, m_flimI
psubusb m6, m_hevthr
pcmpeqb m0, m7 ; max(abs(..)) <= I
pcmpeqb m6, m7 ; !(max(abs..) > thresh)
%endif
%ifdef m12
SWAP 6, 12
%else
mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t)
%endif
 
; simple_limit
mova m1, m3
SWAP 1, 3
mova m6, m4 ; keep copies of p0/q0 around for later use
SWAP 6, 4
psubusb m1, m4 ; p0-q0
psubusb m6, m3 ; q0-p0
por m1, m6 ; abs(q0-p0)
paddusb m1, m1 ; m1=2*abs(q0-p0)
 
mova m7, m2
SWAP 7, 2
mova m6, m5
SWAP 6, 5
psubusb m7, m5 ; p1-q1
psubusb m6, m2 ; q1-p1
por m7, m6 ; abs(q1-p1)
pxor m6, m6
pand m7, [pb_FE]
psrlq m7, 1 ; abs(q1-p1)/2
paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2
psubusb m7, m_flimE
pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E
pand m0, m7 ; normal_limit result
 
; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask
%ifdef m8 ; x86-64 && sse2
mova m8, [pb_80]
%define m_pb_80 m8
%else ; x86-32 or mmx/mmxext
%define m_pb_80 [pb_80]
%endif
mova m1, m4
mova m7, m3
pxor m1, m_pb_80
pxor m7, m_pb_80
psubsb m1, m7 ; (signed) q0-p0
mova m6, m2
mova m7, m5
pxor m6, m_pb_80
pxor m7, m_pb_80
psubsb m6, m7 ; (signed) p1-q1
mova m7, m_maskres
paddsb m6, m1
paddsb m6, m1
paddsb m6, m1
pand m6, m0
%ifdef m8
mova m_limres, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
pand m_limres, m7
%else
mova m0, m6
pand m0, m7
mova m_limres, m0
%endif
pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common
 
mova m1, [pb_F8]
mova m6, m7
paddsb m7, [pb_3]
paddsb m6, [pb_4]
pand m7, m1
pand m6, m1
 
pxor m1, m1
pxor m0, m0
pcmpgtb m1, m7
psubb m0, m7
psrlq m7, 3 ; +f2
psrlq m0, 3 ; -f2
pand m0, m1
pandn m1, m7
psubusb m3, m0
paddusb m3, m1 ; p0+f2
 
pxor m1, m1
pxor m0, m0
pcmpgtb m0, m6
psubb m1, m6
psrlq m6, 3 ; +f1
psrlq m1, 3 ; -f1
pand m1, m0
pandn m0, m6
psubusb m4, m0
paddusb m4, m1 ; q0-f1
 
; filter_mbedge (m2-m5 = p1-q1; lim_res carries w)
%if cpuflag(ssse3)
mova m7, [pb_1]
%else
mova m7, [pw_63]
%endif
%ifdef m8
SWAP 1, 8
%else
mova m1, m_limres
%endif
pxor m0, m0
mova m6, m1
pcmpgtb m0, m1 ; which are negative
%if cpuflag(ssse3)
punpcklbw m6, m7 ; interleave with "1" for rounding
punpckhbw m1, m7
%else
punpcklbw m6, m0 ; signed byte->word
punpckhbw m1, m0
%endif
mova m_limsign, m0
%if cpuflag(ssse3)
mova m7, [pb_27_63]
%ifndef m8
mova m_limres, m1
%endif
%ifdef m10
SWAP 0, 10 ; don't lose lim_sign copy
%endif
mova m0, m7
pmaddubsw m7, m6
SWAP 6, 7
pmaddubsw m0, m1
SWAP 1, 0
%ifdef m10
SWAP 0, 10
%else
mova m0, m_limsign
%endif
%else
mova m_maskres, m6 ; backup for later in filter
mova m_limres, m1
pmullw m6, [pw_27]
pmullw m1, [pw_27]
paddw m6, m7
paddw m1, m7
%endif
psraw m6, 7
psraw m1, 7
packsswb m6, m1 ; a0
pxor m1, m1
psubb m1, m6
pand m1, m0 ; -a0
pandn m0, m6 ; +a0
%if cpuflag(ssse3)
mova m6, [pb_18_63] ; pipelining
%endif
psubusb m3, m1
paddusb m4, m1
paddusb m3, m0 ; p0+a0
psubusb m4, m0 ; q0-a0
 
%if cpuflag(ssse3)
SWAP 6, 7
%ifdef m10
SWAP 1, 10
%else
mova m1, m_limres
%endif
mova m0, m7
pmaddubsw m7, m6
SWAP 6, 7
pmaddubsw m0, m1
SWAP 1, 0
%ifdef m10
SWAP 0, 10
%endif
mova m0, m_limsign
%else
mova m6, m_maskres
mova m1, m_limres
pmullw m6, [pw_18]
pmullw m1, [pw_18]
paddw m6, m7
paddw m1, m7
%endif
mova m0, m_limsign
psraw m6, 7
psraw m1, 7
packsswb m6, m1 ; a1
pxor m1, m1
psubb m1, m6
pand m1, m0 ; -a1
pandn m0, m6 ; +a1
%if cpuflag(ssse3)
mova m6, [pb_9_63]
%endif
psubusb m2, m1
paddusb m5, m1
paddusb m2, m0 ; p1+a1
psubusb m5, m0 ; q1-a1
 
%if cpuflag(ssse3)
SWAP 6, 7
%ifdef m10
SWAP 1, 10
%else
mova m1, m_limres
%endif
mova m0, m7
pmaddubsw m7, m6
SWAP 6, 7
pmaddubsw m0, m1
SWAP 1, 0
%else
%ifdef m8
SWAP 6, 12
SWAP 1, 8
%else
mova m6, m_maskres
mova m1, m_limres
%endif
pmullw m6, [pw_9]
pmullw m1, [pw_9]
paddw m6, m7
paddw m1, m7
%endif
%ifdef m9
SWAP 7, 9
%else
mova m7, m_limsign
%endif
psraw m6, 7
psraw m1, 7
packsswb m6, m1 ; a2
pxor m0, m0
psubb m0, m6
pand m0, m7 ; -a2
pandn m7, m6 ; +a2
%ifdef m8
SWAP 1, 13
SWAP 6, 14
%else
mova m1, m_p2backup
mova m6, m_q2backup
%endif
psubusb m1, m0
paddusb m6, m0
paddusb m1, m7 ; p2+a2
psubusb m6, m7 ; q2-a2
 
; store
%ifidn %1, v
movrow [dst2q+mstrideq*4], m1
movrow [dst1q+mstrideq*2], m2
movrow [dst1q+mstrideq ], m3
movrow [dst1q], m4
movrow [dst2q], m5
movrow [dst2q+ strideq ], m6
%if mmsize == 16 && %2 == 8
add dst8q, mstrideq
movhps [dst8q+mstrideq*2], m1
movhps [dst8q+mstrideq ], m2
movhps [dst8q], m3
add dst8q, strideq
movhps [dst8q], m4
movhps [dst8q+ strideq ], m5
movhps [dst8q+ strideq*2], m6
%endif
%else ; h
inc dst1q
inc dst2q
 
; 4x8/16 transpose
TRANSPOSE4x4B 1, 2, 3, 4, 0
SBUTTERFLY bw, 5, 6, 0
 
%if mmsize == 8 ; mmx/mmxext (h)
WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq
add dst1q, 4
WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq
%else ; sse2 (h)
lea dst8q, [dst8q+mstrideq+1]
WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2
lea dst1q, [dst2q+mstrideq+4]
lea dst8q, [dst8q+mstrideq+4]
%if cpuflag(sse4)
add dst2q, 4
%endif
WRITE_8W m5, dst2q, dst1q, mstrideq, strideq
%if cpuflag(sse4)
lea dst2q, [dst8q+ strideq ]
%endif
WRITE_8W m6, dst2q, dst8q, mstrideq, strideq
%endif
%endif
 
%if mmsize == 8
%if %2 == 8 ; chroma
%ifidn %1, h
sub dst1q, 5
%endif
cmp dst1q, dst8q
mov dst1q, dst8q
jnz .next8px
%else
%ifidn %1, h
lea dst1q, [dst1q+ strideq*8-5]
%else ; v
add dst1q, 8
%endif
dec cntrq
jg .next8px
%endif
REP_RET
%else ; mmsize == 16
RET
%endif
%endmacro
 
%if ARCH_X86_32
INIT_MMX mmx
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8
 
INIT_MMX mmxext
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8
%endif
 
INIT_XMM sse2
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8
 
INIT_XMM ssse3
MBEDGE_LOOPFILTER v, 16
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER v, 8
MBEDGE_LOOPFILTER h, 8
 
INIT_XMM sse4
MBEDGE_LOOPFILTER h, 16
MBEDGE_LOOPFILTER h, 8
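 
As a rough scalar reference for the MBEDGE_LOOPFILTER macro above: the per-edge arithmetic below is a sketch following the RFC 6386 macroblock-edge filter (helper names are illustrative and not part of this tree; the normal_limit/hev masking computed earlier in the macro is ignored).
 
#include <stdint.h>
 
static int clamp_s8(int v) { return v < -128 ? -128 : v > 127 ? 127 : v; }
static int clamp_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
 
/* scalar sketch of filter_mbedge: p2..p0 precede the edge, q0..q2 follow it */
static void filter_mbedge_ref(uint8_t *p2, uint8_t *p1, uint8_t *p0,
                              uint8_t *q0, uint8_t *q1, uint8_t *q2)
{
    /* recenter around 0, as the pxor with pb_80 does above */
    int P2 = *p2 - 128, P1 = *p1 - 128, P0 = *p0 - 128;
    int Q0 = *q0 - 128, Q1 = *q1 - 128, Q2 = *q2 - 128;
    int w  = clamp_s8(clamp_s8(P1 - Q1) + 3 * (Q0 - P0));
    int a0 = clamp_s8((27 * w + 63) >> 7);  /* pw_27 / pb_27_63 path */
    int a1 = clamp_s8((18 * w + 63) >> 7);  /* pw_18 / pb_18_63 path */
    int a2 = clamp_s8(( 9 * w + 63) >> 7);  /* pw_9  / pb_9_63  path */
    *p0 = clamp_u8(P0 + a0 + 128); *q0 = clamp_u8(Q0 - a0 + 128);
    *p1 = clamp_u8(P1 + a1 + 128); *q1 = clamp_u8(Q1 - a1 + 128);
    *p2 = clamp_u8(P2 + a2 + 128); *q2 = clamp_u8(Q2 - a2 + 128);
}
 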
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vp8dsp_init.c
0,0 → 1,441
/*
* VP8 DSP functions x86-optimized
* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com>
* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavutil/x86/cpu.h"
#include "libavcodec/vp8dsp.h"
 
#if HAVE_YASM
 
/*
* MC functions
*/
void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
 
void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_h6_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_v4_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_v6_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
 
void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
 
void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
 
void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
 
 
void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride,
uint8_t *src, ptrdiff_t srcstride,
int height, int mx, int my);
 
#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
dst, dststride, src, srcstride, height, mx, my); \
ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
dst + 8, dststride, src + 8, srcstride, height, mx, my); \
}
#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \
static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
dst, dststride, src, srcstride, height, mx, my); \
ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \
dst + 4, dststride, src + 4, srcstride, height, mx, my); \
}
 
#if ARCH_X86_32
TAP_W8 (mmxext, epel, h4)
TAP_W8 (mmxext, epel, h6)
TAP_W16(mmxext, epel, h6)
TAP_W8 (mmxext, epel, v4)
TAP_W8 (mmxext, epel, v6)
TAP_W16(mmxext, epel, v6)
TAP_W8 (mmxext, bilinear, h)
TAP_W16(mmxext, bilinear, h)
TAP_W8 (mmxext, bilinear, v)
TAP_W16(mmxext, bilinear, v)
#endif
 
TAP_W16(sse2, epel, h6)
TAP_W16(sse2, epel, v6)
TAP_W16(sse2, bilinear, h)
TAP_W16(sse2, bilinear, v)
 
TAP_W16(ssse3, epel, h6)
TAP_W16(ssse3, epel, v6)
TAP_W16(ssse3, bilinear, h)
TAP_W16(ssse3, bilinear, v)
 
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \
uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \
src -= srcstride * (TAPNUMY / 2 - 1); \
ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \
tmp, SIZE, src, srcstride, height + TAPNUMY - 1, mx, my); \
ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \
dst, dststride, tmpptr, SIZE, height, mx, my); \
}
 
#if ARCH_X86_32
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8) \
HVTAP(mmxext, 8, x, y, 8, 16)
 
HVTAP(mmxext, 8, 6, 6, 16, 16)
#else
#define HVTAPMMX(x, y) \
HVTAP(mmxext, 8, x, y, 4, 8)
#endif
 
HVTAPMMX(4, 4)
HVTAPMMX(4, 6)
HVTAPMMX(6, 4)
HVTAPMMX(6, 6)
 
#define HVTAPSSE2(x, y, w) \
HVTAP(sse2, 16, x, y, w, 16) \
HVTAP(ssse3, 16, x, y, w, 16)
 
HVTAPSSE2(4, 4, 8)
HVTAPSSE2(4, 6, 8)
HVTAPSSE2(6, 4, 8)
HVTAPSSE2(6, 6, 8)
HVTAPSSE2(6, 6, 16)
 
HVTAP(ssse3, 16, 4, 4, 4, 8)
HVTAP(ssse3, 16, 4, 6, 4, 8)
HVTAP(ssse3, 16, 6, 4, 4, 8)
HVTAP(ssse3, 16, 6, 6, 4, 8)
 
#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \
static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \
ptrdiff_t srcstride, int height, int mx, int my) \
{ \
DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \
ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \
tmp, SIZE, src, srcstride, height + 1, mx, my); \
ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \
dst, dststride, tmp, SIZE, height, mx, my); \
}
 
HVBILIN(mmxext, 8, 4, 8)
#if ARCH_X86_32
HVBILIN(mmxext, 8, 8, 16)
HVBILIN(mmxext, 8, 16, 16)
#endif
HVBILIN(sse2, 8, 8, 16)
HVBILIN(sse2, 8, 16, 16)
HVBILIN(ssse3, 8, 4, 8)
HVBILIN(ssse3, 8, 8, 16)
HVBILIN(ssse3, 8, 16, 16)
 
void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16],
ptrdiff_t stride);
void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16],
ptrdiff_t stride);
void ff_vp8_luma_dc_wht_mmx(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]);
void ff_vp8_idct_add_mmx(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], ptrdiff_t stride);
 
#define DECLARE_LOOP_FILTER(NAME) \
void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \
ptrdiff_t stride, \
int flim); \
void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \
ptrdiff_t stride, \
int flim); \
void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
ptrdiff_t stride, \
int e, int i, int hvt); \
void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \
ptrdiff_t stride, \
int e, int i, int hvt); \
void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
uint8_t *dstV, \
ptrdiff_t s, \
int e, int i, int hvt); \
void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \
uint8_t *dstV, \
ptrdiff_t s, \
int e, int i, int hvt); \
void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
ptrdiff_t stride, \
int e, int i, int hvt); \
void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \
ptrdiff_t stride, \
int e, int i, int hvt); \
void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
uint8_t *dstV, \
ptrdiff_t s, \
int e, int i, int hvt); \
void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \
uint8_t *dstV, \
ptrdiff_t s, \
int e, int i, int hvt);
 
DECLARE_LOOP_FILTER(mmx)
DECLARE_LOOP_FILTER(mmxext)
DECLARE_LOOP_FILTER(sse2)
DECLARE_LOOP_FILTER(ssse3)
DECLARE_LOOP_FILTER(sse4)
 
#endif /* HAVE_YASM */
 
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \
c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT
 
#define VP8_MC_FUNC(IDX, SIZE, OPT) \
c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \
c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \
VP8_LUMA_MC_FUNC(IDX, SIZE, OPT)
 
#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \
c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \
c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT
 
 
av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
{
#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
 
if (EXTERNAL_MMX(cpu_flags)) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx;
c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx;
#if ARCH_X86_32
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx;
c->vp8_idct_add = ff_vp8_idct_add_mmx;
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx;
c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx;
#endif
c->put_vp8_epel_pixels_tab[1][0][0] =
c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx;
 
#if ARCH_X86_32
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx;
 
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx;
 
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx;
#endif
}
 
/* note that 4-tap width=16 functions are missing because w=16
* is only used for luma, and luma is always a copy or sixtap. */
if (EXTERNAL_MMXEXT(cpu_flags)) {
VP8_MC_FUNC(2, 4, mmxext);
VP8_BILINEAR_MC_FUNC(2, 4, mmxext);
#if ARCH_X86_32
VP8_LUMA_MC_FUNC(0, 16, mmxext);
VP8_MC_FUNC(1, 8, mmxext);
VP8_BILINEAR_MC_FUNC(0, 16, mmxext);
VP8_BILINEAR_MC_FUNC(1, 8, mmxext);
 
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext;
 
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext;
 
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext;
#endif
}
 
if (EXTERNAL_SSE(cpu_flags)) {
c->vp8_idct_add = ff_vp8_idct_add_sse;
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse;
c->put_vp8_epel_pixels_tab[0][0][0] =
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse;
}
 
if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) {
VP8_LUMA_MC_FUNC(0, 16, sse2);
VP8_MC_FUNC(1, 8, sse2);
VP8_BILINEAR_MC_FUNC(0, 16, sse2);
VP8_BILINEAR_MC_FUNC(1, 8, sse2);
 
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2;
 
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2;
 
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2;
}
 
if (EXTERNAL_SSE2(cpu_flags)) {
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2;
 
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2;
 
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2;
 
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2;
}
 
if (EXTERNAL_SSSE3(cpu_flags)) {
VP8_LUMA_MC_FUNC(0, 16, ssse3);
VP8_MC_FUNC(1, 8, ssse3);
VP8_MC_FUNC(2, 4, ssse3);
VP8_BILINEAR_MC_FUNC(0, 16, ssse3);
VP8_BILINEAR_MC_FUNC(1, 8, ssse3);
VP8_BILINEAR_MC_FUNC(2, 4, ssse3);
 
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3;
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3;
 
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3;
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3;
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3;
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3;
 
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3;
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3;
}
 
if (EXTERNAL_SSE4(cpu_flags)) {
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4;
 
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4;
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4;
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4;
}
#endif /* HAVE_YASM */
}
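 
ff_vp8dsp_init_x86() above follows the usual runtime-dispatch pattern: query the CPU flags once (av_get_cpu_flags(), EXTERNAL_MMX(), EXTERNAL_SSSE3(), ...), then let each successively stronger instruction-set check overwrite the function pointers installed by the weaker ones. A minimal self-contained sketch of the idea, with purely hypothetical names (not FFmpeg API):
 
#include <stdio.h>
 
typedef void (*idct_fn)(void);
 
static void idct_c(void)     { puts("C fallback"); }
static void idct_mmx(void)   { puts("MMX version"); }
static void idct_ssse3(void) { puts("SSSE3 version"); }
 
struct dsp_ctx { idct_fn idct_dc_add; };
 
/* stand-ins for av_get_cpu_flags() / EXTERNAL_MMX() / EXTERNAL_SSSE3() */
static int have_mmx(void)   { return 1; }
static int have_ssse3(void) { return 0; }
 
static void dsp_init(struct dsp_ctx *c)
{
    c->idct_dc_add = idct_c;                        /* baseline */
    if (have_mmx())   c->idct_dc_add = idct_mmx;    /* each later check... */
    if (have_ssse3()) c->idct_dc_add = idct_ssse3;  /* ...overwrites the weaker one */
}
 
int main(void)
{
    struct dsp_ctx c;
    dsp_init(&c);
    c.idct_dc_add();    /* prints "MMX version" with the stubs above */
    return 0;
}
 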
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vp9dsp.asm
0,0 → 1,278
;******************************************************************************
;* VP9 SIMD optimizations
;*
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA
 
; FIXME share with vp8dsp.asm
pw_256: times 8 dw 256
 
%macro F8_TAPS 8
times 8 db %1, %2
times 8 db %3, %4
times 8 db %5, %6
times 8 db %7, %8
%endmacro
; int8_t ff_filters_ssse3[3][15][4][16]
const filters_ssse3 ; smooth
F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0
F8_TAPS -2, -2, 29, 63, 41, 2, -3, 0
F8_TAPS -2, -2, 26, 63, 43, 4, -4, 0
F8_TAPS -2, -3, 24, 62, 46, 5, -4, 0
F8_TAPS -2, -3, 21, 60, 49, 7, -4, 0
F8_TAPS -1, -4, 18, 59, 51, 9, -4, 0
F8_TAPS -1, -4, 16, 57, 53, 12, -4, -1
F8_TAPS -1, -4, 14, 55, 55, 14, -4, -1
F8_TAPS -1, -4, 12, 53, 57, 16, -4, -1
F8_TAPS 0, -4, 9, 51, 59, 18, -4, -1
F8_TAPS 0, -4, 7, 49, 60, 21, -3, -2
F8_TAPS 0, -4, 5, 46, 62, 24, -3, -2
F8_TAPS 0, -4, 4, 43, 63, 26, -2, -2
F8_TAPS 0, -3, 2, 41, 63, 29, -2, -2
F8_TAPS 0, -3, 1, 38, 64, 32, -1, -3
; regular
F8_TAPS 0, 1, -5, 126, 8, -3, 1, 0
F8_TAPS -1, 3, -10, 122, 18, -6, 2, 0
F8_TAPS -1, 4, -13, 118, 27, -9, 3, -1
F8_TAPS -1, 4, -16, 112, 37, -11, 4, -1
F8_TAPS -1, 5, -18, 105, 48, -14, 4, -1
F8_TAPS -1, 5, -19, 97, 58, -16, 5, -1
F8_TAPS -1, 6, -19, 88, 68, -18, 5, -1
F8_TAPS -1, 6, -19, 78, 78, -19, 6, -1
F8_TAPS -1, 5, -18, 68, 88, -19, 6, -1
F8_TAPS -1, 5, -16, 58, 97, -19, 5, -1
F8_TAPS -1, 4, -14, 48, 105, -18, 5, -1
F8_TAPS -1, 4, -11, 37, 112, -16, 4, -1
F8_TAPS -1, 3, -9, 27, 118, -13, 4, -1
F8_TAPS 0, 2, -6, 18, 122, -10, 3, -1
F8_TAPS 0, 1, -3, 8, 126, -5, 1, 0
; sharp
F8_TAPS -1, 3, -7, 127, 8, -3, 1, 0
F8_TAPS -2, 5, -13, 125, 17, -6, 3, -1
F8_TAPS -3, 7, -17, 121, 27, -10, 5, -2
F8_TAPS -4, 9, -20, 115, 37, -13, 6, -2
F8_TAPS -4, 10, -23, 108, 48, -16, 8, -3
F8_TAPS -4, 10, -24, 100, 59, -19, 9, -3
F8_TAPS -4, 11, -24, 90, 70, -21, 10, -4
F8_TAPS -4, 11, -23, 80, 80, -23, 11, -4
F8_TAPS -4, 10, -21, 70, 90, -24, 11, -4
F8_TAPS -3, 9, -19, 59, 100, -24, 10, -4
F8_TAPS -3, 8, -16, 48, 108, -23, 10, -4
F8_TAPS -2, 6, -13, 37, 115, -20, 9, -4
F8_TAPS -2, 5, -10, 27, 121, -17, 7, -3
F8_TAPS -1, 3, -6, 17, 125, -13, 5, -2
F8_TAPS 0, 1, -3, 8, 127, -7, 3, -1
 
SECTION .text
 
%macro filter_h_fn 1
%assign %%px mmsize/2
cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery
mova m6, [pw_256]
mova m7, [filteryq+ 0]
%if ARCH_X86_64 && mmsize > 8
mova m8, [filteryq+16]
mova m9, [filteryq+32]
mova m10, [filteryq+48]
%endif
.loop:
movh m0, [srcq-3]
movh m1, [srcq-2]
movh m2, [srcq-1]
movh m3, [srcq+0]
movh m4, [srcq+1]
movh m5, [srcq+2]
punpcklbw m0, m1
punpcklbw m2, m3
movh m1, [srcq+3]
movh m3, [srcq+4]
add srcq, sstrideq
punpcklbw m4, m5
punpcklbw m1, m3
pmaddubsw m0, m7
%if ARCH_X86_64 && mmsize > 8
pmaddubsw m2, m8
pmaddubsw m4, m9
pmaddubsw m1, m10
%else
pmaddubsw m2, [filteryq+16]
pmaddubsw m4, [filteryq+32]
pmaddubsw m1, [filteryq+48]
%endif
paddw m0, m2
paddw m4, m1
paddsw m0, m4
pmulhrsw m0, m6
%ifidn %1, avg
movh m1, [dstq]
%endif
packuswb m0, m0
%ifidn %1, avg
pavgb m0, m1
%endif
movh [dstq], m0
add dstq, dstrideq
dec hd
jg .loop
RET
%endmacro
 
INIT_MMX ssse3
filter_h_fn put
filter_h_fn avg
 
INIT_XMM ssse3
filter_h_fn put
filter_h_fn avg
 
%macro filter_v_fn 1
%assign %%px mmsize/2
%if ARCH_X86_64
cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3
%else
cglobal %1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3
mov filteryq, r5mp
%define hd r4mp
%endif
sub srcq, sstrideq
lea sstride3q, [sstrideq*3]
sub srcq, sstrideq
mova m6, [pw_256]
sub srcq, sstrideq
mova m7, [filteryq+ 0]
lea src4q, [srcq+sstrideq*4]
%if ARCH_X86_64 && mmsize > 8
mova m8, [filteryq+16]
mova m9, [filteryq+32]
mova m10, [filteryq+48]
%endif
.loop:
; FIXME maybe reuse loads from previous rows, or just
; more generally unroll this to prevent multiple loads of
; the same data?
movh m0, [srcq]
movh m1, [srcq+sstrideq]
movh m2, [srcq+sstrideq*2]
movh m3, [srcq+sstride3q]
movh m4, [src4q]
movh m5, [src4q+sstrideq]
punpcklbw m0, m1
punpcklbw m2, m3
movh m1, [src4q+sstrideq*2]
movh m3, [src4q+sstride3q]
add srcq, sstrideq
add src4q, sstrideq
punpcklbw m4, m5
punpcklbw m1, m3
pmaddubsw m0, m7
%if ARCH_X86_64 && mmsize > 8
pmaddubsw m2, m8
pmaddubsw m4, m9
pmaddubsw m1, m10
%else
pmaddubsw m2, [filteryq+16]
pmaddubsw m4, [filteryq+32]
pmaddubsw m1, [filteryq+48]
%endif
paddw m0, m2
paddw m4, m1
paddsw m0, m4
pmulhrsw m0, m6
%ifidn %1, avg
movh m1, [dstq]
%endif
packuswb m0, m0
%ifidn %1, avg
pavgb m0, m1
%endif
movh [dstq], m0
add dstq, dstrideq
dec hd
jg .loop
RET
%endmacro
 
INIT_MMX ssse3
filter_v_fn put
filter_v_fn avg
 
INIT_XMM ssse3
filter_v_fn put
filter_v_fn avg
 
%macro fpel_fn 6
%if %2 == 4
%define %%srcfn movh
%define %%dstfn movh
%else
%define %%srcfn movu
%define %%dstfn mova
%endif
 
%if %2 <= 16
cglobal %1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3
lea sstride3q, [sstrideq*3]
lea dstride3q, [dstrideq*3]
%else
cglobal %1%2, 5, 5, 4, dst, dstride, src, sstride, h
%endif
.loop:
%%srcfn m0, [srcq]
%%srcfn m1, [srcq+s%3]
%%srcfn m2, [srcq+s%4]
%%srcfn m3, [srcq+s%5]
lea srcq, [srcq+sstrideq*%6]
%ifidn %1, avg
pavgb m0, [dstq]
pavgb m1, [dstq+d%3]
pavgb m2, [dstq+d%4]
pavgb m3, [dstq+d%5]
%endif
%%dstfn [dstq], m0
%%dstfn [dstq+d%3], m1
%%dstfn [dstq+d%4], m2
%%dstfn [dstq+d%5], m3
lea dstq, [dstq+dstrideq*%6]
sub hd, %6
jnz .loop
RET
%endmacro
 
%define d16 16
%define s16 16
INIT_MMX mmx
fpel_fn put, 4, strideq, strideq*2, stride3q, 4
fpel_fn put, 8, strideq, strideq*2, stride3q, 4
INIT_MMX sse
fpel_fn avg, 4, strideq, strideq*2, stride3q, 4
fpel_fn avg, 8, strideq, strideq*2, stride3q, 4
INIT_XMM sse
fpel_fn put, 16, strideq, strideq*2, stride3q, 4
fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2
fpel_fn put, 64, mmsize, mmsize*2, mmsize*3, 1
INIT_XMM sse2
fpel_fn avg, 16, strideq, strideq*2, stride3q, 4
fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2
fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1
%undef s16
%undef d16
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vp9dsp_init.c
0,0 → 1,214
/*
* VP9 SIMD optimizations
*
* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/cpu.h"
#include "libavutil/mem.h"
#include "libavutil/x86/asm.h"
#include "libavcodec/vp9dsp.h"
 
#if HAVE_YASM
 
#define fpel_func(avg, sz, opt) \
void ff_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, int mx, int my)
fpel_func(put, 4, mmx);
fpel_func(put, 8, mmx);
fpel_func(put, 16, sse);
fpel_func(put, 32, sse);
fpel_func(put, 64, sse);
fpel_func(avg, 4, sse);
fpel_func(avg, 8, sse);
fpel_func(avg, 16, sse2);
fpel_func(avg, 32, sse2);
fpel_func(avg, 64, sse2);
#undef fpel_func
 
#define mc_func(avg, sz, dir, opt) \
void ff_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, const int8_t (*filter)[16])
#define mc_funcs(sz) \
mc_func(put, sz, h, ssse3); \
mc_func(avg, sz, h, ssse3); \
mc_func(put, sz, v, ssse3); \
mc_func(avg, sz, v, ssse3)
 
mc_funcs(4);
mc_funcs(8);
 
#undef mc_funcs
#undef mc_func
 
#define mc_rep_func(avg, sz, hsz, dir, opt) \
static av_always_inline void \
ff_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, const int8_t (*filter)[16]) \
{ \
ff_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst, dst_stride, src, \
src_stride, h, filter); \
ff_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst + hsz, dst_stride, src + hsz, \
src_stride, h, filter); \
}
 
#define mc_rep_funcs(sz, hsz) \
mc_rep_func(put, sz, hsz, h, ssse3); \
mc_rep_func(avg, sz, hsz, h, ssse3); \
mc_rep_func(put, sz, hsz, v, ssse3); \
mc_rep_func(avg, sz, hsz, v, ssse3)
 
mc_rep_funcs(16, 8);
mc_rep_funcs(32, 16);
mc_rep_funcs(64, 32);
 
#undef mc_rep_funcs
#undef mc_rep_func
 
extern const int8_t ff_filters_ssse3[3][15][4][16];
 
#define filter_8tap_2d_fn(op, sz, f, fname) \
static void op##_8tap_##fname##_##sz##hv_ssse3(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, int mx, int my) \
{ \
LOCAL_ALIGNED_16(uint8_t, temp, [71 * 64]); \
ff_put_8tap_1d_h_##sz##_ssse3(temp, 64, src - 3 * src_stride, src_stride, \
h + 7, ff_filters_ssse3[f][mx - 1]); \
ff_##op##_8tap_1d_v_##sz##_ssse3(dst, dst_stride, temp + 3 * 64, 64, \
h, ff_filters_ssse3[f][my - 1]); \
}
 
#define filters_8tap_2d_fn(op, sz) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, sharp) \
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth)
 
#define filters_8tap_2d_fn2(op) \
filters_8tap_2d_fn(op, 64) \
filters_8tap_2d_fn(op, 32) \
filters_8tap_2d_fn(op, 16) \
filters_8tap_2d_fn(op, 8) \
filters_8tap_2d_fn(op, 4)
 
filters_8tap_2d_fn2(put)
filters_8tap_2d_fn2(avg)
 
#undef filters_8tap_2d_fn2
#undef filters_8tap_2d_fn
#undef filter_8tap_2d_fn
 
#define filter_8tap_1d_fn(op, sz, f, fname, dir, dvar) \
static void op##_8tap_##fname##_##sz##dir##_ssse3(uint8_t *dst, ptrdiff_t dst_stride, \
const uint8_t *src, ptrdiff_t src_stride, \
int h, int mx, int my) \
{ \
ff_##op##_8tap_1d_##dir##_##sz##_ssse3(dst, dst_stride, src, src_stride, \
h, ff_filters_ssse3[f][dvar - 1]); \
}
 
#define filters_8tap_1d_fn(op, sz, dir, dvar) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, sharp, dir, dvar) \
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, dir, dvar)
 
#define filters_8tap_1d_fn2(op, sz) \
filters_8tap_1d_fn(op, sz, h, mx) \
filters_8tap_1d_fn(op, sz, v, my)
 
#define filters_8tap_1d_fn3(op) \
filters_8tap_1d_fn2(op, 64) \
filters_8tap_1d_fn2(op, 32) \
filters_8tap_1d_fn2(op, 16) \
filters_8tap_1d_fn2(op, 8) \
filters_8tap_1d_fn2(op, 4)
 
filters_8tap_1d_fn3(put)
filters_8tap_1d_fn3(avg)
 
#undef filters_8tap_1d_fn
#undef filters_8tap_1d_fn2
#undef filters_8tap_1d_fn3
#undef filter_8tap_1d_fn
 
#endif /* HAVE_YASM */
 
av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp)
{
#if HAVE_YASM
int cpu_flags = av_get_cpu_flags();
 
#define init_fpel(idx1, idx2, sz, type, opt) \
dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \
dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \
dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_##type##sz##_##opt
 
 
#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \
dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_##opt; \
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \
dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_##opt
 
#define init_subpel2(idx, idxh, idxv, dir, type, opt) \
init_subpel1(0, idx, idxh, idxv, 64, dir, type, opt); \
init_subpel1(1, idx, idxh, idxv, 32, dir, type, opt); \
init_subpel1(2, idx, idxh, idxv, 16, dir, type, opt); \
init_subpel1(3, idx, idxh, idxv, 8, dir, type, opt); \
init_subpel1(4, idx, idxh, idxv, 4, dir, type, opt)
 
#define init_subpel3(idx, type, opt) \
init_subpel2(idx, 1, 1, hv, type, opt); \
init_subpel2(idx, 0, 1, v, type, opt); \
init_subpel2(idx, 1, 0, h, type, opt)
 
if (cpu_flags & AV_CPU_FLAG_MMX) {
init_fpel(4, 0, 4, put, mmx);
init_fpel(3, 0, 8, put, mmx);
}
 
if (cpu_flags & AV_CPU_FLAG_SSE) {
init_fpel(2, 0, 16, put, sse);
init_fpel(1, 0, 32, put, sse);
init_fpel(0, 0, 64, put, sse);
init_fpel(4, 1, 4, avg, sse);
init_fpel(3, 1, 8, avg, sse);
}
 
if (cpu_flags & AV_CPU_FLAG_SSE2) {
init_fpel(2, 1, 16, avg, sse2);
init_fpel(1, 1, 32, avg, sse2);
init_fpel(0, 1, 64, avg, sse2);
}
 
if (cpu_flags & AV_CPU_FLAG_SSSE3) {
init_subpel3(0, put, ssse3);
init_subpel3(1, avg, ssse3);
}
 
#undef init_fpel
#undef init_subpel1
#undef init_subpel2
#undef init_subpel3
 
#endif /* HAVE_YASM */
}
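 
filter_8tap_2d_fn above composes the 2-D subpel filters by running the horizontal 1-D pass into a 64-byte-stride temporary (h + 7 rows, starting 3 rows above the block) and then the vertical pass over that temporary. Below is a hedged scalar sketch of that composition, assuming the (sum + 64) >> 7 rounding that pmulhrsw with pw_256 performs in vp9dsp.asm and ignoring the SIMD code's intermediate 16-bit saturation; all names are illustrative.
 
#include <stdint.h>
#include <stddef.h>
 
static uint8_t clip_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }
 
/* one output pixel of an 8-tap filter; taps cover src[-3]..src[+4] in steps of "step" */
static uint8_t tap8(const uint8_t *src, ptrdiff_t step, const int8_t taps[8])
{
    int sum = 0;
    for (int i = 0; i < 8; i++)
        sum += src[(i - 3) * step] * taps[i];
    return clip_u8((sum + 64) >> 7);
}
 
/* h-then-v composition, mirroring filter_8tap_2d_fn (w must be <= 64) */
static void filter8_hv_ref(uint8_t *dst, ptrdiff_t dst_stride,
                           const uint8_t *src, ptrdiff_t src_stride,
                           int w, int h,
                           const int8_t htaps[8], const int8_t vtaps[8])
{
    uint8_t temp[71 * 64];                    /* same budget as LOCAL_ALIGNED_16 above */
    const uint8_t *s = src - 3 * src_stride;  /* start 3 rows above the block */
 
    for (int y = 0; y < h + 7; y++)           /* horizontal pass, h + 7 rows */
        for (int x = 0; x < w; x++)
            temp[y * 64 + x] = tap8(s + y * src_stride + x, 1, htaps);
 
    for (int y = 0; y < h; y++)               /* vertical pass, offset 3 rows down */
        for (int x = 0; x < w; x++)
            dst[y * dst_stride + x] = tap8(temp + (y + 3) * 64 + x, 64, vtaps);
}
 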
/contrib/sdk/sources/ffmpeg/libavcodec/x86/w64xmmtest.c
0,0 → 1,86
/*
* check XMM registers for clobbers on Win64
* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavcodec/avcodec.h"
#include "libavutil/x86/w64xmmtest.h"
 
wrap(avcodec_open2(AVCodecContext *avctx,
AVCodec *codec,
AVDictionary **options))
{
testxmmclobbers(avcodec_open2, avctx, codec, options);
}
 
wrap(avcodec_decode_audio4(AVCodecContext *avctx,
AVFrame *frame,
int *got_frame_ptr,
AVPacket *avpkt))
{
testxmmclobbers(avcodec_decode_audio4, avctx, frame,
got_frame_ptr, avpkt);
}
 
wrap(avcodec_decode_video2(AVCodecContext *avctx,
AVFrame *picture,
int *got_picture_ptr,
AVPacket *avpkt))
{
testxmmclobbers(avcodec_decode_video2, avctx, picture,
got_picture_ptr, avpkt);
}
 
wrap(avcodec_decode_subtitle2(AVCodecContext *avctx,
AVSubtitle *sub,
int *got_sub_ptr,
AVPacket *avpkt))
{
testxmmclobbers(avcodec_decode_subtitle2, avctx, sub,
got_sub_ptr, avpkt);
}
 
wrap(avcodec_encode_audio2(AVCodecContext *avctx,
AVPacket *avpkt,
const AVFrame *frame,
int *got_packet_ptr))
{
testxmmclobbers(avcodec_encode_audio2, avctx, avpkt, frame,
got_packet_ptr);
}
 
wrap(avcodec_encode_video(AVCodecContext *avctx,
uint8_t *buf, int buf_size,
const AVFrame *pict))
{
testxmmclobbers(avcodec_encode_video, avctx, buf, buf_size, pict);
}
 
wrap(avcodec_encode_subtitle(AVCodecContext *avctx,
uint8_t *buf, int buf_size,
const AVSubtitle *sub))
{
testxmmclobbers(avcodec_encode_subtitle, avctx, buf, buf_size, sub);
}
 
wrap(avcodec_encode_video2(AVCodecContext *avctx, AVPacket *avpkt,
const AVFrame *frame, int *got_packet_ptr))
{
testxmmclobbers(avcodec_encode_video2, avctx, avpkt, frame, got_packet_ptr);
}