/contrib/sdk/sources/ffmpeg/libavcodec/x86/Makefile
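# Each OBJS-$(CONFIG_FOO) entry below is built only when configure enables
# CONFIG_FOO; MMX-OBJS additionally requires inline-asm MMX support and
# YASM-OBJS requires the yasm assembler (see the top-level arch rules).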
OBJS += x86/constants.o \
        x86/fmtconvert_init.o
OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp_init.o |
OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp_init.o |
OBJS-$(CONFIG_CAVS_DECODER) += x86/cavsdsp.o |
OBJS-$(CONFIG_DCT) += x86/dct_init.o |
OBJS-$(CONFIG_DNXHD_ENCODER) += x86/dnxhdenc.o |
OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_init.o \ |
x86/dsputil_x86.o |
OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc_mmx.o \ |
x86/fdct.o \ |
x86/motion_est.o |
OBJS-$(CONFIG_FFT) += x86/fft_init.o |
OBJS-$(CONFIG_H264CHROMA) += x86/h264chroma_init.o |
OBJS-$(CONFIG_H264DSP) += x86/h264dsp_init.o |
OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred_init.o |
OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel.o |
OBJS-$(CONFIG_HPELDSP) += x86/hpeldsp_init.o |
OBJS-$(CONFIG_LPC) += x86/lpc.o |
OBJS-$(CONFIG_MLP_DECODER) += x86/mlpdsp.o |
OBJS-$(CONFIG_MPEGAUDIODSP) += x86/mpegaudiodsp.o |
OBJS-$(CONFIG_MPEGVIDEO) += x86/mpegvideo.o |
OBJS-$(CONFIG_MPEGVIDEOENC) += x86/mpegvideoenc.o |
OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp_init.o |
OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp_init.o |
OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp_init.o |
OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp_init.o |
OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp_init.o \ |
x86/rv40dsp_init.o |
OBJS-$(CONFIG_TRUEHD_DECODER) += x86/mlpdsp.o
OBJS-$(CONFIG_V210_DECODER) += x86/v210-init.o
OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_init.o |
OBJS-$(CONFIG_VIDEODSP) += x86/videodsp_init.o |
OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp_init.o |
OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp_init.o |
OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp_init.o |
OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp_init.o |
OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp_init.o |
OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp_init.o |
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o |
MMX-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil_mmx.o \ |
x86/fpel_mmx.o \ |
x86/idct_mmx_xvid.o \ |
x86/idct_sse2_xvid.o \ |
x86/rnd_mmx.o \ |
x86/simple_idct.o |
MMX-OBJS-$(CONFIG_DIRAC_DECODER) += x86/dirac_dwt.o |
MMX-OBJS-$(CONFIG_HPELDSP) += x86/fpel_mmx.o \ |
x86/hpeldsp_mmx.o \ |
x86/rnd_mmx.o |
MMX-OBJS-$(CONFIG_SNOW_DECODER) += x86/snowdsp.o |
MMX-OBJS-$(CONFIG_SNOW_ENCODER) += x86/snowdsp.o |
MMX-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp_mmx.o |
YASM-OBJS += x86/deinterlace.o \
             x86/fmtconvert.o
YASM-OBJS-$(CONFIG_AAC_DECODER) += x86/sbrdsp.o |
YASM-OBJS-$(CONFIG_AC3DSP) += x86/ac3dsp.o |
YASM-OBJS-$(CONFIG_DCT) += x86/dct32.o |
YASM-OBJS-$(CONFIG_DIRAC_DECODER) += x86/diracdsp_mmx.o x86/diracdsp_yasm.o\ |
x86/dwt_yasm.o |
YASM-OBJS-$(CONFIG_DSPUTIL) += x86/dsputil.o \ |
x86/fpel.o \ |
x86/mpeg4qpel.o \ |
x86/qpel.o |
YASM-OBJS-$(CONFIG_ENCODERS) += x86/dsputilenc.o |
YASM-OBJS-$(CONFIG_FFT) += x86/fft.o |
YASM-OBJS-$(CONFIG_H263_DECODER) += x86/h263_loopfilter.o |
YASM-OBJS-$(CONFIG_H263_ENCODER) += x86/h263_loopfilter.o |
YASM-OBJS-$(CONFIG_H264CHROMA) += x86/h264_chromamc.o \ |
x86/h264_chromamc_10bit.o |
YASM-OBJS-$(CONFIG_H264DSP) += x86/h264_deblock.o \ |
x86/h264_deblock_10bit.o \ |
x86/h264_idct.o \ |
x86/h264_idct_10bit.o \ |
x86/h264_weight.o \ |
x86/h264_weight_10bit.o |
YASM-OBJS-$(CONFIG_H264PRED) += x86/h264_intrapred.o \ |
x86/h264_intrapred_10bit.o |
YASM-OBJS-$(CONFIG_H264QPEL) += x86/h264_qpel_8bit.o \ |
x86/h264_qpel_10bit.o \ |
x86/fpel.o \ |
x86/qpel.o |
YASM-OBJS-$(CONFIG_HPELDSP) += x86/fpel.o \ |
x86/hpeldsp.o |
YASM-OBJS-$(CONFIG_MPEGAUDIODSP) += x86/imdct36.o |
YASM-OBJS-$(CONFIG_PNG_DECODER) += x86/pngdsp.o |
YASM-OBJS-$(CONFIG_PRORES_DECODER) += x86/proresdsp.o |
YASM-OBJS-$(CONFIG_PRORES_LGPL_DECODER) += x86/proresdsp.o |
YASM-OBJS-$(CONFIG_RV30_DECODER) += x86/rv34dsp.o |
YASM-OBJS-$(CONFIG_RV40_DECODER) += x86/rv34dsp.o \ |
x86/rv40dsp.o |
YASM-OBJS-$(CONFIG_V210_DECODER) += x86/v210.o |
YASM-OBJS-$(CONFIG_VC1_DECODER) += x86/vc1dsp.o |
YASM-OBJS-$(CONFIG_VIDEODSP) += x86/videodsp.o |
YASM-OBJS-$(CONFIG_VORBIS_DECODER) += x86/vorbisdsp.o |
YASM-OBJS-$(CONFIG_VP3DSP) += x86/vp3dsp.o |
YASM-OBJS-$(CONFIG_VP6_DECODER) += x86/vp6dsp.o |
YASM-OBJS-$(CONFIG_VP8_DECODER) += x86/vp8dsp.o |
YASM-OBJS-$(CONFIG_VP9_DECODER) += x86/vp9dsp.o |
YASM-OBJS-$(CONFIG_WEBP_DECODER) += x86/vp8dsp.o |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/ac3dsp.asm
;***************************************************************************** |
;* x86-optimized AC-3 DSP utils |
;* Copyright (c) 2011 Justin Ruggles |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
; 16777216.0f - used in ff_float_to_fixed24() |
pf_1_24: times 4 dd 0x4B800000 |
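; (0x4B800000 is 16777216.0f = 2^24 as an IEEE-754 float; multiplying by
; it rescales samples from [-1.0, 1.0) to Q1.24 fixed point.)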
; used in ff_ac3_compute_mantissa_size() |
cextern ac3_bap_bits |
pw_bap_mul1: dw 21846, 21846, 0, 32768, 21846, 21846, 0, 32768 |
pw_bap_mul2: dw 5, 7, 0, 7, 5, 7, 0, 7 |
; used in ff_ac3_extract_exponents() |
pd_1: times 4 dd 1 |
pd_151: times 4 dd 151 |
SECTION .text |
;----------------------------------------------------------------------------- |
; void ff_ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs) |
;----------------------------------------------------------------------------- |
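; A scalar C reference of what the macro below computes (a sketch, not
; part of the build): exponents for coefficient i repeat every 256 bytes
; per block, and the minimum across the reused blocks is written back to
; block 0. FFMIN is the usual libavutil helper.
;
;   void ac3_exponent_min(uint8_t *exp, int num_reuse_blocks, int nb_coefs)
;   {
;       for (int i = 0; i < nb_coefs; i++) {
;           uint8_t v = exp[i];                      /* block 0 */
;           for (int blk = 1; blk <= num_reuse_blocks; blk++)
;               v = FFMIN(v, exp[256 * blk + i]);    /* reused blocks */
;           exp[i] = v;
;       }
;   }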
%macro AC3_EXPONENT_MIN 0 |
cglobal ac3_exponent_min, 3, 4, 2, exp, reuse_blks, expn, offset |
shl reuse_blksq, 8 |
jz .end |
LOOP_ALIGN |
.nextexp: |
mov offsetq, reuse_blksq |
mova m0, [expq+offsetq] |
sub offsetq, 256 |
LOOP_ALIGN |
.nextblk: |
PMINUB m0, [expq+offsetq], m1 |
sub offsetq, 256 |
jae .nextblk |
mova [expq], m0 |
add expq, mmsize |
sub expnq, mmsize |
jg .nextexp |
.end: |
REP_RET |
%endmacro |
%define LOOP_ALIGN |
INIT_MMX mmx |
AC3_EXPONENT_MIN |
%if HAVE_MMXEXT_EXTERNAL |
%define LOOP_ALIGN ALIGN 16 |
INIT_MMX mmxext |
AC3_EXPONENT_MIN |
%endif |
%if HAVE_SSE2_EXTERNAL |
INIT_XMM sse2 |
AC3_EXPONENT_MIN |
%endif |
%undef LOOP_ALIGN |
;----------------------------------------------------------------------------- |
; int ff_ac3_max_msb_abs_int16(const int16_t *src, int len) |
; |
; This function uses 2 different methods to calculate a valid result. |
; 1) logical 'or' of abs of each element |
; This is used for ssse3 because of the pabsw instruction. |
; It is also used for mmx because of the lack of min/max instructions. |
; 2) calculate min/max for the array, then or(abs(min),abs(max)) |
; This is used for mmxext and sse2 because they have pminsw/pmaxsw. |
;----------------------------------------------------------------------------- |
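; Both methods reduce to this scalar reference (a sketch, not part of the
; build): the OR of absolute values has the same most significant bit as
; the true maximum, which is all the caller inspects.
;
;   int ac3_max_msb_abs_int16(const int16_t *src, int len)
;   {
;       int i, v = 0;
;       for (i = 0; i < len; i++)
;           v |= abs(src[i]);
;       return v;
;   }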
; logical 'or' of 4 or 8 words in an mmx or xmm register into the low word |
%macro OR_WORDS_HORIZ 2 ; src, tmp |
%if cpuflag(sse2) |
movhlps %2, %1 |
por %1, %2 |
pshuflw %2, %1, q0032 |
por %1, %2 |
pshuflw %2, %1, q0001 |
por %1, %2 |
%elif cpuflag(mmxext) |
pshufw %2, %1, q0032 |
por %1, %2 |
pshufw %2, %1, q0001 |
por %1, %2 |
%else ; mmx |
movq %2, %1 |
psrlq %2, 32 |
por %1, %2 |
movq %2, %1 |
psrlq %2, 16 |
por %1, %2 |
%endif |
%endmacro |
%macro AC3_MAX_MSB_ABS_INT16 1 |
cglobal ac3_max_msb_abs_int16, 2,2,5, src, len |
pxor m2, m2 |
pxor m3, m3 |
.loop: |
%ifidn %1, min_max |
mova m0, [srcq] |
mova m1, [srcq+mmsize] |
pminsw m2, m0 |
pminsw m2, m1 |
pmaxsw m3, m0 |
pmaxsw m3, m1 |
%else ; or_abs |
%if notcpuflag(ssse3) |
mova m0, [srcq] |
mova m1, [srcq+mmsize] |
ABS2 m0, m1, m3, m4 |
%else ; ssse3 |
; using memory args is faster for ssse3 |
pabsw m0, [srcq] |
pabsw m1, [srcq+mmsize] |
%endif |
por m2, m0 |
por m2, m1 |
%endif |
add srcq, mmsize*2 |
sub lend, mmsize |
ja .loop |
%ifidn %1, min_max |
ABS2 m2, m3, m0, m1 |
por m2, m3 |
%endif |
OR_WORDS_HORIZ m2, m0 |
movd eax, m2 |
and eax, 0xFFFF |
RET |
%endmacro |
INIT_MMX mmx |
AC3_MAX_MSB_ABS_INT16 or_abs |
INIT_MMX mmxext |
AC3_MAX_MSB_ABS_INT16 min_max |
INIT_XMM sse2 |
AC3_MAX_MSB_ABS_INT16 min_max |
INIT_XMM ssse3 |
AC3_MAX_MSB_ABS_INT16 or_abs |
;----------------------------------------------------------------------------- |
; macro used for ff_ac3_lshift_int16() and ff_ac3_rshift_int32() |
;----------------------------------------------------------------------------- |
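; C reference for both generated functions (a sketch, not part of the
; build); the right-shift variant is the same loop over int32_t with an
; arithmetic shift right (src[i] >>= shift).
;
;   void ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift)
;   {
;       unsigned int i;
;       for (i = 0; i < len; i++)
;           src[i] <<= shift;
;   }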
%macro AC3_SHIFT 3 ; l/r, 16/32, shift instruction
cglobal ac3_%1shift_int%2, 3, 3, 5, src, len, shift |
movd m0, shiftd |
.loop: |
mova m1, [srcq ] |
mova m2, [srcq+mmsize ] |
mova m3, [srcq+mmsize*2] |
mova m4, [srcq+mmsize*3] |
%3 m1, m0 |
%3 m2, m0 |
%3 m3, m0 |
%3 m4, m0 |
mova [srcq ], m1 |
mova [srcq+mmsize ], m2 |
mova [srcq+mmsize*2], m3 |
mova [srcq+mmsize*3], m4 |
add srcq, mmsize*4 |
sub lend, mmsize*32/%2 |
ja .loop |
.end: |
REP_RET |
%endmacro |
;----------------------------------------------------------------------------- |
; void ff_ac3_lshift_int16(int16_t *src, unsigned int len, unsigned int shift) |
;----------------------------------------------------------------------------- |
INIT_MMX mmx |
AC3_SHIFT l, 16, psllw |
INIT_XMM sse2 |
AC3_SHIFT l, 16, psllw |
;----------------------------------------------------------------------------- |
; void ff_ac3_rshift_int32(int32_t *src, unsigned int len, unsigned int shift) |
;----------------------------------------------------------------------------- |
INIT_MMX mmx |
AC3_SHIFT r, 32, psrad |
INIT_XMM sse2 |
AC3_SHIFT r, 32, psrad |
;----------------------------------------------------------------------------- |
; void ff_float_to_fixed24(int32_t *dst, const float *src, unsigned int len) |
;----------------------------------------------------------------------------- |
; The 3DNow! version is not bit-identical because pf2id uses truncation rather |
; than round-to-nearest. |
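; C reference (a sketch, not part of the build): scale to Q1.24 and
; round to nearest with the usual libm helper.
;
;   void float_to_fixed24(int32_t *dst, const float *src, unsigned int len)
;   {
;       while (len--)
;           *dst++ = lrintf(*src++ * 16777216.0f);
;   }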
INIT_MMX 3dnow |
cglobal float_to_fixed24, 3, 3, 0, dst, src, len |
movq m0, [pf_1_24] |
.loop: |
movq m1, [srcq ] |
movq m2, [srcq+8 ] |
movq m3, [srcq+16] |
movq m4, [srcq+24] |
pfmul m1, m0 |
pfmul m2, m0 |
pfmul m3, m0 |
pfmul m4, m0 |
pf2id m1, m1 |
pf2id m2, m2 |
pf2id m3, m3 |
pf2id m4, m4 |
movq [dstq ], m1 |
movq [dstq+8 ], m2 |
movq [dstq+16], m3 |
movq [dstq+24], m4 |
add srcq, 32 |
add dstq, 32 |
sub lend, 8 |
ja .loop |
femms |
RET |
INIT_XMM sse |
cglobal float_to_fixed24, 3, 3, 3, dst, src, len |
movaps m0, [pf_1_24] |
.loop: |
movaps m1, [srcq ] |
movaps m2, [srcq+16] |
mulps m1, m0 |
mulps m2, m0 |
cvtps2pi mm0, m1 |
movhlps m1, m1 |
cvtps2pi mm1, m1 |
cvtps2pi mm2, m2 |
movhlps m2, m2 |
cvtps2pi mm3, m2 |
movq [dstq ], mm0 |
movq [dstq+ 8], mm1 |
movq [dstq+16], mm2 |
movq [dstq+24], mm3 |
add srcq, 32 |
add dstq, 32 |
sub lend, 8 |
ja .loop |
emms |
RET |
INIT_XMM sse2 |
cglobal float_to_fixed24, 3, 3, 9, dst, src, len |
movaps m0, [pf_1_24] |
.loop: |
movaps m1, [srcq ] |
movaps m2, [srcq+16 ] |
movaps m3, [srcq+32 ] |
movaps m4, [srcq+48 ] |
%ifdef m8 |
movaps m5, [srcq+64 ] |
movaps m6, [srcq+80 ] |
movaps m7, [srcq+96 ] |
movaps m8, [srcq+112] |
%endif |
mulps m1, m0 |
mulps m2, m0 |
mulps m3, m0 |
mulps m4, m0 |
%ifdef m8 |
mulps m5, m0 |
mulps m6, m0 |
mulps m7, m0 |
mulps m8, m0 |
%endif |
cvtps2dq m1, m1 |
cvtps2dq m2, m2 |
cvtps2dq m3, m3 |
cvtps2dq m4, m4 |
%ifdef m8 |
cvtps2dq m5, m5 |
cvtps2dq m6, m6 |
cvtps2dq m7, m7 |
cvtps2dq m8, m8 |
%endif |
movdqa [dstq ], m1 |
movdqa [dstq+16 ], m2 |
movdqa [dstq+32 ], m3 |
movdqa [dstq+48 ], m4 |
%ifdef m8 |
movdqa [dstq+64 ], m5 |
movdqa [dstq+80 ], m6 |
movdqa [dstq+96 ], m7 |
movdqa [dstq+112], m8 |
add srcq, 128 |
add dstq, 128 |
sub lend, 32
%else
add srcq, 64
add dstq, 64
sub lend, 16
%endif |
ja .loop |
REP_RET |
;------------------------------------------------------------------------------ |
; int ff_ac3_compute_mantissa_size(uint16_t mant_cnt[6][16]) |
;------------------------------------------------------------------------------ |
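; Strategy: baps 1, 2 and 4 group 3, 3 and 2 mantissas into one 5-, 7-
; and 7-bit code respectively, so their costs are (cnt/3)*5, (cnt/3)*7
; and (cnt/2)*7; every other bap costs ac3_bap_bits[bap] bits per
; mantissa (the table holds zeros for the grouped baps). The divisions
; are done with pmulhuw: (cnt*21846) >> 16 ~= cnt/3 and
; (cnt*32768) >> 16 == cnt/2 (see pw_bap_mul1 above), then pmaddwd
; against pw_bap_mul2 = {5,7,0,7} applies the per-group code widths.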
%macro PHADDD4 2 ; xmm src, xmm tmp |
movhlps %2, %1 |
paddd %1, %2 |
pshufd %2, %1, 0x1 |
paddd %1, %2 |
%endmacro |
INIT_XMM sse2 |
cglobal ac3_compute_mantissa_size, 1, 2, 4, mant_cnt, sum |
movdqa m0, [mant_cntq ] |
movdqa m1, [mant_cntq+ 1*16] |
paddw m0, [mant_cntq+ 2*16] |
paddw m1, [mant_cntq+ 3*16] |
paddw m0, [mant_cntq+ 4*16] |
paddw m1, [mant_cntq+ 5*16] |
paddw m0, [mant_cntq+ 6*16] |
paddw m1, [mant_cntq+ 7*16] |
paddw m0, [mant_cntq+ 8*16] |
paddw m1, [mant_cntq+ 9*16] |
paddw m0, [mant_cntq+10*16] |
paddw m1, [mant_cntq+11*16] |
pmaddwd m0, [ac3_bap_bits ] |
pmaddwd m1, [ac3_bap_bits+16] |
paddd m0, m1 |
PHADDD4 m0, m1 |
movd sumd, m0 |
movdqa m3, [pw_bap_mul1] |
movhpd m0, [mant_cntq +2] |
movlpd m0, [mant_cntq+1*32+2] |
movhpd m1, [mant_cntq+2*32+2] |
movlpd m1, [mant_cntq+3*32+2] |
movhpd m2, [mant_cntq+4*32+2] |
movlpd m2, [mant_cntq+5*32+2] |
pmulhuw m0, m3 |
pmulhuw m1, m3 |
pmulhuw m2, m3 |
paddusw m0, m1 |
paddusw m0, m2 |
pmaddwd m0, [pw_bap_mul2] |
PHADDD4 m0, m1 |
movd eax, m0 |
add eax, sumd |
RET |
;------------------------------------------------------------------------------ |
; void ff_ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs) |
;------------------------------------------------------------------------------ |
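; C reference (a sketch, not part of the build): coefficients are 24-bit,
; so the exponent is 23 - log2(|coef|), or 24 for a zero coefficient;
; av_log2() is the usual libavutil floor-log2 helper.
;
;   void ac3_extract_exponents(uint8_t *exp, int32_t *coef, int nb_coefs)
;   {
;       for (int i = 0; i < nb_coefs; i++) {
;           int v = abs(coef[i]);
;           exp[i] = v ? 23 - av_log2(v) : 24;
;       }
;   }
;
; The SIMD code below gets the log2 for free: (2*|coef|+1) converted to
; float has biased exponent 128 + log2(|coef|), so subtracting that byte
; from 151 (pd_151 above) yields the result, with v == 0 landing on 24.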
%macro PABSD 1-2 ; src/dst, unused |
%if cpuflag(ssse3) |
pabsd %1, %1 |
%else ; src/dst, tmp |
pxor %2, %2 |
pcmpgtd %2, %1 |
pxor %1, %2 |
psubd %1, %2 |
%endif |
%endmacro |
%macro AC3_EXTRACT_EXPONENTS 0 |
cglobal ac3_extract_exponents, 3, 3, 4, exp, coef, len |
add expq, lenq |
lea coefq, [coefq+4*lenq] |
neg lenq |
mova m2, [pd_1] |
mova m3, [pd_151] |
.loop: |
; move 4 32-bit coefs to xmm0 |
mova m0, [coefq+4*lenq] |
; absolute value |
PABSD m0, m1 |
; convert to float and extract exponents |
pslld m0, 1 |
por m0, m2 |
cvtdq2ps m1, m0 |
psrld m1, 23 |
mova m0, m3 |
psubd m0, m1 |
; move the lowest byte in each of 4 dwords to the low dword |
; NOTE: We cannot just extract the low bytes with pshufb because the dword |
; result for 16777215 is -1 due to float inaccuracy. Using packuswb |
; clips this to 0, which is the correct exponent. |
packssdw m0, m0 |
packuswb m0, m0 |
movd [expq+lenq], m0 |
add lenq, 4 |
jl .loop |
REP_RET |
%endmacro |
%if HAVE_SSE2_EXTERNAL |
INIT_XMM sse2 |
AC3_EXTRACT_EXPONENTS |
%endif |
%if HAVE_SSSE3_EXTERNAL |
INIT_XMM ssse3 |
AC3_EXTRACT_EXPONENTS |
%endif |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/ac3dsp_init.c
/* |
* x86-optimized AC-3 DSP utils |
* Copyright (c) 2011 Justin Ruggles |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/mem.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "dsputil_x86.h" |
#include "libavcodec/ac3.h" |
#include "libavcodec/ac3dsp.h" |
void ff_ac3_exponent_min_mmx (uint8_t *exp, int num_reuse_blocks, int nb_coefs); |
void ff_ac3_exponent_min_mmxext(uint8_t *exp, int num_reuse_blocks, int nb_coefs); |
void ff_ac3_exponent_min_sse2 (uint8_t *exp, int num_reuse_blocks, int nb_coefs); |
int ff_ac3_max_msb_abs_int16_mmx (const int16_t *src, int len); |
int ff_ac3_max_msb_abs_int16_mmxext(const int16_t *src, int len); |
int ff_ac3_max_msb_abs_int16_sse2 (const int16_t *src, int len); |
int ff_ac3_max_msb_abs_int16_ssse3(const int16_t *src, int len); |
void ff_ac3_lshift_int16_mmx (int16_t *src, unsigned int len, unsigned int shift); |
void ff_ac3_lshift_int16_sse2(int16_t *src, unsigned int len, unsigned int shift); |
void ff_ac3_rshift_int32_mmx (int32_t *src, unsigned int len, unsigned int shift); |
void ff_ac3_rshift_int32_sse2(int32_t *src, unsigned int len, unsigned int shift); |
void ff_float_to_fixed24_3dnow(int32_t *dst, const float *src, unsigned int len); |
void ff_float_to_fixed24_sse (int32_t *dst, const float *src, unsigned int len); |
void ff_float_to_fixed24_sse2 (int32_t *dst, const float *src, unsigned int len); |
int ff_ac3_compute_mantissa_size_sse2(uint16_t mant_cnt[6][16]); |
void ff_ac3_extract_exponents_3dnow(uint8_t *exp, int32_t *coef, int nb_coefs); |
void ff_ac3_extract_exponents_sse2 (uint8_t *exp, int32_t *coef, int nb_coefs); |
void ff_ac3_extract_exponents_ssse3(uint8_t *exp, int32_t *coef, int nb_coefs); |
#if ARCH_X86_32 && defined(__INTEL_COMPILER) |
# undef HAVE_7REGS |
# define HAVE_7REGS 0 |
#endif |
#if HAVE_SSE_INLINE && HAVE_7REGS |
#define IF1(x) x |
#define IF0(x) |
#define MIX5(mono, stereo) \ |
__asm__ volatile ( \ |
"movss 0(%1), %%xmm5 \n" \ |
"movss 8(%1), %%xmm6 \n" \ |
"movss 24(%1), %%xmm7 \n" \ |
"shufps $0, %%xmm5, %%xmm5 \n" \ |
"shufps $0, %%xmm6, %%xmm6 \n" \ |
"shufps $0, %%xmm7, %%xmm7 \n" \ |
"1: \n" \ |
"movaps (%0, %2), %%xmm0 \n" \ |
"movaps (%0, %3), %%xmm1 \n" \ |
"movaps (%0, %4), %%xmm2 \n" \ |
"movaps (%0, %5), %%xmm3 \n" \ |
"movaps (%0, %6), %%xmm4 \n" \ |
"mulps %%xmm5, %%xmm0 \n" \ |
"mulps %%xmm6, %%xmm1 \n" \ |
"mulps %%xmm5, %%xmm2 \n" \ |
"mulps %%xmm7, %%xmm3 \n" \ |
"mulps %%xmm7, %%xmm4 \n" \ |
stereo("addps %%xmm1, %%xmm0 \n") \ |
"addps %%xmm1, %%xmm2 \n" \ |
"addps %%xmm3, %%xmm0 \n" \ |
"addps %%xmm4, %%xmm2 \n" \ |
mono("addps %%xmm2, %%xmm0 \n") \ |
"movaps %%xmm0, (%0, %2) \n" \ |
stereo("movaps %%xmm2, (%0, %3) \n") \ |
"add $16, %0 \n" \ |
"jl 1b \n" \ |
: "+&r"(i) \ |
: "r"(matrix), \ |
"r"(samples[0] + len), \ |
"r"(samples[1] + len), \ |
"r"(samples[2] + len), \ |
"r"(samples[3] + len), \ |
"r"(samples[4] + len) \ |
: XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", \ |
"%xmm4", "%xmm5", "%xmm6", "%xmm7",) \ |
"memory" \ |
); |
#define MIX_MISC(stereo) \ |
__asm__ volatile ( \ |
"mov %5, %2 \n" \ |
"1: \n" \ |
"mov -%c7(%6, %2, %c8), %3 \n" \ |
"movaps (%3, %0), %%xmm0 \n" \ |
stereo("movaps %%xmm0, %%xmm1 \n") \ |
"mulps %%xmm4, %%xmm0 \n" \ |
stereo("mulps %%xmm5, %%xmm1 \n") \ |
"2: \n" \ |
"mov (%6, %2, %c8), %1 \n" \ |
"movaps (%1, %0), %%xmm2 \n" \ |
stereo("movaps %%xmm2, %%xmm3 \n") \ |
"mulps (%4, %2, 8), %%xmm2 \n" \ |
stereo("mulps 16(%4, %2, 8), %%xmm3 \n") \ |
"addps %%xmm2, %%xmm0 \n" \ |
stereo("addps %%xmm3, %%xmm1 \n") \ |
"add $4, %2 \n" \ |
"jl 2b \n" \ |
"mov %5, %2 \n" \ |
stereo("mov (%6, %2, %c8), %1 \n") \ |
"movaps %%xmm0, (%3, %0) \n" \ |
stereo("movaps %%xmm1, (%1, %0) \n") \ |
"add $16, %0 \n" \ |
"jl 1b \n" \ |
: "+&r"(i), "=&r"(j), "=&r"(k), "=&r"(m) \ |
: "r"(matrix_simd + in_ch), \ |
"g"((intptr_t) - 4 * (in_ch - 1)), \ |
"r"(samp + in_ch), \ |
"i"(sizeof(float *)), "i"(sizeof(float *)/4) \ |
: "memory" \ |
); |
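/*
 * What the assembly fast paths compute, in scalar form (a reference
 * sketch; a, b, c stand for matrix[0][0], matrix[1][0] and matrix[3][0],
 * and the bit-pattern checks in ac3_downmix_sse() below guarantee the
 * remaining matrix entries are zero or equal to one of them):
 *
 *     MIX5(IF0, IF1), 5 channels -> stereo:
 *         left[i]  = a*s0[i] + b*s1[i] + c*s3[i];
 *         right[i] = a*s2[i] + b*s1[i] + c*s4[i];
 *     MIX5(IF1, IF0), 5 channels -> mono (into samples[0]):
 *         out[i] = a*(s0[i] + s2[i]) + b*s1[i] + c*(s3[i] + s4[i]);
 *
 * MIX_MISC handles arbitrary channel counts by looping over the input
 * channels with one (mono) or two (stereo) xmm accumulators.
 */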
static void ac3_downmix_sse(float **samples, float (*matrix)[2], |
int out_ch, int in_ch, int len) |
{ |
int (*matrix_cmp)[2] = (int(*)[2])matrix; |
intptr_t i, j, k, m; |
i = -len * sizeof(float); |
if (in_ch == 5 && out_ch == 2 && |
!(matrix_cmp[0][1] | matrix_cmp[2][0] | |
matrix_cmp[3][1] | matrix_cmp[4][0] | |
(matrix_cmp[1][0] ^ matrix_cmp[1][1]) | |
(matrix_cmp[0][0] ^ matrix_cmp[2][1]))) { |
MIX5(IF0, IF1); |
} else if (in_ch == 5 && out_ch == 1 && |
matrix_cmp[0][0] == matrix_cmp[2][0] && |
matrix_cmp[3][0] == matrix_cmp[4][0]) { |
MIX5(IF1, IF0); |
} else { |
DECLARE_ALIGNED(16, float, matrix_simd)[AC3_MAX_CHANNELS][2][4]; |
float *samp[AC3_MAX_CHANNELS]; |
for (j = 0; j < in_ch; j++) |
samp[j] = samples[j] + len; |
j = 2 * in_ch * sizeof(float); |
__asm__ volatile ( |
"1: \n" |
"sub $8, %0 \n" |
"movss (%2, %0), %%xmm4 \n" |
"movss 4(%2, %0), %%xmm5 \n" |
"shufps $0, %%xmm4, %%xmm4 \n" |
"shufps $0, %%xmm5, %%xmm5 \n" |
"movaps %%xmm4, (%1, %0, 4) \n" |
"movaps %%xmm5, 16(%1, %0, 4) \n" |
"jg 1b \n" |
: "+&r"(j) |
: "r"(matrix_simd), "r"(matrix) |
: "memory" |
); |
if (out_ch == 2) { |
MIX_MISC(IF1); |
} else { |
MIX_MISC(IF0); |
} |
} |
} |
#endif /* HAVE_SSE_INLINE && HAVE_7REGS */ |
av_cold void ff_ac3dsp_init_x86(AC3DSPContext *c, int bit_exact) |
{ |
int cpu_flags = av_get_cpu_flags(); |
if (EXTERNAL_MMX(cpu_flags)) { |
c->ac3_exponent_min = ff_ac3_exponent_min_mmx; |
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmx; |
c->ac3_lshift_int16 = ff_ac3_lshift_int16_mmx; |
c->ac3_rshift_int32 = ff_ac3_rshift_int32_mmx; |
} |
if (EXTERNAL_AMD3DNOW(cpu_flags)) { |
if (!bit_exact) { |
c->float_to_fixed24 = ff_float_to_fixed24_3dnow; |
} |
} |
if (EXTERNAL_MMXEXT(cpu_flags)) { |
c->ac3_exponent_min = ff_ac3_exponent_min_mmxext; |
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_mmxext; |
} |
if (EXTERNAL_SSE(cpu_flags)) { |
c->float_to_fixed24 = ff_float_to_fixed24_sse; |
} |
if (EXTERNAL_SSE2(cpu_flags)) { |
c->ac3_exponent_min = ff_ac3_exponent_min_sse2; |
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_sse2; |
c->float_to_fixed24 = ff_float_to_fixed24_sse2; |
c->compute_mantissa_size = ff_ac3_compute_mantissa_size_sse2; |
c->extract_exponents = ff_ac3_extract_exponents_sse2; |
if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) { |
c->ac3_lshift_int16 = ff_ac3_lshift_int16_sse2; |
c->ac3_rshift_int32 = ff_ac3_rshift_int32_sse2; |
} |
} |
if (EXTERNAL_SSSE3(cpu_flags)) { |
c->ac3_max_msb_abs_int16 = ff_ac3_max_msb_abs_int16_ssse3; |
if (!(cpu_flags & AV_CPU_FLAG_ATOM)) { |
c->extract_exponents = ff_ac3_extract_exponents_ssse3; |
} |
} |
#if HAVE_SSE_INLINE && HAVE_7REGS |
if (INLINE_SSE(cpu_flags)) { |
c->downmix = ac3_downmix_sse; |
} |
#endif |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/cabac.h
/* |
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#ifndef AVCODEC_X86_CABAC_H |
#define AVCODEC_X86_CABAC_H |
#include "libavcodec/cabac.h" |
#include "libavutil/attributes.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/internal.h" |
#include "config.h" |
#if (defined(__i386) && defined(__clang__) && (__clang_major__<2 || (__clang_major__==2 && __clang_minor__<10)))\ |
|| ( !defined(__clang__) && defined(__llvm__) && __GNUC__==4 && __GNUC_MINOR__==2 && __GNUC_PATCHLEVEL__<=1) |
# define BROKEN_COMPILER 1 |
#else |
# define BROKEN_COMPILER 0 |
#endif |
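/*
 * For reference, the BRANCHLESS_GET_CABAC macros below implement the
 * following C logic (a sketch after the generic decoder in
 * cabac_functions.h; the refill step is omitted, and lps_range,
 * mlps_state and norm_shift are shorthand for the corresponding
 * sections of ff_h264_cabac_tables). CABAC_BITS is 16 here, hence the
 * "shl $17" sequences:
 *
 *     int s        = *state;
 *     int RangeLPS = lps_range[2 * (c->range & 0xC0) + s];
 *     c->range    -= RangeLPS;
 *     int lps_mask = ((c->range << 17) - c->low) >> 31;  // -1 if LPS taken
 *     c->low      -= (c->range << 17) & lps_mask;
 *     c->range    += (RangeLPS - c->range) & lps_mask;
 *     s           ^= lps_mask;
 *     *state       = mlps_state[128 + s];                // state transition
 *     int shift    = norm_shift[c->range];
 *     c->range   <<= shift;                              // renormalize
 *     c->low     <<= shift;
 *     return s & 1;                                      // the decoded bit
 */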
#if HAVE_INLINE_ASM |
#ifndef UNCHECKED_BITSTREAM_READER |
#define UNCHECKED_BITSTREAM_READER !CONFIG_SAFE_BITSTREAM_READER |
#endif |
#if UNCHECKED_BITSTREAM_READER |
#define END_CHECK(end) "" |
#else |
#define END_CHECK(end) \ |
"cmp "end" , %%"REG_c" \n\t"\ |
"jge 1f \n\t" |
#endif |
#ifdef BROKEN_RELOCATIONS |
#define TABLES_ARG , "r"(tables) |
#if HAVE_FAST_CMOV |
#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \ |
"cmp "low" , "tmp" \n\t"\ |
"cmova %%ecx , "range" \n\t"\ |
"sbb %%rcx , %%rcx \n\t"\ |
"and %%ecx , "tmp" \n\t"\ |
"xor %%rcx , "retq" \n\t"\ |
"sub "tmp" , "low" \n\t" |
#else /* HAVE_FAST_CMOV */ |
#define BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \ |
/* P4 Prescott has crappy cmov,sbb,64bit shift so avoid them */ \ |
"sub "low" , "tmp" \n\t"\ |
"sar $31 , "tmp" \n\t"\ |
"sub %%ecx , "range" \n\t"\ |
"and "tmp" , "range" \n\t"\ |
"add %%ecx , "range" \n\t"\ |
"shl $17 , %%ecx \n\t"\ |
"and "tmp" , %%ecx \n\t"\ |
"sub %%ecx , "low" \n\t"\ |
"xor "tmp" , "ret" \n\t"\ |
"movslq "ret" , "retq" \n\t" |
#endif /* HAVE_FAST_CMOV */ |
#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \ |
"movzbl "statep" , "ret" \n\t"\ |
"mov "range" , "tmp" \n\t"\ |
"and $0xC0 , "range" \n\t"\ |
"lea ("ret", "range", 2), %%ecx \n\t"\ |
"movzbl "lps_off"("tables", %%rcx), "range" \n\t"\ |
"sub "range" , "tmp" \n\t"\ |
"mov "tmp" , %%ecx \n\t"\ |
"shl $17 , "tmp" \n\t"\ |
BRANCHLESS_GET_CABAC_UPDATE(ret, retq, low, range, tmp) \ |
"movzbl "norm_off"("tables", "rangeq"), %%ecx \n\t"\ |
"shl %%cl , "range" \n\t"\ |
"movzbl "mlps_off"+128("tables", "retq"), "tmp" \n\t"\ |
"shl %%cl , "low" \n\t"\ |
"mov "tmpbyte" , "statep" \n\t"\ |
"test "lowword" , "lowword" \n\t"\ |
"jnz 2f \n\t"\ |
"mov "byte" , %%"REG_c" \n\t"\ |
END_CHECK(end)\ |
"add"OPSIZE" $2 , "byte" \n\t"\ |
"1: \n\t"\ |
"movzwl (%%"REG_c") , "tmp" \n\t"\ |
"lea -1("low") , %%ecx \n\t"\ |
"xor "low" , %%ecx \n\t"\ |
"shr $15 , %%ecx \n\t"\ |
"bswap "tmp" \n\t"\ |
"shr $15 , "tmp" \n\t"\ |
"movzbl "norm_off"("tables", %%rcx), %%ecx \n\t"\ |
"sub $0xFFFF , "tmp" \n\t"\ |
"neg %%ecx \n\t"\ |
"add $7 , %%ecx \n\t"\ |
"shl %%cl , "tmp" \n\t"\ |
"add "tmp" , "low" \n\t"\ |
"2: \n\t" |
#else /* BROKEN_RELOCATIONS */ |
#define TABLES_ARG |
#define RIP_ARG |
#if HAVE_FAST_CMOV |
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\ |
"mov "tmp" , %%ecx \n\t"\ |
"shl $17 , "tmp" \n\t"\ |
"cmp "low" , "tmp" \n\t"\ |
"cmova %%ecx , "range" \n\t"\ |
"sbb %%ecx , %%ecx \n\t"\ |
"and %%ecx , "tmp" \n\t"\ |
"xor %%ecx , "ret" \n\t"\ |
"sub "tmp" , "low" \n\t" |
#else /* HAVE_FAST_CMOV */ |
#define BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp)\ |
"mov "tmp" , %%ecx \n\t"\ |
"shl $17 , "tmp" \n\t"\ |
"sub "low" , "tmp" \n\t"\ |
"sar $31 , "tmp" \n\t" /*lps_mask*/\ |
"sub %%ecx , "range" \n\t" /*RangeLPS - range*/\ |
"and "tmp" , "range" \n\t" /*(RangeLPS - range)&lps_mask*/\ |
"add %%ecx , "range" \n\t" /*new range*/\ |
"shl $17 , %%ecx \n\t"\ |
"and "tmp" , %%ecx \n\t"\ |
"sub %%ecx , "low" \n\t"\ |
"xor "tmp" , "ret" \n\t" |
#endif /* HAVE_FAST_CMOV */ |
#define BRANCHLESS_GET_CABAC(ret, retq, statep, low, lowword, range, rangeq, tmp, tmpbyte, byte, end, norm_off, lps_off, mlps_off, tables) \ |
"movzbl "statep" , "ret" \n\t"\ |
"mov "range" , "tmp" \n\t"\ |
"and $0xC0 , "range" \n\t"\ |
"movzbl "MANGLE(ff_h264_cabac_tables)"+"lps_off"("ret", "range", 2), "range" \n\t"\ |
"sub "range" , "tmp" \n\t"\ |
BRANCHLESS_GET_CABAC_UPDATE(ret, low, range, tmp) \ |
"movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"("range"), %%ecx \n\t"\ |
"shl %%cl , "range" \n\t"\ |
"movzbl "MANGLE(ff_h264_cabac_tables)"+"mlps_off"+128("ret"), "tmp" \n\t"\ |
"shl %%cl , "low" \n\t"\ |
"mov "tmpbyte" , "statep" \n\t"\ |
"test "lowword" , "lowword" \n\t"\ |
" jnz 2f \n\t"\ |
"mov "byte" , %%"REG_c" \n\t"\ |
END_CHECK(end)\ |
"add"OPSIZE" $2 , "byte" \n\t"\ |
"1: \n\t"\ |
"movzwl (%%"REG_c") , "tmp" \n\t"\ |
"lea -1("low") , %%ecx \n\t"\ |
"xor "low" , %%ecx \n\t"\ |
"shr $15 , %%ecx \n\t"\ |
"bswap "tmp" \n\t"\ |
"shr $15 , "tmp" \n\t"\ |
"movzbl "MANGLE(ff_h264_cabac_tables)"+"norm_off"(%%ecx), %%ecx \n\t"\ |
"sub $0xFFFF , "tmp" \n\t"\ |
"neg %%ecx \n\t"\ |
"add $7 , %%ecx \n\t"\ |
"shl %%cl , "tmp" \n\t"\ |
"add "tmp" , "low" \n\t"\ |
"2: \n\t" |
#endif /* BROKEN_RELOCATIONS */ |
#if HAVE_7REGS && !BROKEN_COMPILER |
#define get_cabac_inline get_cabac_inline_x86 |
static av_always_inline int get_cabac_inline_x86(CABACContext *c, |
uint8_t *const state) |
{ |
int bit, tmp; |
#ifdef BROKEN_RELOCATIONS |
void *tables; |
__asm__ volatile( |
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t" |
: "=&r"(tables) |
); |
#endif |
__asm__ volatile( |
BRANCHLESS_GET_CABAC("%0", "%q0", "(%4)", "%1", "%w1", |
"%2", "%q2", "%3", "%b3", |
"%c6(%5)", "%c7(%5)", |
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET), |
AV_STRINGIFY(H264_LPS_RANGE_OFFSET), |
AV_STRINGIFY(H264_MLPS_STATE_OFFSET), |
"%8") |
: "=&r"(bit), "=&r"(c->low), "=&r"(c->range), "=&q"(tmp) |
: "r"(state), "r"(c), |
"i"(offsetof(CABACContext, bytestream)), |
"i"(offsetof(CABACContext, bytestream_end)) |
TABLES_ARG |
,"1"(c->low), "2"(c->range) |
: "%"REG_c, "memory" |
); |
return bit & 1; |
} |
#endif /* HAVE_7REGS && !BROKEN_COMPILER */
#if !BROKEN_COMPILER |
#define get_cabac_bypass_sign get_cabac_bypass_sign_x86 |
static av_always_inline int get_cabac_bypass_sign_x86(CABACContext *c, int val) |
{ |
x86_reg tmp; |
__asm__ volatile( |
"movl %c6(%2), %k1 \n\t" |
"movl %c3(%2), %%eax \n\t" |
"shl $17, %k1 \n\t" |
"add %%eax, %%eax \n\t" |
"sub %k1, %%eax \n\t" |
"cltd \n\t" |
"and %%edx, %k1 \n\t" |
"add %k1, %%eax \n\t" |
"xor %%edx, %%ecx \n\t" |
"sub %%edx, %%ecx \n\t" |
"test %%ax, %%ax \n\t" |
"jnz 1f \n\t" |
"mov %c4(%2), %1 \n\t" |
"subl $0xFFFF, %%eax \n\t" |
"movzwl (%1), %%edx \n\t" |
"bswap %%edx \n\t" |
"shrl $15, %%edx \n\t" |
#if UNCHECKED_BITSTREAM_READER |
"add $2, %1 \n\t" |
"addl %%edx, %%eax \n\t" |
"mov %1, %c4(%2) \n\t" |
#else |
"addl %%edx, %%eax \n\t" |
"cmp %c5(%2), %1 \n\t" |
"jge 1f \n\t" |
"add"OPSIZE" $2, %c4(%2) \n\t" |
#endif |
"1: \n\t" |
"movl %%eax, %c3(%2) \n\t" |
: "+c"(val), "=&r"(tmp) |
: "r"(c), |
"i"(offsetof(CABACContext, low)), |
"i"(offsetof(CABACContext, bytestream)), |
"i"(offsetof(CABACContext, bytestream_end)), |
"i"(offsetof(CABACContext, range)) |
: "%eax", "%edx", "memory" |
); |
return val; |
} |
#define get_cabac_bypass get_cabac_bypass_x86 |
static av_always_inline int get_cabac_bypass_x86(CABACContext *c) |
{ |
x86_reg tmp; |
int res; |
__asm__ volatile( |
"movl %c6(%2), %k1 \n\t" |
"movl %c3(%2), %%eax \n\t" |
"shl $17, %k1 \n\t" |
"add %%eax, %%eax \n\t" |
"sub %k1, %%eax \n\t" |
"cltd \n\t" |
"and %%edx, %k1 \n\t" |
"add %k1, %%eax \n\t" |
"inc %%edx \n\t" |
"test %%ax, %%ax \n\t" |
"jnz 1f \n\t" |
"mov %c4(%2), %1 \n\t" |
"subl $0xFFFF, %%eax \n\t" |
"movzwl (%1), %%ecx \n\t" |
"bswap %%ecx \n\t" |
"shrl $15, %%ecx \n\t" |
"addl %%ecx, %%eax \n\t" |
"cmp %c5(%2), %1 \n\t" |
"jge 1f \n\t" |
"add"OPSIZE" $2, %c4(%2) \n\t" |
"1: \n\t" |
"movl %%eax, %c3(%2) \n\t" |
: "=&d"(res), "=&r"(tmp) |
: "r"(c), |
"i"(offsetof(CABACContext, low)), |
"i"(offsetof(CABACContext, bytestream)), |
"i"(offsetof(CABACContext, bytestream_end)), |
"i"(offsetof(CABACContext, range)) |
: "%eax", "%ecx", "memory" |
); |
return res; |
} |
#endif /* !BROKEN_COMPILER */ |
#endif /* HAVE_INLINE_ASM */ |
#endif /* AVCODEC_X86_CABAC_H */ |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/cavsdsp.c
/* |
* Chinese AVS video (AVS1-P2, JiZhun profile) decoder. |
* Copyright (c) 2006 Stefan Gehrer <stefan.gehrer@gmx.de> |
* |
* MMX-optimized DSP functions, based on H.264 optimizations by |
* Michael Niedermayer and Loren Merritt |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/attributes.h" |
#include "libavutil/common.h" |
#include "libavutil/cpu.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/cavsdsp.h" |
#include "constants.h" |
#include "dsputil_x86.h" |
#include "config.h" |
#if HAVE_MMX_INLINE |
/* in/out: mma=mma+mmb, mmb=mmb-mma */ |
#define SUMSUB_BA( a, b ) \ |
"paddw "#b", "#a" \n\t"\ |
"paddw "#b", "#b" \n\t"\ |
"psubw "#a", "#b" \n\t" |
/***************************************************************************** |
* |
* inverse transform |
* |
****************************************************************************/ |
static inline void cavs_idct8_1d(int16_t *block, uint64_t bias) |
{ |
__asm__ volatile( |
"movq 112(%0), %%mm4 \n\t" /* mm4 = src7 */ |
"movq 16(%0), %%mm5 \n\t" /* mm5 = src1 */ |
"movq 80(%0), %%mm2 \n\t" /* mm2 = src5 */ |
"movq 48(%0), %%mm7 \n\t" /* mm7 = src3 */ |
"movq %%mm4, %%mm0 \n\t" |
"movq %%mm5, %%mm3 \n\t" |
"movq %%mm2, %%mm6 \n\t" |
"movq %%mm7, %%mm1 \n\t" |
"paddw %%mm4, %%mm4 \n\t" /* mm4 = 2*src7 */ |
"paddw %%mm3, %%mm3 \n\t" /* mm3 = 2*src1 */ |
"paddw %%mm6, %%mm6 \n\t" /* mm6 = 2*src5 */ |
"paddw %%mm1, %%mm1 \n\t" /* mm1 = 2*src3 */ |
"paddw %%mm4, %%mm0 \n\t" /* mm0 = 3*src7 */ |
"paddw %%mm3, %%mm5 \n\t" /* mm5 = 3*src1 */ |
"paddw %%mm6, %%mm2 \n\t" /* mm2 = 3*src5 */ |
"paddw %%mm1, %%mm7 \n\t" /* mm7 = 3*src3 */ |
"psubw %%mm4, %%mm5 \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */ |
"paddw %%mm6, %%mm7 \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */ |
"psubw %%mm2, %%mm1 \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */ |
"paddw %%mm0, %%mm3 \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */ |
"movq %%mm5, %%mm4 \n\t" |
"movq %%mm7, %%mm6 \n\t" |
"movq %%mm3, %%mm0 \n\t" |
"movq %%mm1, %%mm2 \n\t" |
SUMSUB_BA( %%mm7, %%mm5 ) /* mm7 = a0 + a1 mm5 = a0 - a1 */ |
"paddw %%mm3, %%mm7 \n\t" /* mm7 = a0 + a1 + a3 */ |
"paddw %%mm1, %%mm5 \n\t" /* mm5 = a0 - a1 + a2 */ |
"paddw %%mm7, %%mm7 \n\t" |
"paddw %%mm5, %%mm5 \n\t" |
"paddw %%mm6, %%mm7 \n\t" /* mm7 = b4 */ |
"paddw %%mm4, %%mm5 \n\t" /* mm5 = b5 */ |
SUMSUB_BA( %%mm1, %%mm3 ) /* mm1 = a3 + a2 mm3 = a3 - a2 */ |
"psubw %%mm1, %%mm4 \n\t" /* mm4 = a0 - a2 - a3 */ |
"movq %%mm4, %%mm1 \n\t" /* mm1 = a0 - a2 - a3 */ |
"psubw %%mm6, %%mm3 \n\t" /* mm3 = a3 - a2 - a1 */ |
"paddw %%mm1, %%mm1 \n\t" |
"paddw %%mm3, %%mm3 \n\t" |
"psubw %%mm2, %%mm1 \n\t" /* mm1 = b7 */ |
"paddw %%mm0, %%mm3 \n\t" /* mm3 = b6 */ |
"movq 32(%0), %%mm2 \n\t" /* mm2 = src2 */ |
"movq 96(%0), %%mm6 \n\t" /* mm6 = src6 */ |
"movq %%mm2, %%mm4 \n\t" |
"movq %%mm6, %%mm0 \n\t" |
"psllw $2, %%mm4 \n\t" /* mm4 = 4*src2 */ |
"psllw $2, %%mm6 \n\t" /* mm6 = 4*src6 */ |
"paddw %%mm4, %%mm2 \n\t" /* mm2 = 5*src2 */ |
"paddw %%mm6, %%mm0 \n\t" /* mm0 = 5*src6 */ |
"paddw %%mm2, %%mm2 \n\t" |
"paddw %%mm0, %%mm0 \n\t" |
"psubw %%mm0, %%mm4 \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */ |
"paddw %%mm2, %%mm6 \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */ |
"movq (%0), %%mm2 \n\t" /* mm2 = src0 */ |
"movq 64(%0), %%mm0 \n\t" /* mm0 = src4 */ |
SUMSUB_BA( %%mm0, %%mm2 ) /* mm0 = src0+src4 mm2 = src0-src4 */ |
"psllw $3, %%mm0 \n\t" |
"psllw $3, %%mm2 \n\t" |
"paddw %1, %%mm0 \n\t" /* add rounding bias */ |
"paddw %1, %%mm2 \n\t" /* add rounding bias */ |
SUMSUB_BA( %%mm6, %%mm0 ) /* mm6 = a4 + a6 mm0 = a4 - a6 */ |
SUMSUB_BA( %%mm4, %%mm2 ) /* mm4 = a5 + a7 mm2 = a5 - a7 */ |
SUMSUB_BA( %%mm7, %%mm6 ) /* mm7 = dst0 mm6 = dst7 */ |
SUMSUB_BA( %%mm5, %%mm4 ) /* mm5 = dst1 mm4 = dst6 */ |
SUMSUB_BA( %%mm3, %%mm2 ) /* mm3 = dst2 mm2 = dst5 */ |
SUMSUB_BA( %%mm1, %%mm0 ) /* mm1 = dst3 mm0 = dst4 */ |
:: "r"(block), "m"(bias) |
); |
} |
#define SBUTTERFLY(a,b,t,n,m)\ |
"mov" #m " " #a ", " #t " \n\t" /* abcd */\ |
"punpckl" #n " " #b ", " #a " \n\t" /* aebf */\ |
"punpckh" #n " " #b ", " #t " \n\t" /* cgdh */\ |
#define TRANSPOSE4(a,b,c,d,t)\ |
SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\ |
SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\ |
SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\ |
SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */ |
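/* TRANSPOSE4 transposes the 4x4 word matrix held in a,b,c,d (rows
 * "abcd efgh ijkl mnop") but leaves the result permuted: the transposed
 * rows end up in a, d, t and c respectively, which is why the stores in
 * cavs_idct8_add_mmx() below appear in a scrambled register order. */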
static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride) |
{ |
int i; |
DECLARE_ALIGNED(8, int16_t, b2)[64]; |
for(i=0; i<2; i++){ |
DECLARE_ALIGNED(8, uint64_t, tmp); |
cavs_idct8_1d(block+4*i, ff_pw_4.a); |
__asm__ volatile( |
"psraw $3, %%mm7 \n\t" |
"psraw $3, %%mm6 \n\t" |
"psraw $3, %%mm5 \n\t" |
"psraw $3, %%mm4 \n\t" |
"psraw $3, %%mm3 \n\t" |
"psraw $3, %%mm2 \n\t" |
"psraw $3, %%mm1 \n\t" |
"psraw $3, %%mm0 \n\t" |
"movq %%mm7, %0 \n\t" |
TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 ) |
"movq %%mm0, 8(%1) \n\t" |
"movq %%mm6, 24(%1) \n\t" |
"movq %%mm7, 40(%1) \n\t" |
"movq %%mm4, 56(%1) \n\t" |
"movq %0, %%mm7 \n\t" |
TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 ) |
"movq %%mm7, (%1) \n\t" |
"movq %%mm1, 16(%1) \n\t" |
"movq %%mm0, 32(%1) \n\t" |
"movq %%mm3, 48(%1) \n\t" |
: "=m"(tmp) |
: "r"(b2+32*i) |
: "memory" |
); |
} |
for(i=0; i<2; i++){ |
cavs_idct8_1d(b2+4*i, ff_pw_64.a); |
__asm__ volatile( |
"psraw $7, %%mm7 \n\t" |
"psraw $7, %%mm6 \n\t" |
"psraw $7, %%mm5 \n\t" |
"psraw $7, %%mm4 \n\t" |
"psraw $7, %%mm3 \n\t" |
"psraw $7, %%mm2 \n\t" |
"psraw $7, %%mm1 \n\t" |
"psraw $7, %%mm0 \n\t" |
"movq %%mm7, (%0) \n\t" |
"movq %%mm5, 16(%0) \n\t" |
"movq %%mm3, 32(%0) \n\t" |
"movq %%mm1, 48(%0) \n\t" |
"movq %%mm0, 64(%0) \n\t" |
"movq %%mm2, 80(%0) \n\t" |
"movq %%mm4, 96(%0) \n\t" |
"movq %%mm6, 112(%0) \n\t" |
:: "r"(b2+4*i) |
: "memory" |
); |
} |
ff_add_pixels_clamped_mmx(b2, dst, stride); |
} |
#endif /* HAVE_MMX_INLINE */ |
#if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) |
/***************************************************************************** |
* |
* motion compensation |
* |
****************************************************************************/ |
/* vertical filter [-1 -2 96 42 -7 0] */ |
#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \ |
"movd (%0), "#F" \n\t"\ |
"movq "#C", %%mm6 \n\t"\ |
"pmullw %5, %%mm6 \n\t"\ |
"movq "#D", %%mm7 \n\t"\ |
"pmullw "MANGLE(MUL2)", %%mm7\n\t"\ |
"psllw $3, "#E" \n\t"\ |
"psubw "#E", %%mm6 \n\t"\ |
"psraw $3, "#E" \n\t"\ |
"paddw %%mm7, %%mm6 \n\t"\ |
"paddw "#E", %%mm6 \n\t"\ |
"paddw "#B", "#B" \n\t"\ |
"pxor %%mm7, %%mm7 \n\t"\ |
"add %2, %0 \n\t"\ |
"punpcklbw %%mm7, "#F" \n\t"\ |
"psubw "#B", %%mm6 \n\t"\ |
"psraw $1, "#B" \n\t"\ |
"psubw "#A", %%mm6 \n\t"\ |
"paddw %4, %%mm6 \n\t"\ |
"psraw $7, %%mm6 \n\t"\ |
"packuswb %%mm6, %%mm6 \n\t"\ |
OP(%%mm6, (%1), A, d) \ |
"add %3, %1 \n\t" |
/* vertical filter [ 0 -1 5 5 -1 0] */ |
#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \ |
"movd (%0), "#F" \n\t"\ |
"movq "#C", %%mm6 \n\t"\ |
"paddw "#D", %%mm6 \n\t"\ |
"pmullw %5, %%mm6 \n\t"\ |
"add %2, %0 \n\t"\ |
"punpcklbw %%mm7, "#F" \n\t"\ |
"psubw "#B", %%mm6 \n\t"\ |
"psubw "#E", %%mm6 \n\t"\ |
"paddw %4, %%mm6 \n\t"\ |
"psraw $3, %%mm6 \n\t"\ |
"packuswb %%mm6, %%mm6 \n\t"\ |
OP(%%mm6, (%1), A, d) \ |
"add %3, %1 \n\t" |
/* vertical filter [ 0 -7 42 96 -2 -1] */ |
#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \ |
"movd (%0), "#F" \n\t"\ |
"movq "#C", %%mm6 \n\t"\ |
"pmullw "MANGLE(MUL2)", %%mm6\n\t"\ |
"movq "#D", %%mm7 \n\t"\ |
"pmullw %5, %%mm7 \n\t"\ |
"psllw $3, "#B" \n\t"\ |
"psubw "#B", %%mm6 \n\t"\ |
"psraw $3, "#B" \n\t"\ |
"paddw %%mm7, %%mm6 \n\t"\ |
"paddw "#B", %%mm6 \n\t"\ |
"paddw "#E", "#E" \n\t"\ |
"pxor %%mm7, %%mm7 \n\t"\ |
"add %2, %0 \n\t"\ |
"punpcklbw %%mm7, "#F" \n\t"\ |
"psubw "#E", %%mm6 \n\t"\ |
"psraw $1, "#E" \n\t"\ |
"psubw "#F", %%mm6 \n\t"\ |
"paddw %4, %%mm6 \n\t"\ |
"psraw $7, %%mm6 \n\t"\ |
"packuswb %%mm6, %%mm6 \n\t"\ |
OP(%%mm6, (%1), A, d) \ |
"add %3, %1 \n\t" |
#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\ |
int w= 2;\ |
src -= 2*srcStride;\ |
\ |
while(w--){\ |
__asm__ volatile(\ |
"pxor %%mm7, %%mm7 \n\t"\ |
"movd (%0), %%mm0 \n\t"\ |
"add %2, %0 \n\t"\ |
"movd (%0), %%mm1 \n\t"\ |
"add %2, %0 \n\t"\ |
"movd (%0), %%mm2 \n\t"\ |
"add %2, %0 \n\t"\ |
"movd (%0), %%mm3 \n\t"\ |
"add %2, %0 \n\t"\ |
"movd (%0), %%mm4 \n\t"\ |
"add %2, %0 \n\t"\ |
"punpcklbw %%mm7, %%mm0 \n\t"\ |
"punpcklbw %%mm7, %%mm1 \n\t"\ |
"punpcklbw %%mm7, %%mm2 \n\t"\ |
"punpcklbw %%mm7, %%mm3 \n\t"\ |
"punpcklbw %%mm7, %%mm4 \n\t"\ |
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\ |
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\ |
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\ |
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\ |
VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\ |
VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\ |
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\ |
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\ |
\ |
: "+a"(src), "+c"(dst)\ |
: "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\ |
: "memory"\ |
);\ |
if(h==16){\ |
__asm__ volatile(\ |
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\ |
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\ |
VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\ |
VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\ |
VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\ |
VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\ |
VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\ |
VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\ |
\ |
: "+a"(src), "+c"(dst)\ |
: "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\ |
: "memory"\ |
);\ |
}\ |
src += 4-(h+5)*srcStride;\ |
dst += 4-h*dstStride;\ |
} |
#define QPEL_CAVS(OPNAME, OP, MMX)\ |
static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
int h=8;\ |
__asm__ volatile(\ |
"pxor %%mm7, %%mm7 \n\t"\ |
"movq %5, %%mm6 \n\t"\ |
"1: \n\t"\ |
"movq (%0), %%mm0 \n\t"\ |
"movq 1(%0), %%mm2 \n\t"\ |
"movq %%mm0, %%mm1 \n\t"\ |
"movq %%mm2, %%mm3 \n\t"\ |
"punpcklbw %%mm7, %%mm0 \n\t"\ |
"punpckhbw %%mm7, %%mm1 \n\t"\ |
"punpcklbw %%mm7, %%mm2 \n\t"\ |
"punpckhbw %%mm7, %%mm3 \n\t"\ |
"paddw %%mm2, %%mm0 \n\t"\ |
"paddw %%mm3, %%mm1 \n\t"\ |
"pmullw %%mm6, %%mm0 \n\t"\ |
"pmullw %%mm6, %%mm1 \n\t"\ |
"movq -1(%0), %%mm2 \n\t"\ |
"movq 2(%0), %%mm4 \n\t"\ |
"movq %%mm2, %%mm3 \n\t"\ |
"movq %%mm4, %%mm5 \n\t"\ |
"punpcklbw %%mm7, %%mm2 \n\t"\ |
"punpckhbw %%mm7, %%mm3 \n\t"\ |
"punpcklbw %%mm7, %%mm4 \n\t"\ |
"punpckhbw %%mm7, %%mm5 \n\t"\ |
"paddw %%mm4, %%mm2 \n\t"\ |
"paddw %%mm3, %%mm5 \n\t"\ |
"psubw %%mm2, %%mm0 \n\t"\ |
"psubw %%mm5, %%mm1 \n\t"\ |
"movq %6, %%mm5 \n\t"\ |
"paddw %%mm5, %%mm0 \n\t"\ |
"paddw %%mm5, %%mm1 \n\t"\ |
"psraw $3, %%mm0 \n\t"\ |
"psraw $3, %%mm1 \n\t"\ |
"packuswb %%mm1, %%mm0 \n\t"\ |
OP(%%mm0, (%1),%%mm5, q) \ |
"add %3, %0 \n\t"\ |
"add %4, %1 \n\t"\ |
"decl %2 \n\t"\ |
" jnz 1b \n\t"\ |
: "+a"(src), "+c"(dst), "+m"(h)\ |
: "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\ |
: "memory"\ |
);\ |
}\ |
\ |
static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42) \ |
}\ |
\ |
static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5) \ |
}\ |
\ |
static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42) \ |
}\ |
\ |
static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 8);\ |
}\ |
static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst , src , dstStride, srcStride, 16);\ |
OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ |
}\ |
\ |
static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 8);\ |
}\ |
static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst , src , dstStride, srcStride, 16);\ |
OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ |
}\ |
\ |
static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 8);\ |
}\ |
static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst , src , dstStride, srcStride, 16);\ |
OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ |
}\ |
\ |
static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\ |
OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\ |
src += 8*srcStride;\ |
dst += 8*dstStride;\ |
OPNAME ## cavs_qpel8_h_ ## MMX(dst , src , dstStride, srcStride);\ |
OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\ |
}
#define CAVS_MC(OPNAME, SIZE, MMX) \ |
static void OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\ |
}\ |
\ |
static void OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\ |
}\ |
\ |
static void OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\ |
}\ |
\ |
static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\ |
}
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b " \n\t" |
#define AVG_3DNOW_OP(a,b,temp, size) \ |
"mov" #size " " #b ", " #temp " \n\t"\ |
"pavgusb " #temp ", " #a " \n\t"\ |
"mov" #size " " #a ", " #b " \n\t" |
#define AVG_MMXEXT_OP(a, b, temp, size) \ |
"mov" #size " " #b ", " #temp " \n\t"\ |
"pavgb " #temp ", " #a " \n\t"\ |
"mov" #size " " #a ", " #b " \n\t" |
#endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */ |
#if HAVE_MMX_INLINE |
static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src, |
ptrdiff_t stride) |
{ |
ff_put_pixels8_mmx(dst, src, stride, 8); |
} |
static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src, |
ptrdiff_t stride) |
{ |
ff_avg_pixels8_mmx(dst, src, stride, 8); |
} |
static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src, |
ptrdiff_t stride) |
{ |
ff_put_pixels16_mmx(dst, src, stride, 16); |
} |
static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src, |
ptrdiff_t stride) |
{ |
ff_avg_pixels16_mmx(dst, src, stride, 16); |
} |
static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c, |
AVCodecContext *avctx) |
{ |
c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx; |
c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx; |
c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx; |
c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx; |
c->cavs_idct8_add = cavs_idct8_add_mmx; |
c->idct_perm = FF_TRANSPOSE_IDCT_PERM; |
} |
#endif /* HAVE_MMX_INLINE */ |
#define DSPFUNC(PFX, IDX, NUM, EXT) \ |
c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \ |
c->PFX ## _cavs_qpel_pixels_tab[IDX][ 4] = PFX ## _cavs_qpel ## NUM ## _mc01_ ## EXT; \ |
c->PFX ## _cavs_qpel_pixels_tab[IDX][ 8] = PFX ## _cavs_qpel ## NUM ## _mc02_ ## EXT; \ |
c->PFX ## _cavs_qpel_pixels_tab[IDX][12] = PFX ## _cavs_qpel ## NUM ## _mc03_ ## EXT;
#if HAVE_MMXEXT_INLINE |
QPEL_CAVS(put_, PUT_OP, mmxext) |
QPEL_CAVS(avg_, AVG_MMXEXT_OP, mmxext) |
CAVS_MC(put_, 8, mmxext) |
CAVS_MC(put_, 16, mmxext) |
CAVS_MC(avg_, 8, mmxext) |
CAVS_MC(avg_, 16, mmxext) |
static av_cold void cavsdsp_init_mmxext(CAVSDSPContext *c, |
AVCodecContext *avctx) |
{ |
DSPFUNC(put, 0, 16, mmxext); |
DSPFUNC(put, 1, 8, mmxext); |
DSPFUNC(avg, 0, 16, mmxext); |
DSPFUNC(avg, 1, 8, mmxext); |
} |
#endif /* HAVE_MMXEXT_INLINE */ |
#if HAVE_AMD3DNOW_INLINE |
QPEL_CAVS(put_, PUT_OP, 3dnow) |
QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow) |
CAVS_MC(put_, 8, 3dnow) |
CAVS_MC(put_, 16,3dnow) |
CAVS_MC(avg_, 8, 3dnow) |
CAVS_MC(avg_, 16,3dnow) |
static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c, |
AVCodecContext *avctx) |
{ |
DSPFUNC(put, 0, 16, 3dnow); |
DSPFUNC(put, 1, 8, 3dnow); |
DSPFUNC(avg, 0, 16, 3dnow); |
DSPFUNC(avg, 1, 8, 3dnow); |
} |
#endif /* HAVE_AMD3DNOW_INLINE */ |
av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx) |
{ |
#if HAVE_MMX_INLINE |
int cpu_flags = av_get_cpu_flags(); |
if (INLINE_MMX(cpu_flags)) |
cavsdsp_init_mmx(c, avctx); |
#endif /* HAVE_MMX_INLINE */ |
#if HAVE_MMXEXT_INLINE |
if (INLINE_MMXEXT(cpu_flags)) |
cavsdsp_init_mmxext(c, avctx); |
#endif /* HAVE_MMXEXT_INLINE */ |
#if HAVE_AMD3DNOW_INLINE |
if (INLINE_AMD3DNOW(cpu_flags)) |
cavsdsp_init_3dnow(c, avctx); |
#endif /* HAVE_AMD3DNOW_INLINE */ |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/constants.c
/* |
* MMX/SSE constants used across x86 dsp optimizations. |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/mem.h" |
#include "libavutil/x86/asm.h" // for xmm_reg |
#include "constants.h" |
DECLARE_ALIGNED(8, const uint64_t, ff_wtwo) = 0x0002000200020002ULL; |
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1) = { 0x0001000100010001ULL, 0x0001000100010001ULL }; |
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_2) = { 0x0002000200020002ULL, 0x0002000200020002ULL }; |
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_3) = { 0x0003000300030003ULL, 0x0003000300030003ULL }; |
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_4) = { 0x0004000400040004ULL, 0x0004000400040004ULL }; |
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_5) = { 0x0005000500050005ULL, 0x0005000500050005ULL }; |
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_8) = { 0x0008000800080008ULL, 0x0008000800080008ULL }; |
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_9) = { 0x0009000900090009ULL, 0x0009000900090009ULL }; |
DECLARE_ALIGNED(8, const uint64_t, ff_pw_15) = 0x000F000F000F000FULL; |
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_16) = { 0x0010001000100010ULL, 0x0010001000100010ULL }; |
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_17) = { 0x0011001100110011ULL, 0x0011001100110011ULL }; |
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_18) = { 0x0012001200120012ULL, 0x0012001200120012ULL }; |
DECLARE_ALIGNED(8, const uint64_t, ff_pw_20) = 0x0014001400140014ULL; |
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_32) = { 0x0020002000200020ULL, 0x0020002000200020ULL }; |
DECLARE_ALIGNED(8, const uint64_t, ff_pw_42) = 0x002A002A002A002AULL; |
DECLARE_ALIGNED(8, const uint64_t, ff_pw_53) = 0x0035003500350035ULL; |
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_64) = { 0x0040004000400040ULL, 0x0040004000400040ULL }; |
DECLARE_ALIGNED(8, const uint64_t, ff_pw_96) = 0x0060006000600060ULL; |
DECLARE_ALIGNED(8, const uint64_t, ff_pw_128) = 0x0080008000800080ULL; |
DECLARE_ALIGNED(8, const uint64_t, ff_pw_255) = 0x00ff00ff00ff00ffULL; |
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_512) = { 0x0200020002000200ULL, 0x0200020002000200ULL }; |
DECLARE_ALIGNED(16, const xmm_reg, ff_pw_1019) = { 0x03FB03FB03FB03FBULL, 0x03FB03FB03FB03FBULL }; |
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_0) = { 0x0000000000000000ULL, 0x0000000000000000ULL }; |
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_1) = { 0x0101010101010101ULL, 0x0101010101010101ULL }; |
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_3) = { 0x0303030303030303ULL, 0x0303030303030303ULL }; |
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_80) = { 0x8080808080808080ULL, 0x8080808080808080ULL };
DECLARE_ALIGNED(16, const xmm_reg, ff_pb_F8) = { 0xF8F8F8F8F8F8F8F8ULL, 0xF8F8F8F8F8F8F8F8ULL };
DECLARE_ALIGNED(8, const uint64_t, ff_pb_FC) = 0xFCFCFCFCFCFCFCFCULL;
/contrib/sdk/sources/ffmpeg/libavcodec/x86/constants.h
/* |
* MMX/SSE constants used across x86 dsp optimizations. |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#ifndef AVCODEC_X86_CONSTANTS_H |
#define AVCODEC_X86_CONSTANTS_H |
#include <stdint.h> |
#include "libavutil/x86/asm.h" |
extern const uint64_t ff_wtwo; |
extern const xmm_reg ff_pw_3; |
extern const xmm_reg ff_pw_4; |
extern const xmm_reg ff_pw_5; |
extern const xmm_reg ff_pw_8; |
extern const uint64_t ff_pw_15; |
extern const xmm_reg ff_pw_16; |
extern const xmm_reg ff_pw_18; |
extern const uint64_t ff_pw_20; |
extern const xmm_reg ff_pw_32; |
extern const uint64_t ff_pw_42; |
extern const uint64_t ff_pw_53; |
extern const xmm_reg ff_pw_64; |
extern const uint64_t ff_pw_96; |
extern const uint64_t ff_pw_128; |
extern const uint64_t ff_pw_255; |
extern const xmm_reg ff_pb_1; |
extern const xmm_reg ff_pb_3; |
extern const xmm_reg ff_pb_F8; |
extern const uint64_t ff_pb_FC; |
#endif /* AVCODEC_X86_CONSTANTS_H */ |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dct32.asm
;****************************************************************************** |
;* 32 point SSE-optimized DCT transform |
;* Copyright (c) 2010 Vitor Sessak |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA 32 |
align 32 |
ps_cos_vec: dd 0.500603, 0.505471, 0.515447, 0.531043 |
dd 0.553104, 0.582935, 0.622504, 0.674808 |
dd -10.190008, -3.407609, -2.057781, -1.484165 |
dd -1.169440, -0.972568, -0.839350, -0.744536 |
dd 0.502419, 0.522499, 0.566944, 0.646822 |
dd 0.788155, 1.060678, 1.722447, 5.101149 |
dd 0.509796, 0.601345, 0.899976, 2.562916 |
dd 0.509796, 0.601345, 0.899976, 2.562916 |
dd 1.000000, 1.000000, 1.306563, 0.541196 |
dd 1.000000, 1.000000, 1.306563, 0.541196 |
dd 1.000000, 0.707107, 1.000000, -0.707107 |
dd 1.000000, 0.707107, 1.000000, -0.707107 |
dd 0.707107, 0.707107, 0.707107, 0.707107 |
align 32 |
ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000, 0, 0, 0x80000000, 0x80000000 |
%macro BUTTERFLY 4 |
subps %4, %1, %2 |
addps %2, %2, %1 |
mulps %1, %4, %3 |
%endmacro |
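; BUTTERFLY: with inputs a = %1, b = %2 and cosine coefficients c = %3,
; it leaves %2 = a + b and %1 = (a - b) * c (%4 is scratch) -- one
; radix-2 DCT butterfly per vector lane.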
%macro BUTTERFLY0 5 |
%if cpuflag(sse2) && notcpuflag(avx) |
pshufd %4, %1, %5 |
xorps %1, %2 |
addps %1, %4 |
mulps %1, %3 |
%else |
shufps %4, %1, %1, %5 |
xorps %1, %1, %2 |
addps %4, %4, %1 |
mulps %1, %4, %3 |
%endif |
%endmacro |
%macro BUTTERFLY2 4 |
BUTTERFLY0 %1, %2, %3, %4, 0x1b |
%endmacro |
%macro BUTTERFLY3 4 |
BUTTERFLY0 %1, %2, %3, %4, 0xb1 |
%endmacro |
%macro BUTTERFLY3V 5 |
movaps m%5, m%1 |
addps m%1, m%2 |
subps m%5, m%2 |
SWAP %2, %5 |
mulps m%2, [ps_cos_vec+192] |
movaps m%5, m%3 |
addps m%3, m%4 |
subps m%4, m%5 |
mulps m%4, [ps_cos_vec+192] |
%endmacro |
%macro PASS6_AND_PERMUTE 0 |
mov tmpd, [outq+4] |
movss m7, [outq+72] |
addss m7, [outq+76] |
movss m3, [outq+56] |
addss m3, [outq+60] |
addss m4, m3 |
movss m2, [outq+52] |
addss m2, m3 |
movss m3, [outq+104] |
addss m3, [outq+108] |
addss m1, m3 |
addss m5, m4 |
movss [outq+ 16], m1 |
movss m1, [outq+100] |
addss m1, m3 |
movss m3, [outq+40] |
movss [outq+ 48], m1 |
addss m3, [outq+44] |
movss m1, [outq+100] |
addss m4, m3 |
addss m3, m2 |
addss m1, [outq+108] |
movss [outq+ 40], m3 |
addss m2, [outq+36] |
movss m3, [outq+8] |
movss [outq+ 56], m2 |
addss m3, [outq+12] |
movss [outq+ 32], m3 |
movss m3, [outq+80] |
movss [outq+ 8], m5 |
movss [outq+ 80], m1 |
movss m2, [outq+52] |
movss m5, [outq+120] |
addss m5, [outq+124] |
movss m1, [outq+64] |
addss m2, [outq+60] |
addss m0, m5 |
addss m5, [outq+116] |
mov [outq+64], tmpd |
addss m6, m0 |
addss m1, m6 |
mov tmpd, [outq+12] |
mov [outq+ 96], tmpd |
movss [outq+ 4], m1 |
movss m1, [outq+24] |
movss [outq+ 24], m4 |
movss m4, [outq+88] |
addss m4, [outq+92] |
addss m3, m4 |
addss m4, [outq+84] |
mov tmpd, [outq+108] |
addss m1, [outq+28] |
addss m0, m1 |
addss m1, m5 |
addss m6, m3 |
addss m3, m0 |
addss m0, m7 |
addss m5, [outq+20] |
addss m7, m1 |
movss [outq+ 12], m6 |
mov [outq+112], tmpd |
movss m6, [outq+28] |
movss [outq+ 28], m0 |
movss m0, [outq+36] |
movss [outq+ 36], m7 |
addss m1, m4 |
movss m7, [outq+116] |
addss m0, m2 |
addss m7, [outq+124] |
movss [outq+ 72], m0 |
movss m0, [outq+44] |
addss m2, m0 |
movss [outq+ 44], m1 |
movss [outq+ 88], m2 |
addss m0, [outq+60] |
mov tmpd, [outq+60] |
mov [outq+120], tmpd |
movss [outq+104], m0 |
addss m4, m5 |
addss m5, [outq+68] |
movss [outq+52], m4 |
movss [outq+60], m5 |
movss m4, [outq+68] |
movss m5, [outq+20] |
movss [outq+ 20], m3 |
addss m5, m7 |
addss m7, m6 |
addss m4, m5 |
movss m2, [outq+84] |
addss m2, [outq+92] |
addss m5, m2 |
movss [outq+ 68], m4 |
addss m2, m7 |
movss m4, [outq+76] |
movss [outq+ 84], m2 |
movss [outq+ 76], m5 |
addss m7, m4 |
addss m6, [outq+124] |
addss m4, m6 |
addss m6, [outq+92] |
movss [outq+100], m4 |
movss [outq+108], m6 |
movss m6, [outq+92] |
movss [outq+92], m7 |
addss m6, [outq+124] |
movss [outq+116], m6 |
%endmacro |
INIT_YMM avx |
SECTION_TEXT |
%if HAVE_AVX_EXTERNAL |
; void ff_dct32_float_avx(FFTSample *out, const FFTSample *in) |
cglobal dct32_float, 2,3,8, out, in, tmp |
; pass 1 |
vmovaps m4, [inq+0] |
vinsertf128 m5, m5, [inq+96], 1 |
vinsertf128 m5, m5, [inq+112], 0 |
vshufps m5, m5, m5, 0x1b |
BUTTERFLY m4, m5, [ps_cos_vec], m6 |
vmovaps m2, [inq+64] |
vinsertf128 m6, m6, [inq+32], 1 |
vinsertf128 m6, m6, [inq+48], 0 |
vshufps m6, m6, m6, 0x1b |
BUTTERFLY m2, m6, [ps_cos_vec+32], m0 |
; pass 2 |
BUTTERFLY m5, m6, [ps_cos_vec+64], m0 |
BUTTERFLY m4, m2, [ps_cos_vec+64], m7 |
; pass 3 |
vperm2f128 m3, m6, m4, 0x31 |
vperm2f128 m1, m6, m4, 0x20 |
vshufps m3, m3, m3, 0x1b |
BUTTERFLY m1, m3, [ps_cos_vec+96], m6 |
vperm2f128 m4, m5, m2, 0x20 |
vperm2f128 m5, m5, m2, 0x31 |
vshufps m5, m5, m5, 0x1b |
BUTTERFLY m4, m5, [ps_cos_vec+96], m6 |
; pass 4 |
vmovaps m6, [ps_p1p1m1m1+0] |
vmovaps m2, [ps_cos_vec+128] |
BUTTERFLY2 m5, m6, m2, m7 |
BUTTERFLY2 m4, m6, m2, m7 |
BUTTERFLY2 m1, m6, m2, m7 |
BUTTERFLY2 m3, m6, m2, m7 |
; pass 5 |
vshufps m6, m6, m6, 0xcc |
vmovaps m2, [ps_cos_vec+160] |
BUTTERFLY3 m5, m6, m2, m7 |
BUTTERFLY3 m4, m6, m2, m7 |
BUTTERFLY3 m1, m6, m2, m7 |
BUTTERFLY3 m3, m6, m2, m7 |
vperm2f128 m6, m3, m3, 0x31 |
vmovaps [outq], m3 |
vextractf128 [outq+64], m5, 1 |
vextractf128 [outq+32], m5, 0 |
vextractf128 [outq+80], m4, 1 |
vextractf128 [outq+48], m4, 0 |
vperm2f128 m0, m1, m1, 0x31 |
vmovaps [outq+96], m1 |
vzeroupper |
; pass 6, no SIMD... |
INIT_XMM |
PASS6_AND_PERMUTE |
RET |
%endif |
%if ARCH_X86_64 |
%define SPILL SWAP |
%define UNSPILL SWAP |
%macro PASS5 0 |
nop ; FIXME code alignment |
SWAP 5, 8 |
SWAP 4, 12 |
SWAP 6, 14 |
SWAP 7, 13 |
SWAP 0, 15 |
PERMUTE 9,10, 10,12, 11,14, 12,9, 13,11, 14,13 |
TRANSPOSE4x4PS 8, 9, 10, 11, 0 |
BUTTERFLY3V 8, 9, 10, 11, 0 |
addps m10, m11 |
TRANSPOSE4x4PS 12, 13, 14, 15, 0 |
BUTTERFLY3V 12, 13, 14, 15, 0 |
addps m14, m15 |
addps m12, m14 |
addps m14, m13 |
addps m13, m15 |
%endmacro |
%macro PASS6 0 |
SWAP 9, 12 |
SWAP 11, 14 |
movss [outq+0x00], m8 |
pshuflw m0, m8, 0xe |
movss [outq+0x10], m9 |
pshuflw m1, m9, 0xe |
movss [outq+0x20], m10 |
pshuflw m2, m10, 0xe |
movss [outq+0x30], m11 |
pshuflw m3, m11, 0xe |
movss [outq+0x40], m12 |
pshuflw m4, m12, 0xe |
movss [outq+0x50], m13 |
pshuflw m5, m13, 0xe |
movss [outq+0x60], m14 |
pshuflw m6, m14, 0xe |
movaps [outq+0x70], m15 |
pshuflw m7, m15, 0xe |
addss m0, m1 |
addss m1, m2 |
movss [outq+0x08], m0 |
addss m2, m3 |
movss [outq+0x18], m1 |
addss m3, m4 |
movss [outq+0x28], m2 |
addss m4, m5 |
movss [outq+0x38], m3 |
addss m5, m6 |
movss [outq+0x48], m4 |
addss m6, m7 |
movss [outq+0x58], m5 |
movss [outq+0x68], m6 |
movss [outq+0x78], m7 |
PERMUTE 1,8, 3,9, 5,10, 7,11, 9,12, 11,13, 13,14, 8,1, 10,3, 12,5, 14,7 |
movhlps m0, m1 |
pshufd m1, m1, 3 |
SWAP 0, 2, 4, 6, 8, 10, 12, 14 |
SWAP 1, 3, 5, 7, 9, 11, 13, 15 |
%rep 7 |
movhlps m0, m1 |
pshufd m1, m1, 3 |
addss m15, m1 |
SWAP 0, 2, 4, 6, 8, 10, 12, 14 |
SWAP 1, 3, 5, 7, 9, 11, 13, 15 |
%endrep |
%assign i 4 |
%rep 15 |
addss m0, m1 |
movss [outq+i], m0 |
SWAP 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15 |
%assign i i+8 |
%endrep |
%endmacro |
%else ; ARCH_X86_32 |
%macro SPILL 2 ; xmm#, mempos |
movaps [outq+(%2-8)*16], m%1 |
%endmacro |
%macro UNSPILL 2 |
movaps m%1, [outq+(%2-8)*16] |
%endmacro |
%define PASS6 PASS6_AND_PERMUTE |
%macro PASS5 0 |
movaps m2, [ps_cos_vec+160] |
shufps m3, m3, 0xcc |
BUTTERFLY3 m5, m3, m2, m1 |
SPILL 5, 8 |
UNSPILL 1, 9 |
BUTTERFLY3 m1, m3, m2, m5 |
SPILL 1, 14 |
BUTTERFLY3 m4, m3, m2, m5 |
SPILL 4, 12 |
BUTTERFLY3 m7, m3, m2, m5 |
SPILL 7, 13 |
UNSPILL 5, 10 |
BUTTERFLY3 m5, m3, m2, m7 |
SPILL 5, 10 |
UNSPILL 4, 11 |
BUTTERFLY3 m4, m3, m2, m7 |
SPILL 4, 11 |
BUTTERFLY3 m6, m3, m2, m7 |
SPILL 6, 9 |
BUTTERFLY3 m0, m3, m2, m7 |
SPILL 0, 15 |
%endmacro |
%endif |
; void ff_dct32_float_sse(FFTSample *out, const FFTSample *in) |
%macro DCT32_FUNC 0 |
cglobal dct32_float, 2, 3, 16, out, in, tmp |
; pass 1 |
movaps m0, [inq+0] |
LOAD_INV m1, [inq+112] |
BUTTERFLY m0, m1, [ps_cos_vec], m3 |
movaps m7, [inq+64] |
LOAD_INV m4, [inq+48] |
BUTTERFLY m7, m4, [ps_cos_vec+32], m3 |
; pass 2 |
movaps m2, [ps_cos_vec+64] |
BUTTERFLY m1, m4, m2, m3 |
SPILL 1, 11 |
SPILL 4, 8 |
; pass 1 |
movaps m1, [inq+16] |
LOAD_INV m6, [inq+96] |
BUTTERFLY m1, m6, [ps_cos_vec+16], m3 |
movaps m4, [inq+80] |
LOAD_INV m5, [inq+32] |
BUTTERFLY m4, m5, [ps_cos_vec+48], m3 |
; pass 2 |
BUTTERFLY m0, m7, m2, m3 |
movaps m2, [ps_cos_vec+80] |
BUTTERFLY m6, m5, m2, m3 |
BUTTERFLY m1, m4, m2, m3 |
; pass 3 |
movaps m2, [ps_cos_vec+96] |
shufps m1, m1, 0x1b |
BUTTERFLY m0, m1, m2, m3 |
SPILL 0, 15 |
SPILL 1, 14 |
UNSPILL 0, 8 |
shufps m5, m5, 0x1b |
BUTTERFLY m0, m5, m2, m3 |
UNSPILL 1, 11 |
shufps m6, m6, 0x1b |
BUTTERFLY m1, m6, m2, m3 |
SPILL 1, 11 |
shufps m4, m4, 0x1b |
BUTTERFLY m7, m4, m2, m3 |
; pass 4 |
movaps m3, [ps_p1p1m1m1+0] |
movaps m2, [ps_cos_vec+128] |
BUTTERFLY2 m5, m3, m2, m1 |
BUTTERFLY2 m0, m3, m2, m1 |
SPILL 0, 9 |
BUTTERFLY2 m6, m3, m2, m1 |
SPILL 6, 10 |
UNSPILL 0, 11 |
BUTTERFLY2 m0, m3, m2, m1 |
SPILL 0, 11 |
BUTTERFLY2 m4, m3, m2, m1 |
BUTTERFLY2 m7, m3, m2, m1 |
UNSPILL 6, 14 |
BUTTERFLY2 m6, m3, m2, m1 |
UNSPILL 0, 15 |
BUTTERFLY2 m0, m3, m2, m1 |
PASS5 |
PASS6 |
RET |
%endmacro |
%macro LOAD_INV 2 |
%if cpuflag(sse2) |
pshufd %1, %2, 0x1b |
%elif cpuflag(sse) |
movaps %1, %2 |
shufps %1, %1, 0x1b |
%endif |
%endmacro |
INIT_XMM sse |
DCT32_FUNC |
INIT_XMM sse2 |
DCT32_FUNC |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dct_init.c |
---|
0,0 → 1,39 |
/* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "config.h" |
#include "libavutil/attributes.h" |
#include "libavutil/cpu.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/dct.h" |
void ff_dct32_float_sse(FFTSample *out, const FFTSample *in); |
void ff_dct32_float_sse2(FFTSample *out, const FFTSample *in); |
void ff_dct32_float_avx(FFTSample *out, const FFTSample *in); |
av_cold void ff_dct_init_x86(DCTContext *s) |
{ |
int cpu_flags = av_get_cpu_flags(); |
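/* the checks run from oldest to newest ISA, so the fastest supported |
 * implementation is assigned last and wins */ |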
if (EXTERNAL_SSE(cpu_flags)) |
s->dct32 = ff_dct32_float_sse; |
if (EXTERNAL_SSE2(cpu_flags)) |
s->dct32 = ff_dct32_float_sse2; |
if (EXTERNAL_AVX(cpu_flags)) |
s->dct32 = ff_dct32_float_avx; |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/deinterlace.asm |
---|
0,0 → 1,82 |
;****************************************************************************** |
;* MMX optimized deinterlacing functions |
;* Copyright (c) 2010 Vitor Sessak |
;* Copyright (c) 2002 Michael Niedermayer |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
cextern pw_4 |
SECTION .text |
%macro DEINTERLACE 1 |
%ifidn %1, inplace |
;void ff_deinterlace_line_inplace_mmx(uint8_t *lum_m4, const uint8_t *lum_m3, uint8_t *lum_m2, const uint8_t *lum_m1, const uint8_t *lum, int size) |
cglobal deinterlace_line_inplace_mmx, 6,6,7, lum_m4, lum_m3, lum_m2, lum_m1, lum, size |
%else |
;void ff_deinterlace_line_mmx(uint8_t *dst, const uint8_t *lum_m4, const uint8_t *lum_m3, const uint8_t *lum_m2, const uint8_t *lum_m1, const uint8_t *lum, int size) |
cglobal deinterlace_line_mmx, 7,7,7, dst, lum_m4, lum_m3, lum_m2, lum_m1, lum, size |
%endif |
pxor mm7, mm7 |
movq mm6, [pw_4] |
.nextrow: |
movd mm0, [lum_m4q] |
movd mm1, [lum_m3q] |
movd mm2, [lum_m2q] |
%ifidn %1, inplace |
movd [lum_m4q], mm2 |
%endif |
movd mm3, [lum_m1q] |
movd mm4, [lumq] |
punpcklbw mm0, mm7 |
punpcklbw mm1, mm7 |
punpcklbw mm2, mm7 |
punpcklbw mm3, mm7 |
punpcklbw mm4, mm7 |
paddw mm1, mm3 |
psllw mm2, 1 |
paddw mm0, mm4 |
psllw mm1, 2 |
paddw mm2, mm6 |
paddw mm1, mm2 |
psubusw mm1, mm0 |
psrlw mm1, 3 |
packuswb mm1, mm7 |
%ifidn %1, inplace |
movd [lum_m2q], mm1 |
%else |
movd [dstq], mm1 |
add dstq, 4 |
%endif |
add lum_m4q, 4 |
add lum_m3q, 4 |
add lum_m2q, 4 |
add lum_m1q, 4 |
add lumq, 4 |
sub sized, 4 |
jg .nextrow |
REP_RET |
%endmacro |
DEINTERLACE "" |
DEINTERLACE inplace |
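; Scalar reference for the filter above (per output pixel): |
;   sum    = 4*(lum_m3[i] + lum_m1[i]) + 2*lum_m2[i] + 4 - (lum_m4[i] + lum[i]); |
;   dst[i] = av_clip_uint8(FFMAX(sum, 0) >> 3)   ; psubusw clamps at 0, packuswb at 255 |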
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dirac_dwt.c |
---|
0,0 → 1,202 |
/* |
* MMX optimized discrete wavelet transform |
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
* Copyright (c) 2010 David Conrad |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/x86/asm.h" |
#include "dsputil_x86.h" |
#include "dirac_dwt.h" |
#define COMPOSE_VERTICAL(ext, align) \ |
void ff_vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \ |
void ff_vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width); \ |
void ff_vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \ |
void ff_vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, int width); \ |
void ff_vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width); \ |
void ff_horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\ |
void ff_horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w);\ |
\ |
static void vertical_compose53iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \ |
{ \ |
int i, width_align = width&~(align-1); \ |
\ |
for(i=width_align; i<width; i++) \ |
b1[i] = COMPOSE_53iL0(b0[i], b1[i], b2[i]); \ |
\ |
ff_vertical_compose53iL0##ext(b0, b1, b2, width_align); \ |
} \ |
\ |
static void vertical_compose_dirac53iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, int width) \ |
{ \ |
int i, width_align = width&~(align-1); \ |
\ |
for(i=width_align; i<width; i++) \ |
b1[i] = COMPOSE_DIRAC53iH0(b0[i], b1[i], b2[i]); \ |
\ |
ff_vertical_compose_dirac53iH0##ext(b0, b1, b2, width_align); \ |
} \ |
\ |
static void vertical_compose_dd137iL0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \ |
IDWTELEM *b3, IDWTELEM *b4, int width) \ |
{ \ |
int i, width_align = width&~(align-1); \ |
\ |
for(i=width_align; i<width; i++) \ |
b2[i] = COMPOSE_DD137iL0(b0[i], b1[i], b2[i], b3[i], b4[i]); \ |
\ |
ff_vertical_compose_dd137iL0##ext(b0, b1, b2, b3, b4, width_align); \ |
} \ |
\ |
static void vertical_compose_dd97iH0##ext(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, \ |
IDWTELEM *b3, IDWTELEM *b4, int width) \ |
{ \ |
int i, width_align = width&~(align-1); \ |
\ |
for(i=width_align; i<width; i++) \ |
b2[i] = COMPOSE_DD97iH0(b0[i], b1[i], b2[i], b3[i], b4[i]); \ |
\ |
ff_vertical_compose_dd97iH0##ext(b0, b1, b2, b3, b4, width_align); \ |
} \ |
static void vertical_compose_haar##ext(IDWTELEM *b0, IDWTELEM *b1, int width) \ |
{ \ |
int i, width_align = width&~(align-1); \ |
\ |
for(i=width_align; i<width; i++) { \ |
b0[i] = COMPOSE_HAARiL0(b0[i], b1[i]); \ |
b1[i] = COMPOSE_HAARiH0(b1[i], b0[i]); \ |
} \ |
\ |
ff_vertical_compose_haar##ext(b0, b1, width_align); \ |
} \ |
static void horizontal_compose_haar0i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\ |
{\ |
int w2= w>>1;\ |
int x= w2 - (w2&(align-1));\ |
ff_horizontal_compose_haar0i##ext(b, tmp, w);\ |
\ |
for (; x < w2; x++) {\ |
b[2*x ] = tmp[x];\ |
b[2*x+1] = COMPOSE_HAARiH0(b[x+w2], tmp[x]);\ |
}\ |
}\ |
static void horizontal_compose_haar1i##ext(IDWTELEM *b, IDWTELEM *tmp, int w)\ |
{\ |
int w2= w>>1;\ |
int x= w2 - (w2&(align-1));\ |
ff_horizontal_compose_haar1i##ext(b, tmp, w);\ |
\ |
for (; x < w2; x++) {\ |
b[2*x ] = (tmp[x] + 1)>>1;\ |
b[2*x+1] = (COMPOSE_HAARiH0(b[x+w2], tmp[x]) + 1)>>1;\ |
}\ |
} |
#if HAVE_YASM |
#if !ARCH_X86_64 |
COMPOSE_VERTICAL(_mmx, 4) |
#endif |
COMPOSE_VERTICAL(_sse2, 8) |
void ff_horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w); |
static void horizontal_compose_dd97i_ssse3(IDWTELEM *b, IDWTELEM *tmp, int w) |
{ |
int w2= w>>1; |
int x= w2 - (w2&7); |
ff_horizontal_compose_dd97i_ssse3(b, tmp, w); |
for (; x < w2; x++) { |
b[2*x ] = (tmp[x] + 1)>>1; |
b[2*x+1] = (COMPOSE_DD97iH0(tmp[x-1], tmp[x], b[x+w2], tmp[x+1], tmp[x+2]) + 1)>>1; |
} |
} |
#endif |
void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type) |
{ |
#if HAVE_YASM |
int mm_flags = av_get_cpu_flags(); |
#if !ARCH_X86_64 |
if (!(mm_flags & AV_CPU_FLAG_MMX)) |
return; |
switch (type) { |
case DWT_DIRAC_DD9_7: |
d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx; |
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx; |
break; |
case DWT_DIRAC_LEGALL5_3: |
d->vertical_compose_l0 = (void*)vertical_compose53iL0_mmx; |
d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_mmx; |
break; |
case DWT_DIRAC_DD13_7: |
d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_mmx; |
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_mmx; |
break; |
case DWT_DIRAC_HAAR0: |
d->vertical_compose = (void*)vertical_compose_haar_mmx; |
d->horizontal_compose = horizontal_compose_haar0i_mmx; |
break; |
case DWT_DIRAC_HAAR1: |
d->vertical_compose = (void*)vertical_compose_haar_mmx; |
d->horizontal_compose = horizontal_compose_haar1i_mmx; |
break; |
} |
#endif |
if (!(mm_flags & AV_CPU_FLAG_SSE2)) |
return; |
switch (type) { |
case DWT_DIRAC_DD9_7: |
d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2; |
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2; |
break; |
case DWT_DIRAC_LEGALL5_3: |
d->vertical_compose_l0 = (void*)vertical_compose53iL0_sse2; |
d->vertical_compose_h0 = (void*)vertical_compose_dirac53iH0_sse2; |
break; |
case DWT_DIRAC_DD13_7: |
d->vertical_compose_l0 = (void*)vertical_compose_dd137iL0_sse2; |
d->vertical_compose_h0 = (void*)vertical_compose_dd97iH0_sse2; |
break; |
case DWT_DIRAC_HAAR0: |
d->vertical_compose = (void*)vertical_compose_haar_sse2; |
d->horizontal_compose = horizontal_compose_haar0i_sse2; |
break; |
case DWT_DIRAC_HAAR1: |
d->vertical_compose = (void*)vertical_compose_haar_sse2; |
d->horizontal_compose = horizontal_compose_haar1i_sse2; |
break; |
} |
if (!(mm_flags & AV_CPU_FLAG_SSSE3)) |
return; |
switch (type) { |
case DWT_DIRAC_DD9_7: |
d->horizontal_compose = horizontal_compose_dd97i_ssse3; |
break; |
} |
#endif // HAVE_YASM |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dirac_dwt.h |
---|
0,0 → 1,30 |
/* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#ifndef AVCODEC_X86_DIRAC_DWT_H |
#define AVCODEC_X86_DIRAC_DWT_H |
#include "libavcodec/dirac_dwt.h" |
void ff_horizontal_compose_dd97i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x); |
void ff_horizontal_compose_haar1i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x); |
void ff_horizontal_compose_haar0i_end_c(IDWTELEM *b, IDWTELEM *tmp, int w2, int x); |
void ff_spatial_idwt_init_mmx(DWTContext *d, enum dwt_type type); |
#endif |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/diracdsp_mmx.c |
---|
0,0 → 1,104 |
/* |
* Copyright (C) 2010 David Conrad |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "dsputil_x86.h" |
#include "diracdsp_mmx.h" |
void ff_put_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); |
void ff_put_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); |
void ff_put_signed_rect_clamped_mmx(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); |
void ff_put_signed_rect_clamped_sse2(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height); |
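/* The wrapper below derives the three half-pel planes row by row: dstv from |
 * a vertical pass over src, dsth from a horizontal pass over src, and dstc |
 * (the diagonal plane) from a horizontal pass over dstv. */ |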
#define HPEL_FILTER(MMSIZE, EXT) \ |
void ff_dirac_hpel_filter_v_ ## EXT(uint8_t *, const uint8_t *, int, int); \ |
void ff_dirac_hpel_filter_h_ ## EXT(uint8_t *, const uint8_t *, int); \ |
\ |
static void dirac_hpel_filter_ ## EXT(uint8_t *dsth, uint8_t *dstv, uint8_t *dstc, \ |
const uint8_t *src, int stride, int width, int height) \ |
{ \ |
while( height-- ) \ |
{ \ |
ff_dirac_hpel_filter_v_ ## EXT(dstv-MMSIZE, src-MMSIZE, stride, width+MMSIZE+5); \ |
ff_dirac_hpel_filter_h_ ## EXT(dsth, src, width); \ |
ff_dirac_hpel_filter_h_ ## EXT(dstc, dstv, width); \ |
\ |
dsth += stride; \ |
dstv += stride; \ |
dstc += stride; \ |
src += stride; \ |
} \ |
} |
#if !ARCH_X86_64 |
HPEL_FILTER(8, mmx) |
#endif |
HPEL_FILTER(16, sse2) |
#define PIXFUNC(PFX, IDX, EXT) \ |
/* MMXDISABLED: c->PFX ## _dirac_pixels_tab[0][IDX] = ff_ ## PFX ## _dirac_pixels8_ ## EXT; */ \ |
c->PFX ## _dirac_pixels_tab[1][IDX] = ff_ ## PFX ## _dirac_pixels16_ ## EXT; \ |
c->PFX ## _dirac_pixels_tab[2][IDX] = ff_ ## PFX ## _dirac_pixels32_ ## EXT |
void ff_diracdsp_init_mmx(DiracDSPContext* c) |
{ |
int mm_flags = av_get_cpu_flags(); |
if (!(mm_flags & AV_CPU_FLAG_MMX)) |
return; |
#if HAVE_YASM |
c->add_dirac_obmc[0] = ff_add_dirac_obmc8_mmx; |
#if !ARCH_X86_64 |
c->add_dirac_obmc[1] = ff_add_dirac_obmc16_mmx; |
c->add_dirac_obmc[2] = ff_add_dirac_obmc32_mmx; |
c->dirac_hpel_filter = dirac_hpel_filter_mmx; |
c->add_rect_clamped = ff_add_rect_clamped_mmx; |
c->put_signed_rect_clamped = ff_put_signed_rect_clamped_mmx; |
#endif |
#endif |
#if HAVE_MMX_INLINE |
PIXFUNC(put, 0, mmx); |
PIXFUNC(avg, 0, mmx); |
#endif |
#if HAVE_MMXEXT_INLINE |
if (mm_flags & AV_CPU_FLAG_MMX2) { |
PIXFUNC(avg, 0, mmxext); |
} |
#endif |
if (mm_flags & AV_CPU_FLAG_SSE2) { |
#if HAVE_YASM |
c->dirac_hpel_filter = dirac_hpel_filter_sse2; |
c->add_rect_clamped = ff_add_rect_clamped_sse2; |
c->put_signed_rect_clamped = ff_put_signed_rect_clamped_sse2; |
c->add_dirac_obmc[1] = ff_add_dirac_obmc16_sse2; |
c->add_dirac_obmc[2] = ff_add_dirac_obmc32_sse2; |
#endif |
#if HAVE_SSE2_INLINE |
c->put_dirac_pixels_tab[1][0] = ff_put_dirac_pixels16_sse2; |
c->avg_dirac_pixels_tab[1][0] = ff_avg_dirac_pixels16_sse2; |
c->put_dirac_pixels_tab[2][0] = ff_put_dirac_pixels32_sse2; |
c->avg_dirac_pixels_tab[2][0] = ff_avg_dirac_pixels32_sse2; |
#endif |
} |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/diracdsp_mmx.h |
---|
0,0 → 1,47 |
/* |
* Copyright (c) 2010 David Conrad |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#ifndef AVCODEC_X86_DIRACDSP_H |
#define AVCODEC_X86_DIRACDSP_H |
#include "libavcodec/diracdsp.h" |
void ff_diracdsp_init_mmx(DiracDSPContext* c); |
DECL_DIRAC_PIXOP(put, mmx); |
DECL_DIRAC_PIXOP(avg, mmx); |
DECL_DIRAC_PIXOP(avg, mmxext); |
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); |
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); |
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); |
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h); |
void ff_add_rect_clamped_mmx(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int); |
void ff_add_rect_clamped_sse2(uint8_t *, const uint16_t *, int, const int16_t *, int, int, int); |
void ff_add_dirac_obmc8_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); |
void ff_add_dirac_obmc16_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); |
void ff_add_dirac_obmc32_mmx(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); |
void ff_add_dirac_obmc16_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); |
void ff_add_dirac_obmc32_sse2(uint16_t *dst, const uint8_t *src, int stride, const uint8_t *obmc_weight, int yblen); |
#endif |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/diracdsp_yasm.asm |
---|
0,0 → 1,264 |
;****************************************************************************** |
;* Copyright (c) 2010 David Conrad |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
pw_3: times 8 dw 3 |
pw_7: times 8 dw 7 |
pw_16: times 8 dw 16 |
pw_32: times 8 dw 32 |
pb_128: times 16 db 128 |
section .text |
%macro UNPACK_ADD 6 |
mov%5 %1, %3 |
mov%6 m5, %4 |
mova m4, %1 |
mova %2, m5 |
punpcklbw %1, m7 |
punpcklbw m5, m7 |
punpckhbw m4, m7 |
punpckhbw %2, m7 |
paddw %1, m5 |
paddw %2, m4 |
%endmacro |
%macro HPEL_FILTER 1 |
; dirac_hpel_filter_v_sse2(uint8_t *dst, uint8_t *src, int stride, int width); |
cglobal dirac_hpel_filter_v_%1, 4,6,8, dst, src, stride, width, src0, stridex3 |
mov src0q, srcq |
lea stridex3q, [3*strideq] |
sub src0q, stridex3q |
pxor m7, m7 |
.loop: |
; 7*(src[0] + src[1]) |
UNPACK_ADD m0, m1, [srcq], [srcq + strideq], a,a |
pmullw m0, [pw_7] |
pmullw m1, [pw_7] |
; 3*( ... + src[-2] + src[3]) |
UNPACK_ADD m2, m3, [src0q + strideq], [srcq + stridex3q], a,a |
paddw m0, m2 |
paddw m1, m3 |
pmullw m0, [pw_3] |
pmullw m1, [pw_3] |
; ... - 7*(src[-1] + src[2]) |
UNPACK_ADD m2, m3, [src0q + strideq*2], [srcq + strideq*2], a,a |
pmullw m2, [pw_7] |
pmullw m3, [pw_7] |
psubw m0, m2 |
psubw m1, m3 |
; ... - (src[-3] + src[4]) |
UNPACK_ADD m2, m3, [src0q], [srcq + strideq*4], a,a |
psubw m0, m2 |
psubw m1, m3 |
paddw m0, [pw_16] |
paddw m1, [pw_16] |
psraw m0, 5 |
psraw m1, 5 |
packuswb m0, m1 |
mova [dstq], m0 |
add dstq, mmsize |
add srcq, mmsize |
add src0q, mmsize |
sub widthd, mmsize |
jg .loop |
RET |
; dirac_hpel_filter_h_sse2(uint8_t *dst, uint8_t *src, int width); |
cglobal dirac_hpel_filter_h_%1, 3,3,8, dst, src, width |
dec widthd |
pxor m7, m7 |
and widthd, ~(mmsize-1) |
.loop: |
; 7*(src[0] + src[1]) |
UNPACK_ADD m0, m1, [srcq + widthq], [srcq + widthq + 1], u,u |
pmullw m0, [pw_7] |
pmullw m1, [pw_7] |
; 3*( ... + src[-2] + src[3]) |
UNPACK_ADD m2, m3, [srcq + widthq - 2], [srcq + widthq + 3], u,u |
paddw m0, m2 |
paddw m1, m3 |
pmullw m0, [pw_3] |
pmullw m1, [pw_3] |
; ... - 7*(src[-1] + src[2]) |
UNPACK_ADD m2, m3, [srcq + widthq - 1], [srcq + widthq + 2], u,u |
pmullw m2, [pw_7] |
pmullw m3, [pw_7] |
psubw m0, m2 |
psubw m1, m3 |
; ... - (src[-3] + src[4]) |
UNPACK_ADD m2, m3, [srcq + widthq - 3], [srcq + widthq + 4], u,u |
psubw m0, m2 |
psubw m1, m3 |
paddw m0, [pw_16] |
paddw m1, [pw_16] |
psraw m0, 5 |
psraw m1, 5 |
packuswb m0, m1 |
mova [dstq + widthq], m0 |
sub widthd, mmsize |
jge .loop |
RET |
%endmacro |
%macro PUT_RECT 1 |
; void put_signed_rect_clamped(uint8_t *dst, int dst_stride, const int16_t *src, int src_stride, int width, int height) |
cglobal put_signed_rect_clamped_%1, 5,9,3, dst, dst_stride, src, src_stride, w, dst2, src2 |
mova m0, [pb_128] |
add wd, (mmsize-1) |
and wd, ~(mmsize-1) |
%if ARCH_X86_64 |
movsxd dst_strideq, dst_strided |
movsxd src_strideq, src_strided |
mov r7d, r5m |
mov r8d, wd |
%define wspill r8d |
%define hd r7d |
%else |
mov r4m, wd |
%define wspill r4m |
%define hd r5mp |
%endif |
.loopy: |
lea src2q, [srcq+src_strideq*2] |
lea dst2q, [dstq+dst_strideq] |
.loopx: |
sub wd, mmsize |
mova m1, [srcq +2*wq] |
mova m2, [src2q+2*wq] |
packsswb m1, [srcq +2*wq+mmsize] |
packsswb m2, [src2q+2*wq+mmsize] |
paddb m1, m0 |
paddb m2, m0 |
mova [dstq +wq], m1 |
mova [dst2q+wq], m2 |
jg .loopx |
lea srcq, [srcq+src_strideq*4] |
lea dstq, [dstq+dst_strideq*2] |
sub hd, 2 |
mov wd, wspill |
jg .loopy |
RET |
%endm |
%macro ADD_RECT 1 |
; void add_rect_clamped(uint8_t *dst, uint16_t *src, int stride, int16_t *idwt, int idwt_stride, int width, int height) |
cglobal add_rect_clamped_%1, 7,9,3, dst, src, stride, idwt, idwt_stride, w, h |
mova m0, [pw_32] |
add wd, (mmsize-1) |
and wd, ~(mmsize-1) |
%if ARCH_X86_64 |
movsxd strideq, strided |
movsxd idwt_strideq, idwt_strided |
mov r8d, wd |
%define wspill r8d |
%else |
mov r5m, wd |
%define wspill r5m |
%endif |
.loop: |
sub wd, mmsize |
movu m1, [srcq +2*wq] ; FIXME: ensure alignment |
paddw m1, m0 |
psraw m1, 6 |
movu m2, [srcq +2*wq+mmsize] ; FIXME: ensure alignment |
paddw m2, m0 |
psraw m2, 6 |
paddw m1, [idwtq+2*wq] |
paddw m2, [idwtq+2*wq+mmsize] |
packuswb m1, m2 |
mova [dstq +wq], m1 |
jg .loop |
lea srcq, [srcq + 2*strideq] |
add dstq, strideq |
lea idwtq, [idwtq+ 2*idwt_strideq] |
sub hd, 1 |
mov wd, wspill |
jg .loop |
RET |
%endm |
%macro ADD_OBMC 2 |
; void add_obmc(uint16_t *dst, uint8_t *src, int stride, uint8_t *obmc_weight, int yblen) |
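; (i.e. dst[i] += src[i] * obmc_weight[i], with products kept in 16-bit lanes) |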
cglobal add_dirac_obmc%1_%2, 6,6,5, dst, src, stride, obmc, yblen |
pxor m4, m4 |
.loop: |
%assign i 0 |
%rep %1 / mmsize |
mova m0, [srcq+i] |
mova m1, m0 |
punpcklbw m0, m4 |
punpckhbw m1, m4 |
mova m2, [obmcq+i] |
mova m3, m2 |
punpcklbw m2, m4 |
punpckhbw m3, m4 |
pmullw m0, m2 |
pmullw m1, m3 |
movu m2, [dstq+2*i] |
movu m3, [dstq+2*i+mmsize] |
paddw m0, m2 |
paddw m1, m3 |
movu [dstq+2*i], m0 |
movu [dstq+2*i+mmsize], m1 |
%assign i i+mmsize |
%endrep |
lea srcq, [srcq+strideq] |
lea dstq, [dstq+2*strideq] |
add obmcq, 32 |
sub yblend, 1 |
jg .loop |
RET |
%endm |
INIT_MMX |
%if ARCH_X86_64 == 0 |
PUT_RECT mmx |
ADD_RECT mmx |
HPEL_FILTER mmx |
ADD_OBMC 32, mmx |
ADD_OBMC 16, mmx |
%endif |
ADD_OBMC 8, mmx |
INIT_XMM |
PUT_RECT sse2 |
ADD_RECT sse2 |
HPEL_FILTER sse2 |
ADD_OBMC 32, sse2 |
ADD_OBMC 16, sse2 |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dnxhdenc.c |
---|
0,0 → 1,67 |
/* |
* VC3/DNxHD SIMD functions |
* Copyright (c) 2007 Baptiste Coudurier <baptiste dot coudurier at smartjog dot com> |
* |
* VC-3 encoder funded by the British Broadcasting Corporation |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/attributes.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/dnxhdenc.h" |
#if HAVE_SSE2_INLINE |
static void get_pixels_8x4_sym_sse2(int16_t *block, const uint8_t *pixels, int line_size) |
{ |
__asm__ volatile( |
"pxor %%xmm5, %%xmm5 \n\t" |
"movq (%0), %%xmm0 \n\t" |
"add %2, %0 \n\t" |
"movq (%0), %%xmm1 \n\t" |
"movq (%0, %2), %%xmm2 \n\t" |
"movq (%0, %2,2), %%xmm3 \n\t" |
"punpcklbw %%xmm5, %%xmm0 \n\t" |
"punpcklbw %%xmm5, %%xmm1 \n\t" |
"punpcklbw %%xmm5, %%xmm2 \n\t" |
"punpcklbw %%xmm5, %%xmm3 \n\t" |
"movdqa %%xmm0, (%1) \n\t" |
"movdqa %%xmm1, 16(%1) \n\t" |
"movdqa %%xmm2, 32(%1) \n\t" |
"movdqa %%xmm3, 48(%1) \n\t" |
"movdqa %%xmm3 , 64(%1) \n\t" |
"movdqa %%xmm2 , 80(%1) \n\t" |
"movdqa %%xmm1 , 96(%1) \n\t" |
"movdqa %%xmm0, 112(%1) \n\t" |
: "+r" (pixels) |
: "r" (block), "r" ((x86_reg)line_size) |
); |
} |
#endif /* HAVE_SSE2_INLINE */ |
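/* For reference: the SSE2 routine above widens four 8-pixel rows to 16 bits |
 * and mirrors them vertically into the 8x8 block. A minimal C sketch of the |
 * same mapping (function name hypothetical, kept out of the build): */ |
#if 0 |
static void get_pixels_8x4_sym_ref(int16_t *block, const uint8_t *pixels, |
                                   int line_size) |
{ |
    int i, j; |
    for (i = 0; i < 4; i++) |
        for (j = 0; j < 8; j++) { |
            int16_t v = pixels[i * line_size + j]; |
            block[i * 8 + j]       = v;   /* rows 0..3: straight copy */ |
            block[(7 - i) * 8 + j] = v;   /* rows 7..4: vertical mirror */ |
        } |
} |
#endif |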
av_cold void ff_dnxhdenc_init_x86(DNXHDEncContext *ctx) |
{ |
#if HAVE_SSE2_INLINE |
if (INLINE_SSE2(av_get_cpu_flags())) { |
if (ctx->cid_table->bit_depth == 8) |
ctx->get_pixels_8x4_sym = get_pixels_8x4_sym_sse2; |
} |
#endif /* HAVE_SSE2_INLINE */ |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dsputil.asm |
---|
0,0 → 1,653 |
;****************************************************************************** |
;* MMX optimized DSP utils |
;* Copyright (c) 2008 Loren Merritt |
;* Copyright (c) 2003-2013 Michael Niedermayer |
;* Copyright (c) 2013 Daniel Kang |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
pb_f: times 16 db 15 |
pb_zzzzzzzz77777777: times 8 db -1 |
pb_7: times 8 db 7 |
pb_zzzz3333zzzzbbbb: db -1,-1,-1,-1,3,3,3,3,-1,-1,-1,-1,11,11,11,11 |
pb_zz11zz55zz99zzdd: db -1,-1,1,1,-1,-1,5,5,-1,-1,9,9,-1,-1,13,13 |
pb_revwords: SHUFFLE_MASK_W 7, 6, 5, 4, 3, 2, 1, 0 |
pd_16384: times 4 dd 16384 |
pb_bswap32: db 3, 2, 1, 0, 7, 6, 5, 4, 11, 10, 9, 8, 15, 14, 13, 12 |
SECTION_TEXT |
%macro SCALARPRODUCT 0 |
; int scalarproduct_int16(int16_t *v1, int16_t *v2, int order) |
cglobal scalarproduct_int16, 3,3,3, v1, v2, order |
shl orderq, 1 |
add v1q, orderq |
add v2q, orderq |
neg orderq |
pxor m2, m2 |
.loop: |
movu m0, [v1q + orderq] |
movu m1, [v1q + orderq + mmsize] |
pmaddwd m0, [v2q + orderq] |
pmaddwd m1, [v2q + orderq + mmsize] |
paddd m2, m0 |
paddd m2, m1 |
add orderq, mmsize*2 |
jl .loop |
%if mmsize == 16 |
movhlps m0, m2 |
paddd m2, m0 |
pshuflw m0, m2, 0x4e |
%else |
pshufw m0, m2, 0x4e |
%endif |
paddd m2, m0 |
movd eax, m2 |
RET |
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul) |
cglobal scalarproduct_and_madd_int16, 4,4,8, v1, v2, v3, order, mul |
shl orderq, 1 |
movd m7, mulm |
%if mmsize == 16 |
pshuflw m7, m7, 0 |
punpcklqdq m7, m7 |
%else |
pshufw m7, m7, 0 |
%endif |
pxor m6, m6 |
add v1q, orderq |
add v2q, orderq |
add v3q, orderq |
neg orderq |
.loop: |
movu m0, [v2q + orderq] |
movu m1, [v2q + orderq + mmsize] |
mova m4, [v1q + orderq] |
mova m5, [v1q + orderq + mmsize] |
movu m2, [v3q + orderq] |
movu m3, [v3q + orderq + mmsize] |
pmaddwd m0, m4 |
pmaddwd m1, m5 |
pmullw m2, m7 |
pmullw m3, m7 |
paddd m6, m0 |
paddd m6, m1 |
paddw m2, m4 |
paddw m3, m5 |
mova [v1q + orderq], m2 |
mova [v1q + orderq + mmsize], m3 |
add orderq, mmsize*2 |
jl .loop |
%if mmsize == 16 |
movhlps m0, m6 |
paddd m6, m0 |
pshuflw m0, m6, 0x4e |
%else |
pshufw m0, m6, 0x4e |
%endif |
paddd m6, m0 |
movd eax, m6 |
RET |
%endmacro |
INIT_MMX mmxext |
SCALARPRODUCT |
INIT_XMM sse2 |
SCALARPRODUCT |
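; Scalar semantics of the two kernels above, as a sketch: |
;   scalarproduct_int16:          res += v1[i] * v2[i] |
;   scalarproduct_and_madd_int16: res += v1[i] * v2[i], then v1[i] += mul * v3[i] |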
%macro SCALARPRODUCT_LOOP 1 |
align 16 |
.loop%1: |
sub orderq, mmsize*2 |
%if %1 |
mova m1, m4 |
mova m4, [v2q + orderq] |
mova m0, [v2q + orderq + mmsize] |
palignr m1, m0, %1 |
palignr m0, m4, %1 |
mova m3, m5 |
mova m5, [v3q + orderq] |
mova m2, [v3q + orderq + mmsize] |
palignr m3, m2, %1 |
palignr m2, m5, %1 |
%else |
mova m0, [v2q + orderq] |
mova m1, [v2q + orderq + mmsize] |
mova m2, [v3q + orderq] |
mova m3, [v3q + orderq + mmsize] |
%endif |
%define t0 [v1q + orderq] |
%define t1 [v1q + orderq + mmsize] |
%if ARCH_X86_64 |
mova m8, t0 |
mova m9, t1 |
%define t0 m8 |
%define t1 m9 |
%endif |
pmaddwd m0, t0 |
pmaddwd m1, t1 |
pmullw m2, m7 |
pmullw m3, m7 |
paddw m2, t0 |
paddw m3, t1 |
paddd m6, m0 |
paddd m6, m1 |
mova [v1q + orderq], m2 |
mova [v1q + orderq + mmsize], m3 |
jg .loop%1 |
%if %1 |
jmp .end |
%endif |
%endmacro |
; int scalarproduct_and_madd_int16(int16_t *v1, int16_t *v2, int16_t *v3, int order, int mul) |
INIT_XMM ssse3 |
cglobal scalarproduct_and_madd_int16, 4,5,10, v1, v2, v3, order, mul |
shl orderq, 1 |
movd m7, mulm |
pshuflw m7, m7, 0 |
punpcklqdq m7, m7 |
pxor m6, m6 |
mov r4d, v2d |
and r4d, 15 |
and v2q, ~15 |
and v3q, ~15 |
mova m4, [v2q + orderq] |
mova m5, [v3q + orderq] |
; a linear chain of compares is faster than a branch tree or jump table, because the branches taken are cyclic (i.e. predictable) |
cmp r4d, 0 |
je .loop0 |
cmp r4d, 2 |
je .loop2 |
cmp r4d, 4 |
je .loop4 |
cmp r4d, 6 |
je .loop6 |
cmp r4d, 8 |
je .loop8 |
cmp r4d, 10 |
je .loop10 |
cmp r4d, 12 |
je .loop12 |
SCALARPRODUCT_LOOP 14 |
SCALARPRODUCT_LOOP 12 |
SCALARPRODUCT_LOOP 10 |
SCALARPRODUCT_LOOP 8 |
SCALARPRODUCT_LOOP 6 |
SCALARPRODUCT_LOOP 4 |
SCALARPRODUCT_LOOP 2 |
SCALARPRODUCT_LOOP 0 |
.end: |
movhlps m0, m6 |
paddd m6, m0 |
pshuflw m0, m6, 0x4e |
paddd m6, m0 |
movd eax, m6 |
RET |
;----------------------------------------------------------------------------- |
; void ff_apply_window_int16(int16_t *output, const int16_t *input, |
; const int16_t *window, unsigned int len) |
;----------------------------------------------------------------------------- |
%macro REVERSE_WORDS 1-2 |
%if cpuflag(ssse3) && notcpuflag(atom) |
pshufb %1, %2 |
%elif cpuflag(sse2) |
pshuflw %1, %1, 0x1B |
pshufhw %1, %1, 0x1B |
pshufd %1, %1, 0x4E |
%elif cpuflag(mmxext) |
pshufw %1, %1, 0x1B |
%endif |
%endmacro |
%macro MUL16FIXED 3 |
%if cpuflag(ssse3) ; dst, src, unused |
; dst = ((dst * src) + (1<<14)) >> 15 |
pmulhrsw %1, %2 |
%elif cpuflag(mmxext) ; dst, src, temp |
; dst = (dst * src) >> 15 |
; pmulhw cuts off the bottom bit, so we have to lshift by 1 and add it back |
; in from the pmullw result. |
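; e.g. dst=0x4000 (0.5), src=0x2000 (0.25): pmulhw -> 0x0800, pmullw -> 0x0000, |
; so (0x0800 << 1) | (0x0000 >> 15) = 0x1000 = (dst*src) >> 15, i.e. 0.125. |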
mova %3, %1 |
pmulhw %1, %2 |
pmullw %3, %2 |
psrlw %3, 15 |
psllw %1, 1 |
por %1, %3 |
%endif |
%endmacro |
%macro APPLY_WINDOW_INT16 1 ; %1 bitexact version |
%if %1 |
cglobal apply_window_int16, 4,5,6, output, input, window, offset, offset2 |
%else |
cglobal apply_window_int16_round, 4,5,6, output, input, window, offset, offset2 |
%endif |
lea offset2q, [offsetq-mmsize] |
%if cpuflag(ssse3) && notcpuflag(atom) |
mova m5, [pb_revwords] |
ALIGN 16 |
%elif %1 |
mova m5, [pd_16384] |
%endif |
.loop: |
%if cpuflag(ssse3) |
; This version does the 16x16->16 multiplication in-place without expanding |
; to 32-bit. The ssse3 version is bit-identical. |
mova m0, [windowq+offset2q] |
mova m1, [ inputq+offset2q] |
pmulhrsw m1, m0 |
REVERSE_WORDS m0, m5 |
pmulhrsw m0, [ inputq+offsetq ] |
mova [outputq+offset2q], m1 |
mova [outputq+offsetq ], m0 |
%elif %1 |
; This version expands 16-bit to 32-bit, multiplies by the window, |
; adds 16384 for rounding, right shifts 15, then repacks back to words to |
; save to the output. The window is reversed for the second half. |
mova m3, [windowq+offset2q] |
mova m4, [ inputq+offset2q] |
pxor m0, m0 |
punpcklwd m0, m3 |
punpcklwd m1, m4 |
pmaddwd m0, m1 |
paddd m0, m5 |
psrad m0, 15 |
pxor m2, m2 |
punpckhwd m2, m3 |
punpckhwd m1, m4 |
pmaddwd m2, m1 |
paddd m2, m5 |
psrad m2, 15 |
packssdw m0, m2 |
mova [outputq+offset2q], m0 |
REVERSE_WORDS m3 |
mova m4, [ inputq+offsetq] |
pxor m0, m0 |
punpcklwd m0, m3 |
punpcklwd m1, m4 |
pmaddwd m0, m1 |
paddd m0, m5 |
psrad m0, 15 |
pxor m2, m2 |
punpckhwd m2, m3 |
punpckhwd m1, m4 |
pmaddwd m2, m1 |
paddd m2, m5 |
psrad m2, 15 |
packssdw m0, m2 |
mova [outputq+offsetq], m0 |
%else |
; This version does the 16x16->16 multiplication in-place without expanding |
; to 32-bit. The mmxext and sse2 versions do not use rounding, and |
; therefore are not bit-identical to the C version. |
mova m0, [windowq+offset2q] |
mova m1, [ inputq+offset2q] |
mova m2, [ inputq+offsetq ] |
MUL16FIXED m1, m0, m3 |
REVERSE_WORDS m0 |
MUL16FIXED m2, m0, m3 |
mova [outputq+offset2q], m1 |
mova [outputq+offsetq ], m2 |
%endif |
add offsetd, mmsize |
sub offset2d, mmsize |
jae .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
APPLY_WINDOW_INT16 0 |
INIT_XMM sse2 |
APPLY_WINDOW_INT16 0 |
INIT_MMX mmxext |
APPLY_WINDOW_INT16 1 |
INIT_XMM sse2 |
APPLY_WINDOW_INT16 1 |
INIT_XMM ssse3 |
APPLY_WINDOW_INT16 1 |
INIT_XMM ssse3, atom |
APPLY_WINDOW_INT16 1 |
; void add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, const uint8_t *diff, int w, int *left, int *left_top) |
INIT_MMX mmxext |
cglobal add_hfyu_median_prediction, 6,6,0, dst, top, diff, w, left, left_top |
movq mm0, [topq] |
movq mm2, mm0 |
movd mm4, [left_topq] |
psllq mm2, 8 |
movq mm1, mm0 |
por mm4, mm2 |
movd mm3, [leftq] |
psubb mm0, mm4 ; t-tl |
add dstq, wq |
add topq, wq |
add diffq, wq |
neg wq |
jmp .skip |
.loop: |
movq mm4, [topq+wq] |
movq mm0, mm4 |
psllq mm4, 8 |
por mm4, mm1 |
movq mm1, mm0 ; t |
psubb mm0, mm4 ; t-tl |
.skip: |
movq mm2, [diffq+wq] |
%assign i 0 |
%rep 8 |
movq mm4, mm0 |
paddb mm4, mm3 ; t-tl+l |
movq mm5, mm3 |
pmaxub mm3, mm1 |
pminub mm5, mm1 |
pminub mm3, mm4 |
pmaxub mm3, mm5 ; median |
paddb mm3, mm2 ; +residual |
%if i==0 |
movq mm7, mm3 |
psllq mm7, 56 |
%else |
movq mm6, mm3 |
psrlq mm7, 8 |
psllq mm6, 56 |
por mm7, mm6 |
%endif |
%if i<7 |
psrlq mm0, 8 |
psrlq mm1, 8 |
psrlq mm2, 8 |
%endif |
%assign i i+1 |
%endrep |
movq [dstq+wq], mm7 |
add wq, 8 |
jl .loop |
movzx r2d, byte [dstq-1] |
mov [leftq], r2d |
movzx r2d, byte [topq-1] |
mov [left_topq], r2d |
RET |
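; Scalar sketch of the loop above (mid_pred = median of three): |
;   pred   = mid_pred(l, top[i], (top[i] - tl + l) & 0xFF); |
;   dst[i] = l = (pred + diff[i]) & 0xFF    ; paddb wraps mod 256 |
;   tl     = top[i] |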
%macro ADD_HFYU_LEFT_LOOP 2 ; %1 = dst_is_aligned, %2 = src_is_aligned |
add srcq, wq |
add dstq, wq |
neg wq |
%%.loop: |
%if %2 |
mova m1, [srcq+wq] |
%else |
movu m1, [srcq+wq] |
%endif |
mova m2, m1 |
psllw m1, 8 |
paddb m1, m2 |
mova m2, m1 |
pshufb m1, m3 |
paddb m1, m2 |
pshufb m0, m5 |
mova m2, m1 |
pshufb m1, m4 |
paddb m1, m2 |
%if mmsize == 16 |
mova m2, m1 |
pshufb m1, m6 |
paddb m1, m2 |
%endif |
paddb m0, m1 |
%if %1 |
mova [dstq+wq], m0 |
%else |
movq [dstq+wq], m0 |
movhps [dstq+wq+8], m0 |
%endif |
add wq, mmsize |
jl %%.loop |
mov eax, mmsize-1 |
sub eax, wd |
movd m1, eax |
pshufb m0, m1 |
movd eax, m0 |
RET |
%endmacro |
; int add_hfyu_left_prediction(uint8_t *dst, const uint8_t *src, int w, int left) |
INIT_MMX ssse3 |
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left |
.skip_prologue: |
mova m5, [pb_7] |
mova m4, [pb_zzzz3333zzzzbbbb] |
mova m3, [pb_zz11zz55zz99zzdd] |
movd m0, leftm |
psllq m0, 56 |
ADD_HFYU_LEFT_LOOP 1, 1 |
INIT_XMM sse4 |
cglobal add_hfyu_left_prediction, 3,3,7, dst, src, w, left |
mova m5, [pb_f] |
mova m6, [pb_zzzzzzzz77777777] |
mova m4, [pb_zzzz3333zzzzbbbb] |
mova m3, [pb_zz11zz55zz99zzdd] |
movd m0, leftm |
pslldq m0, 15 |
test srcq, 15 |
jnz .src_unaligned |
test dstq, 15 |
jnz .dst_unaligned |
ADD_HFYU_LEFT_LOOP 1, 1 |
.dst_unaligned: |
ADD_HFYU_LEFT_LOOP 0, 1 |
.src_unaligned: |
ADD_HFYU_LEFT_LOOP 0, 0 |
;----------------------------------------------------------------------------- |
; void ff_vector_clip_int32(int32_t *dst, const int32_t *src, int32_t min, |
; int32_t max, unsigned int len) |
;----------------------------------------------------------------------------- |
; %1 = number of xmm registers used |
; %2 = number of inline load/process/store loops per asm loop |
; %3 = process 4*mmsize (%3=0) or 8*mmsize (%3=1) bytes per loop |
; %4 = CLIPD function takes min/max as float instead of int (CLIPD_SSE2) |
; %5 = suffix |
%macro VECTOR_CLIP_INT32 4-5 |
cglobal vector_clip_int32%5, 5,5,%1, dst, src, min, max, len |
%if %4 |
cvtsi2ss m4, minm |
cvtsi2ss m5, maxm |
%else |
movd m4, minm |
movd m5, maxm |
%endif |
SPLATD m4 |
SPLATD m5 |
.loop: |
%assign %%i 1 |
%rep %2 |
mova m0, [srcq+mmsize*0*%%i] |
mova m1, [srcq+mmsize*1*%%i] |
mova m2, [srcq+mmsize*2*%%i] |
mova m3, [srcq+mmsize*3*%%i] |
%if %3 |
mova m7, [srcq+mmsize*4*%%i] |
mova m8, [srcq+mmsize*5*%%i] |
mova m9, [srcq+mmsize*6*%%i] |
mova m10, [srcq+mmsize*7*%%i] |
%endif |
CLIPD m0, m4, m5, m6 |
CLIPD m1, m4, m5, m6 |
CLIPD m2, m4, m5, m6 |
CLIPD m3, m4, m5, m6 |
%if %3 |
CLIPD m7, m4, m5, m6 |
CLIPD m8, m4, m5, m6 |
CLIPD m9, m4, m5, m6 |
CLIPD m10, m4, m5, m6 |
%endif |
mova [dstq+mmsize*0*%%i], m0 |
mova [dstq+mmsize*1*%%i], m1 |
mova [dstq+mmsize*2*%%i], m2 |
mova [dstq+mmsize*3*%%i], m3 |
%if %3 |
mova [dstq+mmsize*4*%%i], m7 |
mova [dstq+mmsize*5*%%i], m8 |
mova [dstq+mmsize*6*%%i], m9 |
mova [dstq+mmsize*7*%%i], m10 |
%endif |
%assign %%i %%i+1 |
%endrep |
add srcq, mmsize*4*(%2+%3) |
add dstq, mmsize*4*(%2+%3) |
sub lend, mmsize*(%2+%3) |
jg .loop |
REP_RET |
%endmacro |
INIT_MMX mmx |
%define CLIPD CLIPD_MMX |
VECTOR_CLIP_INT32 0, 1, 0, 0 |
INIT_XMM sse2 |
VECTOR_CLIP_INT32 6, 1, 0, 0, _int |
%define CLIPD CLIPD_SSE2 |
VECTOR_CLIP_INT32 6, 2, 0, 1 |
INIT_XMM sse4 |
%define CLIPD CLIPD_SSE41 |
%ifdef m8 |
VECTOR_CLIP_INT32 11, 1, 1, 0 |
%else |
VECTOR_CLIP_INT32 6, 1, 0, 0 |
%endif |
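; Scalar reference for the clip kernels above: |
;   for (i = 0; i < len; i++) |
;       dst[i] = src[i] < min ? min : src[i] > max ? max : src[i]; |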
; %1 = aligned/unaligned |
%macro BSWAP_LOOPS 1 |
mov r3, r2 |
sar r2, 3 |
jz .left4_%1 |
.loop8_%1: |
mov%1 m0, [r1 + 0] |
mov%1 m1, [r1 + 16] |
%if cpuflag(ssse3) |
pshufb m0, m2 |
pshufb m1, m2 |
mov%1 [r0 + 0], m0 |
mov%1 [r0 + 16], m1 |
%else |
pshuflw m0, m0, 10110001b |
pshuflw m1, m1, 10110001b |
pshufhw m0, m0, 10110001b |
pshufhw m1, m1, 10110001b |
mova m2, m0 |
mova m3, m1 |
psllw m0, 8 |
psllw m1, 8 |
psrlw m2, 8 |
psrlw m3, 8 |
por m2, m0 |
por m3, m1 |
mov%1 [r0 + 0], m2 |
mov%1 [r0 + 16], m3 |
%endif |
add r0, 32 |
add r1, 32 |
dec r2 |
jnz .loop8_%1 |
.left4_%1: |
mov r2, r3 |
and r3, 4 |
jz .left |
mov%1 m0, [r1] |
%if cpuflag(ssse3) |
pshufb m0, m2 |
mov%1 [r0], m0 |
%else |
pshuflw m0, m0, 10110001b |
pshufhw m0, m0, 10110001b |
mova m2, m0 |
psllw m0, 8 |
psrlw m2, 8 |
por m2, m0 |
mov%1 [r0], m2 |
%endif |
add r1, 16 |
add r0, 16 |
%endmacro |
; void bswap_buf(uint32_t *dst, const uint32_t *src, int w); |
%macro BSWAP32_BUF 0 |
%if cpuflag(ssse3) |
cglobal bswap32_buf, 3,4,3 |
mov r3, r1 |
mova m2, [pb_bswap32] |
%else |
cglobal bswap32_buf, 3,4,5 |
mov r3, r1 |
%endif |
or r3, r0 |
and r3, 15 |
jz .start_align |
BSWAP_LOOPS u |
jmp .left |
.start_align: |
BSWAP_LOOPS a |
.left: |
%if cpuflag(ssse3) |
mov r3, r2 |
and r2, 2 |
jz .left1 |
movq m0, [r1] |
pshufb m0, m2 |
movq [r0], m0 |
add r1, 8 |
add r0, 8 |
.left1: |
and r3, 1 |
jz .end |
mov r2d, [r1] |
bswap r2d |
mov [r0], r2d |
%else |
and r2, 3 |
jz .end |
.loop2: |
mov r3d, [r1] |
bswap r3d |
mov [r0], r3d |
add r1, 4 |
add r0, 4 |
dec r2 |
jnz .loop2 |
%endif |
.end: |
RET |
%endmacro |
INIT_XMM sse2 |
BSWAP32_BUF |
INIT_XMM ssse3 |
BSWAP32_BUF |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dsputil_init.c |
---|
0,0 → 1,733 |
/* |
* Copyright (c) 2000, 2001 Fabrice Bellard |
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "config.h" |
#include "libavutil/attributes.h" |
#include "libavutil/cpu.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/dsputil.h" |
#include "libavcodec/simple_idct.h" |
#include "dsputil_x86.h" |
#include "idct_xvid.h" |
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, |
int dstStride, int src1Stride, int h); |
void ff_put_no_rnd_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, |
uint8_t *src2, int dstStride, |
int src1Stride, int h); |
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, |
int dstStride, int src1Stride, int h); |
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, |
int dstStride, int src1Stride, int h); |
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, |
int dstStride, int src1Stride, int h); |
void ff_put_no_rnd_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, |
int dstStride, int src1Stride, int h); |
void ff_put_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, |
int dstStride, int srcStride, int h); |
void ff_avg_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, |
int dstStride, int srcStride, int h); |
void ff_put_no_rnd_mpeg4_qpel16_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, |
int dstStride, int srcStride, |
int h); |
void ff_put_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, |
int dstStride, int srcStride, int h); |
void ff_avg_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, |
int dstStride, int srcStride, int h); |
void ff_put_no_rnd_mpeg4_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, |
int dstStride, int srcStride, |
int h); |
void ff_put_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, |
int dstStride, int srcStride); |
void ff_avg_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, |
int dstStride, int srcStride); |
void ff_put_no_rnd_mpeg4_qpel16_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, |
int dstStride, int srcStride); |
void ff_put_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, |
int dstStride, int srcStride); |
void ff_avg_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, |
int dstStride, int srcStride); |
void ff_put_no_rnd_mpeg4_qpel8_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, |
int dstStride, int srcStride); |
#define ff_put_no_rnd_pixels16_mmxext ff_put_pixels16_mmxext |
#define ff_put_no_rnd_pixels8_mmxext ff_put_pixels8_mmxext |
void ff_h263_v_loop_filter_mmx(uint8_t *src, int stride, int qscale); |
void ff_h263_h_loop_filter_mmx(uint8_t *src, int stride, int qscale); |
int32_t ff_scalarproduct_int16_mmxext(const int16_t *v1, const int16_t *v2, |
int order); |
int32_t ff_scalarproduct_int16_sse2(const int16_t *v1, const int16_t *v2, |
int order); |
int32_t ff_scalarproduct_and_madd_int16_mmxext(int16_t *v1, const int16_t *v2, |
const int16_t *v3, |
int order, int mul); |
int32_t ff_scalarproduct_and_madd_int16_sse2(int16_t *v1, const int16_t *v2, |
const int16_t *v3, |
int order, int mul); |
int32_t ff_scalarproduct_and_madd_int16_ssse3(int16_t *v1, const int16_t *v2, |
const int16_t *v3, |
int order, int mul); |
void ff_apply_window_int16_round_mmxext(int16_t *output, const int16_t *input, |
const int16_t *window, unsigned int len); |
void ff_apply_window_int16_round_sse2(int16_t *output, const int16_t *input, |
const int16_t *window, unsigned int len); |
void ff_apply_window_int16_mmxext(int16_t *output, const int16_t *input, |
const int16_t *window, unsigned int len); |
void ff_apply_window_int16_sse2(int16_t *output, const int16_t *input, |
const int16_t *window, unsigned int len); |
void ff_apply_window_int16_ssse3(int16_t *output, const int16_t *input, |
const int16_t *window, unsigned int len); |
void ff_apply_window_int16_ssse3_atom(int16_t *output, const int16_t *input, |
const int16_t *window, unsigned int len); |
void ff_bswap32_buf_ssse3(uint32_t *dst, const uint32_t *src, int w); |
void ff_bswap32_buf_sse2(uint32_t *dst, const uint32_t *src, int w); |
void ff_add_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *top, |
const uint8_t *diff, int w, |
int *left, int *left_top); |
int ff_add_hfyu_left_prediction_ssse3(uint8_t *dst, const uint8_t *src, |
int w, int left); |
int ff_add_hfyu_left_prediction_sse4(uint8_t *dst, const uint8_t *src, |
int w, int left); |
void ff_vector_clip_int32_mmx (int32_t *dst, const int32_t *src, |
int32_t min, int32_t max, unsigned int len); |
void ff_vector_clip_int32_sse2 (int32_t *dst, const int32_t *src, |
int32_t min, int32_t max, unsigned int len); |
void ff_vector_clip_int32_int_sse2(int32_t *dst, const int32_t *src, |
int32_t min, int32_t max, unsigned int len); |
void ff_vector_clip_int32_sse4 (int32_t *dst, const int32_t *src, |
int32_t min, int32_t max, unsigned int len); |
#if HAVE_YASM |
PIXELS16(static, ff_avg, , , _mmxext) |
PIXELS16(static, ff_put, , , _mmxext) |
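/* QPEL_OP expands handlers for the 16 quarter-pel motion-compensation |
 * positions (mcXY: X = horizontal, Y = vertical phase, each 0..3) from the |
 * h/v lowpass primitives declared above plus pixel averaging. */ |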
#define QPEL_OP(OPNAME, RND, MMX) \ |
static void OPNAME ## qpel8_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
ff_ ## OPNAME ## pixels8_ ## MMX(dst, src, stride, 8); \ |
} \ |
\ |
static void OPNAME ## qpel8_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t temp[8]; \ |
uint8_t * const half = (uint8_t*)temp; \ |
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \ |
stride, 8); \ |
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \ |
stride, stride, 8); \ |
} \ |
\ |
static void OPNAME ## qpel8_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
ff_ ## OPNAME ## mpeg4_qpel8_h_lowpass_ ## MMX(dst, src, stride, \ |
stride, 8); \ |
} \ |
\ |
static void OPNAME ## qpel8_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t temp[8]; \ |
uint8_t * const half = (uint8_t*)temp; \ |
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(half, src, 8, \ |
stride, 8); \ |
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + 1, half, stride, \ |
stride, 8); \ |
} \ |
\ |
static void OPNAME ## qpel8_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t temp[8]; \ |
uint8_t * const half = (uint8_t*)temp; \ |
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \ |
8, stride); \ |
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src, half, \ |
stride, stride, 8); \ |
} \ |
\ |
static void OPNAME ## qpel8_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, src, \ |
stride, stride); \ |
} \ |
\ |
static void OPNAME ## qpel8_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t temp[8]; \ |
uint8_t * const half = (uint8_t*)temp; \ |
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(half, src, \ |
8, stride); \ |
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, src + stride, half, stride,\ |
stride, 8); \ |
} \ |
\ |
static void OPNAME ## qpel8_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t half[8 + 9]; \ |
uint8_t * const halfH = ((uint8_t*)half) + 64; \ |
uint8_t * const halfHV = ((uint8_t*)half); \ |
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ |
stride, 9); \ |
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \ |
stride, 9); \ |
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \ |
stride, 8, 8); \ |
} \ |
\ |
static void OPNAME ## qpel8_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t half[8 + 9]; \ |
uint8_t * const halfH = ((uint8_t*)half) + 64; \ |
uint8_t * const halfHV = ((uint8_t*)half); \ |
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ |
stride, 9); \ |
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ |
stride, 9); \ |
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \ |
stride, 8, 8); \ |
} \ |
\ |
static void OPNAME ## qpel8_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t half[8 + 9]; \ |
uint8_t * const halfH = ((uint8_t*)half) + 64; \ |
uint8_t * const halfHV = ((uint8_t*)half); \ |
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ |
stride, 9); \ |
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, 8, \ |
stride, 9); \ |
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \ |
stride, 8, 8); \ |
} \ |
\ |
static void OPNAME ## qpel8_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t half[8 + 9]; \ |
uint8_t * const halfH = ((uint8_t*)half) + 64; \ |
uint8_t * const halfHV = ((uint8_t*)half); \ |
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ |
stride, 9); \ |
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ |
stride, 9); \ |
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \ |
stride, 8, 8); \ |
} \ |
\ |
static void OPNAME ## qpel8_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t half[8 + 9]; \ |
uint8_t * const halfH = ((uint8_t*)half) + 64; \ |
uint8_t * const halfHV = ((uint8_t*)half); \ |
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ |
stride, 9); \ |
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH, halfHV, \ |
stride, 8, 8); \ |
} \ |
\ |
static void OPNAME ## qpel8_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t half[8 + 9]; \ |
uint8_t * const halfH = ((uint8_t*)half) + 64; \ |
uint8_t * const halfHV = ((uint8_t*)half); \ |
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ |
stride, 9); \ |
ff_put ## RND ## mpeg4_qpel8_v_lowpass_ ## MMX(halfHV, halfH, 8, 8);\ |
ff_ ## OPNAME ## pixels8_l2_ ## MMX(dst, halfH + 8, halfHV, \ |
stride, 8, 8); \ |
} \ |
\ |
static void OPNAME ## qpel8_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t half[8 + 9]; \ |
uint8_t * const halfH = ((uint8_t*)half); \ |
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ |
stride, 9); \ |
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src, halfH, \ |
8, stride, 9); \ |
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \ |
stride, 8); \ |
} \ |
\ |
static void OPNAME ## qpel8_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t half[8 + 9]; \ |
uint8_t * const halfH = ((uint8_t*)half); \ |
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ |
stride, 9); \ |
ff_put ## RND ## pixels8_l2_ ## MMX(halfH, src + 1, halfH, 8, \ |
stride, 9); \ |
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \ |
stride, 8); \ |
} \ |
\ |
static void OPNAME ## qpel8_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t half[9]; \ |
uint8_t * const halfH = ((uint8_t*)half); \ |
ff_put ## RND ## mpeg4_qpel8_h_lowpass_ ## MMX(halfH, src, 8, \ |
stride, 9); \ |
ff_ ## OPNAME ## mpeg4_qpel8_v_lowpass_ ## MMX(dst, halfH, \ |
stride, 8); \ |
} \ |
\ |
static void OPNAME ## qpel16_mc00_ ## MMX (uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
ff_ ## OPNAME ## pixels16_ ## MMX(dst, src, stride, 16); \ |
} \ |
\ |
static void OPNAME ## qpel16_mc10_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t temp[32]; \ |
uint8_t * const half = (uint8_t*)temp; \ |
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \ |
stride, 16); \ |
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \ |
stride, 16); \ |
} \ |
\ |
static void OPNAME ## qpel16_mc20_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
ff_ ## OPNAME ## mpeg4_qpel16_h_lowpass_ ## MMX(dst, src, \ |
stride, stride, 16);\ |
} \ |
\ |
static void OPNAME ## qpel16_mc30_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t temp[32]; \ |
uint8_t * const half = (uint8_t*)temp; \ |
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(half, src, 16, \ |
stride, 16); \ |
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src + 1, half, \ |
stride, stride, 16); \ |
} \ |
\ |
static void OPNAME ## qpel16_mc01_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t temp[32]; \ |
uint8_t * const half = (uint8_t*)temp; \ |
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \ |
stride); \ |
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src, half, stride, \ |
stride, 16); \ |
} \ |
\ |
static void OPNAME ## qpel16_mc02_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, src, \ |
stride, stride); \ |
} \ |
\ |
static void OPNAME ## qpel16_mc03_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t temp[32]; \ |
uint8_t * const half = (uint8_t*)temp; \ |
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(half, src, 16, \ |
stride); \ |
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, src+stride, half, \ |
stride, stride, 16); \ |
} \ |
\ |
static void OPNAME ## qpel16_mc11_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t half[16 * 2 + 17 * 2]; \ |
uint8_t * const halfH = ((uint8_t*)half) + 256; \ |
uint8_t * const halfHV = ((uint8_t*)half); \ |
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ |
stride, 17); \ |
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \ |
stride, 17); \ |
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ |
16, 16); \ |
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \ |
stride, 16, 16); \ |
} \ |
\ |
static void OPNAME ## qpel16_mc31_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t half[16 * 2 + 17 * 2]; \ |
uint8_t * const halfH = ((uint8_t*)half) + 256; \ |
uint8_t * const halfHV = ((uint8_t*)half); \ |
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ |
stride, 17); \ |
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \ |
stride, 17); \ |
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ |
16, 16); \ |
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \ |
stride, 16, 16); \ |
} \ |
\ |
static void OPNAME ## qpel16_mc13_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t half[16 * 2 + 17 * 2]; \ |
uint8_t * const halfH = ((uint8_t*)half) + 256; \ |
uint8_t * const halfHV = ((uint8_t*)half); \ |
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ |
stride, 17); \ |
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \ |
stride, 17); \ |
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ |
16, 16); \ |
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \ |
stride, 16, 16); \ |
} \ |
\ |
static void OPNAME ## qpel16_mc33_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t half[16 * 2 + 17 * 2]; \ |
uint8_t * const halfH = ((uint8_t*)half) + 256; \ |
uint8_t * const halfHV = ((uint8_t*)half); \ |
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ |
stride, 17); \ |
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \ |
stride, 17); \ |
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ |
16, 16); \ |
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \ |
stride, 16, 16); \ |
} \ |
\ |
static void OPNAME ## qpel16_mc21_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t half[16 * 2 + 17 * 2]; \ |
uint8_t * const halfH = ((uint8_t*)half) + 256; \ |
uint8_t * const halfHV = ((uint8_t*)half); \ |
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ |
stride, 17); \ |
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ |
16, 16); \ |
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH, halfHV, \ |
stride, 16, 16); \ |
} \ |
\ |
static void OPNAME ## qpel16_mc23_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t half[16 * 2 + 17 * 2]; \ |
uint8_t * const halfH = ((uint8_t*)half) + 256; \ |
uint8_t * const halfHV = ((uint8_t*)half); \ |
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ |
stride, 17); \ |
ff_put ## RND ## mpeg4_qpel16_v_lowpass_ ## MMX(halfHV, halfH, \ |
16, 16); \ |
ff_ ## OPNAME ## pixels16_l2_ ## MMX(dst, halfH + 16, halfHV, \ |
stride, 16, 16); \ |
} \ |
\ |
static void OPNAME ## qpel16_mc12_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t half[17 * 2]; \ |
uint8_t * const halfH = ((uint8_t*)half); \ |
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ |
stride, 17); \ |
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src, halfH, 16, \ |
stride, 17); \ |
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \ |
stride, 16); \ |
} \ |
\ |
static void OPNAME ## qpel16_mc32_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t half[17 * 2]; \ |
uint8_t * const halfH = ((uint8_t*)half); \ |
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ |
stride, 17); \ |
ff_put ## RND ## pixels16_l2_ ## MMX(halfH, src + 1, halfH, 16, \ |
stride, 17); \ |
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \ |
stride, 16); \ |
} \ |
\ |
static void OPNAME ## qpel16_mc22_ ## MMX(uint8_t *dst, uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
uint64_t half[17 * 2]; \ |
uint8_t * const halfH = ((uint8_t*)half); \ |
ff_put ## RND ## mpeg4_qpel16_h_lowpass_ ## MMX(halfH, src, 16, \ |
stride, 17); \ |
ff_ ## OPNAME ## mpeg4_qpel16_v_lowpass_ ## MMX(dst, halfH, \ |
stride, 16); \ |
} |
QPEL_OP(put_, _, mmxext) |
QPEL_OP(avg_, _, mmxext) |
QPEL_OP(put_no_rnd_, _no_rnd_, mmxext) |
#endif /* HAVE_YASM */ |
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \ |
do { \ |
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \ |
} while (0) |
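/* In mc<X><Y> above, X and Y are the horizontal and vertical quarter-pel |
 * offsets, so pixels_tab[IDX][Y * 4 + X] holds the function for motion |
 * vector fraction (X, Y): mc00 is the fullpel copy, mc20/mc02 the pure |
 * half-pel lowpass cases, and the remaining positions combine the |
 * horizontal/vertical lowpass passes with pixel averaging (pixels*_l2). */ |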
static av_cold void dsputil_init_mmx(DSPContext *c, AVCodecContext *avctx, |
int cpu_flags) |
{ |
#if HAVE_MMX_INLINE |
const int high_bit_depth = avctx->bits_per_raw_sample > 8; |
c->put_pixels_clamped = ff_put_pixels_clamped_mmx; |
c->put_signed_pixels_clamped = ff_put_signed_pixels_clamped_mmx; |
c->add_pixels_clamped = ff_add_pixels_clamped_mmx; |
if (!high_bit_depth) { |
c->clear_block = ff_clear_block_mmx; |
c->clear_blocks = ff_clear_blocks_mmx; |
c->draw_edges = ff_draw_edges_mmx; |
} |
#if CONFIG_VIDEODSP && (ARCH_X86_32 || !HAVE_YASM) |
c->gmc = ff_gmc_mmx; |
#endif |
c->add_bytes = ff_add_bytes_mmx; |
#endif /* HAVE_MMX_INLINE */ |
#if HAVE_MMX_EXTERNAL |
if (CONFIG_H263_DECODER || CONFIG_H263_ENCODER) { |
c->h263_v_loop_filter = ff_h263_v_loop_filter_mmx; |
c->h263_h_loop_filter = ff_h263_h_loop_filter_mmx; |
} |
c->vector_clip_int32 = ff_vector_clip_int32_mmx; |
#endif /* HAVE_MMX_EXTERNAL */ |
} |
static av_cold void dsputil_init_mmxext(DSPContext *c, AVCodecContext *avctx, |
int cpu_flags) |
{ |
#if HAVE_MMXEXT_INLINE |
const int high_bit_depth = avctx->bits_per_raw_sample > 8; |
if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX && avctx->lowres == 0) { |
c->idct_put = ff_idct_xvid_mmxext_put; |
c->idct_add = ff_idct_xvid_mmxext_add; |
c->idct = ff_idct_xvid_mmxext; |
} |
#endif /* HAVE_MMXEXT_INLINE */ |
#if HAVE_MMXEXT_EXTERNAL |
SET_QPEL_FUNCS(avg_qpel, 0, 16, mmxext, ); |
SET_QPEL_FUNCS(avg_qpel, 1, 8, mmxext, ); |
SET_QPEL_FUNCS(put_qpel, 0, 16, mmxext, ); |
SET_QPEL_FUNCS(put_qpel, 1, 8, mmxext, ); |
SET_QPEL_FUNCS(put_no_rnd_qpel, 0, 16, mmxext, ); |
SET_QPEL_FUNCS(put_no_rnd_qpel, 1, 8, mmxext, ); |
/* slower than cmov version on AMD */ |
if (!(cpu_flags & AV_CPU_FLAG_3DNOW)) |
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_mmxext; |
c->scalarproduct_int16 = ff_scalarproduct_int16_mmxext; |
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_mmxext; |
if (avctx->flags & CODEC_FLAG_BITEXACT) { |
c->apply_window_int16 = ff_apply_window_int16_mmxext; |
} else { |
c->apply_window_int16 = ff_apply_window_int16_round_mmxext; |
} |
#endif /* HAVE_MMXEXT_EXTERNAL */ |
} |
static av_cold void dsputil_init_sse(DSPContext *c, AVCodecContext *avctx, |
int cpu_flags) |
{ |
#if HAVE_SSE_INLINE |
const int high_bit_depth = avctx->bits_per_raw_sample > 8; |
if (!high_bit_depth) { |
if (!(CONFIG_MPEG_XVMC_DECODER && avctx->xvmc_acceleration > 1)) { |
/* XvMCCreateBlocks() may not allocate 16-byte aligned blocks */ |
c->clear_block = ff_clear_block_sse; |
c->clear_blocks = ff_clear_blocks_sse; |
} |
} |
c->vector_clipf = ff_vector_clipf_sse; |
#endif /* HAVE_SSE_INLINE */ |
#if HAVE_YASM |
#if HAVE_INLINE_ASM && CONFIG_VIDEODSP |
c->gmc = ff_gmc_sse; |
#endif |
#endif /* HAVE_YASM */ |
} |
static av_cold void dsputil_init_sse2(DSPContext *c, AVCodecContext *avctx, |
int cpu_flags) |
{ |
#if HAVE_SSE2_INLINE |
const int high_bit_depth = avctx->bits_per_raw_sample > 8; |
if (!high_bit_depth && avctx->idct_algo == FF_IDCT_XVIDMMX && avctx->lowres == 0) { |
c->idct_put = ff_idct_xvid_sse2_put; |
c->idct_add = ff_idct_xvid_sse2_add; |
c->idct = ff_idct_xvid_sse2; |
c->idct_permutation_type = FF_SSE2_IDCT_PERM; |
} |
#endif /* HAVE_SSE2_INLINE */ |
#if HAVE_SSE2_EXTERNAL |
c->scalarproduct_int16 = ff_scalarproduct_int16_sse2; |
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_sse2; |
if (cpu_flags & AV_CPU_FLAG_ATOM) { |
c->vector_clip_int32 = ff_vector_clip_int32_int_sse2; |
} else { |
c->vector_clip_int32 = ff_vector_clip_int32_sse2; |
} |
if (avctx->flags & CODEC_FLAG_BITEXACT) { |
c->apply_window_int16 = ff_apply_window_int16_sse2; |
} else if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) { |
c->apply_window_int16 = ff_apply_window_int16_round_sse2; |
} |
c->bswap_buf = ff_bswap32_buf_sse2; |
#endif /* HAVE_SSE2_EXTERNAL */ |
} |
static av_cold void dsputil_init_ssse3(DSPContext *c, AVCodecContext *avctx, |
int cpu_flags) |
{ |
#if HAVE_SSSE3_EXTERNAL |
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_ssse3; |
if (cpu_flags & AV_CPU_FLAG_SSE4) // not really SSE4, just slow on Conroe |
c->add_hfyu_left_prediction = ff_add_hfyu_left_prediction_sse4; |
if (cpu_flags & AV_CPU_FLAG_ATOM) |
c->apply_window_int16 = ff_apply_window_int16_ssse3_atom; |
else |
c->apply_window_int16 = ff_apply_window_int16_ssse3; |
if (!(cpu_flags & (AV_CPU_FLAG_SSE42 | AV_CPU_FLAG_3DNOW))) // cachesplit |
c->scalarproduct_and_madd_int16 = ff_scalarproduct_and_madd_int16_ssse3; |
c->bswap_buf = ff_bswap32_buf_ssse3; |
#endif /* HAVE_SSSE3_EXTERNAL */ |
} |
static av_cold void dsputil_init_sse4(DSPContext *c, AVCodecContext *avctx, |
int cpu_flags) |
{ |
#if HAVE_SSE4_EXTERNAL |
c->vector_clip_int32 = ff_vector_clip_int32_sse4; |
#endif /* HAVE_SSE4_EXTERNAL */ |
} |
av_cold void ff_dsputil_init_x86(DSPContext *c, AVCodecContext *avctx) |
{ |
int cpu_flags = av_get_cpu_flags(); |
#if HAVE_7REGS && HAVE_INLINE_ASM |
if (HAVE_MMX && cpu_flags & AV_CPU_FLAG_CMOV) |
c->add_hfyu_median_prediction = ff_add_hfyu_median_prediction_cmov; |
#endif |
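/* Each init call below may overwrite function pointers installed by an |
 * earlier, slower level (MMX -> MMXEXT -> SSE -> SSE2 -> SSSE3 -> SSE4), |
 * so the fastest implementation supported by cpu_flags ends up in use. */ |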
if (X86_MMX(cpu_flags)) { |
#if HAVE_INLINE_ASM |
const int idct_algo = avctx->idct_algo; |
if (avctx->lowres == 0 && avctx->bits_per_raw_sample <= 8) { |
if (idct_algo == FF_IDCT_AUTO || idct_algo == FF_IDCT_SIMPLEMMX) { |
c->idct_put = ff_simple_idct_put_mmx; |
c->idct_add = ff_simple_idct_add_mmx; |
c->idct = ff_simple_idct_mmx; |
c->idct_permutation_type = FF_SIMPLE_IDCT_PERM; |
} else if (idct_algo == FF_IDCT_XVIDMMX) { |
c->idct_put = ff_idct_xvid_mmx_put; |
c->idct_add = ff_idct_xvid_mmx_add; |
c->idct = ff_idct_xvid_mmx; |
} |
} |
#endif /* HAVE_INLINE_ASM */ |
dsputil_init_mmx(c, avctx, cpu_flags); |
} |
if (X86_MMXEXT(cpu_flags)) |
dsputil_init_mmxext(c, avctx, cpu_flags); |
if (X86_SSE(cpu_flags)) |
dsputil_init_sse(c, avctx, cpu_flags); |
if (X86_SSE2(cpu_flags)) |
dsputil_init_sse2(c, avctx, cpu_flags); |
if (EXTERNAL_SSSE3(cpu_flags)) |
dsputil_init_ssse3(c, avctx, cpu_flags); |
if (EXTERNAL_SSE4(cpu_flags)) |
dsputil_init_sse4(c, avctx, cpu_flags); |
if (CONFIG_ENCODERS) |
ff_dsputilenc_init_mmx(c, avctx); |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dsputil_mmx.c |
---|
0,0 → 1,637 |
/* |
* MMX optimized DSP utils |
* Copyright (c) 2000, 2001 Fabrice Bellard |
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
* |
* MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
*/ |
#include "config.h" |
#include "libavutil/avassert.h" |
#include "libavutil/cpu.h" |
#include "libavutil/x86/asm.h" |
#include "libavcodec/videodsp.h" |
#include "constants.h" |
#include "dsputil_x86.h" |
#include "diracdsp_mmx.h" |
#if HAVE_INLINE_ASM |
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, |
int line_size) |
{ |
const int16_t *p; |
uint8_t *pix; |
/* read the pixels */ |
p = block; |
pix = pixels; |
/* unrolled loop */ |
__asm__ volatile ( |
"movq (%3), %%mm0 \n\t" |
"movq 8(%3), %%mm1 \n\t" |
"movq 16(%3), %%mm2 \n\t" |
"movq 24(%3), %%mm3 \n\t" |
"movq 32(%3), %%mm4 \n\t" |
"movq 40(%3), %%mm5 \n\t" |
"movq 48(%3), %%mm6 \n\t" |
"movq 56(%3), %%mm7 \n\t" |
"packuswb %%mm1, %%mm0 \n\t" |
"packuswb %%mm3, %%mm2 \n\t" |
"packuswb %%mm5, %%mm4 \n\t" |
"packuswb %%mm7, %%mm6 \n\t" |
"movq %%mm0, (%0) \n\t" |
"movq %%mm2, (%0, %1) \n\t" |
"movq %%mm4, (%0, %1, 2) \n\t" |
"movq %%mm6, (%0, %2) \n\t" |
:: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), |
"r"(p) |
: "memory"); |
pix += line_size * 4; |
p += 32; |
// If this were an exact copy of the code above, the compiler would |
// generate some very strange code, hence the "r" constraints. |
__asm__ volatile ( |
"movq (%3), %%mm0 \n\t" |
"movq 8(%3), %%mm1 \n\t" |
"movq 16(%3), %%mm2 \n\t" |
"movq 24(%3), %%mm3 \n\t" |
"movq 32(%3), %%mm4 \n\t" |
"movq 40(%3), %%mm5 \n\t" |
"movq 48(%3), %%mm6 \n\t" |
"movq 56(%3), %%mm7 \n\t" |
"packuswb %%mm1, %%mm0 \n\t" |
"packuswb %%mm3, %%mm2 \n\t" |
"packuswb %%mm5, %%mm4 \n\t" |
"packuswb %%mm7, %%mm6 \n\t" |
"movq %%mm0, (%0) \n\t" |
"movq %%mm2, (%0, %1) \n\t" |
"movq %%mm4, (%0, %1, 2) \n\t" |
"movq %%mm6, (%0, %2) \n\t" |
:: "r"(pix), "r"((x86_reg)line_size), "r"((x86_reg)line_size * 3), "r"(p) |
: "memory"); |
} |
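/* For reference, a minimal scalar sketch of what the two asm blocks above |
 * compute (hypothetical helper, not part of the original file): packuswb |
 * saturates each signed 16-bit coefficient into the 0..255 byte range. */ |
#if 0 |
static void put_pixels_clamped_sketch(const int16_t *block, uint8_t *pixels, |
                                      int line_size) |
{ |
    for (int i = 0; i < 8; i++) {               /* 8 rows of the block    */ |
        for (int j = 0; j < 8; j++) {           /* 8 coefficients per row */ |
            int v = block[i * 8 + j]; |
            pixels[j] = v < 0 ? 0 : (v > 255 ? 255 : v); |
        } |
        pixels += line_size; |
    } |
} |
#endif |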
#define put_signed_pixels_clamped_mmx_half(off) \ |
"movq "#off"(%2), %%mm1 \n\t" \ |
"movq 16 + "#off"(%2), %%mm2 \n\t" \ |
"movq 32 + "#off"(%2), %%mm3 \n\t" \ |
"movq 48 + "#off"(%2), %%mm4 \n\t" \ |
"packsswb 8 + "#off"(%2), %%mm1 \n\t" \ |
"packsswb 24 + "#off"(%2), %%mm2 \n\t" \ |
"packsswb 40 + "#off"(%2), %%mm3 \n\t" \ |
"packsswb 56 + "#off"(%2), %%mm4 \n\t" \ |
"paddb %%mm0, %%mm1 \n\t" \ |
"paddb %%mm0, %%mm2 \n\t" \ |
"paddb %%mm0, %%mm3 \n\t" \ |
"paddb %%mm0, %%mm4 \n\t" \ |
"movq %%mm1, (%0) \n\t" \ |
"movq %%mm2, (%0, %3) \n\t" \ |
"movq %%mm3, (%0, %3, 2) \n\t" \ |
"movq %%mm4, (%0, %1) \n\t" |
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, |
int line_size) |
{ |
x86_reg line_skip = line_size; |
x86_reg line_skip3; |
__asm__ volatile ( |
"movq "MANGLE(ff_pb_80)", %%mm0 \n\t" |
"lea (%3, %3, 2), %1 \n\t" |
put_signed_pixels_clamped_mmx_half(0) |
"lea (%0, %3, 4), %0 \n\t" |
put_signed_pixels_clamped_mmx_half(64) |
: "+&r"(pixels), "=&r"(line_skip3) |
: "r"(block), "r"(line_skip) |
: "memory"); |
} |
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, |
int line_size) |
{ |
const int16_t *p; |
uint8_t *pix; |
int i; |
/* read the pixels */ |
p = block; |
pix = pixels; |
MOVQ_ZERO(mm7); |
i = 4; |
do { |
__asm__ volatile ( |
"movq (%2), %%mm0 \n\t" |
"movq 8(%2), %%mm1 \n\t" |
"movq 16(%2), %%mm2 \n\t" |
"movq 24(%2), %%mm3 \n\t" |
"movq %0, %%mm4 \n\t" |
"movq %1, %%mm6 \n\t" |
"movq %%mm4, %%mm5 \n\t" |
"punpcklbw %%mm7, %%mm4 \n\t" |
"punpckhbw %%mm7, %%mm5 \n\t" |
"paddsw %%mm4, %%mm0 \n\t" |
"paddsw %%mm5, %%mm1 \n\t" |
"movq %%mm6, %%mm5 \n\t" |
"punpcklbw %%mm7, %%mm6 \n\t" |
"punpckhbw %%mm7, %%mm5 \n\t" |
"paddsw %%mm6, %%mm2 \n\t" |
"paddsw %%mm5, %%mm3 \n\t" |
"packuswb %%mm1, %%mm0 \n\t" |
"packuswb %%mm3, %%mm2 \n\t" |
"movq %%mm0, %0 \n\t" |
"movq %%mm2, %1 \n\t" |
: "+m"(*pix), "+m"(*(pix + line_size)) |
: "r"(p) |
: "memory"); |
pix += line_size * 2; |
p += 16; |
} while (--i); |
} |
#define CLEAR_BLOCKS(name, n) \ |
void name(int16_t *blocks) \ |
{ \ |
__asm__ volatile ( \ |
"pxor %%mm7, %%mm7 \n\t" \ |
"mov %1, %%"REG_a" \n\t" \ |
"1: \n\t" \ |
"movq %%mm7, (%0, %%"REG_a") \n\t" \ |
"movq %%mm7, 8(%0, %%"REG_a") \n\t" \ |
"movq %%mm7, 16(%0, %%"REG_a") \n\t" \ |
"movq %%mm7, 24(%0, %%"REG_a") \n\t" \ |
"add $32, %%"REG_a" \n\t" \ |
"js 1b \n\t" \ |
:: "r"(((uint8_t *)blocks) + 128 * n), \ |
"i"(-128 * n) \ |
: "%"REG_a \ |
); \ |
} |
CLEAR_BLOCKS(ff_clear_blocks_mmx, 6) |
CLEAR_BLOCKS(ff_clear_block_mmx, 1) |
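/* CLEAR_BLOCKS zeroes n consecutive 64-coefficient int16_t blocks |
 * (128 bytes each); the negative REG_a offset counts up toward zero, so |
 * the 'js 1b' loop needs no separate comparison. */ |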
void ff_clear_block_sse(int16_t *block) |
{ |
__asm__ volatile ( |
"xorps %%xmm0, %%xmm0 \n" |
"movaps %%xmm0, (%0) \n" |
"movaps %%xmm0, 16(%0) \n" |
"movaps %%xmm0, 32(%0) \n" |
"movaps %%xmm0, 48(%0) \n" |
"movaps %%xmm0, 64(%0) \n" |
"movaps %%xmm0, 80(%0) \n" |
"movaps %%xmm0, 96(%0) \n" |
"movaps %%xmm0, 112(%0) \n" |
:: "r"(block) |
: "memory" |
); |
} |
void ff_clear_blocks_sse(int16_t *blocks) |
{ |
__asm__ volatile ( |
"xorps %%xmm0, %%xmm0 \n" |
"mov %1, %%"REG_a" \n" |
"1: \n" |
"movaps %%xmm0, (%0, %%"REG_a") \n" |
"movaps %%xmm0, 16(%0, %%"REG_a") \n" |
"movaps %%xmm0, 32(%0, %%"REG_a") \n" |
"movaps %%xmm0, 48(%0, %%"REG_a") \n" |
"movaps %%xmm0, 64(%0, %%"REG_a") \n" |
"movaps %%xmm0, 80(%0, %%"REG_a") \n" |
"movaps %%xmm0, 96(%0, %%"REG_a") \n" |
"movaps %%xmm0, 112(%0, %%"REG_a") \n" |
"add $128, %%"REG_a" \n" |
"js 1b \n" |
:: "r"(((uint8_t *)blocks) + 128 * 6), |
"i"(-128 * 6) |
: "%"REG_a |
); |
} |
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w) |
{ |
x86_reg i = 0; |
__asm__ volatile ( |
"jmp 2f \n\t" |
"1: \n\t" |
"movq (%1, %0), %%mm0 \n\t" |
"movq (%2, %0), %%mm1 \n\t" |
"paddb %%mm0, %%mm1 \n\t" |
"movq %%mm1, (%2, %0) \n\t" |
"movq 8(%1, %0), %%mm0 \n\t" |
"movq 8(%2, %0), %%mm1 \n\t" |
"paddb %%mm0, %%mm1 \n\t" |
"movq %%mm1, 8(%2, %0) \n\t" |
"add $16, %0 \n\t" |
"2: \n\t" |
"cmp %3, %0 \n\t" |
"js 1b \n\t" |
: "+r"(i) |
: "r"(src), "r"(dst), "r"((x86_reg)w - 15) |
); |
for ( ; i < w; i++) |
dst[i + 0] += src[i + 0]; |
} |
/* Draw the edges of width 'w' of an image of size width x height. |
 * This MMX version can only handle w == 4, w == 8 or w == 16. */ |
void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, |
int w, int h, int sides) |
{ |
uint8_t *ptr, *last_line; |
int i; |
last_line = buf + (height - 1) * wrap; |
/* left and right */ |
ptr = buf; |
if (w == 8) { |
__asm__ volatile ( |
"1: \n\t" |
"movd (%0), %%mm0 \n\t" |
"punpcklbw %%mm0, %%mm0 \n\t" |
"punpcklwd %%mm0, %%mm0 \n\t" |
"punpckldq %%mm0, %%mm0 \n\t" |
"movq %%mm0, -8(%0) \n\t" |
"movq -8(%0, %2), %%mm1 \n\t" |
"punpckhbw %%mm1, %%mm1 \n\t" |
"punpckhwd %%mm1, %%mm1 \n\t" |
"punpckhdq %%mm1, %%mm1 \n\t" |
"movq %%mm1, (%0, %2) \n\t" |
"add %1, %0 \n\t" |
"cmp %3, %0 \n\t" |
"jb 1b \n\t" |
: "+r"(ptr) |
: "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) |
); |
} else if(w==16){ |
__asm__ volatile ( |
"1: \n\t" |
"movd (%0), %%mm0 \n\t" |
"punpcklbw %%mm0, %%mm0 \n\t" |
"punpcklwd %%mm0, %%mm0 \n\t" |
"punpckldq %%mm0, %%mm0 \n\t" |
"movq %%mm0, -8(%0) \n\t" |
"movq %%mm0, -16(%0) \n\t" |
"movq -8(%0, %2), %%mm1 \n\t" |
"punpckhbw %%mm1, %%mm1 \n\t" |
"punpckhwd %%mm1, %%mm1 \n\t" |
"punpckhdq %%mm1, %%mm1 \n\t" |
"movq %%mm1, (%0, %2) \n\t" |
"movq %%mm1, 8(%0, %2) \n\t" |
"add %1, %0 \n\t" |
"cmp %3, %0 \n\t" |
"jb 1b \n\t" |
: "+r"(ptr) |
: "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) |
); |
} else { |
av_assert1(w == 4); |
__asm__ volatile ( |
"1: \n\t" |
"movd (%0), %%mm0 \n\t" |
"punpcklbw %%mm0, %%mm0 \n\t" |
"punpcklwd %%mm0, %%mm0 \n\t" |
"movd %%mm0, -4(%0) \n\t" |
"movd -4(%0, %2), %%mm1 \n\t" |
"punpcklbw %%mm1, %%mm1 \n\t" |
"punpckhwd %%mm1, %%mm1 \n\t" |
"punpckhdq %%mm1, %%mm1 \n\t" |
"movd %%mm1, (%0, %2) \n\t" |
"add %1, %0 \n\t" |
"cmp %3, %0 \n\t" |
"jb 1b \n\t" |
: "+r"(ptr) |
: "r"((x86_reg)wrap), "r"((x86_reg)width), "r"(ptr + wrap * height) |
); |
} |
/* top and bottom (and hopefully also the corners) */ |
if (sides & EDGE_TOP) { |
for (i = 0; i < h; i += 4) { |
ptr = buf - (i + 1) * wrap - w; |
__asm__ volatile ( |
"1: \n\t" |
"movq (%1, %0), %%mm0 \n\t" |
"movq %%mm0, (%0) \n\t" |
"movq %%mm0, (%0, %2) \n\t" |
"movq %%mm0, (%0, %2, 2) \n\t" |
"movq %%mm0, (%0, %3) \n\t" |
"add $8, %0 \n\t" |
"cmp %4, %0 \n\t" |
"jb 1b \n\t" |
: "+r"(ptr) |
: "r"((x86_reg)buf - (x86_reg)ptr - w), "r"((x86_reg) -wrap), |
"r"((x86_reg) -wrap * 3), "r"(ptr + width + 2 * w) |
); |
} |
} |
if (sides & EDGE_BOTTOM) { |
for (i = 0; i < h; i += 4) { |
ptr = last_line + (i + 1) * wrap - w; |
__asm__ volatile ( |
"1: \n\t" |
"movq (%1, %0), %%mm0 \n\t" |
"movq %%mm0, (%0) \n\t" |
"movq %%mm0, (%0, %2) \n\t" |
"movq %%mm0, (%0, %2, 2) \n\t" |
"movq %%mm0, (%0, %3) \n\t" |
"add $8, %0 \n\t" |
"cmp %4, %0 \n\t" |
"jb 1b \n\t" |
: "+r"(ptr) |
: "r"((x86_reg)last_line - (x86_reg)ptr - w), |
"r"((x86_reg)wrap), "r"((x86_reg)wrap * 3), |
"r"(ptr + width + 2 * w) |
); |
} |
} |
} |
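/* Summary: the loops above replicate the outermost left/right column |
 * pixels 'w' bytes beyond each row, then the EDGE_TOP/EDGE_BOTTOM loops |
 * copy whole padded rows outward, so motion compensation may safely read |
 * up to 'w'/'h' pixels outside the picture. */ |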
typedef void emulated_edge_mc_func(uint8_t *dst, ptrdiff_t dst_stride, |
const uint8_t *src, ptrdiff_t src_linesize, |
int block_w, int block_h, |
int src_x, int src_y, int w, int h); |
static av_always_inline void gmc(uint8_t *dst, uint8_t *src, |
int stride, int h, int ox, int oy, |
int dxx, int dxy, int dyx, int dyy, |
int shift, int r, int width, int height, |
emulated_edge_mc_func *emu_edge_fn) |
{ |
const int w = 8; |
const int ix = ox >> (16 + shift); |
const int iy = oy >> (16 + shift); |
const int oxs = ox >> 4; |
const int oys = oy >> 4; |
const int dxxs = dxx >> 4; |
const int dxys = dxy >> 4; |
const int dyxs = dyx >> 4; |
const int dyys = dyy >> 4; |
const uint16_t r4[4] = { r, r, r, r }; |
const uint16_t dxy4[4] = { dxys, dxys, dxys, dxys }; |
const uint16_t dyy4[4] = { dyys, dyys, dyys, dyys }; |
const uint64_t shift2 = 2 * shift; |
#define MAX_STRIDE 4096U |
#define MAX_H 8U |
uint8_t edge_buf[(MAX_H + 1) * MAX_STRIDE]; |
int x, y; |
const int dxw = (dxx - (1 << (16 + shift))) * (w - 1); |
const int dyh = (dyy - (1 << (16 + shift))) * (h - 1); |
const int dxh = dxy * (h - 1); |
const int dyw = dyx * (w - 1); |
int need_emu = (unsigned)ix >= width - w || |
(unsigned)iy >= height - h; |
if ( // non-constant fullpel offset (3% of blocks) |
((ox ^ (ox + dxw)) | (ox ^ (ox + dxh)) | (ox ^ (ox + dxw + dxh)) | |
(oy ^ (oy + dyw)) | (oy ^ (oy + dyh)) | (oy ^ (oy + dyw + dyh))) >> (16 + shift) |
// uses more than 16 bits of subpel mv (only at huge resolution) |
|| (dxx | dxy | dyx | dyy) & 15 |
|| (need_emu && (h > MAX_H || stride > MAX_STRIDE))) { |
// FIXME could still use mmx for some of the rows |
ff_gmc_c(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, |
shift, r, width, height); |
return; |
} |
src += ix + iy * stride; |
if (need_emu) { |
emu_edge_fn(edge_buf, stride, src, stride, w + 1, h + 1, ix, iy, width, height); |
src = edge_buf; |
} |
__asm__ volatile ( |
"movd %0, %%mm6 \n\t" |
"pxor %%mm7, %%mm7 \n\t" |
"punpcklwd %%mm6, %%mm6 \n\t" |
"punpcklwd %%mm6, %%mm6 \n\t" |
:: "r"(1<<shift) |
); |
for (x = 0; x < w; x += 4) { |
uint16_t dx4[4] = { oxs - dxys + dxxs * (x + 0), |
oxs - dxys + dxxs * (x + 1), |
oxs - dxys + dxxs * (x + 2), |
oxs - dxys + dxxs * (x + 3) }; |
uint16_t dy4[4] = { oys - dyys + dyxs * (x + 0), |
oys - dyys + dyxs * (x + 1), |
oys - dyys + dyxs * (x + 2), |
oys - dyys + dyxs * (x + 3) }; |
for (y = 0; y < h; y++) { |
__asm__ volatile ( |
"movq %0, %%mm4 \n\t" |
"movq %1, %%mm5 \n\t" |
"paddw %2, %%mm4 \n\t" |
"paddw %3, %%mm5 \n\t" |
"movq %%mm4, %0 \n\t" |
"movq %%mm5, %1 \n\t" |
"psrlw $12, %%mm4 \n\t" |
"psrlw $12, %%mm5 \n\t" |
: "+m"(*dx4), "+m"(*dy4) |
: "m"(*dxy4), "m"(*dyy4) |
); |
__asm__ volatile ( |
"movq %%mm6, %%mm2 \n\t" |
"movq %%mm6, %%mm1 \n\t" |
"psubw %%mm4, %%mm2 \n\t" |
"psubw %%mm5, %%mm1 \n\t" |
"movq %%mm2, %%mm0 \n\t" |
"movq %%mm4, %%mm3 \n\t" |
"pmullw %%mm1, %%mm0 \n\t" // (s - dx) * (s - dy) |
"pmullw %%mm5, %%mm3 \n\t" // dx * dy |
"pmullw %%mm5, %%mm2 \n\t" // (s - dx) * dy |
"pmullw %%mm4, %%mm1 \n\t" // dx * (s - dy) |
"movd %4, %%mm5 \n\t" |
"movd %3, %%mm4 \n\t" |
"punpcklbw %%mm7, %%mm5 \n\t" |
"punpcklbw %%mm7, %%mm4 \n\t" |
"pmullw %%mm5, %%mm3 \n\t" // src[1, 1] * dx * dy |
"pmullw %%mm4, %%mm2 \n\t" // src[0, 1] * (s - dx) * dy |
"movd %2, %%mm5 \n\t" |
"movd %1, %%mm4 \n\t" |
"punpcklbw %%mm7, %%mm5 \n\t" |
"punpcklbw %%mm7, %%mm4 \n\t" |
"pmullw %%mm5, %%mm1 \n\t" // src[1, 0] * dx * (s - dy) |
"pmullw %%mm4, %%mm0 \n\t" // src[0, 0] * (s - dx) * (s - dy) |
"paddw %5, %%mm1 \n\t" |
"paddw %%mm3, %%mm2 \n\t" |
"paddw %%mm1, %%mm0 \n\t" |
"paddw %%mm2, %%mm0 \n\t" |
"psrlw %6, %%mm0 \n\t" |
"packuswb %%mm0, %%mm0 \n\t" |
"movd %%mm0, %0 \n\t" |
: "=m"(dst[x + y * stride]) |
: "m"(src[0]), "m"(src[1]), |
"m"(src[stride]), "m"(src[stride + 1]), |
"m"(*r4), "m"(shift2) |
); |
src += stride; |
} |
src += 4 - h * stride; |
} |
} |
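/* Per output pixel, the inner asm evaluates the standard bilinear blend, |
 * with s = 1 << shift and dx, dy the per-pixel fractional offsets kept |
 * in mm4/mm5: |
 * |
 *   dst = (src[0,0]*(s-dx)*(s-dy) + src[1,0]*dx*(s-dy) + |
 *          src[0,1]*(s-dx)*dy     + src[1,1]*dx*dy + r) >> (2*shift) |
 */ |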
#if CONFIG_VIDEODSP |
#if HAVE_YASM |
#if ARCH_X86_32 |
void ff_gmc_mmx(uint8_t *dst, uint8_t *src, |
int stride, int h, int ox, int oy, |
int dxx, int dxy, int dyx, int dyy, |
int shift, int r, int width, int height) |
{ |
gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, |
width, height, &ff_emulated_edge_mc_8); |
} |
#endif |
void ff_gmc_sse(uint8_t *dst, uint8_t *src, |
int stride, int h, int ox, int oy, |
int dxx, int dxy, int dyx, int dyy, |
int shift, int r, int width, int height) |
{ |
gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, |
width, height, &ff_emulated_edge_mc_8); |
} |
#else |
void ff_gmc_mmx(uint8_t *dst, uint8_t *src, |
int stride, int h, int ox, int oy, |
int dxx, int dxy, int dyx, int dyy, |
int shift, int r, int width, int height) |
{ |
gmc(dst, src, stride, h, ox, oy, dxx, dxy, dyx, dyy, shift, r, |
width, height, &ff_emulated_edge_mc_8); |
} |
#endif |
#endif |
#if CONFIG_DIRAC_DECODER |
#define DIRAC_PIXOP(OPNAME2, OPNAME, EXT)\ |
void ff_ ## OPNAME2 ## _dirac_pixels8_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\ |
{\ |
if (h&3)\ |
ff_ ## OPNAME2 ## _dirac_pixels8_c(dst, src, stride, h);\ |
else\ |
OPNAME ## _pixels8_ ## EXT(dst, src[0], stride, h);\ |
}\ |
void ff_ ## OPNAME2 ## _dirac_pixels16_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\ |
{\ |
if (h&3)\ |
ff_ ## OPNAME2 ## _dirac_pixels16_c(dst, src, stride, h);\ |
else\ |
OPNAME ## _pixels16_ ## EXT(dst, src[0], stride, h);\ |
}\ |
void ff_ ## OPNAME2 ## _dirac_pixels32_ ## EXT(uint8_t *dst, const uint8_t *src[5], int stride, int h)\ |
{\ |
if (h&3) {\ |
ff_ ## OPNAME2 ## _dirac_pixels32_c(dst, src, stride, h);\ |
} else {\ |
OPNAME ## _pixels16_ ## EXT(dst , src[0] , stride, h);\ |
OPNAME ## _pixels16_ ## EXT(dst+16, src[0]+16, stride, h);\ |
}\ |
} |
#if HAVE_MMX_INLINE |
PIXELS16(static, ff_avg, , , _mmxext) |
DIRAC_PIXOP(put, ff_put, mmx) |
DIRAC_PIXOP(avg, ff_avg, mmx) |
#endif |
#if HAVE_YASM |
DIRAC_PIXOP(avg, ff_avg, mmxext) |
void ff_put_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) |
{ |
if (h&3) |
ff_put_dirac_pixels16_c(dst, src, stride, h); |
else |
ff_put_pixels16_sse2(dst, src[0], stride, h); |
} |
void ff_avg_dirac_pixels16_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) |
{ |
if (h&3) |
ff_avg_dirac_pixels16_c(dst, src, stride, h); |
else |
ff_avg_pixels16_sse2(dst, src[0], stride, h); |
} |
void ff_put_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) |
{ |
if (h&3) { |
ff_put_dirac_pixels32_c(dst, src, stride, h); |
} else { |
ff_put_pixels16_sse2(dst , src[0] , stride, h); |
ff_put_pixels16_sse2(dst+16, src[0]+16, stride, h); |
} |
} |
void ff_avg_dirac_pixels32_sse2(uint8_t *dst, const uint8_t *src[5], int stride, int h) |
{ |
if (h&3) { |
ff_avg_dirac_pixels32_c(dst, src, stride, h); |
} else { |
ff_avg_pixels16_sse2(dst , src[0] , stride, h); |
ff_avg_pixels16_sse2(dst+16, src[0]+16, stride, h); |
} |
} |
#endif |
#endif |
void ff_vector_clipf_sse(float *dst, const float *src, |
float min, float max, int len) |
{ |
x86_reg i = (len - 16) * 4; |
__asm__ volatile ( |
"movss %3, %%xmm4 \n\t" |
"movss %4, %%xmm5 \n\t" |
"shufps $0, %%xmm4, %%xmm4 \n\t" |
"shufps $0, %%xmm5, %%xmm5 \n\t" |
"1: \n\t" |
"movaps (%2, %0), %%xmm0 \n\t" // 3/1 on intel |
"movaps 16(%2, %0), %%xmm1 \n\t" |
"movaps 32(%2, %0), %%xmm2 \n\t" |
"movaps 48(%2, %0), %%xmm3 \n\t" |
"maxps %%xmm4, %%xmm0 \n\t" |
"maxps %%xmm4, %%xmm1 \n\t" |
"maxps %%xmm4, %%xmm2 \n\t" |
"maxps %%xmm4, %%xmm3 \n\t" |
"minps %%xmm5, %%xmm0 \n\t" |
"minps %%xmm5, %%xmm1 \n\t" |
"minps %%xmm5, %%xmm2 \n\t" |
"minps %%xmm5, %%xmm3 \n\t" |
"movaps %%xmm0, (%1, %0) \n\t" |
"movaps %%xmm1, 16(%1, %0) \n\t" |
"movaps %%xmm2, 32(%1, %0) \n\t" |
"movaps %%xmm3, 48(%1, %0) \n\t" |
"sub $64, %0 \n\t" |
"jge 1b \n\t" |
: "+&r"(i) |
: "r"(dst), "r"(src), "m"(min), "m"(max) |
: "memory" |
); |
} |
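/* Scalar equivalent of the loop above: for each i, |
 * dst[i] = FFMIN(FFMAX(src[i], min), max); the asm clips 16 floats per |
 * iteration with maxps/minps, walking backwards from the buffer's end. */ |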
#endif /* HAVE_INLINE_ASM */ |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dsputil_qns_template.c |
---|
0,0 → 1,101 |
/* |
* DSP utils : QNS functions are compiled 3 times for mmx/3dnow/ssse3 |
* Copyright (c) 2004 Michael Niedermayer |
* |
* MMX optimization by Michael Niedermayer <michaelni@gmx.at> |
* 3DNow! and SSSE3 optimization by Zuxy Meng <zuxy.meng@gmail.com> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#define MAX_ABS (512 >> (SCALE_OFFSET>0 ? SCALE_OFFSET : 0)) |
static int DEF(try_8x8basis)(int16_t rem[64], int16_t weight[64], int16_t basis[64], int scale) |
{ |
x86_reg i=0; |
av_assert2(FFABS(scale) < MAX_ABS); |
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; |
SET_RND(mm6); |
__asm__ volatile( |
"pxor %%mm7, %%mm7 \n\t" |
"movd %4, %%mm5 \n\t" |
"punpcklwd %%mm5, %%mm5 \n\t" |
"punpcklwd %%mm5, %%mm5 \n\t" |
".p2align 4 \n\t" |
"1: \n\t" |
"movq (%1, %0), %%mm0 \n\t" |
"movq 8(%1, %0), %%mm1 \n\t" |
PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6) |
"paddw (%2, %0), %%mm0 \n\t" |
"paddw 8(%2, %0), %%mm1 \n\t" |
"psraw $6, %%mm0 \n\t" |
"psraw $6, %%mm1 \n\t" |
"pmullw (%3, %0), %%mm0 \n\t" |
"pmullw 8(%3, %0), %%mm1 \n\t" |
"pmaddwd %%mm0, %%mm0 \n\t" |
"pmaddwd %%mm1, %%mm1 \n\t" |
"paddd %%mm1, %%mm0 \n\t" |
"psrld $4, %%mm0 \n\t" |
"paddd %%mm0, %%mm7 \n\t" |
"add $16, %0 \n\t" |
"cmp $128, %0 \n\t" //FIXME optimize & bench |
" jb 1b \n\t" |
PHADDD(%%mm7, %%mm6) |
"psrld $2, %%mm7 \n\t" |
"movd %%mm7, %0 \n\t" |
: "+r" (i) |
: "r"(basis), "r"(rem), "r"(weight), "g"(scale) |
); |
return i; |
} |
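/* try_8x8basis() returns the (downscaled) sum over all 64 coefficients of |
 * ((rem[i] + basis[i]*scale') * weight[i])^2, where scale' is the rounded, |
 * shifted scale applied by PMULHRW; i.e. the weighted squared error after |
 * adding the scaled basis vector, which the QNS search uses to decide |
 * whether a candidate adjustment helps. */ |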
static void DEF(add_8x8basis)(int16_t rem[64], int16_t basis[64], int scale) |
{ |
x86_reg i=0; |
if(FFABS(scale) < MAX_ABS){ |
scale<<= 16 + SCALE_OFFSET - BASIS_SHIFT + RECON_SHIFT; |
SET_RND(mm6); |
__asm__ volatile( |
"movd %3, %%mm5 \n\t" |
"punpcklwd %%mm5, %%mm5 \n\t" |
"punpcklwd %%mm5, %%mm5 \n\t" |
".p2align 4 \n\t" |
"1: \n\t" |
"movq (%1, %0), %%mm0 \n\t" |
"movq 8(%1, %0), %%mm1 \n\t" |
PMULHRW(%%mm0, %%mm1, %%mm5, %%mm6) |
"paddw (%2, %0), %%mm0 \n\t" |
"paddw 8(%2, %0), %%mm1 \n\t" |
"movq %%mm0, (%2, %0) \n\t" |
"movq %%mm1, 8(%2, %0) \n\t" |
"add $16, %0 \n\t" |
"cmp $128, %0 \n\t" // FIXME optimize & bench |
" jb 1b \n\t" |
: "+r" (i) |
: "r"(basis), "r"(rem), "g"(scale) |
); |
}else{ |
for(i=0; i<8*8; i++){ |
rem[i] += (basis[i]*scale + (1<<(BASIS_SHIFT - RECON_SHIFT-1)))>>(BASIS_SHIFT - RECON_SHIFT); |
} |
} |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dsputil_x86.c |
---|
0,0 → 1,65 |
/* |
* Copyright (c) 2009 Loren Merritt <lorenm@u.washington.edu> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "config.h" |
#include "libavutil/x86/asm.h" |
#include "dsputil_x86.h" |
#if HAVE_INLINE_ASM |
#if HAVE_7REGS |
void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, |
const uint8_t *diff, int w, |
int *left, int *left_top) |
{ |
x86_reg w2 = -w; |
x86_reg x; |
int l = *left & 0xff; |
int tl = *left_top & 0xff; |
int t; |
__asm__ volatile ( |
"mov %7, %3 \n" |
"1: \n" |
"movzbl (%3, %4), %2 \n" |
"mov %2, %k3 \n" |
"sub %b1, %b3 \n" |
"add %b0, %b3 \n" |
"mov %2, %1 \n" |
"cmp %0, %2 \n" |
"cmovg %0, %2 \n" |
"cmovg %1, %0 \n" |
"cmp %k3, %0 \n" |
"cmovg %k3, %0 \n" |
"mov %7, %3 \n" |
"cmp %2, %0 \n" |
"cmovl %2, %0 \n" |
"add (%6, %4), %b0 \n" |
"mov %b0, (%5, %4) \n" |
"inc %4 \n" |
"jl 1b \n" |
: "+&q"(l), "+&q"(tl), "=&r"(t), "=&q"(x), "+&r"(w2) |
: "r"(dst + w), "r"(diff + w), "rm"(top + w) |
); |
*left = l; |
*left_top = tl; |
} |
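/* A scalar sketch of the median prediction implemented above with cmov. |
 * The helper names are hypothetical; the logic is HuffYUV's standard |
 * median predictor pred = median(left, top, left + top - topleft). */ |
#if 0 |
static int median3(int a, int b, int c) |
{ |
    int t; |
    if (a > b) { t = a; a = b; b = t; }  /* now a <= b        */ |
    if (b > c) b = c;                    /* b = min(b, c)     */ |
    return a > b ? a : b;                /* max(a, min(b, c)) */ |
} |
static void add_hfyu_median_prediction_sketch(uint8_t *dst, const uint8_t *top, |
                                              const uint8_t *diff, int w, |
                                              int *left, int *left_top) |
{ |
    int l = *left & 0xff, tl = *left_top & 0xff; |
    for (int i = 0; i < w; i++) { |
        int t  = top[i]; |
        l      = (median3(l, t, l + t - tl) + diff[i]) & 0xff; |
        dst[i] = l; |
        tl     = t; |
    } |
    *left = l; |
    *left_top = tl; |
} |
#endif |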
#endif |
#endif /* HAVE_INLINE_ASM */ |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dsputil_x86.h |
---|
0,0 → 1,198 |
/* |
* MMX optimized DSP utils |
* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#ifndef AVCODEC_X86_DSPUTIL_MMX_H |
#define AVCODEC_X86_DSPUTIL_MMX_H |
#include <stddef.h> |
#include <stdint.h> |
#include "libavcodec/dsputil.h" |
#include "libavutil/x86/asm.h" |
#include "constants.h" |
#define MOVQ_WONE(regd) \ |
__asm__ volatile ( \ |
"pcmpeqd %%" #regd ", %%" #regd " \n\t" \ |
"psrlw $15, %%" #regd ::) |
#define JUMPALIGN() __asm__ volatile (".p2align 3"::) |
#define MOVQ_ZERO(regd) __asm__ volatile ("pxor %%"#regd", %%"#regd ::) |
#define MOVQ_BFE(regd) \ |
__asm__ volatile ( \ |
"pcmpeqd %%"#regd", %%"#regd" \n\t" \ |
"paddb %%"#regd", %%"#regd" \n\t" ::) |
#ifndef PIC |
#define MOVQ_WTWO(regd) __asm__ volatile ("movq %0, %%"#regd" \n\t" :: "m"(ff_wtwo)) |
#else |
// for shared libraries it is better to access constants this way |
// pcmpeqd -> -1 |
#define MOVQ_WTWO(regd) \ |
__asm__ volatile ( \ |
"pcmpeqd %%"#regd", %%"#regd" \n\t" \ |
"psrlw $15, %%"#regd" \n\t" \ |
"psllw $1, %%"#regd" \n\t"::) |
#endif |
// using regr as temporary and for the output result |
// first argument is unmodified and second is trashed |
// regfe is supposed to contain 0xfefefefefefefefe |
#define PAVGB_MMX_NO_RND(rega, regb, regr, regfe) \ |
"movq "#rega", "#regr" \n\t" \ |
"pand "#regb", "#regr" \n\t" \ |
"pxor "#rega", "#regb" \n\t" \ |
"pand "#regfe", "#regb" \n\t" \ |
"psrlq $1, "#regb" \n\t" \ |
"paddb "#regb", "#regr" \n\t" |
#define PAVGB_MMX(rega, regb, regr, regfe) \ |
"movq "#rega", "#regr" \n\t" \ |
"por "#regb", "#regr" \n\t" \ |
"pxor "#rega", "#regb" \n\t" \ |
"pand "#regfe", "#regb" \n\t" \ |
"psrlq $1, "#regb" \n\t" \ |
"psubb "#regb", "#regr" \n\t" |
// mm6 is supposed to contain 0xfefefefefefefefe |
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ |
"movq "#rega", "#regr" \n\t" \ |
"movq "#regc", "#regp" \n\t" \ |
"pand "#regb", "#regr" \n\t" \ |
"pand "#regd", "#regp" \n\t" \ |
"pxor "#rega", "#regb" \n\t" \ |
"pxor "#regc", "#regd" \n\t" \ |
"pand %%mm6, "#regb" \n\t" \ |
"pand %%mm6, "#regd" \n\t" \ |
"psrlq $1, "#regb" \n\t" \ |
"psrlq $1, "#regd" \n\t" \ |
"paddb "#regb", "#regr" \n\t" \ |
"paddb "#regd", "#regp" \n\t" |
#define PAVGBP_MMX(rega, regb, regr, regc, regd, regp) \ |
"movq "#rega", "#regr" \n\t" \ |
"movq "#regc", "#regp" \n\t" \ |
"por "#regb", "#regr" \n\t" \ |
"por "#regd", "#regp" \n\t" \ |
"pxor "#rega", "#regb" \n\t" \ |
"pxor "#regc", "#regd" \n\t" \ |
"pand %%mm6, "#regb" \n\t" \ |
"pand %%mm6, "#regd" \n\t" \ |
"psrlq $1, "#regd" \n\t" \ |
"psrlq $1, "#regb" \n\t" \ |
"psubb "#regb", "#regr" \n\t" \ |
"psubb "#regd", "#regp" \n\t" |
void ff_dsputilenc_init_mmx(DSPContext* c, AVCodecContext *avctx); |
void ff_dsputil_init_pix_mmx(DSPContext* c, AVCodecContext *avctx); |
void ff_add_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size); |
void ff_put_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size); |
void ff_put_signed_pixels_clamped_mmx(const int16_t *block, uint8_t *pixels, int line_size); |
void ff_clear_block_mmx(int16_t *block); |
void ff_clear_block_sse(int16_t *block); |
void ff_clear_blocks_mmx(int16_t *blocks); |
void ff_clear_blocks_sse(int16_t *blocks); |
void ff_add_bytes_mmx(uint8_t *dst, uint8_t *src, int w); |
void ff_add_hfyu_median_prediction_cmov(uint8_t *dst, const uint8_t *top, |
const uint8_t *diff, int w, |
int *left, int *left_top); |
void ff_draw_edges_mmx(uint8_t *buf, int wrap, int width, int height, |
int w, int h, int sides); |
void ff_gmc_mmx(uint8_t *dst, uint8_t *src, |
int stride, int h, int ox, int oy, |
int dxx, int dxy, int dyx, int dyy, |
int shift, int r, int width, int height); |
void ff_gmc_sse(uint8_t *dst, uint8_t *src, |
int stride, int h, int ox, int oy, |
int dxx, int dxy, int dyx, int dyy, |
int shift, int r, int width, int height); |
void ff_vector_clipf_sse(float *dst, const float *src, |
float min, float max, int len); |
void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_avg_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_pixels8_mmxext(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_avg_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_avg_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_pixels8_xy2_mmx(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_pixels16_xy2_mmx(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_mmx_idct(int16_t *block); |
void ff_mmxext_idct(int16_t *block); |
void ff_deinterlace_line_mmx(uint8_t *dst, |
const uint8_t *lum_m4, const uint8_t *lum_m3, |
const uint8_t *lum_m2, const uint8_t *lum_m1, |
const uint8_t *lum, |
int size); |
void ff_deinterlace_line_inplace_mmx(const uint8_t *lum_m4, |
const uint8_t *lum_m3, |
const uint8_t *lum_m2, |
const uint8_t *lum_m1, |
const uint8_t *lum, int size); |
#define PIXELS16(STATIC, PFX1, PFX2, TYPE, CPUEXT) \ |
STATIC void PFX1 ## _pixels16 ## TYPE ## CPUEXT(uint8_t *block, \ |
const uint8_t *pixels, \ |
ptrdiff_t line_size, \ |
int h) \ |
{ \ |
PFX2 ## PFX1 ## _pixels8 ## TYPE ## CPUEXT(block, pixels, \ |
line_size, h); \ |
PFX2 ## PFX1 ## _pixels8 ## TYPE ## CPUEXT(block + 8, pixels + 8, \ |
line_size, h); \ |
} |
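/* Example: PIXELS16(static, ff_avg, , , _mmxext) expands to a static |
 * ff_avg_pixels16_mmxext() that handles a 16-pixel-wide block as two |
 * side-by-side 8-pixel calls to ff_avg_pixels8_mmxext(). */ |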
#endif /* AVCODEC_X86_DSPUTIL_MMX_H */ |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dsputilenc.asm |
---|
0,0 → 1,487 |
;***************************************************************************** |
;* MMX optimized DSP utils |
;***************************************************************************** |
;* Copyright (c) 2000, 2001 Fabrice Bellard |
;* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;***************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION .text |
%macro DIFF_PIXELS_1 4 |
movh %1, %3 |
movh %2, %4 |
punpcklbw %2, %1 |
punpcklbw %1, %1 |
psubw %1, %2 |
%endmacro |
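; Note: DIFF_PIXELS_1 needs no zero register: after the two punpcklbw, each |
; word lane of %1 holds p1*256 + p1 while %2 holds p1*256 + p2, so the |
; psubw leaves exactly p1 - p2 in every lane. |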
; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3 |
; %6=temporary storage location |
; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64) |
%macro DIFF_PIXELS_8 6 |
DIFF_PIXELS_1 m0, m7, [%1 +%3], [%2 +%3] |
DIFF_PIXELS_1 m1, m7, [%1+%4 +%3], [%2+%4 +%3] |
DIFF_PIXELS_1 m2, m7, [%1+%4*2+%3], [%2+%4*2+%3] |
add %1, %5 |
add %2, %5 |
DIFF_PIXELS_1 m3, m7, [%1 +%3], [%2 +%3] |
DIFF_PIXELS_1 m4, m7, [%1+%4 +%3], [%2+%4 +%3] |
DIFF_PIXELS_1 m5, m7, [%1+%4*2+%3], [%2+%4*2+%3] |
DIFF_PIXELS_1 m6, m7, [%1+%5 +%3], [%2+%5 +%3] |
%ifdef m8 |
DIFF_PIXELS_1 m7, m8, [%1+%4*4+%3], [%2+%4*4+%3] |
%else |
mova [%6], m0 |
DIFF_PIXELS_1 m7, m0, [%1+%4*4+%3], [%2+%4*4+%3] |
mova m0, [%6] |
%endif |
sub %1, %5 |
sub %2, %5 |
%endmacro |
%macro HADAMARD8 0 |
SUMSUB_BADC w, 0, 1, 2, 3 |
SUMSUB_BADC w, 4, 5, 6, 7 |
SUMSUB_BADC w, 0, 2, 1, 3 |
SUMSUB_BADC w, 4, 6, 5, 7 |
SUMSUB_BADC w, 0, 4, 1, 5 |
SUMSUB_BADC w, 2, 6, 3, 7 |
%endmacro |
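; HADAMARD8: three rounds of SUMSUB_BADC butterflies (pairs, then quads, |
; then the two halves) compute an 8-point Hadamard transform entirely in |
; registers m0-m7. |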
%macro ABS1_SUM 3 |
ABS1 %1, %2 |
paddusw %3, %1 |
%endmacro |
%macro ABS2_SUM 6 |
ABS2 %1, %2, %3, %4 |
paddusw %5, %1 |
paddusw %6, %2 |
%endmacro |
%macro ABS_SUM_8x8_64 1 |
ABS2 m0, m1, m8, m9 |
ABS2_SUM m2, m3, m8, m9, m0, m1 |
ABS2_SUM m4, m5, m8, m9, m0, m1 |
ABS2_SUM m6, m7, m8, m9, m0, m1 |
paddusw m0, m1 |
%endmacro |
%macro ABS_SUM_8x8_32 1 |
mova [%1], m7 |
ABS1 m0, m7 |
ABS1 m1, m7 |
ABS1_SUM m2, m7, m0 |
ABS1_SUM m3, m7, m1 |
ABS1_SUM m4, m7, m0 |
ABS1_SUM m5, m7, m1 |
ABS1_SUM m6, m7, m0 |
mova m2, [%1] |
ABS1_SUM m2, m7, m1 |
paddusw m0, m1 |
%endmacro |
; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to |
; about 100k on extreme inputs. But that's very unlikely to occur in natural video, |
; and it's even more unlikely to not have any alternative mvs/modes with lower cost. |
%macro HSUM 3 |
%if cpuflag(sse2) |
movhlps %2, %1 |
paddusw %1, %2 |
pshuflw %2, %1, 0xE |
paddusw %1, %2 |
pshuflw %2, %1, 0x1 |
paddusw %1, %2 |
movd %3, %1 |
%elif cpuflag(mmxext) |
pshufw %2, %1, 0xE |
paddusw %1, %2 |
pshufw %2, %1, 0x1 |
paddusw %1, %2 |
movd %3, %1 |
%elif cpuflag(mmx) |
mova %2, %1 |
psrlq %1, 32 |
paddusw %1, %2 |
mova %2, %1 |
psrlq %1, 16 |
paddusw %1, %2 |
movd %3, %1 |
%endif |
%endmacro |
%macro STORE4 5 |
mova [%1+mmsize*0], %2 |
mova [%1+mmsize*1], %3 |
mova [%1+mmsize*2], %4 |
mova [%1+mmsize*3], %5 |
%endmacro |
%macro LOAD4 5 |
mova %2, [%1+mmsize*0] |
mova %3, [%1+mmsize*1] |
mova %4, [%1+mmsize*2] |
mova %5, [%1+mmsize*3] |
%endmacro |
%macro hadamard8_16_wrapper 2 |
cglobal hadamard8_diff, 4, 4, %1 |
%ifndef m8 |
%assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) |
SUB rsp, pad |
%endif |
call hadamard8x8_diff %+ SUFFIX |
%ifndef m8 |
ADD rsp, pad |
%endif |
RET |
cglobal hadamard8_diff16, 5, 6, %1 |
%ifndef m8 |
%assign pad %2*mmsize-(4+stack_offset&(mmsize-1)) |
SUB rsp, pad |
%endif |
call hadamard8x8_diff %+ SUFFIX |
mov r5d, eax |
add r1, 8 |
add r2, 8 |
call hadamard8x8_diff %+ SUFFIX |
add r5d, eax |
cmp r4d, 16 |
jne .done |
lea r1, [r1+r3*8-8] |
lea r2, [r2+r3*8-8] |
call hadamard8x8_diff %+ SUFFIX |
add r5d, eax |
add r1, 8 |
add r2, 8 |
call hadamard8x8_diff %+ SUFFIX |
add r5d, eax |
.done: |
mov eax, r5d |
%ifndef m8 |
ADD rsp, pad |
%endif |
RET |
%endmacro |
%macro HADAMARD8_DIFF 0-1 |
%if cpuflag(sse2) |
hadamard8x8_diff %+ SUFFIX: |
lea r0, [r3*3] |
DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize |
HADAMARD8 |
%if ARCH_X86_64 |
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 |
%else |
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize] |
%endif |
HADAMARD8 |
ABS_SUM_8x8 rsp+gprsize |
HSUM m0, m1, eax |
and eax, 0xFFFF |
ret |
hadamard8_16_wrapper %1, 3 |
%elif cpuflag(mmx) |
ALIGN 16 |
; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2, |
; int stride, int h) |
; r0 = void *s = unused, int h = unused (always 8) |
; note how r1, r2 and r3 are not clobbered in this function, so the 16x16 |
; version can simply call it four times (2x2); that is also why we access |
; rsp+gprsize everywhere: it is the rsp of the calling function |
hadamard8x8_diff %+ SUFFIX: |
lea r0, [r3*3] |
; first 4x8 pixels |
DIFF_PIXELS_8 r1, r2, 0, r3, r0, rsp+gprsize+0x60 |
HADAMARD8 |
mova [rsp+gprsize+0x60], m7 |
TRANSPOSE4x4W 0, 1, 2, 3, 7 |
STORE4 rsp+gprsize, m0, m1, m2, m3 |
mova m7, [rsp+gprsize+0x60] |
TRANSPOSE4x4W 4, 5, 6, 7, 0 |
STORE4 rsp+gprsize+0x40, m4, m5, m6, m7 |
; second 4x8 pixels |
DIFF_PIXELS_8 r1, r2, 4, r3, r0, rsp+gprsize+0x60 |
HADAMARD8 |
mova [rsp+gprsize+0x60], m7 |
TRANSPOSE4x4W 0, 1, 2, 3, 7 |
STORE4 rsp+gprsize+0x20, m0, m1, m2, m3 |
mova m7, [rsp+gprsize+0x60] |
TRANSPOSE4x4W 4, 5, 6, 7, 0 |
LOAD4 rsp+gprsize+0x40, m0, m1, m2, m3 |
HADAMARD8 |
ABS_SUM_8x8_32 rsp+gprsize+0x60 |
mova [rsp+gprsize+0x60], m0 |
LOAD4 rsp+gprsize , m0, m1, m2, m3 |
LOAD4 rsp+gprsize+0x20, m4, m5, m6, m7 |
HADAMARD8 |
ABS_SUM_8x8_32 rsp+gprsize |
paddusw m0, [rsp+gprsize+0x60] |
HSUM m0, m1, eax |
and rax, 0xFFFF |
ret |
hadamard8_16_wrapper 0, 14 |
%endif |
%endmacro |
INIT_MMX mmx |
HADAMARD8_DIFF |
INIT_MMX mmxext |
HADAMARD8_DIFF |
INIT_XMM sse2 |
%if ARCH_X86_64 |
%define ABS_SUM_8x8 ABS_SUM_8x8_64 |
%else |
%define ABS_SUM_8x8 ABS_SUM_8x8_32 |
%endif |
HADAMARD8_DIFF 10 |
INIT_XMM ssse3 |
%define ABS_SUM_8x8 ABS_SUM_8x8_64 |
HADAMARD8_DIFF 9 |
INIT_XMM sse2 |
; sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) |
cglobal sse16, 5, 5, 8 |
shr r4d, 1 |
pxor m0, m0 ; mm0 = 0 |
pxor m7, m7 ; mm7 holds the sum |
.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned |
movu m1, [r1 ] ; mm1 = pix1[0][0-15] |
movu m2, [r2 ] ; mm2 = pix2[0][0-15] |
movu m3, [r1+r3] ; mm3 = pix1[1][0-15] |
movu m4, [r2+r3] ; mm4 = pix2[1][0-15] |
; todo: mm1-mm2, mm3-mm4 |
; algo: subtract mm1 from mm2 with saturation and vice versa |
; OR the result to get the absolute difference |
mova m5, m1 |
mova m6, m3 |
psubusb m1, m2 |
psubusb m3, m4 |
psubusb m2, m5 |
psubusb m4, m6 |
por m2, m1 |
por m4, m3 |
; now convert to 16-bit vectors so we can square them |
mova m1, m2 |
mova m3, m4 |
punpckhbw m2, m0 |
punpckhbw m4, m0 |
punpcklbw m1, m0 ; mm1 now spread over (mm1,mm2) |
punpcklbw m3, m0 ; mm4 now spread over (mm3,mm4) |
pmaddwd m2, m2 |
pmaddwd m4, m4 |
pmaddwd m1, m1 |
pmaddwd m3, m3 |
lea r1, [r1+r3*2] ; pix1 += 2*line_size |
lea r2, [r2+r3*2] ; pix2 += 2*line_size |
paddd m1, m2 |
paddd m3, m4 |
paddd m7, m1 |
paddd m7, m3 |
dec r4 |
jnz .next2lines |
mova m1, m7 |
psrldq m7, 8 ; shift hi qword to lo |
paddd m7, m1 |
mova m1, m7 |
psrldq m7, 4 ; shift hi dword to lo |
paddd m7, m1 |
movd eax, m7 ; return value |
RET |
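; In scalar terms the loop above accumulates sum += (pix1[i] - pix2[i])^2 |
; over h lines: |a - b| is formed as (a-b saturated) OR (b-a saturated), |
; widened to words, then squared and horizontally added with pmaddwd. |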
INIT_MMX mmx |
; get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size) |
cglobal get_pixels, 3,4 |
movsxdifnidn r2, r2d |
add r0, 128 |
mov r3, -128 |
pxor m7, m7 |
.loop: |
mova m0, [r1] |
mova m2, [r1+r2] |
mova m1, m0 |
mova m3, m2 |
punpcklbw m0, m7 |
punpckhbw m1, m7 |
punpcklbw m2, m7 |
punpckhbw m3, m7 |
mova [r0+r3+ 0], m0 |
mova [r0+r3+ 8], m1 |
mova [r0+r3+16], m2 |
mova [r0+r3+24], m3 |
lea r1, [r1+r2*2] |
add r3, 32 |
js .loop |
REP_RET |
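; get_pixels widens an 8x8 block of unsigned bytes into the 64 int16_t |
; coefficients laid out for the DCT; the fully unrolled SSE2 version below |
; does the same with 16-byte stores. |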
INIT_XMM sse2 |
cglobal get_pixels, 3, 4 |
movsxdifnidn r2, r2d |
lea r3, [r2*3] |
pxor m4, m4 |
movh m0, [r1] |
movh m1, [r1+r2] |
movh m2, [r1+r2*2] |
movh m3, [r1+r3] |
lea r1, [r1+r2*4] |
punpcklbw m0, m4 |
punpcklbw m1, m4 |
punpcklbw m2, m4 |
punpcklbw m3, m4 |
mova [r0], m0 |
mova [r0+0x10], m1 |
mova [r0+0x20], m2 |
mova [r0+0x30], m3 |
movh m0, [r1] |
movh m1, [r1+r2*1] |
movh m2, [r1+r2*2] |
movh m3, [r1+r3] |
punpcklbw m0, m4 |
punpcklbw m1, m4 |
punpcklbw m2, m4 |
punpcklbw m3, m4 |
mova [r0+0x40], m0 |
mova [r0+0x50], m1 |
mova [r0+0x60], m2 |
mova [r0+0x70], m3 |
RET |
INIT_MMX mmx |
; diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, stride) |
cglobal diff_pixels, 4,5 |
movsxdifnidn r3, r3d |
pxor m7, m7 |
add r0, 128 |
mov r4, -128 |
.loop: |
mova m0, [r1] |
mova m2, [r2] |
mova m1, m0 |
mova m3, m2 |
punpcklbw m0, m7 |
punpckhbw m1, m7 |
punpcklbw m2, m7 |
punpckhbw m3, m7 |
psubw m0, m2 |
psubw m1, m3 |
mova [r0+r4+0], m0 |
mova [r0+r4+8], m1 |
add r1, r3 |
add r2, r3 |
add r4, 16 |
jne .loop |
REP_RET |
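A scalar model of the routine above (illustrative name): every entry of the 8x8 output block is the difference of the two byte sources, widened to 16 bits:

#include <stdint.h>

static void diff_pixels_ref(int16_t *block, const uint8_t *s1,
                            const uint8_t *s2, int stride)
{
    for (int i = 0; i < 8; i++) {
        for (int j = 0; j < 8; j++)
            block[8 * i + j] = s1[j] - s2[j];
        s1 += stride;
        s2 += stride;
    }
}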
INIT_MMX mmx |
; pix_sum16_mmx(uint8_t * pix, int line_size) |
cglobal pix_sum16, 2, 3 |
movsxdifnidn r1, r1d |
mov r2, r1 |
neg r2 |
shl r2, 4 |
sub r0, r2 |
pxor m7, m7 |
pxor m6, m6 |
.loop: |
mova m0, [r0+r2+0] |
mova m1, [r0+r2+0] |
mova m2, [r0+r2+8] |
mova m3, [r0+r2+8] |
punpcklbw m0, m7 |
punpckhbw m1, m7 |
punpcklbw m2, m7 |
punpckhbw m3, m7 |
paddw m1, m0 |
paddw m3, m2 |
paddw m3, m1 |
paddw m6, m3 |
add r2, r1 |
js .loop |
mova m5, m6 |
psrlq m6, 32 |
paddw m6, m5 |
mova m5, m6 |
psrlq m6, 16 |
paddw m6, m5 |
movd eax, m6 |
and eax, 0xffff |
RET |
INIT_MMX mmx |
; pix_norm1_mmx(uint8_t *pix, int line_size) |
cglobal pix_norm1, 2, 4 |
movsxdifnidn r1, r1d |
mov r2, 16 |
pxor m0, m0 |
pxor m7, m7 |
.loop: |
mova m2, [r0+0] |
mova m3, [r0+8] |
mova m1, m2 |
punpckhbw m1, m0 |
punpcklbw m2, m0 |
mova m4, m3 |
punpckhbw m3, m0 |
punpcklbw m4, m0 |
pmaddwd m1, m1 |
pmaddwd m2, m2 |
pmaddwd m3, m3 |
pmaddwd m4, m4 |
paddd m2, m1 |
paddd m4, m3 |
paddd m7, m2 |
add r0, r1 |
paddd m7, m4 |
dec r2 |
jne .loop |
mova m1, m7 |
psrlq m7, 32 |
paddd m1, m7 |
movd eax, m1 |
RET |
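Scalar models of the two routines above (names illustrative): pix_sum16 is the plain sum and pix_norm1 the sum of squares over a 16x16 block; mpegvideo-style encoders typically combine them into a block-variance estimate, roughly norm1 - sum*sum/256:

#include <stdint.h>

static int pix_sum16_ref(const uint8_t *pix, int line_size)
{
    int sum = 0;
    for (int i = 0; i < 16; i++, pix += line_size)
        for (int j = 0; j < 16; j++)
            sum += pix[j];
    return sum;
}

static int pix_norm1_ref(const uint8_t *pix, int line_size)
{
    int sum = 0;
    for (int i = 0; i < 16; i++, pix += line_size)
        for (int j = 0; j < 16; j++)
            sum += pix[j] * pix[j];
    return sum;
}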
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dsputilenc_mmx.c |
---|
0,0 → 1,1061 |
/* |
* MMX optimized DSP utils |
* Copyright (c) 2000, 2001 Fabrice Bellard |
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
* |
* MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/attributes.h" |
#include "libavutil/cpu.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/dct.h" |
#include "libavcodec/dsputil.h" |
#include "libavcodec/mpegvideo.h" |
#include "libavcodec/mathops.h" |
#include "dsputil_x86.h" |
void ff_get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size); |
void ff_get_pixels_sse2(int16_t *block, const uint8_t *pixels, int line_size); |
void ff_diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride); |
int ff_pix_sum16_mmx(uint8_t * pix, int line_size); |
int ff_pix_norm1_mmx(uint8_t *pix, int line_size); |
#if HAVE_INLINE_ASM |
static int sse8_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { |
int tmp; |
__asm__ volatile ( |
"movl %4,%%ecx\n" |
"shr $1,%%ecx\n" |
"pxor %%mm0,%%mm0\n" /* mm0 = 0 */ |
"pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ |
"1:\n" |
"movq (%0),%%mm1\n" /* mm1 = pix1[0][0-7] */ |
"movq (%1),%%mm2\n" /* mm2 = pix2[0][0-7] */ |
"movq (%0,%3),%%mm3\n" /* mm3 = pix1[1][0-7] */ |
"movq (%1,%3),%%mm4\n" /* mm4 = pix2[1][0-7] */ |
/* todo: mm1-mm2, mm3-mm4 */ |
/* algo: subtract mm1 from mm2 with saturation and vice versa */ |
/* OR the results to get absolute difference */ |
"movq %%mm1,%%mm5\n" |
"movq %%mm3,%%mm6\n" |
"psubusb %%mm2,%%mm1\n" |
"psubusb %%mm4,%%mm3\n" |
"psubusb %%mm5,%%mm2\n" |
"psubusb %%mm6,%%mm4\n" |
"por %%mm1,%%mm2\n" |
"por %%mm3,%%mm4\n" |
/* now convert to 16-bit vectors so we can square them */ |
"movq %%mm2,%%mm1\n" |
"movq %%mm4,%%mm3\n" |
"punpckhbw %%mm0,%%mm2\n" |
"punpckhbw %%mm0,%%mm4\n" |
"punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ |
"punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ |
"pmaddwd %%mm2,%%mm2\n" |
"pmaddwd %%mm4,%%mm4\n" |
"pmaddwd %%mm1,%%mm1\n" |
"pmaddwd %%mm3,%%mm3\n" |
"lea (%0,%3,2), %0\n" /* pix1 += 2*line_size */ |
"lea (%1,%3,2), %1\n" /* pix2 += 2*line_size */ |
"paddd %%mm2,%%mm1\n" |
"paddd %%mm4,%%mm3\n" |
"paddd %%mm1,%%mm7\n" |
"paddd %%mm3,%%mm7\n" |
"decl %%ecx\n" |
"jnz 1b\n" |
"movq %%mm7,%%mm1\n" |
"psrlq $32, %%mm7\n" /* shift hi dword to lo */ |
"paddd %%mm7,%%mm1\n" |
"movd %%mm1,%2\n" |
: "+r" (pix1), "+r" (pix2), "=r"(tmp) |
: "r" ((x86_reg)line_size) , "m" (h) |
: "%ecx"); |
return tmp; |
} |
static int sse16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { |
int tmp; |
__asm__ volatile ( |
"movl %4,%%ecx\n" |
"pxor %%mm0,%%mm0\n" /* mm0 = 0 */ |
"pxor %%mm7,%%mm7\n" /* mm7 holds the sum */ |
"1:\n" |
"movq (%0),%%mm1\n" /* mm1 = pix1[0-7] */ |
"movq (%1),%%mm2\n" /* mm2 = pix2[0-7] */ |
"movq 8(%0),%%mm3\n" /* mm3 = pix1[8-15] */ |
"movq 8(%1),%%mm4\n" /* mm4 = pix2[8-15] */ |
/* todo: mm1-mm2, mm3-mm4 */ |
/* algo: subtract mm1 from mm2 with saturation and vice versa */ |
/* OR the results to get absolute difference */ |
"movq %%mm1,%%mm5\n" |
"movq %%mm3,%%mm6\n" |
"psubusb %%mm2,%%mm1\n" |
"psubusb %%mm4,%%mm3\n" |
"psubusb %%mm5,%%mm2\n" |
"psubusb %%mm6,%%mm4\n" |
"por %%mm1,%%mm2\n" |
"por %%mm3,%%mm4\n" |
/* now convert to 16-bit vectors so we can square them */ |
"movq %%mm2,%%mm1\n" |
"movq %%mm4,%%mm3\n" |
"punpckhbw %%mm0,%%mm2\n" |
"punpckhbw %%mm0,%%mm4\n" |
"punpcklbw %%mm0,%%mm1\n" /* mm1 now spread over (mm1,mm2) */ |
"punpcklbw %%mm0,%%mm3\n" /* mm4 now spread over (mm3,mm4) */ |
"pmaddwd %%mm2,%%mm2\n" |
"pmaddwd %%mm4,%%mm4\n" |
"pmaddwd %%mm1,%%mm1\n" |
"pmaddwd %%mm3,%%mm3\n" |
"add %3,%0\n" |
"add %3,%1\n" |
"paddd %%mm2,%%mm1\n" |
"paddd %%mm4,%%mm3\n" |
"paddd %%mm1,%%mm7\n" |
"paddd %%mm3,%%mm7\n" |
"decl %%ecx\n" |
"jnz 1b\n" |
"movq %%mm7,%%mm1\n" |
"psrlq $32, %%mm7\n" /* shift hi dword to lo */ |
"paddd %%mm7,%%mm1\n" |
"movd %%mm1,%2\n" |
: "+r" (pix1), "+r" (pix2), "=r"(tmp) |
: "r" ((x86_reg)line_size) , "m" (h) |
: "%ecx"); |
return tmp; |
} |
static int hf_noise8_mmx(uint8_t * pix1, int line_size, int h) { |
int tmp; |
__asm__ volatile ( |
"movl %3,%%ecx\n" |
"pxor %%mm7,%%mm7\n" |
"pxor %%mm6,%%mm6\n" |
"movq (%0),%%mm0\n" |
"movq %%mm0, %%mm1\n" |
"psllq $8, %%mm0\n" |
"psrlq $8, %%mm1\n" |
"psrlq $8, %%mm0\n" |
"movq %%mm0, %%mm2\n" |
"movq %%mm1, %%mm3\n" |
"punpcklbw %%mm7,%%mm0\n" |
"punpcklbw %%mm7,%%mm1\n" |
"punpckhbw %%mm7,%%mm2\n" |
"punpckhbw %%mm7,%%mm3\n" |
"psubw %%mm1, %%mm0\n" |
"psubw %%mm3, %%mm2\n" |
"add %2,%0\n" |
"movq (%0),%%mm4\n" |
"movq %%mm4, %%mm1\n" |
"psllq $8, %%mm4\n" |
"psrlq $8, %%mm1\n" |
"psrlq $8, %%mm4\n" |
"movq %%mm4, %%mm5\n" |
"movq %%mm1, %%mm3\n" |
"punpcklbw %%mm7,%%mm4\n" |
"punpcklbw %%mm7,%%mm1\n" |
"punpckhbw %%mm7,%%mm5\n" |
"punpckhbw %%mm7,%%mm3\n" |
"psubw %%mm1, %%mm4\n" |
"psubw %%mm3, %%mm5\n" |
"psubw %%mm4, %%mm0\n" |
"psubw %%mm5, %%mm2\n" |
"pxor %%mm3, %%mm3\n" |
"pxor %%mm1, %%mm1\n" |
"pcmpgtw %%mm0, %%mm3\n\t" |
"pcmpgtw %%mm2, %%mm1\n\t" |
"pxor %%mm3, %%mm0\n" |
"pxor %%mm1, %%mm2\n" |
"psubw %%mm3, %%mm0\n" |
"psubw %%mm1, %%mm2\n" |
"paddw %%mm0, %%mm2\n" |
"paddw %%mm2, %%mm6\n" |
"add %2,%0\n" |
"1:\n" |
"movq (%0),%%mm0\n" |
"movq %%mm0, %%mm1\n" |
"psllq $8, %%mm0\n" |
"psrlq $8, %%mm1\n" |
"psrlq $8, %%mm0\n" |
"movq %%mm0, %%mm2\n" |
"movq %%mm1, %%mm3\n" |
"punpcklbw %%mm7,%%mm0\n" |
"punpcklbw %%mm7,%%mm1\n" |
"punpckhbw %%mm7,%%mm2\n" |
"punpckhbw %%mm7,%%mm3\n" |
"psubw %%mm1, %%mm0\n" |
"psubw %%mm3, %%mm2\n" |
"psubw %%mm0, %%mm4\n" |
"psubw %%mm2, %%mm5\n" |
"pxor %%mm3, %%mm3\n" |
"pxor %%mm1, %%mm1\n" |
"pcmpgtw %%mm4, %%mm3\n\t" |
"pcmpgtw %%mm5, %%mm1\n\t" |
"pxor %%mm3, %%mm4\n" |
"pxor %%mm1, %%mm5\n" |
"psubw %%mm3, %%mm4\n" |
"psubw %%mm1, %%mm5\n" |
"paddw %%mm4, %%mm5\n" |
"paddw %%mm5, %%mm6\n" |
"add %2,%0\n" |
"movq (%0),%%mm4\n" |
"movq %%mm4, %%mm1\n" |
"psllq $8, %%mm4\n" |
"psrlq $8, %%mm1\n" |
"psrlq $8, %%mm4\n" |
"movq %%mm4, %%mm5\n" |
"movq %%mm1, %%mm3\n" |
"punpcklbw %%mm7,%%mm4\n" |
"punpcklbw %%mm7,%%mm1\n" |
"punpckhbw %%mm7,%%mm5\n" |
"punpckhbw %%mm7,%%mm3\n" |
"psubw %%mm1, %%mm4\n" |
"psubw %%mm3, %%mm5\n" |
"psubw %%mm4, %%mm0\n" |
"psubw %%mm5, %%mm2\n" |
"pxor %%mm3, %%mm3\n" |
"pxor %%mm1, %%mm1\n" |
"pcmpgtw %%mm0, %%mm3\n\t" |
"pcmpgtw %%mm2, %%mm1\n\t" |
"pxor %%mm3, %%mm0\n" |
"pxor %%mm1, %%mm2\n" |
"psubw %%mm3, %%mm0\n" |
"psubw %%mm1, %%mm2\n" |
"paddw %%mm0, %%mm2\n" |
"paddw %%mm2, %%mm6\n" |
"add %2,%0\n" |
"subl $2, %%ecx\n" |
" jnz 1b\n" |
"movq %%mm6, %%mm0\n" |
"punpcklwd %%mm7,%%mm0\n" |
"punpckhwd %%mm7,%%mm6\n" |
"paddd %%mm0, %%mm6\n" |
"movq %%mm6,%%mm0\n" |
"psrlq $32, %%mm6\n" |
"paddd %%mm6,%%mm0\n" |
"movd %%mm0,%1\n" |
: "+r" (pix1), "=r"(tmp) |
: "r" ((x86_reg)line_size) , "g" (h-2) |
: "%ecx"); |
return tmp; |
} |
static int hf_noise16_mmx(uint8_t * pix1, int line_size, int h) { |
int tmp; |
uint8_t * pix= pix1; |
__asm__ volatile ( |
"movl %3,%%ecx\n" |
"pxor %%mm7,%%mm7\n" |
"pxor %%mm6,%%mm6\n" |
"movq (%0),%%mm0\n" |
"movq 1(%0),%%mm1\n" |
"movq %%mm0, %%mm2\n" |
"movq %%mm1, %%mm3\n" |
"punpcklbw %%mm7,%%mm0\n" |
"punpcklbw %%mm7,%%mm1\n" |
"punpckhbw %%mm7,%%mm2\n" |
"punpckhbw %%mm7,%%mm3\n" |
"psubw %%mm1, %%mm0\n" |
"psubw %%mm3, %%mm2\n" |
"add %2,%0\n" |
"movq (%0),%%mm4\n" |
"movq 1(%0),%%mm1\n" |
"movq %%mm4, %%mm5\n" |
"movq %%mm1, %%mm3\n" |
"punpcklbw %%mm7,%%mm4\n" |
"punpcklbw %%mm7,%%mm1\n" |
"punpckhbw %%mm7,%%mm5\n" |
"punpckhbw %%mm7,%%mm3\n" |
"psubw %%mm1, %%mm4\n" |
"psubw %%mm3, %%mm5\n" |
"psubw %%mm4, %%mm0\n" |
"psubw %%mm5, %%mm2\n" |
"pxor %%mm3, %%mm3\n" |
"pxor %%mm1, %%mm1\n" |
"pcmpgtw %%mm0, %%mm3\n\t" |
"pcmpgtw %%mm2, %%mm1\n\t" |
"pxor %%mm3, %%mm0\n" |
"pxor %%mm1, %%mm2\n" |
"psubw %%mm3, %%mm0\n" |
"psubw %%mm1, %%mm2\n" |
"paddw %%mm0, %%mm2\n" |
"paddw %%mm2, %%mm6\n" |
"add %2,%0\n" |
"1:\n" |
"movq (%0),%%mm0\n" |
"movq 1(%0),%%mm1\n" |
"movq %%mm0, %%mm2\n" |
"movq %%mm1, %%mm3\n" |
"punpcklbw %%mm7,%%mm0\n" |
"punpcklbw %%mm7,%%mm1\n" |
"punpckhbw %%mm7,%%mm2\n" |
"punpckhbw %%mm7,%%mm3\n" |
"psubw %%mm1, %%mm0\n" |
"psubw %%mm3, %%mm2\n" |
"psubw %%mm0, %%mm4\n" |
"psubw %%mm2, %%mm5\n" |
"pxor %%mm3, %%mm3\n" |
"pxor %%mm1, %%mm1\n" |
"pcmpgtw %%mm4, %%mm3\n\t" |
"pcmpgtw %%mm5, %%mm1\n\t" |
"pxor %%mm3, %%mm4\n" |
"pxor %%mm1, %%mm5\n" |
"psubw %%mm3, %%mm4\n" |
"psubw %%mm1, %%mm5\n" |
"paddw %%mm4, %%mm5\n" |
"paddw %%mm5, %%mm6\n" |
"add %2,%0\n" |
"movq (%0),%%mm4\n" |
"movq 1(%0),%%mm1\n" |
"movq %%mm4, %%mm5\n" |
"movq %%mm1, %%mm3\n" |
"punpcklbw %%mm7,%%mm4\n" |
"punpcklbw %%mm7,%%mm1\n" |
"punpckhbw %%mm7,%%mm5\n" |
"punpckhbw %%mm7,%%mm3\n" |
"psubw %%mm1, %%mm4\n" |
"psubw %%mm3, %%mm5\n" |
"psubw %%mm4, %%mm0\n" |
"psubw %%mm5, %%mm2\n" |
"pxor %%mm3, %%mm3\n" |
"pxor %%mm1, %%mm1\n" |
"pcmpgtw %%mm0, %%mm3\n\t" |
"pcmpgtw %%mm2, %%mm1\n\t" |
"pxor %%mm3, %%mm0\n" |
"pxor %%mm1, %%mm2\n" |
"psubw %%mm3, %%mm0\n" |
"psubw %%mm1, %%mm2\n" |
"paddw %%mm0, %%mm2\n" |
"paddw %%mm2, %%mm6\n" |
"add %2,%0\n" |
"subl $2, %%ecx\n" |
" jnz 1b\n" |
"movq %%mm6, %%mm0\n" |
"punpcklwd %%mm7,%%mm0\n" |
"punpckhwd %%mm7,%%mm6\n" |
"paddd %%mm0, %%mm6\n" |
"movq %%mm6,%%mm0\n" |
"psrlq $32, %%mm6\n" |
"paddd %%mm6,%%mm0\n" |
"movd %%mm0,%1\n" |
: "+r" (pix1), "=r"(tmp) |
: "r" ((x86_reg)line_size) , "g" (h-2) |
: "%ecx"); |
return tmp + hf_noise8_mmx(pix+8, line_size, h); |
} |
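/* The nsse* functions below implement a "noise-preserving" SSE comparison:
 * plain SSE between the two blocks plus a penalty proportional to how much
 * their high-frequency noise levels (the hf_noise* sums of second-order
 * pixel differences above) disagree, weighted by AVCodecContext.nsse_weight. */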
static int nsse16_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { |
MpegEncContext *c = p; |
int score1, score2; |
if(c) score1 = c->dsp.sse[0](c, pix1, pix2, line_size, h); |
else score1 = sse16_mmx(c, pix1, pix2, line_size, h); |
score2= hf_noise16_mmx(pix1, line_size, h) - hf_noise16_mmx(pix2, line_size, h); |
if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; |
else return score1 + FFABS(score2)*8; |
} |
static int nsse8_mmx(void *p, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { |
MpegEncContext *c = p; |
int score1= sse8_mmx(c, pix1, pix2, line_size, h); |
int score2= hf_noise8_mmx(pix1, line_size, h) - hf_noise8_mmx(pix2, line_size, h); |
if(c) return score1 + FFABS(score2)*c->avctx->nsse_weight; |
else return score1 + FFABS(score2)*8; |
} |
static int vsad_intra16_mmx(void *v, uint8_t * pix, uint8_t * dummy, int line_size, int h) { |
int tmp; |
av_assert2( (((int)pix) & 7) == 0); |
av_assert2((line_size &7) ==0); |
#define SUM(in0, in1, out0, out1) \ |
"movq (%0), %%mm2\n"\ |
"movq 8(%0), %%mm3\n"\ |
"add %2,%0\n"\ |
"movq %%mm2, " #out0 "\n"\ |
"movq %%mm3, " #out1 "\n"\ |
"psubusb " #in0 ", %%mm2\n"\ |
"psubusb " #in1 ", %%mm3\n"\ |
"psubusb " #out0 ", " #in0 "\n"\ |
"psubusb " #out1 ", " #in1 "\n"\ |
"por %%mm2, " #in0 "\n"\ |
"por %%mm3, " #in1 "\n"\ |
"movq " #in0 ", %%mm2\n"\ |
"movq " #in1 ", %%mm3\n"\ |
"punpcklbw %%mm7, " #in0 "\n"\ |
"punpcklbw %%mm7, " #in1 "\n"\ |
"punpckhbw %%mm7, %%mm2\n"\ |
"punpckhbw %%mm7, %%mm3\n"\ |
"paddw " #in1 ", " #in0 "\n"\ |
"paddw %%mm3, %%mm2\n"\ |
"paddw %%mm2, " #in0 "\n"\ |
"paddw " #in0 ", %%mm6\n" |
__asm__ volatile ( |
"movl %3,%%ecx\n" |
"pxor %%mm6,%%mm6\n" |
"pxor %%mm7,%%mm7\n" |
"movq (%0),%%mm0\n" |
"movq 8(%0),%%mm1\n" |
"add %2,%0\n" |
"jmp 2f\n" |
"1:\n" |
SUM(%%mm4, %%mm5, %%mm0, %%mm1) |
"2:\n" |
SUM(%%mm0, %%mm1, %%mm4, %%mm5) |
"subl $2, %%ecx\n" |
"jnz 1b\n" |
"movq %%mm6,%%mm0\n" |
"psrlq $32, %%mm6\n" |
"paddw %%mm6,%%mm0\n" |
"movq %%mm0,%%mm6\n" |
"psrlq $16, %%mm0\n" |
"paddw %%mm6,%%mm0\n" |
"movd %%mm0,%1\n" |
: "+r" (pix), "=r"(tmp) |
: "r" ((x86_reg)line_size) , "m" (h) |
: "%ecx"); |
return tmp & 0xFFFF; |
} |
#undef SUM |
static int vsad_intra16_mmxext(void *v, uint8_t *pix, uint8_t *dummy, |
int line_size, int h) |
{ |
int tmp; |
av_assert2( (((int)pix) & 7) == 0); |
av_assert2((line_size &7) ==0); |
#define SUM(in0, in1, out0, out1) \ |
"movq (%0), " #out0 "\n"\ |
"movq 8(%0), " #out1 "\n"\ |
"add %2,%0\n"\ |
"psadbw " #out0 ", " #in0 "\n"\ |
"psadbw " #out1 ", " #in1 "\n"\ |
"paddw " #in1 ", " #in0 "\n"\ |
"paddw " #in0 ", %%mm6\n" |
__asm__ volatile ( |
"movl %3,%%ecx\n" |
"pxor %%mm6,%%mm6\n" |
"pxor %%mm7,%%mm7\n" |
"movq (%0),%%mm0\n" |
"movq 8(%0),%%mm1\n" |
"add %2,%0\n" |
"jmp 2f\n" |
"1:\n" |
SUM(%%mm4, %%mm5, %%mm0, %%mm1) |
"2:\n" |
SUM(%%mm0, %%mm1, %%mm4, %%mm5) |
"subl $2, %%ecx\n" |
"jnz 1b\n" |
"movd %%mm6,%1\n" |
: "+r" (pix), "=r"(tmp) |
: "r" ((x86_reg)line_size) , "m" (h) |
: "%ecx"); |
return tmp; |
} |
#undef SUM |
static int vsad16_mmx(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h) { |
int tmp; |
av_assert2( (((int)pix1) & 7) == 0); |
av_assert2( (((int)pix2) & 7) == 0); |
av_assert2((line_size &7) ==0); |
#define SUM(in0, in1, out0, out1) \ |
"movq (%0),%%mm2\n"\ |
"movq (%1)," #out0 "\n"\ |
"movq 8(%0),%%mm3\n"\ |
"movq 8(%1)," #out1 "\n"\ |
"add %3,%0\n"\ |
"add %3,%1\n"\ |
"psubb " #out0 ", %%mm2\n"\ |
"psubb " #out1 ", %%mm3\n"\ |
"pxor %%mm7, %%mm2\n"\ |
"pxor %%mm7, %%mm3\n"\ |
"movq %%mm2, " #out0 "\n"\ |
"movq %%mm3, " #out1 "\n"\ |
"psubusb " #in0 ", %%mm2\n"\ |
"psubusb " #in1 ", %%mm3\n"\ |
"psubusb " #out0 ", " #in0 "\n"\ |
"psubusb " #out1 ", " #in1 "\n"\ |
"por %%mm2, " #in0 "\n"\ |
"por %%mm3, " #in1 "\n"\ |
"movq " #in0 ", %%mm2\n"\ |
"movq " #in1 ", %%mm3\n"\ |
"punpcklbw %%mm7, " #in0 "\n"\ |
"punpcklbw %%mm7, " #in1 "\n"\ |
"punpckhbw %%mm7, %%mm2\n"\ |
"punpckhbw %%mm7, %%mm3\n"\ |
"paddw " #in1 ", " #in0 "\n"\ |
"paddw %%mm3, %%mm2\n"\ |
"paddw %%mm2, " #in0 "\n"\ |
"paddw " #in0 ", %%mm6\n" |
__asm__ volatile ( |
"movl %4,%%ecx\n" |
"pxor %%mm6,%%mm6\n" |
"pcmpeqw %%mm7,%%mm7\n" |
"psllw $15, %%mm7\n" |
"packsswb %%mm7, %%mm7\n" |
"movq (%0),%%mm0\n" |
"movq (%1),%%mm2\n" |
"movq 8(%0),%%mm1\n" |
"movq 8(%1),%%mm3\n" |
"add %3,%0\n" |
"add %3,%1\n" |
"psubb %%mm2, %%mm0\n" |
"psubb %%mm3, %%mm1\n" |
"pxor %%mm7, %%mm0\n" |
"pxor %%mm7, %%mm1\n" |
"jmp 2f\n" |
"1:\n" |
SUM(%%mm4, %%mm5, %%mm0, %%mm1) |
"2:\n" |
SUM(%%mm0, %%mm1, %%mm4, %%mm5) |
"subl $2, %%ecx\n" |
"jnz 1b\n" |
"movq %%mm6,%%mm0\n" |
"psrlq $32, %%mm6\n" |
"paddw %%mm6,%%mm0\n" |
"movq %%mm0,%%mm6\n" |
"psrlq $16, %%mm0\n" |
"paddw %%mm6,%%mm0\n" |
"movd %%mm0,%2\n" |
: "+r" (pix1), "+r" (pix2), "=r"(tmp) |
: "r" ((x86_reg)line_size) , "m" (h) |
: "%ecx"); |
return tmp & 0x7FFF; |
} |
#undef SUM |
static int vsad16_mmxext(void *v, uint8_t *pix1, uint8_t *pix2, |
int line_size, int h) |
{ |
int tmp; |
av_assert2( (((int)pix1) & 7) == 0); |
av_assert2( (((int)pix2) & 7) == 0); |
av_assert2((line_size &7) ==0); |
#define SUM(in0, in1, out0, out1) \ |
"movq (%0)," #out0 "\n"\ |
"movq (%1),%%mm2\n"\ |
"movq 8(%0)," #out1 "\n"\ |
"movq 8(%1),%%mm3\n"\ |
"add %3,%0\n"\ |
"add %3,%1\n"\ |
"psubb %%mm2, " #out0 "\n"\ |
"psubb %%mm3, " #out1 "\n"\ |
"pxor %%mm7, " #out0 "\n"\ |
"pxor %%mm7, " #out1 "\n"\ |
"psadbw " #out0 ", " #in0 "\n"\ |
"psadbw " #out1 ", " #in1 "\n"\ |
"paddw " #in1 ", " #in0 "\n"\ |
"paddw " #in0 ", %%mm6\n" |
__asm__ volatile ( |
"movl %4,%%ecx\n" |
"pxor %%mm6,%%mm6\n" |
"pcmpeqw %%mm7,%%mm7\n" |
"psllw $15, %%mm7\n" |
"packsswb %%mm7, %%mm7\n" |
"movq (%0),%%mm0\n" |
"movq (%1),%%mm2\n" |
"movq 8(%0),%%mm1\n" |
"movq 8(%1),%%mm3\n" |
"add %3,%0\n" |
"add %3,%1\n" |
"psubb %%mm2, %%mm0\n" |
"psubb %%mm3, %%mm1\n" |
"pxor %%mm7, %%mm0\n" |
"pxor %%mm7, %%mm1\n" |
"jmp 2f\n" |
"1:\n" |
SUM(%%mm4, %%mm5, %%mm0, %%mm1) |
"2:\n" |
SUM(%%mm0, %%mm1, %%mm4, %%mm5) |
"subl $2, %%ecx\n" |
"jnz 1b\n" |
"movd %%mm6,%2\n" |
: "+r" (pix1), "+r" (pix2), "=r"(tmp) |
: "r" ((x86_reg)line_size) , "m" (h) |
: "%ecx"); |
return tmp; |
} |
#undef SUM |
static void diff_bytes_mmx(uint8_t *dst, const uint8_t *src1, const uint8_t *src2, int w){ |
x86_reg i=0; |
if(w>=16) |
__asm__ volatile( |
"1: \n\t" |
"movq (%2, %0), %%mm0 \n\t" |
"movq (%1, %0), %%mm1 \n\t" |
"psubb %%mm0, %%mm1 \n\t" |
"movq %%mm1, (%3, %0) \n\t" |
"movq 8(%2, %0), %%mm0 \n\t" |
"movq 8(%1, %0), %%mm1 \n\t" |
"psubb %%mm0, %%mm1 \n\t" |
"movq %%mm1, 8(%3, %0) \n\t" |
"add $16, %0 \n\t" |
"cmp %4, %0 \n\t" |
" jb 1b \n\t" |
: "+r" (i) |
: "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w-15) |
); |
for(; i<w; i++) |
dst[i+0] = src1[i+0]-src2[i+0]; |
} |
static void sub_hfyu_median_prediction_mmxext(uint8_t *dst, const uint8_t *src1, |
const uint8_t *src2, int w, |
int *left, int *left_top) |
{ |
x86_reg i=0; |
uint8_t l, lt; |
__asm__ volatile( |
"movq (%1, %0), %%mm0 \n\t" // LT |
"psllq $8, %%mm0 \n\t" |
"1: \n\t" |
"movq (%1, %0), %%mm1 \n\t" // T |
"movq -1(%2, %0), %%mm2 \n\t" // L |
"movq (%2, %0), %%mm3 \n\t" // X |
"movq %%mm2, %%mm4 \n\t" // L |
"psubb %%mm0, %%mm2 \n\t" |
"paddb %%mm1, %%mm2 \n\t" // L + T - LT |
"movq %%mm4, %%mm5 \n\t" // L |
"pmaxub %%mm1, %%mm4 \n\t" // max(T, L) |
"pminub %%mm5, %%mm1 \n\t" // min(T, L) |
"pminub %%mm2, %%mm4 \n\t" |
"pmaxub %%mm1, %%mm4 \n\t" |
"psubb %%mm4, %%mm3 \n\t" // dst - pred |
"movq %%mm3, (%3, %0) \n\t" |
"add $8, %0 \n\t" |
"movq -1(%1, %0), %%mm0 \n\t" // LT |
"cmp %4, %0 \n\t" |
" jb 1b \n\t" |
: "+r" (i) |
: "r"(src1), "r"(src2), "r"(dst), "r"((x86_reg)w) |
); |
l= *left; |
lt= *left_top; |
dst[0]= src2[0] - mid_pred(l, src1[0], (l + src1[0] - lt)&0xFF); |
*left_top= src1[w-1]; |
*left = src2[w-1]; |
} |
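The MMABS_* macros that follow implement |x| on packed 16-bit words; before SSSE3's pabsw this is done with the classic sign-mask trick. A scalar sketch of that idiom (name illustrative):

#include <stdint.h>

/* Branchless |x| as in MMABS_MMX: pcmpgtw builds an all-ones mask for
 * negative lanes, then x = (x ^ mask) - mask negates them. Note that
 * -32768 stays -32768, exactly as in the packed instruction sequence. */
static inline int16_t mmabs_ref(int16_t x)
{
    int16_t mask = x >> 15;   /* arithmetic shift: -1 if negative, else 0 */
    return (x ^ mask) - mask;
}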
#define MMABS_MMX(a,z)\ |
"pxor " #z ", " #z " \n\t"\ |
"pcmpgtw " #a ", " #z " \n\t"\ |
"pxor " #z ", " #a " \n\t"\ |
"psubw " #z ", " #a " \n\t" |
#define MMABS_MMXEXT(a, z) \ |
"pxor " #z ", " #z " \n\t"\ |
"psubw " #a ", " #z " \n\t"\ |
"pmaxsw " #z ", " #a " \n\t" |
#define MMABS_SSSE3(a,z)\ |
"pabsw " #a ", " #a " \n\t" |
#define MMABS_SUM(a,z, sum)\ |
MMABS(a,z)\ |
"paddusw " #a ", " #sum " \n\t" |
/* FIXME: HSUM_* saturates at 64k, while an 8x8 hadamard or dct block can reach |
* about 100k on extreme inputs. But that is very unlikely to occur in natural video, |
* and even then some alternative mv/mode with a lower cost is almost certain to exist. */ |
#define HSUM_MMX(a, t, dst)\ |
"movq "#a", "#t" \n\t"\ |
"psrlq $32, "#a" \n\t"\ |
"paddusw "#t", "#a" \n\t"\ |
"movq "#a", "#t" \n\t"\ |
"psrlq $16, "#a" \n\t"\ |
"paddusw "#t", "#a" \n\t"\ |
"movd "#a", "#dst" \n\t"\ |
#define HSUM_MMXEXT(a, t, dst) \ |
"pshufw $0x0E, "#a", "#t" \n\t"\ |
"paddusw "#t", "#a" \n\t"\ |
"pshufw $0x01, "#a", "#t" \n\t"\ |
"paddusw "#t", "#a" \n\t"\ |
"movd "#a", "#dst" \n\t"\ |
#define HSUM_SSE2(a, t, dst)\ |
"movhlps "#a", "#t" \n\t"\ |
"paddusw "#t", "#a" \n\t"\ |
"pshuflw $0x0E, "#a", "#t" \n\t"\ |
"paddusw "#t", "#a" \n\t"\ |
"pshuflw $0x01, "#a", "#t" \n\t"\ |
"paddusw "#t", "#a" \n\t"\ |
"movd "#a", "#dst" \n\t"\ |
#define DCT_SAD4(m,mm,o)\ |
"mov"#m" "#o"+ 0(%1), "#mm"2 \n\t"\ |
"mov"#m" "#o"+16(%1), "#mm"3 \n\t"\ |
"mov"#m" "#o"+32(%1), "#mm"4 \n\t"\ |
"mov"#m" "#o"+48(%1), "#mm"5 \n\t"\ |
MMABS_SUM(mm##2, mm##6, mm##0)\ |
MMABS_SUM(mm##3, mm##7, mm##1)\ |
MMABS_SUM(mm##4, mm##6, mm##0)\ |
MMABS_SUM(mm##5, mm##7, mm##1)\ |
#define DCT_SAD_MMX\ |
"pxor %%mm0, %%mm0 \n\t"\ |
"pxor %%mm1, %%mm1 \n\t"\ |
DCT_SAD4(q, %%mm, 0)\ |
DCT_SAD4(q, %%mm, 8)\ |
DCT_SAD4(q, %%mm, 64)\ |
DCT_SAD4(q, %%mm, 72)\ |
"paddusw %%mm1, %%mm0 \n\t"\ |
HSUM(%%mm0, %%mm1, %0) |
#define DCT_SAD_SSE2\ |
"pxor %%xmm0, %%xmm0 \n\t"\ |
"pxor %%xmm1, %%xmm1 \n\t"\ |
DCT_SAD4(dqa, %%xmm, 0)\ |
DCT_SAD4(dqa, %%xmm, 64)\ |
"paddusw %%xmm1, %%xmm0 \n\t"\ |
HSUM(%%xmm0, %%xmm1, %0) |
#define DCT_SAD_FUNC(cpu) \ |
static int sum_abs_dctelem_##cpu(int16_t *block){\ |
int sum;\ |
__asm__ volatile(\ |
DCT_SAD\ |
:"=r"(sum)\ |
:"r"(block)\ |
);\ |
return sum&0xFFFF;\ |
} |
#define DCT_SAD DCT_SAD_MMX |
#define HSUM(a,t,dst) HSUM_MMX(a,t,dst) |
#define MMABS(a,z) MMABS_MMX(a,z) |
DCT_SAD_FUNC(mmx) |
#undef MMABS |
#undef HSUM |
#define HSUM(a,t,dst) HSUM_MMXEXT(a,t,dst) |
#define MMABS(a,z) MMABS_MMXEXT(a,z) |
DCT_SAD_FUNC(mmxext) |
#undef HSUM |
#undef DCT_SAD |
#define DCT_SAD DCT_SAD_SSE2 |
#define HSUM(a,t,dst) HSUM_SSE2(a,t,dst) |
DCT_SAD_FUNC(sse2) |
#undef MMABS |
#if HAVE_SSSE3_INLINE |
#define MMABS(a,z) MMABS_SSSE3(a,z) |
DCT_SAD_FUNC(ssse3) |
#undef MMABS |
#endif |
#undef HSUM |
#undef DCT_SAD |
static int ssd_int8_vs_int16_mmx(const int8_t *pix1, const int16_t *pix2, int size){ |
int sum; |
x86_reg i=size; |
__asm__ volatile( |
"pxor %%mm4, %%mm4 \n" |
"1: \n" |
"sub $8, %0 \n" |
"movq (%2,%0), %%mm2 \n" |
"movq (%3,%0,2), %%mm0 \n" |
"movq 8(%3,%0,2), %%mm1 \n" |
"punpckhbw %%mm2, %%mm3 \n" |
"punpcklbw %%mm2, %%mm2 \n" |
"psraw $8, %%mm3 \n" |
"psraw $8, %%mm2 \n" |
"psubw %%mm3, %%mm1 \n" |
"psubw %%mm2, %%mm0 \n" |
"pmaddwd %%mm1, %%mm1 \n" |
"pmaddwd %%mm0, %%mm0 \n" |
"paddd %%mm1, %%mm4 \n" |
"paddd %%mm0, %%mm4 \n" |
"jg 1b \n" |
"movq %%mm4, %%mm3 \n" |
"psrlq $32, %%mm3 \n" |
"paddd %%mm3, %%mm4 \n" |
"movd %%mm4, %1 \n" |
:"+r"(i), "=r"(sum) |
:"r"(pix1), "r"(pix2) |
); |
return sum; |
} |
#define PHADDD(a, t)\ |
"movq "#a", "#t" \n\t"\ |
"psrlq $32, "#a" \n\t"\ |
"paddd "#t", "#a" \n\t" |
/* |
pmulhw: dst[0-15]=(src[0-15]*dst[0-15])[16-31] |
pmulhrw: dst[0-15]=(src[0-15]*dst[0-15] + 0x8000)[16-31] |
pmulhrsw: dst[0-15]=(src[0-15]*dst[0-15] + 0x4000)[15-30] |
*/ |
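Scalar models of the three multiplies described above, as the comment reads them (sketches, not the exact instruction definitions; note that the SSSE3 pmulhrsw corner case -32768 * -32768 also comes out as -32768 here):

#include <stdint.h>

static int16_t pmulhw_ref(int16_t a, int16_t b)   /* high 16 bits of product */
{
    return (int16_t)(((int32_t)a * b) >> 16);
}

static int16_t pmulhrw_ref(int16_t a, int16_t b)  /* 3DNow!: rounded high word */
{
    return (int16_t)(((int32_t)a * b + 0x8000) >> 16);
}

static int16_t pmulhrsw_ref(int16_t a, int16_t b) /* SSSE3: bits 15..30, rounded */
{
    return (int16_t)(((int32_t)a * b + 0x4000) >> 15);
}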
#define PMULHRW(x, y, s, o)\ |
"pmulhw " #s ", "#x " \n\t"\ |
"pmulhw " #s ", "#y " \n\t"\ |
"paddw " #o ", "#x " \n\t"\ |
"paddw " #o ", "#y " \n\t"\ |
"psraw $1, "#x " \n\t"\ |
"psraw $1, "#y " \n\t" |
#define DEF(x) x ## _mmx |
#define SET_RND MOVQ_WONE |
#define SCALE_OFFSET 1 |
#include "dsputil_qns_template.c" |
#undef DEF |
#undef SET_RND |
#undef SCALE_OFFSET |
#undef PMULHRW |
#define DEF(x) x ## _3dnow |
#define SET_RND(x) |
#define SCALE_OFFSET 0 |
#define PMULHRW(x, y, s, o)\ |
"pmulhrw " #s ", "#x " \n\t"\ |
"pmulhrw " #s ", "#y " \n\t" |
#include "dsputil_qns_template.c" |
#undef DEF |
#undef SET_RND |
#undef SCALE_OFFSET |
#undef PMULHRW |
#if HAVE_SSSE3_INLINE |
#undef PHADDD |
#define DEF(x) x ## _ssse3 |
#define SET_RND(x) |
#define SCALE_OFFSET -1 |
#define PHADDD(a, t)\ |
"pshufw $0x0E, "#a", "#t" \n\t"\ |
"paddd "#t", "#a" \n\t" /* faster than phaddd on core2 */ |
#define PMULHRW(x, y, s, o)\ |
"pmulhrsw " #s ", "#x " \n\t"\ |
"pmulhrsw " #s ", "#y " \n\t" |
#include "dsputil_qns_template.c" |
#undef DEF |
#undef SET_RND |
#undef SCALE_OFFSET |
#undef PMULHRW |
#undef PHADDD |
#endif /* HAVE_SSSE3_INLINE */ |
#endif /* HAVE_INLINE_ASM */ |
int ff_sse16_sse2(void *v, uint8_t * pix1, uint8_t * pix2, int line_size, int h); |
#define hadamard_func(cpu) \ |
int ff_hadamard8_diff_##cpu (void *s, uint8_t *src1, uint8_t *src2, \ |
int stride, int h); \ |
int ff_hadamard8_diff16_##cpu(void *s, uint8_t *src1, uint8_t *src2, \ |
int stride, int h); |
hadamard_func(mmx) |
hadamard_func(mmxext) |
hadamard_func(sse2) |
hadamard_func(ssse3) |
av_cold void ff_dsputilenc_init_mmx(DSPContext *c, AVCodecContext *avctx) |
{ |
int cpu_flags = av_get_cpu_flags(); |
const int dct_algo = avctx->dct_algo; |
#if HAVE_YASM |
int bit_depth = avctx->bits_per_raw_sample; |
if (EXTERNAL_MMX(cpu_flags)) { |
if (bit_depth <= 8) |
c->get_pixels = ff_get_pixels_mmx; |
c->diff_pixels = ff_diff_pixels_mmx; |
c->pix_sum = ff_pix_sum16_mmx; |
c->pix_norm1 = ff_pix_norm1_mmx; |
} |
if (EXTERNAL_SSE2(cpu_flags)) |
if (bit_depth <= 8) |
c->get_pixels = ff_get_pixels_sse2; |
#endif /* HAVE_YASM */ |
#if HAVE_INLINE_ASM |
if (INLINE_MMX(cpu_flags)) { |
if (avctx->bits_per_raw_sample <= 8 && |
(dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX)) |
c->fdct = ff_fdct_mmx; |
c->diff_bytes= diff_bytes_mmx; |
c->sum_abs_dctelem= sum_abs_dctelem_mmx; |
c->sse[0] = sse16_mmx; |
c->sse[1] = sse8_mmx; |
c->vsad[4]= vsad_intra16_mmx; |
c->nsse[0] = nsse16_mmx; |
c->nsse[1] = nsse8_mmx; |
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
c->vsad[0] = vsad16_mmx; |
} |
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
c->try_8x8basis= try_8x8basis_mmx; |
} |
c->add_8x8basis= add_8x8basis_mmx; |
c->ssd_int8_vs_int16 = ssd_int8_vs_int16_mmx; |
} |
if (INLINE_MMXEXT(cpu_flags)) { |
if (avctx->bits_per_raw_sample <= 8 && |
(dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX)) |
c->fdct = ff_fdct_mmxext; |
c->sum_abs_dctelem = sum_abs_dctelem_mmxext; |
c->vsad[4] = vsad_intra16_mmxext; |
if (!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
c->vsad[0] = vsad16_mmxext; |
} |
c->sub_hfyu_median_prediction = sub_hfyu_median_prediction_mmxext; |
} |
if (INLINE_SSE2(cpu_flags)) { |
if (avctx->bits_per_raw_sample <= 8 && |
(dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX)) |
c->fdct = ff_fdct_sse2; |
c->sum_abs_dctelem= sum_abs_dctelem_sse2; |
} |
#if HAVE_SSSE3_INLINE |
if (INLINE_SSSE3(cpu_flags)) { |
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { |
c->try_8x8basis = try_8x8basis_ssse3; |
} |
c->add_8x8basis = add_8x8basis_ssse3; |
c->sum_abs_dctelem = sum_abs_dctelem_ssse3; |
} |
#endif |
if (INLINE_AMD3DNOW(cpu_flags)) { |
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { |
c->try_8x8basis = try_8x8basis_3dnow; |
} |
c->add_8x8basis = add_8x8basis_3dnow; |
} |
#endif /* HAVE_INLINE_ASM */ |
if (EXTERNAL_MMX(cpu_flags)) { |
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmx; |
c->hadamard8_diff[1] = ff_hadamard8_diff_mmx; |
} |
if (EXTERNAL_MMXEXT(cpu_flags)) { |
c->hadamard8_diff[0] = ff_hadamard8_diff16_mmxext; |
c->hadamard8_diff[1] = ff_hadamard8_diff_mmxext; |
} |
if (EXTERNAL_SSE2(cpu_flags)) { |
c->sse[0] = ff_sse16_sse2; |
#if HAVE_ALIGNED_STACK |
c->hadamard8_diff[0] = ff_hadamard8_diff16_sse2; |
c->hadamard8_diff[1] = ff_hadamard8_diff_sse2; |
#endif |
} |
if (EXTERNAL_SSSE3(cpu_flags) && HAVE_ALIGNED_STACK) { |
c->hadamard8_diff[0] = ff_hadamard8_diff16_ssse3; |
c->hadamard8_diff[1] = ff_hadamard8_diff_ssse3; |
} |
ff_dsputil_init_pix_mmx(c, avctx); |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/dwt_yasm.asm |
---|
0,0 → 1,306 |
;****************************************************************************** |
;* MMX optimized discrete wavelet transform |
;* Copyright (c) 2010 David Conrad |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
pw_1: times 8 dw 1 |
pw_2: times 8 dw 2 |
pw_8: times 8 dw 8 |
pw_16: times 8 dw 16 |
pw_1991: times 4 dw 9,-1 |
section .text |
; %1 -= (%2 + %3 + 2) >> 2 (%4 is pw_2) |
%macro COMPOSE_53iL0 4 |
paddw %2, %3 |
paddw %2, %4 |
psraw %2, 2 |
psubw %1, %2 |
%endm |
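In scalar terms the macro above is the element-wise inverse 5/3 lifting update; a sketch, assuming IDWTELEM is a 16-bit coefficient as in Dirac's DWT code:

typedef short IDWTELEM;   /* assumption: 16-bit wavelet coefficient */

/* COMPOSE_53iL0 per element: b1[i] -= (b0[i] + b2[i] + 2) >> 2,
 * matching the operand order used by vertical_compose53iL0 below. */
static void compose53iL0_ref(const IDWTELEM *b0, IDWTELEM *b1,
                             const IDWTELEM *b2, int width)
{
    for (int i = 0; i < width; i++)
        b1[i] -= (b0[i] + b2[i] + 2) >> 2;
}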
; m1 = %1 + ((-m0 + 9*m1 + 9*%2 - %3 + 8) >> 4) |
; if %4 is supplied, %1 is loaded unaligned from there |
; m2: clobbered, m3: pw_8, m4: pw_1991 |
%macro COMPOSE_DD97iH0 3-4 |
paddw m0, %3 |
paddw m1, %2 |
psubw m0, m3 |
mova m2, m1 |
punpcklwd m1, m0 |
punpckhwd m2, m0 |
pmaddwd m1, m4 |
pmaddwd m2, m4 |
%if %0 > 3 |
movu %1, %4 |
%endif |
psrad m1, 4 |
psrad m2, 4 |
packssdw m1, m2 |
paddw m1, %1 |
%endm |
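Likewise, with the operands that vertical_compose_dd97iH0 passes in below, the macro reduces to this per-element update (a sketch; pw_1991 packs the 9/-1 taps so a single pmaddwd computes 9*m1 - m0 per lane):

typedef short IDWTELEM;   /* assumption: 16-bit wavelet coefficient */

/* COMPOSE_DD97iH0 as instantiated by vertical_compose_dd97iH0:
 * b2[i] += (9*(b1[i] + b3[i]) - (b0[i] + b4[i]) + 8) >> 4 */
static void compose_dd97iH0_ref(const IDWTELEM *b0, const IDWTELEM *b1,
                                IDWTELEM *b2, const IDWTELEM *b3,
                                const IDWTELEM *b4, int width)
{
    for (int i = 0; i < width; i++)
        b2[i] += (9 * (b1[i] + b3[i]) - (b0[i] + b4[i]) + 8) >> 4;
}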
%macro COMPOSE_VERTICAL 1 |
; void vertical_compose53iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, |
; int width) |
cglobal vertical_compose53iL0_%1, 4,4,1, b0, b1, b2, width |
mova m2, [pw_2] |
%if ARCH_X86_64 |
mov widthd, widthd |
%endif |
.loop: |
sub widthq, mmsize/2 |
mova m1, [b0q+2*widthq] |
mova m0, [b1q+2*widthq] |
COMPOSE_53iL0 m0, m1, [b2q+2*widthq], m2 |
mova [b1q+2*widthq], m0 |
jg .loop |
REP_RET |
; void vertical_compose_dirac53iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, |
; int width) |
cglobal vertical_compose_dirac53iH0_%1, 4,4,1, b0, b1, b2, width |
mova m1, [pw_1] |
%if ARCH_X86_64 |
mov widthd, widthd |
%endif |
.loop: |
sub widthq, mmsize/2 |
mova m0, [b0q+2*widthq] |
paddw m0, [b2q+2*widthq] |
paddw m0, m1 |
psraw m0, 1 |
paddw m0, [b1q+2*widthq] |
mova [b1q+2*widthq], m0 |
jg .loop |
REP_RET |
; void vertical_compose_dd97iH0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, |
; IDWTELEM *b3, IDWTELEM *b4, int width) |
cglobal vertical_compose_dd97iH0_%1, 6,6,5, b0, b1, b2, b3, b4, width |
mova m3, [pw_8] |
mova m4, [pw_1991] |
%if ARCH_X86_64 |
mov widthd, widthd |
%endif |
.loop: |
sub widthq, mmsize/2 |
mova m0, [b0q+2*widthq] |
mova m1, [b1q+2*widthq] |
COMPOSE_DD97iH0 [b2q+2*widthq], [b3q+2*widthq], [b4q+2*widthq] |
mova [b2q+2*widthq], m1 |
jg .loop |
REP_RET |
; void vertical_compose_dd137iL0(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, |
; IDWTELEM *b3, IDWTELEM *b4, int width) |
cglobal vertical_compose_dd137iL0_%1, 6,6,6, b0, b1, b2, b3, b4, width |
mova m3, [pw_16] |
mova m4, [pw_1991] |
%if ARCH_X86_64 |
mov widthd, widthd |
%endif |
.loop: |
sub widthq, mmsize/2 |
mova m0, [b0q+2*widthq] |
mova m1, [b1q+2*widthq] |
mova m5, [b2q+2*widthq] |
paddw m0, [b4q+2*widthq] |
paddw m1, [b3q+2*widthq] |
psubw m0, m3 |
mova m2, m1 |
punpcklwd m1, m0 |
punpckhwd m2, m0 |
pmaddwd m1, m4 |
pmaddwd m2, m4 |
psrad m1, 5 |
psrad m2, 5 |
packssdw m1, m2 |
psubw m5, m1 |
mova [b2q+2*widthq], m5 |
jg .loop |
REP_RET |
; void vertical_compose_haar(IDWTELEM *b0, IDWTELEM *b1, int width) |
cglobal vertical_compose_haar_%1, 3,4,3, b0, b1, width |
mova m3, [pw_1] |
%if ARCH_X86_64 |
mov widthd, widthd |
%endif |
.loop: |
sub widthq, mmsize/2 |
mova m1, [b1q+2*widthq] |
mova m0, [b0q+2*widthq] |
mova m2, m1 |
paddw m1, m3 |
psraw m1, 1 |
psubw m0, m1 |
mova [b0q+2*widthq], m0 |
paddw m2, m0 |
mova [b1q+2*widthq], m2 |
jg .loop |
REP_RET |
%endmacro |
; extend the left and right edges of the tmp array by %1 and %2 respectively |
%macro EDGE_EXTENSION 3 |
mov %3, [tmpq] |
%assign %%i 1 |
%rep %1 |
mov [tmpq-2*%%i], %3 |
%assign %%i %%i+1 |
%endrep |
mov %3, [tmpq+2*w2q-2] |
%assign %%i 0 |
%rep %2 |
mov [tmpq+2*w2q+2*%%i], %3 |
%assign %%i %%i+1 |
%endrep |
%endmacro |
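EDGE_EXTENSION simply replicates the first and last coefficients so the vector loops may read a little past both ends of tmp; a scalar sketch (the caller is assumed to have reserved the extra slots):

typedef short IDWTELEM;

/* Replicate tmp[0] into left_ext slots before the array and tmp[w2-1]
 * into right_ext slots after it, as the %rep blocks above do. */
static void edge_extend_ref(IDWTELEM *tmp, int w2, int left_ext, int right_ext)
{
    for (int i = 1; i <= left_ext; i++)
        tmp[-i] = tmp[0];
    for (int i = 0; i < right_ext; i++)
        tmp[w2 + i] = tmp[w2 - 1];
}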
%macro HAAR_HORIZONTAL 2 |
; void horizontal_compose_haari(IDWTELEM *b, IDWTELEM *tmp, int width) |
cglobal horizontal_compose_haar%2i_%1, 3,6,4, b, tmp, w, x, w2, b_w2 |
mov w2d, wd |
xor xq, xq |
shr w2d, 1 |
lea b_w2q, [bq+wq] |
mova m3, [pw_1] |
.lowpass_loop: |
movu m1, [b_w2q + 2*xq] |
mova m0, [bq + 2*xq] |
paddw m1, m3 |
psraw m1, 1 |
psubw m0, m1 |
mova [tmpq + 2*xq], m0 |
add xq, mmsize/2 |
cmp xq, w2q |
jl .lowpass_loop |
xor xq, xq |
and w2q, ~(mmsize/2 - 1) |
cmp w2q, mmsize/2 |
jl .end |
.highpass_loop: |
movu m1, [b_w2q + 2*xq] |
mova m0, [tmpq + 2*xq] |
paddw m1, m0 |
; shift and interleave |
%if %2 == 1 |
paddw m0, m3 |
paddw m1, m3 |
psraw m0, 1 |
psraw m1, 1 |
%endif |
mova m2, m0 |
punpcklwd m0, m1 |
punpckhwd m2, m1 |
mova [bq+4*xq], m0 |
mova [bq+4*xq+mmsize], m2 |
add xq, mmsize/2 |
cmp xq, w2q |
jl .highpass_loop |
.end: |
REP_RET |
%endmacro |
INIT_XMM |
; void horizontal_compose_dd97i(IDWTELEM *b, IDWTELEM *tmp, int width) |
cglobal horizontal_compose_dd97i_ssse3, 3,6,8, b, tmp, w, x, w2, b_w2 |
mov w2d, wd |
xor xd, xd |
shr w2d, 1 |
lea b_w2q, [bq+wq] |
movu m4, [bq+wq] |
mova m7, [pw_2] |
pslldq m4, 14 |
.lowpass_loop: |
movu m1, [b_w2q + 2*xq] |
mova m0, [bq + 2*xq] |
mova m2, m1 |
palignr m1, m4, 14 |
mova m4, m2 |
COMPOSE_53iL0 m0, m1, m2, m7 |
mova [tmpq + 2*xq], m0 |
add xd, mmsize/2 |
cmp xd, w2d |
jl .lowpass_loop |
EDGE_EXTENSION 1, 2, xw |
; leave the last up to 7 (sse) or 3 (mmx) values for C |
xor xd, xd |
and w2d, ~(mmsize/2 - 1) |
cmp w2d, mmsize/2 |
jl .end |
mova m7, [tmpq-mmsize] |
mova m0, [tmpq] |
mova m5, [pw_1] |
mova m3, [pw_8] |
mova m4, [pw_1991] |
.highpass_loop: |
mova m6, m0 |
palignr m0, m7, 14 |
mova m7, [tmpq + 2*xq + 16] |
mova m1, m7 |
mova m2, m7 |
palignr m1, m6, 2 |
palignr m2, m6, 4 |
COMPOSE_DD97iH0 m0, m6, m2, [b_w2q + 2*xq] |
mova m0, m7 |
mova m7, m6 |
; shift and interleave |
paddw m6, m5 |
paddw m1, m5 |
psraw m6, 1 |
psraw m1, 1 |
mova m2, m6 |
punpcklwd m6, m1 |
punpckhwd m2, m1 |
mova [bq+4*xq], m6 |
mova [bq+4*xq+mmsize], m2 |
add xd, mmsize/2 |
cmp xd, w2d |
jl .highpass_loop |
.end: |
REP_RET |
%if ARCH_X86_64 == 0 |
INIT_MMX |
COMPOSE_VERTICAL mmx |
HAAR_HORIZONTAL mmx, 0 |
HAAR_HORIZONTAL mmx, 1 |
%endif |
INIT_XMM |
COMPOSE_VERTICAL sse2 |
HAAR_HORIZONTAL sse2, 0 |
HAAR_HORIZONTAL sse2, 1 |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/fdct.c |
---|
0,0 → 1,594 |
/* |
* MMX optimized forward DCT |
* The gcc porting is Copyright (c) 2001 Fabrice Bellard. |
* cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
* SSE2 optimization is Copyright (c) 2004 Denes Balatoni. |
* |
* from fdctam32.c - AP922 MMX(3D-Now) forward-DCT |
* |
* Intel Application Note AP-922 - fast, precise implementation of DCT |
* http://developer.intel.com/vtune/cbts/appnotes.htm |
* |
* Also of inspiration: |
* a page about fdct at http://www.geocities.com/ssavekar/dct.htm |
* Skal's fdct at http://skal.planet-d.net/coding/dct.html |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/common.h" |
#include "libavutil/x86/asm.h" |
#include "libavcodec/dct.h" |
#if HAVE_MMX_INLINE |
////////////////////////////////////////////////////////////////////// |
// |
// constants for the forward DCT |
// ----------------------------- |
// |
// Be sure to check that your compiler is aligning all constants to QWORD |
// (8-byte) memory boundaries! Otherwise the unaligned memory access will |
// severely stall MMX execution. |
// |
////////////////////////////////////////////////////////////////////// |
#define BITS_FRW_ACC 3 // 2 or 3 for accuracy |
#define SHIFT_FRW_COL BITS_FRW_ACC |
#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3) |
#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1)) |
//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1)) |
#define X8(x) x,x,x,x,x,x,x,x |
//concatenated table, for forward DCT transformation |
DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = { |
X8(13036), // tan(1*pi/16) * (1<<16) + 0.5 |
X8(27146), // tan(2*pi/16) * (1<<16) + 0.5 |
X8(-21746) // tan(3*pi/16) * (1<<16) + 0.5, wrapped to int16_t |
}; |
DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = { |
X8(23170) // cos(pi/4) * (1<<15) + 0.5 |
}; |
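The scaled trig constants above can be regenerated with a few lines of C. Note the scales are 2^16 for the tangents and 2^15 for the cosine, and that tan(3*pi/16) * 2^16 = 43790 deliberately wraps to -21746 when stored as int16_t (a sketch; the out-of-range cast assumes the usual two's-complement behavior):

#include <math.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    for (int i = 1; i <= 3; i++)   /* prints 13036, 27146, -21746 */
        printf("%d\n", (int16_t)lrint(tan(i * M_PI / 16.0) * (1 << 16)));
    printf("%d\n", (int16_t)lrint(cos(M_PI / 4.0) * (1 << 15)));  /* 23170 */
    return 0;
}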
DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) }; |
DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW }; |
static const struct |
{ |
DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4]; |
} fdct_r_row_sse2 = |
{{ |
RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW |
}}; |
//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW}; |
DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table |
16384, 16384, 22725, 19266, |
16384, 16384, 12873, 4520, |
21407, 8867, 19266, -4520, |
-8867, -21407, -22725, -12873, |
16384, -16384, 12873, -22725, |
-16384, 16384, 4520, 19266, |
8867, -21407, 4520, -12873, |
21407, -8867, 19266, -22725, |
22725, 22725, 31521, 26722, |
22725, 22725, 17855, 6270, |
29692, 12299, 26722, -6270, |
-12299, -29692, -31521, -17855, |
22725, -22725, 17855, -31521, |
-22725, 22725, 6270, 26722, |
12299, -29692, 6270, -17855, |
29692, -12299, 26722, -31521, |
21407, 21407, 29692, 25172, |
21407, 21407, 16819, 5906, |
27969, 11585, 25172, -5906, |
-11585, -27969, -29692, -16819, |
21407, -21407, 16819, -29692, |
-21407, 21407, 5906, 25172, |
11585, -27969, 5906, -16819, |
27969, -11585, 25172, -29692, |
19266, 19266, 26722, 22654, |
19266, 19266, 15137, 5315, |
25172, 10426, 22654, -5315, |
-10426, -25172, -26722, -15137, |
19266, -19266, 15137, -26722, |
-19266, 19266, 5315, 22654, |
10426, -25172, 5315, -15137, |
25172, -10426, 22654, -26722, |
16384, 16384, 22725, 19266, |
16384, 16384, 12873, 4520, |
21407, 8867, 19266, -4520, |
-8867, -21407, -22725, -12873, |
16384, -16384, 12873, -22725, |
-16384, 16384, 4520, 19266, |
8867, -21407, 4520, -12873, |
21407, -8867, 19266, -22725, |
19266, 19266, 26722, 22654, |
19266, 19266, 15137, 5315, |
25172, 10426, 22654, -5315, |
-10426, -25172, -26722, -15137, |
19266, -19266, 15137, -26722, |
-19266, 19266, 5315, 22654, |
10426, -25172, 5315, -15137, |
25172, -10426, 22654, -26722, |
21407, 21407, 29692, 25172, |
21407, 21407, 16819, 5906, |
27969, 11585, 25172, -5906, |
-11585, -27969, -29692, -16819, |
21407, -21407, 16819, -29692, |
-21407, 21407, 5906, 25172, |
11585, -27969, 5906, -16819, |
27969, -11585, 25172, -29692, |
22725, 22725, 31521, 26722, |
22725, 22725, 17855, 6270, |
29692, 12299, 26722, -6270, |
-12299, -29692, -31521, -17855, |
22725, -22725, 17855, -31521, |
-22725, 22725, 6270, 26722, |
12299, -29692, 6270, -17855, |
29692, -12299, 26722, -31521, |
}; |
static const struct |
{ |
DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256]; |
} tab_frw_01234567_sse2 = |
{{ |
//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table |
#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \ |
C4, C4, C5, C7, C2, C6, C3, -C7, \ |
-C4, C4, C7, C3, C6, -C2, C7, -C5, \ |
C4, -C4, C5, -C1, C2, -C6, C3, -C1, |
// c1..c7 * cos(pi/4) * 2^15 |
#define C1 22725 |
#define C2 21407 |
#define C3 19266 |
#define C4 16384 |
#define C5 12873 |
#define C6 8867 |
#define C7 4520 |
TABLE_SSE2 |
#undef C1 |
#undef C2 |
#undef C3 |
#undef C4 |
#undef C5 |
#undef C6 |
#undef C7 |
#define C1 31521 |
#define C2 29692 |
#define C3 26722 |
#define C4 22725 |
#define C5 17855 |
#define C6 12299 |
#define C7 6270 |
TABLE_SSE2 |
#undef C1 |
#undef C2 |
#undef C3 |
#undef C4 |
#undef C5 |
#undef C6 |
#undef C7 |
#define C1 29692 |
#define C2 27969 |
#define C3 25172 |
#define C4 21407 |
#define C5 16819 |
#define C6 11585 |
#define C7 5906 |
TABLE_SSE2 |
#undef C1 |
#undef C2 |
#undef C3 |
#undef C4 |
#undef C5 |
#undef C6 |
#undef C7 |
#define C1 26722 |
#define C2 25172 |
#define C3 22654 |
#define C4 19266 |
#define C5 15137 |
#define C6 10426 |
#define C7 5315 |
TABLE_SSE2 |
#undef C1 |
#undef C2 |
#undef C3 |
#undef C4 |
#undef C5 |
#undef C6 |
#undef C7 |
#define C1 22725 |
#define C2 21407 |
#define C3 19266 |
#define C4 16384 |
#define C5 12873 |
#define C6 8867 |
#define C7 4520 |
TABLE_SSE2 |
#undef C1 |
#undef C2 |
#undef C3 |
#undef C4 |
#undef C5 |
#undef C6 |
#undef C7 |
#define C1 26722 |
#define C2 25172 |
#define C3 22654 |
#define C4 19266 |
#define C5 15137 |
#define C6 10426 |
#define C7 5315 |
TABLE_SSE2 |
#undef C1 |
#undef C2 |
#undef C3 |
#undef C4 |
#undef C5 |
#undef C6 |
#undef C7 |
#define C1 29692 |
#define C2 27969 |
#define C3 25172 |
#define C4 21407 |
#define C5 16819 |
#define C6 11585 |
#define C7 5906 |
TABLE_SSE2 |
#undef C1 |
#undef C2 |
#undef C3 |
#undef C4 |
#undef C5 |
#undef C6 |
#undef C7 |
#define C1 31521 |
#define C2 29692 |
#define C3 26722 |
#define C4 22725 |
#define C5 17855 |
#define C6 12299 |
#define C7 6270 |
TABLE_SSE2 |
}}; |
#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long |
#define FDCT_COL(cpu, mm, mov)\ |
static av_always_inline void fdct_col_##cpu(const int16_t *in, int16_t *out, int offset)\ |
{\ |
__asm__ volatile (\ |
#mov" 16(%0), %%"#mm"0 \n\t" \ |
#mov" 96(%0), %%"#mm"1 \n\t" \ |
#mov" %%"#mm"0, %%"#mm"2 \n\t" \ |
#mov" 32(%0), %%"#mm"3 \n\t" \ |
"paddsw %%"#mm"1, %%"#mm"0 \n\t" \ |
#mov" 80(%0), %%"#mm"4 \n\t" \ |
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \ |
#mov" (%0), %%"#mm"5 \n\t" \ |
"paddsw %%"#mm"3, %%"#mm"4 \n\t" \ |
"paddsw 112(%0), %%"#mm"5 \n\t" \ |
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \ |
#mov" %%"#mm"0, %%"#mm"6 \n\t" \ |
"psubsw %%"#mm"1, %%"#mm"2 \n\t" \ |
#mov" 16(%1), %%"#mm"1 \n\t" \ |
"psubsw %%"#mm"4, %%"#mm"0 \n\t" \ |
#mov" 48(%0), %%"#mm"7 \n\t" \ |
"pmulhw %%"#mm"0, %%"#mm"1 \n\t" \ |
"paddsw 64(%0), %%"#mm"7 \n\t" \ |
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \ |
"paddsw %%"#mm"4, %%"#mm"6 \n\t" \ |
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \ |
#mov" %%"#mm"5, %%"#mm"4 \n\t" \ |
"psubsw %%"#mm"7, %%"#mm"5 \n\t" \ |
"paddsw %%"#mm"5, %%"#mm"1 \n\t" \ |
"paddsw %%"#mm"7, %%"#mm"4 \n\t" \ |
"por (%2), %%"#mm"1 \n\t" \ |
"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \ |
"pmulhw 16(%1), %%"#mm"5 \n\t" \ |
#mov" %%"#mm"4, %%"#mm"7 \n\t" \ |
"psubsw 80(%0), %%"#mm"3 \n\t" \ |
"psubsw %%"#mm"6, %%"#mm"4 \n\t" \ |
#mov" %%"#mm"1, 32(%3) \n\t" \ |
"paddsw %%"#mm"6, %%"#mm"7 \n\t" \ |
#mov" 48(%0), %%"#mm"1 \n\t" \ |
"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \ |
"psubsw 64(%0), %%"#mm"1 \n\t" \ |
#mov" %%"#mm"2, %%"#mm"6 \n\t" \ |
#mov" %%"#mm"4, 64(%3) \n\t" \ |
"paddsw %%"#mm"3, %%"#mm"2 \n\t" \ |
"pmulhw (%4), %%"#mm"2 \n\t" \ |
"psubsw %%"#mm"3, %%"#mm"6 \n\t" \ |
"pmulhw (%4), %%"#mm"6 \n\t" \ |
"psubsw %%"#mm"0, %%"#mm"5 \n\t" \ |
"por (%2), %%"#mm"5 \n\t" \ |
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \ |
"por (%2), %%"#mm"2 \n\t" \ |
#mov" %%"#mm"1, %%"#mm"4 \n\t" \ |
#mov" (%0), %%"#mm"3 \n\t" \ |
"paddsw %%"#mm"6, %%"#mm"1 \n\t" \ |
"psubsw 112(%0), %%"#mm"3 \n\t" \ |
"psubsw %%"#mm"6, %%"#mm"4 \n\t" \ |
#mov" (%1), %%"#mm"0 \n\t" \ |
"psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \ |
#mov" 32(%1), %%"#mm"6 \n\t" \ |
"pmulhw %%"#mm"1, %%"#mm"0 \n\t" \ |
#mov" %%"#mm"7, (%3) \n\t" \ |
"pmulhw %%"#mm"4, %%"#mm"6 \n\t" \ |
#mov" %%"#mm"5, 96(%3) \n\t" \ |
#mov" %%"#mm"3, %%"#mm"7 \n\t" \ |
#mov" 32(%1), %%"#mm"5 \n\t" \ |
"psubsw %%"#mm"2, %%"#mm"7 \n\t" \ |
"paddsw %%"#mm"2, %%"#mm"3 \n\t" \ |
"pmulhw %%"#mm"7, %%"#mm"5 \n\t" \ |
"paddsw %%"#mm"3, %%"#mm"0 \n\t" \ |
"paddsw %%"#mm"4, %%"#mm"6 \n\t" \ |
"pmulhw (%1), %%"#mm"3 \n\t" \ |
"por (%2), %%"#mm"0 \n\t" \ |
"paddsw %%"#mm"7, %%"#mm"5 \n\t" \ |
"psubsw %%"#mm"6, %%"#mm"7 \n\t" \ |
#mov" %%"#mm"0, 16(%3) \n\t" \ |
"paddsw %%"#mm"4, %%"#mm"5 \n\t" \ |
#mov" %%"#mm"7, 48(%3) \n\t" \ |
"psubsw %%"#mm"1, %%"#mm"3 \n\t" \ |
#mov" %%"#mm"5, 80(%3) \n\t" \ |
#mov" %%"#mm"3, 112(%3) \n\t" \ |
: \ |
: "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \ |
"r" (out + offset), "r" (ocos_4_16)); \ |
} |
FDCT_COL(mmx, mm, movq) |
FDCT_COL(sse2, xmm, movdqa) |
static av_always_inline void fdct_row_sse2(const int16_t *in, int16_t *out) |
{ |
__asm__ volatile( |
#define FDCT_ROW_SSE2_H1(i,t) \ |
"movq " #i "(%0), %%xmm2 \n\t" \ |
"movq " #i "+8(%0), %%xmm0 \n\t" \ |
"movdqa " #t "+32(%1), %%xmm3 \n\t" \ |
"movdqa " #t "+48(%1), %%xmm7 \n\t" \ |
"movdqa " #t "(%1), %%xmm4 \n\t" \ |
"movdqa " #t "+16(%1), %%xmm5 \n\t" |
#define FDCT_ROW_SSE2_H2(i,t) \ |
"movq " #i "(%0), %%xmm2 \n\t" \ |
"movq " #i "+8(%0), %%xmm0 \n\t" \ |
"movdqa " #t "+32(%1), %%xmm3 \n\t" \ |
"movdqa " #t "+48(%1), %%xmm7 \n\t" |
#define FDCT_ROW_SSE2(i) \ |
"movq %%xmm2, %%xmm1 \n\t" \ |
"pshuflw $27, %%xmm0, %%xmm0 \n\t" \ |
"paddsw %%xmm0, %%xmm1 \n\t" \ |
"psubsw %%xmm0, %%xmm2 \n\t" \ |
"punpckldq %%xmm2, %%xmm1 \n\t" \ |
"pshufd $78, %%xmm1, %%xmm2 \n\t" \ |
"pmaddwd %%xmm2, %%xmm3 \n\t" \ |
"pmaddwd %%xmm1, %%xmm7 \n\t" \ |
"pmaddwd %%xmm5, %%xmm2 \n\t" \ |
"pmaddwd %%xmm4, %%xmm1 \n\t" \ |
"paddd %%xmm7, %%xmm3 \n\t" \ |
"paddd %%xmm2, %%xmm1 \n\t" \ |
"paddd %%xmm6, %%xmm3 \n\t" \ |
"paddd %%xmm6, %%xmm1 \n\t" \ |
"psrad %3, %%xmm3 \n\t" \ |
"psrad %3, %%xmm1 \n\t" \ |
"packssdw %%xmm3, %%xmm1 \n\t" \ |
"movdqa %%xmm1, " #i "(%4) \n\t" |
"movdqa (%2), %%xmm6 \n\t" |
FDCT_ROW_SSE2_H1(0,0) |
FDCT_ROW_SSE2(0) |
FDCT_ROW_SSE2_H2(64,0) |
FDCT_ROW_SSE2(64) |
FDCT_ROW_SSE2_H1(16,64) |
FDCT_ROW_SSE2(16) |
FDCT_ROW_SSE2_H2(112,64) |
FDCT_ROW_SSE2(112) |
FDCT_ROW_SSE2_H1(32,128) |
FDCT_ROW_SSE2(32) |
FDCT_ROW_SSE2_H2(96,128) |
FDCT_ROW_SSE2(96) |
FDCT_ROW_SSE2_H1(48,192) |
FDCT_ROW_SSE2(48) |
FDCT_ROW_SSE2_H2(80,192) |
FDCT_ROW_SSE2(80) |
: |
: "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2), |
"r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out) |
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", |
"%xmm4", "%xmm5", "%xmm6", "%xmm7") |
); |
} |
static av_always_inline void fdct_row_mmxext(const int16_t *in, int16_t *out, |
const int16_t *table) |
{ |
__asm__ volatile ( |
"pshufw $0x1B, 8(%0), %%mm5 \n\t" |
"movq (%0), %%mm0 \n\t" |
"movq %%mm0, %%mm1 \n\t" |
"paddsw %%mm5, %%mm0 \n\t" |
"psubsw %%mm5, %%mm1 \n\t" |
"movq %%mm0, %%mm2 \n\t" |
"punpckldq %%mm1, %%mm0 \n\t" |
"punpckhdq %%mm1, %%mm2 \n\t" |
"movq (%1), %%mm1 \n\t" |
"movq 8(%1), %%mm3 \n\t" |
"movq 16(%1), %%mm4 \n\t" |
"movq 24(%1), %%mm5 \n\t" |
"movq 32(%1), %%mm6 \n\t" |
"movq 40(%1), %%mm7 \n\t" |
"pmaddwd %%mm0, %%mm1 \n\t" |
"pmaddwd %%mm2, %%mm3 \n\t" |
"pmaddwd %%mm0, %%mm4 \n\t" |
"pmaddwd %%mm2, %%mm5 \n\t" |
"pmaddwd %%mm0, %%mm6 \n\t" |
"pmaddwd %%mm2, %%mm7 \n\t" |
"pmaddwd 48(%1), %%mm0 \n\t" |
"pmaddwd 56(%1), %%mm2 \n\t" |
"paddd %%mm1, %%mm3 \n\t" |
"paddd %%mm4, %%mm5 \n\t" |
"paddd %%mm6, %%mm7 \n\t" |
"paddd %%mm0, %%mm2 \n\t" |
"movq (%2), %%mm0 \n\t" |
"paddd %%mm0, %%mm3 \n\t" |
"paddd %%mm0, %%mm5 \n\t" |
"paddd %%mm0, %%mm7 \n\t" |
"paddd %%mm0, %%mm2 \n\t" |
"psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" |
"psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" |
"psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" |
"psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" |
"packssdw %%mm5, %%mm3 \n\t" |
"packssdw %%mm2, %%mm7 \n\t" |
"movq %%mm3, (%3) \n\t" |
"movq %%mm7, 8(%3) \n\t" |
: |
: "r" (in), "r" (table), "r" (fdct_r_row), "r" (out)); |
} |
static av_always_inline void fdct_row_mmx(const int16_t *in, int16_t *out, const int16_t *table) |
{ |
//FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...) |
__asm__ volatile( |
"movd 12(%0), %%mm1 \n\t" |
"punpcklwd 8(%0), %%mm1 \n\t" |
"movq %%mm1, %%mm2 \n\t" |
"psrlq $0x20, %%mm1 \n\t" |
"movq 0(%0), %%mm0 \n\t" |
"punpcklwd %%mm2, %%mm1 \n\t" |
"movq %%mm0, %%mm5 \n\t" |
"paddsw %%mm1, %%mm0 \n\t" |
"psubsw %%mm1, %%mm5 \n\t" |
"movq %%mm0, %%mm2 \n\t" |
"punpckldq %%mm5, %%mm0 \n\t" |
"punpckhdq %%mm5, %%mm2 \n\t" |
"movq 0(%1), %%mm1 \n\t" |
"movq 8(%1), %%mm3 \n\t" |
"movq 16(%1), %%mm4 \n\t" |
"movq 24(%1), %%mm5 \n\t" |
"movq 32(%1), %%mm6 \n\t" |
"movq 40(%1), %%mm7 \n\t" |
"pmaddwd %%mm0, %%mm1 \n\t" |
"pmaddwd %%mm2, %%mm3 \n\t" |
"pmaddwd %%mm0, %%mm4 \n\t" |
"pmaddwd %%mm2, %%mm5 \n\t" |
"pmaddwd %%mm0, %%mm6 \n\t" |
"pmaddwd %%mm2, %%mm7 \n\t" |
"pmaddwd 48(%1), %%mm0 \n\t" |
"pmaddwd 56(%1), %%mm2 \n\t" |
"paddd %%mm1, %%mm3 \n\t" |
"paddd %%mm4, %%mm5 \n\t" |
"paddd %%mm6, %%mm7 \n\t" |
"paddd %%mm0, %%mm2 \n\t" |
"movq (%2), %%mm0 \n\t" |
"paddd %%mm0, %%mm3 \n\t" |
"paddd %%mm0, %%mm5 \n\t" |
"paddd %%mm0, %%mm7 \n\t" |
"paddd %%mm0, %%mm2 \n\t" |
"psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t" |
"psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t" |
"psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t" |
"psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t" |
"packssdw %%mm5, %%mm3 \n\t" |
"packssdw %%mm2, %%mm7 \n\t" |
"movq %%mm3, 0(%3) \n\t" |
"movq %%mm7, 8(%3) \n\t" |
: |
: "r" (in), "r" (table), "r" (fdct_r_row), "r" (out)); |
} |
void ff_fdct_mmx(int16_t *block) |
{ |
DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; |
int16_t * block1= (int16_t*)align_tmp; |
const int16_t *table= tab_frw_01234567; |
int i; |
fdct_col_mmx(block, block1, 0); |
fdct_col_mmx(block, block1, 4); |
for(i=8;i>0;i--) { |
fdct_row_mmx(block1, block, table); |
block1 += 8; |
table += 32; |
block += 8; |
} |
} |
#endif /* HAVE_MMX_INLINE */ |
#if HAVE_MMXEXT_INLINE |
void ff_fdct_mmxext(int16_t *block) |
{ |
DECLARE_ALIGNED(8, int64_t, align_tmp)[16]; |
int16_t *block1= (int16_t*)align_tmp; |
const int16_t *table= tab_frw_01234567; |
int i; |
fdct_col_mmx(block, block1, 0); |
fdct_col_mmx(block, block1, 4); |
for(i=8;i>0;i--) { |
fdct_row_mmxext(block1, block, table); |
block1 += 8; |
table += 32; |
block += 8; |
} |
} |
#endif /* HAVE_MMXEXT_INLINE */ |
#if HAVE_SSE2_INLINE |
void ff_fdct_sse2(int16_t *block) |
{ |
DECLARE_ALIGNED(16, int64_t, align_tmp)[16]; |
int16_t * const block1= (int16_t*)align_tmp; |
fdct_col_sse2(block, block1, 0); |
fdct_row_sse2(block1, block); |
} |
#endif /* HAVE_SSE2_INLINE */ |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/fft.asm |
---|
0,0 → 1,1092 |
;****************************************************************************** |
;* FFT transform with SSE/3DNow optimizations |
;* Copyright (c) 2008 Loren Merritt |
;* Copyright (c) 2011 Vitor Sessak |
;* |
;* This algorithm (though not any of the implementation details) is |
;* based on libdjbfft by D. J. Bernstein. |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
; These functions are not individually interchangeable with the C versions. |
; While C takes arrays of FFTComplex, SSE/3DNow leave intermediate results |
; in blocks as convenient to the vector size. |
; i.e. {4x real, 4x imaginary, 4x real, ...} (or 2x respectively) |
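A small C sketch of the layout difference that comment describes (the repack helper is illustrative, not FFmpeg API; FFTComplex is FFmpeg's {float re, im} pair):

typedef struct FFTComplex { float re, im; } FFTComplex;

/* Repack n complex values (n a multiple of 4) from the interleaved C
 * layout {re,im,re,im,...} into the SSE-friendly block layout
 * {4x real, 4x imaginary, 4x real, ...} mentioned above. */
static void interleaved_to_blocks(float *dst, const FFTComplex *src, int n)
{
    for (int b = 0; b < n / 4; b++)
        for (int i = 0; i < 4; i++) {
            dst[8 * b + i]     = src[4 * b + i].re;
            dst[8 * b + 4 + i] = src[4 * b + i].im;
        }
}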
%include "libavutil/x86/x86util.asm" |
%if ARCH_X86_64 |
%define pointer resq |
%else |
%define pointer resd |
%endif |
SECTION_RODATA 32 |
struc FFTContext |
.nbits: resd 1 |
.reverse: resd 1 |
.revtab: pointer 1 |
.tmpbuf: pointer 1 |
.mdctsize: resd 1 |
.mdctbits: resd 1 |
.tcos: pointer 1 |
.tsin: pointer 1 |
.fftperm: pointer 1 |
.fftcalc: pointer 1 |
.imdctcalc:pointer 1 |
.imdcthalf:pointer 1 |
endstruc |
%define M_SQRT1_2 0.70710678118654752440 |
%define M_COS_PI_1_8 0.923879532511287 |
%define M_COS_PI_3_8 0.38268343236509 |
ps_cos16_1: dd 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8, 1.0, M_COS_PI_1_8, M_SQRT1_2, M_COS_PI_3_8 |
ps_cos16_2: dd 0, M_COS_PI_3_8, M_SQRT1_2, M_COS_PI_1_8, 0, -M_COS_PI_3_8, -M_SQRT1_2, -M_COS_PI_1_8 |
ps_root2: times 8 dd M_SQRT1_2 |
ps_root2mppm: dd -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2, -M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, -M_SQRT1_2 |
ps_p1p1m1p1: dd 0, 0, 1<<31, 0, 0, 0, 1<<31, 0 |
perm1: dd 0x00, 0x02, 0x03, 0x01, 0x03, 0x00, 0x02, 0x01 |
perm2: dd 0x00, 0x01, 0x02, 0x03, 0x01, 0x00, 0x02, 0x03 |
ps_p1p1m1p1root2: dd 1.0, 1.0, -1.0, 1.0, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2, M_SQRT1_2 |
ps_m1m1p1m1p1m1m1m1: dd 1<<31, 1<<31, 0, 1<<31, 0, 1<<31, 1<<31, 1<<31 |
ps_m1m1m1m1: times 4 dd 1<<31 |
ps_m1p1: dd 1<<31, 0 |
%assign i 16 |
%rep 13 |
cextern cos_ %+ i |
%assign i i<<1 |
%endrep |
%if ARCH_X86_64 |
%define pointer dq |
%else |
%define pointer dd |
%endif |
%macro IF0 1+ |
%endmacro |
%macro IF1 1+ |
%1 |
%endmacro |
SECTION_TEXT |
%macro T2_3DNOW 4 ; z0, z1, mem0, mem1 |
mova %1, %3 |
mova %2, %1 |
pfadd %1, %4 |
pfsub %2, %4 |
%endmacro |
%macro T4_3DNOW 6 ; z0, z1, z2, z3, tmp0, tmp1 |
mova %5, %3 |
pfsub %3, %4 |
pfadd %5, %4 ; {t6,t5} |
pxor %3, [ps_m1p1] ; {t8,t7} |
mova %6, %1 |
movd [r0+12], %3 |
punpckhdq %3, [r0+8] |
pfadd %1, %5 ; {r0,i0} |
pfsub %6, %5 ; {r2,i2} |
mova %4, %2 |
pfadd %2, %3 ; {r1,i1} |
pfsub %4, %3 ; {r3,i3} |
SWAP %3, %6 |
%endmacro |
; in: %1 = {r0,i0,r2,i2,r4,i4,r6,i6} |
; %2 = {r1,i1,r3,i3,r5,i5,r7,i7} |
; %3, %4, %5 tmp |
; out: %1 = {r0,r1,r2,r3,i0,i1,i2,i3} |
; %2 = {r4,r5,r6,r7,i4,i5,i6,i7} |
%macro T8_AVX 5 |
vsubps %5, %1, %2 ; v = %1 - %2 |
vaddps %3, %1, %2 ; w = %1 + %2 |
vmulps %2, %5, [ps_p1p1m1p1root2] ; v *= vals1 |
vpermilps %2, %2, [perm1] |
vblendps %1, %2, %3, 0x33 ; q = {w1,w2,v4,v2,w5,w6,v7,v6} |
vshufps %5, %3, %2, 0x4e ; r = {w3,w4,v1,v3,w7,w8,v8,v5} |
vsubps %4, %5, %1 ; s = r - q |
vaddps %1, %5, %1 ; u = r + q |
vpermilps %1, %1, [perm2] ; k = {u1,u2,u3,u4,u6,u5,u7,u8} |
vshufps %5, %4, %1, 0xbb |
vshufps %3, %4, %1, 0xee |
vperm2f128 %3, %3, %5, 0x13 |
vxorps %4, %4, [ps_m1m1p1m1p1m1m1m1] ; s *= {1,1,-1,-1,1,-1,-1,-1} |
vshufps %2, %1, %4, 0xdd |
vshufps %1, %1, %4, 0x88 |
vperm2f128 %4, %2, %1, 0x02 ; v = {k1,k3,s1,s3,k2,k4,s2,s4} |
vperm2f128 %1, %1, %2, 0x13 ; w = {k6,k8,s6,s8,k5,k7,s5,s7} |
vsubps %5, %1, %3 |
vblendps %1, %5, %1, 0x55 ; w -= {0,s7,0,k7,0,s8,0,k8} |
vsubps %2, %4, %1 ; %2 = v - w |
vaddps %1, %4, %1 ; %1 = v + w |
%endmacro |
; In SSE mode do one fft4 transform |
; in: %1={r0,i0,r2,i2} %2={r1,i1,r3,i3} |
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} |
; |
; In AVX mode do two fft4 transforms |
; in: %1={r0,i0,r2,i2,r4,i4,r6,i6} %2={r1,i1,r3,i3,r5,i5,r7,i7} |
; out: %1={r0,r1,r2,r3,r4,r5,r6,r7} %2={i0,i1,i2,i3,i4,i5,i6,i7} |
%macro T4_SSE 3 |
subps %3, %1, %2 ; {t3,t4,-t8,t7} |
addps %1, %1, %2 ; {t1,t2,t6,t5} |
xorps %3, %3, [ps_p1p1m1p1] |
shufps %2, %1, %3, 0xbe ; {t6,t5,t7,t8} |
shufps %1, %1, %3, 0x44 ; {t1,t2,t3,t4} |
subps %3, %1, %2 ; {r2,i2,r3,i3} |
addps %1, %1, %2 ; {r0,i0,r1,i1} |
shufps %2, %1, %3, 0xdd ; {i0,i1,i2,i3} |
shufps %1, %1, %3, 0x88 ; {r0,r1,r2,r3} |
%endmacro |
; In SSE mode do one FFT8 |
; in: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %3={r4,i4,r6,i6} %4={r5,i5,r7,i7} |
; out: %1={r0,r1,r2,r3} %2={i0,i1,i2,i3} %1={r4,r5,r6,r7} %2={i4,i5,i6,i7} |
; |
; In AVX mode do two FFT8 |
; in: %1={r0,i0,r2,i2,r8, i8, r10,i10} %2={r1,i1,r3,i3,r9, i9, r11,i11} |
; %3={r4,i4,r6,i6,r12,i12,r14,i14} %4={r5,i5,r7,i7,r13,i13,r15,i15} |
; out: %1={r0,r1,r2,r3,r8, r9, r10,r11} %2={i0,i1,i2,i3,i8, i9, i10,i11} |
; %3={r4,r5,r6,r7,r12,r13,r14,r15} %4={i4,i5,i6,i7,i12,i13,i14,i15} |
%macro T8_SSE 6 |
addps %6, %3, %4 ; {t1,t2,t3,t4} |
subps %3, %3, %4 ; {r5,i5,r7,i7} |
shufps %4, %3, %3, 0xb1 ; {i5,r5,i7,r7} |
mulps %3, %3, [ps_root2mppm] ; {-r5,i5,r7,-i7} |
mulps %4, %4, [ps_root2] |
addps %3, %3, %4 ; {t8,t7,ta,t9} |
shufps %4, %6, %3, 0x9c ; {t1,t4,t7,ta} |
shufps %6, %6, %3, 0x36 ; {t3,t2,t9,t8} |
subps %3, %6, %4 ; {t6,t5,tc,tb} |
addps %6, %6, %4 ; {t1,t2,t9,ta} |
shufps %5, %6, %3, 0x8d ; {t2,ta,t6,tc} |
shufps %6, %6, %3, 0xd8 ; {t1,t9,t5,tb} |
subps %3, %1, %6 ; {r4,r5,r6,r7} |
addps %1, %1, %6 ; {r0,r1,r2,r3} |
subps %4, %2, %5 ; {i4,i5,i6,i7} |
addps %2, %2, %5 ; {i0,i1,i2,i3} |
%endmacro |
; scheduled for cpu-bound sizes |
%macro PASS_SMALL 3 ; (to load m4-m7), wre, wim |
IF%1 mova m4, Z(4) |
IF%1 mova m5, Z(5) |
mova m0, %2 ; wre |
mova m1, %3 ; wim |
mulps m2, m4, m0 ; r2*wre |
IF%1 mova m6, Z2(6) |
mulps m3, m5, m1 ; i2*wim |
IF%1 mova m7, Z2(7) |
mulps m4, m4, m1 ; r2*wim |
mulps m5, m5, m0 ; i2*wre |
addps m2, m2, m3 ; r2*wre + i2*wim |
mulps m3, m1, m7 ; i3*wim |
subps m5, m5, m4 ; i2*wre - r2*wim |
mulps m1, m1, m6 ; r3*wim |
mulps m4, m0, m6 ; r3*wre |
mulps m0, m0, m7 ; i3*wre |
subps m4, m4, m3 ; r3*wre - i3*wim |
mova m3, Z(0) |
addps m0, m0, m1 ; i3*wre + r3*wim |
subps m1, m4, m2 ; t3 |
addps m4, m4, m2 ; t5 |
subps m3, m3, m4 ; r2 |
addps m4, m4, Z(0) ; r0 |
mova m6, Z(2) |
mova Z(4), m3 |
mova Z(0), m4 |
subps m3, m5, m0 ; t4 |
subps m4, m6, m3 ; r3 |
addps m3, m3, m6 ; r1 |
mova Z2(6), m4 |
mova Z(2), m3 |
mova m2, Z(3) |
addps m3, m5, m0 ; t6 |
subps m2, m2, m1 ; i3 |
mova m7, Z(1) |
addps m1, m1, Z(3) ; i1 |
mova Z2(7), m2 |
mova Z(3), m1 |
subps m4, m7, m3 ; i2 |
addps m3, m3, m7 ; i0 |
mova Z(5), m4 |
mova Z(1), m3 |
%endmacro |
; scheduled to avoid store->load aliasing |
%macro PASS_BIG 1 ; (!interleave) |
mova m4, Z(4) ; r2 |
mova m5, Z(5) ; i2 |
mova m0, [wq] ; wre |
mova m1, [wq+o1q] ; wim |
mulps m2, m4, m0 ; r2*wre |
mova m6, Z2(6) ; r3 |
mulps m3, m5, m1 ; i2*wim |
mova m7, Z2(7) ; i3 |
mulps m4, m4, m1 ; r2*wim |
mulps m5, m5, m0 ; i2*wre |
addps m2, m2, m3 ; r2*wre + i2*wim |
mulps m3, m1, m7 ; i3*wim |
mulps m1, m1, m6 ; r3*wim |
subps m5, m5, m4 ; i2*wre - r2*wim |
mulps m4, m0, m6 ; r3*wre |
mulps m0, m0, m7 ; i3*wre |
subps m4, m4, m3 ; r3*wre - i3*wim |
mova m3, Z(0) |
addps m0, m0, m1 ; i3*wre + r3*wim |
subps m1, m4, m2 ; t3 |
addps m4, m4, m2 ; t5 |
subps m3, m3, m4 ; r2 |
addps m4, m4, Z(0) ; r0 |
mova m6, Z(2) |
mova Z(4), m3 |
mova Z(0), m4 |
subps m3, m5, m0 ; t4 |
subps m4, m6, m3 ; r3 |
addps m3, m3, m6 ; r1 |
IF%1 mova Z2(6), m4 |
IF%1 mova Z(2), m3 |
mova m2, Z(3) |
addps m5, m5, m0 ; t6 |
subps m2, m2, m1 ; i3 |
mova m7, Z(1) |
addps m1, m1, Z(3) ; i1 |
IF%1 mova Z2(7), m2 |
IF%1 mova Z(3), m1 |
subps m6, m7, m5 ; i2 |
addps m5, m5, m7 ; i0 |
IF%1 mova Z(5), m6 |
IF%1 mova Z(1), m5 |
%if %1==0 |
INTERL m1, m3, m7, Z, 2 |
INTERL m2, m4, m0, Z2, 6 |
mova m1, Z(0) |
mova m2, Z(4) |
INTERL m5, m1, m3, Z, 0 |
INTERL m6, m2, m7, Z, 4 |
%endif |
%endmacro |
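The per-lane arithmetic annotated in PASS_SMALL/PASS_BIG, written out as scalar C; a sketch with names of my choosing (z0..z3 are the four coefficients n/4 apart, (wre,wim) the twiddle factor):

static void pass_butterfly(FFTComplex *z0, FFTComplex *z1,
                           FFTComplex *z2, FFTComplex *z3,
                           float wre, float wim)
{
    float ta = z2->re * wre + z2->im * wim; /* r2*wre + i2*wim */
    float tb = z2->im * wre - z2->re * wim; /* i2*wre - r2*wim */
    float tc = z3->re * wre - z3->im * wim; /* r3*wre - i3*wim */
    float td = z3->im * wre + z3->re * wim; /* i3*wre + r3*wim */
    float t5 = tc + ta, t3 = tc - ta;
    float t6 = tb + td, t4 = tb - td;
    z2->re = z0->re - t5; z0->re += t5; /* r2, r0 */
    z2->im = z0->im - t6; z0->im += t6; /* i2, i0 */
    z3->re = z1->re - t4; z1->re += t4; /* r3, r1 */
    z3->im = z1->im - t3; z1->im += t3; /* i3, i1 */
}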
%macro PUNPCK 3 |
mova %3, %1 |
punpckldq %1, %2 |
punpckhdq %3, %2 |
%endmacro |
%define Z(x) [r0+mmsize*x] |
%define Z2(x) [r0+mmsize*x] |
%define ZH(x) [r0+mmsize*x+mmsize/2] |
INIT_YMM avx |
%if HAVE_AVX_EXTERNAL |
align 16 |
fft8_avx: |
mova m0, Z(0) |
mova m1, Z(1) |
T8_AVX m0, m1, m2, m3, m4 |
mova Z(0), m0 |
mova Z(1), m1 |
ret |
align 16 |
fft16_avx: |
mova m2, Z(2) |
mova m3, Z(3) |
T4_SSE m2, m3, m7 |
mova m0, Z(0) |
mova m1, Z(1) |
T8_AVX m0, m1, m4, m5, m7 |
mova m4, [ps_cos16_1] |
mova m5, [ps_cos16_2] |
vmulps m6, m2, m4 |
vmulps m7, m3, m5 |
vaddps m7, m7, m6 |
vmulps m2, m2, m5 |
vmulps m3, m3, m4 |
vsubps m3, m3, m2 |
vblendps m2, m7, m3, 0xf0 |
vperm2f128 m3, m7, m3, 0x21 |
vaddps m4, m2, m3 |
vsubps m2, m3, m2 |
vperm2f128 m2, m2, m2, 0x01 |
vsubps m3, m1, m2 |
vaddps m1, m1, m2 |
vsubps m5, m0, m4 |
vaddps m0, m0, m4 |
vextractf128 Z(0), m0, 0 |
vextractf128 ZH(0), m1, 0 |
vextractf128 Z(1), m0, 1 |
vextractf128 ZH(1), m1, 1 |
vextractf128 Z(2), m5, 0 |
vextractf128 ZH(2), m3, 0 |
vextractf128 Z(3), m5, 1 |
vextractf128 ZH(3), m3, 1 |
ret |
align 16 |
fft32_avx: |
call fft16_avx |
mova m0, Z(4) |
mova m1, Z(5) |
T4_SSE m0, m1, m4 |
mova m2, Z(6) |
mova m3, Z(7) |
T8_SSE m0, m1, m2, m3, m4, m6 |
; m0={r0,r1,r2,r3,r8, r9, r10,r11} m1={i0,i1,i2,i3,i8, i9, i10,i11} |
; m2={r4,r5,r6,r7,r12,r13,r14,r15} m3={i4,i5,i6,i7,i12,i13,i14,i15} |
vperm2f128 m4, m0, m2, 0x20 |
vperm2f128 m5, m1, m3, 0x20 |
vperm2f128 m6, m0, m2, 0x31 |
vperm2f128 m7, m1, m3, 0x31 |
PASS_SMALL 0, [cos_32], [cos_32+32] |
ret |
fft32_interleave_avx: |
call fft32_avx |
mov r2d, 32 |
.deint_loop: |
mova m2, Z(0) |
mova m3, Z(1) |
vunpcklps m0, m2, m3 |
vunpckhps m1, m2, m3 |
vextractf128 Z(0), m0, 0 |
vextractf128 ZH(0), m1, 0 |
vextractf128 Z(1), m0, 1 |
vextractf128 ZH(1), m1, 1 |
add r0, mmsize*2 |
sub r2d, mmsize/4 |
jg .deint_loop |
ret |
%endif |
INIT_XMM sse |
align 16 |
fft4_avx: |
fft4_sse: |
mova m0, Z(0) |
mova m1, Z(1) |
T4_SSE m0, m1, m2 |
mova Z(0), m0 |
mova Z(1), m1 |
ret |
align 16 |
fft8_sse: |
mova m0, Z(0) |
mova m1, Z(1) |
T4_SSE m0, m1, m2 |
mova m2, Z(2) |
mova m3, Z(3) |
T8_SSE m0, m1, m2, m3, m4, m5 |
mova Z(0), m0 |
mova Z(1), m1 |
mova Z(2), m2 |
mova Z(3), m3 |
ret |
align 16 |
fft16_sse: |
mova m0, Z(0) |
mova m1, Z(1) |
T4_SSE m0, m1, m2 |
mova m2, Z(2) |
mova m3, Z(3) |
T8_SSE m0, m1, m2, m3, m4, m5 |
mova m4, Z(4) |
mova m5, Z(5) |
mova Z(0), m0 |
mova Z(1), m1 |
mova Z(2), m2 |
mova Z(3), m3 |
T4_SSE m4, m5, m6 |
mova m6, Z2(6) |
mova m7, Z2(7) |
T4_SSE m6, m7, m0 |
PASS_SMALL 0, [cos_16], [cos_16+16] |
ret |
%macro FFT48_3DNOW 0 |
align 16 |
fft4 %+ SUFFIX: |
T2_3DNOW m0, m1, Z(0), Z(1) |
mova m2, Z(2) |
mova m3, Z(3) |
T4_3DNOW m0, m1, m2, m3, m4, m5 |
PUNPCK m0, m1, m4 |
PUNPCK m2, m3, m5 |
mova Z(0), m0 |
mova Z(1), m4 |
mova Z(2), m2 |
mova Z(3), m5 |
ret |
align 16 |
fft8 %+ SUFFIX: |
T2_3DNOW m0, m1, Z(0), Z(1) |
mova m2, Z(2) |
mova m3, Z(3) |
T4_3DNOW m0, m1, m2, m3, m4, m5 |
mova Z(0), m0 |
mova Z(2), m2 |
T2_3DNOW m4, m5, Z(4), Z(5) |
T2_3DNOW m6, m7, Z2(6), Z2(7) |
PSWAPD m0, m5 |
PSWAPD m2, m7 |
pxor m0, [ps_m1p1] |
pxor m2, [ps_m1p1] |
pfsub m5, m0 |
pfadd m7, m2 |
pfmul m5, [ps_root2] |
pfmul m7, [ps_root2] |
T4_3DNOW m1, m3, m5, m7, m0, m2 |
mova Z(5), m5 |
mova Z2(7), m7 |
mova m0, Z(0) |
mova m2, Z(2) |
T4_3DNOW m0, m2, m4, m6, m5, m7 |
PUNPCK m0, m1, m5 |
PUNPCK m2, m3, m7 |
mova Z(0), m0 |
mova Z(1), m5 |
mova Z(2), m2 |
mova Z(3), m7 |
PUNPCK m4, Z(5), m5 |
PUNPCK m6, Z2(7), m7 |
mova Z(4), m4 |
mova Z(5), m5 |
mova Z2(6), m6 |
mova Z2(7), m7 |
ret |
%endmacro |
%if ARCH_X86_32 |
INIT_MMX 3dnowext |
FFT48_3DNOW |
INIT_MMX 3dnow |
FFT48_3DNOW |
%endif |
%define Z(x) [zcq + o1q*(x&6) + mmsize*(x&1)] |
%define Z2(x) [zcq + o3q + mmsize*(x&1)] |
%define ZH(x) [zcq + o1q*(x&6) + mmsize*(x&1) + mmsize/2] |
%define Z2H(x) [zcq + o3q + mmsize*(x&1) + mmsize/2] |
%macro DECL_PASS 2+ ; name, payload |
align 16 |
%1: |
DEFINE_ARGS zc, w, n, o1, o3 |
lea o3q, [nq*3] |
lea o1q, [nq*8] |
shl o3q, 4 |
.loop: |
%2 |
add zcq, mmsize*2 |
add wq, mmsize |
sub nd, mmsize/8 |
jg .loop |
rep ret |
%endmacro |
%macro FFT_DISPATCH 2 ; clobbers 5 GPRs, 8 XMMs |
lea r2, [dispatch_tab%1] |
mov r2, [r2 + (%2q-2)*gprsize] |
%ifdef PIC |
lea r3, [$$] |
add r2, r3 |
%endif |
call r2 |
%endmacro ; FFT_DISPATCH |
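In C terms the dispatcher is an indexed call through a table of function pointers, one entry per transform size starting at nbits == 2 (a sketch; the PIC branch above only re-bases the section-relative offsets stored in the table):

typedef void (*fft_func)(FFTComplex *z);

extern fft_func dispatch_tab[]; /* emitted by DECL_FFT below; entry 0 = fft4 */

static void fft_dispatch(FFTComplex *z, int nbits)
{
    dispatch_tab[nbits - 2](z);
}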
INIT_YMM avx |
%if HAVE_AVX_EXTERNAL |
%macro INTERL_AVX 5 |
vunpckhps %3, %2, %1 |
vunpcklps %2, %2, %1 |
vextractf128 %4(%5), %2, 0 |
vextractf128 %4 %+ H(%5), %3, 0 |
vextractf128 %4(%5 + 1), %2, 1 |
vextractf128 %4 %+ H(%5 + 1), %3, 1 |
%endmacro |
%define INTERL INTERL_AVX |
DECL_PASS pass_avx, PASS_BIG 1 |
DECL_PASS pass_interleave_avx, PASS_BIG 0 |
cglobal fft_calc, 2,5,8 |
mov r3d, [r0 + FFTContext.nbits] |
mov r0, r1 |
mov r1, r3 |
FFT_DISPATCH _interleave %+ SUFFIX, r1 |
REP_RET |
%endif |
INIT_XMM sse |
%macro INTERL_SSE 5 |
mova %3, %2 |
unpcklps %2, %1 |
unpckhps %3, %1 |
mova %4(%5), %2 |
mova %4(%5+1), %3 |
%endmacro |
%define INTERL INTERL_SSE |
DECL_PASS pass_sse, PASS_BIG 1 |
DECL_PASS pass_interleave_sse, PASS_BIG 0 |
%macro FFT_CALC_FUNC 0 |
cglobal fft_calc, 2,5,8 |
mov r3d, [r0 + FFTContext.nbits] |
PUSH r1 |
PUSH r3 |
mov r0, r1 |
mov r1, r3 |
FFT_DISPATCH _interleave %+ SUFFIX, r1 |
POP rcx |
POP r4 |
cmp rcx, 3+(mmsize/16) |
jg .end |
mov r2, -1 |
add rcx, 3 |
shl r2, cl |
sub r4, r2 |
.loop: |
%if mmsize == 8 |
PSWAPD m0, [r4 + r2 + 4] |
mova [r4 + r2 + 4], m0 |
%else |
movaps xmm0, [r4 + r2] |
movaps xmm1, xmm0 |
unpcklps xmm0, [r4 + r2 + 16] |
unpckhps xmm1, [r4 + r2 + 16] |
movaps [r4 + r2], xmm0 |
movaps [r4 + r2 + 16], xmm1 |
%endif |
add r2, mmsize*2 |
jl .loop |
.end: |
%if cpuflag(3dnow) |
femms |
RET |
%else |
REP_RET |
%endif |
%endmacro |
%if ARCH_X86_32 |
INIT_MMX 3dnow |
FFT_CALC_FUNC |
INIT_MMX 3dnowext |
FFT_CALC_FUNC |
%endif |
INIT_XMM sse |
FFT_CALC_FUNC |
cglobal fft_permute, 2,7,1 |
mov r4, [r0 + FFTContext.revtab] |
mov r5, [r0 + FFTContext.tmpbuf] |
mov ecx, [r0 + FFTContext.nbits] |
mov r2, 1 |
shl r2, cl |
xor r0, r0 |
%if ARCH_X86_32 |
mov r1, r1m |
%endif |
.loop: |
movaps xmm0, [r1 + 8*r0] |
movzx r6, word [r4 + 2*r0] |
movzx r3, word [r4 + 2*r0 + 2] |
movlps [r5 + 8*r6], xmm0 |
movhps [r5 + 8*r3], xmm0 |
add r0, 2 |
cmp r0, r2 |
jl .loop |
shl r2, 3 |
add r1, r2 |
add r5, r2 |
neg r2 |
; nbits >= 2 (FFT4) and sizeof(FFTComplex)=8 => at least 32B |
.loopcopy: |
movaps xmm0, [r5 + r2] |
movaps xmm1, [r5 + r2 + 16] |
movaps [r1 + r2], xmm0 |
movaps [r1 + r2 + 16], xmm1 |
add r2, 32 |
jl .loopcopy |
REP_RET |
%macro IMDCT_CALC_FUNC 0 |
cglobal imdct_calc, 3,5,3 |
mov r3d, [r0 + FFTContext.mdctsize] |
mov r4, [r0 + FFTContext.imdcthalf] |
add r1, r3 |
PUSH r3 |
PUSH r1 |
%if ARCH_X86_32 |
push r2 |
push r1 |
push r0 |
%else |
sub rsp, 8+32*WIN64 ; allocate win64 shadow space |
%endif |
call r4 |
%if ARCH_X86_32 |
add esp, 12 |
%else |
add rsp, 8+32*WIN64 |
%endif |
POP r1 |
POP r3 |
lea r0, [r1 + 2*r3] |
mov r2, r3 |
sub r3, mmsize |
neg r2 |
mova m2, [ps_m1m1m1m1] |
.loop: |
%if mmsize == 8 |
PSWAPD m0, [r1 + r3] |
PSWAPD m1, [r0 + r2] |
pxor m0, m2 |
%else |
mova m0, [r1 + r3] |
mova m1, [r0 + r2] |
shufps m0, m0, 0x1b |
shufps m1, m1, 0x1b |
xorps m0, m2 |
%endif |
mova [r0 + r3], m1 |
mova [r1 + r2], m0 |
sub r3, mmsize |
add r2, mmsize |
jl .loop |
%if cpuflag(3dnow) |
femms |
RET |
%else |
REP_RET |
%endif |
%endmacro |
%if ARCH_X86_32 |
INIT_MMX 3dnow |
IMDCT_CALC_FUNC |
INIT_MMX 3dnowext |
IMDCT_CALC_FUNC |
%endif |
INIT_XMM sse |
IMDCT_CALC_FUNC |
%if ARCH_X86_32 |
INIT_MMX 3dnow |
%define mulps pfmul |
%define addps pfadd |
%define subps pfsub |
%define unpcklps punpckldq |
%define unpckhps punpckhdq |
DECL_PASS pass_3dnow, PASS_SMALL 1, [wq], [wq+o1q] |
DECL_PASS pass_interleave_3dnow, PASS_BIG 0 |
%define pass_3dnowext pass_3dnow |
%define pass_interleave_3dnowext pass_interleave_3dnow |
%endif |
%ifdef PIC |
%define SECTION_REL - $$ |
%else |
%define SECTION_REL |
%endif |
%macro DECL_FFT 1-2 ; nbits, suffix |
%ifidn %0, 1 |
%xdefine fullsuffix SUFFIX |
%else |
%xdefine fullsuffix %2 %+ SUFFIX |
%endif |
%xdefine list_of_fft fft4 %+ SUFFIX SECTION_REL, fft8 %+ SUFFIX SECTION_REL |
%if %1>=5 |
%xdefine list_of_fft list_of_fft, fft16 %+ SUFFIX SECTION_REL |
%endif |
%if %1>=6 |
%xdefine list_of_fft list_of_fft, fft32 %+ fullsuffix SECTION_REL |
%endif |
%assign n 1<<%1 |
%rep 17-%1 |
%assign n2 n/2 |
%assign n4 n/4 |
%xdefine list_of_fft list_of_fft, fft %+ n %+ fullsuffix SECTION_REL |
align 16 |
fft %+ n %+ fullsuffix: |
call fft %+ n2 %+ SUFFIX |
add r0, n*4 - (n&(-2<<%1)) |
call fft %+ n4 %+ SUFFIX |
add r0, n*2 - (n2&(-2<<%1)) |
call fft %+ n4 %+ SUFFIX |
sub r0, n*6 + (n2&(-2<<%1)) |
lea r1, [cos_ %+ n] |
mov r2d, n4/2 |
jmp pass %+ fullsuffix |
%assign n n*2 |
%endrep |
%undef n |
align 8 |
dispatch_tab %+ fullsuffix: pointer list_of_fft |
%endmacro ; DECL_FFT |
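Each generated fft %+ n body follows the classic split-radix recursion; its C shape (a sketch mirroring libavcodec/fft_template.c, with fft/pass standing for the size-specific routines and cos_n for the twiddle table):

static void fftn_ref(FFTComplex *z, int n)
{
    int n4 = n / 4;
    fft(z,          n / 2); /* even half                          */
    fft(z + 2 * n4, n4);    /* two odd quarters                   */
    fft(z + 3 * n4, n4);
    pass(z, cos_n, n4 / 2); /* combine, matching "mov r2d, n4/2"  */
}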
%if HAVE_AVX_EXTERNAL |
INIT_YMM avx |
DECL_FFT 6 |
DECL_FFT 6, _interleave |
%endif |
INIT_XMM sse |
DECL_FFT 5 |
DECL_FFT 5, _interleave |
%if ARCH_X86_32 |
INIT_MMX 3dnow |
DECL_FFT 4 |
DECL_FFT 4, _interleave |
INIT_MMX 3dnowext |
DECL_FFT 4 |
DECL_FFT 4, _interleave |
%endif |
INIT_XMM sse |
%undef mulps |
%undef addps |
%undef subps |
%undef unpcklps |
%undef unpckhps |
%macro PREROTATER 5 ;-2*k, 2*k, input+n4, tcos+n8, tsin+n8 |
%if mmsize == 8 ; j*2+2-n4, n4-2-j*2, input+n4, tcos+n8, tsin+n8 |
PSWAPD m0, [%3+%2*4] |
movq m2, [%3+%1*4-8] |
movq m3, m0 |
punpckldq m0, m2 |
punpckhdq m2, m3 |
movd m1, [%4+%1*2-4] ; tcos[j] |
movd m3, [%4+%2*2] ; tcos[n4-j-1] |
punpckldq m1, [%5+%1*2-4] ; tsin[j] |
punpckldq m3, [%5+%2*2] ; tsin[n4-j-1] |
mova m4, m0 |
PSWAPD m5, m1 |
pfmul m0, m1 |
pfmul m4, m5 |
mova m6, m2 |
PSWAPD m5, m3 |
pfmul m2, m3 |
pfmul m6, m5 |
%if cpuflag(3dnowext) |
pfpnacc m0, m4 |
pfpnacc m2, m6 |
%else |
SBUTTERFLY dq, 0, 4, 1 |
SBUTTERFLY dq, 2, 6, 3 |
pxor m4, m7 |
pxor m6, m7 |
pfadd m0, m4 |
pfadd m2, m6 |
%endif |
%else |
movaps xmm0, [%3+%2*4] |
movaps xmm1, [%3+%1*4-0x10] |
movaps xmm2, xmm0 |
shufps xmm0, xmm1, 0x88 |
shufps xmm1, xmm2, 0x77 |
movlps xmm4, [%4+%2*2] |
movlps xmm5, [%5+%2*2+0x0] |
movhps xmm4, [%4+%1*2-0x8] |
movhps xmm5, [%5+%1*2-0x8] |
movaps xmm2, xmm0 |
movaps xmm3, xmm1 |
mulps xmm0, xmm5 |
mulps xmm1, xmm4 |
mulps xmm2, xmm4 |
mulps xmm3, xmm5 |
subps xmm1, xmm0 |
addps xmm2, xmm3 |
movaps xmm0, xmm1 |
unpcklps xmm1, xmm2 |
unpckhps xmm0, xmm2 |
%endif |
%endmacro |
%macro CMUL 6 ;j, xmm0, xmm1, 3, 4, 5 |
mulps m6, %3, [%5+%1] |
mulps m7, %2, [%5+%1] |
mulps %2, %2, [%6+%1] |
mulps %3, %3, [%6+%1] |
subps %2, %2, m6 |
addps %3, %3, m7 |
%endmacro |
%macro POSROTATESHUF_AVX 5 ;j, k, z+n8, tcos+n8, tsin+n8 |
.post: |
vmovaps ymm1, [%3+%1*2] |
vmovaps ymm0, [%3+%1*2+0x20] |
vmovaps ymm3, [%3+%2*2] |
vmovaps ymm2, [%3+%2*2+0x20] |
CMUL %1, ymm0, ymm1, %3, %4, %5 |
CMUL %2, ymm2, ymm3, %3, %4, %5 |
vshufps ymm1, ymm1, ymm1, 0x1b |
vshufps ymm3, ymm3, ymm3, 0x1b |
vperm2f128 ymm1, ymm1, ymm1, 0x01 |
vperm2f128 ymm3, ymm3, ymm3, 0x01 |
vunpcklps ymm6, ymm2, ymm1 |
vunpckhps ymm4, ymm2, ymm1 |
vunpcklps ymm7, ymm0, ymm3 |
vunpckhps ymm5, ymm0, ymm3 |
vextractf128 [%3+%1*2], ymm7, 0 |
vextractf128 [%3+%1*2+0x10], ymm5, 0 |
vextractf128 [%3+%1*2+0x20], ymm7, 1 |
vextractf128 [%3+%1*2+0x30], ymm5, 1 |
vextractf128 [%3+%2*2], ymm6, 0 |
vextractf128 [%3+%2*2+0x10], ymm4, 0 |
vextractf128 [%3+%2*2+0x20], ymm6, 1 |
vextractf128 [%3+%2*2+0x30], ymm4, 1 |
sub %2, 0x20 |
add %1, 0x20 |
jl .post |
%endmacro |
%macro POSROTATESHUF 5 ;j, k, z+n8, tcos+n8, tsin+n8 |
.post: |
movaps xmm1, [%3+%1*2] |
movaps xmm0, [%3+%1*2+0x10] |
CMUL %1, xmm0, xmm1, %3, %4, %5 |
movaps xmm5, [%3+%2*2] |
movaps xmm4, [%3+%2*2+0x10] |
CMUL %2, xmm4, xmm5, %3, %4, %5 |
shufps xmm1, xmm1, 0x1b |
shufps xmm5, xmm5, 0x1b |
movaps xmm6, xmm4 |
unpckhps xmm4, xmm1 |
unpcklps xmm6, xmm1 |
movaps xmm2, xmm0 |
unpcklps xmm0, xmm5 |
unpckhps xmm2, xmm5 |
movaps [%3+%2*2], xmm6 |
movaps [%3+%2*2+0x10], xmm4 |
movaps [%3+%1*2], xmm0 |
movaps [%3+%1*2+0x10], xmm2 |
sub %2, 0x10 |
add %1, 0x10 |
jl .post |
%endmacro |
%macro CMUL_3DNOW 6 |
mova m6, [%1+%2*2] |
mova %3, [%1+%2*2+8] |
mova %4, m6 |
mova m7, %3 |
pfmul m6, [%5+%2] |
pfmul %3, [%6+%2] |
pfmul %4, [%6+%2] |
pfmul m7, [%5+%2] |
pfsub %3, m6 |
pfadd %4, m7 |
%endmacro |
%macro POSROTATESHUF_3DNOW 5 ;j, k, z+n8, tcos+n8, tsin+n8 |
.post: |
CMUL_3DNOW %3, %1, m0, m1, %4, %5 |
CMUL_3DNOW %3, %2, m2, m3, %4, %5 |
movd [%3+%1*2+ 0], m0 |
movd [%3+%2*2+12], m1 |
movd [%3+%2*2+ 0], m2 |
movd [%3+%1*2+12], m3 |
psrlq m0, 32 |
psrlq m1, 32 |
psrlq m2, 32 |
psrlq m3, 32 |
movd [%3+%1*2+ 8], m0 |
movd [%3+%2*2+ 4], m1 |
movd [%3+%2*2+ 8], m2 |
movd [%3+%1*2+ 4], m3 |
sub %2, 8 |
add %1, 8 |
jl .post |
%endmacro |
%macro DECL_IMDCT 1 |
cglobal imdct_half, 3,12,8; FFTContext *s, FFTSample *output, const FFTSample *input |
%if ARCH_X86_64 |
%define rrevtab r7 |
%define rtcos r8 |
%define rtsin r9 |
%else |
%define rrevtab r6 |
%define rtsin r6 |
%define rtcos r5 |
%endif |
mov r3d, [r0+FFTContext.mdctsize] |
add r2, r3 |
shr r3, 1 |
mov rtcos, [r0+FFTContext.tcos] |
mov rtsin, [r0+FFTContext.tsin] |
add rtcos, r3 |
add rtsin, r3 |
%if ARCH_X86_64 == 0 |
push rtcos |
push rtsin |
%endif |
shr r3, 1 |
mov rrevtab, [r0+FFTContext.revtab] |
add rrevtab, r3 |
%if ARCH_X86_64 == 0 |
push rrevtab |
%endif |
%if mmsize == 8 |
sub r3, 2 |
%else |
sub r3, 4 |
%endif |
%if ARCH_X86_64 || mmsize == 8 |
xor r4, r4 |
sub r4, r3 |
%endif |
%if notcpuflag(3dnowext) && mmsize == 8 |
movd m7, [ps_m1m1m1m1] |
%endif |
.pre: |
%if ARCH_X86_64 == 0 |
; unspill |
%if mmsize != 8 |
xor r4, r4 |
sub r4, r3 |
%endif |
mov rtcos, [esp+8] |
mov rtsin, [esp+4] |
%endif |
PREROTATER r4, r3, r2, rtcos, rtsin |
%if mmsize == 8 |
mov r6, [esp] ; rrevtab = ptr+n8 |
movzx r5, word [rrevtab+r4-2] ; rrevtab[j] |
movzx r6, word [rrevtab+r3] ; rrevtab[n4-j-1] |
mova [r1+r5*8], m0 |
mova [r1+r6*8], m2 |
add r4, 2 |
sub r3, 2 |
%else |
%if ARCH_X86_64 |
movzx r5, word [rrevtab+r4-4] |
movzx r6, word [rrevtab+r4-2] |
movzx r10, word [rrevtab+r3] |
movzx r11, word [rrevtab+r3+2] |
movlps [r1+r5 *8], xmm0 |
movhps [r1+r6 *8], xmm0 |
movlps [r1+r10*8], xmm1 |
movhps [r1+r11*8], xmm1 |
add r4, 4 |
%else |
mov r6, [esp] |
movzx r5, word [r6+r4-4] |
movzx r4, word [r6+r4-2] |
movlps [r1+r5*8], xmm0 |
movhps [r1+r4*8], xmm0 |
movzx r5, word [r6+r3] |
movzx r4, word [r6+r3+2] |
movlps [r1+r5*8], xmm1 |
movhps [r1+r4*8], xmm1 |
%endif |
sub r3, 4 |
%endif |
jns .pre |
mov r5, r0 |
mov r6, r1 |
mov r0, r1 |
mov r1d, [r5+FFTContext.nbits] |
FFT_DISPATCH SUFFIX, r1 |
mov r0d, [r5+FFTContext.mdctsize] |
add r6, r0 |
shr r0, 1 |
%if ARCH_X86_64 == 0 |
%define rtcos r2 |
%define rtsin r3 |
mov rtcos, [esp+8] |
mov rtsin, [esp+4] |
%endif |
neg r0 |
mov r1, -mmsize |
sub r1, r0 |
%1 r0, r1, r6, rtcos, rtsin |
%if ARCH_X86_64 == 0 |
add esp, 12 |
%endif |
%if mmsize == 8 |
femms |
%endif |
RET |
%endmacro |
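For orientation, imdct_half follows the usual three-phase IMDCT-via-FFT scheme of libavcodec/mdct_template.c; roughly (a loose sketch, with CMUL(dre,dim, are,aim, bre,bim) the complex multiply dre = are*bre - aim*bim, dim = are*bim + aim*bre):

/* N-point imdct_half by way of an N/4-point complex FFT */
in1 = input;
in2 = input + n2 - 1;
for (k = 0; k < n4; k++) {  /* pre-rotate + bit-reverse (the .pre loop) */
    j = revtab[k];
    CMUL(z[j].re, z[j].im, *in2, *in1, tcos[k], tsin[k]);
    in1 += 2;
    in2 -= 2;
}
fft_calc(z);                /* FFT_DISPATCH above                       */
/* ... post-rotate by tcos/tsin, done by the POSROTATESHUF variant (%1) */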
DECL_IMDCT POSROTATESHUF |
%if ARCH_X86_32 |
INIT_MMX 3dnow |
DECL_IMDCT POSROTATESHUF_3DNOW |
INIT_MMX 3dnowext |
DECL_IMDCT POSROTATESHUF_3DNOW |
%endif |
INIT_YMM avx |
%if HAVE_AVX_EXTERNAL |
DECL_IMDCT POSROTATESHUF_AVX |
%endif |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/fft.h |
---|
0,0 → 1,38 |
/* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#ifndef AVCODEC_X86_FFT_H |
#define AVCODEC_X86_FFT_H |
#include "libavcodec/fft.h" |
void ff_fft_permute_sse(FFTContext *s, FFTComplex *z); |
void ff_fft_calc_avx(FFTContext *s, FFTComplex *z); |
void ff_fft_calc_sse(FFTContext *s, FFTComplex *z); |
void ff_fft_calc_3dnow(FFTContext *s, FFTComplex *z); |
void ff_fft_calc_3dnowext(FFTContext *s, FFTComplex *z); |
void ff_imdct_calc_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input); |
void ff_imdct_half_3dnow(FFTContext *s, FFTSample *output, const FFTSample *input); |
void ff_imdct_calc_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input); |
void ff_imdct_half_3dnowext(FFTContext *s, FFTSample *output, const FFTSample *input); |
void ff_imdct_calc_sse(FFTContext *s, FFTSample *output, const FFTSample *input); |
void ff_imdct_half_sse(FFTContext *s, FFTSample *output, const FFTSample *input); |
void ff_imdct_half_avx(FFTContext *s, FFTSample *output, const FFTSample *input); |
#endif /* AVCODEC_X86_FFT_H */ |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/fft_init.c |
---|
0,0 → 1,57 |
/* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "config.h" |
#include "libavutil/attributes.h" |
#include "libavutil/cpu.h" |
#include "libavutil/x86/cpu.h" |
#include "fft.h" |
av_cold void ff_fft_init_x86(FFTContext *s) |
{ |
int cpu_flags = av_get_cpu_flags(); |
#if ARCH_X86_32 |
if (EXTERNAL_AMD3DNOW(cpu_flags)) { |
/* 3DNow! for K6-2/3 */ |
s->imdct_calc = ff_imdct_calc_3dnow; |
s->imdct_half = ff_imdct_half_3dnow; |
s->fft_calc = ff_fft_calc_3dnow; |
} |
if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) { |
/* 3DNowEx for K7 */ |
s->imdct_calc = ff_imdct_calc_3dnowext; |
s->imdct_half = ff_imdct_half_3dnowext; |
s->fft_calc = ff_fft_calc_3dnowext; |
} |
#endif |
if (EXTERNAL_SSE(cpu_flags)) { |
/* SSE for P3/P4/K8 */ |
s->imdct_calc = ff_imdct_calc_sse; |
s->imdct_half = ff_imdct_half_sse; |
s->fft_permute = ff_fft_permute_sse; |
s->fft_calc = ff_fft_calc_sse; |
s->fft_permutation = FF_FFT_PERM_SWAP_LSBS; |
} |
if (EXTERNAL_AVX(cpu_flags) && s->nbits >= 5) { |
/* AVX for SB */ |
s->imdct_half = ff_imdct_half_avx; |
s->fft_calc = ff_fft_calc_avx; |
s->fft_permutation = FF_FFT_PERM_AVX; |
} |
} |
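For context, the usual calling sequence from the portable side; a sketch (ff_fft_init() installs the C implementations first, then ff_fft_init_x86() overrides them as above):

FFTContext ctx;
FFTComplex z[32];

ff_fft_init(&ctx, 5, 0);  /* 2^5 = 32-point forward transform    */
ctx.fft_permute(&ctx, z); /* bit-reversal reorder via ctx.revtab */
ctx.fft_calc(&ctx, z);    /* in place, e.g. ff_fft_calc_avx      */
ff_fft_end(&ctx);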
/contrib/sdk/sources/ffmpeg/libavcodec/x86/fmtconvert.asm |
---|
0,0 → 1,429 |
;****************************************************************************** |
;* x86 optimized Format Conversion Utils |
;* Copyright (c) 2008 Loren Merritt |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_TEXT |
%macro CVTPS2PI 2 |
%if cpuflag(sse) |
cvtps2pi %1, %2 |
%elif cpuflag(3dnow) |
pf2id %1, %2 |
%endif |
%endmacro |
;--------------------------------------------------------------------------------- |
; void int32_to_float_fmul_scalar(float *dst, const int32_t *src, float mul, int len); |
;--------------------------------------------------------------------------------- |
%macro INT32_TO_FLOAT_FMUL_SCALAR 1 |
%if UNIX64 |
cglobal int32_to_float_fmul_scalar, 3, 3, %1, dst, src, len |
%else |
cglobal int32_to_float_fmul_scalar, 4, 4, %1, dst, src, mul, len |
%endif |
%if WIN64 |
SWAP 0, 2 |
%elif ARCH_X86_32 |
movss m0, mulm |
%endif |
SPLATD m0 |
shl lenq, 2 |
add srcq, lenq |
add dstq, lenq |
neg lenq |
.loop: |
%if cpuflag(sse2) |
cvtdq2ps m1, [srcq+lenq ] |
cvtdq2ps m2, [srcq+lenq+16] |
%else |
cvtpi2ps m1, [srcq+lenq ] |
cvtpi2ps m3, [srcq+lenq+ 8] |
cvtpi2ps m2, [srcq+lenq+16] |
cvtpi2ps m4, [srcq+lenq+24] |
movlhps m1, m3 |
movlhps m2, m4 |
%endif |
mulps m1, m0 |
mulps m2, m0 |
mova [dstq+lenq ], m1 |
mova [dstq+lenq+16], m2 |
add lenq, 32 |
jl .loop |
REP_RET |
%endmacro |
INIT_XMM sse |
INT32_TO_FLOAT_FMUL_SCALAR 5 |
INIT_XMM sse2 |
INT32_TO_FLOAT_FMUL_SCALAR 3 |
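Scalar equivalent of the loop above (a sketch; the SIMD versions additionally assume len is a multiple of 8 and the buffers are 16-byte aligned):

static void int32_to_float_fmul_scalar_ref(float *dst, const int32_t *src,
                                           float mul, int len)
{
    int i;
    for (i = 0; i < len; i++)
        dst[i] = src[i] * mul;
}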
;------------------------------------------------------------------------------ |
; void ff_float_to_int16(int16_t *dst, const float *src, long len); |
;------------------------------------------------------------------------------ |
%macro FLOAT_TO_INT16 1 |
cglobal float_to_int16, 3, 3, %1, dst, src, len |
add lenq, lenq |
lea srcq, [srcq+2*lenq] |
add dstq, lenq |
neg lenq |
.loop: |
%if cpuflag(sse2) |
cvtps2dq m0, [srcq+2*lenq ] |
cvtps2dq m1, [srcq+2*lenq+16] |
packssdw m0, m1 |
mova [dstq+lenq], m0 |
%else |
CVTPS2PI m0, [srcq+2*lenq ] |
CVTPS2PI m1, [srcq+2*lenq+ 8] |
CVTPS2PI m2, [srcq+2*lenq+16] |
CVTPS2PI m3, [srcq+2*lenq+24] |
packssdw m0, m1 |
packssdw m2, m3 |
mova [dstq+lenq ], m0 |
mova [dstq+lenq+8], m2 |
%endif |
add lenq, 16 |
js .loop |
%if mmsize == 8 |
emms |
%endif |
REP_RET |
%endmacro |
INIT_XMM sse2 |
FLOAT_TO_INT16 2 |
INIT_MMX sse |
FLOAT_TO_INT16 0 |
INIT_MMX 3dnow |
FLOAT_TO_INT16 0 |
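Scalar equivalent (a sketch): the conversion rounds (cvtps2pi/cvtps2dq use the current rounding mode, nearest by default; pf2id truncates) and packssdw saturates to the int16 range, so in C roughly:

#include <math.h>
#include <stdint.h>

static void float_to_int16_ref(int16_t *dst, const float *src, long len)
{
    long i;
    for (i = 0; i < len; i++) {
        int v = lrintf(src[i]);              /* round to nearest    */
        dst[i] = v >  32767 ?  32767 :
                 v < -32768 ? -32768 : v;    /* packssdw saturation */
    }
}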
;------------------------------------------------------------------------------ |
; void ff_float_to_int16_step(int16_t *dst, const float *src, long len, long step); |
;------------------------------------------------------------------------------ |
%macro FLOAT_TO_INT16_STEP 1 |
cglobal float_to_int16_step, 4, 7, %1, dst, src, len, step, step3, v1, v2 |
add lenq, lenq |
lea srcq, [srcq+2*lenq] |
lea step3q, [stepq*3] |
neg lenq |
.loop: |
%if cpuflag(sse2) |
cvtps2dq m0, [srcq+2*lenq ] |
cvtps2dq m1, [srcq+2*lenq+16] |
packssdw m0, m1 |
movd v1d, m0 |
psrldq m0, 4 |
movd v2d, m0 |
psrldq m0, 4 |
mov [dstq], v1w |
mov [dstq+stepq*4], v2w |
shr v1d, 16 |
shr v2d, 16 |
mov [dstq+stepq*2], v1w |
mov [dstq+step3q*2], v2w |
lea dstq, [dstq+stepq*8] |
movd v1d, m0 |
psrldq m0, 4 |
movd v2d, m0 |
mov [dstq], v1w |
mov [dstq+stepq*4], v2w |
shr v1d, 16 |
shr v2d, 16 |
mov [dstq+stepq*2], v1w |
mov [dstq+step3q*2], v2w |
lea dstq, [dstq+stepq*8] |
%else |
CVTPS2PI m0, [srcq+2*lenq ] |
CVTPS2PI m1, [srcq+2*lenq+ 8] |
CVTPS2PI m2, [srcq+2*lenq+16] |
CVTPS2PI m3, [srcq+2*lenq+24] |
packssdw m0, m1 |
packssdw m2, m3 |
movd v1d, m0 |
psrlq m0, 32 |
movd v2d, m0 |
mov [dstq], v1w |
mov [dstq+stepq*4], v2w |
shr v1d, 16 |
shr v2d, 16 |
mov [dstq+stepq*2], v1w |
mov [dstq+step3q*2], v2w |
lea dstq, [dstq+stepq*8] |
movd v1d, m2 |
psrlq m2, 32 |
movd v2d, m2 |
mov [dstq], v1w |
mov [dstq+stepq*4], v2w |
shr v1d, 16 |
shr v2d, 16 |
mov [dstq+stepq*2], v1w |
mov [dstq+step3q*2], v2w |
lea dstq, [dstq+stepq*8] |
%endif |
add lenq, 16 |
js .loop |
%if mmsize == 8 |
emms |
%endif |
REP_RET |
%endmacro |
INIT_XMM sse2 |
FLOAT_TO_INT16_STEP 2 |
INIT_MMX sse |
FLOAT_TO_INT16_STEP 0 |
INIT_MMX 3dnow |
FLOAT_TO_INT16_STEP 0 |
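The _step variant writes every step-th output element, i.e. one channel of an interleaved buffer (scalar sketch; av_clip_int16() from libavutil/common.h):

#include <math.h>
#include "libavutil/common.h"

static void float_to_int16_step_ref(int16_t *dst, const float *src,
                                    long len, long step)
{
    long i;
    for (i = 0; i < len; i++)
        dst[i * step] = av_clip_int16(lrintf(src[i]));
}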
;------------------------------------------------------------------------------- |
; void ff_float_to_int16_interleave2(int16_t *dst, const float **src, long len); |
;------------------------------------------------------------------------------- |
%macro FLOAT_TO_INT16_INTERLEAVE2 0 |
cglobal float_to_int16_interleave2, 3, 4, 2, dst, src0, src1, len |
lea lenq, [4*r2q] |
mov src1q, [src0q+gprsize] |
mov src0q, [src0q] |
add dstq, lenq |
add src0q, lenq |
add src1q, lenq |
neg lenq |
.loop: |
%if cpuflag(sse2) |
cvtps2dq m0, [src0q+lenq] |
cvtps2dq m1, [src1q+lenq] |
packssdw m0, m1 |
movhlps m1, m0 |
punpcklwd m0, m1 |
mova [dstq+lenq], m0 |
%else |
CVTPS2PI m0, [src0q+lenq ] |
CVTPS2PI m1, [src0q+lenq+8] |
CVTPS2PI m2, [src1q+lenq ] |
CVTPS2PI m3, [src1q+lenq+8] |
packssdw m0, m1 |
packssdw m2, m3 |
mova m1, m0 |
punpcklwd m0, m2 |
punpckhwd m1, m2 |
mova [dstq+lenq ], m0 |
mova [dstq+lenq+8], m1 |
%endif |
add lenq, 16 |
js .loop |
%if mmsize == 8 |
emms |
%endif |
REP_RET |
%endmacro |
INIT_MMX 3dnow |
FLOAT_TO_INT16_INTERLEAVE2 |
INIT_MMX sse |
FLOAT_TO_INT16_INTERLEAVE2 |
INIT_XMM sse2 |
FLOAT_TO_INT16_INTERLEAVE2 |
%macro FLOAT_TO_INT16_INTERLEAVE6 0 |
; void float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len) |
cglobal float_to_int16_interleave6, 2, 8, 0, dst, src, src1, src2, src3, src4, src5, len |
%if ARCH_X86_64 |
mov lend, r2d |
%else |
%define lend dword r2m |
%endif |
mov src1q, [srcq+1*gprsize] |
mov src2q, [srcq+2*gprsize] |
mov src3q, [srcq+3*gprsize] |
mov src4q, [srcq+4*gprsize] |
mov src5q, [srcq+5*gprsize] |
mov srcq, [srcq] |
sub src1q, srcq |
sub src2q, srcq |
sub src3q, srcq |
sub src4q, srcq |
sub src5q, srcq |
.loop: |
CVTPS2PI mm0, [srcq] |
CVTPS2PI mm1, [srcq+src1q] |
CVTPS2PI mm2, [srcq+src2q] |
CVTPS2PI mm3, [srcq+src3q] |
CVTPS2PI mm4, [srcq+src4q] |
CVTPS2PI mm5, [srcq+src5q] |
packssdw mm0, mm3 |
packssdw mm1, mm4 |
packssdw mm2, mm5 |
PSWAPD mm3, mm0 |
punpcklwd mm0, mm1 |
punpckhwd mm1, mm2 |
punpcklwd mm2, mm3 |
PSWAPD mm3, mm0 |
punpckldq mm0, mm2 |
punpckhdq mm2, mm1 |
punpckldq mm1, mm3 |
movq [dstq ], mm0 |
movq [dstq+16], mm2 |
movq [dstq+ 8], mm1 |
add srcq, 8 |
add dstq, 24 |
sub lend, 2 |
jg .loop |
emms |
RET |
%endmacro ; FLOAT_TO_INT16_INTERLEAVE6 |
INIT_MMX sse |
FLOAT_TO_INT16_INTERLEAVE6 |
INIT_MMX 3dnow |
FLOAT_TO_INT16_INTERLEAVE6 |
INIT_MMX 3dnowext |
FLOAT_TO_INT16_INTERLEAVE6 |
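Scalar equivalent: six mono planes in, one packed 6-channel stream out (a sketch, same clipping helper as above):

static void float_to_int16_interleave6_ref(int16_t *dst, const float **src,
                                           int len)
{
    int i, c;
    for (i = 0; i < len; i++)
        for (c = 0; c < 6; c++)
            dst[6 * i + c] = av_clip_int16(lrintf(src[c][i]));
}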
;----------------------------------------------------------------------------- |
; void ff_float_interleave6(float *dst, const float **src, unsigned int len); |
;----------------------------------------------------------------------------- |
%macro FLOAT_INTERLEAVE6 1 |
cglobal float_interleave6, 2, 8, %1, dst, src, src1, src2, src3, src4, src5, len |
%if ARCH_X86_64 |
mov lend, r2d |
%else |
%define lend dword r2m |
%endif |
mov src1q, [srcq+1*gprsize] |
mov src2q, [srcq+2*gprsize] |
mov src3q, [srcq+3*gprsize] |
mov src4q, [srcq+4*gprsize] |
mov src5q, [srcq+5*gprsize] |
mov srcq, [srcq] |
sub src1q, srcq |
sub src2q, srcq |
sub src3q, srcq |
sub src4q, srcq |
sub src5q, srcq |
.loop: |
%if cpuflag(sse) |
movaps m0, [srcq] |
movaps m1, [srcq+src1q] |
movaps m2, [srcq+src2q] |
movaps m3, [srcq+src3q] |
movaps m4, [srcq+src4q] |
movaps m5, [srcq+src5q] |
SBUTTERFLYPS 0, 1, 6 |
SBUTTERFLYPS 2, 3, 6 |
SBUTTERFLYPS 4, 5, 6 |
movaps m6, m4 |
shufps m4, m0, 0xe4 |
movlhps m0, m2 |
movhlps m6, m2 |
movaps [dstq ], m0 |
movaps [dstq+16], m4 |
movaps [dstq+32], m6 |
movaps m6, m5 |
shufps m5, m1, 0xe4 |
movlhps m1, m3 |
movhlps m6, m3 |
movaps [dstq+48], m1 |
movaps [dstq+64], m5 |
movaps [dstq+80], m6 |
%else ; mmx |
movq m0, [srcq] |
movq m1, [srcq+src1q] |
movq m2, [srcq+src2q] |
movq m3, [srcq+src3q] |
movq m4, [srcq+src4q] |
movq m5, [srcq+src5q] |
SBUTTERFLY dq, 0, 1, 6 |
SBUTTERFLY dq, 2, 3, 6 |
SBUTTERFLY dq, 4, 5, 6 |
movq [dstq ], m0 |
movq [dstq+ 8], m2 |
movq [dstq+16], m4 |
movq [dstq+24], m1 |
movq [dstq+32], m3 |
movq [dstq+40], m5 |
%endif |
add srcq, mmsize |
add dstq, mmsize*6 |
sub lend, mmsize/4 |
jg .loop |
%if mmsize == 8 |
emms |
%endif |
REP_RET |
%endmacro |
INIT_MMX mmx |
FLOAT_INTERLEAVE6 0 |
INIT_XMM sse |
FLOAT_INTERLEAVE6 7 |
;----------------------------------------------------------------------------- |
; void ff_float_interleave2(float *dst, const float **src, unsigned int len); |
;----------------------------------------------------------------------------- |
%macro FLOAT_INTERLEAVE2 1 |
cglobal float_interleave2, 3, 4, %1, dst, src, len, src1 |
mov src1q, [srcq+gprsize] |
mov srcq, [srcq ] |
sub src1q, srcq |
.loop: |
mova m0, [srcq ] |
mova m1, [srcq+src1q ] |
mova m3, [srcq +mmsize] |
mova m4, [srcq+src1q+mmsize] |
mova m2, m0 |
PUNPCKLDQ m0, m1 |
PUNPCKHDQ m2, m1 |
mova m1, m3 |
PUNPCKLDQ m3, m4 |
PUNPCKHDQ m1, m4 |
mova [dstq ], m0 |
mova [dstq+1*mmsize], m2 |
mova [dstq+2*mmsize], m3 |
mova [dstq+3*mmsize], m1 |
add srcq, mmsize*2 |
add dstq, mmsize*4 |
sub lend, mmsize/2 |
jg .loop |
%if mmsize == 8 |
emms |
%endif |
REP_RET |
%endmacro |
INIT_MMX mmx |
%define PUNPCKLDQ punpckldq |
%define PUNPCKHDQ punpckhdq |
FLOAT_INTERLEAVE2 0 |
INIT_XMM sse |
%define PUNPCKLDQ unpcklps |
%define PUNPCKHDQ unpckhps |
FLOAT_INTERLEAVE2 5 |
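Scalar equivalent of float_interleave2 (a sketch):

static void float_interleave2_ref(float *dst, const float **src,
                                  unsigned int len)
{
    unsigned int i;
    for (i = 0; i < len; i++) {
        dst[2 * i    ] = src[0][i];
        dst[2 * i + 1] = src[1][i];
    }
}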
/contrib/sdk/sources/ffmpeg/libavcodec/x86/fmtconvert_init.c |
---|
0,0 → 1,147 |
/* |
* Format Conversion Utils |
* Copyright (c) 2000, 2001 Fabrice Bellard |
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
* |
* MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/attributes.h" |
#include "libavutil/cpu.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/fmtconvert.h" |
#if HAVE_YASM |
void ff_int32_to_float_fmul_scalar_sse (float *dst, const int32_t *src, float mul, int len); |
void ff_int32_to_float_fmul_scalar_sse2(float *dst, const int32_t *src, float mul, int len); |
void ff_float_to_int16_3dnow(int16_t *dst, const float *src, long len); |
void ff_float_to_int16_sse (int16_t *dst, const float *src, long len); |
void ff_float_to_int16_sse2 (int16_t *dst, const float *src, long len); |
void ff_float_to_int16_step_3dnow(int16_t *dst, const float *src, long len, long step); |
void ff_float_to_int16_step_sse (int16_t *dst, const float *src, long len, long step); |
void ff_float_to_int16_step_sse2 (int16_t *dst, const float *src, long len, long step); |
void ff_float_to_int16_interleave2_3dnow(int16_t *dst, const float **src, long len); |
void ff_float_to_int16_interleave2_sse (int16_t *dst, const float **src, long len); |
void ff_float_to_int16_interleave2_sse2 (int16_t *dst, const float **src, long len); |
void ff_float_to_int16_interleave6_sse(int16_t *dst, const float **src, int len); |
void ff_float_to_int16_interleave6_3dnow(int16_t *dst, const float **src, int len); |
void ff_float_to_int16_interleave6_3dnowext(int16_t *dst, const float **src, int len); |
#define ff_float_to_int16_interleave6_sse2 ff_float_to_int16_interleave6_sse |
#define FLOAT_TO_INT16_INTERLEAVE(cpu) \ |
/* gcc pessimizes register allocation if this is in the same function as float_to_int16_interleave_sse2 */\ |
static av_noinline void float_to_int16_interleave_misc_##cpu(int16_t *dst, const float **src, long len, int channels){\ |
int c;\ |
for(c=0; c<channels; c++){\ |
ff_float_to_int16_step_##cpu(dst+c, src[c], len, channels);\ |
}\ |
}\ |
\ |
static void float_to_int16_interleave_##cpu(int16_t *dst, const float **src, long len, int channels){\ |
if(channels==1)\ |
ff_float_to_int16_##cpu(dst, src[0], len);\ |
else if(channels==2){\ |
ff_float_to_int16_interleave2_##cpu(dst, src, len);\ |
}else if(channels==6){\ |
ff_float_to_int16_interleave6_##cpu(dst, src, len);\ |
}else\ |
float_to_int16_interleave_misc_##cpu(dst, src, len, channels);\ |
} |
FLOAT_TO_INT16_INTERLEAVE(3dnow) |
FLOAT_TO_INT16_INTERLEAVE(sse) |
FLOAT_TO_INT16_INTERLEAVE(sse2) |
static void float_to_int16_interleave_3dnowext(int16_t *dst, const float **src, |
long len, int channels) |
{ |
if(channels==6) |
ff_float_to_int16_interleave6_3dnowext(dst, src, len); |
else |
float_to_int16_interleave_3dnow(dst, src, len, channels); |
} |
void ff_float_interleave2_mmx(float *dst, const float **src, unsigned int len); |
void ff_float_interleave2_sse(float *dst, const float **src, unsigned int len); |
void ff_float_interleave6_mmx(float *dst, const float **src, unsigned int len); |
void ff_float_interleave6_sse(float *dst, const float **src, unsigned int len); |
static void float_interleave_mmx(float *dst, const float **src, |
unsigned int len, int channels) |
{ |
if (channels == 2) { |
ff_float_interleave2_mmx(dst, src, len); |
} else if (channels == 6) |
ff_float_interleave6_mmx(dst, src, len); |
else |
ff_float_interleave_c(dst, src, len, channels); |
} |
static void float_interleave_sse(float *dst, const float **src, |
unsigned int len, int channels) |
{ |
if (channels == 2) { |
ff_float_interleave2_sse(dst, src, len); |
} else if (channels == 6) |
ff_float_interleave6_sse(dst, src, len); |
else |
ff_float_interleave_c(dst, src, len, channels); |
} |
#endif /* HAVE_YASM */ |
av_cold void ff_fmt_convert_init_x86(FmtConvertContext *c, AVCodecContext *avctx) |
{ |
#if HAVE_YASM |
int cpu_flags = av_get_cpu_flags(); |
if (EXTERNAL_MMX(cpu_flags)) { |
c->float_interleave = float_interleave_mmx; |
} |
if (EXTERNAL_AMD3DNOW(cpu_flags)) { |
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { |
c->float_to_int16 = ff_float_to_int16_3dnow; |
c->float_to_int16_interleave = float_to_int16_interleave_3dnow; |
} |
} |
if (EXTERNAL_AMD3DNOWEXT(cpu_flags)) { |
if (!(avctx->flags & CODEC_FLAG_BITEXACT)) { |
c->float_to_int16_interleave = float_to_int16_interleave_3dnowext; |
} |
} |
if (EXTERNAL_SSE(cpu_flags)) { |
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse; |
c->float_to_int16 = ff_float_to_int16_sse; |
c->float_to_int16_interleave = float_to_int16_interleave_sse; |
c->float_interleave = float_interleave_sse; |
} |
if (EXTERNAL_SSE2(cpu_flags)) { |
c->int32_to_float_fmul_scalar = ff_int32_to_float_fmul_scalar_sse2; |
c->float_to_int16 = ff_float_to_int16_sse2; |
c->float_to_int16_interleave = float_to_int16_interleave_sse2; |
} |
#endif /* HAVE_YASM */ |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/fpel.asm |
---|
0,0 → 1,106 |
;****************************************************************************** |
;* MMX optimized DSP utils |
;* Copyright (c) 2008 Loren Merritt |
;* Copyright (c) 2003-2013 Michael Niedermayer |
;* Copyright (c) 2013 Daniel Kang |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION .text |
INIT_MMX mmxext |
; void pixels(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
%macro PIXELS48 2 |
%if %2 == 4 |
%define OP movh |
%else |
%define OP mova |
%endif |
cglobal %1_pixels%2, 4,5 |
movsxdifnidn r2, r2d |
lea r4, [r2*3] |
.loop: |
OP m0, [r1] |
OP m1, [r1+r2] |
OP m2, [r1+r2*2] |
OP m3, [r1+r4] |
lea r1, [r1+r2*4] |
%ifidn %1, avg |
pavgb m0, [r0] |
pavgb m1, [r0+r2] |
pavgb m2, [r0+r2*2] |
pavgb m3, [r0+r4] |
%endif |
OP [r0], m0 |
OP [r0+r2], m1 |
OP [r0+r2*2], m2 |
OP [r0+r4], m3 |
sub r3d, 4 |
lea r0, [r0+r2*4] |
jne .loop |
RET |
%endmacro |
PIXELS48 put, 4 |
PIXELS48 avg, 4 |
PIXELS48 put, 8 |
PIXELS48 avg, 8 |
INIT_XMM sse2 |
; void put_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
cglobal put_pixels16, 4,5,4 |
lea r4, [r2*3] |
.loop: |
movu m0, [r1] |
movu m1, [r1+r2] |
movu m2, [r1+r2*2] |
movu m3, [r1+r4] |
lea r1, [r1+r2*4] |
mova [r0], m0 |
mova [r0+r2], m1 |
mova [r0+r2*2], m2 |
mova [r0+r4], m3 |
sub r3d, 4 |
lea r0, [r0+r2*4] |
jnz .loop |
REP_RET |
; void avg_pixels16_sse2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
cglobal avg_pixels16, 4,5,4 |
lea r4, [r2*3] |
.loop: |
movu m0, [r1] |
movu m1, [r1+r2] |
movu m2, [r1+r2*2] |
movu m3, [r1+r4] |
lea r1, [r1+r2*4] |
pavgb m0, [r0] |
pavgb m1, [r0+r2] |
pavgb m2, [r0+r2*2] |
pavgb m3, [r0+r4] |
mova [r0], m0 |
mova [r0+r2], m1 |
mova [r0+r2*2], m2 |
mova [r0+r4], m3 |
sub r3d, 4 |
lea r0, [r0+r2*4] |
jnz .loop |
REP_RET |
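Scalar equivalents of the put/avg pair (a sketch; pavgb is an average with round-up):

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void put_pixels16_ref(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    while (h--) {
        memcpy(block, pixels, 16);
        block  += line_size;
        pixels += line_size;
    }
}

static void avg_pixels16_ref(uint8_t *block, const uint8_t *pixels,
                             ptrdiff_t line_size, int h)
{
    while (h--) {
        int i;
        for (i = 0; i < 16; i++)
            block[i] = (block[i] + pixels[i] + 1) >> 1; /* pavgb rounding */
        block  += line_size;
        pixels += line_size;
    }
}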
/contrib/sdk/sources/ffmpeg/libavcodec/x86/fpel_mmx.c |
---|
0,0 → 1,139 |
/* |
* MMX-optimized avg/put pixel routines |
* |
* Copyright (c) 2000, 2001 Fabrice Bellard |
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include <stddef.h> |
#include <stdint.h> |
#include "config.h" |
#include "dsputil_x86.h" |
#if HAVE_MMX_INLINE |
// in case more speed is needed - unrolling would certainly help |
void ff_avg_pixels8_mmx(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h) |
{ |
MOVQ_BFE(mm6); |
JUMPALIGN(); |
do { |
__asm__ volatile( |
"movq %0, %%mm0 \n\t" |
"movq %1, %%mm1 \n\t" |
PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6) |
"movq %%mm2, %0 \n\t" |
:"+m"(*block) |
:"m"(*pixels) |
:"memory"); |
pixels += line_size; |
block += line_size; |
} |
while (--h); |
} |
void ff_avg_pixels16_mmx(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h) |
{ |
MOVQ_BFE(mm6); |
JUMPALIGN(); |
do { |
__asm__ volatile( |
"movq %0, %%mm0 \n\t" |
"movq %1, %%mm1 \n\t" |
PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6) |
"movq %%mm2, %0 \n\t" |
"movq 8%0, %%mm0 \n\t" |
"movq 8%1, %%mm1 \n\t" |
PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6) |
"movq %%mm2, 8%0 \n\t" |
:"+m"(*block) |
:"m"(*pixels) |
:"memory"); |
pixels += line_size; |
block += line_size; |
} |
while (--h); |
} |
void ff_put_pixels8_mmx(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h) |
{ |
__asm__ volatile ( |
"lea (%3, %3), %%"REG_a" \n\t" |
".p2align 3 \n\t" |
"1: \n\t" |
"movq (%1 ), %%mm0 \n\t" |
"movq (%1, %3), %%mm1 \n\t" |
"movq %%mm0, (%2) \n\t" |
"movq %%mm1, (%2, %3) \n\t" |
"add %%"REG_a", %1 \n\t" |
"add %%"REG_a", %2 \n\t" |
"movq (%1 ), %%mm0 \n\t" |
"movq (%1, %3), %%mm1 \n\t" |
"movq %%mm0, (%2) \n\t" |
"movq %%mm1, (%2, %3) \n\t" |
"add %%"REG_a", %1 \n\t" |
"add %%"REG_a", %2 \n\t" |
"subl $4, %0 \n\t" |
"jnz 1b \n\t" |
: "+g"(h), "+r"(pixels), "+r"(block) |
: "r"((x86_reg)line_size) |
: "%"REG_a, "memory" |
); |
} |
void ff_put_pixels16_mmx(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h) |
{ |
__asm__ volatile ( |
"lea (%3, %3), %%"REG_a" \n\t" |
".p2align 3 \n\t" |
"1: \n\t" |
"movq (%1 ), %%mm0 \n\t" |
"movq 8(%1 ), %%mm4 \n\t" |
"movq (%1, %3), %%mm1 \n\t" |
"movq 8(%1, %3), %%mm5 \n\t" |
"movq %%mm0, (%2) \n\t" |
"movq %%mm4, 8(%2) \n\t" |
"movq %%mm1, (%2, %3) \n\t" |
"movq %%mm5, 8(%2, %3) \n\t" |
"add %%"REG_a", %1 \n\t" |
"add %%"REG_a", %2 \n\t" |
"movq (%1 ), %%mm0 \n\t" |
"movq 8(%1 ), %%mm4 \n\t" |
"movq (%1, %3), %%mm1 \n\t" |
"movq 8(%1, %3), %%mm5 \n\t" |
"movq %%mm0, (%2) \n\t" |
"movq %%mm4, 8(%2) \n\t" |
"movq %%mm1, (%2, %3) \n\t" |
"movq %%mm5, 8(%2, %3) \n\t" |
"add %%"REG_a", %1 \n\t" |
"add %%"REG_a", %2 \n\t" |
"subl $4, %0 \n\t" |
"jnz 1b \n\t" |
: "+g"(h), "+r"(pixels), "+r"(block) |
: "r"((x86_reg)line_size) |
: "%"REG_a, "memory" |
); |
} |
#endif /* HAVE_MMX_INLINE */ |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h263_loopfilter.asm |
---|
0,0 → 1,189 |
;****************************************************************************** |
;* MMX-optimized H.263 loop filter |
;* Copyright (c) 2003-2013 Michael Niedermayer |
;* Copyright (c) 2013 Daniel Kang |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
cextern pb_FC |
cextern h263_loop_filter_strength |
SECTION_TEXT |
%macro H263_LOOP_FILTER 5 |
pxor m7, m7 |
mova m0, [%1] |
mova m1, [%1] |
mova m2, [%4] |
mova m3, [%4] |
punpcklbw m0, m7 |
punpckhbw m1, m7 |
punpcklbw m2, m7 |
punpckhbw m3, m7 |
psubw m0, m2 |
psubw m1, m3 |
mova m2, [%2] |
mova m3, [%2] |
mova m4, [%3] |
mova m5, [%3] |
punpcklbw m2, m7 |
punpckhbw m3, m7 |
punpcklbw m4, m7 |
punpckhbw m5, m7 |
psubw m4, m2 |
psubw m5, m3 |
psllw m4, 2 |
psllw m5, 2 |
paddw m4, m0 |
paddw m5, m1 |
pxor m6, m6 |
pcmpgtw m6, m4 |
pcmpgtw m7, m5 |
pxor m4, m6 |
pxor m5, m7 |
psubw m4, m6 |
psubw m5, m7 |
psrlw m4, 3 |
psrlw m5, 3 |
packuswb m4, m5 |
packsswb m6, m7 |
pxor m7, m7 |
movd m2, %5 |
punpcklbw m2, m2 |
punpcklbw m2, m2 |
punpcklbw m2, m2 |
psubusb m2, m4 |
mova m3, m2 |
psubusb m3, m4 |
psubb m2, m3 |
mova m3, [%2] |
mova m4, [%3] |
pxor m3, m6 |
pxor m4, m6 |
paddusb m3, m2 |
psubusb m4, m2 |
pxor m3, m6 |
pxor m4, m6 |
paddusb m2, m2 |
packsswb m0, m1 |
pcmpgtb m7, m0 |
pxor m0, m7 |
psubb m0, m7 |
mova m1, m0 |
psubusb m0, m2 |
psubb m1, m0 |
pand m1, [pb_FC] |
psrlw m1, 2 |
pxor m1, m7 |
psubb m1, m7 |
mova m5, [%1] |
mova m6, [%4] |
psubb m5, m1 |
paddb m6, m1 |
%endmacro |
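H263_LOOP_FILTER computes, per pixel column, the same delta as the portable C filter; its core as a sketch (p0..p3 are the four pixels across the edge, strength = h263_loop_filter_strength[qscale], av_clip_uint8() from libavutil/common.h; the final correction of p0/p3, clipped to |d1|/2, is left out here):

int d = (p0 - p3 + 4 * (p2 - p1)) / 8;
int d1;

if      (d < -2 * strength) d1 = 0;
else if (d < -strength)     d1 = -2 * strength - d; /* ramp down      */
else if (d <  strength)     d1 = d;                 /* pass band      */
else if (d <  2 * strength) d1 = 2 * strength - d;  /* ramp back to 0 */
else                        d1 = 0;

p1 = av_clip_uint8(p1 + d1); /* the two pixels nearest the edge */
p2 = av_clip_uint8(p2 - d1);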
INIT_MMX mmx |
; void h263_v_loop_filter(uint8_t *src, int stride, int qscale) |
cglobal h263_v_loop_filter, 3,5 |
movsxdifnidn r1, r1d |
movsxdifnidn r2, r2d |
lea r4, [h263_loop_filter_strength] |
movzx r3d, BYTE [r4+r2] |
movsx r2, r3b |
shl r2, 1 |
mov r3, r0 |
sub r3, r1 |
mov r4, r3 |
sub r4, r1 |
H263_LOOP_FILTER r4, r3, r0, r0+r1, r2d |
mova [r3], m3 |
mova [r0], m4 |
mova [r4], m5 |
mova [r0+r1], m6 |
RET |
%macro TRANSPOSE4X4 2 |
movd m0, [%1] |
movd m1, [%1+r1] |
movd m2, [%1+r1*2] |
movd m3, [%1+r3] |
punpcklbw m0, m1 |
punpcklbw m2, m3 |
mova m1, m0 |
punpcklwd m0, m2 |
punpckhwd m1, m2 |
movd [%2+ 0], m0 |
punpckhdq m0, m0 |
movd [%2+ 8], m0 |
movd [%2+16], m1 |
punpckhdq m1, m1 |
movd [%2+24], m1 |
%endmacro |
; void h263_h_loop_filter(uint8_t *src, int stride, int qscale) |
INIT_MMX mmx |
cglobal h263_h_loop_filter, 3,5,0,32 |
movsxdifnidn r1, r1d |
movsxdifnidn r2, r2d |
lea r4, [h263_loop_filter_strength] |
movzx r3d, BYTE [r4+r2] |
movsx r2, r3b |
shl r2, 1 |
sub r0, 2 |
lea r3, [r1*3] |
TRANSPOSE4X4 r0, rsp |
lea r4, [r0+r1*4] |
TRANSPOSE4X4 r4, rsp+4 |
H263_LOOP_FILTER rsp, rsp+8, rsp+16, rsp+24, r2d |
mova m1, m5 |
mova m0, m4 |
punpcklbw m5, m3 |
punpcklbw m4, m6 |
punpckhbw m1, m3 |
punpckhbw m0, m6 |
mova m3, m5 |
mova m6, m1 |
punpcklwd m5, m4 |
punpcklwd m1, m0 |
punpckhwd m3, m4 |
punpckhwd m6, m0 |
movd [r0], m5 |
punpckhdq m5, m5 |
movd [r0+r1*1], m5 |
movd [r0+r1*2], m3 |
punpckhdq m3, m3 |
movd [r0+r3], m3 |
movd [r4], m1 |
punpckhdq m1, m1 |
movd [r4+r1*1], m1 |
movd [r4+r1*2], m6 |
punpckhdq m6, m6 |
movd [r4+r3], m6 |
RET |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_chromamc.asm |
---|
0,0 → 1,678 |
;****************************************************************************** |
;* MMX/SSSE3-optimized functions for H264 chroma MC |
;* Copyright (c) 2005 Zoltan Hidvegi <hzoli -a- hzoli -d- com>, |
;* 2005-2008 Loren Merritt |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
rnd_rv40_2d_tbl: times 4 dw 0 |
times 4 dw 16 |
times 4 dw 32 |
times 4 dw 16 |
times 4 dw 32 |
times 4 dw 28 |
times 4 dw 32 |
times 4 dw 28 |
times 4 dw 0 |
times 4 dw 32 |
times 4 dw 16 |
times 4 dw 32 |
times 4 dw 32 |
times 4 dw 28 |
times 4 dw 32 |
times 4 dw 28 |
rnd_rv40_1d_tbl: times 4 dw 0 |
times 4 dw 2 |
times 4 dw 4 |
times 4 dw 2 |
times 4 dw 4 |
times 4 dw 3 |
times 4 dw 4 |
times 4 dw 3 |
times 4 dw 0 |
times 4 dw 4 |
times 4 dw 2 |
times 4 dw 4 |
times 4 dw 4 |
times 4 dw 3 |
times 4 dw 4 |
times 4 dw 3 |
cextern pw_3 |
cextern pw_4 |
cextern pw_8 |
pw_28: times 8 dw 28 |
cextern pw_32 |
cextern pw_64 |
SECTION .text |
%macro mv0_pixels_mc8 0 |
lea r4, [r2*2 ] |
.next4rows: |
movq mm0, [r1 ] |
movq mm1, [r1+r2] |
add r1, r4 |
CHROMAMC_AVG mm0, [r0 ] |
CHROMAMC_AVG mm1, [r0+r2] |
movq [r0 ], mm0 |
movq [r0+r2], mm1 |
add r0, r4 |
movq mm0, [r1 ] |
movq mm1, [r1+r2] |
add r1, r4 |
CHROMAMC_AVG mm0, [r0 ] |
CHROMAMC_AVG mm1, [r0+r2] |
movq [r0 ], mm0 |
movq [r0+r2], mm1 |
add r0, r4 |
sub r3d, 4 |
jne .next4rows |
%endmacro |
%macro chroma_mc8_mmx_func 2-3 |
%ifidn %2, rv40 |
%ifdef PIC |
%define rnd_1d_rv40 r8 |
%define rnd_2d_rv40 r8 |
%define extra_regs 2 |
%else ; no-PIC |
%define rnd_1d_rv40 rnd_rv40_1d_tbl |
%define rnd_2d_rv40 rnd_rv40_2d_tbl |
%define extra_regs 1 |
%endif ; PIC |
%else |
%define extra_regs 0 |
%endif ; rv40 |
; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, |
; int stride, int h, int mx, int my) |
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0 |
%if ARCH_X86_64 |
movsxd r2, r2d |
%endif |
mov r6d, r5d |
or r6d, r4d |
jne .at_least_one_non_zero |
; mx == 0 AND my == 0 - no filter needed |
mv0_pixels_mc8 |
REP_RET |
.at_least_one_non_zero: |
%ifidn %2, rv40 |
%if ARCH_X86_64 |
mov r7, r5 |
and r7, 6 ; &~1 for mx/my=[0,7] |
lea r7, [r7*4+r4] |
sar r7d, 1 |
%define rnd_bias r7 |
%define dest_reg r0 |
%else ; x86-32 |
mov r0, r5 |
and r0, 6 ; &~1 for mx/my=[0,7] |
lea r0, [r0*4+r4] |
sar r0d, 1 |
%define rnd_bias r0 |
%define dest_reg r5 |
%endif |
%else ; vc1, h264 |
%define rnd_bias 0 |
%define dest_reg r0 |
%endif |
test r5d, r5d |
mov r6, 1 |
je .my_is_zero |
test r4d, r4d |
mov r6, r2 ; dxy = x ? 1 : stride |
jne .both_non_zero |
.my_is_zero: |
; mx == 0 XOR my == 0 - 1 dimensional filter only |
or r4d, r5d ; x + y |
%ifidn %2, rv40 |
%ifdef PIC |
lea r8, [rnd_rv40_1d_tbl] |
%endif |
%if ARCH_X86_64 == 0 |
mov r5, r0m |
%endif |
%endif |
movd m5, r4d |
movq m4, [pw_8] |
movq m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3 |
punpcklwd m5, m5 |
punpckldq m5, m5 ; mm5 = B = x |
pxor m7, m7 |
psubw m4, m5 ; mm4 = A = 8-x |
.next1drow: |
movq m0, [r1 ] ; mm0 = src[0..7] |
movq m2, [r1+r6] ; mm2 = src[1..8] |
movq m1, m0 |
movq m3, m2 |
punpcklbw m0, m7 |
punpckhbw m1, m7 |
punpcklbw m2, m7 |
punpckhbw m3, m7 |
pmullw m0, m4 ; [mm0,mm1] = A * src[0..7] |
pmullw m1, m4 |
pmullw m2, m5 ; [mm2,mm3] = B * src[1..8] |
pmullw m3, m5 |
paddw m0, m6 |
paddw m1, m6 |
paddw m0, m2 |
paddw m1, m3 |
psrlw m0, 3 |
psrlw m1, 3 |
packuswb m0, m1 |
CHROMAMC_AVG m0, [dest_reg] |
movq [dest_reg], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 |
add dest_reg, r2 |
add r1, r2 |
dec r3d |
jne .next1drow |
REP_RET |
.both_non_zero: ; general case, bilinear |
movd m4, r4d ; x |
movd m6, r5d ; y |
%ifidn %2, rv40 |
%ifdef PIC |
lea r8, [rnd_rv40_2d_tbl] |
%endif |
%if ARCH_X86_64 == 0 |
mov r5, r0m |
%endif |
%endif |
mov r6, rsp ; backup stack pointer |
and rsp, ~(mmsize-1) ; align stack |
sub rsp, 16 ; AA and DD |
punpcklwd m4, m4 |
punpcklwd m6, m6 |
punpckldq m4, m4 ; mm4 = x words |
punpckldq m6, m6 ; mm6 = y words |
movq m5, m4 |
pmullw m4, m6 ; mm4 = x * y |
psllw m5, 3 |
psllw m6, 3 |
movq m7, m5 |
paddw m7, m6 |
movq [rsp+8], m4 ; DD = x * y |
psubw m5, m4 ; mm5 = B = 8x - xy |
psubw m6, m4 ; mm6 = C = 8y - xy |
paddw m4, [pw_64] |
psubw m4, m7 ; mm4 = A = xy - (8x+8y) + 64 |
pxor m7, m7 |
movq [rsp ], m4 |
movq m0, [r1 ] ; mm0 = src[0..7] |
movq m1, [r1+1] ; mm1 = src[1..8] |
.next2drow: |
add r1, r2 |
movq m2, m0 |
movq m3, m1 |
punpckhbw m0, m7 |
punpcklbw m1, m7 |
punpcklbw m2, m7 |
punpckhbw m3, m7 |
pmullw m0, [rsp] |
pmullw m2, [rsp] |
pmullw m1, m5 |
pmullw m3, m5 |
paddw m2, m1 ; mm2 = A * src[0..3] + B * src[1..4] |
paddw m3, m0 ; mm3 = A * src[4..7] + B * src[5..8] |
movq m0, [r1] |
movq m1, m0 |
punpcklbw m0, m7 |
punpckhbw m1, m7 |
pmullw m0, m6 |
pmullw m1, m6 |
paddw m2, m0 |
paddw m3, m1 ; [mm2,mm3] += C * src[0..7] |
movq m1, [r1+1] |
movq m0, m1 |
movq m4, m1 |
punpcklbw m0, m7 |
punpckhbw m4, m7 |
pmullw m0, [rsp+8] |
pmullw m4, [rsp+8] |
paddw m2, m0 |
paddw m3, m4 ; [mm2,mm3] += D * src[1..8] |
movq m0, [r1] |
paddw m2, [rnd_2d_%2+rnd_bias*8] |
paddw m3, [rnd_2d_%2+rnd_bias*8] |
psrlw m2, 6 |
psrlw m3, 6 |
packuswb m2, m3 |
CHROMAMC_AVG m2, [dest_reg] |
movq [dest_reg], m2 ; dst[0..7] = ([mm2,mm3] + rnd) >> 6 |
add dest_reg, r2 |
dec r3d |
jne .next2drow |
mov rsp, r6 ; restore stack pointer |
RET |
%endmacro |
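The .both_non_zero path is the standard eighth-pel bilinear chroma filter; in scalar C (a sketch: A+B+C+D = 64, rnd is 32 for H.264, 28 for VC-1 and table-driven for RV40):

#include <stdint.h>

/* A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y */
static void put_chroma_mc8_ref(uint8_t *dst, const uint8_t *src,
                               int stride, int h, int x, int y, int rnd)
{
    const int A = (8 - x) * (8 - y), B = x * (8 - y);
    const int C = (8 - x) * y,       D = x * y;
    int i, j;

    for (i = 0; i < h; i++) {
        for (j = 0; j < 8; j++)
            dst[j] = (A * src[j]          + B * src[j + 1] +
                      C * src[j + stride] + D * src[j + stride + 1] +
                      rnd) >> 6;
        dst += stride;
        src += stride;
    }
}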
%macro chroma_mc4_mmx_func 2 |
%define extra_regs 0 |
%ifidn %2, rv40 |
%ifdef PIC |
%define extra_regs 1 |
%endif ; PIC |
%endif ; rv40 |
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0 |
%if ARCH_X86_64 |
movsxd r2, r2d |
%endif |
pxor m7, m7 |
movd m2, r4d ; x |
movd m3, r5d ; y |
movq m4, [pw_8] |
movq m5, [pw_8] |
punpcklwd m2, m2 |
punpcklwd m3, m3 |
punpcklwd m2, m2 |
punpcklwd m3, m3 |
psubw m4, m2 |
psubw m5, m3 |
%ifidn %2, rv40 |
%ifdef PIC |
lea r6, [rnd_rv40_2d_tbl] |
%define rnd_2d_rv40 r6 |
%else |
%define rnd_2d_rv40 rnd_rv40_2d_tbl |
%endif |
and r5, 6 ; &~1 for mx/my=[0,7] |
lea r5, [r5*4+r4] |
sar r5d, 1 |
%define rnd_bias r5 |
%else ; vc1, h264 |
%define rnd_bias 0 |
%endif |
movd m0, [r1 ] |
movd m6, [r1+1] |
add r1, r2 |
punpcklbw m0, m7 |
punpcklbw m6, m7 |
pmullw m0, m4 |
pmullw m6, m2 |
paddw m6, m0 |
.next2rows: |
movd m0, [r1 ] |
movd m1, [r1+1] |
add r1, r2 |
punpcklbw m0, m7 |
punpcklbw m1, m7 |
pmullw m0, m4 |
pmullw m1, m2 |
paddw m1, m0 |
movq m0, m1 |
pmullw m6, m5 |
pmullw m1, m3 |
paddw m6, [rnd_2d_%2+rnd_bias*8] |
paddw m1, m6 |
psrlw m1, 6 |
packuswb m1, m1 |
CHROMAMC_AVG4 m1, m6, [r0] |
movd [r0], m1 |
add r0, r2 |
movd m6, [r1 ] |
movd m1, [r1+1] |
add r1, r2 |
punpcklbw m6, m7 |
punpcklbw m1, m7 |
pmullw m6, m4 |
pmullw m1, m2 |
paddw m1, m6 |
movq m6, m1 |
pmullw m0, m5 |
pmullw m1, m3 |
paddw m0, [rnd_2d_%2+rnd_bias*8] |
paddw m1, m0 |
psrlw m1, 6 |
packuswb m1, m1 |
CHROMAMC_AVG4 m1, m0, [r0] |
movd [r0], m1 |
add r0, r2 |
sub r3d, 2 |
jnz .next2rows |
REP_RET |
%endmacro |
%macro chroma_mc2_mmx_func 2 |
cglobal %1_%2_chroma_mc2, 6, 7, 0 |
%if ARCH_X86_64 |
movsxd r2, r2d |
%endif |
mov r6d, r4d |
shl r4d, 16 |
sub r4d, r6d |
add r4d, 8 |
imul r5d, r4d ; x*y<<16 | y*(8-x) |
shl r4d, 3 |
sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y) |
movd m5, r4d |
movd m6, r5d |
punpckldq m5, m5 ; mm5 = {A,B,A,B} |
punpckldq m6, m6 ; mm6 = {C,D,C,D} |
pxor m7, m7 |
movd m2, [r1] |
punpcklbw m2, m7 |
pshufw m2, m2, 0x94 ; mm2 = src[0,1,1,2] |
.nextrow: |
add r1, r2 |
movq m1, m2 |
pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2] |
movd m0, [r1] |
punpcklbw m0, m7 |
pshufw m0, m0, 0x94 ; mm0 = src[0,1,1,2] |
movq m2, m0 |
pmaddwd m0, m6 |
paddw m1, [rnd_2d_%2] |
paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2] |
psrlw m1, 6 |
packssdw m1, m7 |
packuswb m1, m7 |
CHROMAMC_AVG4 m1, m3, [r0] |
movd r5d, m1 |
mov [r0], r5w |
add r0, r2 |
sub r3d, 1 |
jnz .nextrow |
REP_RET |
%endmacro |
%define rnd_1d_h264 pw_4 |
%define rnd_2d_h264 pw_32 |
%define rnd_1d_vc1 pw_3 |
%define rnd_2d_vc1 pw_28 |
%macro NOTHING 2-3 |
%endmacro |
%macro DIRECT_AVG 2 |
PAVGB %1, %2 |
%endmacro |
%macro COPY_AVG 3 |
movd %2, %3 |
PAVGB %1, %2 |
%endmacro |
INIT_MMX mmx |
%define CHROMAMC_AVG NOTHING |
%define CHROMAMC_AVG4 NOTHING |
chroma_mc8_mmx_func put, h264, _rnd |
chroma_mc8_mmx_func put, vc1, _nornd |
chroma_mc8_mmx_func put, rv40 |
chroma_mc4_mmx_func put, h264 |
chroma_mc4_mmx_func put, rv40 |
INIT_MMX mmxext |
chroma_mc2_mmx_func put, h264 |
%define CHROMAMC_AVG DIRECT_AVG |
%define CHROMAMC_AVG4 COPY_AVG |
chroma_mc8_mmx_func avg, h264, _rnd |
chroma_mc8_mmx_func avg, vc1, _nornd |
chroma_mc8_mmx_func avg, rv40 |
chroma_mc4_mmx_func avg, h264 |
chroma_mc4_mmx_func avg, rv40 |
chroma_mc2_mmx_func avg, h264 |
INIT_MMX 3dnow |
chroma_mc8_mmx_func avg, h264, _rnd |
chroma_mc8_mmx_func avg, vc1, _nornd |
chroma_mc8_mmx_func avg, rv40 |
chroma_mc4_mmx_func avg, h264 |
chroma_mc4_mmx_func avg, rv40 |
%macro chroma_mc8_ssse3_func 2-3 |
cglobal %1_%2_chroma_mc8%3, 6, 7, 8 |
%if ARCH_X86_64 |
movsxd r2, r2d |
%endif |
mov r6d, r5d |
or r6d, r4d |
jne .at_least_one_non_zero |
; mx == 0 AND my == 0 - no filter needed |
mv0_pixels_mc8 |
REP_RET |
.at_least_one_non_zero: |
test r5d, r5d |
je .my_is_zero |
test r4d, r4d |
je .mx_is_zero |
; general case, bilinear |
mov r6d, r4d |
shl r4d, 8 |
sub r4, r6 |
mov r6, 8 |
add r4, 8 ; x*255+8 = x<<8 | (8-x) |
sub r6d, r5d |
imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) |
imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x) |
movd m7, r6d |
movd m6, r4d |
movdqa m5, [rnd_2d_%2] |
movq m0, [r1 ] |
movq m1, [r1+1] |
pshuflw m7, m7, 0 |
pshuflw m6, m6, 0 |
punpcklbw m0, m1 |
movlhps m7, m7 |
movlhps m6, m6 |
.next2rows: |
movq m1, [r1+r2*1 ] |
movq m2, [r1+r2*1+1] |
movq m3, [r1+r2*2 ] |
movq m4, [r1+r2*2+1] |
lea r1, [r1+r2*2] |
punpcklbw m1, m2 |
movdqa m2, m1 |
punpcklbw m3, m4 |
movdqa m4, m3 |
pmaddubsw m0, m7 |
pmaddubsw m1, m6 |
pmaddubsw m2, m7 |
pmaddubsw m3, m6 |
paddw m0, m5 |
paddw m2, m5 |
paddw m1, m0 |
paddw m3, m2 |
psrlw m1, 6 |
movdqa m0, m4 |
psrlw m3, 6 |
%ifidn %1, avg |
movq m2, [r0 ] |
movhps m2, [r0+r2] |
%endif |
packuswb m1, m3 |
CHROMAMC_AVG m1, m2 |
movq [r0 ], m1 |
movhps [r0+r2], m1 |
sub r3d, 2 |
lea r0, [r0+r2*2] |
jg .next2rows |
REP_RET |
.my_is_zero: |
mov r5d, r4d |
shl r4d, 8 |
add r4, 8 |
sub r4, r5 ; 255*x+8 = x<<8 | (8-x) |
movd m7, r4d |
movdqa m6, [rnd_1d_%2] |
pshuflw m7, m7, 0 |
movlhps m7, m7 |
.next2xrows: |
movq m0, [r1 ] |
movq m1, [r1 +1] |
movq m2, [r1+r2 ] |
movq m3, [r1+r2+1] |
punpcklbw m0, m1 |
punpcklbw m2, m3 |
pmaddubsw m0, m7 |
pmaddubsw m2, m7 |
%ifidn %1, avg |
movq m4, [r0 ] |
movhps m4, [r0+r2] |
%endif |
paddw m0, m6 |
paddw m2, m6 |
psrlw m0, 3 |
psrlw m2, 3 |
packuswb m0, m2 |
CHROMAMC_AVG m0, m4 |
movq [r0 ], m0 |
movhps [r0+r2], m0 |
sub r3d, 2 |
lea r0, [r0+r2*2] |
lea r1, [r1+r2*2] |
jg .next2xrows |
REP_RET |
.mx_is_zero: |
mov r4d, r5d |
shl r5d, 8 |
add r5, 8 |
sub r5, r4 ; 255*y+8 = y<<8 | (8-y) |
movd m7, r5d |
movdqa m6, [rnd_1d_%2] |
pshuflw m7, m7, 0 |
movlhps m7, m7 |
.next2yrows: |
movq m0, [r1 ] |
movq m1, [r1+r2 ] |
movdqa m2, m1 |
movq m3, [r1+r2*2] |
lea r1, [r1+r2*2] |
punpcklbw m0, m1 |
punpcklbw m2, m3 |
pmaddubsw m0, m7 |
pmaddubsw m2, m7 |
%ifidn %1, avg |
movq m4, [r0 ] |
movhps m4, [r0+r2] |
%endif |
paddw m0, m6 |
paddw m2, m6 |
psrlw m0, 3 |
psrlw m2, 3 |
packuswb m0, m2 |
CHROMAMC_AVG m0, m4 |
movq [r0 ], m0 |
movhps [r0+r2], m0 |
sub r3d, 2 |
lea r0, [r0+r2*2] |
jg .next2yrows |
REP_RET |
%endmacro |
%macro chroma_mc4_ssse3_func 2 |
cglobal %1_%2_chroma_mc4, 6, 7, 0 |
%if ARCH_X86_64 |
movsxd r2, r2d |
%endif |
mov r6, r4 |
shl r4d, 8 |
sub r4d, r6d |
mov r6, 8 |
add r4d, 8 ; x*255+8 |
sub r6d, r5d |
imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) |
imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x) |
movd m7, r6d |
movd m6, r4d |
movq m5, [pw_32] |
movd m0, [r1 ] |
pshufw m7, m7, 0 |
punpcklbw m0, [r1+1] |
pshufw m6, m6, 0 |
.next2rows: |
movd m1, [r1+r2*1 ] |
movd m3, [r1+r2*2 ] |
punpcklbw m1, [r1+r2*1+1] |
punpcklbw m3, [r1+r2*2+1] |
lea r1, [r1+r2*2] |
movq m2, m1 |
movq m4, m3 |
pmaddubsw m0, m7 |
pmaddubsw m1, m6 |
pmaddubsw m2, m7 |
pmaddubsw m3, m6 |
paddw m0, m5 |
paddw m2, m5 |
paddw m1, m0 |
paddw m3, m2 |
psrlw m1, 6 |
movq m0, m4 |
psrlw m3, 6 |
packuswb m1, m1 |
packuswb m3, m3 |
CHROMAMC_AVG m1, [r0 ] |
CHROMAMC_AVG m3, [r0+r2] |
movd [r0 ], m1 |
movd [r0+r2], m3 |
sub r3d, 2 |
lea r0, [r0+r2*2] |
jg .next2rows |
REP_RET |
%endmacro |
%define CHROMAMC_AVG NOTHING |
INIT_XMM ssse3 |
chroma_mc8_ssse3_func put, h264, _rnd |
chroma_mc8_ssse3_func put, vc1, _nornd |
INIT_MMX ssse3 |
chroma_mc4_ssse3_func put, h264 |
%define CHROMAMC_AVG DIRECT_AVG |
INIT_XMM ssse3 |
chroma_mc8_ssse3_func avg, h264, _rnd |
chroma_mc8_ssse3_func avg, vc1, _nornd |
INIT_MMX ssse3 |
chroma_mc4_ssse3_func avg, h264 |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_chromamc_10bit.asm |
---|
0,0 → 1,271 |
;***************************************************************************** |
;* MMX/SSE2/AVX-optimized 10-bit H.264 chroma MC code |
;***************************************************************************** |
;* Copyright (C) 2005-2011 x264 project |
;* |
;* Authors: Daniel Kang <daniel.d.kang@gmail.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
cextern pw_4 |
cextern pw_8 |
cextern pw_32 |
cextern pw_64 |
SECTION .text |
%macro MV0_PIXELS_MC8 0 |
lea r4, [r2*3 ] |
lea r5, [r2*4 ] |
.next4rows: |
movu m0, [r1 ] |
movu m1, [r1+r2 ] |
CHROMAMC_AVG m0, [r0 ] |
CHROMAMC_AVG m1, [r0+r2 ] |
mova [r0 ], m0 |
mova [r0+r2 ], m1 |
movu m0, [r1+r2*2] |
movu m1, [r1+r4 ] |
CHROMAMC_AVG m0, [r0+r2*2] |
CHROMAMC_AVG m1, [r0+r4 ] |
mova [r0+r2*2], m0 |
mova [r0+r4 ], m1 |
add r1, r5 |
add r0, r5 |
sub r3d, 4 |
jne .next4rows |
%endmacro |
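; With mx == my == 0 no filtering is needed, so MV0_PIXELS_MC8 is just a |
; four-rows-per-iteration copy (or a pavg merge for the avg functions). |
; Roughly, in scalar form (illustrative sketch; pixel is uint16_t here): |
; |
;   for (int i = 0; i < h; i++) |
;       memcpy(dst + i * stride, src + i * stride, 8 * sizeof(pixel)); |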
;----------------------------------------------------------------------------- |
; void put/avg_h264_chroma_mc8(pixel *dst, pixel *src, int stride, int h, int mx, int my) |
;----------------------------------------------------------------------------- |
%macro CHROMA_MC8 1 |
; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, |
; int stride, int h, int mx, int my) |
cglobal %1_h264_chroma_mc8_10, 6,7,8 |
movsxdifnidn r2, r2d |
mov r6d, r5d |
or r6d, r4d |
jne .at_least_one_non_zero |
; mx == 0 AND my == 0 - no filter needed |
MV0_PIXELS_MC8 |
REP_RET |
.at_least_one_non_zero: |
mov r6d, 2 |
test r5d, r5d |
je .x_interpolation |
mov r6, r2 ; dxy = x ? 1 : stride |
test r4d, r4d |
jne .xy_interpolation |
.x_interpolation: |
; mx == 0 XOR my == 0 - 1 dimensional filter only |
or r4d, r5d ; x + y |
movd m5, r4d |
mova m4, [pw_8] |
mova m6, [pw_4] ; mm6 = rnd >> 3 |
SPLATW m5, m5 ; mm5 = B = x |
psubw m4, m5 ; mm4 = A = 8-x |
.next1drow: |
movu m0, [r1 ] ; mm0 = src[0..7] |
movu m2, [r1+r6] ; mm2 = src[1..8] |
pmullw m0, m4 ; mm0 = A * src[0..7] |
pmullw m2, m5 ; mm2 = B * src[1..8] |
paddw m0, m6 |
paddw m0, m2 |
psrlw m0, 3 |
CHROMAMC_AVG m0, [r0] |
mova [r0], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 |
add r0, r2 |
add r1, r2 |
dec r3d |
jne .next1drow |
REP_RET |
.xy_interpolation: ; general case, bilinear |
movd m4, r4m ; x |
movd m6, r5m ; y |
SPLATW m4, m4 ; mm4 = x words |
SPLATW m6, m6 ; mm6 = y words |
psllw m5, m4, 3 ; mm5 = 8x |
pmullw m4, m6 ; mm4 = x * y |
psllw m6, 3 ; mm6 = 8y |
paddw m1, m5, m6 ; mm1 = 8x+8y |
mova m7, m4 ; DD = x * y |
psubw m5, m4 ; mm5 = B = 8x - xy |
psubw m6, m4 ; mm6 = C = 8y - xy |
paddw m4, [pw_64] |
psubw m4, m1 ; mm4 = A = xy - (8x+8y) + 64 |
movu m0, [r1 ] ; mm0 = src[0..7] |
movu m1, [r1+2] ; mm1 = src[1..8] |
.next2drow: |
add r1, r2 |
pmullw m2, m0, m4 |
pmullw m1, m5 |
paddw m2, m1 ; mm2 = A * src[0..7] + B * src[1..8] |
movu m0, [r1] |
movu m1, [r1+2] |
pmullw m3, m0, m6 |
paddw m2, m3 ; mm2 += C * src[0..7+stride] |
pmullw m3, m1, m7 |
paddw m2, m3 ; mm2 += D * src[1..8+stride] |
paddw m2, [pw_32] |
psrlw m2, 6 |
CHROMAMC_AVG m2, [r0] |
mova [r0], m2 ; dst[0..7] = (mm2 + 32) >> 6 |
add r0, r2 |
dec r3d |
jne .next2drow |
REP_RET |
%endmacro |
;----------------------------------------------------------------------------- |
; void put/avg_h264_chroma_mc4(pixel *dst, pixel *src, int stride, int h, int mx, int my) |
;----------------------------------------------------------------------------- |
;TODO: xmm mc4 |
%macro MC4_OP 2 |
movq %1, [r1 ] |
movq m1, [r1+2] |
add r1, r2 |
pmullw %1, m4 |
pmullw m1, m2 |
paddw m1, %1 |
mova %1, m1 |
pmullw %2, m5 |
pmullw m1, m3 |
paddw %2, [pw_32] |
paddw m1, %2 |
psrlw m1, 6 |
CHROMAMC_AVG m1, %2, [r0] |
movq [r0], m1 |
add r0, r2 |
%endmacro |
%macro CHROMA_MC4 1 |
cglobal %1_h264_chroma_mc4_10, 6,6,7 |
movsxdifnidn r2, r2d |
movd m2, r4m ; x |
movd m3, r5m ; y |
mova m4, [pw_8] |
mova m5, m4 |
SPLATW m2, m2 |
SPLATW m3, m3 |
psubw m4, m2 |
psubw m5, m3 |
movq m0, [r1 ] |
movq m6, [r1+2] |
add r1, r2 |
pmullw m0, m4 |
pmullw m6, m2 |
paddw m6, m0 |
.next2rows: |
MC4_OP m0, m6 |
MC4_OP m6, m0 |
sub r3d, 2 |
jnz .next2rows |
REP_RET |
%endmacro |
;----------------------------------------------------------------------------- |
; void put/avg_h264_chroma_mc2(pixel *dst, pixel *src, int stride, int h, int mx, int my) |
;----------------------------------------------------------------------------- |
%macro CHROMA_MC2 1 |
cglobal %1_h264_chroma_mc2_10, 6,7 |
movsxdifnidn r2, r2d |
mov r6d, r4d |
shl r4d, 16 |
sub r4d, r6d |
add r4d, 8 |
imul r5d, r4d ; x*y<<16 | y*(8-x) |
shl r4d, 3 |
sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y) |
movd m5, r4d |
movd m6, r5d |
punpckldq m5, m5 ; mm5 = {A,B,A,B} |
punpckldq m6, m6 ; mm6 = {C,D,C,D} |
pxor m7, m7 |
pshufw m2, [r1], 0x94 ; mm2 = src[0,1,1,2] |
.nextrow: |
add r1, r2 |
movq m1, m2 |
pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2] |
pshufw m0, [r1], 0x94 ; mm0 = src[0,1,1,2] |
movq m2, m0 |
pmaddwd m0, m6 |
paddw m1, [pw_32] |
paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2] |
psrlw m1, 6 |
packssdw m1, m7 |
CHROMAMC_AVG m1, m3, [r0] |
movd [r0], m1 |
add r0, r2 |
dec r3d |
jnz .nextrow |
REP_RET |
%endmacro |
%macro NOTHING 2-3 |
%endmacro |
%macro AVG 2-3 |
%if %0==3 |
movq %2, %3 |
%endif |
pavgw %1, %2 |
%endmacro |
%define CHROMAMC_AVG NOTHING |
INIT_XMM sse2 |
CHROMA_MC8 put |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
CHROMA_MC8 put |
%endif |
INIT_MMX mmxext |
CHROMA_MC4 put |
CHROMA_MC2 put |
%define CHROMAMC_AVG AVG |
INIT_XMM sse2 |
CHROMA_MC8 avg |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
CHROMA_MC8 avg |
%endif |
INIT_MMX mmxext |
CHROMA_MC4 avg |
CHROMA_MC2 avg |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_deblock.asm |
---|
0,0 → 1,1078 |
;***************************************************************************** |
;* MMX/SSE2/AVX-optimized H.264 deblocking code |
;***************************************************************************** |
;* Copyright (C) 2005-2011 x264 project |
;* |
;* Authors: Loren Merritt <lorenm@u.washington.edu> |
;* Jason Garrett-Glaser <darkshikari@gmail.com> |
;* Oskar Arvidsson <oskar@irock.se> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
pb_A1: times 16 db 0xA1 |
pb_3_1: times 4 db 3, 1 |
SECTION .text |
cextern pb_0 |
cextern pb_1 |
cextern pb_3 |
; expands to [base],...,[base+7*stride] |
%define PASS8ROWS(base, base3, stride, stride3) \ |
[base], [base+stride], [base+stride*2], [base3], \ |
[base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4] |
%define PASS8ROWS(base, base3, stride, stride3, offset) \ |
PASS8ROWS(base+offset, base3+offset, stride, stride3) |
; in: 8 rows of 4 bytes in %4..%11 |
; out: 4 rows of 8 bytes in m0..m3 |
%macro TRANSPOSE4x8_LOAD 11 |
movh m0, %4 |
movh m2, %5 |
movh m1, %6 |
movh m3, %7 |
punpckl%1 m0, m2 |
punpckl%1 m1, m3 |
mova m2, m0 |
punpckl%2 m0, m1 |
punpckh%2 m2, m1 |
movh m4, %8 |
movh m6, %9 |
movh m5, %10 |
movh m7, %11 |
punpckl%1 m4, m6 |
punpckl%1 m5, m7 |
mova m6, m4 |
punpckl%2 m4, m5 |
punpckh%2 m6, m5 |
punpckh%3 m1, m0, m4 |
punpckh%3 m3, m2, m6 |
punpckl%3 m0, m4 |
punpckl%3 m2, m6 |
%endmacro |
; in: 4 rows of 8 bytes in m0..m3 |
; out: 8 rows of 4 bytes in %1..%8 |
%macro TRANSPOSE8x4B_STORE 8 |
punpckhdq m4, m0, m0 |
punpckhdq m5, m1, m1 |
punpckhdq m6, m2, m2 |
punpcklbw m0, m1 |
punpcklbw m2, m3 |
punpcklwd m1, m0, m2 |
punpckhwd m0, m2 |
movh %1, m1 |
punpckhdq m1, m1 |
movh %2, m1 |
movh %3, m0 |
punpckhdq m0, m0 |
movh %4, m0 |
punpckhdq m3, m3 |
punpcklbw m4, m5 |
punpcklbw m6, m3 |
punpcklwd m5, m4, m6 |
punpckhwd m4, m6 |
movh %5, m5 |
punpckhdq m5, m5 |
movh %6, m5 |
movh %7, m4 |
punpckhdq m4, m4 |
movh %8, m4 |
%endmacro |
%macro TRANSPOSE4x8B_LOAD 8 |
TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8 |
%endmacro |
%macro SBUTTERFLY3 4 |
punpckh%1 %4, %2, %3 |
punpckl%1 %2, %3 |
%endmacro |
; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8 |
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16] |
%macro TRANSPOSE6x8_MEM 9 |
RESET_MM_PERMUTATION |
movq m0, %1 |
movq m1, %2 |
movq m2, %3 |
movq m3, %4 |
movq m4, %5 |
movq m5, %6 |
movq m6, %7 |
SBUTTERFLY bw, 0, 1, 7 |
SBUTTERFLY bw, 2, 3, 7 |
SBUTTERFLY bw, 4, 5, 7 |
movq [%9+0x10], m3 |
SBUTTERFLY3 bw, m6, %8, m7 |
SBUTTERFLY wd, 0, 2, 3 |
SBUTTERFLY wd, 4, 6, 3 |
punpckhdq m0, m4 |
movq [%9+0x00], m0 |
SBUTTERFLY3 wd, m1, [%9+0x10], m3 |
SBUTTERFLY wd, 5, 7, 0 |
SBUTTERFLY dq, 1, 5, 0 |
SBUTTERFLY dq, 2, 6, 0 |
punpckldq m3, m7 |
movq [%9+0x10], m2 |
movq [%9+0x20], m6 |
movq [%9+0x30], m1 |
movq [%9+0x40], m5 |
movq [%9+0x50], m3 |
RESET_MM_PERMUTATION |
%endmacro |
; in: 8 rows of 8 in %1..%8 |
; out: 8 rows of 8 in %9..%16 |
%macro TRANSPOSE8x8_MEM 16 |
RESET_MM_PERMUTATION |
movq m0, %1 |
movq m1, %2 |
movq m2, %3 |
movq m3, %4 |
movq m4, %5 |
movq m5, %6 |
movq m6, %7 |
SBUTTERFLY bw, 0, 1, 7 |
SBUTTERFLY bw, 2, 3, 7 |
SBUTTERFLY bw, 4, 5, 7 |
SBUTTERFLY3 bw, m6, %8, m7 |
movq %9, m5 |
SBUTTERFLY wd, 0, 2, 5 |
SBUTTERFLY wd, 4, 6, 5 |
SBUTTERFLY wd, 1, 3, 5 |
movq %11, m6 |
movq m6, %9 |
SBUTTERFLY wd, 6, 7, 5 |
SBUTTERFLY dq, 0, 4, 5 |
SBUTTERFLY dq, 1, 6, 5 |
movq %9, m0 |
movq %10, m4 |
movq %13, m1 |
movq %14, m6 |
SBUTTERFLY3 dq, m2, %11, m0 |
SBUTTERFLY dq, 3, 7, 4 |
movq %11, m2 |
movq %12, m0 |
movq %15, m3 |
movq %16, m7 |
RESET_MM_PERMUTATION |
%endmacro |
; out: %4 = |%1-%2|>%3 |
; clobbers: %5 |
%macro DIFF_GT 5 |
%if avx_enabled == 0 |
mova %5, %2 |
mova %4, %1 |
psubusb %5, %1 |
psubusb %4, %2 |
%else |
psubusb %5, %2, %1 |
psubusb %4, %1, %2 |
%endif |
por %4, %5 |
psubusb %4, %3 |
%endmacro |
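; DIFF_GT uses the unsigned-saturating-subtract idiom: one of (a-b) and |
; (b-a) saturates to zero, so their por is |a-b|, and a further psubusb |
; leaves a nonzero byte exactly where |a-b| > thresh. Per byte, roughly |
; (illustrative sketch; sat_sub_u8 is an assumed helper for max(a-b, 0)): |
; |
;   uint8_t d  = sat_sub_u8(a, b) | sat_sub_u8(b, a); /* |a - b|        */ |
;   uint8_t gt = sat_sub_u8(d, thresh);               /* != 0 iff > thr */ |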
; out: %4 = |%1-%2|>%3 |
; clobbers: %5 |
%macro DIFF_GT2 5 |
%if ARCH_X86_64 |
psubusb %5, %2, %1 |
psubusb %4, %1, %2 |
%else |
mova %5, %2 |
mova %4, %1 |
psubusb %5, %1 |
psubusb %4, %2 |
%endif |
psubusb %5, %3 |
psubusb %4, %3 |
pcmpeqb %4, %5 |
%endmacro |
; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1 |
; out: m5=beta-1, m7=mask, %3=alpha-1 |
; clobbers: m4,m6 |
%macro LOAD_MASK 2-3 |
movd m4, %1 |
movd m5, %2 |
SPLATW m4, m4 |
SPLATW m5, m5 |
packuswb m4, m4 ; 16x alpha-1 |
packuswb m5, m5 ; 16x beta-1 |
%if %0>2 |
mova %3, m4 |
%endif |
DIFF_GT m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1 |
DIFF_GT m0, m1, m5, m4, m6 ; |p1-p0| > beta-1 |
por m7, m4 |
DIFF_GT m3, m2, m5, m4, m6 ; |q1-q0| > beta-1 |
por m7, m4 |
pxor m6, m6 |
pcmpeqb m7, m6 |
%endmacro |
; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask) |
; out: m1=p0' m2=q0' |
; clobbers: m0,3-6 |
%macro DEBLOCK_P0_Q0 0 |
pcmpeqb m4, m4 |
pxor m5, m1, m2 ; p0^q0 |
pxor m3, m4 |
pand m5, [pb_1] ; (p0^q0)&1 |
pavgb m3, m0 ; (p1 - q1 + 256)>>1 |
pxor m4, m1 |
pavgb m3, [pb_3] ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2 |
pavgb m4, m2 ; (q0 - p0 + 256)>>1 |
pavgb m3, m5 |
mova m6, [pb_A1] |
paddusb m3, m4 ; d+128+33 |
psubusb m6, m3 |
psubusb m3, [pb_A1] |
pminub m6, m7 |
pminub m3, m7 |
psubusb m1, m6 |
psubusb m2, m3 |
paddusb m1, m3 |
paddusb m2, m6 |
%endmacro |
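; DEBLOCK_P0_Q0 is the standard H.264 inter p0/q0 update, kept entirely in |
; unsigned bytes via pavgb plus the 0xA1 (= 128 + 33) bias from pb_A1. In |
; scalar form it amounts to (illustrative sketch, av_clip-style helpers |
; assumed): |
; |
;   int delta = av_clip((((q0 - p0) * 4) + (p1 - q1) + 4) >> 3, -tc, tc); |
;   p0 = av_clip_uint8(p0 + delta); |
;   q0 = av_clip_uint8(q0 - delta); |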
; in: m1=p0 m2=q0 |
; %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp |
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 ) |
; clobbers: q2, tmp, tc0 |
%macro LUMA_Q1 6 |
pavgb %6, m1, m2 |
pavgb %2, %6 ; avg(p2,avg(p0,q0)) |
pxor %6, %3 |
pand %6, [pb_1] ; (p2^avg(p0,q0))&1 |
psubusb %2, %6 ; (p2+((p0+q0+1)>>1))>>1 |
psubusb %6, %1, %5 |
paddusb %5, %1 |
pmaxub %2, %6 |
pminub %2, %5 |
mova %4, %2 |
%endmacro |
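; In scalar form LUMA_Q1 matches its header comment (illustrative sketch, |
; av_clip assumed): |
; |
;   int avg = (p0 + q0 + 1) >> 1; |
;   q1 = av_clip((q2 + avg) >> 1, q1 - tc0, q1 + tc0); |
; |
; The pxor/pand [pb_1] step cancels the rounding error a second pavgb |
; would otherwise add. |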
%if ARCH_X86_64 |
;----------------------------------------------------------------------------- |
; void deblock_v_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |
;----------------------------------------------------------------------------- |
%macro DEBLOCK_LUMA 0 |
cglobal deblock_v_luma_8, 5,5,10 |
movd m8, [r4] ; tc0 |
lea r4, [r1*3] |
dec r2d ; alpha-1 |
neg r4 |
dec r3d ; beta-1 |
add r4, r0 ; pix-3*stride |
mova m0, [r4+r1] ; p1 |
mova m1, [r4+2*r1] ; p0 |
mova m2, [r0] ; q0 |
mova m3, [r0+r1] ; q1 |
LOAD_MASK r2d, r3d |
punpcklbw m8, m8 |
punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] |
pcmpeqb m9, m9 |
pcmpeqb m9, m8 |
pandn m9, m7 |
pand m8, m9 |
movdqa m3, [r4] ; p2 |
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 |
pand m6, m9 |
psubb m7, m8, m6 |
pand m6, m8 |
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 |
movdqa m4, [r0+2*r1] ; q2 |
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 |
pand m6, m9 |
pand m8, m6 |
psubb m7, m6 |
mova m3, [r0+r1] |
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6 |
DEBLOCK_P0_Q0 |
mova [r4+2*r1], m1 |
mova [r0], m2 |
RET |
;----------------------------------------------------------------------------- |
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |
;----------------------------------------------------------------------------- |
INIT_MMX cpuname |
cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64 |
movsxd r7, r1d |
lea r8, [r7+r7*2] |
lea r6, [r0-4] |
lea r5, [r0-4+r8] |
%if WIN64 |
%define pix_tmp rsp+0x30 ; shadow space + r4 |
%else |
%define pix_tmp rsp |
%endif |
; transpose 6x16 -> tmp space |
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp |
lea r6, [r6+r7*8] |
lea r5, [r5+r7*8] |
TRANSPOSE6x8_MEM PASS8ROWS(r6, r5, r7, r8), pix_tmp+8 |
; vertical filter |
; alpha, beta, tc0 are still in r2d, r3d, r4 |
; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them |
lea r0, [pix_tmp+0x30] |
mov r1d, 0x10 |
%if WIN64 |
mov [rsp+0x20], r4 |
%endif |
call deblock_v_luma_8 |
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) |
add r6, 2 |
add r5, 2 |
movq m0, [pix_tmp+0x18] |
movq m1, [pix_tmp+0x28] |
movq m2, [pix_tmp+0x38] |
movq m3, [pix_tmp+0x48] |
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8) |
shl r7, 3 |
sub r6, r7 |
sub r5, r7 |
shr r7, 3 |
movq m0, [pix_tmp+0x10] |
movq m1, [pix_tmp+0x20] |
movq m2, [pix_tmp+0x30] |
movq m3, [pix_tmp+0x40] |
TRANSPOSE8x4B_STORE PASS8ROWS(r6, r5, r7, r8) |
RET |
%endmacro |
INIT_XMM sse2 |
DEBLOCK_LUMA |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
DEBLOCK_LUMA |
%endif |
%else |
%macro DEBLOCK_LUMA 2 |
;----------------------------------------------------------------------------- |
; void deblock_v8_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |
;----------------------------------------------------------------------------- |
cglobal deblock_%1_luma_8, 5,5,8,2*%2 |
lea r4, [r1*3] |
dec r2 ; alpha-1 |
neg r4 |
dec r3 ; beta-1 |
add r4, r0 ; pix-3*stride |
mova m0, [r4+r1] ; p1 |
mova m1, [r4+2*r1] ; p0 |
mova m2, [r0] ; q0 |
mova m3, [r0+r1] ; q1 |
LOAD_MASK r2, r3 |
mov r3, r4mp |
pcmpeqb m3, m3 |
movd m4, [r3] ; tc0 |
punpcklbw m4, m4 |
punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0] |
mova [esp+%2], m4 ; tc |
pcmpgtb m4, m3 |
mova m3, [r4] ; p2 |
pand m4, m7 |
mova [esp], m4 ; mask |
DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1 |
pand m6, m4 |
pand m4, [esp+%2] ; tc |
psubb m7, m4, m6 |
pand m6, m4 |
LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4 |
mova m4, [r0+2*r1] ; q2 |
DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1 |
pand m6, [esp] ; mask |
mova m5, [esp+%2] ; tc |
psubb m7, m6 |
pand m5, m6 |
mova m3, [r0+r1] |
LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6 |
DEBLOCK_P0_Q0 |
mova [r4+2*r1], m1 |
mova [r0], m2 |
RET |
;----------------------------------------------------------------------------- |
; void deblock_h_luma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |
;----------------------------------------------------------------------------- |
INIT_MMX cpuname |
cglobal deblock_h_luma_8, 0,5,8,0x60+HAVE_ALIGNED_STACK*12 |
mov r0, r0mp |
mov r3, r1m |
lea r4, [r3*3] |
sub r0, 4 |
lea r1, [r0+r4] |
%define pix_tmp esp+12*HAVE_ALIGNED_STACK |
; transpose 6x16 -> tmp space |
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp |
lea r0, [r0+r3*8] |
lea r1, [r1+r3*8] |
TRANSPOSE6x8_MEM PASS8ROWS(r0, r1, r3, r4), pix_tmp+8 |
; vertical filter |
lea r0, [pix_tmp+0x30] |
PUSH dword r4m |
PUSH dword r3m |
PUSH dword r2m |
PUSH dword 16 |
PUSH dword r0 |
call deblock_%1_luma_8 |
%ifidn %1, v8 |
add dword [esp ], 8 ; pix_tmp+0x38 |
add dword [esp+16], 2 ; tc0+2 |
call deblock_%1_luma_8 |
%endif |
ADD esp, 20 |
; transpose 16x4 -> original space (only the middle 4 rows were changed by the filter) |
mov r0, r0mp |
sub r0, 2 |
movq m0, [pix_tmp+0x10] |
movq m1, [pix_tmp+0x20] |
lea r1, [r0+r4] |
movq m2, [pix_tmp+0x30] |
movq m3, [pix_tmp+0x40] |
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) |
lea r0, [r0+r3*8] |
lea r1, [r1+r3*8] |
movq m0, [pix_tmp+0x18] |
movq m1, [pix_tmp+0x28] |
movq m2, [pix_tmp+0x38] |
movq m3, [pix_tmp+0x48] |
TRANSPOSE8x4B_STORE PASS8ROWS(r0, r1, r3, r4) |
RET |
%endmacro ; DEBLOCK_LUMA |
INIT_MMX mmxext |
DEBLOCK_LUMA v8, 8 |
INIT_XMM sse2 |
DEBLOCK_LUMA v, 16 |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
DEBLOCK_LUMA v, 16 |
%endif |
%endif ; ARCH |
%macro LUMA_INTRA_P012 4 ; p0..p3 in memory |
%if ARCH_X86_64 |
pavgb t0, p2, p1 |
pavgb t1, p0, q0 |
%else |
mova t0, p2 |
mova t1, p0 |
pavgb t0, p1 |
pavgb t1, q0 |
%endif |
pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2 |
mova t5, t1 |
%if ARCH_X86_64 |
paddb t2, p2, p1 |
paddb t3, p0, q0 |
%else |
mova t2, p2 |
mova t3, p0 |
paddb t2, p1 |
paddb t3, q0 |
%endif |
paddb t2, t3 |
mova t3, t2 |
mova t4, t2 |
psrlw t2, 1 |
pavgb t2, mpb_0 |
pxor t2, t0 |
pand t2, mpb_1 |
psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4; |
%if ARCH_X86_64 |
pavgb t1, p2, q1 |
psubb t2, p2, q1 |
%else |
mova t1, p2 |
mova t2, p2 |
pavgb t1, q1 |
psubb t2, q1 |
%endif |
paddb t3, t3 |
psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1 |
pand t2, mpb_1 |
psubb t1, t2 |
pavgb t1, p1 |
pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2 |
psrlw t3, 2 |
pavgb t3, mpb_0 |
pxor t3, t1 |
pand t3, mpb_1 |
psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8 |
pxor t3, p0, q1 |
pavgb t2, p0, q1 |
pand t3, mpb_1 |
psubb t2, t3 |
pavgb t2, p1 ; p0'b = (2*p1+p0+q0+2)/4 |
pxor t1, t2 |
pxor t2, p0 |
pand t1, mask1p |
pand t2, mask0 |
pxor t1, t2 |
pxor t1, p0 |
mova %1, t1 ; store p0 |
mova t1, %4 ; p3 |
paddb t2, t1, p2 |
pavgb t1, p2 |
pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4 |
paddb t2, t2 |
paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0 |
psrlw t2, 2 |
pavgb t2, mpb_0 |
pxor t2, t1 |
pand t2, mpb_1 |
psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8 |
pxor t0, p1 |
pxor t1, p2 |
pand t0, mask1p |
pand t1, mask1p |
pxor t0, p1 |
pxor t1, p2 |
mova %2, t0 ; store p1 |
mova %3, t1 ; store p2 |
%endmacro |
%macro LUMA_INTRA_SWAP_PQ 0 |
%define q1 m0 |
%define q0 m1 |
%define p0 m2 |
%define p1 m3 |
%define p2 q2 |
%define mask1p mask1q |
%endmacro |
%macro DEBLOCK_LUMA_INTRA 1 |
%define p1 m0 |
%define p0 m1 |
%define q0 m2 |
%define q1 m3 |
%define t0 m4 |
%define t1 m5 |
%define t2 m6 |
%define t3 m7 |
%if ARCH_X86_64 |
%define p2 m8 |
%define q2 m9 |
%define t4 m10 |
%define t5 m11 |
%define mask0 m12 |
%define mask1p m13 |
%if WIN64 |
%define mask1q [rsp] |
%else |
%define mask1q [rsp-24] |
%endif |
%define mpb_0 m14 |
%define mpb_1 m15 |
%else |
%define spill(x) [esp+16*x] |
%define p2 [r4+r1] |
%define q2 [r0+2*r1] |
%define t4 spill(0) |
%define t5 spill(1) |
%define mask0 spill(2) |
%define mask1p spill(3) |
%define mask1q spill(4) |
%define mpb_0 [pb_0] |
%define mpb_1 [pb_1] |
%endif |
;----------------------------------------------------------------------------- |
; void deblock_v_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) |
;----------------------------------------------------------------------------- |
%if WIN64 |
cglobal deblock_%1_luma_intra_8, 4,6,16,0x10 |
%else |
cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50 |
%endif |
lea r4, [r1*4] |
lea r5, [r1*3] ; 3*stride |
dec r2d ; alpha-1 |
jl .end |
neg r4 |
dec r3d ; beta-1 |
jl .end |
add r4, r0 ; pix-4*stride |
mova p1, [r4+2*r1] |
mova p0, [r4+r5] |
mova q0, [r0] |
mova q1, [r0+r1] |
%if ARCH_X86_64 |
pxor mpb_0, mpb_0 |
mova mpb_1, [pb_1] |
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 |
SWAP 7, 12 ; m12=mask0 |
pavgb t5, mpb_0 |
pavgb t5, mpb_1 ; alpha/4+1 |
movdqa p2, [r4+r1] |
movdqa q2, [r0+2*r1] |
DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1 |
DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1 |
DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1 |
pand t0, mask0 |
pand t4, t0 |
pand t2, t0 |
mova mask1q, t4 |
mova mask1p, t2 |
%else |
LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0 |
mova m4, t5 |
mova mask0, m7 |
pavgb m4, [pb_0] |
pavgb m4, [pb_1] ; alpha/4+1 |
DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1 |
pand m6, mask0 |
DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1 |
pand m4, m6 |
mova mask1p, m4 |
DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1 |
pand m4, m6 |
mova mask1q, m4 |
%endif |
LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4] |
LUMA_INTRA_SWAP_PQ |
LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5] |
.end: |
RET |
INIT_MMX cpuname |
%if ARCH_X86_64 |
;----------------------------------------------------------------------------- |
; void deblock_h_luma_intra( uint8_t *pix, int stride, int alpha, int beta ) |
;----------------------------------------------------------------------------- |
cglobal deblock_h_luma_intra_8, 4,9,0,0x80 |
movsxd r7, r1d |
lea r8, [r7*3] |
lea r6, [r0-4] |
lea r5, [r0-4+r8] |
%if WIN64 |
%define pix_tmp rsp+0x20 ; shadow space |
%else |
%define pix_tmp rsp |
%endif |
; transpose 8x16 -> tmp space |
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) |
lea r6, [r6+r7*8] |
lea r5, [r5+r7*8] |
TRANSPOSE8x8_MEM PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) |
lea r0, [pix_tmp+0x40] |
mov r1, 0x10 |
call deblock_v_luma_intra_8 |
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) |
lea r5, [r6+r8] |
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8) |
shl r7, 3 |
sub r6, r7 |
sub r5, r7 |
shr r7, 3 |
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8) |
RET |
%else |
cglobal deblock_h_luma_intra_8, 2,4,8,0x80 |
lea r3, [r1*3] |
sub r0, 4 |
lea r2, [r0+r3] |
%define pix_tmp rsp |
; transpose 8x16 -> tmp space |
TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30) |
lea r0, [r0+r1*8] |
lea r2, [r2+r1*8] |
TRANSPOSE8x8_MEM PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30) |
lea r0, [pix_tmp+0x40] |
PUSH dword r3m |
PUSH dword r2m |
PUSH dword 16 |
PUSH r0 |
call deblock_%1_luma_intra_8 |
%ifidn %1, v8 |
add dword [rsp], 8 ; pix_tmp+8 |
call deblock_%1_luma_intra_8 |
%endif |
ADD esp, 16 |
mov r1, r1m |
mov r0, r0mp |
lea r3, [r1*3] |
sub r0, 4 |
lea r2, [r0+r3] |
; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8) |
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) |
lea r0, [r0+r1*8] |
lea r2, [r2+r1*8] |
TRANSPOSE8x8_MEM PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3) |
RET |
%endif ; ARCH_X86_64 |
%endmacro ; DEBLOCK_LUMA_INTRA |
INIT_XMM sse2 |
DEBLOCK_LUMA_INTRA v |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
DEBLOCK_LUMA_INTRA v |
%endif |
%if ARCH_X86_64 == 0 |
INIT_MMX mmxext |
DEBLOCK_LUMA_INTRA v8 |
%endif |
INIT_MMX mmxext |
%macro CHROMA_V_START 0 |
dec r2d ; alpha-1 |
dec r3d ; beta-1 |
mov t5, r0 |
sub t5, r1 |
sub t5, r1 |
%endmacro |
%macro CHROMA_H_START 0 |
dec r2d |
dec r3d |
sub r0, 2 |
lea t6, [r1*3] |
mov t5, r0 |
add r0, t6 |
%endmacro |
%define t5 r5 |
%define t6 r6 |
;----------------------------------------------------------------------------- |
; void ff_deblock_v_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |
;----------------------------------------------------------------------------- |
cglobal deblock_v_chroma_8, 5,6 |
CHROMA_V_START |
movq m0, [t5] |
movq m1, [t5+r1] |
movq m2, [r0] |
movq m3, [r0+r1] |
call ff_chroma_inter_body_mmxext |
movq [t5+r1], m1 |
movq [r0], m2 |
RET |
;----------------------------------------------------------------------------- |
; void ff_deblock_h_chroma( uint8_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |
;----------------------------------------------------------------------------- |
cglobal deblock_h_chroma_8, 5,7 |
%if UNIX64 |
%define buf0 [rsp-24] |
%define buf1 [rsp-16] |
%elif WIN64 |
sub rsp, 16 |
%define buf0 [rsp] |
%define buf1 [rsp+8] |
%else |
%define buf0 r0m |
%define buf1 r2m |
%endif |
CHROMA_H_START |
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) |
movq buf0, m0 |
movq buf1, m3 |
LOAD_MASK r2d, r3d |
movd m6, [r4] ; tc0 |
punpcklbw m6, m6 |
pand m7, m6 |
DEBLOCK_P0_Q0 |
movq m0, buf0 |
movq m3, buf1 |
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) |
%if WIN64 |
add rsp, 16 |
%endif |
RET |
ALIGN 16 |
ff_chroma_inter_body_mmxext: |
LOAD_MASK r2d, r3d |
movd m6, [r4] ; tc0 |
punpcklbw m6, m6 |
pand m7, m6 |
DEBLOCK_P0_Q0 |
ret |
; in: %1=p0 %2=p1 %3=q1 |
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2 |
%macro CHROMA_INTRA_P0 3 |
movq m4, %1 |
pxor m4, %3 |
pand m4, [pb_1] ; m4 = (p0^q1)&1 |
pavgb %1, %3 |
psubusb %1, m4 |
pavgb %1, %2 ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1)) |
%endmacro |
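; The avg(p0,q1) - ((p0^q1)&1) step is the truncating average (p0+q1)>>1, |
; so the final pavgb reproduces the spec formula exactly: |
; |
;   p0 = (p0 + q1 + 2*p1 + 2) >> 2;   /* scalar form, for illustration */ |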
%define t5 r4 |
%define t6 r5 |
;----------------------------------------------------------------------------- |
; void ff_deblock_v_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) |
;----------------------------------------------------------------------------- |
cglobal deblock_v_chroma_intra_8, 4,5 |
CHROMA_V_START |
movq m0, [t5] |
movq m1, [t5+r1] |
movq m2, [r0] |
movq m3, [r0+r1] |
call ff_chroma_intra_body_mmxext |
movq [t5+r1], m1 |
movq [r0], m2 |
RET |
;----------------------------------------------------------------------------- |
; void ff_deblock_h_chroma_intra( uint8_t *pix, int stride, int alpha, int beta ) |
;----------------------------------------------------------------------------- |
cglobal deblock_h_chroma_intra_8, 4,6 |
CHROMA_H_START |
TRANSPOSE4x8_LOAD bw, wd, dq, PASS8ROWS(t5, r0, r1, t6) |
call ff_chroma_intra_body_mmxext |
TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6) |
RET |
ALIGN 16 |
ff_chroma_intra_body_mmxext: |
LOAD_MASK r2d, r3d |
movq m5, m1 |
movq m6, m2 |
CHROMA_INTRA_P0 m1, m0, m3 |
CHROMA_INTRA_P0 m2, m3, m0 |
psubb m1, m5 |
psubb m2, m6 |
pand m1, m7 |
pand m2, m7 |
paddb m1, m5 |
paddb m2, m6 |
ret |
;----------------------------------------------------------------------------- |
; void h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40], |
; int8_t ref[2][40], int16_t mv[2][40][2], |
; int bidir, int edges, int step, |
; int mask_mv0, int mask_mv1, int field); |
; |
; bidir is 0 or 1 |
; edges is 1 or 4 |
; step is 1 or 2 |
; mask_mv0 is 0 or 3 |
; mask_mv1 is 0 or 1 |
; field is 0 or 1 |
;----------------------------------------------------------------------------- |
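; Per 4x4 edge this computes the usual strength decision; one iteration is |
; roughly (illustrative scalar sketch, not the exact source layout): |
; |
;   int bs; |
;   if (nnz[b] | nnz[bn]) |
;       bs = 2;                                /* residual on either side */ |
;   else if (ref[b] != ref[bn] || |
;            abs(mv[b][0] - mv[bn][0]) >= 4 || /* quarter-pel units */ |
;            abs(mv[b][1] - mv[bn][1]) >= (field ? 2 : 4)) |
;       bs = 1;                                /* motion discontinuity */ |
;   else |
;       bs = 0; |
; |
; pb_3 / pb_3_1 above hold limit-1 per mv component, which the biased |
; psubusb turns into the >= limit test. |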
%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv, |
; dir, d_idx, mask_dir, bidir |
%define edgesd %1 |
%define stepd %2 |
%define mask_mvd %3 |
%define dir %4 |
%define d_idx %5 |
%define mask_dir %6 |
%define bidir %7 |
xor b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step) |
%%.b_idx_loop: |
%if mask_dir == 0 |
pxor m0, m0 |
%endif |
test b_idxd, dword mask_mvd |
jnz %%.skip_loop_iter ; if (!(b_idx & mask_mv)) |
%if bidir == 1 |
movd m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] } |
punpckldq m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] } |
pshufw m0, [refq+b_idxq+12], 0x44 ; { ref0[b], ref0[b] } |
pshufw m1, [refq+b_idxq+52], 0x44 ; { ref1[b], ref1[b] } |
pshufw m3, m2, 0x4E ; { ref1[bn], ref0[bn] } |
psubb m0, m2 ; { ref0[b] != ref0[bn], |
; ref0[b] != ref1[bn] } |
psubb m1, m3 ; { ref1[b] != ref1[bn], |
; ref1[b] != ref0[bn] } |
por m0, m1 |
mova m1, [mvq+b_idxq*4+(d_idx+12)*4] |
mova m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize] |
mova m3, m1 |
mova m4, m2 |
psubw m1, [mvq+b_idxq*4+12*4] |
psubw m2, [mvq+b_idxq*4+12*4+mmsize] |
psubw m3, [mvq+b_idxq*4+52*4] |
psubw m4, [mvq+b_idxq*4+52*4+mmsize] |
packsswb m1, m2 |
packsswb m3, m4 |
paddb m1, m6 |
paddb m3, m6 |
psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit |
psubusb m3, m5 |
packsswb m1, m3 |
por m0, m1 |
mova m1, [mvq+b_idxq*4+(d_idx+52)*4] |
mova m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize] |
mova m3, m1 |
mova m4, m2 |
psubw m1, [mvq+b_idxq*4+12*4] |
psubw m2, [mvq+b_idxq*4+12*4+mmsize] |
psubw m3, [mvq+b_idxq*4+52*4] |
psubw m4, [mvq+b_idxq*4+52*4+mmsize] |
packsswb m1, m2 |
packsswb m3, m4 |
paddb m1, m6 |
paddb m3, m6 |
psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit |
psubusb m3, m5 |
packsswb m1, m3 |
pshufw m1, m1, 0x4E |
por m0, m1 |
pshufw m1, m0, 0x4E |
pminub m0, m1 |
%else ; bidir == 0 |
movd m0, [refq+b_idxq+12] |
psubb m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn] |
mova m1, [mvq+b_idxq*4+12*4] |
mova m2, [mvq+b_idxq*4+12*4+mmsize] |
psubw m1, [mvq+b_idxq*4+(d_idx+12)*4] |
psubw m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize] |
packsswb m1, m2 |
paddb m1, m6 |
psubusb m1, m5 ; abs(mv[b] - mv[bn]) >= limit |
packsswb m1, m1 |
por m0, m1 |
%endif ; bidir == 1/0 |
%%.skip_loop_iter: |
movd m1, [nnzq+b_idxq+12] |
por m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn] |
pminub m1, m7 |
pminub m0, m7 |
psllw m1, 1 |
pxor m2, m2 |
pmaxub m1, m0 |
punpcklbw m1, m2 |
movq [bsq+b_idxq+32*dir], m1 |
add b_idxd, dword stepd |
cmp b_idxd, dword edgesd |
jl %%.b_idx_loop |
%endmacro |
INIT_MMX mmxext |
cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \ |
step, mask_mv0, mask_mv1, field |
%define b_idxq bidirq |
%define b_idxd bidird |
cmp dword fieldm, 0 |
mova m7, [pb_1] |
mova m5, [pb_3] |
je .nofield |
mova m5, [pb_3_1] |
.nofield: |
mova m6, m5 |
paddb m5, m5 |
shl dword stepd, 3 |
shl dword edgesd, 3 |
%if ARCH_X86_32 |
%define mask_mv0d mask_mv0m |
%define mask_mv1d mask_mv1m |
%endif |
shl dword mask_mv1d, 3 |
shl dword mask_mv0d, 3 |
cmp dword bidird, 0 |
jne .bidir |
loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 0 |
loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 0 |
mova m0, [bsq+mmsize*0] |
mova m1, [bsq+mmsize*1] |
mova m2, [bsq+mmsize*2] |
mova m3, [bsq+mmsize*3] |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |
mova [bsq+mmsize*0], m0 |
mova [bsq+mmsize*1], m1 |
mova [bsq+mmsize*2], m2 |
mova [bsq+mmsize*3], m3 |
RET |
.bidir: |
loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8, 0, 1 |
loop_filter_strength_iteration 32, 8, mask_mv0d, 0, -1, -1, 1 |
mova m0, [bsq+mmsize*0] |
mova m1, [bsq+mmsize*1] |
mova m2, [bsq+mmsize*2] |
mova m3, [bsq+mmsize*3] |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |
mova [bsq+mmsize*0], m0 |
mova [bsq+mmsize*1], m1 |
mova [bsq+mmsize*2], m2 |
mova [bsq+mmsize*3], m3 |
RET |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_deblock_10bit.asm |
---|
0,0 → 1,923 |
;***************************************************************************** |
;* MMX/SSE2/AVX-optimized 10-bit H.264 deblocking code |
;***************************************************************************** |
;* Copyright (C) 2005-2011 x264 project |
;* |
;* Authors: Oskar Arvidsson <oskar@irock.se> |
;* Loren Merritt <lorenm@u.washington.edu> |
;* Jason Garrett-Glaser <darkshikari@gmail.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
pw_pixel_max: times 8 dw ((1 << 10)-1) |
SECTION .text |
cextern pw_2 |
cextern pw_3 |
cextern pw_4 |
; out: %4 = |%1-%2|-%3 |
; clobbers: %5 |
%macro ABS_SUB 5 |
psubusw %5, %2, %1 |
psubusw %4, %1, %2 |
por %4, %5 |
psubw %4, %3 |
%endmacro |
; out: %4 = |%1-%2|<%3 |
%macro DIFF_LT 5 |
psubusw %4, %2, %1 |
psubusw %5, %1, %2 |
por %5, %4 ; |%1-%2| |
pxor %4, %4 |
psubw %5, %3 ; |%1-%2|-%3 |
pcmpgtw %4, %5 ; 0 > |%1-%2|-%3 |
%endmacro |
%macro LOAD_AB 4 |
movd %1, %3 |
movd %2, %4 |
SPLATW %1, %1 |
SPLATW %2, %2 |
%endmacro |
; in: %2=tc reg |
; out: %1=splatted tc |
%macro LOAD_TC 2 |
movd %1, [%2] |
punpcklbw %1, %1 |
%if mmsize == 8 |
pshufw %1, %1, 0 |
%else |
pshuflw %1, %1, 01010000b |
pshufd %1, %1, 01010000b |
%endif |
psraw %1, 6 |
%endmacro |
; in: %1=p1, %2=p0, %3=q0, %4=q1 |
; %5=alpha, %6=beta, %7-%9=tmp |
; out: %7=mask |
%macro LOAD_MASK 9 |
ABS_SUB %2, %3, %5, %8, %7 ; |p0-q0| - alpha |
ABS_SUB %1, %2, %6, %9, %7 ; |p1-p0| - beta |
pand %8, %9 |
ABS_SUB %3, %4, %6, %9, %7 ; |q1-q0| - beta |
pxor %7, %7 |
pand %8, %9 |
pcmpgtw %7, %8 |
%endmacro |
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp |
; out: %1=p0', m2=q0' |
%macro DEBLOCK_P0_Q0 7 |
psubw %3, %4 |
pxor %7, %7 |
paddw %3, [pw_4] |
psubw %7, %5 |
psubw %6, %2, %1 |
psllw %6, 2 |
paddw %3, %6 |
psraw %3, 3 |
mova %6, [pw_pixel_max] |
CLIPW %3, %7, %5 |
pxor %7, %7 |
paddw %1, %3 |
psubw %2, %3 |
CLIPW %1, %7, %6 |
CLIPW %2, %7, %6 |
%endmacro |
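; Working in signed words, this 10-bit variant needs no byte-bias tricks |
; and is a direct transcription of (illustrative scalar form): |
; |
;   int delta = av_clip((((q0 - p0) * 4) + (p1 - q1) + 4) >> 3, -tc, tc); |
;   p0 = av_clip(p0 + delta, 0, 1023);   /* pw_pixel_max = (1<<10)-1 */ |
;   q0 = av_clip(q0 - delta, 0, 1023); |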
; in: %1=x2, %2=x1, %3=p0, %4=q0 %5=mask&tc, %6=tmp |
%macro LUMA_Q1 6 |
pavgw %6, %3, %4 ; (p0+q0+1)>>1 |
paddw %1, %6 |
pxor %6, %6 |
psraw %1, 1 |
psubw %6, %5 |
psubw %1, %2 |
CLIPW %1, %6, %5 |
paddw %1, %2 |
%endmacro |
%macro LUMA_DEBLOCK_ONE 3 |
DIFF_LT m5, %1, bm, m4, m6 |
pxor m6, m6 |
mova %3, m4 |
pcmpgtw m6, tcm |
pand m4, tcm |
pandn m6, m7 |
pand m4, m6 |
LUMA_Q1 m5, %2, m1, m2, m4, m6 |
%endmacro |
%macro LUMA_H_STORE 2 |
%if mmsize == 8 |
movq [r0-4], m0 |
movq [r0+r1-4], m1 |
movq [r0+r1*2-4], m2 |
movq [r0+%2-4], m3 |
%else |
movq [r0-4], m0 |
movhps [r0+r1-4], m0 |
movq [r0+r1*2-4], m1 |
movhps [%1-4], m1 |
movq [%1+r1-4], m2 |
movhps [%1+r1*2-4], m2 |
movq [%1+%2-4], m3 |
movhps [%1+r1*4-4], m3 |
%endif |
%endmacro |
%macro DEBLOCK_LUMA 0 |
;----------------------------------------------------------------------------- |
; void deblock_v_luma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |
;----------------------------------------------------------------------------- |
cglobal deblock_v_luma_10, 5,5,8*(mmsize/16) |
%assign pad 5*mmsize+12-(stack_offset&15) |
%define tcm [rsp] |
%define ms1 [rsp+mmsize] |
%define ms2 [rsp+mmsize*2] |
%define am [rsp+mmsize*3] |
%define bm [rsp+mmsize*4] |
SUB rsp, pad |
shl r2d, 2 |
shl r3d, 2 |
LOAD_AB m4, m5, r2d, r3d |
mov r3, 32/mmsize |
mov r2, r0 |
sub r0, r1 |
mova am, m4 |
sub r0, r1 |
mova bm, m5 |
sub r0, r1 |
.loop: |
mova m0, [r0+r1] |
mova m1, [r0+r1*2] |
mova m2, [r2] |
mova m3, [r2+r1] |
LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 |
LOAD_TC m6, r4 |
mova tcm, m6 |
mova m5, [r0] |
LUMA_DEBLOCK_ONE m1, m0, ms1 |
mova [r0+r1], m5 |
mova m5, [r2+r1*2] |
LUMA_DEBLOCK_ONE m2, m3, ms2 |
mova [r2+r1], m5 |
pxor m5, m5 |
mova m6, tcm |
pcmpgtw m5, tcm |
psubw m6, ms1 |
pandn m5, m7 |
psubw m6, ms2 |
pand m5, m6 |
DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 |
mova [r0+r1*2], m1 |
mova [r2], m2 |
add r0, mmsize |
add r2, mmsize |
add r4, mmsize/8 |
dec r3 |
jg .loop |
ADD rsp, pad |
RET |
cglobal deblock_h_luma_10, 5,6,8*(mmsize/16) |
%assign pad 7*mmsize+12-(stack_offset&15) |
%define tcm [rsp] |
%define ms1 [rsp+mmsize] |
%define ms2 [rsp+mmsize*2] |
%define p1m [rsp+mmsize*3] |
%define p2m [rsp+mmsize*4] |
%define am [rsp+mmsize*5] |
%define bm [rsp+mmsize*6] |
SUB rsp, pad |
shl r2d, 2 |
shl r3d, 2 |
LOAD_AB m4, m5, r2d, r3d |
mov r3, r1 |
mova am, m4 |
add r3, r1 |
mov r5, 32/mmsize |
mova bm, m5 |
add r3, r1 |
%if mmsize == 16 |
mov r2, r0 |
add r2, r3 |
%endif |
.loop: |
%if mmsize == 8 |
movq m2, [r0-8] ; y q2 q1 q0 |
movq m7, [r0+0] |
movq m5, [r0+r1-8] |
movq m3, [r0+r1+0] |
movq m0, [r0+r1*2-8] |
movq m6, [r0+r1*2+0] |
movq m1, [r0+r3-8] |
TRANSPOSE4x4W 2, 5, 0, 1, 4 |
SWAP 2, 7 |
movq m7, [r0+r3] |
TRANSPOSE4x4W 2, 3, 6, 7, 4 |
%else |
movu m5, [r0-8] ; y q2 q1 q0 p0 p1 p2 x |
movu m0, [r0+r1-8] |
movu m2, [r0+r1*2-8] |
movu m3, [r2-8] |
TRANSPOSE4x4W 5, 0, 2, 3, 6 |
mova tcm, m3 |
movu m4, [r2+r1-8] |
movu m1, [r2+r1*2-8] |
movu m3, [r2+r3-8] |
movu m7, [r2+r1*4-8] |
TRANSPOSE4x4W 4, 1, 3, 7, 6 |
mova m6, tcm |
punpcklqdq m6, m7 |
punpckhqdq m5, m4 |
SBUTTERFLY qdq, 0, 1, 7 |
SBUTTERFLY qdq, 2, 3, 7 |
%endif |
mova p2m, m6 |
LOAD_MASK m0, m1, m2, m3, am, bm, m7, m4, m6 |
LOAD_TC m6, r4 |
mova tcm, m6 |
LUMA_DEBLOCK_ONE m1, m0, ms1 |
mova p1m, m5 |
mova m5, p2m |
LUMA_DEBLOCK_ONE m2, m3, ms2 |
mova p2m, m5 |
pxor m5, m5 |
mova m6, tcm |
pcmpgtw m5, tcm |
psubw m6, ms1 |
pandn m5, m7 |
psubw m6, ms2 |
pand m5, m6 |
DEBLOCK_P0_Q0 m1, m2, m0, m3, m5, m7, m6 |
mova m0, p1m |
mova m3, p2m |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |
LUMA_H_STORE r2, r3 |
add r4, mmsize/8 |
lea r0, [r0+r1*(mmsize/2)] |
lea r2, [r2+r1*(mmsize/2)] |
dec r5 |
jg .loop |
ADD rsp, pad |
RET |
%endmacro |
%if ARCH_X86_64 |
; in: m0=p1, m1=p0, m2=q0, m3=q1, m8=p2, m9=q2 |
; m12=alpha, m13=beta |
; out: m0=p1', m3=q1', m1=p0', m2=q0' |
; clobbers: m4, m5, m6, m7, m10, m11, m14 |
%macro DEBLOCK_LUMA_INTER_SSE2 0 |
LOAD_MASK m0, m1, m2, m3, m12, m13, m7, m4, m6 |
LOAD_TC m6, r4 |
DIFF_LT m8, m1, m13, m10, m4 |
DIFF_LT m9, m2, m13, m11, m4 |
pand m6, m7 |
mova m14, m6 |
pxor m4, m4 |
pcmpgtw m6, m4 |
pand m6, m14 |
mova m5, m10 |
pand m5, m6 |
LUMA_Q1 m8, m0, m1, m2, m5, m4 |
mova m5, m11 |
pand m5, m6 |
LUMA_Q1 m9, m3, m1, m2, m5, m4 |
pxor m4, m4 |
psubw m6, m10 |
pcmpgtw m4, m14 |
pandn m4, m7 |
psubw m6, m11 |
pand m4, m6 |
DEBLOCK_P0_Q0 m1, m2, m0, m3, m4, m5, m6 |
SWAP 0, 8 |
SWAP 3, 9 |
%endmacro |
%macro DEBLOCK_LUMA_64 0 |
cglobal deblock_v_luma_10, 5,5,15 |
%define p2 m8 |
%define p1 m0 |
%define p0 m1 |
%define q0 m2 |
%define q1 m3 |
%define q2 m9 |
%define mask0 m7 |
%define mask1 m10 |
%define mask2 m11 |
shl r2d, 2 |
shl r3d, 2 |
LOAD_AB m12, m13, r2d, r3d |
mov r2, r0 |
sub r0, r1 |
sub r0, r1 |
sub r0, r1 |
mov r3, 2 |
.loop: |
mova p2, [r0] |
mova p1, [r0+r1] |
mova p0, [r0+r1*2] |
mova q0, [r2] |
mova q1, [r2+r1] |
mova q2, [r2+r1*2] |
DEBLOCK_LUMA_INTER_SSE2 |
mova [r0+r1], p1 |
mova [r0+r1*2], p0 |
mova [r2], q0 |
mova [r2+r1], q1 |
add r0, mmsize |
add r2, mmsize |
add r4, 2 |
dec r3 |
jg .loop |
REP_RET |
cglobal deblock_h_luma_10, 5,7,15 |
shl r2d, 2 |
shl r3d, 2 |
LOAD_AB m12, m13, r2d, r3d |
mov r2, r1 |
add r2, r1 |
add r2, r1 |
mov r5, r0 |
add r5, r2 |
mov r6, 2 |
.loop: |
movu m8, [r0-8] ; y q2 q1 q0 p0 p1 p2 x |
movu m0, [r0+r1-8] |
movu m2, [r0+r1*2-8] |
movu m9, [r5-8] |
movu m5, [r5+r1-8] |
movu m1, [r5+r1*2-8] |
movu m3, [r5+r2-8] |
movu m7, [r5+r1*4-8] |
TRANSPOSE4x4W 8, 0, 2, 9, 10 |
TRANSPOSE4x4W 5, 1, 3, 7, 10 |
punpckhqdq m8, m5 |
SBUTTERFLY qdq, 0, 1, 10 |
SBUTTERFLY qdq, 2, 3, 10 |
punpcklqdq m9, m7 |
DEBLOCK_LUMA_INTER_SSE2 |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |
LUMA_H_STORE r5, r2 |
add r4, 2 |
lea r0, [r0+r1*8] |
lea r5, [r5+r1*8] |
dec r6 |
jg .loop |
REP_RET |
%endmacro |
INIT_XMM sse2 |
DEBLOCK_LUMA_64 |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
DEBLOCK_LUMA_64 |
%endif |
%endif |
%macro SWAPMOVA 2 |
%ifid %1 |
SWAP %1, %2 |
%else |
mova %1, %2 |
%endif |
%endmacro |
; in: t0-t2: tmp registers |
; %1=p0 %2=p1 %3=p2 %4=p3 %5=q0 %6=q1 %7=mask0 |
; %8=mask1p %9=2 %10=p0' %11=p1' %12=p2' |
%macro LUMA_INTRA_P012 12 ; p0..p3 in memory |
%if ARCH_X86_64 |
paddw t0, %3, %2 |
mova t2, %4 |
paddw t2, %3 |
%else |
mova t0, %3 |
mova t2, %4 |
paddw t0, %2 |
paddw t2, %3 |
%endif |
paddw t0, %1 |
paddw t2, t2 |
paddw t0, %5 |
paddw t2, %9 |
paddw t0, %9 ; (p2 + p1 + p0 + q0 + 2) |
paddw t2, t0 ; (2*p3 + 3*p2 + p1 + p0 + q0 + 4) |
psrlw t2, 3 |
psrlw t1, t0, 2 |
psubw t2, %3 |
psubw t1, %2 |
pand t2, %8 |
pand t1, %8 |
paddw t2, %3 |
paddw t1, %2 |
SWAPMOVA %11, t1 |
psubw t1, t0, %3 |
paddw t0, t0 |
psubw t1, %5 |
psubw t0, %3 |
paddw t1, %6 |
paddw t1, %2 |
paddw t0, %6 |
psrlw t1, 2 ; (2*p1 + p0 + q1 + 2)/4 |
psrlw t0, 3 ; (p2 + 2*p1 + 2*p0 + 2*q0 + q1 + 4)>>3 |
pxor t0, t1 |
pxor t1, %1 |
pand t0, %8 |
pand t1, %7 |
pxor t0, t1 |
pxor t0, %1 |
SWAPMOVA %10, t0 |
SWAPMOVA %12, t2 |
%endmacro |
%macro LUMA_INTRA_INIT 1 |
%xdefine pad %1*mmsize+((gprsize*3) % mmsize)-(stack_offset&15) |
%define t0 m4 |
%define t1 m5 |
%define t2 m6 |
%define t3 m7 |
%assign i 4 |
%rep %1 |
CAT_XDEFINE t, i, [rsp+mmsize*(i-4)] |
%assign i i+1 |
%endrep |
SUB rsp, pad |
%endmacro |
; in: %1-%3=tmp, %4=p2, %5=q2 |
%macro LUMA_INTRA_INTER 5 |
LOAD_AB t0, t1, r2d, r3d |
mova %1, t0 |
LOAD_MASK m0, m1, m2, m3, %1, t1, t0, t2, t3 |
%if ARCH_X86_64 |
mova %2, t0 ; mask0 |
psrlw t3, %1, 2 |
%else |
mova t3, %1 |
mova %2, t0 ; mask0 |
psrlw t3, 2 |
%endif |
paddw t3, [pw_2] ; alpha/4+2 |
DIFF_LT m1, m2, t3, t2, t0 ; t2 = |p0-q0| < alpha/4+2 |
pand t2, %2 |
mova t3, %5 ; q2 |
mova %1, t2 ; mask1 |
DIFF_LT t3, m2, t1, t2, t0 ; t2 = |q2-q0| < beta |
pand t2, %1 |
mova t3, %4 ; p2 |
mova %3, t2 ; mask1q |
DIFF_LT t3, m1, t1, t2, t0 ; t2 = |p2-p0| < beta |
pand t2, %1 |
mova %1, t2 ; mask1p |
%endmacro |
%macro LUMA_H_INTRA_LOAD 0 |
%if mmsize == 8 |
movu t0, [r0-8] |
movu t1, [r0+r1-8] |
movu m0, [r0+r1*2-8] |
movu m1, [r0+r4-8] |
TRANSPOSE4x4W 4, 5, 0, 1, 2 |
mova t4, t0 ; p3 |
mova t5, t1 ; p2 |
movu m2, [r0] |
movu m3, [r0+r1] |
movu t0, [r0+r1*2] |
movu t1, [r0+r4] |
TRANSPOSE4x4W 2, 3, 4, 5, 6 |
mova t6, t0 ; q2 |
mova t7, t1 ; q3 |
%else |
movu t0, [r0-8] |
movu t1, [r0+r1-8] |
movu m0, [r0+r1*2-8] |
movu m1, [r0+r5-8] |
movu m2, [r4-8] |
movu m3, [r4+r1-8] |
movu t2, [r4+r1*2-8] |
movu t3, [r4+r5-8] |
TRANSPOSE8x8W 4, 5, 0, 1, 2, 3, 6, 7, t4, t5 |
mova t4, t0 ; p3 |
mova t5, t1 ; p2 |
mova t6, t2 ; q2 |
mova t7, t3 ; q3 |
%endif |
%endmacro |
; in: %1=q3 %2=q2' %3=q1' %4=q0' %5=p0' %6=p1' %7=p2' %8=p3 %9=tmp |
%macro LUMA_H_INTRA_STORE 9 |
%if mmsize == 8 |
TRANSPOSE4x4W %1, %2, %3, %4, %9 |
movq [r0-8], m%1 |
movq [r0+r1-8], m%2 |
movq [r0+r1*2-8], m%3 |
movq [r0+r4-8], m%4 |
movq m%1, %8 |
TRANSPOSE4x4W %5, %6, %7, %1, %9 |
movq [r0], m%5 |
movq [r0+r1], m%6 |
movq [r0+r1*2], m%7 |
movq [r0+r4], m%1 |
%else |
TRANSPOSE2x4x4W %1, %2, %3, %4, %9 |
movq [r0-8], m%1 |
movq [r0+r1-8], m%2 |
movq [r0+r1*2-8], m%3 |
movq [r0+r5-8], m%4 |
movhps [r4-8], m%1 |
movhps [r4+r1-8], m%2 |
movhps [r4+r1*2-8], m%3 |
movhps [r4+r5-8], m%4 |
%ifnum %8 |
SWAP %1, %8 |
%else |
mova m%1, %8 |
%endif |
TRANSPOSE2x4x4W %5, %6, %7, %1, %9 |
movq [r0], m%5 |
movq [r0+r1], m%6 |
movq [r0+r1*2], m%7 |
movq [r0+r5], m%1 |
movhps [r4], m%5 |
movhps [r4+r1], m%6 |
movhps [r4+r1*2], m%7 |
movhps [r4+r5], m%1 |
%endif |
%endmacro |
%if ARCH_X86_64 |
;----------------------------------------------------------------------------- |
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) |
;----------------------------------------------------------------------------- |
%macro DEBLOCK_LUMA_INTRA_64 0 |
cglobal deblock_v_luma_intra_10, 4,7,16 |
%define t0 m1 |
%define t1 m2 |
%define t2 m4 |
%define p2 m8 |
%define p1 m9 |
%define p0 m10 |
%define q0 m11 |
%define q1 m12 |
%define q2 m13 |
%define aa m5 |
%define bb m14 |
lea r4, [r1*4] |
lea r5, [r1*3] ; 3*stride |
neg r4 |
add r4, r0 ; pix-4*stride |
mov r6, 2 |
mova m0, [pw_2] |
shl r2d, 2 |
shl r3d, 2 |
LOAD_AB aa, bb, r2d, r3d |
.loop: |
mova p2, [r4+r1] |
mova p1, [r4+2*r1] |
mova p0, [r4+r5] |
mova q0, [r0] |
mova q1, [r0+r1] |
mova q2, [r0+2*r1] |
LOAD_MASK p1, p0, q0, q1, aa, bb, m3, t0, t1 |
mova t2, aa |
psrlw t2, 2 |
paddw t2, m0 ; alpha/4+2 |
DIFF_LT p0, q0, t2, m6, t0 ; m6 = |p0-q0| < alpha/4+2 |
DIFF_LT p2, p0, bb, t1, t0 ; t1 = |p2-p0| < beta |
DIFF_LT q2, q0, bb, m7, t0 ; m7 = |q2-q0| < beta |
pand m6, m3 |
pand m7, m6 |
pand m6, t1 |
LUMA_INTRA_P012 p0, p1, p2, [r4], q0, q1, m3, m6, m0, [r4+r5], [r4+2*r1], [r4+r1] |
LUMA_INTRA_P012 q0, q1, q2, [r0+r5], p0, p1, m3, m7, m0, [r0], [r0+r1], [r0+2*r1] |
add r0, mmsize |
add r4, mmsize |
dec r6 |
jg .loop |
REP_RET |
;----------------------------------------------------------------------------- |
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) |
;----------------------------------------------------------------------------- |
cglobal deblock_h_luma_intra_10, 4,7,16 |
%define t0 m15 |
%define t1 m14 |
%define t2 m2 |
%define q3 m5 |
%define q2 m8 |
%define q1 m9 |
%define q0 m10 |
%define p0 m11 |
%define p1 m12 |
%define p2 m13 |
%define p3 m4 |
%define spill [rsp] |
%assign pad 24-(stack_offset&15) |
SUB rsp, pad |
lea r4, [r1*4] |
lea r5, [r1*3] ; 3*stride |
add r4, r0 ; pix+4*stride |
mov r6, 2 |
mova m0, [pw_2] |
shl r2d, 2 |
shl r3d, 2 |
.loop: |
movu q3, [r0-8] |
movu q2, [r0+r1-8] |
movu q1, [r0+r1*2-8] |
movu q0, [r0+r5-8] |
movu p0, [r4-8] |
movu p1, [r4+r1-8] |
movu p2, [r4+r1*2-8] |
movu p3, [r4+r5-8] |
TRANSPOSE8x8W 5, 8, 9, 10, 11, 12, 13, 4, 1 |
LOAD_AB m1, m2, r2d, r3d |
LOAD_MASK q1, q0, p0, p1, m1, m2, m3, t0, t1 |
psrlw m1, 2 |
paddw m1, m0 ; alpha/4+2 |
DIFF_LT p0, q0, m1, m6, t0 ; m6 = |p0-q0| < alpha/4+2 |
DIFF_LT q2, q0, m2, t1, t0 ; t1 = |q2-q0| < beta |
DIFF_LT p0, p2, m2, m7, t0 ; m7 = |p2-p0| < beta |
pand m6, m3 |
pand m7, m6 |
pand m6, t1 |
mova spill, q3 |
LUMA_INTRA_P012 q0, q1, q2, q3, p0, p1, m3, m6, m0, m5, m1, q2 |
LUMA_INTRA_P012 p0, p1, p2, p3, q0, q1, m3, m7, m0, p0, m6, p2 |
mova m7, spill |
LUMA_H_INTRA_STORE 7, 8, 1, 5, 11, 6, 13, 4, 14 |
lea r0, [r0+r1*8] |
lea r4, [r4+r1*8] |
dec r6 |
jg .loop |
ADD rsp, pad |
RET |
%endmacro |
INIT_XMM sse2 |
DEBLOCK_LUMA_INTRA_64 |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
DEBLOCK_LUMA_INTRA_64 |
%endif |
%endif |
%macro DEBLOCK_LUMA_INTRA 0 |
;----------------------------------------------------------------------------- |
; void deblock_v_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) |
;----------------------------------------------------------------------------- |
cglobal deblock_v_luma_intra_10, 4,7,8*(mmsize/16) |
LUMA_INTRA_INIT 3 |
lea r4, [r1*4] |
lea r5, [r1*3] |
neg r4 |
add r4, r0 |
mov r6, 32/mmsize |
shl r2d, 2 |
shl r3d, 2 |
.loop: |
mova m0, [r4+r1*2] ; p1 |
mova m1, [r4+r5] ; p0 |
mova m2, [r0] ; q0 |
mova m3, [r0+r1] ; q1 |
LUMA_INTRA_INTER t4, t5, t6, [r4+r1], [r0+r1*2] |
LUMA_INTRA_P012 m1, m0, t3, [r4], m2, m3, t5, t4, [pw_2], [r4+r5], [r4+2*r1], [r4+r1] |
mova t3, [r0+r1*2] ; q2 |
LUMA_INTRA_P012 m2, m3, t3, [r0+r5], m1, m0, t5, t6, [pw_2], [r0], [r0+r1], [r0+2*r1] |
add r0, mmsize |
add r4, mmsize |
dec r6 |
jg .loop |
ADD rsp, pad |
RET |
;----------------------------------------------------------------------------- |
; void deblock_h_luma_intra( uint16_t *pix, int stride, int alpha, int beta ) |
;----------------------------------------------------------------------------- |
cglobal deblock_h_luma_intra_10, 4,7,8*(mmsize/16) |
LUMA_INTRA_INIT 8 |
%if mmsize == 8 |
lea r4, [r1*3] |
mov r5, 32/mmsize |
%else |
lea r4, [r1*4] |
lea r5, [r1*3] ; 3*stride |
add r4, r0 ; pix+4*stride |
mov r6, 32/mmsize |
%endif |
shl r2d, 2 |
shl r3d, 2 |
.loop: |
LUMA_H_INTRA_LOAD |
LUMA_INTRA_INTER t8, t9, t10, t5, t6 |
LUMA_INTRA_P012 m1, m0, t3, t4, m2, m3, t9, t8, [pw_2], t8, t5, t11 |
mova t3, t6 ; q2 |
LUMA_INTRA_P012 m2, m3, t3, t7, m1, m0, t9, t10, [pw_2], m4, t6, m5 |
mova m2, t4 |
mova m0, t11 |
mova m1, t5 |
mova m3, t8 |
mova m6, t6 |
LUMA_H_INTRA_STORE 2, 0, 1, 3, 4, 6, 5, t7, 7 |
lea r0, [r0+r1*(mmsize/2)] |
%if mmsize == 8 |
dec r5 |
%else |
lea r4, [r4+r1*(mmsize/2)] |
dec r6 |
%endif |
jg .loop |
ADD rsp, pad |
RET |
%endmacro |
%if ARCH_X86_64 == 0 |
INIT_MMX mmxext |
DEBLOCK_LUMA |
DEBLOCK_LUMA_INTRA |
INIT_XMM sse2 |
DEBLOCK_LUMA |
DEBLOCK_LUMA_INTRA |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
DEBLOCK_LUMA |
DEBLOCK_LUMA_INTRA |
%endif |
%endif |
; in: %1=p0, %2=q0, %3=p1, %4=q1, %5=mask, %6=tmp, %7=tmp |
; out: %1=p0', %2=q0' |
%macro CHROMA_DEBLOCK_P0_Q0_INTRA 7 |
mova %6, [pw_2] |
paddw %6, %3 |
paddw %6, %4 |
paddw %7, %6, %2 |
paddw %6, %1 |
paddw %6, %3 |
paddw %7, %4 |
psraw %6, 2 |
psraw %7, 2 |
psubw %6, %1 |
psubw %7, %2 |
pand %6, %5 |
pand %7, %5 |
paddw %1, %6 |
paddw %2, %7 |
%endmacro |
%macro CHROMA_V_LOAD 1 |
mova m0, [r0] ; p1 |
mova m1, [r0+r1] ; p0 |
mova m2, [%1] ; q0 |
mova m3, [%1+r1] ; q1 |
%endmacro |
%macro CHROMA_V_STORE 0 |
mova [r0+1*r1], m1 |
mova [r0+2*r1], m2 |
%endmacro |
%macro CHROMA_V_LOAD_TC 2 |
movd %1, [%2] |
punpcklbw %1, %1 |
punpcklwd %1, %1 |
psraw %1, 6 |
%endmacro |
%macro DEBLOCK_CHROMA 0 |
;----------------------------------------------------------------------------- |
; void deblock_v_chroma( uint16_t *pix, int stride, int alpha, int beta, int8_t *tc0 ) |
;----------------------------------------------------------------------------- |
cglobal deblock_v_chroma_10, 5,7-(mmsize/16),8*(mmsize/16) |
mov r5, r0 |
sub r0, r1 |
sub r0, r1 |
shl r2d, 2 |
shl r3d, 2 |
%if mmsize < 16 |
mov r6, 16/mmsize |
.loop: |
%endif |
CHROMA_V_LOAD r5 |
LOAD_AB m4, m5, r2d, r3d |
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 |
pxor m4, m4 |
CHROMA_V_LOAD_TC m6, r4 |
psubw m6, [pw_3] |
pmaxsw m6, m4 |
pand m7, m6 |
DEBLOCK_P0_Q0 m1, m2, m0, m3, m7, m5, m6 |
CHROMA_V_STORE |
%if mmsize < 16 |
add r0, mmsize |
add r5, mmsize |
add r4, mmsize/4 |
dec r6 |
jg .loop |
REP_RET |
%else |
RET |
%endif |
;----------------------------------------------------------------------------- |
; void deblock_v_chroma_intra( uint16_t *pix, int stride, int alpha, int beta ) |
;----------------------------------------------------------------------------- |
cglobal deblock_v_chroma_intra_10, 4,6-(mmsize/16),8*(mmsize/16) |
mov r4, r0 |
sub r0, r1 |
sub r0, r1 |
shl r2d, 2 |
shl r3d, 2 |
%if mmsize < 16 |
mov r5, 16/mmsize |
.loop: |
%endif |
CHROMA_V_LOAD r4 |
LOAD_AB m4, m5, r2d, r3d |
LOAD_MASK m0, m1, m2, m3, m4, m5, m7, m6, m4 |
CHROMA_DEBLOCK_P0_Q0_INTRA m1, m2, m0, m3, m7, m5, m6 |
CHROMA_V_STORE |
%if mmsize < 16 |
add r0, mmsize |
add r4, mmsize |
dec r5 |
jg .loop |
REP_RET |
%else |
RET |
%endif |
%endmacro |
%if ARCH_X86_64 == 0 |
INIT_MMX mmxext |
DEBLOCK_CHROMA |
%endif |
INIT_XMM sse2 |
DEBLOCK_CHROMA |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
DEBLOCK_CHROMA |
%endif |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_i386.h |
---|
0,0 → 1,204 |
/* |
* H.26L/H.264/AVC/JVT/14496-10/... encoder/decoder |
* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
/** |
* @file |
* H.264 / AVC / MPEG4 part10 codec. |
* non-MMX i386-specific optimizations for H.264 |
* @author Michael Niedermayer <michaelni@gmx.at> |
*/ |
#ifndef AVCODEC_X86_H264_I386_H |
#define AVCODEC_X86_H264_I386_H |
#include <stddef.h> |
#include "libavcodec/cabac.h" |
#include "cabac.h" |
#if HAVE_INLINE_ASM |
//FIXME use some macros to avoid duplicating get_cabac (cannot be done yet, |
//as that would make the compiler's optimization work harder) |
#if HAVE_7REGS |
#define decode_significance decode_significance_x86 |
static int decode_significance_x86(CABACContext *c, int max_coeff, |
uint8_t *significant_coeff_ctx_base, |
int *index, x86_reg last_off){ |
void *end= significant_coeff_ctx_base + max_coeff - 1; |
int minusstart= -(intptr_t)significant_coeff_ctx_base; |
int minusindex= 4-(intptr_t)index; |
int bit; |
x86_reg coeff_count; |
#ifdef BROKEN_RELOCATIONS |
void *tables; |
__asm__ volatile( |
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t" |
: "=&r"(tables) |
); |
#endif |
__asm__ volatile( |
"3: \n\t" |
BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3", |
"%5", "%q5", "%k0", "%b0", |
"%c11(%6)", "%c12(%6)", |
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET), |
AV_STRINGIFY(H264_LPS_RANGE_OFFSET), |
AV_STRINGIFY(H264_MLPS_STATE_OFFSET), |
"%13") |
"test $1, %4 \n\t" |
" jz 4f \n\t" |
"add %10, %1 \n\t" |
BRANCHLESS_GET_CABAC("%4", "%q4", "(%1)", "%3", "%w3", |
"%5", "%q5", "%k0", "%b0", |
"%c11(%6)", "%c12(%6)", |
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET), |
AV_STRINGIFY(H264_LPS_RANGE_OFFSET), |
AV_STRINGIFY(H264_MLPS_STATE_OFFSET), |
"%13") |
"sub %10, %1 \n\t" |
"mov %2, %0 \n\t" |
"movl %7, %%ecx \n\t" |
"add %1, %%"REG_c" \n\t" |
"movl %%ecx, (%0) \n\t" |
"test $1, %4 \n\t" |
" jnz 5f \n\t" |
"add"OPSIZE" $4, %2 \n\t" |
"4: \n\t" |
"add $1, %1 \n\t" |
"cmp %8, %1 \n\t" |
" jb 3b \n\t" |
"mov %2, %0 \n\t" |
"movl %7, %%ecx \n\t" |
"add %1, %%"REG_c" \n\t" |
"movl %%ecx, (%0) \n\t" |
"5: \n\t" |
"add %9, %k0 \n\t" |
"shr $2, %k0 \n\t" |
: "=&q"(coeff_count), "+r"(significant_coeff_ctx_base), "+m"(index), |
"+&r"(c->low), "=&r"(bit), "+&r"(c->range) |
: "r"(c), "m"(minusstart), "m"(end), "m"(minusindex), "m"(last_off), |
"i"(offsetof(CABACContext, bytestream)), |
"i"(offsetof(CABACContext, bytestream_end)) |
TABLES_ARG |
: "%"REG_c, "memory" |
); |
return coeff_count; |
} |
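/* A plain-C sketch of the loop the inline asm above implements (variable |
 * names ours; the actual context tables live in ff_h264_cabac_tables): |
 * |
 *     int coeff_count = 0, i = 0; |
 *     while (i < max_coeff - 1) { |
 *         if (get_cabac(c, significant_coeff_ctx_base + i)) { |
 *             index[coeff_count++] = i; |
 *             if (get_cabac(c, significant_coeff_ctx_base + last_off + i)) |
 *                 return coeff_count;           // i was the last coefficient |
 *         } |
 *         i++; |
 *     } |
 *     index[coeff_count++] = max_coeff - 1;     // implicit last coefficient |
 *     return coeff_count; |
 */ |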
#define decode_significance_8x8 decode_significance_8x8_x86 |
static int decode_significance_8x8_x86(CABACContext *c, |
uint8_t *significant_coeff_ctx_base, |
int *index, uint8_t *last_coeff_ctx_base, const uint8_t *sig_off){ |
int minusindex= 4-(intptr_t)index; |
int bit; |
x86_reg coeff_count; |
x86_reg last=0; |
x86_reg state; |
#ifdef BROKEN_RELOCATIONS |
void *tables; |
__asm__ volatile( |
"lea "MANGLE(ff_h264_cabac_tables)", %0 \n\t" |
: "=&r"(tables) |
); |
#endif |
__asm__ volatile( |
"mov %1, %6 \n\t" |
"3: \n\t" |
"mov %10, %0 \n\t" |
"movzbl (%0, %6), %k6 \n\t" |
"add %9, %6 \n\t" |
BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3", |
"%5", "%q5", "%k0", "%b0", |
"%c12(%7)", "%c13(%7)", |
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET), |
AV_STRINGIFY(H264_LPS_RANGE_OFFSET), |
AV_STRINGIFY(H264_MLPS_STATE_OFFSET), |
"%15") |
"mov %1, %k6 \n\t" |
"test $1, %4 \n\t" |
" jz 4f \n\t" |
#ifdef BROKEN_RELOCATIONS |
"movzbl %c14(%15, %q6), %k6\n\t" |
#else |
"movzbl "MANGLE(ff_h264_cabac_tables)"+%c14(%k6), %k6\n\t" |
#endif |
"add %11, %6 \n\t" |
BRANCHLESS_GET_CABAC("%4", "%q4", "(%6)", "%3", "%w3", |
"%5", "%q5", "%k0", "%b0", |
"%c12(%7)", "%c13(%7)", |
AV_STRINGIFY(H264_NORM_SHIFT_OFFSET), |
AV_STRINGIFY(H264_LPS_RANGE_OFFSET), |
AV_STRINGIFY(H264_MLPS_STATE_OFFSET), |
"%15") |
"mov %2, %0 \n\t" |
"mov %1, %k6 \n\t" |
"movl %k6, (%0) \n\t" |
"test $1, %4 \n\t" |
" jnz 5f \n\t" |
"add"OPSIZE" $4, %2 \n\t" |
"4: \n\t" |
"addl $1, %k6 \n\t" |
"mov %k6, %1 \n\t" |
"cmpl $63, %k6 \n\t" |
" jb 3b \n\t" |
"mov %2, %0 \n\t" |
"movl %k6, (%0) \n\t" |
"5: \n\t" |
"addl %8, %k0 \n\t" |
"shr $2, %k0 \n\t" |
: "=&q"(coeff_count), "+m"(last), "+m"(index), "+&r"(c->low), |
"=&r"(bit), "+&r"(c->range), "=&r"(state) |
: "r"(c), "m"(minusindex), "m"(significant_coeff_ctx_base), |
"m"(sig_off), "m"(last_coeff_ctx_base), |
"i"(offsetof(CABACContext, bytestream)), |
"i"(offsetof(CABACContext, bytestream_end)), |
"i"(H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET) TABLES_ARG |
: "%"REG_c, "memory" |
); |
return coeff_count; |
} |
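/* Same idea as decode_significance_x86 above, except that for 8x8 blocks the |
 * significance context index is looked up per coefficient through sig_off[] |
 * and the "last coefficient" contexts come from a separate mapping table |
 * (H264_LAST_COEFF_FLAG_OFFSET_8x8_OFFSET), with the loop fixed at 63 steps. |
 */ |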
#endif /* HAVE_7REGS */ |
#endif /* HAVE_INLINE_ASM */ |
#endif /* AVCODEC_X86_H264_I386_H */ |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_idct.asm |
---|
0,0 → 1,1082 |
;***************************************************************************** |
;* MMX/SSE2-optimized H.264 iDCT |
;***************************************************************************** |
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt |
;* Copyright (C) 2003-2008 x264 project |
;* |
;* Authors: Laurent Aimar <fenrir@via.ecp.fr> |
;* Loren Merritt <lorenm@u.washington.edu> |
;* Holger Lubitz <hal@duncan.ol.sub.de> |
;* Min Chen <chenm001@163.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;***************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
scan8_mem: db 4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8 |
db 6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8 |
db 4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8 |
db 6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8 |
db 4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8 |
db 6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8 |
db 4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8 |
db 6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8 |
db 4+11*8, 5+11*8, 4+12*8, 5+12*8 |
db 6+11*8, 7+11*8, 6+12*8, 7+12*8 |
db 4+13*8, 5+13*8, 4+14*8, 5+14*8 |
db 6+13*8, 7+13*8, 6+14*8, 7+14*8 |
%ifdef PIC |
%define npicregs 1 |
%define scan8 picregq |
%else |
%define npicregs 0 |
%define scan8 scan8_mem |
%endif |
cextern pw_32 |
cextern pw_1 |
SECTION .text |
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride |
%macro IDCT4_ADD 3 |
; Load dct coeffs |
movq m0, [%2] |
movq m1, [%2+8] |
movq m2, [%2+16] |
movq m3, [%2+24] |
IDCT4_1D w, 0, 1, 2, 3, 4, 5 |
mova m6, [pw_32] |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |
paddw m0, m6 |
IDCT4_1D w, 0, 1, 2, 3, 4, 5 |
pxor m7, m7 |
movq [%2+ 0], m7 |
movq [%2+ 8], m7 |
movq [%2+16], m7 |
movq [%2+24], m7 |
STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3 |
lea %1, [%1+%3*2] |
STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3 |
%endmacro |
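; IDCT4_ADD = rows -> transpose -> columns -> add: the pw_32 added to the |
; first row between the passes supplies the (+32) rounding for the final |
; >> 6 done in STORE_DIFFx2, and the block is cleared as the decoder |
; requires. Each IDCT4_1D pass is the usual H.264 4-point butterfly; in C |
; (a sketch, names ours): |
;     z0 = a + c;          z1 = a - c; |
;     z2 = (b >> 1) - d;   z3 = b + (d >> 1); |
;     out0 = z0 + z3; out1 = z1 + z2; out2 = z1 - z2; out3 = z0 - z3; |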
INIT_MMX mmx |
; ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride) |
cglobal h264_idct_add_8, 3, 3, 0 |
IDCT4_ADD r0, r1, r2 |
RET |
%macro IDCT8_1D 2 |
mova m0, m1 |
psraw m1, 1 |
mova m4, m5 |
psraw m4, 1 |
paddw m4, m5 |
paddw m1, m0 |
paddw m4, m7 |
paddw m1, m5 |
psubw m4, m0 |
paddw m1, m3 |
psubw m0, m3 |
psubw m5, m3 |
psraw m3, 1 |
paddw m0, m7 |
psubw m5, m7 |
psraw m7, 1 |
psubw m0, m3 |
psubw m5, m7 |
mova m7, m1 |
psraw m1, 2 |
mova m3, m4 |
psraw m3, 2 |
paddw m3, m0 |
psraw m0, 2 |
paddw m1, m5 |
psraw m5, 2 |
psubw m0, m4 |
psubw m7, m5 |
mova m5, m6 |
psraw m6, 1 |
mova m4, m2 |
psraw m4, 1 |
paddw m6, m2 |
psubw m4, m5 |
mova m2, %1 |
mova m5, %2 |
SUMSUB_BA w, 5, 2 |
SUMSUB_BA w, 6, 5 |
SUMSUB_BA w, 4, 2 |
SUMSUB_BA w, 7, 6 |
SUMSUB_BA w, 0, 4 |
SUMSUB_BA w, 3, 2 |
SUMSUB_BA w, 1, 5 |
SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567 |
%endmacro |
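; IDCT8_1D computes one 8-point H.264 IDCT pass. The transform does not fit |
; the eight mm registers, so rows 0 and 4 stay in memory (%1/%2) until the |
; odd part (m1/m3/m5/m7) and the remaining even terms (m2/m6) have been |
; folded; the trailing SWAP only renames the results back to natural order. |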
%macro IDCT8_1D_FULL 1 |
mova m7, [%1+112] |
mova m6, [%1+ 96] |
mova m5, [%1+ 80] |
mova m3, [%1+ 48] |
mova m2, [%1+ 32] |
mova m1, [%1+ 16] |
IDCT8_1D [%1], [%1+ 64] |
%endmacro |
; %1=int16_t *block, %2=int16_t *dstblock |
%macro IDCT8_ADD_MMX_START 2 |
IDCT8_1D_FULL %1 |
mova [%1], m7 |
TRANSPOSE4x4W 0, 1, 2, 3, 7 |
mova m7, [%1] |
mova [%2 ], m0 |
mova [%2+16], m1 |
mova [%2+32], m2 |
mova [%2+48], m3 |
TRANSPOSE4x4W 4, 5, 6, 7, 3 |
mova [%2+ 8], m4 |
mova [%2+24], m5 |
mova [%2+40], m6 |
mova [%2+56], m7 |
%endmacro |
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride |
%macro IDCT8_ADD_MMX_END 3-4 |
IDCT8_1D_FULL %2 |
mova [%2 ], m5 |
mova [%2+16], m6 |
mova [%2+32], m7 |
pxor m7, m7 |
%if %0 == 4 |
movq [%4+ 0], m7 |
movq [%4+ 8], m7 |
movq [%4+ 16], m7 |
movq [%4+ 24], m7 |
movq [%4+ 32], m7 |
movq [%4+ 40], m7 |
movq [%4+ 48], m7 |
movq [%4+ 56], m7 |
movq [%4+ 64], m7 |
movq [%4+ 72], m7 |
movq [%4+ 80], m7 |
movq [%4+ 88], m7 |
movq [%4+ 96], m7 |
movq [%4+104], m7 |
movq [%4+112], m7 |
movq [%4+120], m7 |
%endif |
STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3 |
lea %1, [%1+%3*2] |
STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3 |
mova m0, [%2 ] |
mova m1, [%2+16] |
mova m2, [%2+32] |
lea %1, [%1+%3*2] |
STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3 |
lea %1, [%1+%3*2] |
STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3 |
%endmacro |
INIT_MMX mmx |
; ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride) |
cglobal h264_idct8_add_8, 3, 4, 0 |
%assign pad 128+4-(stack_offset&7) |
SUB rsp, pad |
add word [r1], 32 |
IDCT8_ADD_MMX_START r1 , rsp |
IDCT8_ADD_MMX_START r1+8, rsp+64 |
lea r3, [r0+4] |
IDCT8_ADD_MMX_END r0 , rsp, r2, r1 |
IDCT8_ADD_MMX_END r3 , rsp+8, r2 |
ADD rsp, pad |
RET |
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride |
%macro IDCT8_ADD_SSE 4 |
IDCT8_1D_FULL %2 |
%if ARCH_X86_64 |
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 |
%else |
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16] |
%endif |
paddw m0, [pw_32] |
%if ARCH_X86_64 == 0 |
mova [%2 ], m0 |
mova [%2+16], m4 |
IDCT8_1D [%2], [%2+ 16] |
mova [%2 ], m6 |
mova [%2+16], m7 |
%else |
SWAP 0, 8 |
SWAP 4, 9 |
IDCT8_1D m8, m9 |
SWAP 6, 8 |
SWAP 7, 9 |
%endif |
pxor m7, m7 |
lea %4, [%3*3] |
STORE_DIFF m0, m6, m7, [%1 ] |
STORE_DIFF m1, m6, m7, [%1+%3 ] |
STORE_DIFF m2, m6, m7, [%1+%3*2] |
STORE_DIFF m3, m6, m7, [%1+%4 ] |
%if ARCH_X86_64 == 0 |
mova m0, [%2 ] |
mova m1, [%2+16] |
%else |
SWAP 0, 8 |
SWAP 1, 9 |
%endif |
mova [%2+ 0], m7 |
mova [%2+ 16], m7 |
mova [%2+ 32], m7 |
mova [%2+ 48], m7 |
mova [%2+ 64], m7 |
mova [%2+ 80], m7 |
mova [%2+ 96], m7 |
mova [%2+112], m7 |
lea %1, [%1+%3*4] |
STORE_DIFF m4, m6, m7, [%1 ] |
STORE_DIFF m5, m6, m7, [%1+%3 ] |
STORE_DIFF m0, m6, m7, [%1+%3*2] |
STORE_DIFF m1, m6, m7, [%1+%4 ] |
%endmacro |
INIT_XMM sse2 |
; ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride) |
cglobal h264_idct8_add_8, 3, 4, 10 |
IDCT8_ADD_SSE r0, r1, r2, r3 |
RET |
%macro DC_ADD_MMXEXT_INIT 2 |
add %1, 32 |
sar %1, 6 |
movd m0, %1d |
lea %1, [%2*3] |
pshufw m0, m0, 0 |
pxor m1, m1 |
psubw m1, m0 |
packuswb m0, m0 |
packuswb m1, m1 |
%endmacro |
%macro DC_ADD_MMXEXT_OP 4 |
%1 m2, [%2 ] |
%1 m3, [%2+%3 ] |
%1 m4, [%2+%3*2] |
%1 m5, [%2+%4 ] |
paddusb m2, m0 |
paddusb m3, m0 |
paddusb m4, m0 |
paddusb m5, m0 |
psubusb m2, m1 |
psubusb m3, m1 |
psubusb m4, m1 |
psubusb m5, m1 |
%1 [%2 ], m2 |
%1 [%2+%3 ], m3 |
%1 [%2+%3*2], m4 |
%1 [%2+%4 ], m5 |
%endmacro |
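; There is no packed "add signed byte with unsigned saturation", so the DC |
; term is split across the two macros above: m0 holds max(dc, 0) and m1 holds |
; max(-dc, 0) (packuswb clamps the negated copy to zero), and each pixel gets |
; paddusb m0 / psubusb m1. Per pixel this is equivalent to the C expression |
; (a sketch): |
;     dst[i] = av_clip_uint8(dst[i] + ((dc + 32) >> 6)); |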
INIT_MMX mmxext |
; ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride) |
%if ARCH_X86_64 |
cglobal h264_idct_dc_add_8, 3, 4, 0 |
movsx r3, word [r1] |
mov dword [r1], 0 |
DC_ADD_MMXEXT_INIT r3, r2 |
DC_ADD_MMXEXT_OP movh, r0, r2, r3 |
RET |
; ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride) |
cglobal h264_idct8_dc_add_8, 3, 4, 0 |
movsx r3, word [r1] |
mov dword [r1], 0 |
DC_ADD_MMXEXT_INIT r3, r2 |
DC_ADD_MMXEXT_OP mova, r0, r2, r3 |
lea r0, [r0+r2*4] |
DC_ADD_MMXEXT_OP mova, r0, r2, r3 |
RET |
%else |
; ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride) |
cglobal h264_idct_dc_add_8, 2, 3, 0 |
movsx r2, word [r1] |
mov dword [r1], 0 |
mov r1, r2m |
DC_ADD_MMXEXT_INIT r2, r1 |
DC_ADD_MMXEXT_OP movh, r0, r1, r2 |
RET |
; ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride) |
cglobal h264_idct8_dc_add_8, 2, 3, 0 |
movsx r2, word [r1] |
mov dword [r1], 0 |
mov r1, r2m |
DC_ADD_MMXEXT_INIT r2, r1 |
DC_ADD_MMXEXT_OP mova, r0, r1, r2 |
lea r0, [r0+r1*4] |
DC_ADD_MMXEXT_OP mova, r0, r1, r2 |
RET |
%endif |
INIT_MMX mmx |
; ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset, |
; int16_t *block, int stride, |
; const uint8_t nnzc[6 * 8]) |
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg |
xor r5, r5 |
%ifdef PIC |
lea picregq, [scan8_mem] |
%endif |
.nextblock: |
movzx r6, byte [scan8+r5] |
movzx r6, byte [r4+r6] |
test r6, r6 |
jz .skipblock |
mov r6d, dword [r1+r5*4] |
lea r6, [r0+r6] |
IDCT4_ADD r6, r2, r3 |
.skipblock: |
inc r5 |
add r2, 32 |
cmp r5, 16 |
jl .nextblock |
REP_RET |
; ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset, |
; int16_t *block, int stride, |
; const uint8_t nnzc[6 * 8]) |
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg |
%assign pad 128+4-(stack_offset&7) |
SUB rsp, pad |
xor r5, r5 |
%ifdef PIC |
lea picregq, [scan8_mem] |
%endif |
.nextblock: |
movzx r6, byte [scan8+r5] |
movzx r6, byte [r4+r6] |
test r6, r6 |
jz .skipblock |
mov r6d, dword [r1+r5*4] |
add r6, r0 |
add word [r2], 32 |
IDCT8_ADD_MMX_START r2 , rsp |
IDCT8_ADD_MMX_START r2+8, rsp+64 |
IDCT8_ADD_MMX_END r6 , rsp, r3, r2 |
mov r6d, dword [r1+r5*4] |
lea r6, [r0+r6+4] |
IDCT8_ADD_MMX_END r6 , rsp+8, r3 |
.skipblock: |
add r5, 4 |
add r2, 128 |
cmp r5, 16 |
jl .nextblock |
ADD rsp, pad |
RET |
INIT_MMX mmxext |
; ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset, |
; int16_t *block, int stride, |
; const uint8_t nnzc[6 * 8]) |
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg |
xor r5, r5 |
%ifdef PIC |
lea picregq, [scan8_mem] |
%endif |
.nextblock: |
movzx r6, byte [scan8+r5] |
movzx r6, byte [r4+r6] |
test r6, r6 |
jz .skipblock |
cmp r6, 1 |
jnz .no_dc |
movsx r6, word [r2] |
test r6, r6 |
jz .no_dc |
mov word [r2], 0 |
DC_ADD_MMXEXT_INIT r6, r3 |
%if ARCH_X86_64 == 0 |
%define dst2q r1 |
%define dst2d r1d |
%endif |
mov dst2d, dword [r1+r5*4] |
lea dst2q, [r0+dst2q] |
DC_ADD_MMXEXT_OP movh, dst2q, r3, r6 |
%if ARCH_X86_64 == 0 |
mov r1, r1m |
%endif |
inc r5 |
add r2, 32 |
cmp r5, 16 |
jl .nextblock |
REP_RET |
.no_dc: |
mov r6d, dword [r1+r5*4] |
add r6, r0 |
IDCT4_ADD r6, r2, r3 |
.skipblock: |
inc r5 |
add r2, 32 |
cmp r5, 16 |
jl .nextblock |
REP_RET |
INIT_MMX mmx |
; ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset, |
; int16_t *block, int stride, |
; const uint8_t nnzc[6 * 8]) |
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg |
xor r5, r5 |
%ifdef PIC |
lea picregq, [scan8_mem] |
%endif |
.nextblock: |
movzx r6, byte [scan8+r5] |
movzx r6, byte [r4+r6] |
or r6w, word [r2] |
test r6, r6 |
jz .skipblock |
mov r6d, dword [r1+r5*4] |
add r6, r0 |
IDCT4_ADD r6, r2, r3 |
.skipblock: |
inc r5 |
add r2, 32 |
cmp r5, 16 |
jl .nextblock |
REP_RET |
INIT_MMX mmxext |
; ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset, |
; int16_t *block, int stride, |
; const uint8_t nnzc[6 * 8]) |
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg |
xor r5, r5 |
%ifdef PIC |
lea picregq, [scan8_mem] |
%endif |
.nextblock: |
movzx r6, byte [scan8+r5] |
movzx r6, byte [r4+r6] |
test r6, r6 |
jz .try_dc |
mov r6d, dword [r1+r5*4] |
lea r6, [r0+r6] |
IDCT4_ADD r6, r2, r3 |
inc r5 |
add r2, 32 |
cmp r5, 16 |
jl .nextblock |
REP_RET |
.try_dc: |
movsx r6, word [r2] |
test r6, r6 |
jz .skipblock |
mov word [r2], 0 |
DC_ADD_MMXEXT_INIT r6, r3 |
%if ARCH_X86_64 == 0 |
%define dst2q r1 |
%define dst2d r1d |
%endif |
mov dst2d, dword [r1+r5*4] |
add dst2q, r0 |
DC_ADD_MMXEXT_OP movh, dst2q, r3, r6 |
%if ARCH_X86_64 == 0 |
mov r1, r1m |
%endif |
.skipblock: |
inc r5 |
add r2, 32 |
cmp r5, 16 |
jl .nextblock |
REP_RET |
; ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset, |
; int16_t *block, int stride, |
; const uint8_t nnzc[6 * 8]) |
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg |
%assign pad 128+4-(stack_offset&7) |
SUB rsp, pad |
xor r5, r5 |
%ifdef PIC |
lea picregq, [scan8_mem] |
%endif |
.nextblock: |
movzx r6, byte [scan8+r5] |
movzx r6, byte [r4+r6] |
test r6, r6 |
jz .skipblock |
cmp r6, 1 |
jnz .no_dc |
movsx r6, word [r2] |
test r6, r6 |
jz .no_dc |
mov word [r2], 0 |
DC_ADD_MMXEXT_INIT r6, r3 |
%if ARCH_X86_64 == 0 |
%define dst2q r1 |
%define dst2d r1d |
%endif |
mov dst2d, dword [r1+r5*4] |
lea dst2q, [r0+dst2q] |
DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 |
lea dst2q, [dst2q+r3*4] |
DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 |
%if ARCH_X86_64 == 0 |
mov r1, r1m |
%endif |
add r5, 4 |
add r2, 128 |
cmp r5, 16 |
jl .nextblock |
ADD rsp, pad |
RET |
.no_dc: |
mov r6d, dword [r1+r5*4] |
add r6, r0 |
add word [r2], 32 |
IDCT8_ADD_MMX_START r2 , rsp |
IDCT8_ADD_MMX_START r2+8, rsp+64 |
IDCT8_ADD_MMX_END r6 , rsp, r3, r2 |
mov r6d, dword [r1+r5*4] |
lea r6, [r0+r6+4] |
IDCT8_ADD_MMX_END r6 , rsp+8, r3 |
.skipblock: |
add r5, 4 |
add r2, 128 |
cmp r5, 16 |
jl .nextblock |
ADD rsp, pad |
RET |
INIT_XMM sse2 |
; ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset, |
; int16_t *block, int stride, |
; const uint8_t nnzc[6 * 8]) |
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg |
xor r5, r5 |
%ifdef PIC |
lea picregq, [scan8_mem] |
%endif |
.nextblock: |
movzx r6, byte [scan8+r5] |
movzx r6, byte [r4+r6] |
test r6, r6 |
jz .skipblock |
cmp r6, 1 |
jnz .no_dc |
movsx r6, word [r2] |
test r6, r6 |
jz .no_dc |
INIT_MMX cpuname |
mov word [r2], 0 |
DC_ADD_MMXEXT_INIT r6, r3 |
%if ARCH_X86_64 == 0 |
%define dst2q r1 |
%define dst2d r1d |
%endif |
mov dst2d, dword [r1+r5*4] |
add dst2q, r0 |
DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 |
lea dst2q, [dst2q+r3*4] |
DC_ADD_MMXEXT_OP mova, dst2q, r3, r6 |
%if ARCH_X86_64 == 0 |
mov r1, r1m |
%endif |
add r5, 4 |
add r2, 128 |
cmp r5, 16 |
jl .nextblock |
REP_RET |
.no_dc: |
INIT_XMM cpuname |
mov dst2d, dword [r1+r5*4] |
add dst2q, r0 |
IDCT8_ADD_SSE dst2q, r2, r3, r6 |
%if ARCH_X86_64 == 0 |
mov r1, r1m |
%endif |
.skipblock: |
add r5, 4 |
add r2, 128 |
cmp r5, 16 |
jl .nextblock |
REP_RET |
INIT_MMX mmx |
h264_idct_add8_mmx_plane: |
.nextblock: |
movzx r6, byte [scan8+r5] |
movzx r6, byte [r4+r6] |
or r6w, word [r2] |
test r6, r6 |
jz .skipblock |
%if ARCH_X86_64 |
mov r0d, dword [r1+r5*4] |
add r0, [dst2q] |
%else |
mov r0, r1m ; XXX r1m here is actually r0m of the calling func |
mov r0, [r0] |
add r0, dword [r1+r5*4] |
%endif |
IDCT4_ADD r0, r2, r3 |
.skipblock: |
inc r5 |
add r2, 32 |
test r5, 3 |
jnz .nextblock |
rep ret |
; ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset, |
; int16_t *block, int stride, const uint8_t nnzc[6 * 8]) |
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg |
mov r5, 16 |
add r2, 512 |
%ifdef PIC |
lea picregq, [scan8_mem] |
%endif |
%if ARCH_X86_64 |
mov dst2q, r0 |
%endif |
call h264_idct_add8_mmx_plane |
mov r5, 32 |
add r2, 384 |
%if ARCH_X86_64 |
add dst2q, gprsize |
%else |
add r0mp, gprsize |
%endif |
call h264_idct_add8_mmx_plane |
RET |
h264_idct_add8_mmxext_plane: |
.nextblock: |
movzx r6, byte [scan8+r5] |
movzx r6, byte [r4+r6] |
test r6, r6 |
jz .try_dc |
%if ARCH_X86_64 |
mov r0d, dword [r1+r5*4] |
add r0, [dst2q] |
%else |
mov r0, r1m ; XXX r1m here is actually r0m of the calling func |
mov r0, [r0] |
add r0, dword [r1+r5*4] |
%endif |
IDCT4_ADD r0, r2, r3 |
inc r5 |
add r2, 32 |
test r5, 3 |
jnz .nextblock |
rep ret |
.try_dc: |
movsx r6, word [r2] |
test r6, r6 |
jz .skipblock |
mov word [r2], 0 |
DC_ADD_MMXEXT_INIT r6, r3 |
%if ARCH_X86_64 |
mov r0d, dword [r1+r5*4] |
add r0, [dst2q] |
%else |
mov r0, r1m ; XXX r1m here is actually r0m of the calling func |
mov r0, [r0] |
add r0, dword [r1+r5*4] |
%endif |
DC_ADD_MMXEXT_OP movh, r0, r3, r6 |
.skipblock: |
inc r5 |
add r2, 32 |
test r5, 3 |
jnz .nextblock |
rep ret |
INIT_MMX mmxext |
; ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset, |
; int16_t *block, int stride, |
; const uint8_t nnzc[6 * 8]) |
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg |
mov r5, 16 |
add r2, 512 |
%if ARCH_X86_64 |
mov dst2q, r0 |
%endif |
%ifdef PIC |
lea picregq, [scan8_mem] |
%endif |
call h264_idct_add8_mmxext_plane |
mov r5, 32 |
add r2, 384 |
%if ARCH_X86_64 |
add dst2q, gprsize |
%else |
add r0mp, gprsize |
%endif |
call h264_idct_add8_mmxext_plane |
RET |
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered |
h264_idct_dc_add8_mmxext: |
movd m0, [r2 ] ; 0 0 X D |
mov word [r2+ 0], 0 |
punpcklwd m0, [r2+32] ; x X d D |
mov word [r2+32], 0 |
paddsw m0, [pw_32] |
psraw m0, 6 |
punpcklwd m0, m0 ; d d D D |
pxor m1, m1 ; 0 0 0 0 |
psubw m1, m0 ; -d-d-D-D |
packuswb m0, m1 ; -d-d-D-D d d D D |
pshufw m1, m0, 0xFA ; -d-d-d-d-D-D-D-D |
punpcklwd m0, m0 ; d d d d D D D D |
lea r6, [r3*3] |
DC_ADD_MMXEXT_OP movq, r0, r3, r6 |
ret |
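; Handles the DC-only case for two horizontally adjacent 4x4 blocks at once: |
; the two DC levels (block[0] and block[32]) are rounded, broadcast into the |
; low/high halves of the byte registers and applied over the full 8x4 area. |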
ALIGN 16 |
INIT_XMM sse2 |
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride |
h264_add8x4_idct_sse2: |
movq m0, [r2+ 0] |
movq m1, [r2+ 8] |
movq m2, [r2+16] |
movq m3, [r2+24] |
movhps m0, [r2+32] |
movhps m1, [r2+40] |
movhps m2, [r2+48] |
movhps m3, [r2+56] |
IDCT4_1D w,0,1,2,3,4,5 |
TRANSPOSE2x4x4W 0,1,2,3,4 |
paddw m0, [pw_32] |
IDCT4_1D w,0,1,2,3,4,5 |
pxor m7, m7 |
mova [r2+ 0], m7 |
mova [r2+16], m7 |
mova [r2+32], m7 |
mova [r2+48], m7 |
STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3 |
lea r0, [r0+r3*2] |
STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3 |
ret |
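; Two consecutive 4x4 blocks (block and block+32) are packed into the low and |
; high halves of each xmm register, so a single IDCT4_1D/transpose/IDCT4_1D |
; pass transforms both blocks and stores them as one 8x4 area. |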
%macro add16_sse2_cycle 2 |
movzx r0, word [r4+%2] |
test r0, r0 |
jz .cycle%1end |
mov r0d, dword [r1+%1*8] |
%if ARCH_X86_64 |
add r0, r5 |
%else |
add r0, r0m |
%endif |
call h264_add8x4_idct_sse2 |
.cycle%1end: |
%if %1 < 7 |
add r2, 64 |
%endif |
%endmacro |
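; The %2 immediates passed below (0xc, 0x14, ...) are byte offsets into the |
; scan8-ordered nnzc table; loading a 16-bit word tests the non-zero flags of |
; both blocks of a pair with a single read. |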
; ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset, |
; int16_t *block, int stride, |
; const uint8_t nnzc[6 * 8]) |
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8 |
%if ARCH_X86_64 |
mov r5, r0 |
%endif |
; unrolling of the loop leads to an average performance gain of |
; 20-25% |
add16_sse2_cycle 0, 0xc |
add16_sse2_cycle 1, 0x14 |
add16_sse2_cycle 2, 0xe |
add16_sse2_cycle 3, 0x16 |
add16_sse2_cycle 4, 0x1c |
add16_sse2_cycle 5, 0x24 |
add16_sse2_cycle 6, 0x1e |
add16_sse2_cycle 7, 0x26 |
RET |
%macro add16intra_sse2_cycle 2 |
movzx r0, word [r4+%2] |
test r0, r0 |
jz .try%1dc |
mov r0d, dword [r1+%1*8] |
%if ARCH_X86_64 |
add r0, r7 |
%else |
add r0, r0m |
%endif |
call h264_add8x4_idct_sse2 |
jmp .cycle%1end |
.try%1dc: |
movsx r0, word [r2 ] |
or r0w, word [r2+32] |
jz .cycle%1end |
mov r0d, dword [r1+%1*8] |
%if ARCH_X86_64 |
add r0, r7 |
%else |
add r0, r0m |
%endif |
call h264_idct_dc_add8_mmxext |
.cycle%1end: |
%if %1 < 7 |
add r2, 64 |
%endif |
%endmacro |
; ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset, |
; int16_t *block, int stride, |
; const uint8_t nnzc[6 * 8]) |
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8 |
%if ARCH_X86_64 |
mov r7, r0 |
%endif |
add16intra_sse2_cycle 0, 0xc |
add16intra_sse2_cycle 1, 0x14 |
add16intra_sse2_cycle 2, 0xe |
add16intra_sse2_cycle 3, 0x16 |
add16intra_sse2_cycle 4, 0x1c |
add16intra_sse2_cycle 5, 0x24 |
add16intra_sse2_cycle 6, 0x1e |
add16intra_sse2_cycle 7, 0x26 |
RET |
%macro add8_sse2_cycle 2 |
movzx r0, word [r4+%2] |
test r0, r0 |
jz .try%1dc |
%if ARCH_X86_64 |
mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))] |
add r0, [r7] |
%else |
mov r0, r0m |
mov r0, [r0] |
add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))] |
%endif |
call h264_add8x4_idct_sse2 |
jmp .cycle%1end |
.try%1dc: |
movsx r0, word [r2 ] |
or r0w, word [r2+32] |
jz .cycle%1end |
%if ARCH_X86_64 |
mov r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))] |
add r0, [r7] |
%else |
mov r0, r0m |
mov r0, [r0] |
add r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))] |
%endif |
call h264_idct_dc_add8_mmxext |
.cycle%1end: |
%if %1 == 1 |
add r2, 384+64 |
%elif %1 < 3 |
add r2, 64 |
%endif |
%endmacro |
; ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset, |
; int16_t *block, int stride, |
; const uint8_t nnzc[6 * 8]) |
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8 |
add r2, 512 |
%if ARCH_X86_64 |
mov r7, r0 |
%endif |
add8_sse2_cycle 0, 0x34 |
add8_sse2_cycle 1, 0x3c |
%if ARCH_X86_64 |
add r7, gprsize |
%else |
add r0mp, gprsize |
%endif |
add8_sse2_cycle 2, 0x5c |
add8_sse2_cycle 3, 0x64 |
RET |
; void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul) |
%macro WALSH4_1D 5 |
SUMSUB_BADC w, %4, %3, %2, %1, %5 |
SUMSUB_BADC w, %4, %2, %3, %1, %5 |
SWAP %1, %4, %3 |
%endmacro |
%macro DEQUANT_MMX 3 |
mova m7, [pw_1] |
mova m4, %1 |
punpcklwd %1, m7 |
punpckhwd m4, m7 |
mova m5, %2 |
punpcklwd %2, m7 |
punpckhwd m5, m7 |
movd m7, t3d |
punpckldq m7, m7 |
pmaddwd %1, m7 |
pmaddwd %2, m7 |
pmaddwd m4, m7 |
pmaddwd m5, m7 |
psrad %1, %3 |
psrad %2, %3 |
psrad m4, %3 |
psrad m5, %3 |
packssdw %1, m4 |
packssdw %2, m5 |
%endmacro |
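; DEQUANT_MMX folds the multiply and the rounding into single pmaddwd ops: |
; each coefficient is interleaved with a 1 from pw_1, and the multiplier m7 |
; carries qmul in its low word and 128 in its high word (the caller adds |
; 128 << 16 to t3d), so one pmaddwd yields c * qmul + 128 and the psrad |
; finishes (c * qmul + 128) >> shift. |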
%macro STORE_WORDS 5-9 |
%if cpuflag(sse) |
movd t0d, %1 |
psrldq %1, 4 |
movd t1d, %1 |
psrldq %1, 4 |
mov [t2+%2*32], t0w |
mov [t2+%4*32], t1w |
shr t0d, 16 |
shr t1d, 16 |
mov [t2+%3*32], t0w |
mov [t2+%5*32], t1w |
movd t0d, %1 |
psrldq %1, 4 |
movd t1d, %1 |
mov [t2+%6*32], t0w |
mov [t2+%8*32], t1w |
shr t0d, 16 |
shr t1d, 16 |
mov [t2+%7*32], t0w |
mov [t2+%9*32], t1w |
%else |
movd t0d, %1 |
psrlq %1, 32 |
movd t1d, %1 |
mov [t2+%2*32], t0w |
mov [t2+%4*32], t1w |
shr t0d, 16 |
shr t1d, 16 |
mov [t2+%3*32], t0w |
mov [t2+%5*32], t1w |
%endif |
%endmacro |
%macro DEQUANT_STORE 1 |
%if cpuflag(sse2) |
movd xmm4, t3d |
movq xmm5, [pw_1] |
pshufd xmm4, xmm4, 0 |
movq2dq xmm0, m0 |
movq2dq xmm1, m1 |
movq2dq xmm2, m2 |
movq2dq xmm3, m3 |
punpcklwd xmm0, xmm5 |
punpcklwd xmm1, xmm5 |
punpcklwd xmm2, xmm5 |
punpcklwd xmm3, xmm5 |
pmaddwd xmm0, xmm4 |
pmaddwd xmm1, xmm4 |
pmaddwd xmm2, xmm4 |
pmaddwd xmm3, xmm4 |
psrad xmm0, %1 |
psrad xmm1, %1 |
psrad xmm2, %1 |
psrad xmm3, %1 |
packssdw xmm0, xmm1 |
packssdw xmm2, xmm3 |
STORE_WORDS xmm0, 0, 1, 4, 5, 2, 3, 6, 7 |
STORE_WORDS xmm2, 8, 9, 12, 13, 10, 11, 14, 15 |
%else |
DEQUANT_MMX m0, m1, %1 |
STORE_WORDS m0, 0, 1, 4, 5 |
STORE_WORDS m1, 2, 3, 6, 7 |
DEQUANT_MMX m2, m3, %1 |
STORE_WORDS m2, 8, 9, 12, 13 |
STORE_WORDS m3, 10, 11, 14, 15 |
%endif |
%endmacro |
%macro IDCT_DC_DEQUANT 1 |
cglobal h264_luma_dc_dequant_idct, 3, 4, %1 |
; manually spill XMM registers for Win64 because |
; the code here is initialized with INIT_MMX |
WIN64_SPILL_XMM %1 |
movq m3, [r1+24] |
movq m2, [r1+16] |
movq m1, [r1+ 8] |
movq m0, [r1+ 0] |
WALSH4_1D 0,1,2,3,4 |
TRANSPOSE4x4W 0,1,2,3,4 |
WALSH4_1D 0,1,2,3,4 |
; shift, tmp, output, qmul |
%if WIN64 |
DECLARE_REG_TMP 0,3,1,2 |
; we can't avoid this, because r0 is the shift register (ecx) on win64 |
xchg r0, t2 |
%elif ARCH_X86_64 |
DECLARE_REG_TMP 3,1,0,2 |
%else |
DECLARE_REG_TMP 1,3,0,2 |
%endif |
cmp t3d, 32767 |
jg .big_qmul |
add t3d, 128 << 16 |
DEQUANT_STORE 8 |
RET |
.big_qmul: |
bsr t0d, t3d |
add t3d, 128 << 16 |
mov t1d, 7 |
cmp t0d, t1d |
cmovg t0d, t1d |
inc t1d |
shr t3d, t0b |
sub t1d, t0d |
%if cpuflag(sse2) |
movd xmm6, t1d |
DEQUANT_STORE xmm6 |
%else |
movd m6, t1d |
DEQUANT_STORE m6 |
%endif |
RET |
%endmacro |
INIT_MMX mmx |
IDCT_DC_DEQUANT 0 |
INIT_MMX sse2 |
IDCT_DC_DEQUANT 7 |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_idct_10bit.asm |
---|
0,0 → 1,589 |
;***************************************************************************** |
;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code |
;***************************************************************************** |
;* Copyright (C) 2005-2011 x264 project |
;* |
;* Authors: Daniel Kang <daniel.d.kang@gmail.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
pw_pixel_max: times 8 dw ((1 << 10)-1) |
pd_32: times 4 dd 32 |
SECTION .text |
;----------------------------------------------------------------------------- |
; void h264_idct_add(pixel *dst, dctcoef *block, int stride) |
;----------------------------------------------------------------------------- |
%macro STORE_DIFFx2 6 |
psrad %1, 6 |
psrad %2, 6 |
packssdw %1, %2 |
movq %3, [%5] |
movhps %3, [%5+%6] |
paddsw %1, %3 |
CLIPW %1, %4, [pw_pixel_max] |
movq [%5], %1 |
movhps [%5+%6], %1 |
%endmacro |
%macro STORE_DIFF16 5 |
psrad %1, 6 |
psrad %2, 6 |
packssdw %1, %2 |
paddsw %1, [%5] |
CLIPW %1, %3, %4 |
mova [%5], %1 |
%endmacro |
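; Both store helpers finish the 10-bit path: >> 6 (the rounding was already |
; folded in via pd_32 before the column pass), pack dwords to words, add the |
; prediction from dst and clip the result to [0, pw_pixel_max] with CLIPW. |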
; %1=dst, %2=in, %3=stride |
%macro IDCT4_ADD_10 3 |
mova m0, [%2+ 0] |
mova m1, [%2+16] |
mova m2, [%2+32] |
mova m3, [%2+48] |
IDCT4_1D d,0,1,2,3,4,5 |
TRANSPOSE4x4D 0,1,2,3,4 |
paddd m0, [pd_32] |
IDCT4_1D d,0,1,2,3,4,5 |
pxor m5, m5 |
mova [%2+ 0], m5 |
mova [%2+16], m5 |
mova [%2+32], m5 |
mova [%2+48], m5 |
STORE_DIFFx2 m0, m1, m4, m5, %1, %3 |
lea %1, [%1+%3*2] |
STORE_DIFFx2 m2, m3, m4, m5, %1, %3 |
%endmacro |
%macro IDCT_ADD_10 0 |
cglobal h264_idct_add_10, 3,3 |
IDCT4_ADD_10 r0, r1, r2 |
RET |
%endmacro |
INIT_XMM sse2 |
IDCT_ADD_10 |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
IDCT_ADD_10 |
%endif |
;----------------------------------------------------------------------------- |
; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) |
;----------------------------------------------------------------------------- |
;;;;;;; NO FATE SAMPLES TRIGGER THIS |
%macro ADD4x4IDCT 0 |
add4x4_idct %+ SUFFIX: |
add r5, r0 |
mova m0, [r2+ 0] |
mova m1, [r2+16] |
mova m2, [r2+32] |
mova m3, [r2+48] |
IDCT4_1D d,0,1,2,3,4,5 |
TRANSPOSE4x4D 0,1,2,3,4 |
paddd m0, [pd_32] |
IDCT4_1D d,0,1,2,3,4,5 |
pxor m5, m5 |
mova [r2+ 0], m5 |
mova [r2+16], m5 |
mova [r2+32], m5 |
mova [r2+48], m5 |
STORE_DIFFx2 m0, m1, m4, m5, r5, r3 |
lea r5, [r5+r3*2] |
STORE_DIFFx2 m2, m3, m4, m5, r5, r3 |
ret |
%endmacro |
INIT_XMM sse2 |
ALIGN 16 |
ADD4x4IDCT |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
ALIGN 16 |
ADD4x4IDCT |
%endif |
%macro ADD16_OP 2 |
cmp byte [r4+%2], 0 |
jz .skipblock%1 |
mov r5d, [r1+%1*4] |
call add4x4_idct %+ SUFFIX |
.skipblock%1: |
%if %1<15 |
add r2, 64 |
%endif |
%endmacro |
%macro IDCT_ADD16_10 0 |
cglobal h264_idct_add16_10, 5,6 |
ADD16_OP 0, 4+1*8 |
ADD16_OP 1, 5+1*8 |
ADD16_OP 2, 4+2*8 |
ADD16_OP 3, 5+2*8 |
ADD16_OP 4, 6+1*8 |
ADD16_OP 5, 7+1*8 |
ADD16_OP 6, 6+2*8 |
ADD16_OP 7, 7+2*8 |
ADD16_OP 8, 4+3*8 |
ADD16_OP 9, 5+3*8 |
ADD16_OP 10, 4+4*8 |
ADD16_OP 11, 5+4*8 |
ADD16_OP 12, 6+3*8 |
ADD16_OP 13, 7+3*8 |
ADD16_OP 14, 6+4*8 |
ADD16_OP 15, 7+4*8 |
REP_RET |
%endmacro |
INIT_XMM sse2 |
IDCT_ADD16_10 |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
IDCT_ADD16_10 |
%endif |
;----------------------------------------------------------------------------- |
; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride) |
;----------------------------------------------------------------------------- |
%macro IDCT_DC_ADD_OP_10 3 |
pxor m5, m5 |
%if avx_enabled |
paddw m1, m0, [%1+0 ] |
paddw m2, m0, [%1+%2 ] |
paddw m3, m0, [%1+%2*2] |
paddw m4, m0, [%1+%3 ] |
%else |
mova m1, [%1+0 ] |
mova m2, [%1+%2 ] |
mova m3, [%1+%2*2] |
mova m4, [%1+%3 ] |
paddw m1, m0 |
paddw m2, m0 |
paddw m3, m0 |
paddw m4, m0 |
%endif |
CLIPW m1, m5, m6 |
CLIPW m2, m5, m6 |
CLIPW m3, m5, m6 |
CLIPW m4, m5, m6 |
mova [%1+0 ], m1 |
mova [%1+%2 ], m2 |
mova [%1+%2*2], m3 |
mova [%1+%3 ], m4 |
%endmacro |
INIT_MMX mmxext |
cglobal h264_idct_dc_add_10,3,3 |
movd m0, [r1] |
mov dword [r1], 0 |
paddd m0, [pd_32] |
psrad m0, 6 |
lea r1, [r2*3] |
pshufw m0, m0, 0 |
mova m6, [pw_pixel_max] |
IDCT_DC_ADD_OP_10 r0, r2, r1 |
RET |
;----------------------------------------------------------------------------- |
; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride) |
;----------------------------------------------------------------------------- |
%macro IDCT8_DC_ADD 0 |
cglobal h264_idct8_dc_add_10,3,4,7 |
movd m0, [r1] |
mov dword[r1], 0 |
paddd m0, [pd_32] |
psrad m0, 6 |
lea r1, [r2*3] |
SPLATW m0, m0, 0 |
mova m6, [pw_pixel_max] |
IDCT_DC_ADD_OP_10 r0, r2, r1 |
lea r0, [r0+r2*4] |
IDCT_DC_ADD_OP_10 r0, r2, r1 |
RET |
%endmacro |
INIT_XMM sse2 |
IDCT8_DC_ADD |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
IDCT8_DC_ADD |
%endif |
;----------------------------------------------------------------------------- |
; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) |
;----------------------------------------------------------------------------- |
%macro AC 1 |
.ac%1: |
mov r5d, [r1+(%1+0)*4] |
call add4x4_idct %+ SUFFIX |
mov r5d, [r1+(%1+1)*4] |
add r2, 64 |
call add4x4_idct %+ SUFFIX |
add r2, 64 |
jmp .skipadd%1 |
%endmacro |
%assign last_block 16 |
%macro ADD16_OP_INTRA 2 |
cmp word [r4+%2], 0 |
jnz .ac%1 |
mov r5d, [r2+ 0] |
or r5d, [r2+64] |
jz .skipblock%1 |
mov r5d, [r1+(%1+0)*4] |
call idct_dc_add %+ SUFFIX |
.skipblock%1: |
%if %1<last_block-2 |
add r2, 128 |
%endif |
.skipadd%1: |
%endmacro |
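; ADD16_OP_INTRA handles blocks in pairs: if the nnzc word for the pair is |
; non-zero it takes the full AC path (two add4x4_idct calls), otherwise it |
; or's the first dword of both 4x4 blocks (r2 and r2+64) and, if either DC |
; is non-zero, falls back to the shared idct_dc_add helper defined below. |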
%macro IDCT_ADD16INTRA_10 0 |
idct_dc_add %+ SUFFIX: |
add r5, r0 |
movq m0, [r2+ 0] |
movhps m0, [r2+64] |
mov dword [r2+ 0], 0 |
mov dword [r2+64], 0 |
paddd m0, [pd_32] |
psrad m0, 6 |
pshufhw m0, m0, 0 |
pshuflw m0, m0, 0 |
lea r6, [r3*3] |
mova m6, [pw_pixel_max] |
IDCT_DC_ADD_OP_10 r5, r3, r6 |
ret |
cglobal h264_idct_add16intra_10,5,7,8 |
ADD16_OP_INTRA 0, 4+1*8 |
ADD16_OP_INTRA 2, 4+2*8 |
ADD16_OP_INTRA 4, 6+1*8 |
ADD16_OP_INTRA 6, 6+2*8 |
ADD16_OP_INTRA 8, 4+3*8 |
ADD16_OP_INTRA 10, 4+4*8 |
ADD16_OP_INTRA 12, 6+3*8 |
ADD16_OP_INTRA 14, 6+4*8 |
REP_RET |
AC 8 |
AC 10 |
AC 12 |
AC 14 |
AC 0 |
AC 2 |
AC 4 |
AC 6 |
%endmacro |
INIT_XMM sse2 |
IDCT_ADD16INTRA_10 |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
IDCT_ADD16INTRA_10 |
%endif |
%assign last_block 36 |
;----------------------------------------------------------------------------- |
; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) |
;----------------------------------------------------------------------------- |
%macro IDCT_ADD8 0 |
cglobal h264_idct_add8_10,5,8,7 |
%if ARCH_X86_64 |
mov r7, r0 |
%endif |
add r2, 1024 |
mov r0, [r0] |
ADD16_OP_INTRA 16, 4+ 6*8 |
ADD16_OP_INTRA 18, 4+ 7*8 |
add r2, 1024-128*2 |
%if ARCH_X86_64 |
mov r0, [r7+gprsize] |
%else |
mov r0, r0m |
mov r0, [r0+gprsize] |
%endif |
ADD16_OP_INTRA 32, 4+11*8 |
ADD16_OP_INTRA 34, 4+12*8 |
REP_RET |
AC 16 |
AC 18 |
AC 32 |
AC 34 |
%endmacro ; IDCT_ADD8 |
INIT_XMM sse2 |
IDCT_ADD8 |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
IDCT_ADD8 |
%endif |
;----------------------------------------------------------------------------- |
; void h264_idct8_add(pixel *dst, dctcoef *block, int stride) |
;----------------------------------------------------------------------------- |
%macro IDCT8_1D 2 |
SWAP 0, 1 |
psrad m4, m5, 1 |
psrad m1, m0, 1 |
paddd m4, m5 |
paddd m1, m0 |
paddd m4, m7 |
paddd m1, m5 |
psubd m4, m0 |
paddd m1, m3 |
psubd m0, m3 |
psubd m5, m3 |
paddd m0, m7 |
psubd m5, m7 |
psrad m3, 1 |
psrad m7, 1 |
psubd m0, m3 |
psubd m5, m7 |
SWAP 1, 7 |
psrad m1, m7, 2 |
psrad m3, m4, 2 |
paddd m3, m0 |
psrad m0, 2 |
paddd m1, m5 |
psrad m5, 2 |
psubd m0, m4 |
psubd m7, m5 |
SWAP 5, 6 |
psrad m4, m2, 1 |
psrad m6, m5, 1 |
psubd m4, m5 |
paddd m6, m2 |
mova m2, %1 |
mova m5, %2 |
SUMSUB_BA d, 5, 2 |
SUMSUB_BA d, 6, 5 |
SUMSUB_BA d, 4, 2 |
SUMSUB_BA d, 7, 6 |
SUMSUB_BA d, 0, 4 |
SUMSUB_BA d, 3, 2 |
SUMSUB_BA d, 1, 5 |
SWAP 7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567 |
%endmacro |
%macro IDCT8_1D_FULL 1 |
mova m7, [%1+112*2] |
mova m6, [%1+ 96*2] |
mova m5, [%1+ 80*2] |
mova m3, [%1+ 48*2] |
mova m2, [%1+ 32*2] |
mova m1, [%1+ 16*2] |
IDCT8_1D [%1], [%1+ 64*2] |
%endmacro |
; %1=int16_t *block, %2=int16_t *dstblock |
%macro IDCT8_ADD_SSE_START 2 |
IDCT8_1D_FULL %1 |
%if ARCH_X86_64 |
TRANSPOSE4x4D 0,1,2,3,8 |
mova [%2 ], m0 |
TRANSPOSE4x4D 4,5,6,7,8 |
mova [%2+8*2], m4 |
%else |
mova [%1], m7 |
TRANSPOSE4x4D 0,1,2,3,7 |
mova m7, [%1] |
mova [%2 ], m0 |
mova [%2+16*2], m1 |
mova [%2+32*2], m2 |
mova [%2+48*2], m3 |
TRANSPOSE4x4D 4,5,6,7,3 |
mova [%2+ 8*2], m4 |
mova [%2+24*2], m5 |
mova [%2+40*2], m6 |
mova [%2+56*2], m7 |
%endif |
%endmacro |
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride |
%macro IDCT8_ADD_SSE_END 3 |
IDCT8_1D_FULL %2 |
mova [%2 ], m6 |
mova [%2+16*2], m7 |
pxor m7, m7 |
STORE_DIFFx2 m0, m1, m6, m7, %1, %3 |
lea %1, [%1+%3*2] |
STORE_DIFFx2 m2, m3, m6, m7, %1, %3 |
mova m0, [%2 ] |
mova m1, [%2+16*2] |
lea %1, [%1+%3*2] |
STORE_DIFFx2 m4, m5, m6, m7, %1, %3 |
lea %1, [%1+%3*2] |
STORE_DIFFx2 m0, m1, m6, m7, %1, %3 |
%endmacro |
%macro IDCT8_ADD 0 |
cglobal h264_idct8_add_10, 3,4,16 |
%if UNIX64 == 0 |
%assign pad 16-gprsize-(stack_offset&15) |
sub rsp, pad |
call h264_idct8_add1_10 %+ SUFFIX |
add rsp, pad |
RET |
%endif |
ALIGN 16 |
; TODO: does not need to use the stack |
h264_idct8_add1_10 %+ SUFFIX: |
%assign pad 256+16-gprsize |
sub rsp, pad |
add dword [r1], 32 |
%if ARCH_X86_64 |
IDCT8_ADD_SSE_START r1, rsp |
SWAP 1, 9 |
SWAP 2, 10 |
SWAP 3, 11 |
SWAP 5, 13 |
SWAP 6, 14 |
SWAP 7, 15 |
IDCT8_ADD_SSE_START r1+16, rsp+128 |
PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7 |
IDCT8_1D [rsp], [rsp+128] |
SWAP 0, 8 |
SWAP 1, 9 |
SWAP 2, 10 |
SWAP 3, 11 |
SWAP 4, 12 |
SWAP 5, 13 |
SWAP 6, 14 |
SWAP 7, 15 |
IDCT8_1D [rsp+16], [rsp+144] |
psrad m8, 6 |
psrad m0, 6 |
packssdw m8, m0 |
paddsw m8, [r0] |
pxor m0, m0 |
mova [r1+ 0], m0 |
mova [r1+ 16], m0 |
mova [r1+ 32], m0 |
mova [r1+ 48], m0 |
mova [r1+ 64], m0 |
mova [r1+ 80], m0 |
mova [r1+ 96], m0 |
mova [r1+112], m0 |
mova [r1+128], m0 |
mova [r1+144], m0 |
mova [r1+160], m0 |
mova [r1+176], m0 |
mova [r1+192], m0 |
mova [r1+208], m0 |
mova [r1+224], m0 |
mova [r1+240], m0 |
CLIPW m8, m0, [pw_pixel_max] |
mova [r0], m8 |
mova m8, [pw_pixel_max] |
STORE_DIFF16 m9, m1, m0, m8, r0+r2 |
lea r0, [r0+r2*2] |
STORE_DIFF16 m10, m2, m0, m8, r0 |
STORE_DIFF16 m11, m3, m0, m8, r0+r2 |
lea r0, [r0+r2*2] |
STORE_DIFF16 m12, m4, m0, m8, r0 |
STORE_DIFF16 m13, m5, m0, m8, r0+r2 |
lea r0, [r0+r2*2] |
STORE_DIFF16 m14, m6, m0, m8, r0 |
STORE_DIFF16 m15, m7, m0, m8, r0+r2 |
%else |
IDCT8_ADD_SSE_START r1, rsp |
IDCT8_ADD_SSE_START r1+16, rsp+128 |
lea r3, [r0+8] |
IDCT8_ADD_SSE_END r0, rsp, r2 |
IDCT8_ADD_SSE_END r3, rsp+16, r2 |
mova [r1+ 0], m7 |
mova [r1+ 16], m7 |
mova [r1+ 32], m7 |
mova [r1+ 48], m7 |
mova [r1+ 64], m7 |
mova [r1+ 80], m7 |
mova [r1+ 96], m7 |
mova [r1+112], m7 |
mova [r1+128], m7 |
mova [r1+144], m7 |
mova [r1+160], m7 |
mova [r1+176], m7 |
mova [r1+192], m7 |
mova [r1+208], m7 |
mova [r1+224], m7 |
mova [r1+240], m7 |
%endif ; ARCH_X86_64 |
add rsp, pad |
ret |
%endmacro |
INIT_XMM sse2 |
IDCT8_ADD |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
IDCT8_ADD |
%endif |
;----------------------------------------------------------------------------- |
; h264_idct8_add4(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) |
;----------------------------------------------------------------------------- |
;;;;;;; NO FATE SAMPLES TRIGGER THIS |
%macro IDCT8_ADD4_OP 2 |
cmp byte [r4+%2], 0 |
jz .skipblock%1 |
mov r0d, [r6+%1*4] |
add r0, r5 |
call h264_idct8_add1_10 %+ SUFFIX |
.skipblock%1: |
%if %1<12 |
add r1, 256 |
%endif |
%endmacro |
%macro IDCT8_ADD4 0 |
cglobal h264_idct8_add4_10, 0,7,16 |
%assign pad 16-gprsize-(stack_offset&15) |
SUB rsp, pad |
mov r5, r0mp |
mov r6, r1mp |
mov r1, r2mp |
mov r2d, r3m |
movifnidn r4, r4mp |
IDCT8_ADD4_OP 0, 4+1*8 |
IDCT8_ADD4_OP 4, 6+1*8 |
IDCT8_ADD4_OP 8, 4+3*8 |
IDCT8_ADD4_OP 12, 6+3*8 |
ADD rsp, pad |
RET |
%endmacro ; IDCT8_ADD4 |
INIT_XMM sse2 |
IDCT8_ADD4 |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
IDCT8_ADD4 |
%endif |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_intrapred.asm |
---|
0,0 → 1,2702 |
;****************************************************************************** |
;* H.264 intra prediction asm optimizations |
;* Copyright (c) 2010 Jason Garrett-Glaser |
;* Copyright (c) 2010 Holger Lubitz |
;* Copyright (c) 2010 Loren Merritt |
;* Copyright (c) 2010 Ronald S. Bultje |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
tm_shuf: times 8 db 0x03, 0x80 |
pw_ff00: times 8 dw 0xff00 |
plane_shuf: db -8, -7, -6, -5, -4, -3, -2, -1 |
db 1, 2, 3, 4, 5, 6, 7, 8 |
plane8_shuf: db -4, -3, -2, -1, 0, 0, 0, 0 |
db 1, 2, 3, 4, 0, 0, 0, 0 |
pw_0to7: dw 0, 1, 2, 3, 4, 5, 6, 7 |
pw_1to8: dw 1, 2, 3, 4, 5, 6, 7, 8 |
pw_m8tom1: dw -8, -7, -6, -5, -4, -3, -2, -1 |
pw_m4to4: dw -4, -3, -2, -1, 1, 2, 3, 4 |
SECTION .text |
cextern pb_1 |
cextern pb_3 |
cextern pw_4 |
cextern pw_5 |
cextern pw_8 |
cextern pw_16 |
cextern pw_17 |
cextern pw_32 |
;----------------------------------------------------------------------------- |
; void pred16x16_vertical_8(uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
INIT_MMX mmx |
cglobal pred16x16_vertical_8, 2,3 |
sub r0, r1 |
mov r2, 8 |
movq mm0, [r0+0] |
movq mm1, [r0+8] |
.loop: |
movq [r0+r1*1+0], mm0 |
movq [r0+r1*1+8], mm1 |
movq [r0+r1*2+0], mm0 |
movq [r0+r1*2+8], mm1 |
lea r0, [r0+r1*2] |
dec r2 |
jg .loop |
REP_RET |
INIT_XMM sse |
cglobal pred16x16_vertical_8, 2,3 |
sub r0, r1 |
mov r2, 4 |
movaps xmm0, [r0] |
.loop: |
movaps [r0+r1*1], xmm0 |
movaps [r0+r1*2], xmm0 |
lea r0, [r0+r1*2] |
movaps [r0+r1*1], xmm0 |
movaps [r0+r1*2], xmm0 |
lea r0, [r0+r1*2] |
dec r2 |
jg .loop |
REP_RET |
;----------------------------------------------------------------------------- |
; void pred16x16_horizontal_8(uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED16x16_H 0 |
cglobal pred16x16_horizontal_8, 2,3 |
mov r2, 8 |
%if cpuflag(ssse3) |
mova m2, [pb_3] |
%endif |
.loop: |
movd m0, [r0+r1*0-4] |
movd m1, [r0+r1*1-4] |
%if cpuflag(ssse3) |
pshufb m0, m2 |
pshufb m1, m2 |
%else |
punpcklbw m0, m0 |
punpcklbw m1, m1 |
SPLATW m0, m0, 3 |
SPLATW m1, m1, 3 |
mova [r0+r1*0+8], m0 |
mova [r0+r1*1+8], m1 |
%endif |
mova [r0+r1*0], m0 |
mova [r0+r1*1], m1 |
lea r0, [r0+r1*2] |
dec r2 |
jg .loop |
REP_RET |
%endmacro |
INIT_MMX mmx |
PRED16x16_H |
INIT_MMX mmxext |
PRED16x16_H |
INIT_XMM ssse3 |
PRED16x16_H |
;----------------------------------------------------------------------------- |
; void pred16x16_dc_8(uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED16x16_DC 0 |
cglobal pred16x16_dc_8, 2,7 |
mov r4, r0 |
sub r0, r1 |
pxor mm0, mm0 |
pxor mm1, mm1 |
psadbw mm0, [r0+0] |
psadbw mm1, [r0+8] |
dec r0 |
movzx r5d, byte [r0+r1*1] |
paddw mm0, mm1 |
movd r6d, mm0 |
lea r0, [r0+r1*2] |
%rep 7 |
movzx r2d, byte [r0+r1*0] |
movzx r3d, byte [r0+r1*1] |
add r5d, r2d |
add r6d, r3d |
lea r0, [r0+r1*2] |
%endrep |
movzx r2d, byte [r0+r1*0] |
add r5d, r6d |
lea r2d, [r2+r5+16] |
shr r2d, 5 |
%if cpuflag(ssse3) |
pxor m1, m1 |
%endif |
SPLATB_REG m0, r2, m1 |
%if mmsize==8 |
mov r3d, 8 |
.loop: |
mova [r4+r1*0+0], m0 |
mova [r4+r1*0+8], m0 |
mova [r4+r1*1+0], m0 |
mova [r4+r1*1+8], m0 |
%else |
mov r3d, 4 |
.loop: |
mova [r4+r1*0], m0 |
mova [r4+r1*1], m0 |
lea r4, [r4+r1*2] |
mova [r4+r1*0], m0 |
mova [r4+r1*1], m0 |
%endif |
lea r4, [r4+r1*2] |
dec r3d |
jg .loop |
REP_RET |
%endmacro |
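; DC prediction: psadbw sums the 16 pixels of the top row in two 8-pixel |
; halves while the scalar loop accumulates the 16 left-column pixels, then |
; the rounded average (sum + 16) >> 5 is broadcast with SPLATB_REG and |
; written out. |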
INIT_MMX mmxext |
PRED16x16_DC |
INIT_XMM sse2 |
PRED16x16_DC |
INIT_XMM ssse3 |
PRED16x16_DC |
;----------------------------------------------------------------------------- |
; void pred16x16_tm_vp8_8(uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED16x16_TM 0 |
cglobal pred16x16_tm_vp8_8, 2,5 |
sub r0, r1 |
pxor mm7, mm7 |
movq mm0, [r0+0] |
movq mm2, [r0+8] |
movq mm1, mm0 |
movq mm3, mm2 |
punpcklbw mm0, mm7 |
punpckhbw mm1, mm7 |
punpcklbw mm2, mm7 |
punpckhbw mm3, mm7 |
movzx r3d, byte [r0-1] |
mov r4d, 16 |
.loop: |
movzx r2d, byte [r0+r1-1] |
sub r2d, r3d |
movd mm4, r2d |
SPLATW mm4, mm4, 0 |
movq mm5, mm4 |
movq mm6, mm4 |
movq mm7, mm4 |
paddw mm4, mm0 |
paddw mm5, mm1 |
paddw mm6, mm2 |
paddw mm7, mm3 |
packuswb mm4, mm5 |
packuswb mm6, mm7 |
movq [r0+r1+0], mm4 |
movq [r0+r1+8], mm6 |
add r0, r1 |
dec r4d |
jg .loop |
REP_RET |
%endmacro |
INIT_MMX mmx |
PRED16x16_TM |
INIT_MMX mmxext |
PRED16x16_TM |
INIT_XMM sse2 |
cglobal pred16x16_tm_vp8_8, 2,6,6 |
sub r0, r1 |
pxor xmm2, xmm2 |
movdqa xmm0, [r0] |
movdqa xmm1, xmm0 |
punpcklbw xmm0, xmm2 |
punpckhbw xmm1, xmm2 |
movzx r4d, byte [r0-1] |
mov r5d, 8 |
.loop: |
movzx r2d, byte [r0+r1*1-1] |
movzx r3d, byte [r0+r1*2-1] |
sub r2d, r4d |
sub r3d, r4d |
movd xmm2, r2d |
movd xmm4, r3d |
pshuflw xmm2, xmm2, 0 |
pshuflw xmm4, xmm4, 0 |
punpcklqdq xmm2, xmm2 |
punpcklqdq xmm4, xmm4 |
movdqa xmm3, xmm2 |
movdqa xmm5, xmm4 |
paddw xmm2, xmm0 |
paddw xmm3, xmm1 |
paddw xmm4, xmm0 |
paddw xmm5, xmm1 |
packuswb xmm2, xmm3 |
packuswb xmm4, xmm5 |
movdqa [r0+r1*1], xmm2 |
movdqa [r0+r1*2], xmm4 |
lea r0, [r0+r1*2] |
dec r5d |
jg .loop |
REP_RET |
;----------------------------------------------------------------------------- |
; void pred16x16_plane_*_8(uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro H264_PRED16x16_PLANE 1 |
cglobal pred16x16_plane_%1_8, 2,9,7 |
mov r2, r1 ; +stride |
neg r1 ; -stride |
movh m0, [r0+r1 -1] |
%if mmsize == 8 |
pxor m4, m4 |
movh m1, [r0+r1 +3 ] |
movh m2, [r0+r1 +8 ] |
movh m3, [r0+r1 +12] |
punpcklbw m0, m4 |
punpcklbw m1, m4 |
punpcklbw m2, m4 |
punpcklbw m3, m4 |
pmullw m0, [pw_m8tom1 ] |
pmullw m1, [pw_m8tom1+8] |
pmullw m2, [pw_1to8 ] |
pmullw m3, [pw_1to8 +8] |
paddw m0, m2 |
paddw m1, m3 |
%else ; mmsize == 16 |
%if cpuflag(ssse3) |
movhps m0, [r0+r1 +8] |
pmaddubsw m0, [plane_shuf] ; H coefficients |
%else ; sse2 |
pxor m2, m2 |
movh m1, [r0+r1 +8] |
punpcklbw m0, m2 |
punpcklbw m1, m2 |
pmullw m0, [pw_m8tom1] |
pmullw m1, [pw_1to8] |
paddw m0, m1 |
%endif |
movhlps m1, m0 |
%endif |
paddw m0, m1 |
%if cpuflag(mmxext) |
PSHUFLW m1, m0, 0xE |
%elif cpuflag(mmx) |
mova m1, m0 |
psrlq m1, 32 |
%endif |
paddw m0, m1 |
%if cpuflag(mmxext) |
PSHUFLW m1, m0, 0x1 |
%elif cpuflag(mmx) |
mova m1, m0 |
psrlq m1, 16 |
%endif |
paddw m0, m1 ; sum of H coefficients |
lea r4, [r0+r2*8-1] |
lea r3, [r0+r2*4-1] |
add r4, r2 |
%if ARCH_X86_64 |
%define e_reg r8 |
%else |
%define e_reg r0 |
%endif |
movzx e_reg, byte [r3+r2*2 ] |
movzx r5, byte [r4+r1 ] |
sub r5, e_reg |
movzx e_reg, byte [r3+r2 ] |
movzx r6, byte [r4 ] |
sub r6, e_reg |
lea r5, [r5+r6*2] |
movzx e_reg, byte [r3+r1 ] |
movzx r6, byte [r4+r2*2 ] |
sub r6, e_reg |
lea r5, [r5+r6*4] |
movzx e_reg, byte [r3 ] |
%if ARCH_X86_64 |
movzx r7, byte [r4+r2 ] |
sub r7, e_reg |
%else |
movzx r6, byte [r4+r2 ] |
sub r6, e_reg |
lea r5, [r5+r6*4] |
sub r5, r6 |
%endif |
lea e_reg, [r3+r1*4] |
lea r3, [r4+r2*4] |
movzx r4, byte [e_reg+r2 ] |
movzx r6, byte [r3 ] |
sub r6, r4 |
%if ARCH_X86_64 |
lea r6, [r7+r6*2] |
lea r5, [r5+r6*2] |
add r5, r6 |
%else |
lea r5, [r5+r6*4] |
lea r5, [r5+r6*2] |
%endif |
movzx r4, byte [e_reg ] |
%if ARCH_X86_64 |
movzx r7, byte [r3 +r2 ] |
sub r7, r4 |
sub r5, r7 |
%else |
movzx r6, byte [r3 +r2 ] |
sub r6, r4 |
lea r5, [r5+r6*8] |
sub r5, r6 |
%endif |
movzx r4, byte [e_reg+r1 ] |
movzx r6, byte [r3 +r2*2] |
sub r6, r4 |
%if ARCH_X86_64 |
add r6, r7 |
%endif |
lea r5, [r5+r6*8] |
movzx r4, byte [e_reg+r2*2] |
movzx r6, byte [r3 +r1 ] |
sub r6, r4 |
lea r5, [r5+r6*4] |
add r5, r6 ; sum of V coefficients |
%if ARCH_X86_64 == 0 |
mov r0, r0m |
%endif |
%ifidn %1, h264 |
lea r5, [r5*5+32] |
sar r5, 6 |
%elifidn %1, rv40 |
lea r5, [r5*5] |
sar r5, 6 |
%elifidn %1, svq3 |
test r5, r5 |
lea r6, [r5+3] |
cmovs r5, r6 |
sar r5, 2 ; V/4 |
lea r5, [r5*5] ; 5*(V/4) |
test r5, r5 |
lea r6, [r5+15] |
cmovs r5, r6 |
sar r5, 4 ; (5*(V/4))/16 |
%endif |
movzx r4, byte [r0+r1 +15] |
movzx r3, byte [r3+r2*2 ] |
lea r3, [r3+r4+1] |
shl r3, 4 |
movd r1d, m0 |
movsx r1d, r1w |
%ifnidn %1, svq3 |
%ifidn %1, h264 |
lea r1d, [r1d*5+32] |
%else ; rv40 |
lea r1d, [r1d*5] |
%endif |
sar r1d, 6 |
%else ; svq3 |
test r1d, r1d |
lea r4d, [r1d+3] |
cmovs r1d, r4d |
sar r1d, 2 ; H/4 |
lea r1d, [r1d*5] ; 5*(H/4) |
test r1d, r1d |
lea r4d, [r1d+15] |
cmovs r1d, r4d |
sar r1d, 4 ; (5*(H/4))/16 |
%endif |
movd m0, r1d |
add r1d, r5d |
add r3d, r1d |
shl r1d, 3 |
sub r3d, r1d ; a |
movd m1, r5d |
movd m3, r3d |
SPLATW m0, m0, 0 ; H |
SPLATW m1, m1, 0 ; V |
SPLATW m3, m3, 0 ; a |
%ifidn %1, svq3 |
SWAP 0, 1 |
%endif |
mova m2, m0 |
%if mmsize == 8 |
mova m5, m0 |
%endif |
pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) |
%if mmsize == 16 |
psllw m2, 3 |
%else |
psllw m5, 3 |
psllw m2, 2 |
mova m6, m5 |
paddw m6, m2 |
%endif |
paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H |
paddw m2, m0 ; a + {8,9,10,11,12,13,14,15}*H |
%if mmsize == 8 |
paddw m5, m0 ; a + {8,9,10,11}*H |
paddw m6, m0 ; a + {12,13,14,15}*H |
%endif |
mov r4, 8 |
.loop: |
mova m3, m0 ; b[0..7] |
mova m4, m2 ; b[8..15] |
psraw m3, 5 |
psraw m4, 5 |
packuswb m3, m4 |
mova [r0], m3 |
%if mmsize == 8 |
mova m3, m5 ; b[8..11] |
mova m4, m6 ; b[12..15] |
psraw m3, 5 |
psraw m4, 5 |
packuswb m3, m4 |
mova [r0+8], m3 |
%endif |
paddw m0, m1 |
paddw m2, m1 |
%if mmsize == 8 |
paddw m5, m1 |
paddw m6, m1 |
%endif |
mova m3, m0 ; b[0..7] |
mova m4, m2 ; b[8..15] |
psraw m3, 5 |
psraw m4, 5 |
packuswb m3, m4 |
mova [r0+r2], m3 |
%if mmsize == 8 |
mova m3, m5 ; b[8..11] |
mova m4, m6 ; b[12..15] |
psraw m3, 5 |
psraw m4, 5 |
packuswb m3, m4 |
mova [r0+r2+8], m3 |
%endif |
paddw m0, m1 |
paddw m2, m1 |
%if mmsize == 8 |
paddw m5, m1 |
paddw m6, m1 |
%endif |
lea r0, [r0+r2*2] |
dec r4 |
jg .loop |
REP_RET |
%endmacro |
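; All three plane predictors implement the same per-pixel expression; a C |
; sketch (b and c are the scaled H/V gradients from the %ifidn branches, and |
; the +16 rounding bias is folded into the constant a computed above): |
;     for (y = 0; y < 16; y++) |
;         for (x = 0; x < 16; x++) |
;             src[y*stride + x] = av_clip_uint8((a + b*(x-7) + c*(y-7) + 16) >> 5); |
; The vector loop keeps a + {0..7}*b and a + {8..15}*b in registers and adds |
; c once per row, so each row costs only a couple of shifts, packs and stores. |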
INIT_MMX mmx |
H264_PRED16x16_PLANE h264 |
H264_PRED16x16_PLANE rv40 |
H264_PRED16x16_PLANE svq3 |
INIT_MMX mmxext |
H264_PRED16x16_PLANE h264 |
H264_PRED16x16_PLANE rv40 |
H264_PRED16x16_PLANE svq3 |
INIT_XMM sse2 |
H264_PRED16x16_PLANE h264 |
H264_PRED16x16_PLANE rv40 |
H264_PRED16x16_PLANE svq3 |
INIT_XMM ssse3 |
H264_PRED16x16_PLANE h264 |
H264_PRED16x16_PLANE rv40 |
H264_PRED16x16_PLANE svq3 |
;----------------------------------------------------------------------------- |
; void pred8x8_plane_8(uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro H264_PRED8x8_PLANE 0 |
cglobal pred8x8_plane_8, 2,9,7 |
mov r2, r1 ; +stride |
neg r1 ; -stride |
movd m0, [r0+r1 -1] |
%if mmsize == 8 |
pxor m2, m2 |
movh m1, [r0+r1 +4 ] |
punpcklbw m0, m2 |
punpcklbw m1, m2 |
pmullw m0, [pw_m4to4] |
pmullw m1, [pw_m4to4+8] |
%else ; mmsize == 16 |
%if cpuflag(ssse3) |
movhps m0, [r0+r1 +4] ; this reads 4 bytes more than necessary |
pmaddubsw m0, [plane8_shuf] ; H coefficients |
%else ; sse2 |
pxor m2, m2 |
movd m1, [r0+r1 +4] |
punpckldq m0, m1 |
punpcklbw m0, m2 |
pmullw m0, [pw_m4to4] |
%endif |
movhlps m1, m0 |
%endif |
paddw m0, m1 |
%if notcpuflag(ssse3) |
%if cpuflag(mmxext) |
PSHUFLW m1, m0, 0xE |
%elif cpuflag(mmx) |
mova m1, m0 |
psrlq m1, 32 |
%endif |
paddw m0, m1 |
%endif ; !ssse3 |
%if cpuflag(mmxext) |
PSHUFLW m1, m0, 0x1 |
%elif cpuflag(mmx) |
mova m1, m0 |
psrlq m1, 16 |
%endif |
paddw m0, m1 ; sum of H coefficients |
lea r4, [r0+r2*4-1] |
lea r3, [r0 -1] |
add r4, r2 |
%if ARCH_X86_64 |
%define e_reg r8 |
%else |
%define e_reg r0 |
%endif |
movzx e_reg, byte [r3+r2*2 ] |
movzx r5, byte [r4+r1 ] |
sub r5, e_reg |
movzx e_reg, byte [r3 ] |
%if ARCH_X86_64 |
movzx r7, byte [r4+r2 ] |
sub r7, e_reg |
sub r5, r7 |
%else |
movzx r6, byte [r4+r2 ] |
sub r6, e_reg |
lea r5, [r5+r6*4] |
sub r5, r6 |
%endif |
movzx e_reg, byte [r3+r1 ] |
movzx r6, byte [r4+r2*2 ] |
sub r6, e_reg |
%if ARCH_X86_64 |
add r6, r7 |
%endif |
lea r5, [r5+r6*4] |
movzx e_reg, byte [r3+r2 ] |
movzx r6, byte [r4 ] |
sub r6, e_reg |
lea r6, [r5+r6*2] |
lea r5, [r6*9+16] |
lea r5, [r5+r6*8] |
sar r5, 5 |
%if ARCH_X86_64 == 0 |
mov r0, r0m |
%endif |
movzx r3, byte [r4+r2*2 ] |
movzx r4, byte [r0+r1 +7] |
lea r3, [r3+r4+1] |
shl r3, 4 |
movd r1d, m0 |
movsx r1d, r1w |
imul r1d, 17 |
add r1d, 16 |
sar r1d, 5 |
movd m0, r1d |
add r1d, r5d |
sub r3d, r1d |
add r1d, r1d |
sub r3d, r1d ; a |
movd m1, r5d |
movd m3, r3d |
SPLATW m0, m0, 0 ; H |
SPLATW m1, m1, 0 ; V |
SPLATW m3, m3, 0 ; a |
%if mmsize == 8 |
mova m2, m0 |
%endif |
pmullw m0, [pw_0to7] ; 0*H, 1*H, ..., 7*H (words) |
paddw m0, m3 ; a + {0,1,2,3,4,5,6,7}*H |
%if mmsize == 8 |
psllw m2, 2 |
paddw m2, m0 ; a + {4,5,6,7}*H |
%endif |
mov r4, 4 |
ALIGN 16 |
.loop: |
%if mmsize == 16 |
mova m3, m0 ; b[0..7] |
paddw m0, m1 |
psraw m3, 5 |
mova m4, m0 ; V+b[0..7] |
paddw m0, m1 |
psraw m4, 5 |
packuswb m3, m4 |
movh [r0], m3 |
movhps [r0+r2], m3 |
%else ; mmsize == 8 |
mova m3, m0 ; b[0..3] |
mova m4, m2 ; b[4..7] |
paddw m0, m1 |
paddw m2, m1 |
psraw m3, 5 |
psraw m4, 5 |
mova m5, m0 ; V+b[0..3] |
mova m6, m2 ; V+b[4..7] |
paddw m0, m1 |
paddw m2, m1 |
psraw m5, 5 |
psraw m6, 5 |
packuswb m3, m4 |
packuswb m5, m6 |
mova [r0], m3 |
mova [r0+r2], m5 |
%endif |
lea r0, [r0+r2*2] |
dec r4 |
jg .loop |
REP_RET |
%endmacro |
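; Same construction as pred16x16_plane above, scaled down to 8x8: here the |
; gradients use the (17*sum + 16) >> 5 weighting and the bias term becomes |
; a = 16*(top_right + bottom_left + 1) - 3*(b + c). |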
INIT_MMX mmx |
H264_PRED8x8_PLANE |
INIT_MMX mmxext |
H264_PRED8x8_PLANE |
INIT_XMM sse2 |
H264_PRED8x8_PLANE |
INIT_XMM ssse3 |
H264_PRED8x8_PLANE |
;----------------------------------------------------------------------------- |
; void pred8x8_vertical_8(uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
INIT_MMX mmx |
cglobal pred8x8_vertical_8, 2,2 |
sub r0, r1 |
movq mm0, [r0] |
%rep 3 |
movq [r0+r1*1], mm0 |
movq [r0+r1*2], mm0 |
lea r0, [r0+r1*2] |
%endrep |
movq [r0+r1*1], mm0 |
movq [r0+r1*2], mm0 |
RET |
;----------------------------------------------------------------------------- |
; void pred8x8_horizontal_8(uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED8x8_H 0 |
cglobal pred8x8_horizontal_8, 2,3 |
mov r2, 4 |
%if cpuflag(ssse3) |
mova m2, [pb_3] |
%endif |
.loop: |
SPLATB_LOAD m0, r0+r1*0-1, m2 |
SPLATB_LOAD m1, r0+r1*1-1, m2 |
mova [r0+r1*0], m0 |
mova [r0+r1*1], m1 |
lea r0, [r0+r1*2] |
dec r2 |
jg .loop |
REP_RET |
%endmacro |
INIT_MMX mmx |
PRED8x8_H |
INIT_MMX mmxext |
PRED8x8_H |
INIT_MMX ssse3 |
PRED8x8_H |
;----------------------------------------------------------------------------- |
; void pred8x8_top_dc_8_mmxext(uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
INIT_MMX mmxext |
cglobal pred8x8_top_dc_8, 2,5 |
sub r0, r1 |
movq mm0, [r0] |
pxor mm1, mm1 |
pxor mm2, mm2 |
lea r2, [r0+r1*2] |
punpckhbw mm1, mm0 |
punpcklbw mm0, mm2 |
psadbw mm1, mm2 ; s1 |
lea r3, [r2+r1*2] |
psadbw mm0, mm2 ; s0 |
psrlw mm1, 1 |
psrlw mm0, 1 |
pavgw mm1, mm2 |
lea r4, [r3+r1*2] |
pavgw mm0, mm2 |
pshufw mm1, mm1, 0 |
pshufw mm0, mm0, 0 ; dc0 (w) |
packuswb mm0, mm1 ; dc0,dc1 (b) |
movq [r0+r1*1], mm0 |
movq [r0+r1*2], mm0 |
lea r0, [r3+r1*2] |
movq [r2+r1*1], mm0 |
movq [r2+r1*2], mm0 |
movq [r3+r1*1], mm0 |
movq [r3+r1*2], mm0 |
movq [r0+r1*1], mm0 |
movq [r0+r1*2], mm0 |
RET |
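; note: psadbw against zero sums each group of four top bytes; the |
; psrlw 1 / pavgw 0 pair then computes (sum + 2) >> 2, i.e. the rounded |
; per-half DC, without needing a wider divide. |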
;----------------------------------------------------------------------------- |
; void pred8x8_dc_8_mmxext(uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
INIT_MMX mmxext |
cglobal pred8x8_dc_8, 2,5 |
sub r0, r1 |
pxor m7, m7 |
movd m0, [r0+0] |
movd m1, [r0+4] |
psadbw m0, m7 ; s0 |
mov r4, r0 |
psadbw m1, m7 ; s1 |
movzx r2d, byte [r0+r1*1-1] |
movzx r3d, byte [r0+r1*2-1] |
lea r0, [r0+r1*2] |
add r2d, r3d |
movzx r3d, byte [r0+r1*1-1] |
add r2d, r3d |
movzx r3d, byte [r0+r1*2-1] |
add r2d, r3d |
lea r0, [r0+r1*2] |
movd m2, r2d ; s2 |
movzx r2d, byte [r0+r1*1-1] |
movzx r3d, byte [r0+r1*2-1] |
lea r0, [r0+r1*2] |
add r2d, r3d |
movzx r3d, byte [r0+r1*1-1] |
add r2d, r3d |
movzx r3d, byte [r0+r1*2-1] |
add r2d, r3d |
movd m3, r2d ; s3 |
punpcklwd m0, m1 |
mov r0, r4 |
punpcklwd m2, m3 |
punpckldq m0, m2 ; s0, s1, s2, s3 |
pshufw m3, m0, 11110110b ; s2, s1, s3, s3 |
lea r2, [r0+r1*2] |
pshufw m0, m0, 01110100b ; s0, s1, s3, s1 |
paddw m0, m3 |
lea r3, [r2+r1*2] |
psrlw m0, 2 |
pavgw m0, m7 ; s0+s2, s1, s3, s1+s3 |
lea r4, [r3+r1*2] |
packuswb m0, m0 |
punpcklbw m0, m0 |
movq m1, m0 |
punpcklbw m0, m0 |
punpckhbw m1, m1 |
movq [r0+r1*1], m0 |
movq [r0+r1*2], m0 |
movq [r2+r1*1], m0 |
movq [r2+r1*2], m0 |
movq [r3+r1*1], m1 |
movq [r3+r1*2], m1 |
movq [r4+r1*1], m1 |
movq [r4+r1*2], m1 |
RET |
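; note: s0/s1 are the two 4-pixel top sums and s2/s3 the two 4-pixel |
; left sums; the shuffle/add/shift sequence forms the four quadrant DCs |
; (top-left from s0+s2, top-right from s1 alone, bottom-left from s3 |
; alone, bottom-right from s1+s3), matching the H.264 chroma DC rule. |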
;----------------------------------------------------------------------------- |
; void pred8x8_dc_rv40_8(uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
INIT_MMX mmxext |
cglobal pred8x8_dc_rv40_8, 2,7 |
mov r4, r0 |
sub r0, r1 |
pxor mm0, mm0 |
psadbw mm0, [r0] |
dec r0 |
movzx r5d, byte [r0+r1*1] |
movd r6d, mm0 |
lea r0, [r0+r1*2] |
%rep 3 |
movzx r2d, byte [r0+r1*0] |
movzx r3d, byte [r0+r1*1] |
add r5d, r2d |
add r6d, r3d |
lea r0, [r0+r1*2] |
%endrep |
movzx r2d, byte [r0+r1*0] |
add r5d, r6d |
lea r2d, [r2+r5+8] |
shr r2d, 4 |
movd mm0, r2d |
punpcklbw mm0, mm0 |
pshufw mm0, mm0, 0 |
mov r3d, 4 |
.loop: |
movq [r4+r1*0], mm0 |
movq [r4+r1*1], mm0 |
lea r4, [r4+r1*2] |
dec r3d |
jg .loop |
REP_RET |
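; note: unlike the H.264 rule above, RV40 folds all sixteen edge samples |
; into a single DC: r5d/r6d accumulate the eight left-edge bytes on top |
; of the psadbw sum of the top row, and (sum + 8) >> 4 is broadcast to |
; the whole block. |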
;----------------------------------------------------------------------------- |
; void pred8x8_tm_vp8_8(uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED8x8_TM 0 |
cglobal pred8x8_tm_vp8_8, 2,6 |
sub r0, r1 |
pxor mm7, mm7 |
movq mm0, [r0] |
movq mm1, mm0 |
punpcklbw mm0, mm7 |
punpckhbw mm1, mm7 |
movzx r4d, byte [r0-1] |
mov r5d, 4 |
.loop: |
movzx r2d, byte [r0+r1*1-1] |
movzx r3d, byte [r0+r1*2-1] |
sub r2d, r4d |
sub r3d, r4d |
movd mm2, r2d |
movd mm4, r3d |
SPLATW mm2, mm2, 0 |
SPLATW mm4, mm4, 0 |
movq mm3, mm2 |
movq mm5, mm4 |
paddw mm2, mm0 |
paddw mm3, mm1 |
paddw mm4, mm0 |
paddw mm5, mm1 |
packuswb mm2, mm3 |
packuswb mm4, mm5 |
movq [r0+r1*1], mm2 |
movq [r0+r1*2], mm4 |
lea r0, [r0+r1*2] |
dec r5d |
jg .loop |
REP_RET |
%endmacro |
INIT_MMX mmx |
PRED8x8_TM |
INIT_MMX mmxext |
PRED8x8_TM |
INIT_XMM sse2 |
cglobal pred8x8_tm_vp8_8, 2,6,4 |
sub r0, r1 |
pxor xmm1, xmm1 |
movq xmm0, [r0] |
punpcklbw xmm0, xmm1 |
movzx r4d, byte [r0-1] |
mov r5d, 4 |
.loop: |
movzx r2d, byte [r0+r1*1-1] |
movzx r3d, byte [r0+r1*2-1] |
sub r2d, r4d |
sub r3d, r4d |
movd xmm2, r2d |
movd xmm3, r3d |
pshuflw xmm2, xmm2, 0 |
pshuflw xmm3, xmm3, 0 |
punpcklqdq xmm2, xmm2 |
punpcklqdq xmm3, xmm3 |
paddw xmm2, xmm0 |
paddw xmm3, xmm0 |
packuswb xmm2, xmm3 |
movq [r0+r1*1], xmm2 |
movhps [r0+r1*2], xmm2 |
lea r0, [r0+r1*2] |
dec r5d |
jg .loop |
REP_RET |
INIT_XMM ssse3 |
cglobal pred8x8_tm_vp8_8, 2,3,6 |
sub r0, r1 |
movdqa xmm4, [tm_shuf] |
pxor xmm1, xmm1 |
movq xmm0, [r0] |
punpcklbw xmm0, xmm1 |
movd xmm5, [r0-4] |
pshufb xmm5, xmm4 |
mov r2d, 4 |
.loop: |
movd xmm2, [r0+r1*1-4] |
movd xmm3, [r0+r1*2-4] |
pshufb xmm2, xmm4 |
pshufb xmm3, xmm4 |
psubw xmm2, xmm5 |
psubw xmm3, xmm5 |
paddw xmm2, xmm0 |
paddw xmm3, xmm0 |
packuswb xmm2, xmm3 |
movq [r0+r1*1], xmm2 |
movhps [r0+r1*2], xmm2 |
lea r0, [r0+r1*2] |
dec r2d |
jg .loop |
REP_RET |
; dest, left, right, src, tmp |
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 |
%macro PRED4x4_LOWPASS 5 |
mova %5, %2 |
pavgb %2, %3 |
pxor %3, %5 |
mova %1, %4 |
pand %3, [pb_1] |
psubusb %2, %3 |
pavgb %1, %2 |
%endmacro |
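; note: the byte version relies on the identity |
;   (l + 2*c + r + 2) >> 2 == avg(c, avg(l, r) - ((l ^ r) & 1)) |
; where avg() is the pavgb rounding average (a + b + 1) >> 1; the |
; pxor/pand pair above computes the ((l ^ r) & 1) correction term. |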
;----------------------------------------------------------------------------- |
; void pred8x8l_top_dc_8(uint8_t *src, int has_topleft, int has_topright, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED8x8L_TOP_DC 0 |
cglobal pred8x8l_top_dc_8, 4,4 |
sub r0, r3 |
pxor mm7, mm7 |
movq mm0, [r0-8] |
movq mm3, [r0] |
movq mm1, [r0+8] |
movq mm2, mm3 |
movq mm4, mm3 |
PALIGNR mm2, mm0, 7, mm0 |
PALIGNR mm1, mm4, 1, mm4 |
test r1, r1 ; top_left |
jz .fix_lt_2 |
test r2, r2 ; top_right |
jz .fix_tr_1 |
jmp .body |
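; the fix-up blocks below patch unavailable neighbours in register: |
; .fix_lt_2 keeps only byte 0 of (top ^ shifted-top) and xors it back, |
; replacing the missing top-left sample with top[0]; .fix_tr_1 does the |
; same with byte 7, replacing the missing top-right with top[7]. The |
; same pattern recurs throughout the pred8x8l_* functions further down. |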
.fix_lt_2: |
movq mm5, mm3 |
pxor mm5, mm2 |
psllq mm5, 56 |
psrlq mm5, 56 |
pxor mm2, mm5 |
test r2, r2 ; top_right |
jnz .body |
.fix_tr_1: |
movq mm5, mm3 |
pxor mm5, mm1 |
psrlq mm5, 56 |
psllq mm5, 56 |
pxor mm1, mm5 |
.body: |
PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 |
psadbw mm7, mm0 |
paddw mm7, [pw_4] |
psrlw mm7, 3 |
pshufw mm7, mm7, 0 |
packuswb mm7, mm7 |
%rep 3 |
movq [r0+r3*1], mm7 |
movq [r0+r3*2], mm7 |
lea r0, [r0+r3*2] |
%endrep |
movq [r0+r3*1], mm7 |
movq [r0+r3*2], mm7 |
RET |
%endmacro |
INIT_MMX mmxext |
PRED8x8L_TOP_DC |
INIT_MMX ssse3 |
PRED8x8L_TOP_DC |
;----------------------------------------------------------------------------- |
; void pred8x8l_dc_8(uint8_t *src, int has_topleft, int has_topright, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED8x8L_DC 0 |
cglobal pred8x8l_dc_8, 4,5 |
sub r0, r3 |
lea r4, [r0+r3*2] |
movq mm0, [r0+r3*1-8] |
punpckhbw mm0, [r0+r3*0-8] |
movq mm1, [r4+r3*1-8] |
punpckhbw mm1, [r0+r3*2-8] |
mov r4, r0 |
punpckhwd mm1, mm0 |
lea r0, [r0+r3*4] |
movq mm2, [r0+r3*1-8] |
punpckhbw mm2, [r0+r3*0-8] |
lea r0, [r0+r3*2] |
movq mm3, [r0+r3*1-8] |
punpckhbw mm3, [r0+r3*0-8] |
punpckhwd mm3, mm2 |
punpckhdq mm3, mm1 |
lea r0, [r0+r3*2] |
movq mm0, [r0+r3*0-8] |
movq mm1, [r4] |
mov r0, r4 |
movq mm4, mm3 |
movq mm2, mm3 |
PALIGNR mm4, mm0, 7, mm0 |
PALIGNR mm1, mm2, 1, mm2 |
test r1, r1 |
jnz .do_left |
.fix_lt_1: |
movq mm5, mm3 |
pxor mm5, mm4 |
psrlq mm5, 56 |
psllq mm5, 48 |
pxor mm1, mm5 |
jmp .do_left |
.fix_lt_2: |
movq mm5, mm3 |
pxor mm5, mm2 |
psllq mm5, 56 |
psrlq mm5, 56 |
pxor mm2, mm5 |
test r2, r2 |
jnz .body |
.fix_tr_1: |
movq mm5, mm3 |
pxor mm5, mm1 |
psrlq mm5, 56 |
psllq mm5, 56 |
pxor mm1, mm5 |
jmp .body |
.do_left: |
movq mm0, mm4 |
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 |
movq mm4, mm0 |
movq mm7, mm2 |
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 |
psllq mm1, 56 |
PALIGNR mm7, mm1, 7, mm3 |
movq mm0, [r0-8] |
movq mm3, [r0] |
movq mm1, [r0+8] |
movq mm2, mm3 |
movq mm4, mm3 |
PALIGNR mm2, mm0, 7, mm0 |
PALIGNR mm1, mm4, 1, mm4 |
test r1, r1 |
jz .fix_lt_2 |
test r2, r2 |
jz .fix_tr_1 |
.body: |
lea r1, [r0+r3*2] |
PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 |
pxor mm0, mm0 |
pxor mm1, mm1 |
lea r2, [r1+r3*2] |
psadbw mm0, mm7 |
psadbw mm1, mm6 |
paddw mm0, [pw_8] |
paddw mm0, mm1 |
lea r4, [r2+r3*2] |
psrlw mm0, 4 |
pshufw mm0, mm0, 0 |
packuswb mm0, mm0 |
movq [r0+r3*1], mm0 |
movq [r0+r3*2], mm0 |
movq [r1+r3*1], mm0 |
movq [r1+r3*2], mm0 |
movq [r2+r3*1], mm0 |
movq [r2+r3*2], mm0 |
movq [r4+r3*1], mm0 |
movq [r4+r3*2], mm0 |
RET |
%endmacro |
INIT_MMX mmxext |
PRED8x8L_DC |
INIT_MMX ssse3 |
PRED8x8L_DC |
;----------------------------------------------------------------------------- |
; void pred8x8l_horizontal_8(uint8_t *src, int has_topleft, int has_topright, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED8x8L_HORIZONTAL 0 |
cglobal pred8x8l_horizontal_8, 4,4 |
sub r0, r3 |
lea r2, [r0+r3*2] |
movq mm0, [r0+r3*1-8] |
test r1, r1 |
lea r1, [r0+r3] |
cmovnz r1, r0 |
punpckhbw mm0, [r1+r3*0-8] |
movq mm1, [r2+r3*1-8] |
punpckhbw mm1, [r0+r3*2-8] |
mov r2, r0 |
punpckhwd mm1, mm0 |
lea r0, [r0+r3*4] |
movq mm2, [r0+r3*1-8] |
punpckhbw mm2, [r0+r3*0-8] |
lea r0, [r0+r3*2] |
movq mm3, [r0+r3*1-8] |
punpckhbw mm3, [r0+r3*0-8] |
punpckhwd mm3, mm2 |
punpckhdq mm3, mm1 |
lea r0, [r0+r3*2] |
movq mm0, [r0+r3*0-8] |
movq mm1, [r1+r3*0-8] |
mov r0, r2 |
movq mm4, mm3 |
movq mm2, mm3 |
PALIGNR mm4, mm0, 7, mm0 |
PALIGNR mm1, mm2, 1, mm2 |
movq mm0, mm4 |
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 |
movq mm4, mm0 |
movq mm7, mm2 |
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 |
psllq mm1, 56 |
PALIGNR mm7, mm1, 7, mm3 |
movq mm3, mm7 |
lea r1, [r0+r3*2] |
movq mm7, mm3 |
punpckhbw mm3, mm3 |
punpcklbw mm7, mm7 |
pshufw mm0, mm3, 0xff |
pshufw mm1, mm3, 0xaa |
lea r2, [r1+r3*2] |
pshufw mm2, mm3, 0x55 |
pshufw mm3, mm3, 0x00 |
pshufw mm4, mm7, 0xff |
pshufw mm5, mm7, 0xaa |
pshufw mm6, mm7, 0x55 |
pshufw mm7, mm7, 0x00 |
movq [r0+r3*1], mm0 |
movq [r0+r3*2], mm1 |
movq [r1+r3*1], mm2 |
movq [r1+r3*2], mm3 |
movq [r2+r3*1], mm4 |
movq [r2+r3*2], mm5 |
lea r0, [r2+r3*2] |
movq [r0+r3*1], mm6 |
movq [r0+r3*2], mm7 |
RET |
%endmacro |
INIT_MMX mmxext |
PRED8x8L_HORIZONTAL |
INIT_MMX ssse3 |
PRED8x8L_HORIZONTAL |
;----------------------------------------------------------------------------- |
; void pred8x8l_vertical_8(uint8_t *src, int has_topleft, int has_topright, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED8x8L_VERTICAL 0 |
cglobal pred8x8l_vertical_8, 4,4 |
sub r0, r3 |
movq mm0, [r0-8] |
movq mm3, [r0] |
movq mm1, [r0+8] |
movq mm2, mm3 |
movq mm4, mm3 |
PALIGNR mm2, mm0, 7, mm0 |
PALIGNR mm1, mm4, 1, mm4 |
test r1, r1 ; top_left |
jz .fix_lt_2 |
test r2, r2 ; top_right |
jz .fix_tr_1 |
jmp .body |
.fix_lt_2: |
movq mm5, mm3 |
pxor mm5, mm2 |
psllq mm5, 56 |
psrlq mm5, 56 |
pxor mm2, mm5 |
test r2, r2 ; top_right |
jnz .body |
.fix_tr_1: |
movq mm5, mm3 |
pxor mm5, mm1 |
psrlq mm5, 56 |
psllq mm5, 56 |
pxor mm1, mm5 |
.body: |
PRED4x4_LOWPASS mm0, mm2, mm1, mm3, mm5 |
%rep 3 |
movq [r0+r3*1], mm0 |
movq [r0+r3*2], mm0 |
lea r0, [r0+r3*2] |
%endrep |
movq [r0+r3*1], mm0 |
movq [r0+r3*2], mm0 |
RET |
%endmacro |
INIT_MMX mmxext |
PRED8x8L_VERTICAL |
INIT_MMX ssse3 |
PRED8x8L_VERTICAL |
;----------------------------------------------------------------------------- |
; void pred8x8l_down_left_8(uint8_t *src, int has_topleft, int has_topright, int stride) |
;----------------------------------------------------------------------------- |
INIT_MMX mmxext |
cglobal pred8x8l_down_left_8, 4,5 |
sub r0, r3 |
movq mm0, [r0-8] |
movq mm3, [r0] |
movq mm1, [r0+8] |
movq mm2, mm3 |
movq mm4, mm3 |
PALIGNR mm2, mm0, 7, mm0 |
PALIGNR mm1, mm4, 1, mm4 |
test r1, r1 |
jz .fix_lt_2 |
test r2, r2 |
jz .fix_tr_1 |
jmp .do_top |
.fix_lt_2: |
movq mm5, mm3 |
pxor mm5, mm2 |
psllq mm5, 56 |
psrlq mm5, 56 |
pxor mm2, mm5 |
test r2, r2 |
jnz .do_top |
.fix_tr_1: |
movq mm5, mm3 |
pxor mm5, mm1 |
psrlq mm5, 56 |
psllq mm5, 56 |
pxor mm1, mm5 |
jmp .do_top |
.fix_tr_2: |
punpckhbw mm3, mm3 |
pshufw mm1, mm3, 0xFF |
jmp .do_topright |
.do_top: |
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 |
movq mm7, mm4 |
test r2, r2 |
jz .fix_tr_2 |
movq mm0, [r0+8] |
movq mm5, mm0 |
movq mm2, mm0 |
movq mm4, mm0 |
psrlq mm5, 56 |
PALIGNR mm2, mm3, 7, mm3 |
PALIGNR mm5, mm4, 1, mm4 |
PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 |
.do_topright: |
lea r1, [r0+r3*2] |
movq mm6, mm1 |
psrlq mm1, 56 |
movq mm4, mm1 |
lea r2, [r1+r3*2] |
movq mm2, mm6 |
PALIGNR mm2, mm7, 1, mm0 |
movq mm3, mm6 |
PALIGNR mm3, mm7, 7, mm0 |
PALIGNR mm4, mm6, 1, mm0 |
movq mm5, mm7 |
movq mm1, mm7 |
movq mm7, mm6 |
lea r4, [r2+r3*2] |
psllq mm1, 8 |
PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6 |
PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6 |
movq [r4+r3*2], mm1 |
movq mm2, mm0 |
psllq mm1, 8 |
psrlq mm2, 56 |
psllq mm0, 8 |
por mm1, mm2 |
movq [r4+r3*1], mm1 |
movq mm2, mm0 |
psllq mm1, 8 |
psrlq mm2, 56 |
psllq mm0, 8 |
por mm1, mm2 |
movq [r2+r3*2], mm1 |
movq mm2, mm0 |
psllq mm1, 8 |
psrlq mm2, 56 |
psllq mm0, 8 |
por mm1, mm2 |
movq [r2+r3*1], mm1 |
movq mm2, mm0 |
psllq mm1, 8 |
psrlq mm2, 56 |
psllq mm0, 8 |
por mm1, mm2 |
movq [r1+r3*2], mm1 |
movq mm2, mm0 |
psllq mm1, 8 |
psrlq mm2, 56 |
psllq mm0, 8 |
por mm1, mm2 |
movq [r1+r3*1], mm1 |
movq mm2, mm0 |
psllq mm1, 8 |
psrlq mm2, 56 |
psllq mm0, 8 |
por mm1, mm2 |
movq [r0+r3*2], mm1 |
psllq mm1, 8 |
psrlq mm0, 56 |
por mm1, mm0 |
movq [r0+r3*1], mm1 |
RET |
%macro PRED8x8L_DOWN_LEFT 0 |
cglobal pred8x8l_down_left_8, 4,4 |
sub r0, r3 |
movq mm0, [r0-8] |
movq mm3, [r0] |
movq mm1, [r0+8] |
movq mm2, mm3 |
movq mm4, mm3 |
PALIGNR mm2, mm0, 7, mm0 |
PALIGNR mm1, mm4, 1, mm4 |
test r1, r1 ; top_left |
jz .fix_lt_2 |
test r2, r2 ; top_right |
jz .fix_tr_1 |
jmp .do_top |
.fix_lt_2: |
movq mm5, mm3 |
pxor mm5, mm2 |
psllq mm5, 56 |
psrlq mm5, 56 |
pxor mm2, mm5 |
test r2, r2 ; top_right |
jnz .do_top |
.fix_tr_1: |
movq mm5, mm3 |
pxor mm5, mm1 |
psrlq mm5, 56 |
psllq mm5, 56 |
pxor mm1, mm5 |
jmp .do_top |
.fix_tr_2: |
punpckhbw mm3, mm3 |
pshufw mm1, mm3, 0xFF |
jmp .do_topright |
.do_top: |
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 |
movq2dq xmm3, mm4 |
test r2, r2 ; top_right |
jz .fix_tr_2 |
movq mm0, [r0+8] |
movq mm5, mm0 |
movq mm2, mm0 |
movq mm4, mm0 |
psrlq mm5, 56 |
PALIGNR mm2, mm3, 7, mm3 |
PALIGNR mm5, mm4, 1, mm4 |
PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 |
.do_topright: |
movq2dq xmm4, mm1 |
psrlq mm1, 56 |
movq2dq xmm5, mm1 |
lea r1, [r0+r3*2] |
pslldq xmm4, 8 |
por xmm3, xmm4 |
movdqa xmm2, xmm3 |
psrldq xmm2, 1 |
pslldq xmm5, 15 |
por xmm2, xmm5 |
lea r2, [r1+r3*2] |
movdqa xmm1, xmm3 |
pslldq xmm1, 1 |
INIT_XMM cpuname |
PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4 |
psrldq xmm0, 1 |
movq [r0+r3*1], xmm0 |
psrldq xmm0, 1 |
movq [r0+r3*2], xmm0 |
psrldq xmm0, 1 |
lea r0, [r2+r3*2] |
movq [r1+r3*1], xmm0 |
psrldq xmm0, 1 |
movq [r1+r3*2], xmm0 |
psrldq xmm0, 1 |
movq [r2+r3*1], xmm0 |
psrldq xmm0, 1 |
movq [r2+r3*2], xmm0 |
psrldq xmm0, 1 |
movq [r0+r3*1], xmm0 |
psrldq xmm0, 1 |
movq [r0+r3*2], xmm0 |
RET |
%endmacro |
INIT_MMX sse2 |
PRED8x8L_DOWN_LEFT |
INIT_MMX ssse3 |
PRED8x8L_DOWN_LEFT |
;----------------------------------------------------------------------------- |
; void pred8x8l_down_right_8_mmxext(uint8_t *src, int has_topleft, int has_topright, int stride) |
;----------------------------------------------------------------------------- |
INIT_MMX mmxext |
cglobal pred8x8l_down_right_8, 4,5 |
sub r0, r3 |
lea r4, [r0+r3*2] |
movq mm0, [r0+r3*1-8] |
punpckhbw mm0, [r0+r3*0-8] |
movq mm1, [r4+r3*1-8] |
punpckhbw mm1, [r0+r3*2-8] |
mov r4, r0 |
punpckhwd mm1, mm0 |
lea r0, [r0+r3*4] |
movq mm2, [r0+r3*1-8] |
punpckhbw mm2, [r0+r3*0-8] |
lea r0, [r0+r3*2] |
movq mm3, [r0+r3*1-8] |
punpckhbw mm3, [r0+r3*0-8] |
punpckhwd mm3, mm2 |
punpckhdq mm3, mm1 |
lea r0, [r0+r3*2] |
movq mm0, [r0+r3*0-8] |
movq mm1, [r4] |
mov r0, r4 |
movq mm4, mm3 |
movq mm2, mm3 |
PALIGNR mm4, mm0, 7, mm0 |
PALIGNR mm1, mm2, 1, mm2 |
test r1, r1 ; top_left |
jz .fix_lt_1 |
.do_left: |
movq mm0, mm4 |
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 |
movq mm4, mm0 |
movq mm7, mm2 |
movq mm6, mm2 |
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 |
psllq mm1, 56 |
PALIGNR mm7, mm1, 7, mm3 |
movq mm0, [r0-8] |
movq mm3, [r0] |
movq mm1, [r0+8] |
movq mm2, mm3 |
movq mm4, mm3 |
PALIGNR mm2, mm0, 7, mm0 |
PALIGNR mm1, mm4, 1, mm4 |
test r1, r1 ; top_left |
jz .fix_lt_2 |
test r2, r2 ; top_right |
jz .fix_tr_1 |
.do_top: |
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 |
movq mm5, mm4 |
jmp .body |
.fix_lt_1: |
movq mm5, mm3 |
pxor mm5, mm4 |
psrlq mm5, 56 |
psllq mm5, 48 |
pxor mm1, mm5 |
jmp .do_left |
.fix_lt_2: |
movq mm5, mm3 |
pxor mm5, mm2 |
psllq mm5, 56 |
psrlq mm5, 56 |
pxor mm2, mm5 |
test r2, r2 ; top_right |
jnz .do_top |
.fix_tr_1: |
movq mm5, mm3 |
pxor mm5, mm1 |
psrlq mm5, 56 |
psllq mm5, 56 |
pxor mm1, mm5 |
jmp .do_top |
.body: |
lea r1, [r0+r3*2] |
movq mm1, mm7 |
movq mm7, mm5 |
movq mm5, mm6 |
movq mm2, mm7 |
lea r2, [r1+r3*2] |
PALIGNR mm2, mm6, 1, mm0 |
movq mm3, mm7 |
PALIGNR mm3, mm6, 7, mm0 |
movq mm4, mm7 |
lea r4, [r2+r3*2] |
psrlq mm4, 8 |
PRED4x4_LOWPASS mm0, mm1, mm2, mm5, mm6 |
PRED4x4_LOWPASS mm1, mm3, mm4, mm7, mm6 |
movq [r4+r3*2], mm0 |
movq mm2, mm1 |
psrlq mm0, 8 |
psllq mm2, 56 |
psrlq mm1, 8 |
por mm0, mm2 |
movq [r4+r3*1], mm0 |
movq mm2, mm1 |
psrlq mm0, 8 |
psllq mm2, 56 |
psrlq mm1, 8 |
por mm0, mm2 |
movq [r2+r3*2], mm0 |
movq mm2, mm1 |
psrlq mm0, 8 |
psllq mm2, 56 |
psrlq mm1, 8 |
por mm0, mm2 |
movq [r2+r3*1], mm0 |
movq mm2, mm1 |
psrlq mm0, 8 |
psllq mm2, 56 |
psrlq mm1, 8 |
por mm0, mm2 |
movq [r1+r3*2], mm0 |
movq mm2, mm1 |
psrlq mm0, 8 |
psllq mm2, 56 |
psrlq mm1, 8 |
por mm0, mm2 |
movq [r1+r3*1], mm0 |
movq mm2, mm1 |
psrlq mm0, 8 |
psllq mm2, 56 |
psrlq mm1, 8 |
por mm0, mm2 |
movq [r0+r3*2], mm0 |
psrlq mm0, 8 |
psllq mm1, 56 |
por mm0, mm1 |
movq [r0+r3*1], mm0 |
RET |
%macro PRED8x8L_DOWN_RIGHT 0 |
cglobal pred8x8l_down_right_8, 4,5 |
sub r0, r3 |
lea r4, [r0+r3*2] |
movq mm0, [r0+r3*1-8] |
punpckhbw mm0, [r0+r3*0-8] |
movq mm1, [r4+r3*1-8] |
punpckhbw mm1, [r0+r3*2-8] |
mov r4, r0 |
punpckhwd mm1, mm0 |
lea r0, [r0+r3*4] |
movq mm2, [r0+r3*1-8] |
punpckhbw mm2, [r0+r3*0-8] |
lea r0, [r0+r3*2] |
movq mm3, [r0+r3*1-8] |
punpckhbw mm3, [r0+r3*0-8] |
punpckhwd mm3, mm2 |
punpckhdq mm3, mm1 |
lea r0, [r0+r3*2] |
movq mm0, [r0+r3*0-8] |
movq mm1, [r4] |
mov r0, r4 |
movq mm4, mm3 |
movq mm2, mm3 |
PALIGNR mm4, mm0, 7, mm0 |
PALIGNR mm1, mm2, 1, mm2 |
test r1, r1 |
jz .fix_lt_1 |
jmp .do_left |
.fix_lt_1: |
movq mm5, mm3 |
pxor mm5, mm4 |
psrlq mm5, 56 |
psllq mm5, 48 |
pxor mm1, mm5 |
jmp .do_left |
.fix_lt_2: |
movq mm5, mm3 |
pxor mm5, mm2 |
psllq mm5, 56 |
psrlq mm5, 56 |
pxor mm2, mm5 |
test r2, r2 |
jnz .do_top |
.fix_tr_1: |
movq mm5, mm3 |
pxor mm5, mm1 |
psrlq mm5, 56 |
psllq mm5, 56 |
pxor mm1, mm5 |
jmp .do_top |
.do_left: |
movq mm0, mm4 |
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 |
movq mm4, mm0 |
movq mm7, mm2 |
movq2dq xmm3, mm2 |
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 |
psllq mm1, 56 |
PALIGNR mm7, mm1, 7, mm3 |
movq2dq xmm1, mm7 |
movq mm0, [r0-8] |
movq mm3, [r0] |
movq mm1, [r0+8] |
movq mm2, mm3 |
movq mm4, mm3 |
PALIGNR mm2, mm0, 7, mm0 |
PALIGNR mm1, mm4, 1, mm4 |
test r1, r1 |
jz .fix_lt_2 |
test r2, r2 |
jz .fix_tr_1 |
.do_top: |
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 |
movq2dq xmm4, mm4 |
lea r1, [r0+r3*2] |
movdqa xmm0, xmm3 |
pslldq xmm4, 8 |
por xmm3, xmm4 |
lea r2, [r1+r3*2] |
pslldq xmm4, 1 |
por xmm1, xmm4 |
psrldq xmm0, 7 |
pslldq xmm0, 15 |
psrldq xmm0, 7 |
por xmm1, xmm0 |
lea r0, [r2+r3*2] |
movdqa xmm2, xmm3 |
psrldq xmm2, 1 |
INIT_XMM cpuname |
PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm4 |
movdqa xmm1, xmm0 |
psrldq xmm1, 1 |
movq [r0+r3*2], xmm0 |
movq [r0+r3*1], xmm1 |
psrldq xmm0, 2 |
psrldq xmm1, 2 |
movq [r2+r3*2], xmm0 |
movq [r2+r3*1], xmm1 |
psrldq xmm0, 2 |
psrldq xmm1, 2 |
movq [r1+r3*2], xmm0 |
movq [r1+r3*1], xmm1 |
psrldq xmm0, 2 |
psrldq xmm1, 2 |
movq [r4+r3*2], xmm0 |
movq [r4+r3*1], xmm1 |
RET |
%endmacro |
INIT_MMX sse2 |
PRED8x8L_DOWN_RIGHT |
INIT_MMX ssse3 |
PRED8x8L_DOWN_RIGHT |
;----------------------------------------------------------------------------- |
; void pred8x8l_vertical_right_8(uint8_t *src, int has_topleft, int has_topright, int stride) |
;----------------------------------------------------------------------------- |
INIT_MMX mmxext |
cglobal pred8x8l_vertical_right_8, 4,5 |
sub r0, r3 |
lea r4, [r0+r3*2] |
movq mm0, [r0+r3*1-8] |
punpckhbw mm0, [r0+r3*0-8] |
movq mm1, [r4+r3*1-8] |
punpckhbw mm1, [r0+r3*2-8] |
mov r4, r0 |
punpckhwd mm1, mm0 |
lea r0, [r0+r3*4] |
movq mm2, [r0+r3*1-8] |
punpckhbw mm2, [r0+r3*0-8] |
lea r0, [r0+r3*2] |
movq mm3, [r0+r3*1-8] |
punpckhbw mm3, [r0+r3*0-8] |
punpckhwd mm3, mm2 |
punpckhdq mm3, mm1 |
lea r0, [r0+r3*2] |
movq mm0, [r0+r3*0-8] |
movq mm1, [r4] |
mov r0, r4 |
movq mm4, mm3 |
movq mm2, mm3 |
PALIGNR mm4, mm0, 7, mm0 |
PALIGNR mm1, mm2, 1, mm2 |
test r1, r1 |
jz .fix_lt_1 |
jmp .do_left |
.fix_lt_1: |
movq mm5, mm3 |
pxor mm5, mm4 |
psrlq mm5, 56 |
psllq mm5, 48 |
pxor mm1, mm5 |
jmp .do_left |
.fix_lt_2: |
movq mm5, mm3 |
pxor mm5, mm2 |
psllq mm5, 56 |
psrlq mm5, 56 |
pxor mm2, mm5 |
test r2, r2 |
jnz .do_top |
.fix_tr_1: |
movq mm5, mm3 |
pxor mm5, mm1 |
psrlq mm5, 56 |
psllq mm5, 56 |
pxor mm1, mm5 |
jmp .do_top |
.do_left: |
movq mm0, mm4 |
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 |
movq mm7, mm2 |
movq mm0, [r0-8] |
movq mm3, [r0] |
movq mm1, [r0+8] |
movq mm2, mm3 |
movq mm4, mm3 |
PALIGNR mm2, mm0, 7, mm0 |
PALIGNR mm1, mm4, 1, mm4 |
test r1, r1 |
jz .fix_lt_2 |
test r2, r2 |
jz .fix_tr_1 |
.do_top: |
PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 |
lea r1, [r0+r3*2] |
movq mm2, mm6 |
movq mm3, mm6 |
PALIGNR mm3, mm7, 7, mm0 |
PALIGNR mm6, mm7, 6, mm1 |
movq mm4, mm3 |
pavgb mm3, mm2 |
lea r2, [r1+r3*2] |
PRED4x4_LOWPASS mm0, mm6, mm2, mm4, mm5 |
movq [r0+r3*1], mm3 |
movq [r0+r3*2], mm0 |
movq mm5, mm0 |
movq mm6, mm3 |
movq mm1, mm7 |
movq mm2, mm1 |
psllq mm2, 8 |
movq mm3, mm1 |
psllq mm3, 16 |
lea r4, [r2+r3*2] |
PRED4x4_LOWPASS mm0, mm1, mm3, mm2, mm4 |
PALIGNR mm6, mm0, 7, mm2 |
movq [r1+r3*1], mm6 |
psllq mm0, 8 |
PALIGNR mm5, mm0, 7, mm1 |
movq [r1+r3*2], mm5 |
psllq mm0, 8 |
PALIGNR mm6, mm0, 7, mm2 |
movq [r2+r3*1], mm6 |
psllq mm0, 8 |
PALIGNR mm5, mm0, 7, mm1 |
movq [r2+r3*2], mm5 |
psllq mm0, 8 |
PALIGNR mm6, mm0, 7, mm2 |
movq [r4+r3*1], mm6 |
psllq mm0, 8 |
PALIGNR mm5, mm0, 7, mm1 |
movq [r4+r3*2], mm5 |
RET |
%macro PRED8x8L_VERTICAL_RIGHT 0 |
cglobal pred8x8l_vertical_right_8, 4,5,7 |
; manually spill XMM registers for Win64: this function is declared |
; under INIT_MMX, so cglobal does not reserve the XMM save area itself |
WIN64_SPILL_XMM 7 |
sub r0, r3 |
lea r4, [r0+r3*2] |
movq mm0, [r0+r3*1-8] |
punpckhbw mm0, [r0+r3*0-8] |
movq mm1, [r4+r3*1-8] |
punpckhbw mm1, [r0+r3*2-8] |
mov r4, r0 |
punpckhwd mm1, mm0 |
lea r0, [r0+r3*4] |
movq mm2, [r0+r3*1-8] |
punpckhbw mm2, [r0+r3*0-8] |
lea r0, [r0+r3*2] |
movq mm3, [r0+r3*1-8] |
punpckhbw mm3, [r0+r3*0-8] |
punpckhwd mm3, mm2 |
punpckhdq mm3, mm1 |
lea r0, [r0+r3*2] |
movq mm0, [r0+r3*0-8] |
movq mm1, [r4] |
mov r0, r4 |
movq mm4, mm3 |
movq mm2, mm3 |
PALIGNR mm4, mm0, 7, mm0 |
PALIGNR mm1, mm2, 1, mm2 |
test r1, r1 |
jnz .do_left |
.fix_lt_1: |
movq mm5, mm3 |
pxor mm5, mm4 |
psrlq mm5, 56 |
psllq mm5, 48 |
pxor mm1, mm5 |
jmp .do_left |
.fix_lt_2: |
movq mm5, mm3 |
pxor mm5, mm2 |
psllq mm5, 56 |
psrlq mm5, 56 |
pxor mm2, mm5 |
test r2, r2 |
jnz .do_top |
.fix_tr_1: |
movq mm5, mm3 |
pxor mm5, mm1 |
psrlq mm5, 56 |
psllq mm5, 56 |
pxor mm1, mm5 |
jmp .do_top |
.do_left: |
movq mm0, mm4 |
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 |
movq2dq xmm0, mm2 |
movq mm0, [r0-8] |
movq mm3, [r0] |
movq mm1, [r0+8] |
movq mm2, mm3 |
movq mm4, mm3 |
PALIGNR mm2, mm0, 7, mm0 |
PALIGNR mm1, mm4, 1, mm4 |
test r1, r1 |
jz .fix_lt_2 |
test r2, r2 |
jz .fix_tr_1 |
.do_top: |
PRED4x4_LOWPASS mm6, mm2, mm1, mm3, mm5 |
lea r1, [r0+r3*2] |
movq2dq xmm4, mm6 |
pslldq xmm4, 8 |
por xmm0, xmm4 |
movdqa xmm6, [pw_ff00] |
movdqa xmm1, xmm0 |
lea r2, [r1+r3*2] |
movdqa xmm2, xmm0 |
movdqa xmm3, xmm0 |
pslldq xmm0, 1 |
pslldq xmm1, 2 |
pavgb xmm2, xmm0 |
INIT_XMM cpuname |
PRED4x4_LOWPASS xmm4, xmm3, xmm1, xmm0, xmm5 |
pandn xmm6, xmm4 |
movdqa xmm5, xmm4 |
psrlw xmm4, 8 |
packuswb xmm6, xmm4 |
movhlps xmm4, xmm6 |
movhps [r0+r3*2], xmm5 |
movhps [r0+r3*1], xmm2 |
psrldq xmm5, 4 |
movss xmm5, xmm6 |
psrldq xmm2, 4 |
movss xmm2, xmm4 |
lea r0, [r2+r3*2] |
psrldq xmm5, 1 |
psrldq xmm2, 1 |
movq [r0+r3*2], xmm5 |
movq [r0+r3*1], xmm2 |
psrldq xmm5, 1 |
psrldq xmm2, 1 |
movq [r2+r3*2], xmm5 |
movq [r2+r3*1], xmm2 |
psrldq xmm5, 1 |
psrldq xmm2, 1 |
movq [r1+r3*2], xmm5 |
movq [r1+r3*1], xmm2 |
RET |
%endmacro |
INIT_MMX sse2 |
PRED8x8L_VERTICAL_RIGHT |
INIT_MMX ssse3 |
PRED8x8L_VERTICAL_RIGHT |
;----------------------------------------------------------------------------- |
; void pred8x8l_vertical_left_8(uint8_t *src, int has_topleft, int has_topright, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED8x8L_VERTICAL_LEFT 0 |
cglobal pred8x8l_vertical_left_8, 4,4 |
sub r0, r3 |
movq mm0, [r0-8] |
movq mm3, [r0] |
movq mm1, [r0+8] |
movq mm2, mm3 |
movq mm4, mm3 |
PALIGNR mm2, mm0, 7, mm0 |
PALIGNR mm1, mm4, 1, mm4 |
test r1, r1 |
jz .fix_lt_2 |
test r2, r2 |
jz .fix_tr_1 |
jmp .do_top |
.fix_lt_2: |
movq mm5, mm3 |
pxor mm5, mm2 |
psllq mm5, 56 |
psrlq mm5, 56 |
pxor mm2, mm5 |
test r2, r2 |
jnz .do_top |
.fix_tr_1: |
movq mm5, mm3 |
pxor mm5, mm1 |
psrlq mm5, 56 |
psllq mm5, 56 |
pxor mm1, mm5 |
jmp .do_top |
.fix_tr_2: |
punpckhbw mm3, mm3 |
pshufw mm1, mm3, 0xFF |
jmp .do_topright |
.do_top: |
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 |
movq2dq xmm4, mm4 |
test r2, r2 |
jz .fix_tr_2 |
movq mm0, [r0+8] |
movq mm5, mm0 |
movq mm2, mm0 |
movq mm4, mm0 |
psrlq mm5, 56 |
PALIGNR mm2, mm3, 7, mm3 |
PALIGNR mm5, mm4, 1, mm4 |
PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 |
.do_topright: |
movq2dq xmm3, mm1 |
lea r1, [r0+r3*2] |
pslldq xmm3, 8 |
por xmm4, xmm3 |
movdqa xmm2, xmm4 |
movdqa xmm1, xmm4 |
movdqa xmm3, xmm4 |
psrldq xmm2, 1 |
pslldq xmm1, 1 |
pavgb xmm3, xmm2 |
lea r2, [r1+r3*2] |
INIT_XMM cpuname |
PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm4, xmm5 |
psrldq xmm0, 1 |
movq [r0+r3*1], xmm3 |
movq [r0+r3*2], xmm0 |
lea r0, [r2+r3*2] |
psrldq xmm3, 1 |
psrldq xmm0, 1 |
movq [r1+r3*1], xmm3 |
movq [r1+r3*2], xmm0 |
psrldq xmm3, 1 |
psrldq xmm0, 1 |
movq [r2+r3*1], xmm3 |
movq [r2+r3*2], xmm0 |
psrldq xmm3, 1 |
psrldq xmm0, 1 |
movq [r0+r3*1], xmm3 |
movq [r0+r3*2], xmm0 |
RET |
%endmacro |
INIT_MMX sse2 |
PRED8x8L_VERTICAL_LEFT |
INIT_MMX ssse3 |
PRED8x8L_VERTICAL_LEFT |
;----------------------------------------------------------------------------- |
; void pred8x8l_horizontal_up_8(uint8_t *src, int has_topleft, int has_topright, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED8x8L_HORIZONTAL_UP 0 |
cglobal pred8x8l_horizontal_up_8, 4,4 |
sub r0, r3 |
lea r2, [r0+r3*2] |
movq mm0, [r0+r3*1-8] |
test r1, r1 |
lea r1, [r0+r3] |
cmovnz r1, r0 |
punpckhbw mm0, [r1+r3*0-8] |
movq mm1, [r2+r3*1-8] |
punpckhbw mm1, [r0+r3*2-8] |
mov r2, r0 |
punpckhwd mm1, mm0 |
lea r0, [r0+r3*4] |
movq mm2, [r0+r3*1-8] |
punpckhbw mm2, [r0+r3*0-8] |
lea r0, [r0+r3*2] |
movq mm3, [r0+r3*1-8] |
punpckhbw mm3, [r0+r3*0-8] |
punpckhwd mm3, mm2 |
punpckhdq mm3, mm1 |
lea r0, [r0+r3*2] |
movq mm0, [r0+r3*0-8] |
movq mm1, [r1+r3*0-8] |
mov r0, r2 |
movq mm4, mm3 |
movq mm2, mm3 |
PALIGNR mm4, mm0, 7, mm0 |
PALIGNR mm1, mm2, 1, mm2 |
movq mm0, mm4 |
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 |
movq mm4, mm0 |
movq mm7, mm2 |
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 |
psllq mm1, 56 |
PALIGNR mm7, mm1, 7, mm3 |
lea r1, [r0+r3*2] |
pshufw mm0, mm7, 00011011b ; l6 l7 l4 l5 l2 l3 l0 l1 |
psllq mm7, 56 ; l7 .. .. .. .. .. .. .. |
movq mm2, mm0 |
psllw mm0, 8 |
psrlw mm2, 8 |
por mm2, mm0 ; l7 l6 l5 l4 l3 l2 l1 l0 |
movq mm3, mm2 |
movq mm4, mm2 |
movq mm5, mm2 |
psrlq mm2, 8 |
psrlq mm3, 16 |
lea r2, [r1+r3*2] |
por mm2, mm7 ; l7 l7 l6 l5 l4 l3 l2 l1 |
punpckhbw mm7, mm7 |
por mm3, mm7 ; l7 l7 l7 l6 l5 l4 l3 l2 |
pavgb mm4, mm2 |
PRED4x4_LOWPASS mm1, mm3, mm5, mm2, mm6 |
movq mm5, mm4 |
punpcklbw mm4, mm1 ; p4 p3 p2 p1 |
punpckhbw mm5, mm1 ; p8 p7 p6 p5 |
movq mm6, mm5 |
movq mm7, mm5 |
movq mm0, mm5 |
PALIGNR mm5, mm4, 2, mm1 |
pshufw mm1, mm6, 11111001b |
PALIGNR mm6, mm4, 4, mm2 |
pshufw mm2, mm7, 11111110b |
PALIGNR mm7, mm4, 6, mm3 |
pshufw mm3, mm0, 11111111b |
movq [r0+r3*1], mm4 |
movq [r0+r3*2], mm5 |
lea r0, [r2+r3*2] |
movq [r1+r3*1], mm6 |
movq [r1+r3*2], mm7 |
movq [r2+r3*1], mm0 |
movq [r2+r3*2], mm1 |
movq [r0+r3*1], mm2 |
movq [r0+r3*2], mm3 |
RET |
%endmacro |
INIT_MMX mmxext |
PRED8x8L_HORIZONTAL_UP |
INIT_MMX ssse3 |
PRED8x8L_HORIZONTAL_UP |
;----------------------------------------------------------------------------- |
; void pred8x8l_horizontal_down_8(uint8_t *src, int has_topleft, int has_topright, int stride) |
;----------------------------------------------------------------------------- |
INIT_MMX mmxext |
cglobal pred8x8l_horizontal_down_8, 4,5 |
sub r0, r3 |
lea r4, [r0+r3*2] |
movq mm0, [r0+r3*1-8] |
punpckhbw mm0, [r0+r3*0-8] |
movq mm1, [r4+r3*1-8] |
punpckhbw mm1, [r0+r3*2-8] |
mov r4, r0 |
punpckhwd mm1, mm0 |
lea r0, [r0+r3*4] |
movq mm2, [r0+r3*1-8] |
punpckhbw mm2, [r0+r3*0-8] |
lea r0, [r0+r3*2] |
movq mm3, [r0+r3*1-8] |
punpckhbw mm3, [r0+r3*0-8] |
punpckhwd mm3, mm2 |
punpckhdq mm3, mm1 |
lea r0, [r0+r3*2] |
movq mm0, [r0+r3*0-8] |
movq mm1, [r4] |
mov r0, r4 |
movq mm4, mm3 |
movq mm2, mm3 |
PALIGNR mm4, mm0, 7, mm0 |
PALIGNR mm1, mm2, 1, mm2 |
test r1, r1 |
jnz .do_left |
.fix_lt_1: |
movq mm5, mm3 |
pxor mm5, mm4 |
psrlq mm5, 56 |
psllq mm5, 48 |
pxor mm1, mm5 |
jmp .do_left |
.fix_lt_2: |
movq mm5, mm3 |
pxor mm5, mm2 |
psllq mm5, 56 |
psrlq mm5, 56 |
pxor mm2, mm5 |
test r2, r2 |
jnz .do_top |
.fix_tr_1: |
movq mm5, mm3 |
pxor mm5, mm1 |
psrlq mm5, 56 |
psllq mm5, 56 |
pxor mm1, mm5 |
jmp .do_top |
.do_left: |
movq mm0, mm4 |
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 |
movq mm4, mm0 |
movq mm7, mm2 |
movq mm6, mm2 |
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 |
psllq mm1, 56 |
PALIGNR mm7, mm1, 7, mm3 |
movq mm0, [r0-8] |
movq mm3, [r0] |
movq mm1, [r0+8] |
movq mm2, mm3 |
movq mm4, mm3 |
PALIGNR mm2, mm0, 7, mm0 |
PALIGNR mm1, mm4, 1, mm4 |
test r1, r1 |
jz .fix_lt_2 |
test r2, r2 |
jz .fix_tr_1 |
.do_top: |
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 |
movq mm5, mm4 |
lea r1, [r0+r3*2] |
psllq mm7, 56 |
movq mm2, mm5 |
movq mm3, mm6 |
movq mm4, mm2 |
PALIGNR mm2, mm6, 7, mm5 |
PALIGNR mm6, mm7, 7, mm0 |
lea r2, [r1+r3*2] |
PALIGNR mm4, mm3, 1, mm7 |
movq mm5, mm3 |
pavgb mm3, mm6 |
PRED4x4_LOWPASS mm0, mm4, mm6, mm5, mm7 |
movq mm4, mm2 |
movq mm1, mm2 |
lea r4, [r2+r3*2] |
psrlq mm4, 16 |
psrlq mm1, 8 |
PRED4x4_LOWPASS mm6, mm4, mm2, mm1, mm5 |
movq mm7, mm3 |
punpcklbw mm3, mm0 |
punpckhbw mm7, mm0 |
movq mm1, mm7 |
movq mm0, mm7 |
movq mm4, mm7 |
movq [r4+r3*2], mm3 |
PALIGNR mm7, mm3, 2, mm5 |
movq [r4+r3*1], mm7 |
PALIGNR mm1, mm3, 4, mm5 |
movq [r2+r3*2], mm1 |
PALIGNR mm0, mm3, 6, mm3 |
movq [r2+r3*1], mm0 |
movq mm2, mm6 |
movq mm3, mm6 |
movq [r1+r3*2], mm4 |
PALIGNR mm6, mm4, 2, mm5 |
movq [r1+r3*1], mm6 |
PALIGNR mm2, mm4, 4, mm5 |
movq [r0+r3*2], mm2 |
PALIGNR mm3, mm4, 6, mm4 |
movq [r0+r3*1], mm3 |
RET |
%macro PRED8x8L_HORIZONTAL_DOWN 0 |
cglobal pred8x8l_horizontal_down_8, 4,5 |
sub r0, r3 |
lea r4, [r0+r3*2] |
movq mm0, [r0+r3*1-8] |
punpckhbw mm0, [r0+r3*0-8] |
movq mm1, [r4+r3*1-8] |
punpckhbw mm1, [r0+r3*2-8] |
mov r4, r0 |
punpckhwd mm1, mm0 |
lea r0, [r0+r3*4] |
movq mm2, [r0+r3*1-8] |
punpckhbw mm2, [r0+r3*0-8] |
lea r0, [r0+r3*2] |
movq mm3, [r0+r3*1-8] |
punpckhbw mm3, [r0+r3*0-8] |
punpckhwd mm3, mm2 |
punpckhdq mm3, mm1 |
lea r0, [r0+r3*2] |
movq mm0, [r0+r3*0-8] |
movq mm1, [r4] |
mov r0, r4 |
movq mm4, mm3 |
movq mm2, mm3 |
PALIGNR mm4, mm0, 7, mm0 |
PALIGNR mm1, mm2, 1, mm2 |
test r1, r1 |
jnz .do_left |
.fix_lt_1: |
movq mm5, mm3 |
pxor mm5, mm4 |
psrlq mm5, 56 |
psllq mm5, 48 |
pxor mm1, mm5 |
jmp .do_left |
.fix_lt_2: |
movq mm5, mm3 |
pxor mm5, mm2 |
psllq mm5, 56 |
psrlq mm5, 56 |
pxor mm2, mm5 |
test r2, r2 |
jnz .do_top |
.fix_tr_1: |
movq mm5, mm3 |
pxor mm5, mm1 |
psrlq mm5, 56 |
psllq mm5, 56 |
pxor mm1, mm5 |
jmp .do_top |
.fix_tr_2: |
punpckhbw mm3, mm3 |
pshufw mm1, mm3, 0xFF |
jmp .do_topright |
.do_left: |
movq mm0, mm4 |
PRED4x4_LOWPASS mm2, mm1, mm4, mm3, mm5 |
movq2dq xmm0, mm2 |
pslldq xmm0, 8 |
movq mm4, mm0 |
PRED4x4_LOWPASS mm1, mm3, mm0, mm4, mm5 |
movq2dq xmm2, mm1 |
pslldq xmm2, 15 |
psrldq xmm2, 8 |
por xmm0, xmm2 |
movq mm0, [r0-8] |
movq mm3, [r0] |
movq mm1, [r0+8] |
movq mm2, mm3 |
movq mm4, mm3 |
PALIGNR mm2, mm0, 7, mm0 |
PALIGNR mm1, mm4, 1, mm4 |
test r1, r1 |
jz .fix_lt_2 |
test r2, r2 |
jz .fix_tr_1 |
.do_top: |
PRED4x4_LOWPASS mm4, mm2, mm1, mm3, mm5 |
movq2dq xmm1, mm4 |
test r2, r2 |
jz .fix_tr_2 |
movq mm0, [r0+8] |
movq mm5, mm0 |
movq mm2, mm0 |
movq mm4, mm0 |
psrlq mm5, 56 |
PALIGNR mm2, mm3, 7, mm3 |
PALIGNR mm5, mm4, 1, mm4 |
PRED4x4_LOWPASS mm1, mm2, mm5, mm0, mm4 |
.do_topright: |
movq2dq xmm5, mm1 |
pslldq xmm5, 8 |
por xmm1, xmm5 |
INIT_XMM cpuname |
lea r2, [r4+r3*2] |
movdqa xmm2, xmm1 |
movdqa xmm3, xmm1 |
PALIGNR xmm1, xmm0, 7, xmm4 |
PALIGNR xmm2, xmm0, 9, xmm5 |
lea r1, [r2+r3*2] |
PALIGNR xmm3, xmm0, 8, xmm0 |
movdqa xmm4, xmm1 |
pavgb xmm4, xmm3 |
lea r0, [r1+r3*2] |
PRED4x4_LOWPASS xmm0, xmm1, xmm2, xmm3, xmm5 |
punpcklbw xmm4, xmm0 |
movhlps xmm0, xmm4 |
movq [r0+r3*2], xmm4 |
movq [r2+r3*2], xmm0 |
psrldq xmm4, 2 |
psrldq xmm0, 2 |
movq [r0+r3*1], xmm4 |
movq [r2+r3*1], xmm0 |
psrldq xmm4, 2 |
psrldq xmm0, 2 |
movq [r1+r3*2], xmm4 |
movq [r4+r3*2], xmm0 |
psrldq xmm4, 2 |
psrldq xmm0, 2 |
movq [r1+r3*1], xmm4 |
movq [r4+r3*1], xmm0 |
RET |
%endmacro |
INIT_MMX sse2 |
PRED8x8L_HORIZONTAL_DOWN |
INIT_MMX ssse3 |
PRED8x8L_HORIZONTAL_DOWN |
;----------------------------------------------------------------------------- |
; void pred4x4_dc_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
;----------------------------------------------------------------------------- |
INIT_MMX mmxext |
cglobal pred4x4_dc_8, 3,5 |
pxor mm7, mm7 |
mov r4, r0 |
sub r0, r2 |
movd mm0, [r0] |
psadbw mm0, mm7 |
movzx r1d, byte [r0+r2*1-1] |
movd r3d, mm0 |
add r3d, r1d |
movzx r1d, byte [r0+r2*2-1] |
lea r0, [r0+r2*2] |
add r3d, r1d |
movzx r1d, byte [r0+r2*1-1] |
add r3d, r1d |
movzx r1d, byte [r0+r2*2-1] |
add r3d, r1d |
add r3d, 4 |
shr r3d, 3 |
imul r3d, 0x01010101 |
mov [r4+r2*0], r3d |
mov [r0+r2*0], r3d |
mov [r0+r2*1], r3d |
mov [r0+r2*2], r3d |
RET |
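; note: the DC here is (4 top + 4 left + 4) >> 3; the imul by 0x01010101 |
; broadcasts that byte across a dword so each row can be stored with a |
; single 32-bit mov. |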
;----------------------------------------------------------------------------- |
; void pred4x4_tm_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED4x4_TM 0 |
cglobal pred4x4_tm_vp8_8, 3,6 |
sub r0, r2 |
pxor mm7, mm7 |
movd mm0, [r0] |
punpcklbw mm0, mm7 |
movzx r4d, byte [r0-1] |
mov r5d, 2 |
.loop: |
movzx r1d, byte [r0+r2*1-1] |
movzx r3d, byte [r0+r2*2-1] |
sub r1d, r4d |
sub r3d, r4d |
movd mm2, r1d |
movd mm4, r3d |
%if cpuflag(mmxext) |
pshufw mm2, mm2, 0 |
pshufw mm4, mm4, 0 |
%else |
punpcklwd mm2, mm2 |
punpcklwd mm4, mm4 |
punpckldq mm2, mm2 |
punpckldq mm4, mm4 |
%endif |
paddw mm2, mm0 |
paddw mm4, mm0 |
packuswb mm2, mm2 |
packuswb mm4, mm4 |
movd [r0+r2*1], mm2 |
movd [r0+r2*2], mm4 |
lea r0, [r0+r2*2] |
dec r5d |
jg .loop |
REP_RET |
%endmacro |
INIT_MMX mmx |
PRED4x4_TM |
INIT_MMX mmxext |
PRED4x4_TM |
INIT_XMM ssse3 |
cglobal pred4x4_tm_vp8_8, 3,3 |
sub r0, r2 |
movq mm6, [tm_shuf] |
pxor mm1, mm1 |
movd mm0, [r0] |
punpcklbw mm0, mm1 |
movd mm7, [r0-4] |
pshufb mm7, mm6 |
lea r1, [r0+r2*2] |
movd mm2, [r0+r2*1-4] |
movd mm3, [r0+r2*2-4] |
movd mm4, [r1+r2*1-4] |
movd mm5, [r1+r2*2-4] |
pshufb mm2, mm6 |
pshufb mm3, mm6 |
pshufb mm4, mm6 |
pshufb mm5, mm6 |
psubw mm2, mm7 |
psubw mm3, mm7 |
psubw mm4, mm7 |
psubw mm5, mm7 |
paddw mm2, mm0 |
paddw mm3, mm0 |
paddw mm4, mm0 |
paddw mm5, mm0 |
packuswb mm2, mm2 |
packuswb mm3, mm3 |
packuswb mm4, mm4 |
packuswb mm5, mm5 |
movd [r0+r2*1], mm2 |
movd [r0+r2*2], mm3 |
movd [r1+r2*1], mm4 |
movd [r1+r2*2], mm5 |
RET |
;----------------------------------------------------------------------------- |
; void pred4x4_vertical_vp8_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
;----------------------------------------------------------------------------- |
INIT_MMX mmxext |
cglobal pred4x4_vertical_vp8_8, 3,3 |
sub r0, r2 |
movd m1, [r0-1] |
movd m0, [r0] |
mova m2, m0 ;t0 t1 t2 t3 |
punpckldq m0, [r1] ;t0 t1 t2 t3 t4 t5 t6 t7 |
lea r1, [r0+r2*2] |
psrlq m0, 8 ;t1 t2 t3 t4 |
PRED4x4_LOWPASS m3, m1, m0, m2, m4 |
movd [r0+r2*1], m3 |
movd [r0+r2*2], m3 |
movd [r1+r2*1], m3 |
movd [r1+r2*2], m3 |
RET |
;----------------------------------------------------------------------------- |
; void pred4x4_down_left_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
;----------------------------------------------------------------------------- |
INIT_MMX mmxext |
cglobal pred4x4_down_left_8, 3,3 |
sub r0, r2 |
movq m1, [r0] |
punpckldq m1, [r1] |
movq m2, m1 |
movq m3, m1 |
psllq m1, 8 |
pxor m2, m1 |
psrlq m2, 8 |
pxor m2, m3 |
PRED4x4_LOWPASS m0, m1, m2, m3, m4 |
lea r1, [r0+r2*2] |
psrlq m0, 8 |
movd [r0+r2*1], m0 |
psrlq m0, 8 |
movd [r0+r2*2], m0 |
psrlq m0, 8 |
movd [r1+r2*1], m0 |
psrlq m0, 8 |
movd [r1+r2*2], m0 |
RET |
;----------------------------------------------------------------------------- |
; void pred4x4_vertical_left_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
;----------------------------------------------------------------------------- |
INIT_MMX mmxext |
cglobal pred4x4_vertical_left_8, 3,3 |
sub r0, r2 |
movq m1, [r0] |
punpckldq m1, [r1] |
movq m3, m1 |
movq m2, m1 |
psrlq m3, 8 |
psrlq m2, 16 |
movq m4, m3 |
pavgb m4, m1 |
PRED4x4_LOWPASS m0, m1, m2, m3, m5 |
lea r1, [r0+r2*2] |
movh [r0+r2*1], m4 |
movh [r0+r2*2], m0 |
psrlq m4, 8 |
psrlq m0, 8 |
movh [r1+r2*1], m4 |
movh [r1+r2*2], m0 |
RET |
;----------------------------------------------------------------------------- |
; void pred4x4_horizontal_up_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
;----------------------------------------------------------------------------- |
INIT_MMX mmxext |
cglobal pred4x4_horizontal_up_8, 3,3 |
sub r0, r2 |
lea r1, [r0+r2*2] |
movd m0, [r0+r2*1-4] |
punpcklbw m0, [r0+r2*2-4] |
movd m1, [r1+r2*1-4] |
punpcklbw m1, [r1+r2*2-4] |
punpckhwd m0, m1 |
movq m1, m0 |
punpckhbw m1, m1 |
pshufw m1, m1, 0xFF |
punpckhdq m0, m1 |
movq m2, m0 |
movq m3, m0 |
movq m7, m0 |
psrlq m2, 16 |
psrlq m3, 8 |
pavgb m7, m3 |
PRED4x4_LOWPASS m4, m0, m2, m3, m5 |
punpcklbw m7, m4 |
movd [r0+r2*1], m7 |
psrlq m7, 16 |
movd [r0+r2*2], m7 |
psrlq m7, 16 |
movd [r1+r2*1], m7 |
movd [r1+r2*2], m1 |
RET |
;----------------------------------------------------------------------------- |
; void pred4x4_horizontal_down_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
;----------------------------------------------------------------------------- |
INIT_MMX mmxext |
cglobal pred4x4_horizontal_down_8, 3,3 |
sub r0, r2 |
lea r1, [r0+r2*2] |
movh m0, [r0-4] ; lt .. |
punpckldq m0, [r0] ; t3 t2 t1 t0 lt .. .. .. |
psllq m0, 8 ; t2 t1 t0 lt .. .. .. .. |
movd m1, [r1+r2*2-4] ; l3 |
punpcklbw m1, [r1+r2*1-4] ; l2 l3 |
movd m2, [r0+r2*2-4] ; l1 |
punpcklbw m2, [r0+r2*1-4] ; l0 l1 |
punpckhwd m1, m2 ; l0 l1 l2 l3 |
punpckhdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 |
movq m0, m1 |
movq m2, m1 |
movq m5, m1 |
psrlq m0, 16 ; .. .. t2 t1 t0 lt l0 l1 |
psrlq m2, 8 ; .. t2 t1 t0 lt l0 l1 l2 |
pavgb m5, m2 |
PRED4x4_LOWPASS m3, m1, m0, m2, m4 |
punpcklbw m5, m3 |
psrlq m3, 32 |
PALIGNR m3, m5, 6, m4 |
movh [r1+r2*2], m5 |
psrlq m5, 16 |
movh [r1+r2*1], m5 |
psrlq m5, 16 |
movh [r0+r2*2], m5 |
movh [r0+r2*1], m3 |
RET |
;----------------------------------------------------------------------------- |
; void pred4x4_vertical_right_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
;----------------------------------------------------------------------------- |
INIT_MMX mmxext |
cglobal pred4x4_vertical_right_8, 3,3 |
sub r0, r2 |
lea r1, [r0+r2*2] |
movh m0, [r0] ; ........t3t2t1t0 |
movq m5, m0 |
PALIGNR m0, [r0-8], 7, m1 ; ......t3t2t1t0lt |
pavgb m5, m0 |
PALIGNR m0, [r0+r2*1-8], 7, m1 ; ....t3t2t1t0ltl0 |
movq m1, m0 |
PALIGNR m0, [r0+r2*2-8], 7, m2 ; ..t3t2t1t0ltl0l1 |
movq m2, m0 |
PALIGNR m0, [r1+r2*1-8], 7, m3 ; t3t2t1t0ltl0l1l2 |
PRED4x4_LOWPASS m3, m1, m0, m2, m4 |
movq m1, m3 |
psrlq m3, 16 |
psllq m1, 48 |
movh [r0+r2*1], m5 |
movh [r0+r2*2], m3 |
PALIGNR m5, m1, 7, m2 |
psllq m1, 8 |
movh [r1+r2*1], m5 |
PALIGNR m3, m1, 7, m1 |
movh [r1+r2*2], m3 |
RET |
;----------------------------------------------------------------------------- |
; void pred4x4_down_right_8_mmxext(uint8_t *src, const uint8_t *topright, int stride) |
;----------------------------------------------------------------------------- |
INIT_MMX mmxext |
cglobal pred4x4_down_right_8, 3,3 |
sub r0, r2 |
lea r1, [r0+r2*2] |
movq m1, [r1-8] |
movq m2, [r0+r2*1-8] |
punpckhbw m2, [r0-8] |
movh m3, [r0] |
punpckhwd m1, m2 |
PALIGNR m3, m1, 5, m1 |
movq m1, m3 |
PALIGNR m3, [r1+r2*1-8], 7, m4 |
movq m2, m3 |
PALIGNR m3, [r1+r2*2-8], 7, m4 |
PRED4x4_LOWPASS m0, m3, m1, m2, m4 |
movh [r1+r2*2], m0 |
psrlq m0, 8 |
movh [r1+r2*1], m0 |
psrlq m0, 8 |
movh [r0+r2*2], m0 |
psrlq m0, 8 |
movh [r0+r2*1], m0 |
RET |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_intrapred_10bit.asm |
---|
0,0 → 1,1199 |
;***************************************************************************** |
;* MMX/SSE2/AVX-optimized 10-bit H.264 intra prediction code |
;***************************************************************************** |
;* Copyright (C) 2005-2011 x264 project |
;* |
;* Authors: Daniel Kang <daniel.d.kang@gmail.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
cextern pw_16 |
cextern pw_8 |
cextern pw_4 |
cextern pw_2 |
cextern pw_1 |
pw_m32101234: dw -3, -2, -1, 0, 1, 2, 3, 4 |
pw_m3: times 8 dw -3 |
pw_pixel_max: times 8 dw ((1 << 10)-1) |
pw_512: times 8 dw 512 |
pd_17: times 4 dd 17 |
pd_16: times 4 dd 16 |
SECTION .text |
; dest, left, right, src |
; output: %1 = (t[n-1] + t[n]*2 + t[n+1] + 2) >> 2 |
%macro PRED4x4_LOWPASS 4 |
paddw %2, %3 |
psrlw %2, 1 |
pavgw %1, %4, %2 |
%endmacro |
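; note: with 10-bit words there is headroom in 16 bits, so the filter is |
; computed directly: %2 becomes (left + right) >> 1 and pavgw rounds it |
; against the centre, giving (l + 2*c + r + 2) >> 2 exactly; %2 is |
; clobbered in the process. |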
;----------------------------------------------------------------------------- |
; void pred4x4_down_right(pixel *src, const pixel *topright, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED4x4_DR 0 |
cglobal pred4x4_down_right_10, 3, 3 |
sub r0, r2 |
lea r1, [r0+r2*2] |
movhps m1, [r1-8] |
movhps m2, [r0+r2*1-8] |
movhps m4, [r0-8] |
punpckhwd m2, m4 |
movq m3, [r0] |
punpckhdq m1, m2 |
PALIGNR m3, m1, 10, m1 |
movhps m4, [r1+r2*1-8] |
PALIGNR m0, m3, m4, 14, m4 |
movhps m4, [r1+r2*2-8] |
PALIGNR m2, m0, m4, 14, m4 |
PRED4x4_LOWPASS m0, m2, m3, m0 |
movq [r1+r2*2], m0 |
psrldq m0, 2 |
movq [r1+r2*1], m0 |
psrldq m0, 2 |
movq [r0+r2*2], m0 |
psrldq m0, 2 |
movq [r0+r2*1], m0 |
RET |
%endmacro |
INIT_XMM sse2 |
PRED4x4_DR |
INIT_XMM ssse3 |
PRED4x4_DR |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
PRED4x4_DR |
%endif |
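; note: the avx build reuses the same macro body; INIT_XMM avx makes |
; x86inc emit VEX encodings (and maps three-operand forms such as |
; "pavgw %1, %4, %2" to real three-operand instructions instead of a |
; mova+op pair). Instantiation is gated on HAVE_AVX_EXTERNAL, i.e. on |
; the assembler supporting AVX. |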
;----------------------------------------------------------------------------- |
; void pred4x4_vertical_right(pixel *src, const pixel *topright, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED4x4_VR 0 |
cglobal pred4x4_vertical_right_10, 3, 3, 6 |
sub r0, r2 |
lea r1, [r0+r2*2] |
movq m5, [r0] ; ........t3t2t1t0 |
movhps m1, [r0-8] |
PALIGNR m0, m5, m1, 14, m1 ; ......t3t2t1t0lt |
pavgw m5, m0 |
movhps m1, [r0+r2*1-8] |
PALIGNR m0, m1, 14, m1 ; ....t3t2t1t0ltl0 |
movhps m2, [r0+r2*2-8] |
PALIGNR m1, m0, m2, 14, m2 ; ..t3t2t1t0ltl0l1 |
movhps m3, [r1+r2*1-8] |
PALIGNR m2, m1, m3, 14, m3 ; t3t2t1t0ltl0l1l2 |
PRED4x4_LOWPASS m1, m0, m2, m1 |
pslldq m0, m1, 12 |
psrldq m1, 4 |
movq [r0+r2*1], m5 |
movq [r0+r2*2], m1 |
PALIGNR m5, m0, 14, m2 |
pslldq m0, 2 |
movq [r1+r2*1], m5 |
PALIGNR m1, m0, 14, m0 |
movq [r1+r2*2], m1 |
RET |
%endmacro |
INIT_XMM sse2 |
PRED4x4_VR |
INIT_XMM ssse3 |
PRED4x4_VR |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
PRED4x4_VR |
%endif |
;----------------------------------------------------------------------------- |
; void pred4x4_horizontal_down(pixel *src, const pixel *topright, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED4x4_HD 0 |
cglobal pred4x4_horizontal_down_10, 3, 3 |
sub r0, r2 |
lea r1, [r0+r2*2] |
movq m0, [r0-8] ; lt .. |
movhps m0, [r0] |
pslldq m0, 2 ; t2 t1 t0 lt .. .. .. .. |
movq m1, [r1+r2*2-8] ; l3 |
movq m3, [r1+r2*1-8] |
punpcklwd m1, m3 ; l2 l3 |
movq m2, [r0+r2*2-8] ; l1 |
movq m3, [r0+r2*1-8] |
punpcklwd m2, m3 ; l0 l1 |
punpckhdq m1, m2 ; l0 l1 l2 l3 |
punpckhqdq m1, m0 ; t2 t1 t0 lt l0 l1 l2 l3 |
psrldq m0, m1, 4 ; .. .. t2 t1 t0 lt l0 l1 |
psrldq m3, m1, 2 ; .. t2 t1 t0 lt l0 l1 l2 |
pavgw m5, m1, m3 |
PRED4x4_LOWPASS m3, m1, m0, m3 |
punpcklwd m5, m3 |
psrldq m3, 8 |
PALIGNR m3, m5, 12, m4 |
movq [r1+r2*2], m5 |
movhps [r0+r2*2], m5 |
psrldq m5, 4 |
movq [r1+r2*1], m5 |
movq [r0+r2*1], m3 |
RET |
%endmacro |
INIT_XMM sse2 |
PRED4x4_HD |
INIT_XMM ssse3 |
PRED4x4_HD |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
PRED4x4_HD |
%endif |
;----------------------------------------------------------------------------- |
; void pred4x4_dc(pixel *src, const pixel *topright, int stride) |
;----------------------------------------------------------------------------- |
%macro HADDD 2 ; sum junk |
%if mmsize == 16 |
movhlps %2, %1 |
paddd %1, %2 |
pshuflw %2, %1, 0xE |
paddd %1, %2 |
%else |
pshufw %2, %1, 0xE |
paddd %1, %2 |
%endif |
%endmacro |
%macro HADDW 2 |
pmaddwd %1, [pw_1] |
HADDD %1, %2 |
%endmacro |
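; note: HADDW folds eight words into dword pair-sums by multiplying with |
; a vector of ones (pmaddwd), then HADDD reduces the packed dwords to a |
; single total in the low element of %1; %2 is a scratch register. |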
INIT_MMX mmxext |
cglobal pred4x4_dc_10, 3, 3 |
sub r0, r2 |
lea r1, [r0+r2*2] |
movq m2, [r0+r2*1-8] |
paddw m2, [r0+r2*2-8] |
paddw m2, [r1+r2*1-8] |
paddw m2, [r1+r2*2-8] |
psrlq m2, 48 |
movq m0, [r0] |
HADDW m0, m1 |
paddw m0, [pw_4] |
paddw m0, m2 |
psrlw m0, 3 |
SPLATW m0, m0, 0 |
movq [r0+r2*1], m0 |
movq [r0+r2*2], m0 |
movq [r1+r2*1], m0 |
movq [r1+r2*2], m0 |
RET |
;----------------------------------------------------------------------------- |
; void pred4x4_down_left(pixel *src, const pixel *topright, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED4x4_DL 0 |
cglobal pred4x4_down_left_10, 3, 3 |
sub r0, r2 |
movq m0, [r0] |
movhps m0, [r1] |
psrldq m2, m0, 2 |
pslldq m3, m0, 2 |
pshufhw m2, m2, 10100100b |
PRED4x4_LOWPASS m0, m3, m2, m0 |
lea r1, [r0+r2*2] |
movhps [r1+r2*2], m0 |
psrldq m0, 2 |
movq [r0+r2*1], m0 |
psrldq m0, 2 |
movq [r0+r2*2], m0 |
psrldq m0, 2 |
movq [r1+r2*1], m0 |
RET |
%endmacro |
INIT_XMM sse2 |
PRED4x4_DL |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
PRED4x4_DL |
%endif |
;----------------------------------------------------------------------------- |
; void pred4x4_vertical_left(pixel *src, const pixel *topright, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED4x4_VL 0 |
cglobal pred4x4_vertical_left_10, 3, 3 |
sub r0, r2 |
movu m1, [r0] |
movhps m1, [r1] |
psrldq m0, m1, 2 |
psrldq m2, m1, 4 |
pavgw m4, m0, m1 |
PRED4x4_LOWPASS m0, m1, m2, m0 |
lea r1, [r0+r2*2] |
movq [r0+r2*1], m4 |
movq [r0+r2*2], m0 |
psrldq m4, 2 |
psrldq m0, 2 |
movq [r1+r2*1], m4 |
movq [r1+r2*2], m0 |
RET |
%endmacro |
INIT_XMM sse2 |
PRED4x4_VL |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
PRED4x4_VL |
%endif |
;----------------------------------------------------------------------------- |
; void pred4x4_horizontal_up(pixel *src, const pixel *topright, int stride) |
;----------------------------------------------------------------------------- |
INIT_MMX mmxext |
cglobal pred4x4_horizontal_up_10, 3, 3 |
sub r0, r2 |
lea r1, [r0+r2*2] |
movq m0, [r0+r2*1-8] |
punpckhwd m0, [r0+r2*2-8] |
movq m1, [r1+r2*1-8] |
punpckhwd m1, [r1+r2*2-8] |
punpckhdq m0, m1 |
pshufw m1, m1, 0xFF |
movq [r1+r2*2], m1 |
movd [r1+r2*1+4], m1 |
pshufw m2, m0, 11111001b |
movq m1, m2 |
pavgw m2, m0 |
pshufw m5, m0, 11111110b |
PRED4x4_LOWPASS m1, m0, m5, m1 |
movq m6, m2 |
punpcklwd m6, m1 |
movq [r0+r2*1], m6 |
psrlq m2, 16 |
psrlq m1, 16 |
punpcklwd m2, m1 |
movq [r0+r2*2], m2 |
psrlq m2, 32 |
movd [r1+r2*1], m2 |
RET |
;----------------------------------------------------------------------------- |
; void pred8x8_vertical(pixel *src, int stride) |
;----------------------------------------------------------------------------- |
INIT_XMM sse2 |
cglobal pred8x8_vertical_10, 2, 2 |
sub r0, r1 |
mova m0, [r0] |
%rep 3 |
mova [r0+r1*1], m0 |
mova [r0+r1*2], m0 |
lea r0, [r0+r1*2] |
%endrep |
mova [r0+r1*1], m0 |
mova [r0+r1*2], m0 |
RET |
;----------------------------------------------------------------------------- |
; void pred8x8_horizontal(pixel *src, int stride) |
;----------------------------------------------------------------------------- |
INIT_XMM sse2 |
cglobal pred8x8_horizontal_10, 2, 3 |
mov r2d, 4 |
.loop: |
movq m0, [r0+r1*0-8] |
movq m1, [r0+r1*1-8] |
pshuflw m0, m0, 0xff |
pshuflw m1, m1, 0xff |
punpcklqdq m0, m0 |
punpcklqdq m1, m1 |
mova [r0+r1*0], m0 |
mova [r0+r1*1], m1 |
lea r0, [r0+r1*2] |
dec r2d |
jg .loop |
REP_RET |
;----------------------------------------------------------------------------- |
; void pred8x8_dc(pixel *src, int stride) |
;----------------------------------------------------------------------------- |
%macro MOV8 2-3 |
; store one 8-pixel (16-byte) row: two movq halves for MMX, one movdqa |
; (ignoring %3) for XMM |
%if mmsize==8 |
movq [%1+0], %2 |
movq [%1+8], %3 |
%else |
movdqa [%1], %2 |
%endif |
%endmacro |
%macro PRED8x8_DC 1 |
cglobal pred8x8_dc_10, 2, 6 |
sub r0, r1 |
pxor m4, m4 |
movq m0, [r0+0] |
movq m1, [r0+8] |
%if mmsize==16 |
punpcklwd m0, m1 |
movhlps m1, m0 |
paddw m0, m1 |
%else |
pshufw m2, m0, 00001110b |
pshufw m3, m1, 00001110b |
paddw m0, m2 |
paddw m1, m3 |
punpcklwd m0, m1 |
%endif |
%1 m2, m0, 00001110b |
paddw m0, m2 |
lea r5, [r1*3] |
lea r4, [r0+r1*4] |
movzx r2d, word [r0+r1*1-2] |
movzx r3d, word [r0+r1*2-2] |
add r2d, r3d |
movzx r3d, word [r0+r5*1-2] |
add r2d, r3d |
movzx r3d, word [r4-2] |
add r2d, r3d |
movd m2, r2d ; s2 |
movzx r2d, word [r4+r1*1-2] |
movzx r3d, word [r4+r1*2-2] |
add r2d, r3d |
movzx r3d, word [r4+r5*1-2] |
add r2d, r3d |
movzx r3d, word [r4+r1*4-2] |
add r2d, r3d |
movd m3, r2d ; s3 |
punpcklwd m2, m3 |
punpckldq m0, m2 ; s0, s1, s2, s3 |
%1 m3, m0, 11110110b ; s2, s1, s3, s3 |
%1 m0, m0, 01110100b ; s0, s1, s3, s1 |
paddw m0, m3 |
psrlw m0, 2 |
pavgw m0, m4 ; s0+s2, s1, s3, s1+s3 |
%if mmsize==16 |
punpcklwd m0, m0 |
pshufd m3, m0, 11111010b |
punpckldq m0, m0 |
SWAP 0,1 |
%else |
pshufw m1, m0, 0x00 |
pshufw m2, m0, 0x55 |
pshufw m3, m0, 0xaa |
pshufw m4, m0, 0xff |
%endif |
MOV8 r0+r1*1, m1, m2 |
MOV8 r0+r1*2, m1, m2 |
MOV8 r0+r5*1, m1, m2 |
MOV8 r0+r1*4, m1, m2 |
MOV8 r4+r1*1, m3, m4 |
MOV8 r4+r1*2, m3, m4 |
MOV8 r4+r5*1, m3, m4 |
MOV8 r4+r1*4, m3, m4 |
RET |
%endmacro |
INIT_MMX mmxext |
PRED8x8_DC pshufw |
INIT_XMM sse2 |
PRED8x8_DC pshuflw |
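; the shuffle op is a macro parameter because plain pshufw is an MMX |
; (mmxext) instruction; the SSE2 build substitutes pshuflw, which |
; applies the same word shuffle to the low half of an XMM register. |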
;----------------------------------------------------------------------------- |
; void pred8x8_top_dc(pixel *src, int stride) |
;----------------------------------------------------------------------------- |
INIT_XMM sse2 |
cglobal pred8x8_top_dc_10, 2, 4 |
sub r0, r1 |
mova m0, [r0] |
pshuflw m1, m0, 0x4e |
pshufhw m1, m1, 0x4e |
paddw m0, m1 |
pshuflw m1, m0, 0xb1 |
pshufhw m1, m1, 0xb1 |
paddw m0, m1 |
lea r2, [r1*3] |
lea r3, [r0+r1*4] |
paddw m0, [pw_2] |
psrlw m0, 2 |
mova [r0+r1*1], m0 |
mova [r0+r1*2], m0 |
mova [r0+r2*1], m0 |
mova [r0+r1*4], m0 |
mova [r3+r1*1], m0 |
mova [r3+r1*2], m0 |
mova [r3+r2*1], m0 |
mova [r3+r1*4], m0 |
RET |
;----------------------------------------------------------------------------- |
; void pred8x8_plane(pixel *src, int stride) |
;----------------------------------------------------------------------------- |
INIT_XMM sse2 |
cglobal pred8x8_plane_10, 2, 7, 7 |
sub r0, r1 |
lea r2, [r1*3] |
lea r3, [r0+r1*4] |
mova m2, [r0] |
pmaddwd m2, [pw_m32101234] |
HADDD m2, m1 |
movd m0, [r0-4] |
psrld m0, 14 |
psubw m2, m0 ; H |
movd m0, [r3+r1*4-4] |
movd m1, [r0+12] |
paddw m0, m1 |
psllw m0, 4 ; 16*(src[7*stride-1] + src[-stride+7]) |
movzx r4d, word [r3+r1*1-2] ; src[4*stride-1] |
movzx r5d, word [r0+r2*1-2] ; src[2*stride-1] |
sub r4d, r5d |
movzx r6d, word [r3+r1*2-2] ; src[5*stride-1] |
movzx r5d, word [r0+r1*2-2] ; src[1*stride-1] |
sub r6d, r5d |
lea r4d, [r4+r6*2] |
movzx r5d, word [r3+r2*1-2] ; src[6*stride-1] |
movzx r6d, word [r0+r1*1-2] ; src[0*stride-1] |
sub r5d, r6d |
lea r5d, [r5*3] |
add r4d, r5d |
movzx r6d, word [r3+r1*4-2] ; src[7*stride-1] |
movzx r5d, word [r0+r1*0-2] ; src[ -stride-1] |
sub r6d, r5d |
lea r4d, [r4+r6*4] |
movd m3, r4d ; V |
punpckldq m2, m3 |
pmaddwd m2, [pd_17] |
paddd m2, [pd_16] |
psrad m2, 5 ; b, c |
mova m3, [pw_pixel_max] |
pxor m1, m1 |
SPLATW m0, m0, 1 |
SPLATW m4, m2, 2 |
SPLATW m2, m2, 0 |
pmullw m2, [pw_m32101234] ; b |
pmullw m5, m4, [pw_m3] ; c |
paddw m5, [pw_16] |
mov r2d, 8 |
add r0, r1 |
.loop: |
paddsw m6, m2, m5 |
paddsw m6, m0 |
psraw m6, 5 |
CLIPW m6, m1, m3 |
mova [r0], m6 |
paddw m5, m4 |
add r0, r1 |
dec r2d |
jg .loop |
REP_RET |
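; note: unlike the 8-bit version, which gets clipping for free from |
; packuswb, the 10-bit loop must clamp explicitly: CLIPW bounds each |
; result to [0, pw_pixel_max] before the row is stored. |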
;----------------------------------------------------------------------------- |
; void pred8x8l_128_dc(pixel *src, int has_topleft, int has_topright, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED8x8L_128_DC 0 |
cglobal pred8x8l_128_dc_10, 4, 4 |
mova m0, [pw_512] ; (1<<(BIT_DEPTH-1)) |
lea r1, [r3*3] |
lea r2, [r0+r3*4] |
MOV8 r0+r3*0, m0, m0 |
MOV8 r0+r3*1, m0, m0 |
MOV8 r0+r3*2, m0, m0 |
MOV8 r0+r1*1, m0, m0 |
MOV8 r2+r3*0, m0, m0 |
MOV8 r2+r3*1, m0, m0 |
MOV8 r2+r3*2, m0, m0 |
MOV8 r2+r1*1, m0, m0 |
RET |
%endmacro |
INIT_MMX mmxext |
PRED8x8L_128_DC |
INIT_XMM sse2 |
PRED8x8L_128_DC |
;----------------------------------------------------------------------------- |
; void pred8x8l_top_dc(pixel *src, int has_topleft, int has_topright, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED8x8L_TOP_DC 0 |
cglobal pred8x8l_top_dc_10, 4, 4, 6 |
sub r0, r3 |
mova m0, [r0] |
shr r1d, 14 |
shr r2d, 13 |
neg r1 |
pslldq m1, m0, 2 |
psrldq m2, m0, 2 |
pinsrw m1, [r0+r1], 0 |
pinsrw m2, [r0+r2+14], 7 |
lea r1, [r3*3] |
lea r2, [r0+r3*4] |
PRED4x4_LOWPASS m0, m2, m1, m0 |
HADDW m0, m1 |
paddw m0, [pw_4] |
psrlw m0, 3 |
SPLATW m0, m0, 0 |
mova [r0+r3*1], m0 |
mova [r0+r3*2], m0 |
mova [r0+r1*1], m0 |
mova [r0+r3*4], m0 |
mova [r2+r3*1], m0 |
mova [r2+r3*2], m0 |
mova [r2+r1*1], m0 |
mova [r2+r3*4], m0 |
RET |
%endmacro |
INIT_XMM sse2 |
PRED8x8L_TOP_DC |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
PRED8x8L_TOP_DC |
%endif |
;----------------------------------------------------------------------------- |
; void pred8x8l_dc(pixel *src, int has_topleft, int has_topright, int stride) |
;----------------------------------------------------------------------------- |
; TODO: see if scalar is faster |
%macro PRED8x8L_DC 0 |
cglobal pred8x8l_dc_10, 4, 6, 6 |
sub r0, r3 |
lea r4, [r0+r3*4] |
lea r5, [r3*3] |
mova m0, [r0+r3*2-16] |
punpckhwd m0, [r0+r3*1-16] |
mova m1, [r4+r3*0-16] |
punpckhwd m1, [r0+r5*1-16] |
punpckhdq m1, m0 |
mova m2, [r4+r3*2-16] |
punpckhwd m2, [r4+r3*1-16] |
mova m3, [r4+r3*4-16] |
punpckhwd m3, [r4+r5*1-16] |
punpckhdq m3, m2 |
punpckhqdq m3, m1 |
mova m0, [r0] |
shr r1d, 14 |
shr r2d, 13 |
neg r1 |
pslldq m1, m0, 2 |
psrldq m2, m0, 2 |
pinsrw m1, [r0+r1], 0 |
pinsrw m2, [r0+r2+14], 7 |
not r1 |
and r1, r3 |
pslldq m4, m3, 2 |
psrldq m5, m3, 2 |
pshuflw m4, m4, 11100101b |
pinsrw m5, [r0+r1-2], 7 |
PRED4x4_LOWPASS m3, m4, m5, m3 |
PRED4x4_LOWPASS m0, m2, m1, m0 |
paddw m0, m3 |
HADDW m0, m1 |
paddw m0, [pw_8] |
psrlw m0, 4 |
SPLATW m0, m0 |
mova [r0+r3*1], m0 |
mova [r0+r3*2], m0 |
mova [r0+r5*1], m0 |
mova [r0+r3*4], m0 |
mova [r4+r3*1], m0 |
mova [r4+r3*2], m0 |
mova [r4+r5*1], m0 |
mova [r4+r3*4], m0 |
RET |
%endmacro |
INIT_XMM sse2 |
PRED8x8L_DC |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
PRED8x8L_DC |
%endif |
;----------------------------------------------------------------------------- |
; void pred8x8l_vertical(pixel *src, int has_topleft, int has_topright, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED8x8L_VERTICAL 0 |
cglobal pred8x8l_vertical_10, 4, 4, 6 |
sub r0, r3 |
mova m0, [r0] |
shr r1d, 14 |
shr r2d, 13 |
neg r1 |
pslldq m1, m0, 2 |
psrldq m2, m0, 2 |
pinsrw m1, [r0+r1], 0 |
pinsrw m2, [r0+r2+14], 7 |
lea r1, [r3*3] |
lea r2, [r0+r3*4] |
PRED4x4_LOWPASS m0, m2, m1, m0 |
mova [r0+r3*1], m0 |
mova [r0+r3*2], m0 |
mova [r0+r1*1], m0 |
mova [r0+r3*4], m0 |
mova [r2+r3*1], m0 |
mova [r2+r3*2], m0 |
mova [r2+r1*1], m0 |
mova [r2+r3*4], m0 |
RET |
%endmacro |
INIT_XMM sse2 |
PRED8x8L_VERTICAL |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
PRED8x8L_VERTICAL |
%endif |
;----------------------------------------------------------------------------- |
; void pred8x8l_horizontal(pixel *src, int has_topleft, int has_topright, int stride)
;----------------------------------------------------------------------------- |
%macro PRED8x8L_HORIZONTAL 0 |
cglobal pred8x8l_horizontal_10, 4, 4, 5 |
mova m0, [r0-16] |
shr r1d, 14 |
dec r1 |
and r1, r3 |
sub r1, r3 |
punpckhwd m0, [r0+r1-16] |
mova m1, [r0+r3*2-16] |
punpckhwd m1, [r0+r3*1-16] |
lea r2, [r0+r3*4] |
lea r1, [r3*3] |
punpckhdq m1, m0 |
mova m2, [r2+r3*0-16] |
punpckhwd m2, [r0+r1-16] |
mova m3, [r2+r3*2-16] |
punpckhwd m3, [r2+r3*1-16] |
punpckhdq m3, m2 |
punpckhqdq m3, m1 |
PALIGNR m4, m3, [r2+r1-16], 14, m0 |
pslldq m0, m4, 2 |
pshuflw m0, m0, 11100101b |
PRED4x4_LOWPASS m4, m3, m0, m4 |
punpckhwd m3, m4, m4 |
punpcklwd m4, m4 |
pshufd m0, m3, 0xff |
pshufd m1, m3, 0xaa |
pshufd m2, m3, 0x55 |
pshufd m3, m3, 0x00 |
mova [r0+r3*0], m0 |
mova [r0+r3*1], m1 |
mova [r0+r3*2], m2 |
mova [r0+r1*1], m3 |
pshufd m0, m4, 0xff |
pshufd m1, m4, 0xaa |
pshufd m2, m4, 0x55 |
pshufd m3, m4, 0x00 |
mova [r2+r3*0], m0 |
mova [r2+r3*1], m1 |
mova [r2+r3*2], m2 |
mova [r2+r1*1], m3 |
RET |
%endmacro |
INIT_XMM sse2 |
PRED8x8L_HORIZONTAL |
INIT_XMM ssse3 |
PRED8x8L_HORIZONTAL |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
PRED8x8L_HORIZONTAL |
%endif |
;----------------------------------------------------------------------------- |
; void pred8x8l_down_left(pixel *src, int has_topleft, int has_topright, int stride)
;----------------------------------------------------------------------------- |
%macro PRED8x8L_DOWN_LEFT 0 |
cglobal pred8x8l_down_left_10, 4, 4, 7 |
sub r0, r3 |
mova m3, [r0] |
shr r1d, 14 |
neg r1 |
shr r2d, 13 |
pslldq m1, m3, 2 |
psrldq m2, m3, 2 |
pinsrw m1, [r0+r1], 0 |
pinsrw m2, [r0+r2+14], 7 |
PRED4x4_LOWPASS m6, m2, m1, m3 |
jz .fix_tr ; flags from shr r2d |
mova m1, [r0+16] |
psrldq m5, m1, 2 |
PALIGNR m2, m1, m3, 14, m3 |
pshufhw m5, m5, 10100100b |
PRED4x4_LOWPASS m1, m2, m5, m1 |
.do_topright: |
lea r1, [r3*3] |
psrldq m5, m1, 14 |
lea r2, [r0+r3*4] |
PALIGNR m2, m1, m6, 2, m0 |
PALIGNR m3, m1, m6, 14, m0 |
PALIGNR m5, m1, 2, m0 |
pslldq m4, m6, 2 |
PRED4x4_LOWPASS m6, m4, m2, m6 |
PRED4x4_LOWPASS m1, m3, m5, m1 |
mova [r2+r3*4], m1 |
PALIGNR m1, m6, 14, m2 |
pslldq m6, 2 |
mova [r2+r1*1], m1 |
PALIGNR m1, m6, 14, m2 |
pslldq m6, 2 |
mova [r2+r3*2], m1 |
PALIGNR m1, m6, 14, m2 |
pslldq m6, 2 |
mova [r2+r3*1], m1 |
PALIGNR m1, m6, 14, m2 |
pslldq m6, 2 |
mova [r0+r3*4], m1 |
PALIGNR m1, m6, 14, m2 |
pslldq m6, 2 |
mova [r0+r1*1], m1 |
PALIGNR m1, m6, 14, m2 |
pslldq m6, 2 |
mova [r0+r3*2], m1 |
PALIGNR m1, m6, 14, m6 |
mova [r0+r3*1], m1 |
RET |
.fix_tr: |
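    ; top-right samples unavailable: broadcast the last top pixel (word 7)
    ; across m1 instead of filtering real neighbours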
punpckhwd m3, m3 |
pshufd m1, m3, 0xFF |
jmp .do_topright |
%endmacro |
INIT_XMM sse2 |
PRED8x8L_DOWN_LEFT |
INIT_XMM ssse3 |
PRED8x8L_DOWN_LEFT |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
PRED8x8L_DOWN_LEFT |
%endif |
;----------------------------------------------------------------------------- |
; void pred8x8l_down_right(pixel *src, int has_topleft, int has_topright, int stride)
;----------------------------------------------------------------------------- |
%macro PRED8x8L_DOWN_RIGHT 0 |
; the standard forbids this mode when has_topleft is false,
; so there is no need to check for it
cglobal pred8x8l_down_right_10, 4, 5, 8 |
sub r0, r3 |
lea r4, [r0+r3*4] |
lea r1, [r3*3] |
mova m0, [r0+r3*1-16] |
punpckhwd m0, [r0+r3*0-16] |
mova m1, [r0+r1*1-16] |
punpckhwd m1, [r0+r3*2-16] |
punpckhdq m1, m0 |
mova m2, [r4+r3*1-16] |
punpckhwd m2, [r4+r3*0-16] |
mova m3, [r4+r1*1-16] |
punpckhwd m3, [r4+r3*2-16] |
punpckhdq m3, m2 |
punpckhqdq m3, m1 |
mova m0, [r4+r3*4-16] |
mova m1, [r0] |
PALIGNR m4, m3, m0, 14, m0 |
PALIGNR m1, m3, 2, m2 |
pslldq m0, m4, 2 |
pshuflw m0, m0, 11100101b |
PRED4x4_LOWPASS m6, m1, m4, m3 |
PRED4x4_LOWPASS m4, m3, m0, m4 |
mova m3, [r0] |
shr r2d, 13 |
pslldq m1, m3, 2 |
psrldq m2, m3, 2 |
pinsrw m1, [r0-2], 0 |
pinsrw m2, [r0+r2+14], 7 |
PRED4x4_LOWPASS m3, m2, m1, m3 |
PALIGNR m2, m3, m6, 2, m0 |
PALIGNR m5, m3, m6, 14, m0 |
psrldq m7, m3, 2 |
PRED4x4_LOWPASS m6, m4, m2, m6 |
PRED4x4_LOWPASS m3, m5, m7, m3 |
mova [r4+r3*4], m6 |
PALIGNR m3, m6, 14, m2 |
pslldq m6, 2 |
mova [r0+r3*1], m3 |
PALIGNR m3, m6, 14, m2 |
pslldq m6, 2 |
mova [r0+r3*2], m3 |
PALIGNR m3, m6, 14, m2 |
pslldq m6, 2 |
mova [r0+r1*1], m3 |
PALIGNR m3, m6, 14, m2 |
pslldq m6, 2 |
mova [r0+r3*4], m3 |
PALIGNR m3, m6, 14, m2 |
pslldq m6, 2 |
mova [r4+r3*1], m3 |
PALIGNR m3, m6, 14, m2 |
pslldq m6, 2 |
mova [r4+r3*2], m3 |
PALIGNR m3, m6, 14, m6 |
mova [r4+r1*1], m3 |
RET |
%endmacro |
INIT_XMM sse2 |
PRED8x8L_DOWN_RIGHT |
INIT_XMM ssse3 |
PRED8x8L_DOWN_RIGHT |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
PRED8x8L_DOWN_RIGHT |
%endif |
;----------------------------------------------------------------------------- |
; void pred8x8l_vertical_right(pixel *src, int has_topleft, int has_topright, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED8x8L_VERTICAL_RIGHT 0 |
; as with pred8x8l_down_right: has_topleft == 0 is forbidden, no check needed
cglobal pred8x8l_vertical_right_10, 4, 5, 7 |
sub r0, r3 |
lea r4, [r0+r3*4] |
lea r1, [r3*3] |
mova m0, [r0+r3*1-16] |
punpckhwd m0, [r0+r3*0-16] |
mova m1, [r0+r1*1-16] |
punpckhwd m1, [r0+r3*2-16] |
punpckhdq m1, m0 |
mova m2, [r4+r3*1-16] |
punpckhwd m2, [r4+r3*0-16] |
mova m3, [r4+r1*1-16] |
punpckhwd m3, [r4+r3*2-16] |
punpckhdq m3, m2 |
punpckhqdq m3, m1 |
mova m0, [r4+r3*4-16] |
mova m1, [r0] |
PALIGNR m4, m3, m0, 14, m0 |
PALIGNR m1, m3, 2, m2 |
PRED4x4_LOWPASS m3, m1, m4, m3 |
mova m2, [r0] |
shr r2d, 13 |
pslldq m1, m2, 2 |
psrldq m5, m2, 2 |
pinsrw m1, [r0-2], 0 |
pinsrw m5, [r0+r2+14], 7 |
PRED4x4_LOWPASS m2, m5, m1, m2 |
PALIGNR m6, m2, m3, 12, m1 |
PALIGNR m5, m2, m3, 14, m0 |
PRED4x4_LOWPASS m0, m6, m2, m5 |
pavgw m2, m5 |
mova [r0+r3*2], m0 |
mova [r0+r3*1], m2 |
pslldq m6, m3, 4 |
pslldq m1, m3, 2 |
PRED4x4_LOWPASS m1, m3, m6, m1 |
PALIGNR m2, m1, 14, m4 |
mova [r0+r1*1], m2 |
pslldq m1, 2 |
PALIGNR m0, m1, 14, m3 |
mova [r0+r3*4], m0 |
pslldq m1, 2 |
PALIGNR m2, m1, 14, m4 |
mova [r4+r3*1], m2 |
pslldq m1, 2 |
PALIGNR m0, m1, 14, m3 |
mova [r4+r3*2], m0 |
pslldq m1, 2 |
PALIGNR m2, m1, 14, m4 |
mova [r4+r1*1], m2 |
pslldq m1, 2 |
PALIGNR m0, m1, 14, m1 |
mova [r4+r3*4], m0 |
RET |
%endmacro |
INIT_XMM sse2 |
PRED8x8L_VERTICAL_RIGHT |
INIT_XMM ssse3 |
PRED8x8L_VERTICAL_RIGHT |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
PRED8x8L_VERTICAL_RIGHT |
%endif |
;----------------------------------------------------------------------------- |
; void pred8x8l_horizontal_up(pixel *src, int has_topleft, int has_topright, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED8x8L_HORIZONTAL_UP 0 |
cglobal pred8x8l_horizontal_up_10, 4, 4, 6 |
mova m0, [r0+r3*0-16] |
punpckhwd m0, [r0+r3*1-16] |
shr r1d, 14 |
dec r1 |
and r1, r3 |
sub r1, r3 |
mova m4, [r0+r1*1-16] |
lea r1, [r3*3] |
lea r2, [r0+r3*4] |
mova m1, [r0+r3*2-16] |
punpckhwd m1, [r0+r1*1-16] |
punpckhdq m0, m1 |
mova m2, [r2+r3*0-16] |
punpckhwd m2, [r2+r3*1-16] |
mova m3, [r2+r3*2-16] |
punpckhwd m3, [r2+r1*1-16] |
punpckhdq m2, m3 |
punpckhqdq m0, m2 |
PALIGNR m1, m0, m4, 14, m4 |
psrldq m2, m0, 2 |
pshufhw m2, m2, 10100100b |
PRED4x4_LOWPASS m0, m1, m2, m0 |
psrldq m1, m0, 2 |
psrldq m2, m0, 4 |
pshufhw m1, m1, 10100100b |
pshufhw m2, m2, 01010100b |
pavgw m4, m0, m1 |
PRED4x4_LOWPASS m1, m2, m0, m1 |
punpckhwd m5, m4, m1 |
punpcklwd m4, m1 |
mova [r2+r3*0], m5 |
mova [r0+r3*0], m4 |
pshufd m0, m5, 11111001b |
pshufd m1, m5, 11111110b |
pshufd m2, m5, 11111111b |
mova [r2+r3*1], m0 |
mova [r2+r3*2], m1 |
mova [r2+r1*1], m2 |
PALIGNR m2, m5, m4, 4, m0 |
PALIGNR m3, m5, m4, 8, m1 |
PALIGNR m5, m5, m4, 12, m4 |
mova [r0+r3*1], m2 |
mova [r0+r3*2], m3 |
mova [r0+r1*1], m5 |
RET |
%endmacro |
INIT_XMM sse2 |
PRED8x8L_HORIZONTAL_UP |
INIT_XMM ssse3 |
PRED8x8L_HORIZONTAL_UP |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
PRED8x8L_HORIZONTAL_UP |
%endif |
;----------------------------------------------------------------------------- |
; void pred16x16_vertical(pixel *src, int stride) |
;----------------------------------------------------------------------------- |
%macro MOV16 3-5 |
mova [%1+ 0], %2 |
mova [%1+mmsize], %3 |
%if mmsize==8 |
mova [%1+ 16], %4 |
mova [%1+ 24], %5 |
%endif |
%endmacro |
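; MOV16 stores one 16-pixel row (32 bytes at 10-bit): two stores with XMM
; registers, or four when mmsize == 8 (MMX).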
%macro PRED16x16_VERTICAL 0 |
cglobal pred16x16_vertical_10, 2, 3 |
sub r0, r1 |
mov r2d, 8 |
mova m0, [r0+ 0] |
mova m1, [r0+mmsize] |
%if mmsize==8 |
mova m2, [r0+16] |
mova m3, [r0+24] |
%endif |
.loop: |
MOV16 r0+r1*1, m0, m1, m2, m3 |
MOV16 r0+r1*2, m0, m1, m2, m3 |
lea r0, [r0+r1*2] |
dec r2d |
jg .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PRED16x16_VERTICAL |
INIT_XMM sse2 |
PRED16x16_VERTICAL |
;----------------------------------------------------------------------------- |
; void pred16x16_horizontal(pixel *src, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED16x16_HORIZONTAL 0 |
cglobal pred16x16_horizontal_10, 2, 3 |
mov r2d, 8 |
.vloop: |
movd m0, [r0+r1*0-4] |
movd m1, [r0+r1*1-4] |
SPLATW m0, m0, 1 |
SPLATW m1, m1, 1 |
MOV16 r0+r1*0, m0, m0, m0, m0 |
MOV16 r0+r1*1, m1, m1, m1, m1 |
lea r0, [r0+r1*2] |
dec r2d |
jg .vloop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PRED16x16_HORIZONTAL |
INIT_XMM sse2 |
PRED16x16_HORIZONTAL |
;----------------------------------------------------------------------------- |
; void pred16x16_dc(pixel *src, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED16x16_DC 0 |
cglobal pred16x16_dc_10, 2, 6 |
mov r5, r0 |
sub r0, r1 |
mova m0, [r0+0] |
paddw m0, [r0+mmsize] |
%if mmsize==8 |
paddw m0, [r0+16] |
paddw m0, [r0+24] |
%endif |
HADDW m0, m2 |
lea r0, [r0+r1-2] |
movzx r3d, word [r0] |
movzx r4d, word [r0+r1] |
%rep 7 |
lea r0, [r0+r1*2] |
movzx r2d, word [r0] |
add r3d, r2d |
movzx r2d, word [r0+r1] |
add r4d, r2d |
%endrep |
lea r3d, [r3+r4+16] |
movd m1, r3d |
paddw m0, m1 |
psrlw m0, 5 |
SPLATW m0, m0 |
mov r3d, 8 |
.loop: |
MOV16 r5+r1*0, m0, m0, m0, m0 |
MOV16 r5+r1*1, m0, m0, m0, m0 |
lea r5, [r5+r1*2] |
dec r3d |
jg .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PRED16x16_DC |
INIT_XMM sse2 |
PRED16x16_DC |
;----------------------------------------------------------------------------- |
; void pred16x16_top_dc(pixel *src, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED16x16_TOP_DC 0 |
cglobal pred16x16_top_dc_10, 2, 3 |
sub r0, r1 |
mova m0, [r0+0] |
paddw m0, [r0+mmsize] |
%if mmsize==8 |
paddw m0, [r0+16] |
paddw m0, [r0+24] |
%endif |
HADDW m0, m2 |
SPLATW m0, m0 |
paddw m0, [pw_8] |
psrlw m0, 4 |
mov r2d, 8 |
.loop: |
MOV16 r0+r1*1, m0, m0, m0, m0 |
MOV16 r0+r1*2, m0, m0, m0, m0 |
lea r0, [r0+r1*2] |
dec r2d |
jg .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PRED16x16_TOP_DC |
INIT_XMM sse2 |
PRED16x16_TOP_DC |
;----------------------------------------------------------------------------- |
; void pred16x16_left_dc(pixel *src, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED16x16_LEFT_DC 0 |
cglobal pred16x16_left_dc_10, 2, 6 |
mov r5, r0 |
sub r0, 2 |
movzx r3d, word [r0] |
movzx r4d, word [r0+r1] |
%rep 7 |
lea r0, [r0+r1*2] |
movzx r2d, word [r0] |
add r3d, r2d |
movzx r2d, word [r0+r1] |
add r4d, r2d |
%endrep |
lea r3d, [r3+r4+8] |
shr r3d, 4 |
movd m0, r3d |
SPLATW m0, m0 |
mov r3d, 8 |
.loop: |
MOV16 r5+r1*0, m0, m0, m0, m0 |
MOV16 r5+r1*1, m0, m0, m0, m0 |
lea r5, [r5+r1*2] |
dec r3d |
jg .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PRED16x16_LEFT_DC |
INIT_XMM sse2 |
PRED16x16_LEFT_DC |
;----------------------------------------------------------------------------- |
; void pred16x16_128_dc(pixel *src, int stride) |
;----------------------------------------------------------------------------- |
%macro PRED16x16_128_DC 0 |
cglobal pred16x16_128_dc_10, 2,3 |
mova m0, [pw_512] |
mov r2d, 8 |
.loop: |
MOV16 r0+r1*0, m0, m0, m0, m0 |
MOV16 r0+r1*1, m0, m0, m0, m0 |
lea r0, [r0+r1*2] |
dec r2d |
jg .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PRED16x16_128_DC |
INIT_XMM sse2 |
PRED16x16_128_DC |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_intrapred_init.c |
---|
0,0 → 1,402 |
/* |
* Copyright (c) 2010 Jason Garrett-Glaser |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/attributes.h" |
#include "libavutil/cpu.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/avcodec.h" |
#include "libavcodec/h264pred.h" |
#define PRED4x4(TYPE, DEPTH, OPT) \ |
void ff_pred4x4_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \ |
const uint8_t *topright, \ |
ptrdiff_t stride); |
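/* Each PRED4x4() line below expands to one prototype; for illustration,
 * PRED4x4(dc, 10, mmxext) declares
 * void ff_pred4x4_dc_10_mmxext(uint8_t *src, const uint8_t *topright,
 *                              ptrdiff_t stride); */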
PRED4x4(dc, 10, mmxext) |
PRED4x4(down_left, 10, sse2) |
PRED4x4(down_left, 10, avx) |
PRED4x4(down_right, 10, sse2) |
PRED4x4(down_right, 10, ssse3) |
PRED4x4(down_right, 10, avx) |
PRED4x4(vertical_left, 10, sse2) |
PRED4x4(vertical_left, 10, avx) |
PRED4x4(vertical_right, 10, sse2) |
PRED4x4(vertical_right, 10, ssse3) |
PRED4x4(vertical_right, 10, avx) |
PRED4x4(horizontal_up, 10, mmxext) |
PRED4x4(horizontal_down, 10, sse2) |
PRED4x4(horizontal_down, 10, ssse3) |
PRED4x4(horizontal_down, 10, avx) |
#define PRED8x8(TYPE, DEPTH, OPT) \ |
void ff_pred8x8_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \ |
ptrdiff_t stride); |
PRED8x8(dc, 10, mmxext) |
PRED8x8(dc, 10, sse2) |
PRED8x8(top_dc, 10, sse2) |
PRED8x8(plane, 10, sse2) |
PRED8x8(vertical, 10, sse2) |
PRED8x8(horizontal, 10, sse2) |
#define PRED8x8L(TYPE, DEPTH, OPT)\ |
void ff_pred8x8l_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \ |
int has_topleft, \ |
int has_topright, \ |
ptrdiff_t stride); |
PRED8x8L(dc, 10, sse2) |
PRED8x8L(dc, 10, avx) |
PRED8x8L(128_dc, 10, mmxext) |
PRED8x8L(128_dc, 10, sse2) |
PRED8x8L(top_dc, 10, sse2) |
PRED8x8L(top_dc, 10, avx) |
PRED8x8L(vertical, 10, sse2) |
PRED8x8L(vertical, 10, avx) |
PRED8x8L(horizontal, 10, sse2) |
PRED8x8L(horizontal, 10, ssse3) |
PRED8x8L(horizontal, 10, avx) |
PRED8x8L(down_left, 10, sse2) |
PRED8x8L(down_left, 10, ssse3) |
PRED8x8L(down_left, 10, avx) |
PRED8x8L(down_right, 10, sse2) |
PRED8x8L(down_right, 10, ssse3) |
PRED8x8L(down_right, 10, avx) |
PRED8x8L(vertical_right, 10, sse2) |
PRED8x8L(vertical_right, 10, ssse3) |
PRED8x8L(vertical_right, 10, avx) |
PRED8x8L(horizontal_up, 10, sse2) |
PRED8x8L(horizontal_up, 10, ssse3) |
PRED8x8L(horizontal_up, 10, avx) |
#define PRED16x16(TYPE, DEPTH, OPT)\ |
void ff_pred16x16_ ## TYPE ## _ ## DEPTH ## _ ## OPT (uint8_t *src, \ |
ptrdiff_t stride); |
PRED16x16(dc, 10, mmxext) |
PRED16x16(dc, 10, sse2) |
PRED16x16(top_dc, 10, mmxext) |
PRED16x16(top_dc, 10, sse2) |
PRED16x16(128_dc, 10, mmxext) |
PRED16x16(128_dc, 10, sse2) |
PRED16x16(left_dc, 10, mmxext) |
PRED16x16(left_dc, 10, sse2) |
PRED16x16(vertical, 10, mmxext) |
PRED16x16(vertical, 10, sse2) |
PRED16x16(horizontal, 10, mmxext) |
PRED16x16(horizontal, 10, sse2) |
/* 8-bit versions */ |
PRED16x16(vertical, 8, mmx) |
PRED16x16(vertical, 8, sse) |
PRED16x16(horizontal, 8, mmx) |
PRED16x16(horizontal, 8, mmxext) |
PRED16x16(horizontal, 8, ssse3) |
PRED16x16(dc, 8, mmxext) |
PRED16x16(dc, 8, sse2) |
PRED16x16(dc, 8, ssse3) |
PRED16x16(plane_h264, 8, mmx) |
PRED16x16(plane_h264, 8, mmxext) |
PRED16x16(plane_h264, 8, sse2) |
PRED16x16(plane_h264, 8, ssse3) |
PRED16x16(plane_rv40, 8, mmx) |
PRED16x16(plane_rv40, 8, mmxext) |
PRED16x16(plane_rv40, 8, sse2) |
PRED16x16(plane_rv40, 8, ssse3) |
PRED16x16(plane_svq3, 8, mmx) |
PRED16x16(plane_svq3, 8, mmxext) |
PRED16x16(plane_svq3, 8, sse2) |
PRED16x16(plane_svq3, 8, ssse3) |
PRED16x16(tm_vp8, 8, mmx) |
PRED16x16(tm_vp8, 8, mmxext) |
PRED16x16(tm_vp8, 8, sse2) |
PRED8x8(top_dc, 8, mmxext) |
PRED8x8(dc_rv40, 8, mmxext) |
PRED8x8(dc, 8, mmxext) |
PRED8x8(vertical, 8, mmx) |
PRED8x8(horizontal, 8, mmx) |
PRED8x8(horizontal, 8, mmxext) |
PRED8x8(horizontal, 8, ssse3) |
PRED8x8(plane, 8, mmx) |
PRED8x8(plane, 8, mmxext) |
PRED8x8(plane, 8, sse2) |
PRED8x8(plane, 8, ssse3) |
PRED8x8(tm_vp8, 8, mmx) |
PRED8x8(tm_vp8, 8, mmxext) |
PRED8x8(tm_vp8, 8, sse2) |
PRED8x8(tm_vp8, 8, ssse3) |
PRED8x8L(top_dc, 8, mmxext) |
PRED8x8L(top_dc, 8, ssse3) |
PRED8x8L(dc, 8, mmxext) |
PRED8x8L(dc, 8, ssse3) |
PRED8x8L(horizontal, 8, mmxext) |
PRED8x8L(horizontal, 8, ssse3) |
PRED8x8L(vertical, 8, mmxext) |
PRED8x8L(vertical, 8, ssse3) |
PRED8x8L(down_left, 8, mmxext) |
PRED8x8L(down_left, 8, sse2) |
PRED8x8L(down_left, 8, ssse3) |
PRED8x8L(down_right, 8, mmxext) |
PRED8x8L(down_right, 8, sse2) |
PRED8x8L(down_right, 8, ssse3) |
PRED8x8L(vertical_right, 8, mmxext) |
PRED8x8L(vertical_right, 8, sse2) |
PRED8x8L(vertical_right, 8, ssse3) |
PRED8x8L(vertical_left, 8, sse2) |
PRED8x8L(vertical_left, 8, ssse3) |
PRED8x8L(horizontal_up, 8, mmxext) |
PRED8x8L(horizontal_up, 8, ssse3) |
PRED8x8L(horizontal_down, 8, mmxext) |
PRED8x8L(horizontal_down, 8, sse2) |
PRED8x8L(horizontal_down, 8, ssse3) |
PRED4x4(dc, 8, mmxext) |
PRED4x4(down_left, 8, mmxext) |
PRED4x4(down_right, 8, mmxext) |
PRED4x4(vertical_left, 8, mmxext) |
PRED4x4(vertical_right, 8, mmxext) |
PRED4x4(horizontal_up, 8, mmxext) |
PRED4x4(horizontal_down, 8, mmxext) |
PRED4x4(tm_vp8, 8, mmx) |
PRED4x4(tm_vp8, 8, mmxext) |
PRED4x4(tm_vp8, 8, ssse3) |
PRED4x4(vertical_vp8, 8, mmxext) |
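/* The init routine below installs function pointers in order of increasing
 * CPU capability (MMX, MMXEXT, SSE/SSE2, SSSE3, AVX), so when several flags
 * are set the later, more capable assignment wins. */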
av_cold void ff_h264_pred_init_x86(H264PredContext *h, int codec_id, |
const int bit_depth, |
const int chroma_format_idc) |
{ |
int cpu_flags = av_get_cpu_flags(); |
if (bit_depth == 8) { |
if (EXTERNAL_MMX(cpu_flags)) { |
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_8_mmx; |
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmx; |
if (chroma_format_idc == 1) { |
h->pred8x8 [VERT_PRED8x8 ] = ff_pred8x8_vertical_8_mmx; |
h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmx; |
} |
if (codec_id == AV_CODEC_ID_VP8) { |
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmx; |
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmx; |
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmx; |
} else { |
if (chroma_format_idc == 1) |
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmx; |
if (codec_id == AV_CODEC_ID_SVQ3) { |
if (cpu_flags & AV_CPU_FLAG_CMOV) |
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_mmx; |
} else if (codec_id == AV_CODEC_ID_RV40) { |
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_mmx; |
} else { |
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_mmx; |
} |
} |
} |
if (EXTERNAL_MMXEXT(cpu_flags)) { |
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_mmxext; |
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_mmxext; |
if (chroma_format_idc == 1) |
h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_mmxext; |
h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_mmxext; |
h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_mmxext; |
h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_mmxext; |
h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_mmxext; |
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_mmxext; |
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_mmxext; |
h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_mmxext; |
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_mmxext; |
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_mmxext; |
h->pred4x4 [DIAG_DOWN_RIGHT_PRED ] = ff_pred4x4_down_right_8_mmxext; |
h->pred4x4 [VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_8_mmxext; |
h->pred4x4 [HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_8_mmxext; |
h->pred4x4 [DC_PRED ] = ff_pred4x4_dc_8_mmxext; |
if (codec_id == AV_CODEC_ID_VP8 || codec_id == AV_CODEC_ID_H264) { |
h->pred4x4 [DIAG_DOWN_LEFT_PRED] = ff_pred4x4_down_left_8_mmxext; |
} |
if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) { |
h->pred4x4 [VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_8_mmxext; |
} |
if (codec_id != AV_CODEC_ID_RV40) { |
h->pred4x4 [HOR_UP_PRED ] = ff_pred4x4_horizontal_up_8_mmxext; |
} |
if (codec_id == AV_CODEC_ID_SVQ3 || codec_id == AV_CODEC_ID_H264) { |
if (chroma_format_idc == 1) { |
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_8_mmxext; |
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_8_mmxext; |
} |
} |
if (codec_id == AV_CODEC_ID_VP8) { |
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_mmxext; |
h->pred8x8 [DC_PRED8x8 ] = ff_pred8x8_dc_rv40_8_mmxext; |
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_mmxext; |
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_mmxext; |
h->pred4x4 [VERT_PRED ] = ff_pred4x4_vertical_vp8_8_mmxext; |
} else { |
if (chroma_format_idc == 1) |
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_mmxext; |
if (codec_id == AV_CODEC_ID_SVQ3) { |
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_svq3_8_mmxext; |
} else if (codec_id == AV_CODEC_ID_RV40) { |
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_rv40_8_mmxext; |
} else { |
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_plane_h264_8_mmxext; |
} |
} |
} |
if (EXTERNAL_SSE(cpu_flags)) { |
h->pred16x16[VERT_PRED8x8] = ff_pred16x16_vertical_8_sse; |
} |
if (EXTERNAL_SSE2(cpu_flags)) { |
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_sse2; |
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_sse2; |
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_sse2; |
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_sse2; |
h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_sse2; |
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_sse2; |
if (codec_id == AV_CODEC_ID_VP8) { |
h->pred16x16[PLANE_PRED8x8 ] = ff_pred16x16_tm_vp8_8_sse2; |
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_sse2; |
} else { |
if (chroma_format_idc == 1) |
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_sse2; |
if (codec_id == AV_CODEC_ID_SVQ3) { |
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_sse2; |
} else if (codec_id == AV_CODEC_ID_RV40) { |
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_sse2; |
} else { |
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_sse2; |
} |
} |
} |
if (EXTERNAL_SSSE3(cpu_flags)) { |
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_8_ssse3; |
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_8_ssse3; |
if (chroma_format_idc == 1) |
h->pred8x8 [HOR_PRED8x8 ] = ff_pred8x8_horizontal_8_ssse3; |
h->pred8x8l [TOP_DC_PRED ] = ff_pred8x8l_top_dc_8_ssse3; |
h->pred8x8l [DC_PRED ] = ff_pred8x8l_dc_8_ssse3; |
h->pred8x8l [HOR_PRED ] = ff_pred8x8l_horizontal_8_ssse3; |
h->pred8x8l [VERT_PRED ] = ff_pred8x8l_vertical_8_ssse3; |
h->pred8x8l [DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_8_ssse3; |
h->pred8x8l [DIAG_DOWN_RIGHT_PRED ] = ff_pred8x8l_down_right_8_ssse3; |
h->pred8x8l [VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_8_ssse3; |
h->pred8x8l [VERT_LEFT_PRED ] = ff_pred8x8l_vertical_left_8_ssse3; |
h->pred8x8l [HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_8_ssse3; |
h->pred8x8l [HOR_DOWN_PRED ] = ff_pred8x8l_horizontal_down_8_ssse3; |
if (codec_id == AV_CODEC_ID_VP8) { |
h->pred8x8 [PLANE_PRED8x8 ] = ff_pred8x8_tm_vp8_8_ssse3; |
h->pred4x4 [TM_VP8_PRED ] = ff_pred4x4_tm_vp8_8_ssse3; |
} else { |
if (chroma_format_idc == 1) |
h->pred8x8 [PLANE_PRED8x8] = ff_pred8x8_plane_8_ssse3; |
if (codec_id == AV_CODEC_ID_SVQ3) { |
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_svq3_8_ssse3; |
} else if (codec_id == AV_CODEC_ID_RV40) { |
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_rv40_8_ssse3; |
} else { |
h->pred16x16[PLANE_PRED8x8] = ff_pred16x16_plane_h264_8_ssse3; |
} |
} |
} |
} else if (bit_depth == 10) { |
if (EXTERNAL_MMXEXT(cpu_flags)) { |
h->pred4x4[DC_PRED ] = ff_pred4x4_dc_10_mmxext; |
h->pred4x4[HOR_UP_PRED ] = ff_pred4x4_horizontal_up_10_mmxext; |
if (chroma_format_idc == 1) |
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_mmxext; |
h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_mmxext; |
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_mmxext; |
h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_mmxext; |
h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_mmxext; |
h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_mmxext; |
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_mmxext; |
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_mmxext; |
} |
if (EXTERNAL_SSE2(cpu_flags)) { |
h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_sse2; |
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_sse2; |
h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_sse2; |
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_sse2; |
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_sse2; |
if (chroma_format_idc == 1) { |
h->pred8x8[DC_PRED8x8 ] = ff_pred8x8_dc_10_sse2; |
h->pred8x8[TOP_DC_PRED8x8 ] = ff_pred8x8_top_dc_10_sse2; |
h->pred8x8[PLANE_PRED8x8 ] = ff_pred8x8_plane_10_sse2; |
h->pred8x8[VERT_PRED8x8 ] = ff_pred8x8_vertical_10_sse2; |
h->pred8x8[HOR_PRED8x8 ] = ff_pred8x8_horizontal_10_sse2; |
} |
h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_sse2; |
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_sse2; |
h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_sse2; |
h->pred8x8l[DC_128_PRED ] = ff_pred8x8l_128_dc_10_sse2; |
h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_sse2; |
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_sse2; |
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_sse2; |
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_sse2; |
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_sse2; |
h->pred16x16[DC_PRED8x8 ] = ff_pred16x16_dc_10_sse2; |
h->pred16x16[TOP_DC_PRED8x8 ] = ff_pred16x16_top_dc_10_sse2; |
h->pred16x16[DC_128_PRED8x8 ] = ff_pred16x16_128_dc_10_sse2; |
h->pred16x16[LEFT_DC_PRED8x8 ] = ff_pred16x16_left_dc_10_sse2; |
h->pred16x16[VERT_PRED8x8 ] = ff_pred16x16_vertical_10_sse2; |
h->pred16x16[HOR_PRED8x8 ] = ff_pred16x16_horizontal_10_sse2; |
} |
if (EXTERNAL_SSSE3(cpu_flags)) { |
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_ssse3; |
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_ssse3; |
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_ssse3; |
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_ssse3; |
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_ssse3; |
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_ssse3; |
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_ssse3; |
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_ssse3; |
} |
if (EXTERNAL_AVX(cpu_flags)) { |
h->pred4x4[DIAG_DOWN_LEFT_PRED ] = ff_pred4x4_down_left_10_avx; |
h->pred4x4[DIAG_DOWN_RIGHT_PRED] = ff_pred4x4_down_right_10_avx; |
h->pred4x4[VERT_LEFT_PRED ] = ff_pred4x4_vertical_left_10_avx; |
h->pred4x4[VERT_RIGHT_PRED ] = ff_pred4x4_vertical_right_10_avx; |
h->pred4x4[HOR_DOWN_PRED ] = ff_pred4x4_horizontal_down_10_avx; |
h->pred8x8l[VERT_PRED ] = ff_pred8x8l_vertical_10_avx; |
h->pred8x8l[HOR_PRED ] = ff_pred8x8l_horizontal_10_avx; |
h->pred8x8l[DC_PRED ] = ff_pred8x8l_dc_10_avx; |
h->pred8x8l[TOP_DC_PRED ] = ff_pred8x8l_top_dc_10_avx; |
h->pred8x8l[DIAG_DOWN_RIGHT_PRED] = ff_pred8x8l_down_right_10_avx; |
h->pred8x8l[DIAG_DOWN_LEFT_PRED ] = ff_pred8x8l_down_left_10_avx; |
h->pred8x8l[VERT_RIGHT_PRED ] = ff_pred8x8l_vertical_right_10_avx; |
h->pred8x8l[HOR_UP_PRED ] = ff_pred8x8l_horizontal_up_10_avx; |
} |
} |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_qpel.c |
---|
0,0 → 1,634 |
/* |
* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt |
* Copyright (c) 2011 Daniel Kang |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/attributes.h" |
#include "libavutil/cpu.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/h264qpel.h" |
#include "libavcodec/mpegvideo.h" |
#include "dsputil_x86.h" |
#if HAVE_YASM |
void ff_put_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_avg_pixels4_mmxext(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, |
int dstStride, int src1Stride, int h); |
void ff_avg_pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, |
int dstStride, int src1Stride, int h); |
void ff_put_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, |
int dstStride, int src1Stride, int h); |
void ff_avg_pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, |
int dstStride, int src1Stride, int h); |
void ff_put_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, |
int dstStride, int src1Stride, int h); |
void ff_avg_pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, |
int dstStride, int src1Stride, int h); |
#define ff_put_pixels8_l2_sse2 ff_put_pixels8_l2_mmxext |
#define ff_avg_pixels8_l2_sse2 ff_avg_pixels8_l2_mmxext |
#define ff_put_pixels16_l2_sse2 ff_put_pixels16_l2_mmxext |
#define ff_avg_pixels16_l2_sse2 ff_avg_pixels16_l2_mmxext |
PIXELS16(static, ff_avg, , , _mmxext) |
PIXELS16(static, ff_put, , , _mmxext) |
#define DEF_QPEL(OPNAME)\ |
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ |
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ |
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_ssse3(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ |
void ff_ ## OPNAME ## _h264_qpel4_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\ |
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_mmxext(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\ |
void ff_ ## OPNAME ## _h264_qpel8_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride);\ |
void ff_ ## OPNAME ## _h264_qpel4_v_lowpass_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride);\ |
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_op_mmxext(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\ |
void ff_ ## OPNAME ## _h264_qpel8or16_v_lowpass_sse2(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h);\ |
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_v_mmxext(uint8_t *src, int16_t *tmp, int srcStride);\ |
void ff_ ## OPNAME ## _h264_qpel4_hv_lowpass_h_mmxext(int16_t *tmp, uint8_t *dst, int dstStride);\ |
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_mmxext(uint8_t *src, int16_t *tmp, int srcStride, int size);\ |
void ff_ ## OPNAME ## _h264_qpel8or16_hv1_lowpass_op_sse2(uint8_t *src, int16_t *tmp, int srcStride, int size);\ |
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_op_mmxext(uint8_t *dst, int16_t *tmp, int dstStride, int unused, int h);\ |
void ff_ ## OPNAME ## _h264_qpel8or16_hv2_lowpass_ssse3(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size);\ |
void ff_ ## OPNAME ## _pixels4_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h);\ |
void ff_ ## OPNAME ## _pixels8_l2_shift5_mmxext(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h); |
DEF_QPEL(avg) |
DEF_QPEL(put) |
#define QPEL_H264(OPNAME, OP, MMX)\ |
static av_always_inline void ff_ ## OPNAME ## h264_qpel4_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ |
int w=3;\ |
src -= 2*srcStride+2;\ |
while(w--){\ |
ff_ ## OPNAME ## h264_qpel4_hv_lowpass_v_mmxext(src, tmp, srcStride);\ |
tmp += 4;\ |
src += 4;\ |
}\ |
tmp -= 3*4;\ |
ff_ ## OPNAME ## h264_qpel4_hv_lowpass_h_mmxext(tmp, dst, dstStride);\ |
}\ |
\ |
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\ |
src -= 2*srcStride;\ |
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\ |
src += 4;\ |
dst += 4;\ |
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_op_mmxext(dst, src, dstStride, srcStride, h);\ |
}\ |
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_ ## MMX(int16_t *tmp, uint8_t *src, int tmpStride, int srcStride, int size){\ |
int w = (size+8)>>2;\ |
src -= 2*srcStride+2;\ |
while(w--){\ |
ff_ ## OPNAME ## h264_qpel8or16_hv1_lowpass_op_mmxext(src, tmp, srcStride, size);\ |
tmp += 4;\ |
src += 4;\ |
}\ |
}\ |
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, int dstStride, int tmpStride, int size){\ |
int w = size>>4;\ |
do{\ |
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_op_mmxext(dst, tmp, dstStride, 0, size);\ |
tmp += 8;\ |
dst += 8;\ |
}while(w--);\ |
}\ |
\ |
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ |
}\ |
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ |
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ |
}\ |
\ |
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ |
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ |
src += 8*srcStride;\ |
dst += 8*dstStride;\ |
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ |
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ |
}\ |
\ |
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ |
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ |
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ |
src += 8*dstStride;\ |
dst += 8*dstStride;\ |
src2 += 8*src2Stride;\ |
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ |
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ |
}\ |
\ |
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ |
ff_put_h264_qpel8or16_hv1_lowpass_ ## MMX(tmp, src, tmpStride, srcStride, size);\ |
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ |
}\ |
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ |
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 8);\ |
}\ |
\ |
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ |
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst , tmp , src , dstStride, tmpStride, srcStride, 16);\ |
}\ |
\ |
static av_always_inline void ff_ ## OPNAME ## pixels16_l2_shift5_ ## MMX(uint8_t *dst, int16_t *src16, uint8_t *src8, int dstStride, int src8Stride, int h)\ |
{\ |
ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst , src16 , src8 , dstStride, src8Stride, h);\ |
ff_ ## OPNAME ## pixels8_l2_shift5_ ## MMX(dst+8, src16+8, src8+8, dstStride, src8Stride, h);\ |
}\ |
#if ARCH_X86_64 |
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ |
void ff_avg_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride); |
void ff_put_h264_qpel16_h_lowpass_l2_ssse3(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride); |
#else // ARCH_X86_64 |
#define QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ |
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_l2_ ## MMX(uint8_t *dst, uint8_t *src, uint8_t *src2, int dstStride, int src2Stride){\ |
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ |
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ |
src += 8*dstStride;\ |
dst += 8*dstStride;\ |
src2 += 8*src2Stride;\ |
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst , src , src2 , dstStride, src2Stride);\ |
ff_ ## OPNAME ## h264_qpel8_h_lowpass_l2_ ## MMX(dst+8, src+8, src2+8, dstStride, src2Stride);\ |
} |
#endif // ARCH_X86_64 |
#define QPEL_H264_H_XMM(OPNAME, OP, MMX)\ |
QPEL_H264_H16_XMM(OPNAME, OP, MMX)\ |
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_h_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ |
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ |
src += 8*srcStride;\ |
dst += 8*dstStride;\ |
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst , src , dstStride, srcStride);\ |
ff_ ## OPNAME ## h264_qpel8_h_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride);\ |
}\ |
#define QPEL_H264_V_XMM(OPNAME, OP, MMX)\ |
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 8);\ |
}\ |
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_v_lowpass_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\ |
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst , src , dstStride, srcStride, 16);\ |
ff_ ## OPNAME ## h264_qpel8or16_v_lowpass_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\ |
} |
static av_always_inline void put_h264_qpel8or16_hv1_lowpass_sse2(int16_t *tmp, |
uint8_t *src, |
int tmpStride, |
int srcStride, |
int size) |
{ |
int w = (size+8)>>3; |
src -= 2*srcStride+2; |
while(w--){ |
ff_put_h264_qpel8or16_hv1_lowpass_op_sse2(src, tmp, srcStride, size); |
tmp += 8; |
src += 8; |
} |
} |
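/* The hv (center) cases are two-pass: the hv1 helper above filters
 * vertically into a padded 16-bit tmp buffer, and an hv2 pass then filters
 * that horizontally and scales back to pixels (see QPEL_H264_HV_XMM). */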
#define QPEL_H264_HV_XMM(OPNAME, OP, MMX)\ |
static av_always_inline void ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride, int size){\ |
put_h264_qpel8or16_hv1_lowpass_sse2(tmp, src, tmpStride, srcStride, size);\ |
ff_ ## OPNAME ## h264_qpel8or16_hv2_lowpass_ ## MMX(dst, tmp, dstStride, tmpStride, size);\ |
}\ |
static av_always_inline void ff_ ## OPNAME ## h264_qpel8_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ |
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 8);\ |
}\ |
static av_always_inline void ff_ ## OPNAME ## h264_qpel16_hv_lowpass_ ## MMX(uint8_t *dst, int16_t *tmp, uint8_t *src, int dstStride, int tmpStride, int srcStride){\ |
ff_ ## OPNAME ## h264_qpel8or16_hv_lowpass_ ## MMX(dst, tmp, src, dstStride, tmpStride, srcStride, 16);\ |
}\ |
#define ff_put_h264_qpel8_h_lowpass_l2_sse2 ff_put_h264_qpel8_h_lowpass_l2_mmxext |
#define ff_avg_h264_qpel8_h_lowpass_l2_sse2 ff_avg_h264_qpel8_h_lowpass_l2_mmxext |
#define ff_put_h264_qpel16_h_lowpass_l2_sse2 ff_put_h264_qpel16_h_lowpass_l2_mmxext |
#define ff_avg_h264_qpel16_h_lowpass_l2_sse2 ff_avg_h264_qpel16_h_lowpass_l2_mmxext |
#define ff_put_h264_qpel8_v_lowpass_ssse3 ff_put_h264_qpel8_v_lowpass_sse2 |
#define ff_avg_h264_qpel8_v_lowpass_ssse3 ff_avg_h264_qpel8_v_lowpass_sse2 |
#define ff_put_h264_qpel16_v_lowpass_ssse3 ff_put_h264_qpel16_v_lowpass_sse2 |
#define ff_avg_h264_qpel16_v_lowpass_ssse3 ff_avg_h264_qpel16_v_lowpass_sse2 |
#define ff_put_h264_qpel8or16_hv2_lowpass_sse2 ff_put_h264_qpel8or16_hv2_lowpass_mmxext |
#define ff_avg_h264_qpel8or16_hv2_lowpass_sse2 ff_avg_h264_qpel8or16_hv2_lowpass_mmxext |
#define H264_MC(OPNAME, SIZE, MMX, ALIGN) \ |
H264_MC_C(OPNAME, SIZE, MMX, ALIGN)\ |
H264_MC_V(OPNAME, SIZE, MMX, ALIGN)\ |
H264_MC_H(OPNAME, SIZE, MMX, ALIGN)\ |
H264_MC_HV(OPNAME, SIZE, MMX, ALIGN)\ |
static void put_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, |
ptrdiff_t stride) |
{ |
ff_put_pixels16_sse2(dst, src, stride, 16); |
} |
static void avg_h264_qpel16_mc00_sse2 (uint8_t *dst, uint8_t *src, |
ptrdiff_t stride) |
{ |
ff_avg_pixels16_sse2(dst, src, stride, 16); |
} |
#define put_h264_qpel8_mc00_sse2 put_h264_qpel8_mc00_mmxext |
#define avg_h264_qpel8_mc00_sse2 avg_h264_qpel8_mc00_mmxext |
#define H264_MC_C(OPNAME, SIZE, MMX, ALIGN) \ |
static void OPNAME ## h264_qpel ## SIZE ## _mc00_ ## MMX (uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
ff_ ## OPNAME ## pixels ## SIZE ## _ ## MMX(dst, src, stride, SIZE);\ |
}\ |
#define H264_MC_H(OPNAME, SIZE, MMX, ALIGN) \ |
static void OPNAME ## h264_qpel ## SIZE ## _mc10_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src, stride, stride);\ |
}\ |
\ |
static void OPNAME ## h264_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_ ## MMX(dst, src, stride, stride);\ |
}\ |
\ |
static void OPNAME ## h264_qpel ## SIZE ## _mc30_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, src+1, stride, stride);\ |
}\ |
#define H264_MC_V(OPNAME, SIZE, MMX, ALIGN) \ |
static void OPNAME ## h264_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ |
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ |
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src, temp, stride, stride, SIZE);\ |
}\ |
\ |
static void OPNAME ## h264_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
ff_ ## OPNAME ## h264_qpel ## SIZE ## _v_lowpass_ ## MMX(dst, src, stride, stride);\ |
}\ |
\ |
static void OPNAME ## h264_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ |
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ |
ff_ ## OPNAME ## pixels ## SIZE ## _l2_ ## MMX(dst, src+stride, temp, stride, stride, SIZE);\ |
}\ |
#define H264_MC_HV(OPNAME, SIZE, MMX, ALIGN) \ |
static void OPNAME ## h264_qpel ## SIZE ## _mc11_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ |
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ |
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ |
}\ |
\ |
static void OPNAME ## h264_qpel ## SIZE ## _mc31_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ |
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ |
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, temp, stride, SIZE);\ |
}\ |
\ |
static void OPNAME ## h264_qpel ## SIZE ## _mc13_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ |
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src, SIZE, stride);\ |
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ |
}\ |
\ |
static void OPNAME ## h264_qpel ## SIZE ## _mc33_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*SIZE];\ |
ff_put_h264_qpel ## SIZE ## _v_lowpass_ ## MMX(temp, src+1, SIZE, stride);\ |
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, temp, stride, SIZE);\ |
}\ |
\ |
static void OPNAME ## h264_qpel ## SIZE ## _mc22_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
DECLARE_ALIGNED(ALIGN, uint16_t, temp)[SIZE*(SIZE<8?12:24)];\ |
ff_ ## OPNAME ## h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(dst, temp, src, stride, SIZE, stride);\ |
}\ |
\ |
static void OPNAME ## h264_qpel ## SIZE ## _mc21_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ |
uint8_t * const halfHV= temp;\ |
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ |
av_assert2(((int)temp & 7) == 0);\ |
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ |
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src, halfHV, stride, SIZE);\ |
}\ |
\ |
static void OPNAME ## h264_qpel ## SIZE ## _mc23_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ |
uint8_t * const halfHV= temp;\ |
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ |
av_assert2(((int)temp & 7) == 0);\ |
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ |
ff_ ## OPNAME ## h264_qpel ## SIZE ## _h_lowpass_l2_ ## MMX(dst, src+stride, halfHV, stride, SIZE);\ |
}\ |
\ |
static void OPNAME ## h264_qpel ## SIZE ## _mc12_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ |
uint8_t * const halfHV= temp;\ |
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ |
av_assert2(((int)temp & 7) == 0);\ |
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ |
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+2, halfHV, stride, SIZE, SIZE);\ |
}\ |
\ |
static void OPNAME ## h264_qpel ## SIZE ## _mc32_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\ |
{\ |
DECLARE_ALIGNED(ALIGN, uint8_t, temp)[SIZE*(SIZE<8?12:24)*2 + SIZE*SIZE];\ |
uint8_t * const halfHV= temp;\ |
int16_t * const halfV= (int16_t*)(temp + SIZE*SIZE);\ |
av_assert2(((int)temp & 7) == 0);\ |
ff_put_h264_qpel ## SIZE ## _hv_lowpass_ ## MMX(halfHV, halfV, src, SIZE, SIZE, stride);\ |
ff_ ## OPNAME ## pixels ## SIZE ## _l2_shift5_mmxext(dst, halfV+3, halfHV, stride, SIZE, SIZE);\ |
}\ |
#define H264_MC_4816(MMX)\ |
H264_MC(put_, 4, MMX, 8)\ |
H264_MC(put_, 8, MMX, 8)\ |
H264_MC(put_, 16,MMX, 8)\ |
H264_MC(avg_, 4, MMX, 8)\ |
H264_MC(avg_, 8, MMX, 8)\ |
H264_MC(avg_, 16,MMX, 8)\ |
#define H264_MC_816(QPEL, XMM)\ |
QPEL(put_, 8, XMM, 16)\ |
QPEL(put_, 16,XMM, 16)\ |
QPEL(avg_, 8, XMM, 16)\ |
QPEL(avg_, 16,XMM, 16)\ |
QPEL_H264(put_, PUT_OP, mmxext) |
QPEL_H264(avg_, AVG_MMXEXT_OP, mmxext) |
QPEL_H264_V_XMM(put_, PUT_OP, sse2) |
QPEL_H264_V_XMM(avg_,AVG_MMXEXT_OP, sse2) |
QPEL_H264_HV_XMM(put_, PUT_OP, sse2) |
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, sse2) |
QPEL_H264_H_XMM(put_, PUT_OP, ssse3) |
QPEL_H264_H_XMM(avg_,AVG_MMXEXT_OP, ssse3) |
QPEL_H264_HV_XMM(put_, PUT_OP, ssse3) |
QPEL_H264_HV_XMM(avg_,AVG_MMXEXT_OP, ssse3) |
H264_MC_4816(mmxext) |
H264_MC_816(H264_MC_V, sse2) |
H264_MC_816(H264_MC_HV, sse2) |
H264_MC_816(H264_MC_H, ssse3) |
H264_MC_816(H264_MC_HV, ssse3) |
// 10-bit
#define LUMA_MC_OP(OP, NUM, DEPTH, TYPE, OPT) \ |
void ff_ ## OP ## _h264_qpel ## NUM ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT \ |
(uint8_t *dst, uint8_t *src, ptrdiff_t stride); |
#define LUMA_MC_ALL(DEPTH, TYPE, OPT) \ |
LUMA_MC_OP(put, 4, DEPTH, TYPE, OPT) \ |
LUMA_MC_OP(avg, 4, DEPTH, TYPE, OPT) \ |
LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \ |
LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \ |
LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \ |
LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT) |
#define LUMA_MC_816(DEPTH, TYPE, OPT) \ |
LUMA_MC_OP(put, 8, DEPTH, TYPE, OPT) \ |
LUMA_MC_OP(avg, 8, DEPTH, TYPE, OPT) \ |
LUMA_MC_OP(put, 16, DEPTH, TYPE, OPT) \ |
LUMA_MC_OP(avg, 16, DEPTH, TYPE, OPT) |
LUMA_MC_ALL(10, mc00, mmxext) |
LUMA_MC_ALL(10, mc10, mmxext) |
LUMA_MC_ALL(10, mc20, mmxext) |
LUMA_MC_ALL(10, mc30, mmxext) |
LUMA_MC_ALL(10, mc01, mmxext) |
LUMA_MC_ALL(10, mc11, mmxext) |
LUMA_MC_ALL(10, mc21, mmxext) |
LUMA_MC_ALL(10, mc31, mmxext) |
LUMA_MC_ALL(10, mc02, mmxext) |
LUMA_MC_ALL(10, mc12, mmxext) |
LUMA_MC_ALL(10, mc22, mmxext) |
LUMA_MC_ALL(10, mc32, mmxext) |
LUMA_MC_ALL(10, mc03, mmxext) |
LUMA_MC_ALL(10, mc13, mmxext) |
LUMA_MC_ALL(10, mc23, mmxext) |
LUMA_MC_ALL(10, mc33, mmxext) |
LUMA_MC_816(10, mc00, sse2) |
LUMA_MC_816(10, mc10, sse2) |
LUMA_MC_816(10, mc10, sse2_cache64) |
LUMA_MC_816(10, mc10, ssse3_cache64) |
LUMA_MC_816(10, mc20, sse2) |
LUMA_MC_816(10, mc20, sse2_cache64) |
LUMA_MC_816(10, mc20, ssse3_cache64) |
LUMA_MC_816(10, mc30, sse2) |
LUMA_MC_816(10, mc30, sse2_cache64) |
LUMA_MC_816(10, mc30, ssse3_cache64) |
LUMA_MC_816(10, mc01, sse2) |
LUMA_MC_816(10, mc11, sse2) |
LUMA_MC_816(10, mc21, sse2) |
LUMA_MC_816(10, mc31, sse2) |
LUMA_MC_816(10, mc02, sse2) |
LUMA_MC_816(10, mc12, sse2) |
LUMA_MC_816(10, mc22, sse2) |
LUMA_MC_816(10, mc32, sse2) |
LUMA_MC_816(10, mc03, sse2) |
LUMA_MC_816(10, mc13, sse2) |
LUMA_MC_816(10, mc23, sse2) |
LUMA_MC_816(10, mc33, sse2) |
#define QPEL16_OPMC(OP, MC, MMX)\ |
void ff_ ## OP ## _h264_qpel16_ ## MC ## _10_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride){\ |
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\ |
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\ |
src += 8*stride;\ |
dst += 8*stride;\ |
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst , src , stride);\ |
ff_ ## OP ## _h264_qpel8_ ## MC ## _10_ ## MMX(dst+16, src+16, stride);\ |
} |
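/* Tiles a 16x16 op from four 8x8 calls; the +16 byte offsets step over
 * 8 pixels at 2 bytes each (10-bit). */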
#define QPEL16_OP(MC, MMX)\ |
QPEL16_OPMC(put, MC, MMX)\ |
QPEL16_OPMC(avg, MC, MMX) |
#define QPEL16(MMX)\ |
QPEL16_OP(mc00, MMX)\ |
QPEL16_OP(mc01, MMX)\ |
QPEL16_OP(mc02, MMX)\ |
QPEL16_OP(mc03, MMX)\ |
QPEL16_OP(mc10, MMX)\ |
QPEL16_OP(mc11, MMX)\ |
QPEL16_OP(mc12, MMX)\ |
QPEL16_OP(mc13, MMX)\ |
QPEL16_OP(mc20, MMX)\ |
QPEL16_OP(mc21, MMX)\ |
QPEL16_OP(mc22, MMX)\ |
QPEL16_OP(mc23, MMX)\ |
QPEL16_OP(mc30, MMX)\ |
QPEL16_OP(mc31, MMX)\ |
QPEL16_OP(mc32, MMX)\ |
QPEL16_OP(mc33, MMX) |
#if ARCH_X86_32 && HAVE_YASM && CONFIG_H264QPEL // ARCH_X86_64 implies SSE2+ |
QPEL16(mmxext) |
#endif |
#endif /* HAVE_YASM */ |
#define SET_QPEL_FUNCS(PFX, IDX, SIZE, CPU, PREFIX) \ |
do { \ |
c->PFX ## _pixels_tab[IDX][ 0] = PREFIX ## PFX ## SIZE ## _mc00_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][ 1] = PREFIX ## PFX ## SIZE ## _mc10_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][ 2] = PREFIX ## PFX ## SIZE ## _mc20_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][ 3] = PREFIX ## PFX ## SIZE ## _mc30_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][ 4] = PREFIX ## PFX ## SIZE ## _mc01_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][ 5] = PREFIX ## PFX ## SIZE ## _mc11_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][ 6] = PREFIX ## PFX ## SIZE ## _mc21_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][ 7] = PREFIX ## PFX ## SIZE ## _mc31_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][ 8] = PREFIX ## PFX ## SIZE ## _mc02_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][ 9] = PREFIX ## PFX ## SIZE ## _mc12_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][10] = PREFIX ## PFX ## SIZE ## _mc22_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][11] = PREFIX ## PFX ## SIZE ## _mc32_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][12] = PREFIX ## PFX ## SIZE ## _mc03_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][13] = PREFIX ## PFX ## SIZE ## _mc13_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][14] = PREFIX ## PFX ## SIZE ## _mc23_ ## CPU; \ |
c->PFX ## _pixels_tab[IDX][15] = PREFIX ## PFX ## SIZE ## _mc33_ ## CPU; \ |
} while (0) |
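/* Table layout: entry x + 4*y holds the mc<x><y> function, where x and y
 * are the horizontal and vertical quarter-pel phases (cf. H264_QPEL_FUNCS
 * below, which uses the same x + y * 4 indexing). */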
#define H264_QPEL_FUNCS(x, y, CPU) \ |
do { \ |
c->put_h264_qpel_pixels_tab[0][x + y * 4] = put_h264_qpel16_mc ## x ## y ## _ ## CPU; \ |
c->put_h264_qpel_pixels_tab[1][x + y * 4] = put_h264_qpel8_mc ## x ## y ## _ ## CPU; \ |
c->avg_h264_qpel_pixels_tab[0][x + y * 4] = avg_h264_qpel16_mc ## x ## y ## _ ## CPU; \ |
c->avg_h264_qpel_pixels_tab[1][x + y * 4] = avg_h264_qpel8_mc ## x ## y ## _ ## CPU; \ |
} while (0) |
#define H264_QPEL_FUNCS_10(x, y, CPU) \ |
do { \ |
c->put_h264_qpel_pixels_tab[0][x + y * 4] = ff_put_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \ |
c->put_h264_qpel_pixels_tab[1][x + y * 4] = ff_put_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \ |
c->avg_h264_qpel_pixels_tab[0][x + y * 4] = ff_avg_h264_qpel16_mc ## x ## y ## _10_ ## CPU; \ |
c->avg_h264_qpel_pixels_tab[1][x + y * 4] = ff_avg_h264_qpel8_mc ## x ## y ## _10_ ## CPU; \ |
} while (0) |
av_cold void ff_h264qpel_init_x86(H264QpelContext *c, int bit_depth) |
{ |
#if HAVE_YASM |
int high_bit_depth = bit_depth > 8; |
int cpu_flags = av_get_cpu_flags(); |
if (EXTERNAL_MMXEXT(cpu_flags)) { |
if (!high_bit_depth) { |
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, mmxext, ); |
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, mmxext, ); |
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, mmxext, ); |
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, mmxext, ); |
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, mmxext, ); |
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, mmxext, ); |
} else if (bit_depth == 10) { |
#if ARCH_X86_32 |
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_mmxext, ff_); |
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_mmxext, ff_); |
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_mmxext, ff_); |
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_mmxext, ff_); |
#endif |
SET_QPEL_FUNCS(put_h264_qpel, 2, 4, 10_mmxext, ff_); |
SET_QPEL_FUNCS(avg_h264_qpel, 2, 4, 10_mmxext, ff_); |
} |
} |
if (EXTERNAL_SSE2(cpu_flags)) { |
if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW) && !high_bit_depth) { |
// these functions are slower than mmx on AMD, but faster on Intel |
H264_QPEL_FUNCS(0, 0, sse2); |
} |
if (!high_bit_depth) { |
H264_QPEL_FUNCS(0, 1, sse2); |
H264_QPEL_FUNCS(0, 2, sse2); |
H264_QPEL_FUNCS(0, 3, sse2); |
H264_QPEL_FUNCS(1, 1, sse2); |
H264_QPEL_FUNCS(1, 2, sse2); |
H264_QPEL_FUNCS(1, 3, sse2); |
H264_QPEL_FUNCS(2, 1, sse2); |
H264_QPEL_FUNCS(2, 2, sse2); |
H264_QPEL_FUNCS(2, 3, sse2); |
H264_QPEL_FUNCS(3, 1, sse2); |
H264_QPEL_FUNCS(3, 2, sse2); |
H264_QPEL_FUNCS(3, 3, sse2); |
} |
if (bit_depth == 10) { |
SET_QPEL_FUNCS(put_h264_qpel, 0, 16, 10_sse2, ff_); |
SET_QPEL_FUNCS(put_h264_qpel, 1, 8, 10_sse2, ff_); |
SET_QPEL_FUNCS(avg_h264_qpel, 0, 16, 10_sse2, ff_); |
SET_QPEL_FUNCS(avg_h264_qpel, 1, 8, 10_sse2, ff_); |
H264_QPEL_FUNCS_10(1, 0, sse2_cache64); |
H264_QPEL_FUNCS_10(2, 0, sse2_cache64); |
H264_QPEL_FUNCS_10(3, 0, sse2_cache64); |
} |
} |
if (EXTERNAL_SSSE3(cpu_flags)) { |
if (!high_bit_depth) { |
H264_QPEL_FUNCS(1, 0, ssse3); |
H264_QPEL_FUNCS(1, 1, ssse3); |
H264_QPEL_FUNCS(1, 2, ssse3); |
H264_QPEL_FUNCS(1, 3, ssse3); |
H264_QPEL_FUNCS(2, 0, ssse3); |
H264_QPEL_FUNCS(2, 1, ssse3); |
H264_QPEL_FUNCS(2, 2, ssse3); |
H264_QPEL_FUNCS(2, 3, ssse3); |
H264_QPEL_FUNCS(3, 0, ssse3); |
H264_QPEL_FUNCS(3, 1, ssse3); |
H264_QPEL_FUNCS(3, 2, ssse3); |
H264_QPEL_FUNCS(3, 3, ssse3); |
} |
if (bit_depth == 10) { |
H264_QPEL_FUNCS_10(1, 0, ssse3_cache64); |
H264_QPEL_FUNCS_10(2, 0, ssse3_cache64); |
H264_QPEL_FUNCS_10(3, 0, ssse3_cache64); |
} |
} |
if (EXTERNAL_AVX(cpu_flags)) { |
/* AVX-capable CPUs have 64-byte cache lines and handle unaligned |
 * accesses that cross a cache-line boundary cheaply, so the plain |
 * SSE2 functions can be used here instead of the cache64 variants. |
 * TODO: Port X264_CPU_CACHELINE_32/64 detection from x264 to avoid |
 * having to treat SSE2 functions with such properties as AVX. */ |
if (bit_depth == 10) { |
H264_QPEL_FUNCS_10(1, 0, sse2); |
H264_QPEL_FUNCS_10(2, 0, sse2); |
H264_QPEL_FUNCS_10(3, 0, sse2); |
} |
} |
#endif |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_qpel_10bit.asm |
---|
0,0 → 1,884 |
;***************************************************************************** |
;* MMX/SSE2/AVX-optimized 10-bit H.264 qpel code |
;***************************************************************************** |
;* Copyright (C) 2011 x264 project |
;* |
;* Authors: Daniel Kang <daniel.d.kang@gmail.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA 32 |
cextern pw_16 |
cextern pw_1 |
cextern pb_0 |
pw_pixel_max: times 8 dw ((1 << 10)-1) |
pad10: times 8 dw 10*1023 |
pad20: times 8 dw 20*1023 |
pad30: times 8 dw 30*1023 |
depad: times 4 dd 32*20*1023 + 512 |
depad2: times 8 dw 20*1023 + 16*1022 + 16 |
unpad: times 8 dw 16*1022/32 ; needs to be mod 16 |
tap1: times 4 dw 1, -5 |
tap2: times 4 dw 20, 20 |
tap3: times 4 dw -5, 1 |
pd_0f: times 4 dd 0xffff |
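; Note on the constants above: the first 6-tap pass on 10-bit input yields |
; intermediates in roughly [-10*1023, 42*1023], which cannot be held in |
; int16. Subtracting pad20 = 20*1023 recenters them into signed 16-bit |
; range. The second pass sums the taps (1-5+20+20-5+1 = 32), so the |
; accumulated bias is 32*20*1023; depad = 32*20*1023 + 512 removes it |
; again and adds the rounding term for the final >> 10. |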
SECTION .text |
%macro AVG_MOV 2 |
pavgw %2, %1 |
mova %1, %2 |
%endmacro |
%macro ADDW 3 |
%if mmsize == 8 |
paddw %1, %2 |
%else |
movu %3, %2 |
paddw %1, %3 |
%endif |
%endmacro |
%macro FILT_H 4 |
paddw %1, %4 |
psubw %1, %2 ; a-b |
psraw %1, 2 ; (a-b)/4 |
psubw %1, %2 ; (a-b)/4-b |
paddw %1, %3 ; (a-b)/4-b+c |
psraw %1, 2 ; ((a-b)/4-b+c)/4 |
paddw %1, %3 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 |
%endmacro |
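; Why the shift sequence above equals (a-5*b+20*c)/16: |
;   (a-b)/4 - b + c = (a - 5b + 4c)/4 |
; and dividing by 4 once more, then adding c, gives |
;   (a - 5b + 4c)/16 + c = (a - 5b + 20c)/16 |
; i.e. the 6-tap (1,-5,20,20,-5,1) filter evaluated with two cheap psraw |
; shifts instead of a multiply, at the cost of different truncation. |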
%macro PRELOAD_V 0 |
lea r3, [r2*3] |
sub r1, r3 |
movu m0, [r1+r2] |
movu m1, [r1+r2*2] |
add r1, r3 |
movu m2, [r1] |
movu m3, [r1+r2] |
movu m4, [r1+r2*2] |
add r1, r3 |
%endmacro |
%macro FILT_V 8 |
movu %6, [r1] |
paddw %1, %6 |
mova %7, %2 |
paddw %7, %5 |
mova %8, %3 |
paddw %8, %4 |
FILT_H %1, %7, %8, [pw_16] |
psraw %1, 1 |
CLIPW %1, [pb_0], [pw_pixel_max] |
%endmacro |
%macro MC 1 |
%define OP_MOV mova |
INIT_MMX mmxext |
%1 put, 4 |
INIT_XMM sse2 |
%1 put, 8 |
%define OP_MOV AVG_MOV |
INIT_MMX mmxext |
%1 avg, 4 |
INIT_XMM sse2 |
%1 avg, 8 |
%endmacro |
%macro MCAxA_OP 7 |
%if ARCH_X86_32 |
cglobal %1_h264_qpel%4_%2_10, %5,%6,%7 |
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX |
mov r0, r0m |
mov r1, r1m |
add r0, %3*2 |
add r1, %3*2 |
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX |
mov r0, r0m |
mov r1, r1m |
lea r0, [r0+r2*%3] |
lea r1, [r1+r2*%3] |
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX |
mov r0, r0m |
mov r1, r1m |
lea r0, [r0+r2*%3+%3*2] |
lea r1, [r1+r2*%3+%3*2] |
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX |
RET |
%else ; ARCH_X86_64 |
cglobal %1_h264_qpel%4_%2_10, %5,%6 + 2,%7 |
mov r%6, r0 |
%assign p1 %6+1 |
mov r %+ p1, r1 |
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX |
lea r0, [r%6+%3*2] |
lea r1, [r %+ p1+%3*2] |
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX |
lea r0, [r%6+r2*%3] |
lea r1, [r %+ p1+r2*%3] |
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX |
lea r0, [r%6+r2*%3+%3*2] |
lea r1, [r %+ p1+r2*%3+%3*2] |
%if UNIX64 == 0 ; fall through to function |
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX |
RET |
%endif |
%endif |
%endmacro |
;put/avg, mc, 4/8, narg, nreg, nxmm |
%macro cglobal_mc 6 |
%assign i %3*2 |
%if ARCH_X86_32 || cpuflag(sse2) |
MCAxA_OP %1, %2, %3, i, %4,%5,%6 |
%endif |
cglobal %1_h264_qpel%3_%2_10, %4,%5,%6 |
%if UNIX64 == 0 ; no prologue or epilogue for UNIX64 |
call stub_%1_h264_qpel%3_%2_10 %+ SUFFIX |
RET |
%endif |
stub_%1_h264_qpel%3_%2_10 %+ SUFFIX: |
%endmacro |
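; cglobal_mc emits the NxN entry point and, through MCAxA_OP, a 2Nx2N |
; wrapper that calls the NxN stub once per quadrant: stepping %3*2 bytes |
; (N pixels at 2 bytes each) horizontally and r2*%3 (N rows) vertically. |
; On UNIX64 the final call is saved by falling through into the stub, |
; since no prologue or epilogue is needed there. |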
;----------------------------------------------------------------------------- |
; void h264_qpel_mc00(uint8_t *dst, uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro COPY4 0 |
movu m0, [r1 ] |
OP_MOV [r0 ], m0 |
movu m0, [r1+r2 ] |
OP_MOV [r0+r2 ], m0 |
movu m0, [r1+r2*2] |
OP_MOV [r0+r2*2], m0 |
movu m0, [r1+r3 ] |
OP_MOV [r0+r3 ], m0 |
%endmacro |
%macro MC00 1 |
INIT_MMX mmxext |
cglobal_mc %1, mc00, 4, 3,4,0 |
lea r3, [r2*3] |
COPY4 |
ret |
INIT_XMM sse2 |
cglobal %1_h264_qpel8_mc00_10, 3,4 |
lea r3, [r2*3] |
COPY4 |
lea r0, [r0+r2*4] |
lea r1, [r1+r2*4] |
COPY4 |
RET |
cglobal %1_h264_qpel16_mc00_10, 3,4 |
mov r3d, 8 |
.loop: |
movu m0, [r1 ] |
movu m1, [r1 +16] |
OP_MOV [r0 ], m0 |
OP_MOV [r0 +16], m1 |
movu m0, [r1+r2 ] |
movu m1, [r1+r2+16] |
OP_MOV [r0+r2 ], m0 |
OP_MOV [r0+r2+16], m1 |
lea r0, [r0+r2*2] |
lea r1, [r1+r2*2] |
dec r3d |
jg .loop |
REP_RET |
%endmacro |
%define OP_MOV mova |
MC00 put |
%define OP_MOV AVG_MOV |
MC00 avg |
;----------------------------------------------------------------------------- |
; void h264_qpel_mc20(uint8_t *dst, uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro MC_CACHE 1 |
%define OP_MOV mova |
INIT_MMX mmxext |
%1 put, 4 |
INIT_XMM sse2, cache64 |
%1 put, 8 |
INIT_XMM ssse3, cache64 |
%1 put, 8 |
INIT_XMM sse2 |
%1 put, 8 |
%define OP_MOV AVG_MOV |
INIT_MMX mmxext |
%1 avg, 4 |
INIT_XMM sse2, cache64 |
%1 avg, 8 |
INIT_XMM ssse3, cache64 |
%1 avg, 8 |
INIT_XMM sse2 |
%1 avg, 8 |
%endmacro |
%macro MC20 2 |
cglobal_mc %1, mc20, %2, 3,4,9 |
mov r3d, %2 |
mova m1, [pw_pixel_max] |
%if num_mmregs > 8 |
mova m8, [pw_16] |
%define p16 m8 |
%else |
%define p16 [pw_16] |
%endif |
.nextrow: |
%if %0 == 4 |
movu m2, [r1-4] |
movu m3, [r1-2] |
movu m4, [r1+0] |
ADDW m2, [r1+6], m5 |
ADDW m3, [r1+4], m5 |
ADDW m4, [r1+2], m5 |
%else ; movu is slow on these processors |
%if mmsize==16 |
movu m2, [r1-4] |
movu m0, [r1+6] |
mova m6, m0 |
psrldq m0, 6 |
paddw m6, m2 |
PALIGNR m3, m0, m2, 2, m5 |
PALIGNR m7, m0, m2, 8, m5 |
paddw m3, m7 |
PALIGNR m4, m0, m2, 4, m5 |
PALIGNR m7, m0, m2, 6, m5 |
paddw m4, m7 |
SWAP 2, 6 |
%else |
movu m2, [r1-4] |
movu m6, [r1+4] |
PALIGNR m3, m6, m2, 2, m5 |
paddw m3, m6 |
PALIGNR m4, m6, m2, 4, m5 |
PALIGNR m7, m6, m2, 6, m5 |
paddw m4, m7 |
paddw m2, [r1+6] |
%endif |
%endif |
FILT_H m2, m3, m4, p16 |
psraw m2, 1 |
pxor m0, m0 |
CLIPW m2, m0, m1 |
OP_MOV [r0], m2 |
add r0, r2 |
add r1, r2 |
dec r3d |
jg .nextrow |
rep ret |
%endmacro |
MC_CACHE MC20 |
;----------------------------------------------------------------------------- |
; void h264_qpel_mc30(uint8_t *dst, uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro MC30 2 |
cglobal_mc %1, mc30, %2, 3,5,9 |
lea r4, [r1+2] |
jmp stub_%1_h264_qpel%2_mc10_10 %+ SUFFIX %+ .body |
%endmacro |
MC_CACHE MC30 |
;----------------------------------------------------------------------------- |
; void h264_qpel_mc10(uint8_t *dst, uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro MC10 2 |
cglobal_mc %1, mc10, %2, 3,5,9 |
mov r4, r1 |
.body: |
mov r3d, %2 |
mova m1, [pw_pixel_max] |
%if num_mmregs > 8 |
mova m8, [pw_16] |
%define p16 m8 |
%else |
%define p16 [pw_16] |
%endif |
.nextrow: |
%if %0 == 4 |
movu m2, [r1-4] |
movu m3, [r1-2] |
movu m4, [r1+0] |
ADDW m2, [r1+6], m5 |
ADDW m3, [r1+4], m5 |
ADDW m4, [r1+2], m5 |
%else ; movu is slow on these processors |
%if mmsize==16 |
movu m2, [r1-4] |
movu m0, [r1+6] |
mova m6, m0 |
psrldq m0, 6 |
paddw m6, m2 |
PALIGNR m3, m0, m2, 2, m5 |
PALIGNR m7, m0, m2, 8, m5 |
paddw m3, m7 |
PALIGNR m4, m0, m2, 4, m5 |
PALIGNR m7, m0, m2, 6, m5 |
paddw m4, m7 |
SWAP 2, 6 |
%else |
movu m2, [r1-4] |
movu m6, [r1+4] |
PALIGNR m3, m6, m2, 2, m5 |
paddw m3, m6 |
PALIGNR m4, m6, m2, 4, m5 |
PALIGNR m7, m6, m2, 6, m5 |
paddw m4, m7 |
paddw m2, [r1+6] |
%endif |
%endif |
FILT_H m2, m3, m4, p16 |
psraw m2, 1 |
pxor m0, m0 |
CLIPW m2, m0, m1 |
movu m3, [r4] |
pavgw m2, m3 |
OP_MOV [r0], m2 |
add r0, r2 |
add r1, r2 |
add r4, r2 |
dec r3d |
jg .nextrow |
rep ret |
%endmacro |
MC_CACHE MC10 |
;----------------------------------------------------------------------------- |
; void h264_qpel_mc02(uint8_t *dst, uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro V_FILT 10 |
v_filt%9_%10_10: |
add r4, r2 |
.no_addr4: |
FILT_V m0, m1, m2, m3, m4, m5, m6, m7 |
add r1, r2 |
add r0, r2 |
ret |
%endmacro |
INIT_MMX mmxext |
RESET_MM_PERMUTATION |
%assign i 0 |
%rep 4 |
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 4, i |
SWAP 0,1,2,3,4,5 |
%assign i i+1 |
%endrep |
INIT_XMM sse2 |
RESET_MM_PERMUTATION |
%assign i 0 |
%rep 6 |
V_FILT m0, m1, m2, m3, m4, m5, m6, m7, 8, i |
SWAP 0,1,2,3,4,5 |
%assign i i+1 |
%endrep |
%macro MC02 2 |
cglobal_mc %1, mc02, %2, 3,4,8 |
PRELOAD_V |
sub r0, r2 |
%assign j 0 |
%rep %2 |
%assign i (j % 6) |
call v_filt%2_ %+ i %+ _10.no_addr4 |
OP_MOV [r0], m0 |
SWAP 0,1,2,3,4,5 |
%assign j j+1 |
%endrep |
ret |
%endmacro |
MC MC02 |
;----------------------------------------------------------------------------- |
; void h264_qpel_mc01(uint8_t *dst, uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro MC01 2 |
cglobal_mc %1, mc01, %2, 3,5,8 |
mov r4, r1 |
.body: |
PRELOAD_V |
sub r4, r2 |
sub r0, r2 |
%assign j 0 |
%rep %2 |
%assign i (j % 6) |
call v_filt%2_ %+ i %+ _10 |
movu m7, [r4] |
pavgw m0, m7 |
OP_MOV [r0], m0 |
SWAP 0,1,2,3,4,5 |
%assign j j+1 |
%endrep |
ret |
%endmacro |
MC MC01 |
;----------------------------------------------------------------------------- |
; void h264_qpel_mc03(uint8_t *dst, uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro MC03 2 |
cglobal_mc %1, mc03, %2, 3,5,8 |
lea r4, [r1+r2] |
jmp stub_%1_h264_qpel%2_mc01_10 %+ SUFFIX %+ .body |
%endmacro |
MC MC03 |
;----------------------------------------------------------------------------- |
; void h264_qpel_mc11(uint8_t *dst, uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro H_FILT_AVG 2-3 |
h_filt%1_%2_10: |
;FILT_H with fewer registers and averaged with the FILT_V result |
;m6,m7 are tmp registers, m0 is the FILT_V result, the rest are kept for the next iteration |
;unfortunately I need three registers, so m5 will have to be re-read from memory |
movu m5, [r4-4] |
ADDW m5, [r4+6], m7 |
movu m6, [r4-2] |
ADDW m6, [r4+4], m7 |
paddw m5, [pw_16] |
psubw m5, m6 ; a-b |
psraw m5, 2 ; (a-b)/4 |
psubw m5, m6 ; (a-b)/4-b |
movu m6, [r4+0] |
ADDW m6, [r4+2], m7 |
paddw m5, m6 ; (a-b)/4-b+c |
psraw m5, 2 ; ((a-b)/4-b+c)/4 |
paddw m5, m6 ; ((a-b)/4-b+c)/4+c = (a-5*b+20*c)/16 |
psraw m5, 1 |
CLIPW m5, [pb_0], [pw_pixel_max] |
;avg FILT_V, FILT_H |
pavgw m0, m5 |
%if %0!=4 |
movu m5, [r1+r5] |
%endif |
ret |
%endmacro |
INIT_MMX mmxext |
RESET_MM_PERMUTATION |
%assign i 0 |
%rep 3 |
H_FILT_AVG 4, i |
SWAP 0,1,2,3,4,5 |
%assign i i+1 |
%endrep |
H_FILT_AVG 4, i, 0 |
INIT_XMM sse2 |
RESET_MM_PERMUTATION |
%assign i 0 |
%rep 6 |
%if i==1 |
H_FILT_AVG 8, i, 0 |
%else |
H_FILT_AVG 8, i |
%endif |
SWAP 0,1,2,3,4,5 |
%assign i i+1 |
%endrep |
%macro MC11 2 |
; this REALLY needs x86_64 |
cglobal_mc %1, mc11, %2, 3,6,8 |
mov r4, r1 |
.body: |
PRELOAD_V |
sub r0, r2 |
sub r4, r2 |
mov r5, r2 |
neg r5 |
%assign j 0 |
%rep %2 |
%assign i (j % 6) |
call v_filt%2_ %+ i %+ _10 |
call h_filt%2_ %+ i %+ _10 |
%if %2==8 && i==1 |
movu m5, [r1+r5] |
%endif |
OP_MOV [r0], m0 |
SWAP 0,1,2,3,4,5 |
%assign j j+1 |
%endrep |
ret |
%endmacro |
MC MC11 |
;----------------------------------------------------------------------------- |
; void h264_qpel_mc31(uint8_t *dst, uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro MC31 2 |
cglobal_mc %1, mc31, %2, 3,6,8 |
mov r4, r1 |
add r1, 2 |
jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body |
%endmacro |
MC MC31 |
;----------------------------------------------------------------------------- |
; void h264_qpel_mc13(uint8_t *dst, uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro MC13 2 |
cglobal_mc %1, mc13, %2, 3,7,12 |
lea r4, [r1+r2] |
jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body |
%endmacro |
MC MC13 |
;----------------------------------------------------------------------------- |
; void h264_qpel_mc33(uint8_t *dst, uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro MC33 2 |
cglobal_mc %1, mc33, %2, 3,6,8 |
lea r4, [r1+r2] |
add r1, 2 |
jmp stub_%1_h264_qpel%2_mc11_10 %+ SUFFIX %+ .body |
%endmacro |
MC MC33 |
;----------------------------------------------------------------------------- |
; void h264_qpel_mc22(uint8_t *dst, uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro FILT_H2 3 |
psubw %1, %2 ; a-b |
psubw %2, %3 ; b-c |
psllw %2, 2 |
psubw %1, %2 ; a-5*b+4*c |
psllw %3, 4 |
paddw %1, %3 ; a-5*b+20*c |
%endmacro |
%macro FILT_VNRD 8 |
movu %6, [r1] |
paddw %1, %6 |
mova %7, %2 |
paddw %7, %5 |
mova %8, %3 |
paddw %8, %4 |
FILT_H2 %1, %7, %8 |
%endmacro |
%macro HV 1 |
%if mmsize==16 |
%define PAD 12 |
%define COUNT 2 |
%else |
%define PAD 4 |
%define COUNT 3 |
%endif |
put_hv%1_10: |
neg r2 ; This actually saves instructions |
lea r1, [r1+r2*2-mmsize+PAD] |
lea r4, [rsp+PAD+gprsize] |
mov r3d, COUNT |
.v_loop: |
movu m0, [r1] |
sub r1, r2 |
movu m1, [r1] |
sub r1, r2 |
movu m2, [r1] |
sub r1, r2 |
movu m3, [r1] |
sub r1, r2 |
movu m4, [r1] |
sub r1, r2 |
%assign i 0 |
%rep %1-1 |
FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7 |
psubw m0, [pad20] |
movu [r4+i*mmsize*3], m0 |
sub r1, r2 |
SWAP 0,1,2,3,4,5 |
%assign i i+1 |
%endrep |
FILT_VNRD m0, m1, m2, m3, m4, m5, m6, m7 |
psubw m0, [pad20] |
movu [r4+i*mmsize*3], m0 |
add r4, mmsize |
lea r1, [r1+r2*8+mmsize] |
%if %1==8 |
lea r1, [r1+r2*4] |
%endif |
dec r3d |
jg .v_loop |
neg r2 |
ret |
%endmacro |
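; put_hv%1_10 above is the vertical half of the 2D (mc22-family) filter. |
; It writes raw 16-bit intermediates (biased by pad20, see above) to a |
; stack buffer with a row stride of mmsize*3 bytes, one mmsize-wide |
; column strip per outer iteration; h%1_loop_op below then consumes that |
; buffer as the horizontal half. |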
INIT_MMX mmxext |
HV 4 |
INIT_XMM sse2 |
HV 8 |
%macro H_LOOP 1 |
%if num_mmregs > 8 |
%define s1 m8 |
%define s2 m9 |
%define s3 m10 |
%define d1 m11 |
%else |
%define s1 [tap1] |
%define s2 [tap2] |
%define s3 [tap3] |
%define d1 [depad] |
%endif |
h%1_loop_op: |
movu m1, [r1+mmsize-4] |
movu m2, [r1+mmsize-2] |
mova m3, [r1+mmsize+0] |
movu m4, [r1+mmsize+2] |
movu m5, [r1+mmsize+4] |
movu m6, [r1+mmsize+6] |
%if num_mmregs > 8 |
pmaddwd m1, s1 |
pmaddwd m2, s1 |
pmaddwd m3, s2 |
pmaddwd m4, s2 |
pmaddwd m5, s3 |
pmaddwd m6, s3 |
paddd m1, d1 |
paddd m2, d1 |
%else |
mova m0, s1 |
pmaddwd m1, m0 |
pmaddwd m2, m0 |
mova m0, s2 |
pmaddwd m3, m0 |
pmaddwd m4, m0 |
mova m0, s3 |
pmaddwd m5, m0 |
pmaddwd m6, m0 |
mova m0, d1 |
paddd m1, m0 |
paddd m2, m0 |
%endif |
paddd m3, m5 |
paddd m4, m6 |
paddd m1, m3 |
paddd m2, m4 |
psrad m1, 10 |
psrad m2, 10 |
pslld m2, 16 |
pand m1, [pd_0f] |
por m1, m2 |
%if num_mmregs <= 8 |
pxor m0, m0 |
%endif |
CLIPW m1, m0, m7 |
add r1, mmsize*3 |
ret |
%endmacro |
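; In the h%1_loop_op body above, tap1/tap2/tap3 hold the word pairs |
; (1,-5), (20,20) and (-5,1), so each pmaddwd contributes a pair of taps |
; to the 6-tap sum in 32-bit lanes. m1 ends up with the even output |
; pixels and m2 with the odd ones; the pslld/pand/por sequence |
; interleaves them back into packed words, which packssdw alone could |
; not do, before CLIPW. |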
INIT_MMX mmxext |
H_LOOP 4 |
INIT_XMM sse2 |
H_LOOP 8 |
%macro MC22 2 |
cglobal_mc %1, mc22, %2, 3,7,12 |
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel) |
mov r6, rsp ; backup stack pointer |
and rsp, ~(mmsize-1) ; align stack |
sub rsp, PAD |
call put_hv%2_10 |
mov r3d, %2 |
mova m7, [pw_pixel_max] |
%if num_mmregs > 8 |
pxor m0, m0 |
mova m8, [tap1] |
mova m9, [tap2] |
mova m10, [tap3] |
mova m11, [depad] |
%endif |
mov r1, rsp |
.h_loop: |
call h%2_loop_op |
OP_MOV [r0], m1 |
add r0, r2 |
dec r3d |
jg .h_loop |
mov rsp, r6 ; restore stack pointer |
ret |
%endmacro |
MC MC22 |
;----------------------------------------------------------------------------- |
; void h264_qpel_mc12(uint8_t *dst, uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro MC12 2 |
cglobal_mc %1, mc12, %2, 3,7,12 |
%define PAD mmsize*8*4*2 ; SIZE*16*4*sizeof(pixel) |
mov r6, rsp ; backup stack pointer |
and rsp, ~(mmsize-1) ; align stack |
sub rsp, PAD |
call put_hv%2_10 |
xor r4d, r4d |
.body: |
mov r3d, %2 |
pxor m0, m0 |
mova m7, [pw_pixel_max] |
%if num_mmregs > 8 |
mova m8, [tap1] |
mova m9, [tap2] |
mova m10, [tap3] |
mova m11, [depad] |
%endif |
mov r1, rsp |
.h_loop: |
call h%2_loop_op |
movu m3, [r1+r4-2*mmsize] ; movu needed for mc32, etc |
paddw m3, [depad2] |
psrlw m3, 5 |
psubw m3, [unpad] |
CLIPW m3, m0, m7 |
pavgw m1, m3 |
OP_MOV [r0], m1 |
add r0, r2 |
dec r3d |
jg .h_loop |
mov rsp, r6 ; restore stack pointer |
ret |
%endmacro |
MC MC12 |
;----------------------------------------------------------------------------- |
; void h264_qpel_mc32(uint8_t *dst, uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro MC32 2 |
cglobal_mc %1, mc32, %2, 3,7,12 |
%define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel) |
mov r6, rsp ; backup stack pointer |
and rsp, ~(mmsize-1) ; align stack |
sub rsp, PAD |
call put_hv%2_10 |
mov r4d, 2 ; sizeof(pixel) |
jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body |
%endmacro |
MC MC32 |
;----------------------------------------------------------------------------- |
; void h264_qpel_mc21(uint8_t *dst, uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro H_NRD 1 |
put_h%1_10: |
add rsp, gprsize |
mov r3d, %1 |
xor r4d, r4d |
mova m6, [pad20] |
.nextrow: |
movu m2, [r5-4] |
movu m3, [r5-2] |
movu m4, [r5+0] |
ADDW m2, [r5+6], m5 |
ADDW m3, [r5+4], m5 |
ADDW m4, [r5+2], m5 |
FILT_H2 m2, m3, m4 |
psubw m2, m6 |
mova [rsp+r4], m2 |
add r4d, mmsize*3 |
add r5, r2 |
dec r3d |
jg .nextrow |
sub rsp, gprsize |
ret |
%endmacro |
INIT_MMX mmxext |
H_NRD 4 |
INIT_XMM sse2 |
H_NRD 8 |
%macro MC21 2 |
cglobal_mc %1, mc21, %2, 3,7,12 |
mov r5, r1 |
.body: |
%define PAD mmsize*8*3*2 ; SIZE*16*3*sizeof(pixel) |
mov r6, rsp ; backup stack pointer |
and rsp, ~(mmsize-1) ; align stack |
sub rsp, PAD |
call put_h%2_10 |
sub rsp, PAD |
call put_hv%2_10 |
mov r4d, PAD-mmsize ; H buffer |
jmp stub_%1_h264_qpel%2_mc12_10 %+ SUFFIX %+ .body |
%endmacro |
MC MC21 |
;----------------------------------------------------------------------------- |
; void h264_qpel_mc23(uint8_t *dst, uint8_t *src, int stride) |
;----------------------------------------------------------------------------- |
%macro MC23 2 |
cglobal_mc %1, mc23, %2, 3,7,12 |
lea r5, [r1+r2] |
jmp stub_%1_h264_qpel%2_mc21_10 %+ SUFFIX %+ .body |
%endmacro |
MC MC23 |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_qpel_8bit.asm |
---|
0,0 → 1,862 |
;***************************************************************************** |
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code |
;***************************************************************************** |
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt |
;* Copyright (C) 2012 Daniel Kang |
;* |
;* Authors: Daniel Kang <daniel.d.kang@gmail.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA 32 |
cextern pw_16 |
cextern pw_5 |
cextern pb_0 |
SECTION .text |
%macro op_avgh 3 |
movh %3, %2 |
pavgb %1, %3 |
movh %2, %1 |
%endmacro |
%macro op_avg 2-3 |
pavgb %1, %2 |
mova %2, %1 |
%endmacro |
%macro op_puth 2-3 |
movh %2, %1 |
%endmacro |
%macro op_put 2-3 |
mova %2, %1 |
%endmacro |
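; The op_* macros abstract only the final store, so each loop body below |
; is instantiated twice: "put" overwrites dst, "avg" rounds-and-averages |
; with the existing dst via pavgb. The h variants store a half register |
; with movh; %3, where present, is a scratch register for the avg load. |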
%macro QPEL4_H_LOWPASS_OP 1 |
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride |
movsxdifnidn r2, r2d |
movsxdifnidn r3, r3d |
pxor m7, m7 |
mova m4, [pw_5] |
mova m5, [pw_16] |
mov r4d, 4 |
.loop: |
movh m1, [r1-1] |
movh m2, [r1+0] |
movh m3, [r1+1] |
movh m0, [r1+2] |
punpcklbw m1, m7 |
punpcklbw m2, m7 |
punpcklbw m3, m7 |
punpcklbw m0, m7 |
paddw m1, m0 |
paddw m2, m3 |
movh m0, [r1-2] |
movh m3, [r1+3] |
punpcklbw m0, m7 |
punpcklbw m3, m7 |
paddw m0, m3 |
psllw m2, 2 |
psubw m2, m1 |
pmullw m2, m4 |
paddw m0, m5 |
paddw m0, m2 |
psraw m0, 5 |
packuswb m0, m0 |
op_%1h m0, [r0], m6 |
add r0, r2 |
add r1, r3 |
dec r4d |
jg .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
QPEL4_H_LOWPASS_OP put |
QPEL4_H_LOWPASS_OP avg |
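; Per output pixel, the horizontal lowpass above computes |
;   clip((src[-2]+src[3] - 5*(src[-1]+src[2]) + 20*(src[0]+src[1]) + 16) >> 5) |
; Bytes are widened to words against m7 = 0 so the arithmetic cannot |
; wrap, and packuswb performs the final unsigned clip back to 8 bits. |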
%macro QPEL8_H_LOWPASS_OP 1 |
cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride |
movsxdifnidn r2, r2d |
movsxdifnidn r3, r3d |
mov r4d, 8 |
pxor m7, m7 |
mova m6, [pw_5] |
.loop: |
mova m0, [r1] |
mova m2, [r1+1] |
mova m1, m0 |
mova m3, m2 |
punpcklbw m0, m7 |
punpckhbw m1, m7 |
punpcklbw m2, m7 |
punpckhbw m3, m7 |
paddw m0, m2 |
paddw m1, m3 |
psllw m0, 2 |
psllw m1, 2 |
mova m2, [r1-1] |
mova m4, [r1+2] |
mova m3, m2 |
mova m5, m4 |
punpcklbw m2, m7 |
punpckhbw m3, m7 |
punpcklbw m4, m7 |
punpckhbw m5, m7 |
paddw m2, m4 |
paddw m5, m3 |
psubw m0, m2 |
psubw m1, m5 |
pmullw m0, m6 |
pmullw m1, m6 |
movd m2, [r1-2] |
movd m5, [r1+7] |
punpcklbw m2, m7 |
punpcklbw m5, m7 |
paddw m2, m3 |
paddw m4, m5 |
mova m5, [pw_16] |
paddw m2, m5 |
paddw m4, m5 |
paddw m0, m2 |
paddw m1, m4 |
psraw m0, 5 |
psraw m1, 5 |
packuswb m0, m1 |
op_%1 m0, [r0], m4 |
add r0, r2 |
add r1, r3 |
dec r4d |
jg .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
QPEL8_H_LOWPASS_OP put |
QPEL8_H_LOWPASS_OP avg |
%macro QPEL8_H_LOWPASS_OP_XMM 1 |
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride |
movsxdifnidn r2, r2d |
movsxdifnidn r3, r3d |
mov r4d, 8 |
pxor m7, m7 |
mova m6, [pw_5] |
.loop: |
movu m1, [r1-2] |
mova m0, m1 |
punpckhbw m1, m7 |
punpcklbw m0, m7 |
mova m2, m1 |
mova m3, m1 |
mova m4, m1 |
mova m5, m1 |
palignr m4, m0, 2 |
palignr m3, m0, 4 |
palignr m2, m0, 6 |
palignr m1, m0, 8 |
palignr m5, m0, 10 |
paddw m0, m5 |
paddw m2, m3 |
paddw m1, m4 |
psllw m2, 2 |
psubw m2, m1 |
paddw m0, [pw_16] |
pmullw m2, m6 |
paddw m2, m0 |
psraw m2, 5 |
packuswb m2, m2 |
op_%1h m2, [r0], m4 |
add r1, r3 |
add r0, r2 |
dec r4d |
jne .loop |
REP_RET |
%endmacro |
INIT_XMM ssse3 |
QPEL8_H_LOWPASS_OP_XMM put |
QPEL8_H_LOWPASS_OP_XMM avg |
%macro QPEL4_H_LOWPASS_L2_OP 1 |
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride |
movsxdifnidn r3, r3d |
movsxdifnidn r4, r4d |
pxor m7, m7 |
mova m4, [pw_5] |
mova m5, [pw_16] |
mov r5d, 4 |
.loop: |
movh m1, [r1-1] |
movh m2, [r1+0] |
movh m3, [r1+1] |
movh m0, [r1+2] |
punpcklbw m1, m7 |
punpcklbw m2, m7 |
punpcklbw m3, m7 |
punpcklbw m0, m7 |
paddw m1, m0 |
paddw m2, m3 |
movh m0, [r1-2] |
movh m3, [r1+3] |
punpcklbw m0, m7 |
punpcklbw m3, m7 |
paddw m0, m3 |
psllw m2, 2 |
psubw m2, m1 |
pmullw m2, m4 |
paddw m0, m5 |
paddw m0, m2 |
movh m3, [r2] |
psraw m0, 5 |
packuswb m0, m0 |
pavgb m0, m3 |
op_%1h m0, [r0], m6 |
add r0, r3 |
add r1, r3 |
add r2, r4 |
dec r5d |
jg .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
QPEL4_H_LOWPASS_L2_OP put |
QPEL4_H_LOWPASS_L2_OP avg |
%macro QPEL8_H_LOWPASS_L2_OP 1 |
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride |
movsxdifnidn r3, r3d |
movsxdifnidn r4, r4d |
mov r5d, 8 |
pxor m7, m7 |
mova m6, [pw_5] |
.loop: |
mova m0, [r1] |
mova m2, [r1+1] |
mova m1, m0 |
mova m3, m2 |
punpcklbw m0, m7 |
punpckhbw m1, m7 |
punpcklbw m2, m7 |
punpckhbw m3, m7 |
paddw m0, m2 |
paddw m1, m3 |
psllw m0, 2 |
psllw m1, 2 |
mova m2, [r1-1] |
mova m4, [r1+2] |
mova m3, m2 |
mova m5, m4 |
punpcklbw m2, m7 |
punpckhbw m3, m7 |
punpcklbw m4, m7 |
punpckhbw m5, m7 |
paddw m2, m4 |
paddw m5, m3 |
psubw m0, m2 |
psubw m1, m5 |
pmullw m0, m6 |
pmullw m1, m6 |
movd m2, [r1-2] |
movd m5, [r1+7] |
punpcklbw m2, m7 |
punpcklbw m5, m7 |
paddw m2, m3 |
paddw m4, m5 |
mova m5, [pw_16] |
paddw m2, m5 |
paddw m4, m5 |
paddw m0, m2 |
paddw m1, m4 |
psraw m0, 5 |
psraw m1, 5 |
mova m4, [r2] |
packuswb m0, m1 |
pavgb m0, m4 |
op_%1 m0, [r0], m4 |
add r0, r3 |
add r1, r3 |
add r2, r4 |
dec r5d |
jg .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
QPEL8_H_LOWPASS_L2_OP put |
QPEL8_H_LOWPASS_L2_OP avg |
%macro QPEL8_H_LOWPASS_L2_OP_XMM 1 |
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride |
movsxdifnidn r3, r3d |
movsxdifnidn r4, r4d |
mov r5d, 8 |
pxor m7, m7 |
mova m6, [pw_5] |
.loop: |
lddqu m1, [r1-2] |
mova m0, m1 |
punpckhbw m1, m7 |
punpcklbw m0, m7 |
mova m2, m1 |
mova m3, m1 |
mova m4, m1 |
mova m5, m1 |
palignr m4, m0, 2 |
palignr m3, m0, 4 |
palignr m2, m0, 6 |
palignr m1, m0, 8 |
palignr m5, m0, 10 |
paddw m0, m5 |
paddw m2, m3 |
paddw m1, m4 |
psllw m2, 2 |
movh m3, [r2] |
psubw m2, m1 |
paddw m0, [pw_16] |
pmullw m2, m6 |
paddw m2, m0 |
psraw m2, 5 |
packuswb m2, m2 |
pavgb m2, m3 |
op_%1h m2, [r0], m4 |
add r1, r3 |
add r0, r3 |
add r2, r4 |
dec r5d |
jg .loop |
REP_RET |
%endmacro |
INIT_XMM ssse3 |
QPEL8_H_LOWPASS_L2_OP_XMM put |
QPEL8_H_LOWPASS_L2_OP_XMM avg |
; All functions that call this are required to have function arguments of |
; dst, src, dstStride, srcStride |
%macro FILT_V 1 |
mova m6, m2 |
movh m5, [r1] |
paddw m6, m3 |
psllw m6, 2 |
psubw m6, m1 |
psubw m6, m4 |
punpcklbw m5, m7 |
pmullw m6, [pw_5] |
paddw m0, [pw_16] |
add r1, r3 |
paddw m0, m5 |
paddw m6, m0 |
psraw m6, 5 |
packuswb m6, m6 |
op_%1h m6, [r0], m0 ; 1 |
add r0, r2 |
SWAP 0, 1, 2, 3, 4, 5 |
%endmacro |
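; FILT_V keeps a sliding window of six source rows in m0-m5: each |
; invocation filters one output row, loads the next source row, and the |
; trailing SWAP renames the registers so the window slides down one row |
; without any data movement. |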
%macro QPEL4_V_LOWPASS_OP 1 |
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride |
movsxdifnidn r2, r2d |
movsxdifnidn r3, r3d |
sub r1, r3 |
sub r1, r3 |
pxor m7, m7 |
movh m0, [r1] |
movh m1, [r1+r3] |
lea r1, [r1+2*r3] |
movh m2, [r1] |
movh m3, [r1+r3] |
lea r1, [r1+2*r3] |
movh m4, [r1] |
add r1, r3 |
punpcklbw m0, m7 |
punpcklbw m1, m7 |
punpcklbw m2, m7 |
punpcklbw m3, m7 |
punpcklbw m4, m7 |
FILT_V %1 |
FILT_V %1 |
FILT_V %1 |
FILT_V %1 |
RET |
%endmacro |
INIT_MMX mmxext |
QPEL4_V_LOWPASS_OP put |
QPEL4_V_LOWPASS_OP avg |
%macro QPEL8OR16_V_LOWPASS_OP 1 |
%if cpuflag(sse2) |
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h |
movsxdifnidn r2, r2d |
movsxdifnidn r3, r3d |
sub r1, r3 |
sub r1, r3 |
%else |
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h |
movsxdifnidn r2, r2d |
movsxdifnidn r3, r3d |
%endif |
pxor m7, m7 |
movh m0, [r1] |
movh m1, [r1+r3] |
lea r1, [r1+2*r3] |
movh m2, [r1] |
movh m3, [r1+r3] |
lea r1, [r1+2*r3] |
movh m4, [r1] |
add r1, r3 |
punpcklbw m0, m7 |
punpcklbw m1, m7 |
punpcklbw m2, m7 |
punpcklbw m3, m7 |
punpcklbw m4, m7 |
FILT_V %1 |
FILT_V %1 |
FILT_V %1 |
FILT_V %1 |
FILT_V %1 |
FILT_V %1 |
FILT_V %1 |
FILT_V %1 |
cmp r4d, 16 |
jne .end |
FILT_V %1 |
FILT_V %1 |
FILT_V %1 |
FILT_V %1 |
FILT_V %1 |
FILT_V %1 |
FILT_V %1 |
FILT_V %1 |
.end: |
REP_RET |
%endmacro |
INIT_MMX mmxext |
QPEL8OR16_V_LOWPASS_OP put |
QPEL8OR16_V_LOWPASS_OP avg |
INIT_XMM sse2 |
QPEL8OR16_V_LOWPASS_OP put |
QPEL8OR16_V_LOWPASS_OP avg |
; All functions that use this are required to have args: |
; src, tmp, srcStride |
%macro FILT_HV 1 ; offset |
mova m6, m2 |
movh m5, [r0] |
paddw m6, m3 |
psllw m6, 2 |
paddw m0, [pw_16] |
psubw m6, m1 |
psubw m6, m4 |
punpcklbw m5, m7 |
pmullw m6, [pw_5] |
paddw m0, m5 |
add r0, r2 |
paddw m6, m0 |
mova [r1+%1], m6 |
SWAP 0, 1, 2, 3, 4, 5 |
%endmacro |
%macro QPEL4_HV1_LOWPASS_OP 1 |
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride |
movsxdifnidn r2, r2d |
pxor m7, m7 |
movh m0, [r0] |
movh m1, [r0+r2] |
lea r0, [r0+2*r2] |
movh m2, [r0] |
movh m3, [r0+r2] |
lea r0, [r0+2*r2] |
movh m4, [r0] |
add r0, r2 |
punpcklbw m0, m7 |
punpcklbw m1, m7 |
punpcklbw m2, m7 |
punpcklbw m3, m7 |
punpcklbw m4, m7 |
FILT_HV 0*24 |
FILT_HV 1*24 |
FILT_HV 2*24 |
FILT_HV 3*24 |
RET |
cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride |
movsxdifnidn r2, r2d |
mov r3d, 4 |
.loop: |
mova m0, [r0] |
paddw m0, [r0+10] |
mova m1, [r0+2] |
paddw m1, [r0+8] |
mova m2, [r0+4] |
paddw m2, [r0+6] |
psubw m0, m1 |
psraw m0, 2 |
psubw m0, m1 |
paddsw m0, m2 |
psraw m0, 2 |
paddw m0, m2 |
psraw m0, 6 |
packuswb m0, m0 |
op_%1h m0, [r1], m7 |
add r0, 24 |
add r1, r2 |
dec r3d |
jnz .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
QPEL4_HV1_LOWPASS_OP put |
QPEL4_HV1_LOWPASS_OP avg |
%macro QPEL8OR16_HV1_LOWPASS_OP 1 |
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size |
movsxdifnidn r2, r2d |
pxor m7, m7 |
movh m0, [r0] |
movh m1, [r0+r2] |
lea r0, [r0+2*r2] |
movh m2, [r0] |
movh m3, [r0+r2] |
lea r0, [r0+2*r2] |
movh m4, [r0] |
add r0, r2 |
punpcklbw m0, m7 |
punpcklbw m1, m7 |
punpcklbw m2, m7 |
punpcklbw m3, m7 |
punpcklbw m4, m7 |
FILT_HV 0*48 |
FILT_HV 1*48 |
FILT_HV 2*48 |
FILT_HV 3*48 |
FILT_HV 4*48 |
FILT_HV 5*48 |
FILT_HV 6*48 |
FILT_HV 7*48 |
cmp r3d, 16 |
jne .end |
FILT_HV 8*48 |
FILT_HV 9*48 |
FILT_HV 10*48 |
FILT_HV 11*48 |
FILT_HV 12*48 |
FILT_HV 13*48 |
FILT_HV 14*48 |
FILT_HV 15*48 |
.end: |
REP_RET |
%endmacro |
INIT_MMX mmxext |
QPEL8OR16_HV1_LOWPASS_OP put |
QPEL8OR16_HV1_LOWPASS_OP avg |
INIT_XMM sse2 |
QPEL8OR16_HV1_LOWPASS_OP put |
%macro QPEL8OR16_HV2_LOWPASS_OP 1 |
; unused is to match ssse3 and mmxext args |
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h |
movsxdifnidn r2, r2d |
.loop: |
mova m0, [r1] |
mova m3, [r1+8] |
mova m1, [r1+2] |
mova m4, [r1+10] |
paddw m0, m4 |
paddw m1, m3 |
paddw m3, [r1+18] |
paddw m4, [r1+16] |
mova m2, [r1+4] |
mova m5, [r1+12] |
paddw m2, [r1+6] |
paddw m5, [r1+14] |
psubw m0, m1 |
psubw m3, m4 |
psraw m0, 2 |
psraw m3, 2 |
psubw m0, m1 |
psubw m3, m4 |
paddsw m0, m2 |
paddsw m3, m5 |
psraw m0, 2 |
psraw m3, 2 |
paddw m0, m2 |
paddw m3, m5 |
psraw m0, 6 |
psraw m3, 6 |
packuswb m0, m3 |
op_%1 m0, [r0], m7 |
add r1, 48 |
add r0, r2 |
dec r4d |
jne .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
QPEL8OR16_HV2_LOWPASS_OP put |
QPEL8OR16_HV2_LOWPASS_OP avg |
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1 |
cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size |
movsxdifnidn r2, r2d |
movsxdifnidn r3, r3d |
cmp r4d, 16 |
je .op16 |
.loop8: |
mova m1, [r1+16] |
mova m0, [r1] |
mova m2, m1 |
mova m3, m1 |
mova m4, m1 |
mova m5, m1 |
palignr m5, m0, 10 |
palignr m4, m0, 8 |
palignr m3, m0, 6 |
palignr m2, m0, 4 |
palignr m1, m0, 2 |
paddw m0, m5 |
paddw m1, m4 |
paddw m2, m3 |
psubw m0, m1 |
psraw m0, 2 |
psubw m0, m1 |
paddw m0, m2 |
psraw m0, 2 |
paddw m0, m2 |
psraw m0, 6 |
packuswb m0, m0 |
op_%1h m0, [r0], m7 |
add r1, 48 |
add r0, r2 |
dec r4d |
jne .loop8 |
jmp .done |
.op16: |
mova m4, [r1+32] |
mova m5, [r1+16] |
mova m7, [r1] |
mova m3, m4 |
mova m2, m4 |
mova m1, m4 |
mova m0, m4 |
palignr m0, m5, 10 |
palignr m1, m5, 8 |
palignr m2, m5, 6 |
palignr m3, m5, 4 |
palignr m4, m5, 2 |
paddw m0, m5 |
paddw m1, m4 |
paddw m2, m3 |
mova m6, m5 |
mova m4, m5 |
mova m3, m5 |
palignr m4, m7, 8 |
palignr m6, m7, 2 |
palignr m3, m7, 10 |
paddw m4, m6 |
mova m6, m5 |
palignr m5, m7, 6 |
palignr m6, m7, 4 |
paddw m3, m7 |
paddw m5, m6 |
psubw m0, m1 |
psubw m3, m4 |
psraw m0, 2 |
psraw m3, 2 |
psubw m0, m1 |
psubw m3, m4 |
paddw m0, m2 |
paddw m3, m5 |
psraw m0, 2 |
psraw m3, 2 |
paddw m0, m2 |
paddw m3, m5 |
psraw m0, 6 |
psraw m3, 6 |
packuswb m3, m0 |
op_%1 m3, [r0], m7 |
add r1, 48 |
add r0, r2 |
dec r4d |
jne .op16 |
.done: |
REP_RET |
%endmacro |
INIT_XMM ssse3 |
QPEL8OR16_HV2_LOWPASS_OP_XMM put |
QPEL8OR16_HV2_LOWPASS_OP_XMM avg |
%macro PIXELS4_L2_SHIFT5 1 |
cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h |
movsxdifnidn r3, r3d |
movsxdifnidn r4, r4d |
mova m0, [r1] |
mova m1, [r1+24] |
psraw m0, 5 |
psraw m1, 5 |
packuswb m0, m0 |
packuswb m1, m1 |
pavgb m0, [r2] |
pavgb m1, [r2+r4] |
op_%1h m0, [r0], m4 |
op_%1h m1, [r0+r3], m5 |
lea r2, [r2+r4*2] |
lea r0, [r0+r3*2] |
mova m0, [r1+48] |
mova m1, [r1+72] |
psraw m0, 5 |
psraw m1, 5 |
packuswb m0, m0 |
packuswb m1, m1 |
pavgb m0, [r2] |
pavgb m1, [r2+r4] |
op_%1h m0, [r0], m4 |
op_%1h m1, [r0+r3], m5 |
RET |
%endmacro |
INIT_MMX mmxext |
PIXELS4_L2_SHIFT5 put |
PIXELS4_L2_SHIFT5 avg |
%macro PIXELS8_L2_SHIFT5 1 |
cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h |
movsxdifnidn r3, r3d |
movsxdifnidn r4, r4d |
.loop: |
mova m0, [r1] |
mova m1, [r1+8] |
mova m2, [r1+48] |
mova m3, [r1+48+8] |
psraw m0, 5 |
psraw m1, 5 |
psraw m2, 5 |
psraw m3, 5 |
packuswb m0, m1 |
packuswb m2, m3 |
pavgb m0, [r2] |
pavgb m2, [r2+r4] |
op_%1 m0, [r0], m4 |
op_%1 m2, [r0+r3], m5 |
lea r2, [r2+2*r4] |
add r1, 48*2 |
lea r0, [r0+2*r3] |
sub r5d, 2 |
jne .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PIXELS8_L2_SHIFT5 put |
PIXELS8_L2_SHIFT5 avg |
%if ARCH_X86_64 |
%macro QPEL16_H_LOWPASS_L2_OP 1 |
cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride |
movsxdifnidn r3, r3d |
movsxdifnidn r4, r4d |
mov r5d, 16 |
pxor m15, m15 |
mova m14, [pw_5] |
mova m13, [pw_16] |
.loop: |
lddqu m1, [r1+6] |
lddqu m7, [r1-2] |
mova m0, m1 |
punpckhbw m1, m15 |
punpcklbw m0, m15 |
punpcklbw m7, m15 |
mova m2, m1 |
mova m6, m0 |
mova m3, m1 |
mova m8, m0 |
mova m4, m1 |
mova m9, m0 |
mova m12, m0 |
mova m11, m1 |
palignr m11, m0, 10 |
palignr m12, m7, 10 |
palignr m4, m0, 2 |
palignr m9, m7, 2 |
palignr m3, m0, 4 |
palignr m8, m7, 4 |
palignr m2, m0, 6 |
palignr m6, m7, 6 |
paddw m11, m0 |
palignr m1, m0, 8 |
palignr m0, m7, 8 |
paddw m7, m12 |
paddw m2, m3 |
paddw m6, m8 |
paddw m1, m4 |
paddw m0, m9 |
psllw m2, 2 |
psllw m6, 2 |
psubw m2, m1 |
psubw m6, m0 |
paddw m11, m13 |
paddw m7, m13 |
pmullw m2, m14 |
pmullw m6, m14 |
lddqu m3, [r2] |
paddw m2, m11 |
paddw m6, m7 |
psraw m2, 5 |
psraw m6, 5 |
packuswb m6, m2 |
pavgb m6, m3 |
op_%1 m6, [r0], m11 |
add r1, r3 |
add r0, r3 |
add r2, r4 |
dec r5d |
jg .loop |
REP_RET |
%endmacro |
INIT_XMM ssse3 |
QPEL16_H_LOWPASS_L2_OP put |
QPEL16_H_LOWPASS_L2_OP avg |
%endif |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_weight.asm |
---|
0,0 → 1,317 |
;***************************************************************************** |
;* SSE2-optimized weighted prediction code |
;***************************************************************************** |
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt |
;* Copyright (C) 2010 Eli Friedman <eli.friedman@gmail.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION .text |
;----------------------------------------------------------------------------- |
; biweight pred: |
; |
; void h264_biweight_16_sse2(uint8_t *dst, uint8_t *src, int stride, |
; int height, int log2_denom, int weightd, |
; int weights, int offset); |
; and |
; void h264_weight_16_sse2(uint8_t *dst, int stride, int height, |
; int log2_denom, int weight, int offset); |
;----------------------------------------------------------------------------- |
%macro WEIGHT_SETUP 0 |
add r5, r5 |
inc r5 |
movd m3, r4d |
movd m5, r5d |
movd m6, r3d |
pslld m5, m6 |
psrld m5, 1 |
%if mmsize == 16 |
pshuflw m3, m3, 0 |
pshuflw m5, m5, 0 |
punpcklqdq m3, m3 |
punpcklqdq m5, m5 |
%else |
pshufw m3, m3, 0 |
pshufw m5, m5, 0 |
%endif |
pxor m7, m7 |
%endmacro |
%macro WEIGHT_OP 2 |
movh m0, [r0+%1] |
movh m1, [r0+%2] |
punpcklbw m0, m7 |
punpcklbw m1, m7 |
pmullw m0, m3 |
pmullw m1, m3 |
paddsw m0, m5 |
paddsw m1, m5 |
psraw m0, m6 |
psraw m1, m6 |
packuswb m0, m1 |
%endmacro |
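; WEIGHT_SETUP folds offset and rounding into one constant, |
; m5 = ((2*offset+1) << d) >> 1 with d = log2_denom, i.e. offset << d |
; plus the usual 2^(d-1) rounding term (plain offset when d == 0), so |
; WEIGHT_OP computes the spec's |
;   dst = clip(((src*weight + 2^(d-1)) >> d) + offset) |
; with a single pmullw/paddsw/psraw per 8 pixels. |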
INIT_MMX mmxext |
cglobal h264_weight_16, 6, 6, 0 |
WEIGHT_SETUP |
.nextrow: |
WEIGHT_OP 0, 4 |
mova [r0 ], m0 |
WEIGHT_OP 8, 12 |
mova [r0+8], m0 |
add r0, r1 |
dec r2d |
jnz .nextrow |
REP_RET |
%macro WEIGHT_FUNC_MM 2 |
cglobal h264_weight_%1, 6, 6, %2 |
WEIGHT_SETUP |
.nextrow: |
WEIGHT_OP 0, mmsize/2 |
mova [r0], m0 |
add r0, r1 |
dec r2d |
jnz .nextrow |
REP_RET |
%endmacro |
INIT_MMX mmxext |
WEIGHT_FUNC_MM 8, 0 |
INIT_XMM sse2 |
WEIGHT_FUNC_MM 16, 8 |
%macro WEIGHT_FUNC_HALF_MM 2 |
cglobal h264_weight_%1, 6, 6, %2 |
WEIGHT_SETUP |
sar r2d, 1 |
lea r3, [r1*2] |
.nextrow: |
WEIGHT_OP 0, r1 |
movh [r0], m0 |
%if mmsize == 16 |
movhps [r0+r1], m0 |
%else |
psrlq m0, 32 |
movh [r0+r1], m0 |
%endif |
add r0, r3 |
dec r2d |
jnz .nextrow |
REP_RET |
%endmacro |
INIT_MMX mmxext |
WEIGHT_FUNC_HALF_MM 4, 0 |
INIT_XMM sse2 |
WEIGHT_FUNC_HALF_MM 8, 8 |
%macro BIWEIGHT_SETUP 0 |
%if ARCH_X86_64 |
%define off_regd r7d |
%else |
%define off_regd r3d |
%endif |
mov off_regd, r7m |
add off_regd, 1 |
or off_regd, 1 |
add r4, 1 |
cmp r5, 128 |
jne .normal |
sar r5, 1 |
sar r6, 1 |
sar off_regd, 1 |
sub r4, 1 |
.normal: |
%if cpuflag(ssse3) |
movd m4, r5d |
movd m0, r6d |
%else |
movd m3, r5d |
movd m4, r6d |
%endif |
movd m5, off_regd |
movd m6, r4d |
pslld m5, m6 |
psrld m5, 1 |
%if cpuflag(ssse3) |
punpcklbw m4, m0 |
pshuflw m4, m4, 0 |
pshuflw m5, m5, 0 |
punpcklqdq m4, m4 |
punpcklqdq m5, m5 |
%else |
%if mmsize == 16 |
pshuflw m3, m3, 0 |
pshuflw m4, m4, 0 |
pshuflw m5, m5, 0 |
punpcklqdq m3, m3 |
punpcklqdq m4, m4 |
punpcklqdq m5, m5 |
%else |
pshufw m3, m3, 0 |
pshufw m4, m4, 0 |
pshufw m5, m5, 0 |
%endif |
pxor m7, m7 |
%endif |
%endmacro |
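; dst*weightd + src*weights is accumulated with paddsw in signed 16-bit |
; lanes, and a weight of 128 (possible with implicit weighted prediction) |
; could saturate them; the setup above therefore halves both weights, the |
; offset and the shift in that case, which leaves the result unchanged. |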
%macro BIWEIGHT_STEPA 3 |
movh m%1, [r0+%3] |
movh m%2, [r1+%3] |
punpcklbw m%1, m7 |
punpcklbw m%2, m7 |
pmullw m%1, m3 |
pmullw m%2, m4 |
paddsw m%1, m%2 |
%endmacro |
%macro BIWEIGHT_STEPB 0 |
paddsw m0, m5 |
paddsw m1, m5 |
psraw m0, m6 |
psraw m1, m6 |
packuswb m0, m1 |
%endmacro |
INIT_MMX mmxext |
cglobal h264_biweight_16, 7, 8, 0 |
BIWEIGHT_SETUP |
movifnidn r3d, r3m |
.nextrow: |
BIWEIGHT_STEPA 0, 1, 0 |
BIWEIGHT_STEPA 1, 2, 4 |
BIWEIGHT_STEPB |
mova [r0], m0 |
BIWEIGHT_STEPA 0, 1, 8 |
BIWEIGHT_STEPA 1, 2, 12 |
BIWEIGHT_STEPB |
mova [r0+8], m0 |
add r0, r2 |
add r1, r2 |
dec r3d |
jnz .nextrow |
REP_RET |
%macro BIWEIGHT_FUNC_MM 2 |
cglobal h264_biweight_%1, 7, 8, %2 |
BIWEIGHT_SETUP |
movifnidn r3d, r3m |
.nextrow: |
BIWEIGHT_STEPA 0, 1, 0 |
BIWEIGHT_STEPA 1, 2, mmsize/2 |
BIWEIGHT_STEPB |
mova [r0], m0 |
add r0, r2 |
add r1, r2 |
dec r3d |
jnz .nextrow |
REP_RET |
%endmacro |
INIT_MMX mmxext |
BIWEIGHT_FUNC_MM 8, 0 |
INIT_XMM sse2 |
BIWEIGHT_FUNC_MM 16, 8 |
%macro BIWEIGHT_FUNC_HALF_MM 2 |
cglobal h264_biweight_%1, 7, 8, %2 |
BIWEIGHT_SETUP |
movifnidn r3d, r3m |
sar r3, 1 |
lea r4, [r2*2] |
.nextrow: |
BIWEIGHT_STEPA 0, 1, 0 |
BIWEIGHT_STEPA 1, 2, r2 |
BIWEIGHT_STEPB |
movh [r0], m0 |
%if mmsize == 16 |
movhps [r0+r2], m0 |
%else |
psrlq m0, 32 |
movh [r0+r2], m0 |
%endif |
add r0, r4 |
add r1, r4 |
dec r3d |
jnz .nextrow |
REP_RET |
%endmacro |
INIT_MMX mmxext |
BIWEIGHT_FUNC_HALF_MM 4, 0 |
INIT_XMM sse2 |
BIWEIGHT_FUNC_HALF_MM 8, 8 |
%macro BIWEIGHT_SSSE3_OP 0 |
pmaddubsw m0, m4 |
pmaddubsw m2, m4 |
paddsw m0, m5 |
paddsw m2, m5 |
psraw m0, m6 |
psraw m2, m6 |
packuswb m0, m2 |
%endmacro |
INIT_XMM ssse3 |
cglobal h264_biweight_16, 7, 8, 8 |
BIWEIGHT_SETUP |
movifnidn r3d, r3m |
.nextrow: |
movh m0, [r0] |
movh m2, [r0+8] |
movh m3, [r1+8] |
punpcklbw m0, [r1] |
punpcklbw m2, m3 |
BIWEIGHT_SSSE3_OP |
mova [r0], m0 |
add r0, r2 |
add r1, r2 |
dec r3d |
jnz .nextrow |
REP_RET |
INIT_XMM ssse3 |
cglobal h264_biweight_8, 7, 8, 8 |
BIWEIGHT_SETUP |
movifnidn r3d, r3m |
sar r3, 1 |
lea r4, [r2*2] |
.nextrow: |
movh m0, [r0] |
movh m1, [r1] |
movh m2, [r0+r2] |
movh m3, [r1+r2] |
punpcklbw m0, m1 |
punpcklbw m2, m3 |
BIWEIGHT_SSSE3_OP |
movh [r0], m0 |
movhps [r0+r2], m0 |
add r0, r4 |
add r1, r4 |
dec r3d |
jnz .nextrow |
REP_RET |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264_weight_10bit.asm |
---|
0,0 → 1,282 |
;***************************************************************************** |
;* MMX/SSE2/AVX-optimized 10-bit H.264 weighted prediction code |
;***************************************************************************** |
;* Copyright (C) 2005-2011 x264 project |
;* |
;* Authors: Daniel Kang <daniel.d.kang@gmail.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA 32 |
pw_pixel_max: times 8 dw ((1 << 10)-1) |
sq_1: dq 1 |
dq 0 |
cextern pw_1 |
SECTION .text |
;----------------------------------------------------------------------------- |
; void h264_weight(uint8_t *dst, int stride, int height, int log2_denom, |
; int weight, int offset); |
;----------------------------------------------------------------------------- |
%macro WEIGHT_PROLOGUE 0 |
.prologue: |
PROLOGUE 0,6,8 |
movifnidn r0, r0mp |
movifnidn r1d, r1m |
movifnidn r2d, r2m |
movifnidn r4d, r4m |
movifnidn r5d, r5m |
%endmacro |
%macro WEIGHT_SETUP 0 |
mova m0, [pw_1] |
movd m2, r3m |
pslld m0, m2 ; 1<<log2_denom |
SPLATW m0, m0 |
shl r5, 19 ; *8, move to upper half of dword |
lea r5, [r5+r4*2+0x10000] |
movd m3, r5d ; weight<<1 | ((offset<<3)+1)<<16 |
pshufd m3, m3, 0 |
mova m4, [pw_pixel_max] |
paddw m2, [sq_1] ; log2_denom+1 |
%if notcpuflag(sse4) |
pxor m7, m7 |
%endif |
%endmacro |
%macro WEIGHT_OP 1-2 |
%if %0==1 |
mova m5, [r0+%1] |
punpckhwd m6, m5, m0 |
punpcklwd m5, m0 |
%else |
movq m5, [r0+%1] |
movq m6, [r0+%2] |
punpcklwd m5, m0 |
punpcklwd m6, m0 |
%endif |
pmaddwd m5, m3 |
pmaddwd m6, m3 |
psrad m5, m2 |
psrad m6, m2 |
%if cpuflag(sse4) |
packusdw m5, m6 |
pminsw m5, m4 |
%else |
packssdw m5, m6 |
CLIPW m5, m7, m4 |
%endif |
%endmacro |
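; m3 packs the word pair (2*weight, (offset<<3)+1) into each dword and m0 |
; holds 1 << log2_denom, so punpcklwd + pmaddwd produce |
;   src*2*weight + (((offset<<3)+1) << d),  d = log2_denom |
; and the psrad by d+1 reduces this (for non-negative weights; negative |
; ones borrow into the high word) to the 10-bit formula |
;   ((src*weight + 2^(d-1)) >> d) + 4*offset |
; with the offset pre-scaled by 1 << (bitdepth-8) = 4 as required. |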
%macro WEIGHT_FUNC_DBL 0 |
cglobal h264_weight_16_10 |
WEIGHT_PROLOGUE |
WEIGHT_SETUP |
.nextrow: |
WEIGHT_OP 0 |
mova [r0 ], m5 |
WEIGHT_OP 16 |
mova [r0+16], m5 |
add r0, r1 |
dec r2d |
jnz .nextrow |
REP_RET |
%endmacro |
INIT_XMM sse2 |
WEIGHT_FUNC_DBL |
INIT_XMM sse4 |
WEIGHT_FUNC_DBL |
%macro WEIGHT_FUNC_MM 0 |
cglobal h264_weight_8_10 |
WEIGHT_PROLOGUE |
WEIGHT_SETUP |
.nextrow: |
WEIGHT_OP 0 |
mova [r0], m5 |
add r0, r1 |
dec r2d |
jnz .nextrow |
REP_RET |
%endmacro |
INIT_XMM sse2 |
WEIGHT_FUNC_MM |
INIT_XMM sse4 |
WEIGHT_FUNC_MM |
%macro WEIGHT_FUNC_HALF_MM 0 |
cglobal h264_weight_4_10 |
WEIGHT_PROLOGUE |
sar r2d, 1 |
WEIGHT_SETUP |
lea r3, [r1*2] |
.nextrow: |
WEIGHT_OP 0, r1 |
movh [r0], m5 |
movhps [r0+r1], m5 |
add r0, r3 |
dec r2d |
jnz .nextrow |
REP_RET |
%endmacro |
INIT_XMM sse2 |
WEIGHT_FUNC_HALF_MM |
INIT_XMM sse4 |
WEIGHT_FUNC_HALF_MM |
;----------------------------------------------------------------------------- |
; void h264_biweight(uint8_t *dst, uint8_t *src, int stride, int height, |
; int log2_denom, int weightd, int weights, int offset); |
;----------------------------------------------------------------------------- |
%if ARCH_X86_32 |
DECLARE_REG_TMP 3 |
%else |
DECLARE_REG_TMP 7 |
%endif |
%macro BIWEIGHT_PROLOGUE 0 |
.prologue: |
PROLOGUE 0,8,8 |
movifnidn r0, r0mp |
movifnidn r1, r1mp |
movifnidn r2d, r2m |
movifnidn r5d, r5m |
movifnidn r6d, r6m |
movifnidn t0d, r7m |
%endmacro |
%macro BIWEIGHT_SETUP 0 |
lea t0, [t0*4+1] ; (offset<<2)+1 |
or t0, 1 |
shl r6, 16 |
or r5, r6 |
movd m4, r5d ; weightd | weights |
movd m5, t0d ; (offset+1)|1 |
movd m6, r4m ; log2_denom |
pslld m5, m6 ; (((offset<<2)+1)|1)<<log2_denom |
paddd m6, [sq_1] |
pshufd m4, m4, 0 |
pshufd m5, m5, 0 |
mova m3, [pw_pixel_max] |
movifnidn r3d, r3m |
%if notcpuflag(sse4) |
pxor m7, m7 |
%endif |
%endmacro |
%macro BIWEIGHT 1-2 |
%if %0==1 |
mova m0, [r0+%1] |
mova m1, [r1+%1] |
punpckhwd m2, m0, m1 |
punpcklwd m0, m1 |
%else |
movq m0, [r0+%1] |
movq m1, [r1+%1] |
punpcklwd m0, m1 |
movq m2, [r0+%2] |
movq m1, [r1+%2] |
punpcklwd m2, m1 |
%endif |
pmaddwd m0, m4 |
pmaddwd m2, m4 |
paddd m0, m5 |
paddd m2, m5 |
psrad m0, m6 |
psrad m2, m6 |
%if cpuflag(sse4) |
packusdw m0, m2 |
pminsw m0, m3 |
%else |
packssdw m0, m2 |
CLIPW m0, m7, m3 |
%endif |
%endmacro |
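; The sse2 path must packssdw and then CLIPW, since the dword results may |
; be negative; sse4 instead uses packusdw, which already clamps negatives |
; to zero, so a single pminsw against pw_pixel_max completes the clip. |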
%macro BIWEIGHT_FUNC_DBL 0 |
cglobal h264_biweight_16_10 |
BIWEIGHT_PROLOGUE |
BIWEIGHT_SETUP |
.nextrow: |
BIWEIGHT 0 |
mova [r0 ], m0 |
BIWEIGHT 16 |
mova [r0+16], m0 |
add r0, r2 |
add r1, r2 |
dec r3d |
jnz .nextrow |
REP_RET |
%endmacro |
INIT_XMM sse2 |
BIWEIGHT_FUNC_DBL |
INIT_XMM sse4 |
BIWEIGHT_FUNC_DBL |
%macro BIWEIGHT_FUNC 0 |
cglobal h264_biweight_8_10 |
BIWEIGHT_PROLOGUE |
BIWEIGHT_SETUP |
.nextrow: |
BIWEIGHT 0 |
mova [r0], m0 |
add r0, r2 |
add r1, r2 |
dec r3d |
jnz .nextrow |
REP_RET |
%endmacro |
INIT_XMM sse2 |
BIWEIGHT_FUNC |
INIT_XMM sse4 |
BIWEIGHT_FUNC |
%macro BIWEIGHT_FUNC_HALF 0 |
cglobal h264_biweight_4_10 |
BIWEIGHT_PROLOGUE |
BIWEIGHT_SETUP |
sar r3d, 1 |
lea r4, [r2*2] |
.nextrow: |
BIWEIGHT 0, r2 |
movh [r0 ], m0 |
movhps [r0+r2], m0 |
add r0, r4 |
add r1, r4 |
dec r3d |
jnz .nextrow |
REP_RET |
%endmacro |
INIT_XMM sse2 |
BIWEIGHT_FUNC_HALF |
INIT_XMM sse4 |
BIWEIGHT_FUNC_HALF |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264chroma_init.c |
---|
0,0 → 1,119 |
/* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include <stdint.h> |
#include "config.h" |
#include "libavutil/attributes.h" |
#include "libavutil/cpu.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/h264chroma.h" |
void ff_put_h264_chroma_mc8_rnd_mmx (uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_avg_h264_chroma_mc8_rnd_mmxext(uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_avg_h264_chroma_mc8_rnd_3dnow(uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_put_h264_chroma_mc4_mmx (uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_avg_h264_chroma_mc4_mmxext (uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_avg_h264_chroma_mc4_3dnow (uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_put_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_avg_h264_chroma_mc2_mmxext (uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_put_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_put_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_avg_h264_chroma_mc8_rnd_ssse3(uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_avg_h264_chroma_mc4_ssse3 (uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
#define CHROMA_MC(OP, NUM, DEPTH, OPT) \ |
void ff_ ## OP ## _h264_chroma_mc ## NUM ## _ ## DEPTH ## _ ## OPT \ |
(uint8_t *dst, uint8_t *src, \ |
int stride, int h, int x, int y); |
CHROMA_MC(put, 2, 10, mmxext) |
CHROMA_MC(avg, 2, 10, mmxext) |
CHROMA_MC(put, 4, 10, mmxext) |
CHROMA_MC(avg, 4, 10, mmxext) |
CHROMA_MC(put, 8, 10, sse2) |
CHROMA_MC(avg, 8, 10, sse2) |
CHROMA_MC(put, 8, 10, avx) |
CHROMA_MC(avg, 8, 10, avx) |
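/* The pixels_tab slots are indexed by chroma block width: [0] = mc8, |
 * [1] = mc4, [2] = mc2. The init below assigns them in increasing order |
 * of CPU capability, so the last matching flag wins. */ |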
av_cold void ff_h264chroma_init_x86(H264ChromaContext *c, int bit_depth) |
{ |
#if HAVE_YASM |
int high_bit_depth = bit_depth > 8; |
int cpu_flags = av_get_cpu_flags(); |
if (EXTERNAL_MMX(cpu_flags) && !high_bit_depth) { |
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_mmx; |
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_mmx; |
} |
if (EXTERNAL_AMD3DNOW(cpu_flags) && !high_bit_depth) { |
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_3dnow; |
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_3dnow; |
} |
if (EXTERNAL_MMXEXT(cpu_flags) && !high_bit_depth) { |
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_mmxext; |
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_mmxext; |
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_mmxext; |
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_mmxext; |
} |
if (EXTERNAL_MMXEXT(cpu_flags) && bit_depth > 8 && bit_depth <= 10) { |
c->put_h264_chroma_pixels_tab[2] = ff_put_h264_chroma_mc2_10_mmxext; |
c->avg_h264_chroma_pixels_tab[2] = ff_avg_h264_chroma_mc2_10_mmxext; |
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_10_mmxext; |
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_10_mmxext; |
} |
if (EXTERNAL_SSE2(cpu_flags) && bit_depth > 8 && bit_depth <= 10) { |
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_sse2; |
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_sse2; |
} |
if (EXTERNAL_SSSE3(cpu_flags) && !high_bit_depth) { |
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_rnd_ssse3; |
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_rnd_ssse3; |
c->put_h264_chroma_pixels_tab[1] = ff_put_h264_chroma_mc4_ssse3; |
c->avg_h264_chroma_pixels_tab[1] = ff_avg_h264_chroma_mc4_ssse3; |
} |
if (EXTERNAL_AVX(cpu_flags) && bit_depth > 8 && bit_depth <= 10) { |
// AVX implies !cache64. |
// TODO: Port cache(32|64) detection from x264. |
c->put_h264_chroma_pixels_tab[0] = ff_put_h264_chroma_mc8_10_avx; |
c->avg_h264_chroma_pixels_tab[0] = ff_avg_h264_chroma_mc8_10_avx; |
} |
#endif |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/h264dsp_init.c |
---|
0,0 → 1,371 |
/* |
* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/attributes.h" |
#include "libavutil/cpu.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/h264dsp.h" |
#include "dsputil_x86.h" |
/***********************************/ |
/* IDCT */ |
#define IDCT_ADD_FUNC(NUM, DEPTH, OPT) \ |
void ff_h264_idct ## NUM ## _add_ ## DEPTH ## _ ## OPT(uint8_t *dst, \ |
int16_t *block, \ |
int stride); |
IDCT_ADD_FUNC(, 8, mmx) |
IDCT_ADD_FUNC(, 10, sse2) |
IDCT_ADD_FUNC(_dc, 8, mmxext) |
IDCT_ADD_FUNC(_dc, 10, mmxext) |
IDCT_ADD_FUNC(8_dc, 8, mmxext) |
IDCT_ADD_FUNC(8_dc, 10, sse2) |
IDCT_ADD_FUNC(8, 8, mmx) |
IDCT_ADD_FUNC(8, 8, sse2) |
IDCT_ADD_FUNC(8, 10, sse2) |
IDCT_ADD_FUNC(, 10, avx) |
IDCT_ADD_FUNC(8_dc, 10, avx) |
IDCT_ADD_FUNC(8, 10, avx) |
#define IDCT_ADD_REP_FUNC(NUM, REP, DEPTH, OPT) \ |
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \ |
(uint8_t *dst, const int *block_offset, \ |
int16_t *block, int stride, const uint8_t nnzc[6 * 8]); |
IDCT_ADD_REP_FUNC(8, 4, 8, mmx) |
IDCT_ADD_REP_FUNC(8, 4, 8, mmxext) |
IDCT_ADD_REP_FUNC(8, 4, 8, sse2) |
IDCT_ADD_REP_FUNC(8, 4, 10, sse2) |
IDCT_ADD_REP_FUNC(8, 4, 10, avx) |
IDCT_ADD_REP_FUNC(, 16, 8, mmx) |
IDCT_ADD_REP_FUNC(, 16, 8, mmxext) |
IDCT_ADD_REP_FUNC(, 16, 8, sse2) |
IDCT_ADD_REP_FUNC(, 16, 10, sse2) |
IDCT_ADD_REP_FUNC(, 16intra, 8, mmx) |
IDCT_ADD_REP_FUNC(, 16intra, 8, mmxext) |
IDCT_ADD_REP_FUNC(, 16intra, 8, sse2) |
IDCT_ADD_REP_FUNC(, 16intra, 10, sse2) |
IDCT_ADD_REP_FUNC(, 16, 10, avx) |
IDCT_ADD_REP_FUNC(, 16intra, 10, avx) |
#define IDCT_ADD_REP_FUNC2(NUM, REP, DEPTH, OPT) \ |
void ff_h264_idct ## NUM ## _add ## REP ## _ ## DEPTH ## _ ## OPT \ |
(uint8_t **dst, const int *block_offset, \ |
int16_t *block, int stride, const uint8_t nnzc[6 * 8]); |
IDCT_ADD_REP_FUNC2(, 8, 8, mmx) |
IDCT_ADD_REP_FUNC2(, 8, 8, mmxext) |
IDCT_ADD_REP_FUNC2(, 8, 8, sse2) |
IDCT_ADD_REP_FUNC2(, 8, 10, sse2) |
IDCT_ADD_REP_FUNC2(, 8, 10, avx) |
void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul); |
void ff_h264_luma_dc_dequant_idct_sse2(int16_t *output, int16_t *input, int qmul); |
/***********************************/ |
/* deblocking */ |
void ff_h264_loop_filter_strength_mmxext(int16_t bS[2][4][4], uint8_t nnz[40], |
int8_t ref[2][40], |
int16_t mv[2][40][2], |
int bidir, int edges, int step, |
int mask_mv0, int mask_mv1, int field); |
#define LF_FUNC(DIR, TYPE, DEPTH, OPT) \ |
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \ |
int stride, \ |
int alpha, \ |
int beta, \ |
int8_t *tc0); |
#define LF_IFUNC(DIR, TYPE, DEPTH, OPT) \ |
void ff_deblock_ ## DIR ## _ ## TYPE ## _ ## DEPTH ## _ ## OPT(uint8_t *pix, \ |
int stride, \ |
int alpha, \ |
int beta); |
#define LF_FUNCS(type, depth) \ |
LF_FUNC(h, chroma, depth, mmxext) \ |
LF_IFUNC(h, chroma_intra, depth, mmxext) \ |
LF_FUNC(v, chroma, depth, mmxext) \ |
LF_IFUNC(v, chroma_intra, depth, mmxext) \ |
LF_FUNC(h, luma, depth, mmxext) \ |
LF_IFUNC(h, luma_intra, depth, mmxext) \ |
LF_FUNC(h, luma, depth, sse2) \ |
LF_IFUNC(h, luma_intra, depth, sse2) \ |
LF_FUNC(v, luma, depth, sse2) \ |
LF_IFUNC(v, luma_intra, depth, sse2) \ |
LF_FUNC(h, chroma, depth, sse2) \ |
LF_IFUNC(h, chroma_intra, depth, sse2) \ |
LF_FUNC(v, chroma, depth, sse2) \ |
LF_IFUNC(v, chroma_intra, depth, sse2) \ |
LF_FUNC(h, luma, depth, avx) \ |
LF_IFUNC(h, luma_intra, depth, avx) \ |
LF_FUNC(v, luma, depth, avx) \ |
LF_IFUNC(v, luma_intra, depth, avx) \ |
LF_FUNC(h, chroma, depth, avx) \ |
LF_IFUNC(h, chroma_intra, depth, avx) \ |
LF_FUNC(v, chroma, depth, avx) \ |
LF_IFUNC(v, chroma_intra, depth, avx) |
LF_FUNCS(uint8_t, 8) |
LF_FUNCS(uint16_t, 10) |
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL |
LF_FUNC(v8, luma, 8, mmxext) |
static void deblock_v_luma_8_mmxext(uint8_t *pix, int stride, int alpha, |
int beta, int8_t *tc0) |
{ |
if ((tc0[0] & tc0[1]) >= 0) |
ff_deblock_v8_luma_8_mmxext(pix + 0, stride, alpha, beta, tc0); |
if ((tc0[2] & tc0[3]) >= 0) |
ff_deblock_v8_luma_8_mmxext(pix + 8, stride, alpha, beta, tc0 + 2); |
} |
LF_IFUNC(v8, luma_intra, 8, mmxext) |
static void deblock_v_luma_intra_8_mmxext(uint8_t *pix, int stride, |
int alpha, int beta) |
{ |
ff_deblock_v8_luma_intra_8_mmxext(pix + 0, stride, alpha, beta); |
ff_deblock_v8_luma_intra_8_mmxext(pix + 8, stride, alpha, beta); |
} |
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */ |
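/* The mmxext deblock kernel processes 8 pixels at a time, so the |
 * 16-pixel luma interface above is assembled from two calls. The |
 * (tc0[0] & tc0[1]) >= 0 test inspects the sign bit of the bitwise AND: |
 * it is negative only when both thresholds are negative, i.e. when |
 * filtering is disabled for both 4-pixel halves. */ |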
LF_FUNC(v, luma, 10, mmxext) |
LF_IFUNC(v, luma_intra, 10, mmxext) |
/***********************************/ |
/* weighted prediction */ |
#define H264_WEIGHT(W, OPT) \ |
void ff_h264_weight_ ## W ## _ ## OPT(uint8_t *dst, int stride, \ |
int height, int log2_denom, \ |
int weight, int offset); |
#define H264_BIWEIGHT(W, OPT) \ |
void ff_h264_biweight_ ## W ## _ ## OPT(uint8_t *dst, uint8_t *src, \ |
int stride, int height, \ |
int log2_denom, int weightd, \ |
int weights, int offset); |
#define H264_BIWEIGHT_MMX(W) \ |
H264_WEIGHT(W, mmxext) \ |
H264_BIWEIGHT(W, mmxext) |
#define H264_BIWEIGHT_MMX_SSE(W) \ |
H264_BIWEIGHT_MMX(W) \ |
H264_WEIGHT(W, sse2) \ |
H264_BIWEIGHT(W, sse2) \ |
H264_BIWEIGHT(W, ssse3) |
H264_BIWEIGHT_MMX_SSE(16) |
H264_BIWEIGHT_MMX_SSE(8) |
H264_BIWEIGHT_MMX(4) |
#define H264_WEIGHT_10(W, DEPTH, OPT) \ |
void ff_h264_weight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \ |
int stride, \ |
int height, \ |
int log2_denom, \ |
int weight, \ |
int offset); |
#define H264_BIWEIGHT_10(W, DEPTH, OPT) \ |
void ff_h264_biweight_ ## W ## _ ## DEPTH ## _ ## OPT(uint8_t *dst, \ |
uint8_t *src, \ |
int stride, \ |
int height, \ |
int log2_denom, \ |
int weightd, \ |
int weights, \ |
int offset); |
#define H264_BIWEIGHT_10_SSE(W, DEPTH) \ |
H264_WEIGHT_10(W, DEPTH, sse2) \ |
H264_WEIGHT_10(W, DEPTH, sse4) \ |
H264_BIWEIGHT_10(W, DEPTH, sse2) \ |
H264_BIWEIGHT_10(W, DEPTH, sse4) |
H264_BIWEIGHT_10_SSE(16, 10) |
H264_BIWEIGHT_10_SSE(8, 10) |
H264_BIWEIGHT_10_SSE(4, 10) |
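/* Runtime dispatch: each EXTERNAL_*() block below overwrites the |
 * pointers installed by the previous, weaker instruction set, so for |
 * any given CPU the strongest supported implementation wins. */ |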
av_cold void ff_h264dsp_init_x86(H264DSPContext *c, const int bit_depth, |
const int chroma_format_idc) |
{ |
#if HAVE_YASM |
int cpu_flags = av_get_cpu_flags(); |
if (chroma_format_idc == 1 && EXTERNAL_MMXEXT(cpu_flags)) |
c->h264_loop_filter_strength = ff_h264_loop_filter_strength_mmxext; |
if (bit_depth == 8) { |
if (EXTERNAL_MMX(cpu_flags)) { |
c->h264_idct_dc_add = |
c->h264_idct_add = ff_h264_idct_add_8_mmx; |
c->h264_idct8_dc_add = |
c->h264_idct8_add = ff_h264_idct8_add_8_mmx; |
c->h264_idct_add16 = ff_h264_idct_add16_8_mmx; |
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmx; |
if (chroma_format_idc == 1) |
c->h264_idct_add8 = ff_h264_idct_add8_8_mmx; |
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmx; |
if (cpu_flags & AV_CPU_FLAG_CMOV) |
c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_mmx; |
} |
if (EXTERNAL_MMXEXT(cpu_flags)) { |
c->h264_idct_dc_add = ff_h264_idct_dc_add_8_mmxext; |
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_8_mmxext; |
c->h264_idct_add16 = ff_h264_idct_add16_8_mmxext; |
c->h264_idct8_add4 = ff_h264_idct8_add4_8_mmxext; |
if (chroma_format_idc == 1) |
c->h264_idct_add8 = ff_h264_idct_add8_8_mmxext; |
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_mmxext; |
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_8_mmxext; |
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_8_mmxext; |
if (chroma_format_idc == 1) { |
c->h264_h_loop_filter_chroma = ff_deblock_h_chroma_8_mmxext; |
c->h264_h_loop_filter_chroma_intra = ff_deblock_h_chroma_intra_8_mmxext; |
} |
#if ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL |
c->h264_v_loop_filter_luma = deblock_v_luma_8_mmxext; |
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_mmxext; |
c->h264_v_loop_filter_luma_intra = deblock_v_luma_intra_8_mmxext; |
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_mmxext; |
#endif /* ARCH_X86_32 && HAVE_MMXEXT_EXTERNAL */ |
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_mmxext; |
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_mmxext; |
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_mmxext; |
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_mmxext; |
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_mmxext; |
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_mmxext; |
} |
if (EXTERNAL_SSE2(cpu_flags)) { |
c->h264_idct8_add = ff_h264_idct8_add_8_sse2; |
c->h264_idct_add16 = ff_h264_idct_add16_8_sse2; |
c->h264_idct8_add4 = ff_h264_idct8_add4_8_sse2; |
if (chroma_format_idc == 1) |
c->h264_idct_add8 = ff_h264_idct_add8_8_sse2; |
c->h264_idct_add16intra = ff_h264_idct_add16intra_8_sse2; |
c->h264_luma_dc_dequant_idct = ff_h264_luma_dc_dequant_idct_sse2; |
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_sse2; |
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_sse2; |
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_sse2; |
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_sse2; |
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_sse2; |
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_sse2; |
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_sse2; |
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_sse2; |
} |
if (EXTERNAL_SSSE3(cpu_flags)) { |
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_ssse3; |
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_ssse3; |
} |
if (EXTERNAL_AVX(cpu_flags)) { |
c->h264_v_loop_filter_luma = ff_deblock_v_luma_8_avx; |
c->h264_h_loop_filter_luma = ff_deblock_h_luma_8_avx; |
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_8_avx; |
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_8_avx; |
} |
} else if (bit_depth == 10) { |
if (EXTERNAL_MMXEXT(cpu_flags)) { |
#if ARCH_X86_32 |
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_mmxext; |
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_mmxext; |
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_mmxext; |
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_mmxext; |
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_mmxext; |
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_mmxext; |
#endif /* ARCH_X86_32 */ |
c->h264_idct_dc_add = ff_h264_idct_dc_add_10_mmxext; |
} |
if (EXTERNAL_SSE2(cpu_flags)) { |
c->h264_idct_add = ff_h264_idct_add_10_sse2; |
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_sse2; |
c->h264_idct_add16 = ff_h264_idct_add16_10_sse2; |
if (chroma_format_idc == 1) |
c->h264_idct_add8 = ff_h264_idct_add8_10_sse2; |
c->h264_idct_add16intra = ff_h264_idct_add16intra_10_sse2; |
#if HAVE_ALIGNED_STACK |
c->h264_idct8_add = ff_h264_idct8_add_10_sse2; |
c->h264_idct8_add4 = ff_h264_idct8_add4_10_sse2; |
#endif /* HAVE_ALIGNED_STACK */ |
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse2; |
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse2; |
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse2; |
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse2; |
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse2; |
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse2; |
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_sse2; |
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_sse2; |
#if HAVE_ALIGNED_STACK |
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_sse2; |
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_sse2; |
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_sse2; |
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_sse2; |
#endif /* HAVE_ALIGNED_STACK */ |
} |
if (EXTERNAL_SSE4(cpu_flags)) { |
c->weight_h264_pixels_tab[0] = ff_h264_weight_16_10_sse4; |
c->weight_h264_pixels_tab[1] = ff_h264_weight_8_10_sse4; |
c->weight_h264_pixels_tab[2] = ff_h264_weight_4_10_sse4; |
c->biweight_h264_pixels_tab[0] = ff_h264_biweight_16_10_sse4; |
c->biweight_h264_pixels_tab[1] = ff_h264_biweight_8_10_sse4; |
c->biweight_h264_pixels_tab[2] = ff_h264_biweight_4_10_sse4; |
} |
if (EXTERNAL_AVX(cpu_flags)) { |
c->h264_idct_dc_add = |
c->h264_idct_add = ff_h264_idct_add_10_avx; |
c->h264_idct8_dc_add = ff_h264_idct8_dc_add_10_avx; |
c->h264_idct_add16 = ff_h264_idct_add16_10_avx; |
if (chroma_format_idc == 1) |
c->h264_idct_add8 = ff_h264_idct_add8_10_avx; |
c->h264_idct_add16intra = ff_h264_idct_add16intra_10_avx; |
#if HAVE_ALIGNED_STACK |
c->h264_idct8_add = ff_h264_idct8_add_10_avx; |
c->h264_idct8_add4 = ff_h264_idct8_add4_10_avx; |
#endif /* HAVE_ALIGNED_STACK */ |
c->h264_v_loop_filter_chroma = ff_deblock_v_chroma_10_avx; |
c->h264_v_loop_filter_chroma_intra = ff_deblock_v_chroma_intra_10_avx; |
#if HAVE_ALIGNED_STACK |
c->h264_v_loop_filter_luma = ff_deblock_v_luma_10_avx; |
c->h264_h_loop_filter_luma = ff_deblock_h_luma_10_avx; |
c->h264_v_loop_filter_luma_intra = ff_deblock_v_luma_intra_10_avx; |
c->h264_h_loop_filter_luma_intra = ff_deblock_h_luma_intra_10_avx; |
#endif /* HAVE_ALIGNED_STACK */ |
} |
} |
#endif /* HAVE_YASM */ |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/hpeldsp.asm |
---|
0,0 → 1,461 |
;****************************************************************************** |
;* |
;* Copyright (c) 2000-2001 Fabrice Bellard <fabrice@bellard.org> |
;* Copyright (c) Nick Kurshev <nickols_k@mail.ru> |
;* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at> |
;* Copyright (c) 2002 Zdenek Kabelac <kabi@informatics.muni.cz> |
;* Copyright (c) 2013 Daniel Kang |
;* |
;* MMX optimized hpel functions |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
cextern pb_1 |
SECTION_TEXT |
; put_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
%macro PUT_PIXELS8_X2 0 |
cglobal put_pixels8_x2, 4,5 |
lea r4, [r2*2] |
.loop: |
mova m0, [r1] |
mova m1, [r1+r2] |
PAVGB m0, [r1+1] |
PAVGB m1, [r1+r2+1] |
mova [r0], m0 |
mova [r0+r2], m1 |
add r1, r4 |
add r0, r4 |
mova m0, [r1] |
mova m1, [r1+r2] |
PAVGB m0, [r1+1] |
PAVGB m1, [r1+r2+1] |
add r1, r4 |
mova [r0], m0 |
mova [r0+r2], m1 |
add r0, r4 |
sub r3d, 4 |
jne .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PUT_PIXELS8_X2 |
INIT_MMX 3dnow |
PUT_PIXELS8_X2 |
; put_pixels16_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
%macro PUT_PIXELS_16 0 |
cglobal put_pixels16_x2, 4,5 |
lea r4, [r2*2] |
.loop: |
mova m0, [r1] |
mova m1, [r1+r2] |
mova m2, [r1+8] |
mova m3, [r1+r2+8] |
PAVGB m0, [r1+1] |
PAVGB m1, [r1+r2+1] |
PAVGB m2, [r1+9] |
PAVGB m3, [r1+r2+9] |
mova [r0], m0 |
mova [r0+r2], m1 |
mova [r0+8], m2 |
mova [r0+r2+8], m3 |
add r1, r4 |
add r0, r4 |
mova m0, [r1] |
mova m1, [r1+r2] |
mova m2, [r1+8] |
mova m3, [r1+r2+8] |
PAVGB m0, [r1+1] |
PAVGB m1, [r1+r2+1] |
PAVGB m2, [r1+9] |
PAVGB m3, [r1+r2+9] |
add r1, r4 |
mova [r0], m0 |
mova [r0+r2], m1 |
mova [r0+8], m2 |
mova [r0+r2+8], m3 |
add r0, r4 |
sub r3d, 4 |
jne .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PUT_PIXELS_16 |
INIT_MMX 3dnow |
PUT_PIXELS_16 |
; put_no_rnd_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
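; A note on the no-rnd trick used below: PAVGB rounds up, computing |
; (a + b + 1) >> 1, while the no-rnd variants want (a + b) >> 1. |
; Subtracting 1 from one operand first with unsigned saturation |
; (psubusb with pb_1) gives the truncating average everywhere except |
; when that operand is 0; the _exact variants further down use the |
; complement identity instead to be bit-exact. |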
%macro PUT_NO_RND_PIXELS8_X2 0 |
cglobal put_no_rnd_pixels8_x2, 4,5 |
mova m6, [pb_1] |
lea r4, [r2*2] |
.loop: |
mova m0, [r1] |
mova m2, [r1+r2] |
mova m1, [r1+1] |
mova m3, [r1+r2+1] |
add r1, r4 |
psubusb m0, m6 |
psubusb m2, m6 |
PAVGB m0, m1 |
PAVGB m2, m3 |
mova [r0], m0 |
mova [r0+r2], m2 |
mova m0, [r1] |
mova m1, [r1+1] |
mova m2, [r1+r2] |
mova m3, [r1+r2+1] |
add r0, r4 |
add r1, r4 |
psubusb m0, m6 |
psubusb m2, m6 |
PAVGB m0, m1 |
PAVGB m2, m3 |
mova [r0], m0 |
mova [r0+r2], m2 |
add r0, r4 |
sub r3d, 4 |
jne .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PUT_NO_RND_PIXELS8_X2 |
INIT_MMX 3dnow |
PUT_NO_RND_PIXELS8_X2 |
; put_no_rnd_pixels8_x2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
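; Bit-exact truncating average via the complement identity: |
; (a + b) >> 1 == ~((~a + ~b + 1) >> 1).  m6 is set to all-ones with |
; pcmpeqb, both inputs are XORed with it (byte-wise NOT), averaged |
; with the rounding PAVGB, and the result is XORed back. |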
%macro PUT_NO_RND_PIXELS8_X2_EXACT 0 |
cglobal put_no_rnd_pixels8_x2_exact, 4,5 |
lea r4, [r2*3] |
pcmpeqb m6, m6 |
.loop: |
mova m0, [r1] |
mova m2, [r1+r2] |
mova m1, [r1+1] |
mova m3, [r1+r2+1] |
pxor m0, m6 |
pxor m2, m6 |
pxor m1, m6 |
pxor m3, m6 |
PAVGB m0, m1 |
PAVGB m2, m3 |
pxor m0, m6 |
pxor m2, m6 |
mova [r0], m0 |
mova [r0+r2], m2 |
mova m0, [r1+r2*2] |
mova m1, [r1+r2*2+1] |
mova m2, [r1+r4] |
mova m3, [r1+r4+1] |
pxor m0, m6 |
pxor m1, m6 |
pxor m2, m6 |
pxor m3, m6 |
PAVGB m0, m1 |
PAVGB m2, m3 |
pxor m0, m6 |
pxor m2, m6 |
mova [r0+r2*2], m0 |
mova [r0+r4], m2 |
lea r1, [r1+r2*4] |
lea r0, [r0+r2*4] |
sub r3d, 4 |
jg .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PUT_NO_RND_PIXELS8_X2_EXACT |
INIT_MMX 3dnow |
PUT_NO_RND_PIXELS8_X2_EXACT |
; put_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
%macro PUT_PIXELS8_Y2 0 |
cglobal put_pixels8_y2, 4,5 |
lea r4, [r2*2] |
mova m0, [r1] |
sub r0, r2 |
.loop: |
mova m1, [r1+r2] |
mova m2, [r1+r4] |
add r1, r4 |
PAVGB m0, m1 |
PAVGB m1, m2 |
mova [r0+r2], m0 |
mova [r0+r4], m1 |
mova m1, [r1+r2] |
mova m0, [r1+r4] |
add r0, r4 |
add r1, r4 |
PAVGB m2, m1 |
PAVGB m1, m0 |
mova [r0+r2], m2 |
mova [r0+r4], m1 |
add r0, r4 |
sub r3d, 4 |
jne .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PUT_PIXELS8_Y2 |
INIT_MMX 3dnow |
PUT_PIXELS8_Y2 |
; put_no_rnd_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
%macro PUT_NO_RND_PIXELS8_Y2 0 |
cglobal put_no_rnd_pixels8_y2, 4,5 |
mova m6, [pb_1] |
lea r4, [r2+r2] |
mova m0, [r1] |
sub r0, r2 |
.loop: |
mova m1, [r1+r2] |
mova m2, [r1+r4] |
add r1, r4 |
psubusb m1, m6 |
PAVGB m0, m1 |
PAVGB m1, m2 |
mova [r0+r2], m0 |
mova [r0+r4], m1 |
mova m1, [r1+r2] |
mova m0, [r1+r4] |
add r0, r4 |
add r1, r4 |
psubusb m1, m6 |
PAVGB m2, m1 |
PAVGB m1, m0 |
mova [r0+r2], m2 |
mova [r0+r4], m1 |
add r0, r4 |
sub r3d, 4 |
jne .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PUT_NO_RND_PIXELS8_Y2 |
INIT_MMX 3dnow |
PUT_NO_RND_PIXELS8_Y2 |
; put_no_rnd_pixels8_y2_exact(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
%macro PUT_NO_RND_PIXELS8_Y2_EXACT 0 |
cglobal put_no_rnd_pixels8_y2_exact, 4,5 |
lea r4, [r2*3] |
mova m0, [r1] |
pcmpeqb m6, m6 |
add r1, r2 |
pxor m0, m6 |
.loop: |
mova m1, [r1] |
mova m2, [r1+r2] |
pxor m1, m6 |
pxor m2, m6 |
PAVGB m0, m1 |
PAVGB m1, m2 |
pxor m0, m6 |
pxor m1, m6 |
mova [r0], m0 |
mova [r0+r2], m1 |
mova m1, [r1+r2*2] |
mova m0, [r1+r4] |
pxor m1, m6 |
pxor m0, m6 |
PAVGB m2, m1 |
PAVGB m1, m0 |
pxor m2, m6 |
pxor m1, m6 |
mova [r0+r2*2], m2 |
mova [r0+r4], m1 |
lea r1, [r1+r2*4] |
lea r0, [r0+r2*4] |
sub r3d, 4 |
jg .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PUT_NO_RND_PIXELS8_Y2_EXACT |
INIT_MMX 3dnow |
PUT_NO_RND_PIXELS8_Y2_EXACT |
; avg_pixels8(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
%macro AVG_PIXELS8 0 |
cglobal avg_pixels8, 4,5 |
lea r4, [r2*2] |
.loop: |
mova m0, [r0] |
mova m1, [r0+r2] |
PAVGB m0, [r1] |
PAVGB m1, [r1+r2] |
mova [r0], m0 |
mova [r0+r2], m1 |
add r1, r4 |
add r0, r4 |
mova m0, [r0] |
mova m1, [r0+r2] |
PAVGB m0, [r1] |
PAVGB m1, [r1+r2] |
add r1, r4 |
mova [r0], m0 |
mova [r0+r2], m1 |
add r0, r4 |
sub r3d, 4 |
jne .loop |
REP_RET |
%endmacro |
INIT_MMX 3dnow |
AVG_PIXELS8 |
; avg_pixels8_x2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
%macro AVG_PIXELS8_X2 0 |
cglobal avg_pixels8_x2, 4,5 |
lea r4, [r2*2] |
.loop: |
mova m0, [r1] |
mova m2, [r1+r2] |
PAVGB m0, [r1+1] |
PAVGB m2, [r1+r2+1] |
PAVGB m0, [r0] |
PAVGB m2, [r0+r2] |
add r1, r4 |
mova [r0], m0 |
mova [r0+r2], m2 |
mova m0, [r1] |
mova m2, [r1+r2] |
PAVGB m0, [r1+1] |
PAVGB m2, [r1+r2+1] |
add r0, r4 |
add r1, r4 |
PAVGB m0, [r0] |
PAVGB m2, [r0+r2] |
mova [r0], m0 |
mova [r0+r2], m2 |
add r0, r4 |
sub r3d, 4 |
jne .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
AVG_PIXELS8_X2 |
INIT_MMX 3dnow |
AVG_PIXELS8_X2 |
; avg_pixels8_y2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
%macro AVG_PIXELS8_Y2 0 |
cglobal avg_pixels8_y2, 4,5 |
lea r4, [r2*2] |
mova m0, [r1] |
sub r0, r2 |
.loop: |
mova m1, [r1+r2] |
mova m2, [r1+r4] |
add r1, r4 |
PAVGB m0, m1 |
PAVGB m1, m2 |
mova m3, [r0+r2] |
mova m4, [r0+r4] |
PAVGB m0, m3 |
PAVGB m1, m4 |
mova [r0+r2], m0 |
mova [r0+r4], m1 |
mova m1, [r1+r2] |
mova m0, [r1+r4] |
PAVGB m2, m1 |
PAVGB m1, m0 |
add r0, r4 |
add r1, r4 |
mova m3, [r0+r2] |
mova m4, [r0+r4] |
PAVGB m2, m3 |
PAVGB m1, m4 |
mova [r0+r2], m2 |
mova [r0+r4], m1 |
add r0, r4 |
sub r3d, 4 |
jne .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
AVG_PIXELS8_Y2 |
INIT_MMX 3dnow |
AVG_PIXELS8_Y2 |
; avg_pixels8_xy2(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
%macro AVG_PIXELS8_XY2 0 |
cglobal avg_pixels8_xy2, 4,5 |
mova m6, [pb_1] |
lea r4, [r2*2] |
mova m0, [r1] |
pavgb m0, [r1+1] |
.loop: |
mova m2, [r1+r4] |
mova m1, [r1+r2] |
psubusb m2, m6 |
pavgb m1, [r1+r2+1] |
pavgb m2, [r1+r4+1] |
add r1, r4 |
pavgb m0, m1 |
pavgb m1, m2 |
pavgb m0, [r0] |
pavgb m1, [r0+r2] |
mova [r0], m0 |
mova [r0+r2], m1 |
mova m1, [r1+r2] |
mova m0, [r1+r4] |
pavgb m1, [r1+r2+1] |
pavgb m0, [r1+r4+1] |
add r0, r4 |
add r1, r4 |
pavgb m2, m1 |
pavgb m1, m0 |
pavgb m2, [r0] |
pavgb m1, [r0+r2] |
mova [r0], m2 |
mova [r0+r2], m1 |
add r0, r4 |
sub r3d, 4 |
jne .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
AVG_PIXELS8_XY2 |
INIT_MMX 3dnow |
AVG_PIXELS8_XY2 |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/hpeldsp_init.c |
---|
0,0 → 1,269 |
/* |
* MMX optimized DSP utils |
* Copyright (c) 2000, 2001 Fabrice Bellard |
* Copyright (c) 2002-2004 Michael Niedermayer <michaelni@gmx.at> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
* |
* MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
*/ |
#include "libavutil/cpu.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/hpeldsp.h" |
#include "dsputil_x86.h" |
void ff_put_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_pixels16_x2_mmxext(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_pixels16_x2_3dnow(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_no_rnd_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_no_rnd_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_no_rnd_pixels8_x2_exact_mmxext(uint8_t *block, |
const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_no_rnd_pixels8_x2_exact_3dnow(uint8_t *block, |
const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_no_rnd_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_no_rnd_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_no_rnd_pixels8_y2_exact_mmxext(uint8_t *block, |
const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_put_no_rnd_pixels8_y2_exact_3dnow(uint8_t *block, |
const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_avg_pixels8_3dnow(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_avg_pixels8_x2_mmxext(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_avg_pixels8_x2_3dnow(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_avg_pixels8_y2_mmxext(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_avg_pixels8_y2_3dnow(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_avg_pixels8_xy2_mmxext(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
void ff_avg_pixels8_xy2_3dnow(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h); |
#define avg_pixels8_mmx ff_avg_pixels8_mmx |
#define avg_pixels8_x2_mmx ff_avg_pixels8_x2_mmx |
#define avg_pixels16_mmx ff_avg_pixels16_mmx |
#define avg_pixels8_xy2_mmx ff_avg_pixels8_xy2_mmx |
#define avg_pixels16_xy2_mmx ff_avg_pixels16_xy2_mmx |
#define put_pixels8_mmx ff_put_pixels8_mmx |
#define put_pixels16_mmx ff_put_pixels16_mmx |
#define put_pixels8_xy2_mmx ff_put_pixels8_xy2_mmx |
#define put_pixels16_xy2_mmx ff_put_pixels16_xy2_mmx |
#define avg_no_rnd_pixels16_mmx ff_avg_pixels16_mmx |
#define put_no_rnd_pixels8_mmx ff_put_pixels8_mmx |
#define put_no_rnd_pixels16_mmx ff_put_pixels16_mmx |
#if HAVE_INLINE_ASM |
/***********************************/ |
/* MMX no rounding */ |
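/* Template trick: the includes below are compiled once per rounding |
 * mode.  DEF() builds the function names and SET_RND / PAVGBP / PAVGB |
 * select the averaging flavor, so with the macros above |
 * DEF(put, pixels8_x2) expands to put_no_rnd_pixels8_x2_mmx. */ |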
#define DEF(x, y) x ## _no_rnd_ ## y ## _mmx |
#define SET_RND MOVQ_WONE |
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX_NO_RND(a, b, c, d, e, f) |
#define PAVGB(a, b, c, e) PAVGB_MMX_NO_RND(a, b, c, e) |
#define STATIC static |
#include "rnd_template.c" |
#include "hpeldsp_rnd_template.c" |
#undef DEF |
#undef SET_RND |
#undef PAVGBP |
#undef PAVGB |
#undef STATIC |
PIXELS16(static, avg_no_rnd, , _y2, _mmx) |
PIXELS16(static, put_no_rnd, , _y2, _mmx) |
PIXELS16(static, avg_no_rnd, , _xy2, _mmx) |
PIXELS16(static, put_no_rnd, , _xy2, _mmx) |
/***********************************/ |
/* MMX rounding */ |
#define DEF(x, y) x ## _ ## y ## _mmx |
#define SET_RND MOVQ_WTWO |
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) |
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) |
#include "hpeldsp_rnd_template.c" |
#undef DEF |
#undef SET_RND |
#undef PAVGBP |
#undef PAVGB |
PIXELS16(static, avg, , _y2, _mmx) |
PIXELS16(static, put, , _y2, _mmx) |
#endif /* HAVE_INLINE_ASM */ |
#if HAVE_YASM |
#define HPELDSP_AVG_PIXELS16(CPUEXT) \ |
PIXELS16(static, put_no_rnd, ff_, _x2, CPUEXT) \ |
PIXELS16(static, put, ff_, _y2, CPUEXT) \ |
PIXELS16(static, put_no_rnd, ff_, _y2, CPUEXT) \ |
PIXELS16(static, avg, ff_, , CPUEXT) \ |
PIXELS16(static, avg, ff_, _x2, CPUEXT) \ |
PIXELS16(static, avg, ff_, _y2, CPUEXT) \ |
PIXELS16(static, avg, ff_, _xy2, CPUEXT) |
HPELDSP_AVG_PIXELS16(_3dnow) |
HPELDSP_AVG_PIXELS16(_mmxext) |
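/* PIXELS16() (from dsputil_x86.h) is assumed to wrap each 8-pixel |
 * asm routine into a 16-pixel version by calling it twice, at x |
 * offsets 0 and 8; e.g. avg_pixels16_mmxext ends up calling |
 * ff_avg_pixels8_mmxext twice per invocation. */ |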
#endif /* HAVE_YASM */ |
#define SET_HPEL_FUNCS(PFX, IDX, SIZE, CPU) \ |
do { \ |
c->PFX ## _pixels_tab IDX [0] = PFX ## _pixels ## SIZE ## _ ## CPU; \ |
c->PFX ## _pixels_tab IDX [1] = PFX ## _pixels ## SIZE ## _x2_ ## CPU; \ |
c->PFX ## _pixels_tab IDX [2] = PFX ## _pixels ## SIZE ## _y2_ ## CPU; \ |
c->PFX ## _pixels_tab IDX [3] = PFX ## _pixels ## SIZE ## _xy2_ ## CPU; \ |
} while (0) |
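/* Illustrative expansion: SET_HPEL_FUNCS(put, [0], 16, mmx) assigns |
 * put_pixels16_mmx, put_pixels16_x2_mmx, put_pixels16_y2_mmx and |
 * put_pixels16_xy2_mmx to c->put_pixels_tab[0][0..3], i.e. the |
 * full-pel, half-pel-x, half-pel-y and half-pel-xy variants of the |
 * 16x16 case ([1] holds the 8x8 table). */ |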
static void hpeldsp_init_mmx(HpelDSPContext *c, int flags, int cpu_flags) |
{ |
#if HAVE_MMX_INLINE |
SET_HPEL_FUNCS(put, [0], 16, mmx); |
SET_HPEL_FUNCS(put_no_rnd, [0], 16, mmx); |
SET_HPEL_FUNCS(avg, [0], 16, mmx); |
SET_HPEL_FUNCS(avg_no_rnd, , 16, mmx); |
SET_HPEL_FUNCS(put, [1], 8, mmx); |
SET_HPEL_FUNCS(put_no_rnd, [1], 8, mmx); |
SET_HPEL_FUNCS(avg, [1], 8, mmx); |
#endif /* HAVE_MMX_INLINE */ |
} |
static void hpeldsp_init_mmxext(HpelDSPContext *c, int flags, int cpu_flags) |
{ |
#if HAVE_MMXEXT_EXTERNAL |
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_mmxext; |
c->put_pixels_tab[0][2] = put_pixels16_y2_mmxext; |
c->avg_pixels_tab[0][0] = avg_pixels16_mmxext; |
c->avg_pixels_tab[0][1] = avg_pixels16_x2_mmxext; |
c->avg_pixels_tab[0][2] = avg_pixels16_y2_mmxext; |
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_mmxext; |
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_mmxext; |
c->avg_pixels_tab[1][0] = ff_avg_pixels8_mmxext; |
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_mmxext; |
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_mmxext; |
if (!(flags & CODEC_FLAG_BITEXACT)) { |
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_mmxext; |
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_mmxext; |
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_mmxext; |
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_mmxext; |
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_mmxext; |
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_mmxext; |
} |
if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) { |
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_mmxext; |
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_mmxext; |
} |
#endif /* HAVE_MMXEXT_EXTERNAL */ |
} |
static void hpeldsp_init_3dnow(HpelDSPContext *c, int flags, int cpu_flags) |
{ |
#if HAVE_AMD3DNOW_EXTERNAL |
c->put_pixels_tab[0][1] = ff_put_pixels16_x2_3dnow; |
c->put_pixels_tab[0][2] = put_pixels16_y2_3dnow; |
c->avg_pixels_tab[0][0] = avg_pixels16_3dnow; |
c->avg_pixels_tab[0][1] = avg_pixels16_x2_3dnow; |
c->avg_pixels_tab[0][2] = avg_pixels16_y2_3dnow; |
c->put_pixels_tab[1][1] = ff_put_pixels8_x2_3dnow; |
c->put_pixels_tab[1][2] = ff_put_pixels8_y2_3dnow; |
c->avg_pixels_tab[1][0] = ff_avg_pixels8_3dnow; |
c->avg_pixels_tab[1][1] = ff_avg_pixels8_x2_3dnow; |
c->avg_pixels_tab[1][2] = ff_avg_pixels8_y2_3dnow; |
if (!(flags & CODEC_FLAG_BITEXACT)){ |
c->put_no_rnd_pixels_tab[0][1] = put_no_rnd_pixels16_x2_3dnow; |
c->put_no_rnd_pixels_tab[0][2] = put_no_rnd_pixels16_y2_3dnow; |
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_3dnow; |
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_3dnow; |
c->avg_pixels_tab[0][3] = avg_pixels16_xy2_3dnow; |
c->avg_pixels_tab[1][3] = ff_avg_pixels8_xy2_3dnow; |
} |
if (flags & CODEC_FLAG_BITEXACT && CONFIG_VP3_DECODER) { |
c->put_no_rnd_pixels_tab[1][1] = ff_put_no_rnd_pixels8_x2_exact_3dnow; |
c->put_no_rnd_pixels_tab[1][2] = ff_put_no_rnd_pixels8_y2_exact_3dnow; |
} |
#endif /* HAVE_AMD3DNOW_EXTERNAL */ |
} |
static void hpeldsp_init_sse2(HpelDSPContext *c, int flags, int cpu_flags) |
{ |
#if HAVE_SSE2_EXTERNAL |
if (!(cpu_flags & AV_CPU_FLAG_SSE2SLOW)) { |
// these functions are slower than mmx on AMD, but faster on Intel |
c->put_pixels_tab[0][0] = ff_put_pixels16_sse2; |
c->put_no_rnd_pixels_tab[0][0] = ff_put_pixels16_sse2; |
c->avg_pixels_tab[0][0] = ff_avg_pixels16_sse2; |
} |
#endif /* HAVE_SSE2_EXTERNAL */ |
} |
void ff_hpeldsp_init_x86(HpelDSPContext *c, int flags) |
{ |
int cpu_flags = av_get_cpu_flags(); |
if (INLINE_MMX(cpu_flags)) |
hpeldsp_init_mmx(c, flags, cpu_flags); |
if (EXTERNAL_MMXEXT(cpu_flags)) |
hpeldsp_init_mmxext(c, flags, cpu_flags); |
if (EXTERNAL_AMD3DNOW(cpu_flags)) |
hpeldsp_init_3dnow(c, flags, cpu_flags); |
if (EXTERNAL_SSE2(cpu_flags)) |
hpeldsp_init_sse2(c, flags, cpu_flags); |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/hpeldsp_mmx.c |
---|
0,0 → 1,52 |
/* |
* MMX-optimized avg/put pixel routines |
* |
* Copyright (c) 2001 Fabrice Bellard |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include <stddef.h> |
#include <stdint.h> |
#include "config.h" |
#include "dsputil_x86.h" |
#if HAVE_MMX_INLINE |
void ff_avg_pixels8_x2_mmx(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h) |
{ |
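/* MOVQ_BFE presumably loads mm6 with the 0xFE byte mask that the |
 * PAVGB_MMX macro needs to emulate pavgb on plain MMX; JUMPALIGN |
 * aligns the loop.  Each iteration averages a source qword with its |
 * right neighbor ("1%1" addresses pixels + 1) and then averages the |
 * result with the destination, giving avg(dst, avg(src, src + 1)). */ |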
MOVQ_BFE(mm6); |
JUMPALIGN(); |
do { |
__asm__ volatile( |
"movq %1, %%mm0 \n\t" |
"movq 1%1, %%mm1 \n\t" |
"movq %0, %%mm3 \n\t" |
PAVGB_MMX(%%mm0, %%mm1, %%mm2, %%mm6) |
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) |
"movq %%mm0, %0 \n\t" |
:"+m"(*block) |
:"m"(*pixels) |
:"memory"); |
pixels += line_size; |
block += line_size; |
} while (--h); |
} |
#endif /* HAVE_MMX_INLINE */ |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/hpeldsp_rnd_template.c |
---|
0,0 → 1,198 |
/* |
* DSP utils mmx functions are compiled twice for rnd/no_rnd |
* Copyright (c) 2000, 2001 Fabrice Bellard |
* Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at> |
* |
* MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at> |
* and improved by Zdenek Kabelac <kabi@users.sf.net> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
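// This template is included twice from hpeldsp_init.c: once with |
// DEF(x, y) = x ## _no_rnd_ ## y ## _mmx and the truncating PAVGB* |
// macros, and once with DEF(x, y) = x ## _ ## y ## _mmx and the |
// rounding ones, so every function here comes in both flavors. |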
// put_pixels |
static void DEF(put, pixels8_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
{ |
MOVQ_BFE(mm6); |
__asm__ volatile( |
"lea (%3, %3), %%"REG_a" \n\t" |
".p2align 3 \n\t" |
"1: \n\t" |
"movq (%1), %%mm0 \n\t" |
"movq 1(%1), %%mm1 \n\t" |
"movq (%1, %3), %%mm2 \n\t" |
"movq 1(%1, %3), %%mm3 \n\t" |
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
"movq %%mm4, (%2) \n\t" |
"movq %%mm5, (%2, %3) \n\t" |
"add %%"REG_a", %1 \n\t" |
"add %%"REG_a", %2 \n\t" |
"movq (%1), %%mm0 \n\t" |
"movq 1(%1), %%mm1 \n\t" |
"movq (%1, %3), %%mm2 \n\t" |
"movq 1(%1, %3), %%mm3 \n\t" |
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
"movq %%mm4, (%2) \n\t" |
"movq %%mm5, (%2, %3) \n\t" |
"add %%"REG_a", %1 \n\t" |
"add %%"REG_a", %2 \n\t" |
"subl $4, %0 \n\t" |
"jnz 1b \n\t" |
:"+g"(h), "+S"(pixels), "+D"(block) |
:"r"((x86_reg)line_size) |
:REG_a, "memory"); |
} |
static void DEF(put, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
{ |
MOVQ_BFE(mm6); |
__asm__ volatile( |
"lea (%3, %3), %%"REG_a" \n\t" |
".p2align 3 \n\t" |
"1: \n\t" |
"movq (%1), %%mm0 \n\t" |
"movq 1(%1), %%mm1 \n\t" |
"movq (%1, %3), %%mm2 \n\t" |
"movq 1(%1, %3), %%mm3 \n\t" |
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
"movq %%mm4, (%2) \n\t" |
"movq %%mm5, (%2, %3) \n\t" |
"movq 8(%1), %%mm0 \n\t" |
"movq 9(%1), %%mm1 \n\t" |
"movq 8(%1, %3), %%mm2 \n\t" |
"movq 9(%1, %3), %%mm3 \n\t" |
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
"movq %%mm4, 8(%2) \n\t" |
"movq %%mm5, 8(%2, %3) \n\t" |
"add %%"REG_a", %1 \n\t" |
"add %%"REG_a", %2 \n\t" |
"movq (%1), %%mm0 \n\t" |
"movq 1(%1), %%mm1 \n\t" |
"movq (%1, %3), %%mm2 \n\t" |
"movq 1(%1, %3), %%mm3 \n\t" |
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
"movq %%mm4, (%2) \n\t" |
"movq %%mm5, (%2, %3) \n\t" |
"movq 8(%1), %%mm0 \n\t" |
"movq 9(%1), %%mm1 \n\t" |
"movq 8(%1, %3), %%mm2 \n\t" |
"movq 9(%1, %3), %%mm3 \n\t" |
PAVGBP(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
"movq %%mm4, 8(%2) \n\t" |
"movq %%mm5, 8(%2, %3) \n\t" |
"add %%"REG_a", %1 \n\t" |
"add %%"REG_a", %2 \n\t" |
"subl $4, %0 \n\t" |
"jnz 1b \n\t" |
:"+g"(h), "+S"(pixels), "+D"(block) |
:"r"((x86_reg)line_size) |
:REG_a, "memory"); |
} |
static void DEF(put, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
{ |
MOVQ_BFE(mm6); |
__asm__ volatile( |
"lea (%3, %3), %%"REG_a" \n\t" |
"movq (%1), %%mm0 \n\t" |
".p2align 3 \n\t" |
"1: \n\t" |
"movq (%1, %3), %%mm1 \n\t" |
"movq (%1, %%"REG_a"),%%mm2 \n\t" |
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) |
"movq %%mm4, (%2) \n\t" |
"movq %%mm5, (%2, %3) \n\t" |
"add %%"REG_a", %1 \n\t" |
"add %%"REG_a", %2 \n\t" |
"movq (%1, %3), %%mm1 \n\t" |
"movq (%1, %%"REG_a"),%%mm0 \n\t" |
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) |
"movq %%mm4, (%2) \n\t" |
"movq %%mm5, (%2, %3) \n\t" |
"add %%"REG_a", %1 \n\t" |
"add %%"REG_a", %2 \n\t" |
"subl $4, %0 \n\t" |
"jnz 1b \n\t" |
:"+g"(h), "+S"(pixels), "+D"(block) |
:"r"((x86_reg)line_size) |
:REG_a, "memory"); |
} |
static void DEF(avg, pixels16_x2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
{ |
MOVQ_BFE(mm6); |
JUMPALIGN(); |
do { |
__asm__ volatile( |
"movq %1, %%mm0 \n\t" |
"movq 1%1, %%mm1 \n\t" |
"movq %0, %%mm3 \n\t" |
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) |
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) |
"movq %%mm0, %0 \n\t" |
"movq 8%1, %%mm0 \n\t" |
"movq 9%1, %%mm1 \n\t" |
"movq 8%0, %%mm3 \n\t" |
PAVGB(%%mm0, %%mm1, %%mm2, %%mm6) |
PAVGB_MMX(%%mm3, %%mm2, %%mm0, %%mm6) |
"movq %%mm0, 8%0 \n\t" |
:"+m"(*block) |
:"m"(*pixels) |
:"memory"); |
pixels += line_size; |
block += line_size; |
} while (--h); |
} |
static void DEF(avg, pixels8_y2)(uint8_t *block, const uint8_t *pixels, ptrdiff_t line_size, int h) |
{ |
MOVQ_BFE(mm6); |
__asm__ volatile( |
"lea (%3, %3), %%"REG_a" \n\t" |
"movq (%1), %%mm0 \n\t" |
".p2align 3 \n\t" |
"1: \n\t" |
"movq (%1, %3), %%mm1 \n\t" |
"movq (%1, %%"REG_a"), %%mm2 \n\t" |
PAVGBP(%%mm1, %%mm0, %%mm4, %%mm2, %%mm1, %%mm5) |
"movq (%2), %%mm3 \n\t" |
PAVGB_MMX(%%mm3, %%mm4, %%mm0, %%mm6) |
"movq (%2, %3), %%mm3 \n\t" |
PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6) |
"movq %%mm0, (%2) \n\t" |
"movq %%mm1, (%2, %3) \n\t" |
"add %%"REG_a", %1 \n\t" |
"add %%"REG_a", %2 \n\t" |
"movq (%1, %3), %%mm1 \n\t" |
"movq (%1, %%"REG_a"), %%mm0 \n\t" |
PAVGBP(%%mm1, %%mm2, %%mm4, %%mm0, %%mm1, %%mm5) |
"movq (%2), %%mm3 \n\t" |
PAVGB_MMX(%%mm3, %%mm4, %%mm2, %%mm6) |
"movq (%2, %3), %%mm3 \n\t" |
PAVGB_MMX(%%mm3, %%mm5, %%mm1, %%mm6) |
"movq %%mm2, (%2) \n\t" |
"movq %%mm1, (%2, %3) \n\t" |
"add %%"REG_a", %1 \n\t" |
"add %%"REG_a", %2 \n\t" |
"subl $4, %0 \n\t" |
"jnz 1b \n\t" |
:"+g"(h), "+S"(pixels), "+D"(block) |
:"r"((x86_reg)line_size) |
:REG_a, "memory"); |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/idct_mmx_xvid.c |
---|
0,0 → 1,562 |
/* |
* XVID MPEG-4 VIDEO CODEC |
* - MMX and XMM forward discrete cosine transform - |
* |
* Copyright(C) 2001 Peter Ross <pross@xvid.org> |
* |
* Originally provided by Intel at AP-922 |
* http://developer.intel.com/vtune/cbts/strmsimd/922down.htm |
* (See more app notes at http://developer.intel.com/vtune/cbts/strmsimd/appnotes.htm) |
* but only in a limited form. |
* A new macro implements the column pass of a precise iDCT; |
* the routine's precision now satisfies IEEE standard 1180-1990. |
* |
* Copyright(C) 2000-2001 Peter Gubanov <peter@elecard.net.ru> |
* Rounding trick Copyright(C) 2000 Michel Lespinasse <walken@zoy.org> |
* |
* http://www.elecard.com/peter/idct.html |
* http://www.linuxvideo.org/mpeg2dec/ |
* |
* These examples contain code fragments for first stage iDCT 8x8 |
* (for rows) and first stage DCT 8x8 (for columns) |
* |
* conversion to gcc syntax by Michael Niedermayer |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public License |
* along with FFmpeg; if not, write to the Free Software Foundation, |
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include <inttypes.h> |
#include "config.h" |
#include "libavcodec/avcodec.h" |
#include "libavutil/mem.h" |
#include "dsputil_x86.h" |
#include "idct_xvid.h" |
#if HAVE_MMX_INLINE |
//============================================================================= |
// Macros and other preprocessor constants |
//============================================================================= |
#define BITS_INV_ACC 5 // 4 or 5 for IEEE |
#define SHIFT_INV_ROW (16 - BITS_INV_ACC) //11 |
#define SHIFT_INV_COL (1 + BITS_INV_ACC) //6 |
#define RND_INV_ROW (1024 * (6 - BITS_INV_ACC)) |
#define RND_INV_COL (16 * (BITS_INV_ACC - 3)) |
#define RND_INV_CORR (RND_INV_COL - 1) |
#define BITS_FRW_ACC 3 // 2 or 3 for accuracy |
#define SHIFT_FRW_COL BITS_FRW_ACC |
#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17) |
#define RND_FRW_ROW (262144*(BITS_FRW_ACC - 1)) |
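// Worked out for BITS_INV_ACC = 5: SHIFT_INV_ROW = 11, |
// SHIFT_INV_COL = 6, RND_INV_ROW = 1024, RND_INV_COL = 32 and |
// RND_INV_CORR = 31.  The row pass actually adds the per-row |
// rounders from rounder_0[] below before shifting. |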
//----------------------------------------------------------------------------- |
// Various memory constants (trigonometric values or rounding values) |
//----------------------------------------------------------------------------- |
DECLARE_ALIGNED(8, static const int16_t, tg_1_16)[4*4] = { |
13036,13036,13036,13036, // tg * (2<<16) + 0.5 |
27146,27146,27146,27146, // tg * (2<<16) + 0.5 |
-21746,-21746,-21746,-21746, // tg * (2<<16) + 0.5 |
23170,23170,23170,23170}; // cos * (2<<15) + 0.5 |
DECLARE_ALIGNED(8, static const int32_t, rounder_0)[2*8] = { |
65536,65536, |
3597,3597, |
2260,2260, |
1203,1203, |
0,0, |
120,120, |
512,512, |
512,512}; |
//----------------------------------------------------------------------------- |
// |
// The first stage iDCT 8x8 - inverse DCTs of rows |
// |
//----------------------------------------------------------------------------- |
// The 8-point inverse DCT direct algorithm |
//----------------------------------------------------------------------------- |
// |
// static const short w[32] = { |
// FIX(cos_4_16), FIX(cos_2_16), FIX(cos_4_16), FIX(cos_6_16), |
// FIX(cos_4_16), FIX(cos_6_16), -FIX(cos_4_16), -FIX(cos_2_16), |
// FIX(cos_4_16), -FIX(cos_6_16), -FIX(cos_4_16), FIX(cos_2_16), |
// FIX(cos_4_16), -FIX(cos_2_16), FIX(cos_4_16), -FIX(cos_6_16), |
// FIX(cos_1_16), FIX(cos_3_16), FIX(cos_5_16), FIX(cos_7_16), |
// FIX(cos_3_16), -FIX(cos_7_16), -FIX(cos_1_16), -FIX(cos_5_16), |
// FIX(cos_5_16), -FIX(cos_1_16), FIX(cos_7_16), FIX(cos_3_16), |
// FIX(cos_7_16), -FIX(cos_5_16), FIX(cos_3_16), -FIX(cos_1_16) }; |
// |
// #define DCT_8_INV_ROW(x, y) |
// { |
// int a0, a1, a2, a3, b0, b1, b2, b3; |
// |
// a0 = x[0] * w[ 0] + x[2] * w[ 1] + x[4] * w[ 2] + x[6] * w[ 3]; |
// a1 = x[0] * w[ 4] + x[2] * w[ 5] + x[4] * w[ 6] + x[6] * w[ 7]; |
// a2 = x[0] * w[ 8] + x[2] * w[ 9] + x[4] * w[10] + x[6] * w[11]; |
// a3 = x[0] * w[12] + x[2] * w[13] + x[4] * w[14] + x[6] * w[15]; |
// b0 = x[1] * w[16] + x[3] * w[17] + x[5] * w[18] + x[7] * w[19]; |
// b1 = x[1] * w[20] + x[3] * w[21] + x[5] * w[22] + x[7] * w[23]; |
// b2 = x[1] * w[24] + x[3] * w[25] + x[5] * w[26] + x[7] * w[27]; |
// b3 = x[1] * w[28] + x[3] * w[29] + x[5] * w[30] + x[7] * w[31]; |
// |
// y[0] = SHIFT_ROUND ( a0 + b0 ); |
// y[1] = SHIFT_ROUND ( a1 + b1 ); |
// y[2] = SHIFT_ROUND ( a2 + b2 ); |
// y[3] = SHIFT_ROUND ( a3 + b3 ); |
// y[4] = SHIFT_ROUND ( a3 - b3 ); |
// y[5] = SHIFT_ROUND ( a2 - b2 ); |
// y[6] = SHIFT_ROUND ( a1 - b1 ); |
// y[7] = SHIFT_ROUND ( a0 - b0 ); |
// } |
// |
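// For this implementation, SHIFT_ROUND(x) corresponds to adding the |
// per-row rounder and arithmetically shifting right by SHIFT_INV_ROW, |
// i.e. roughly SHIFT_ROUND(x) = (x + rounder[row]) >> 11. |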
//----------------------------------------------------------------------------- |
// |
// In this implementation the outputs of the iDCT-1D are multiplied |
// for rows 0,4 - by cos_4_16, |
// for rows 1,7 - by cos_1_16, |
// for rows 2,6 - by cos_2_16, |
// for rows 3,5 - by cos_3_16 |
// and are shifted to the left for better accuracy |
// |
// For the constants used, |
// FIX(float_const) = (short) (float_const * (1<<15) + 0.5) |
// |
//----------------------------------------------------------------------------- |
//----------------------------------------------------------------------------- |
// Tables for mmx processors |
//----------------------------------------------------------------------------- |
// Table for rows 0,4 - constants are multiplied by cos_4_16 |
DECLARE_ALIGNED(8, static const int16_t, tab_i_04_mmx)[32*4] = { |
16384,16384,16384,-16384, // movq-> w06 w04 w02 w00 |
21407,8867,8867,-21407, // w07 w05 w03 w01 |
16384,-16384,16384,16384, // w14 w12 w10 w08 |
-8867,21407,-21407,-8867, // w15 w13 w11 w09 |
22725,12873,19266,-22725, // w22 w20 w18 w16 |
19266,4520,-4520,-12873, // w23 w21 w19 w17 |
12873,4520,4520,19266, // w30 w28 w26 w24 |
-22725,19266,-12873,-22725, // w31 w29 w27 w25 |
// Table for rows 1,7 - constants are multiplied by cos_1_16 |
22725,22725,22725,-22725, // movq-> w06 w04 w02 w00 |
29692,12299,12299,-29692, // w07 w05 w03 w01 |
22725,-22725,22725,22725, // w14 w12 w10 w08 |
-12299,29692,-29692,-12299, // w15 w13 w11 w09 |
31521,17855,26722,-31521, // w22 w20 w18 w16 |
26722,6270,-6270,-17855, // w23 w21 w19 w17 |
17855,6270,6270,26722, // w30 w28 w26 w24 |
-31521,26722,-17855,-31521, // w31 w29 w27 w25 |
// Table for rows 2,6 - constants are multiplied by cos_2_16 |
21407,21407,21407,-21407, // movq-> w06 w04 w02 w00 |
27969,11585,11585,-27969, // w07 w05 w03 w01 |
21407,-21407,21407,21407, // w14 w12 w10 w08 |
-11585,27969,-27969,-11585, // w15 w13 w11 w09 |
29692,16819,25172,-29692, // w22 w20 w18 w16 |
25172,5906,-5906,-16819, // w23 w21 w19 w17 |
16819,5906,5906,25172, // w30 w28 w26 w24 |
-29692,25172,-16819,-29692, // w31 w29 w27 w25 |
// Table for rows 3,5 - constants are multiplied by cos_3_16 |
19266,19266,19266,-19266, // movq-> w06 w04 w02 w00 |
25172,10426,10426,-25172, // w07 w05 w03 w01 |
19266,-19266,19266,19266, // w14 w12 w10 w08 |
-10426,25172,-25172,-10426, // w15 w13 w11 w09 |
26722,15137,22654,-26722, // w22 w20 w18 w16 |
22654,5315,-5315,-15137, // w23 w21 w19 w17 |
15137,5315,5315,22654, // w30 w28 w26 w24 |
-26722,22654,-15137,-26722, // w31 w29 w27 w25 |
}; |
//----------------------------------------------------------------------------- |
// Tables for xmm processors |
//----------------------------------------------------------------------------- |
// Table for rows 0,4 - constants are multiplied by cos_4_16 |
DECLARE_ALIGNED(8, static const int16_t, tab_i_04_xmm)[32*4] = { |
16384,21407,16384,8867, // movq-> w05 w04 w01 w00 |
16384,8867,-16384,-21407, // w07 w06 w03 w02 |
16384,-8867,16384,-21407, // w13 w12 w09 w08 |
-16384,21407,16384,-8867, // w15 w14 w11 w10 |
22725,19266,19266,-4520, // w21 w20 w17 w16 |
12873,4520,-22725,-12873, // w23 w22 w19 w18 |
12873,-22725,4520,-12873, // w29 w28 w25 w24 |
4520,19266,19266,-22725, // w31 w30 w27 w26 |
// Table for rows 1,7 - constants are multiplied by cos_1_16 |
22725,29692,22725,12299, // movq-> w05 w04 w01 w00 |
22725,12299,-22725,-29692, // w07 w06 w03 w02 |
22725,-12299,22725,-29692, // w13 w12 w09 w08 |
-22725,29692,22725,-12299, // w15 w14 w11 w10 |
31521,26722,26722,-6270, // w21 w20 w17 w16 |
17855,6270,-31521,-17855, // w23 w22 w19 w18 |
17855,-31521,6270,-17855, // w29 w28 w25 w24 |
6270,26722,26722,-31521, // w31 w30 w27 w26 |
// Table for rows 2,6 - constants are multiplied by cos_2_16 |
21407,27969,21407,11585, // movq-> w05 w04 w01 w00 |
21407,11585,-21407,-27969, // w07 w06 w03 w02 |
21407,-11585,21407,-27969, // w13 w12 w09 w08 |
-21407,27969,21407,-11585, // w15 w14 w11 w10 |
29692,25172,25172,-5906, // w21 w20 w17 w16 |
16819,5906,-29692,-16819, // w23 w22 w19 w18 |
16819,-29692,5906,-16819, // w29 w28 w25 w24 |
5906,25172,25172,-29692, // w31 w30 w27 w26 |
// Table for rows 3,5 - constants are multiplied by cos_3_16 |
19266,25172,19266,10426, // movq-> w05 w04 w01 w00 |
19266,10426,-19266,-25172, // w07 w06 w03 w02 |
19266,-10426,19266,-25172, // w13 w12 w09 w08 |
-19266,25172,19266,-10426, // w15 w14 w11 w10 |
26722,22654,22654,-5315, // w21 w20 w17 w16 |
15137,5315,-26722,-15137, // w23 w22 w19 w18 |
15137,-26722,5315,-15137, // w29 w28 w25 w24 |
5315,22654,22654,-26722, // w31 w30 w27 w26 |
}; |
//============================================================================= |
// Helper macros for the code |
//============================================================================= |
//----------------------------------------------------------------------------- |
// DCT_8_INV_ROW_MMX(INP, OUT, TABLE, ROUNDER) |
//----------------------------------------------------------------------------- |
#define DCT_8_INV_ROW_MMX(A1,A2,A3,A4)\ |
"movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\ |
"movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\ |
"movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\ |
"movq " #A3 ",%%mm3 \n\t"/* 3 ; w06 w04 w02 w00*/\ |
"punpcklwd %%mm1,%%mm0 \n\t"/* x5 x1 x4 x0*/\ |
"movq %%mm0,%%mm5 \n\t"/* 5 ; x5 x1 x4 x0*/\ |
"punpckldq %%mm0,%%mm0 \n\t"/* x4 x0 x4 x0*/\ |
"movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w05 w03 w01*/\ |
"punpckhwd %%mm1,%%mm2 \n\t"/* 1 ; x7 x3 x6 x2*/\ |
"pmaddwd %%mm0,%%mm3 \n\t"/* x4*w06+x0*w04 x4*w02+x0*w00*/\ |
"movq %%mm2,%%mm6 \n\t"/* 6 ; x7 x3 x6 x2*/\ |
"movq 32+" #A3 ",%%mm1 \n\t"/* 1 ; w22 w20 w18 w16*/\ |
"punpckldq %%mm2,%%mm2 \n\t"/* x6 x2 x6 x2*/\ |
"pmaddwd %%mm2,%%mm4 \n\t"/* x6*w07+x2*w05 x6*w03+x2*w01*/\ |
"punpckhdq %%mm5,%%mm5 \n\t"/* x5 x1 x5 x1*/\ |
"pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x4*w14+x0*w12 x4*w10+x0*w08*/\ |
"punpckhdq %%mm6,%%mm6 \n\t"/* x7 x3 x7 x3*/\ |
"movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w21 w19 w17*/\ |
"pmaddwd %%mm5,%%mm1 \n\t"/* x5*w22+x1*w20 x5*w18+x1*w16*/\ |
"paddd " #A4 ",%%mm3 \n\t"/* +%4*/\ |
"pmaddwd %%mm6,%%mm7 \n\t"/* x7*w23+x3*w21 x7*w19+x3*w17*/\ |
"pmaddwd 24+" #A3 ",%%mm2 \n\t"/* x6*w15+x2*w13 x6*w11+x2*w09*/\ |
"paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\ |
"pmaddwd 48+" #A3 ",%%mm5 \n\t"/* x5*w30+x1*w28 x5*w26+x1*w24*/\ |
"movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\ |
"pmaddwd 56+" #A3 ",%%mm6 \n\t"/* x7*w31+x3*w29 x7*w27+x3*w25*/\ |
"paddd %%mm7,%%mm1 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\ |
"paddd " #A4 ",%%mm0 \n\t"/* +%4*/\ |
"psubd %%mm1,%%mm3 \n\t"/* a1-b1 a0-b0*/\ |
"psrad $11,%%mm3 \n\t"/* y6=a1-b1 y7=a0-b0*/\ |
"paddd %%mm4,%%mm1 \n\t"/* 4 ; a1+b1 a0+b0*/\ |
"paddd %%mm2,%%mm0 \n\t"/* 2 ; a3=sum(even3) a2=sum(even2)*/\ |
"psrad $11,%%mm1 \n\t"/* y1=a1+b1 y0=a0+b0*/\ |
"paddd %%mm6,%%mm5 \n\t"/* 6 ; b3=sum(odd3) b2=sum(odd2)*/\ |
"movq %%mm0,%%mm4 \n\t"/* 4 ; a3 a2*/\ |
"paddd %%mm5,%%mm0 \n\t"/* a3+b3 a2+b2*/\ |
"psubd %%mm5,%%mm4 \n\t"/* 5 ; a3-b3 a2-b2*/\ |
"psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\ |
"psrad $11,%%mm4 \n\t"/* y4=a3-b3 y5=a2-b2*/\ |
"packssdw %%mm0,%%mm1 \n\t"/* 0 ; y3 y2 y1 y0*/\ |
"packssdw %%mm3,%%mm4 \n\t"/* 3 ; y6 y7 y4 y5*/\ |
"movq %%mm4,%%mm7 \n\t"/* 7 ; y6 y7 y4 y5*/\ |
"psrld $16,%%mm4 \n\t"/* 0 y6 0 y4*/\ |
"pslld $16,%%mm7 \n\t"/* y7 0 y5 0*/\ |
"movq %%mm1," #A2 " \n\t"/* 1 ; save y3 y2 y1 y0*/\ |
"por %%mm4,%%mm7 \n\t"/* 4 ; y7 y6 y5 y4*/\ |
"movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\ |
//----------------------------------------------------------------------------- |
// DCT_8_INV_ROW_XMM(INP, OUT, TABLE, ROUNDER) |
//----------------------------------------------------------------------------- |
#define DCT_8_INV_ROW_XMM(A1,A2,A3,A4)\ |
"movq " #A1 ",%%mm0 \n\t"/* 0 ; x3 x2 x1 x0*/\ |
"movq 8+" #A1 ",%%mm1 \n\t"/* 1 ; x7 x6 x5 x4*/\ |
"movq %%mm0,%%mm2 \n\t"/* 2 ; x3 x2 x1 x0*/\ |
"movq " #A3 ",%%mm3 \n\t"/* 3 ; w05 w04 w01 w00*/\ |
"pshufw $0x88,%%mm0,%%mm0 \n\t"/* x2 x0 x2 x0*/\ |
"movq 8+" #A3 ",%%mm4 \n\t"/* 4 ; w07 w06 w03 w02*/\ |
"movq %%mm1,%%mm5 \n\t"/* 5 ; x7 x6 x5 x4*/\ |
"pmaddwd %%mm0,%%mm3 \n\t"/* x2*w05+x0*w04 x2*w01+x0*w00*/\ |
"movq 32+" #A3 ",%%mm6 \n\t"/* 6 ; w21 w20 w17 w16*/\ |
"pshufw $0x88,%%mm1,%%mm1 \n\t"/* x6 x4 x6 x4*/\ |
"pmaddwd %%mm1,%%mm4 \n\t"/* x6*w07+x4*w06 x6*w03+x4*w02*/\ |
"movq 40+" #A3 ",%%mm7 \n\t"/* 7 ; w23 w22 w19 w18*/\ |
"pshufw $0xdd,%%mm2,%%mm2 \n\t"/* x3 x1 x3 x1*/\ |
"pmaddwd %%mm2,%%mm6 \n\t"/* x3*w21+x1*w20 x3*w17+x1*w16*/\ |
"pshufw $0xdd,%%mm5,%%mm5 \n\t"/* x7 x5 x7 x5*/\ |
"pmaddwd %%mm5,%%mm7 \n\t"/* x7*w23+x5*w22 x7*w19+x5*w18*/\ |
"paddd " #A4 ",%%mm3 \n\t"/* +%4*/\ |
"pmaddwd 16+" #A3 ",%%mm0 \n\t"/* x2*w13+x0*w12 x2*w09+x0*w08*/\ |
"paddd %%mm4,%%mm3 \n\t"/* 4 ; a1=sum(even1) a0=sum(even0)*/\ |
"pmaddwd 24+" #A3 ",%%mm1 \n\t"/* x6*w15+x4*w14 x6*w11+x4*w10*/\ |
"movq %%mm3,%%mm4 \n\t"/* 4 ; a1 a0*/\ |
"pmaddwd 48+" #A3 ",%%mm2 \n\t"/* x3*w29+x1*w28 x3*w25+x1*w24*/\ |
"paddd %%mm7,%%mm6 \n\t"/* 7 ; b1=sum(odd1) b0=sum(odd0)*/\ |
"pmaddwd 56+" #A3 ",%%mm5 \n\t"/* x7*w31+x5*w30 x7*w27+x5*w26*/\ |
"paddd %%mm6,%%mm3 \n\t"/* a1+b1 a0+b0*/\ |
"paddd " #A4 ",%%mm0 \n\t"/* +%4*/\ |
"psrad $11,%%mm3 \n\t"/* y1=a1+b1 y0=a0+b0*/\ |
"paddd %%mm1,%%mm0 \n\t"/* 1 ; a3=sum(even3) a2=sum(even2)*/\ |
"psubd %%mm6,%%mm4 \n\t"/* 6 ; a1-b1 a0-b0*/\ |
"movq %%mm0,%%mm7 \n\t"/* 7 ; a3 a2*/\ |
"paddd %%mm5,%%mm2 \n\t"/* 5 ; b3=sum(odd3) b2=sum(odd2)*/\ |
"paddd %%mm2,%%mm0 \n\t"/* a3+b3 a2+b2*/\ |
"psrad $11,%%mm4 \n\t"/* y6=a1-b1 y7=a0-b0*/\ |
"psubd %%mm2,%%mm7 \n\t"/* 2 ; a3-b3 a2-b2*/\ |
"psrad $11,%%mm0 \n\t"/* y3=a3+b3 y2=a2+b2*/\ |
"psrad $11,%%mm7 \n\t"/* y4=a3-b3 y5=a2-b2*/\ |
"packssdw %%mm0,%%mm3 \n\t"/* 0 ; y3 y2 y1 y0*/\ |
"packssdw %%mm4,%%mm7 \n\t"/* 4 ; y6 y7 y4 y5*/\ |
"movq %%mm3, " #A2 " \n\t"/* 3 ; save y3 y2 y1 y0*/\ |
"pshufw $0xb1,%%mm7,%%mm7 \n\t"/* y7 y6 y5 y4*/\ |
"movq %%mm7,8 +" #A2 "\n\t"/* 7 ; save y7 y6 y5 y4*/\ |
//----------------------------------------------------------------------------- |
// |
// The first stage DCT 8x8 - forward DCTs of columns |
// |
// The inputs are multiplied |
// for rows 0,4 - by cos_4_16, |
// for rows 1,7 - by cos_1_16, |
// for rows 2,6 - by cos_2_16, |
// for rows 3,5 - by cos_3_16 |
// and are shifted to the left for increased accuracy |
// |
//----------------------------------------------------------------------------- |
// |
// The 8-point scaled forward DCT algorithm (26a8m) |
// |
//----------------------------------------------------------------------------- |
// |
// #define DCT_8_FRW_COL(x, y) |
//{ |
// short t0, t1, t2, t3, t4, t5, t6, t7; |
// short tp03, tm03, tp12, tm12, tp65, tm65; |
// short tp465, tm465, tp765, tm765; |
// |
// t0 = LEFT_SHIFT ( x[0] + x[7] ); |
// t1 = LEFT_SHIFT ( x[1] + x[6] ); |
// t2 = LEFT_SHIFT ( x[2] + x[5] ); |
// t3 = LEFT_SHIFT ( x[3] + x[4] ); |
// t4 = LEFT_SHIFT ( x[3] - x[4] ); |
// t5 = LEFT_SHIFT ( x[2] - x[5] ); |
// t6 = LEFT_SHIFT ( x[1] - x[6] ); |
// t7 = LEFT_SHIFT ( x[0] - x[7] ); |
// |
// tp03 = t0 + t3; |
// tm03 = t0 - t3; |
// tp12 = t1 + t2; |
// tm12 = t1 - t2; |
// |
// y[0] = tp03 + tp12; |
// y[4] = tp03 - tp12; |
// |
// y[2] = tm03 + tm12 * tg_2_16; |
// y[6] = tm03 * tg_2_16 - tm12; |
// |
// tp65 = (t6 + t5) * cos_4_16; |
// tm65 = (t6 - t5) * cos_4_16; |
// |
// tp765 = t7 + tp65; |
// tm765 = t7 - tp65; |
// tp465 = t4 + tm65; |
// tm465 = t4 - tm65; |
// |
// y[1] = tp765 + tp465 * tg_1_16; |
// y[7] = tp765 * tg_1_16 - tp465; |
// y[5] = tm765 * tg_3_16 + tm465; |
// y[3] = tm765 - tm465 * tg_3_16; |
//} |
// |
//----------------------------------------------------------------------------- |
//----------------------------------------------------------------------------- |
// DCT_8_INV_COL(INP, OUT) |
//----------------------------------------------------------------------------- |
#define DCT_8_INV_COL(A1,A2)\ |
"movq 2*8(%3),%%mm0\n\t"\ |
"movq 16*3+" #A1 ",%%mm3\n\t"\ |
"movq %%mm0,%%mm1 \n\t"/* tg_3_16*/\ |
"movq 16*5+" #A1 ",%%mm5\n\t"\ |
"pmulhw %%mm3,%%mm0 \n\t"/* x3*(tg_3_16-1)*/\ |
"movq (%3),%%mm4\n\t"\ |
"pmulhw %%mm5,%%mm1 \n\t"/* x5*(tg_3_16-1)*/\ |
"movq 16*7+" #A1 ",%%mm7\n\t"\ |
"movq %%mm4,%%mm2 \n\t"/* tg_1_16*/\ |
"movq 16*1+" #A1 ",%%mm6\n\t"\ |
"pmulhw %%mm7,%%mm4 \n\t"/* x7*tg_1_16*/\ |
"paddsw %%mm3,%%mm0 \n\t"/* x3*tg_3_16*/\ |
"pmulhw %%mm6,%%mm2 \n\t"/* x1*tg_1_16*/\ |
"paddsw %%mm3,%%mm1 \n\t"/* x3+x5*(tg_3_16-1)*/\ |
"psubsw %%mm5,%%mm0 \n\t"/* x3*tg_3_16-x5 = tm35*/\ |
"movq 3*8(%3),%%mm3\n\t"\ |
"paddsw %%mm5,%%mm1 \n\t"/* x3+x5*tg_3_16 = tp35*/\ |
"paddsw %%mm6,%%mm4 \n\t"/* x1+tg_1_16*x7 = tp17*/\ |
"psubsw %%mm7,%%mm2 \n\t"/* x1*tg_1_16-x7 = tm17*/\ |
"movq %%mm4,%%mm5 \n\t"/* tp17*/\ |
"movq %%mm2,%%mm6 \n\t"/* tm17*/\ |
"paddsw %%mm1,%%mm5 \n\t"/* tp17+tp35 = b0*/\ |
"psubsw %%mm0,%%mm6 \n\t"/* tm17-tm35 = b3*/\ |
"psubsw %%mm1,%%mm4 \n\t"/* tp17-tp35 = t1*/\ |
"paddsw %%mm0,%%mm2 \n\t"/* tm17+tm35 = t2*/\ |
"movq 1*8(%3),%%mm7\n\t"\ |
"movq %%mm4,%%mm1 \n\t"/* t1*/\ |
"movq %%mm5,3*16 +" #A2 "\n\t"/* save b0*/\ |
"paddsw %%mm2,%%mm1 \n\t"/* t1+t2*/\ |
"movq %%mm6,5*16 +" #A2 "\n\t"/* save b3*/\ |
"psubsw %%mm2,%%mm4 \n\t"/* t1-t2*/\ |
"movq 2*16+" #A1 ",%%mm5\n\t"\ |
"movq %%mm7,%%mm0 \n\t"/* tg_2_16*/\ |
"movq 6*16+" #A1 ",%%mm6\n\t"\ |
"pmulhw %%mm5,%%mm0 \n\t"/* x2*tg_2_16*/\ |
"pmulhw %%mm6,%%mm7 \n\t"/* x6*tg_2_16*/\ |
"pmulhw %%mm3,%%mm1 \n\t"/* ocos_4_16*(t1+t2) = b1/2*/\ |
"movq 0*16+" #A1 ",%%mm2\n\t"\ |
"pmulhw %%mm3,%%mm4 \n\t"/* ocos_4_16*(t1-t2) = b2/2*/\ |
"psubsw %%mm6,%%mm0 \n\t"/* t2*tg_2_16-x6 = tm26*/\ |
"movq %%mm2,%%mm3 \n\t"/* x0*/\ |
"movq 4*16+" #A1 ",%%mm6\n\t"\ |
"paddsw %%mm5,%%mm7 \n\t"/* x2+x6*tg_2_16 = tp26*/\ |
"paddsw %%mm6,%%mm2 \n\t"/* x0+x4 = tp04*/\ |
"psubsw %%mm6,%%mm3 \n\t"/* x0-x4 = tm04*/\ |
"movq %%mm2,%%mm5 \n\t"/* tp04*/\ |
"movq %%mm3,%%mm6 \n\t"/* tm04*/\ |
"psubsw %%mm7,%%mm2 \n\t"/* tp04-tp26 = a3*/\ |
"paddsw %%mm0,%%mm3 \n\t"/* tm04+tm26 = a1*/\ |
"paddsw %%mm1,%%mm1 \n\t"/* b1*/\ |
"paddsw %%mm4,%%mm4 \n\t"/* b2*/\ |
"paddsw %%mm7,%%mm5 \n\t"/* tp04+tp26 = a0*/\ |
"psubsw %%mm0,%%mm6 \n\t"/* tm04-tm26 = a2*/\ |
"movq %%mm3,%%mm7 \n\t"/* a1*/\ |
"movq %%mm6,%%mm0 \n\t"/* a2*/\ |
"paddsw %%mm1,%%mm3 \n\t"/* a1+b1*/\ |
"paddsw %%mm4,%%mm6 \n\t"/* a2+b2*/\ |
"psraw $6,%%mm3 \n\t"/* dst1*/\ |
"psubsw %%mm1,%%mm7 \n\t"/* a1-b1*/\ |
"psraw $6,%%mm6 \n\t"/* dst2*/\ |
"psubsw %%mm4,%%mm0 \n\t"/* a2-b2*/\ |
"movq 3*16+" #A2 ",%%mm1 \n\t"/* load b0*/\ |
"psraw $6,%%mm7 \n\t"/* dst6*/\ |
"movq %%mm5,%%mm4 \n\t"/* a0*/\ |
"psraw $6,%%mm0 \n\t"/* dst5*/\ |
"movq %%mm3,1*16+" #A2 "\n\t"\ |
"paddsw %%mm1,%%mm5 \n\t"/* a0+b0*/\ |
"movq %%mm6,2*16+" #A2 "\n\t"\ |
"psubsw %%mm1,%%mm4 \n\t"/* a0-b0*/\ |
"movq 5*16+" #A2 ",%%mm3 \n\t"/* load b3*/\ |
"psraw $6,%%mm5 \n\t"/* dst0*/\ |
"movq %%mm2,%%mm6 \n\t"/* a3*/\ |
"psraw $6,%%mm4 \n\t"/* dst7*/\ |
"movq %%mm0,5*16+" #A2 "\n\t"\ |
"paddsw %%mm3,%%mm2 \n\t"/* a3+b3*/\ |
"movq %%mm7,6*16+" #A2 "\n\t"\ |
"psubsw %%mm3,%%mm6 \n\t"/* a3-b3*/\ |
"movq %%mm5,0*16+" #A2 "\n\t"\ |
"psraw $6,%%mm2 \n\t"/* dst3*/\ |
"movq %%mm4,7*16+" #A2 "\n\t"\ |
"psraw $6,%%mm6 \n\t"/* dst4*/\ |
"movq %%mm2,3*16+" #A2 "\n\t"\ |
"movq %%mm6,4*16+" #A2 "\n\t" |
//============================================================================= |
// Code |
//============================================================================= |
//----------------------------------------------------------------------------- |
// void ff_idct_xvid_mmx(int16_t block[64]); |
//----------------------------------------------------------------------------- |
void ff_idct_xvid_mmx(short *block){ |
__asm__ volatile( |
//# Process each row |
DCT_8_INV_ROW_MMX(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1)) |
DCT_8_INV_ROW_MMX(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1)) |
DCT_8_INV_ROW_MMX(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1)) |
DCT_8_INV_ROW_MMX(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1)) |
DCT_8_INV_ROW_MMX(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1)) |
DCT_8_INV_ROW_MMX(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1)) |
DCT_8_INV_ROW_MMX(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1)) |
DCT_8_INV_ROW_MMX(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1)) |
//# Process the columns (4 at a time) |
DCT_8_INV_COL(0(%0), 0(%0)) |
DCT_8_INV_COL(8(%0), 8(%0)) |
:: "r"(block), "r"(rounder_0), "r"(tab_i_04_mmx), "r"(tg_1_16)); |
} |
void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, int16_t *block) |
{ |
ff_idct_xvid_mmx(block); |
ff_put_pixels_clamped_mmx(block, dest, line_size); |
} |
void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, int16_t *block) |
{ |
ff_idct_xvid_mmx(block); |
ff_add_pixels_clamped_mmx(block, dest, line_size); |
} |
#endif /* HAVE_MMX_INLINE */ |
#if HAVE_MMXEXT_INLINE |
//----------------------------------------------------------------------------- |
// void ff_idct_xvid_mmxext(int16_t block[64]); |
//----------------------------------------------------------------------------- |
void ff_idct_xvid_mmxext(short *block) |
{ |
__asm__ volatile( |
//# Process each row |
DCT_8_INV_ROW_XMM(0*16(%0), 0*16(%0), 64*0(%2), 8*0(%1)) |
DCT_8_INV_ROW_XMM(1*16(%0), 1*16(%0), 64*1(%2), 8*1(%1)) |
DCT_8_INV_ROW_XMM(2*16(%0), 2*16(%0), 64*2(%2), 8*2(%1)) |
DCT_8_INV_ROW_XMM(3*16(%0), 3*16(%0), 64*3(%2), 8*3(%1)) |
DCT_8_INV_ROW_XMM(4*16(%0), 4*16(%0), 64*0(%2), 8*4(%1)) |
DCT_8_INV_ROW_XMM(5*16(%0), 5*16(%0), 64*3(%2), 8*5(%1)) |
DCT_8_INV_ROW_XMM(6*16(%0), 6*16(%0), 64*2(%2), 8*6(%1)) |
DCT_8_INV_ROW_XMM(7*16(%0), 7*16(%0), 64*1(%2), 8*7(%1)) |
//# Process the columns (4 at a time) |
DCT_8_INV_COL(0(%0), 0(%0)) |
DCT_8_INV_COL(8(%0), 8(%0)) |
:: "r"(block), "r"(rounder_0), "r"(tab_i_04_xmm), "r"(tg_1_16)); |
} |
void ff_idct_xvid_mmxext_put(uint8_t *dest, int line_size, int16_t *block) |
{ |
ff_idct_xvid_mmxext(block); |
ff_put_pixels_clamped_mmx(block, dest, line_size); |
} |
void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, int16_t *block) |
{ |
ff_idct_xvid_mmxext(block); |
ff_add_pixels_clamped_mmx(block, dest, line_size); |
} |
#endif /* HAVE_MMXEXT_INLINE */ |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/idct_sse2_xvid.c |
---|
0,0 → 1,407 |
/* |
* XVID MPEG-4 VIDEO CODEC |
* - SSE2 inverse discrete cosine transform - |
* |
* Copyright(C) 2003 Pascal Massimino <skal@planet-d.net> |
* |
* Conversion to gcc syntax with modifications |
* by Alexander Strange <astrange@ithinksw.com> |
* |
* Originally from dct/x86_asm/fdct_sse2_skal.asm in Xvid. |
* |
* This file is part of FFmpeg. |
* |
* Vertical pass is an implementation of the scheme: |
* Loeffler C., Ligtenberg A., and Moschytz C.S.: |
* Practical Fast 1D DCT Algorithm with Eleven Multiplications, |
* Proc. ICASSP 1989, 988-991. |
* |
* Horizontal pass is a double 4x4 vector/matrix multiplication, |
* (see also Intel's Application Note 922: |
* http://developer.intel.com/vtune/cbts/strmsimd/922down.htm |
* Copyright (C) 1999 Intel Corporation) |
* |
* More details at http://skal.planet-d.net/coding/dct.html |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public License |
* along with FFmpeg; if not, write to the Free Software Foundation, |
* Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/mem.h" |
#include "libavutil/x86/asm.h" |
#include "idct_xvid.h" |
#include "dsputil_x86.h" |
#if HAVE_SSE2_INLINE |
/** |
* @file |
* @brief SSE2 idct compatible with xvidmmx |
*/ |
#define X8(x) x,x,x,x,x,x,x,x |
#define ROW_SHIFT 11 |
#define COL_SHIFT 6 |
DECLARE_ASM_CONST(16, int16_t, tan1)[] = {X8(13036)}; // tan( pi/16) |
DECLARE_ASM_CONST(16, int16_t, tan2)[] = {X8(27146)}; // tan(2pi/16) = sqrt(2)-1 |
DECLARE_ASM_CONST(16, int16_t, tan3)[] = {X8(43790)}; // tan(3pi/16)-1 |
DECLARE_ASM_CONST(16, int16_t, sqrt2)[]= {X8(23170)}; // 0.5/sqrt(2) |
DECLARE_ASM_CONST(8, uint8_t, m127)[] = {X8(127)}; |
DECLARE_ASM_CONST(16, int16_t, iTab1)[] = { |
0x4000, 0x539f, 0xc000, 0xac61, 0x4000, 0xdd5d, 0x4000, 0xdd5d, |
0x4000, 0x22a3, 0x4000, 0x22a3, 0xc000, 0x539f, 0x4000, 0xac61, |
0x3249, 0x11a8, 0x4b42, 0xee58, 0x11a8, 0x4b42, 0x11a8, 0xcdb7, |
0x58c5, 0x4b42, 0xa73b, 0xcdb7, 0x3249, 0xa73b, 0x4b42, 0xa73b |
}; |
DECLARE_ASM_CONST(16, int16_t, iTab2)[] = { |
0x58c5, 0x73fc, 0xa73b, 0x8c04, 0x58c5, 0xcff5, 0x58c5, 0xcff5, |
0x58c5, 0x300b, 0x58c5, 0x300b, 0xa73b, 0x73fc, 0x58c5, 0x8c04, |
0x45bf, 0x187e, 0x6862, 0xe782, 0x187e, 0x6862, 0x187e, 0xba41, |
0x7b21, 0x6862, 0x84df, 0xba41, 0x45bf, 0x84df, 0x6862, 0x84df |
}; |
DECLARE_ASM_CONST(16, int16_t, iTab3)[] = { |
0x539f, 0x6d41, 0xac61, 0x92bf, 0x539f, 0xd2bf, 0x539f, 0xd2bf, |
0x539f, 0x2d41, 0x539f, 0x2d41, 0xac61, 0x6d41, 0x539f, 0x92bf, |
0x41b3, 0x1712, 0x6254, 0xe8ee, 0x1712, 0x6254, 0x1712, 0xbe4d, |
0x73fc, 0x6254, 0x8c04, 0xbe4d, 0x41b3, 0x8c04, 0x6254, 0x8c04 |
}; |
DECLARE_ASM_CONST(16, int16_t, iTab4)[] = { |
0x4b42, 0x6254, 0xb4be, 0x9dac, 0x4b42, 0xd746, 0x4b42, 0xd746, |
0x4b42, 0x28ba, 0x4b42, 0x28ba, 0xb4be, 0x6254, 0x4b42, 0x9dac, |
0x3b21, 0x14c3, 0x587e, 0xeb3d, 0x14c3, 0x587e, 0x14c3, 0xc4df, |
0x6862, 0x587e, 0x979e, 0xc4df, 0x3b21, 0x979e, 0x587e, 0x979e |
}; |
DECLARE_ASM_CONST(16, int32_t, walkenIdctRounders)[] = { |
65536, 65536, 65536, 65536, |
3597, 3597, 3597, 3597, |
2260, 2260, 2260, 2260, |
1203, 1203, 1203, 1203, |
120, 120, 120, 120, |
512, 512, 512, 512 |
}; |
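// Rounding biases added (via the ROUND macro) before the >>11 of the |
// row pass; the DC row gets the largest bias and row 4 adds none at |
// all (its rounder argument is "#", commenting the paddd out). |
// Roughly: out = (row . table + rounder) >> ROW_SHIFT. |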
// Temporary storage before the column pass |
#define ROW1 "%%xmm6" |
#define ROW3 "%%xmm4" |
#define ROW5 "%%xmm5" |
#define ROW7 "%%xmm7" |
#define CLEAR_ODD(r) "pxor "r","r" \n\t" |
#define PUT_ODD(dst) "pshufhw $0x1B, %%xmm2, "dst" \n\t" |
#if ARCH_X86_64 |
# define ROW0 "%%xmm8" |
# define REG0 ROW0 |
# define ROW2 "%%xmm9" |
# define REG2 ROW2 |
# define ROW4 "%%xmm10" |
# define REG4 ROW4 |
# define ROW6 "%%xmm11" |
# define REG6 ROW6 |
# define CLEAR_EVEN(r) CLEAR_ODD(r) |
# define PUT_EVEN(dst) PUT_ODD(dst) |
# define XMMS "%%xmm12" |
# define MOV_32_ONLY "#" |
# define SREG2 REG2 |
# define TAN3 "%%xmm13" |
# define TAN1 "%%xmm14" |
#else |
# define ROW0 "(%0)" |
# define REG0 "%%xmm4" |
# define ROW2 "2*16(%0)" |
# define REG2 "%%xmm4" |
# define ROW4 "4*16(%0)" |
# define REG4 "%%xmm6" |
# define ROW6 "6*16(%0)" |
# define REG6 "%%xmm6" |
# define CLEAR_EVEN(r) |
# define PUT_EVEN(dst) \ |
"pshufhw $0x1B, %%xmm2, %%xmm2 \n\t" \ |
"movdqa %%xmm2, "dst" \n\t" |
# define XMMS "%%xmm2" |
# define MOV_32_ONLY "movdqa " |
# define SREG2 "%%xmm7" |
# define TAN3 "%%xmm0" |
# define TAN1 "%%xmm2" |
#endif |
#define ROUND(x) "paddd "MANGLE(x) |
#define JZ(reg, to) \ |
"testl "reg","reg" \n\t" \ |
"jz "to" \n\t" |
#define JNZ(reg, to) \ |
"testl "reg","reg" \n\t" \ |
"jnz "to" \n\t" |
#define TEST_ONE_ROW(src, reg, clear) \ |
clear \ |
"movq "src", %%mm1 \n\t" \ |
"por 8+"src", %%mm1 \n\t" \ |
"paddusb %%mm0, %%mm1 \n\t" \ |
"pmovmskb %%mm1, "reg" \n\t" |
#define TEST_TWO_ROWS(row1, row2, reg1, reg2, clear1, clear2) \ |
clear1 \ |
clear2 \ |
"movq "row1", %%mm1 \n\t" \ |
"por 8+"row1", %%mm1 \n\t" \ |
"movq "row2", %%mm2 \n\t" \ |
"por 8+"row2", %%mm2 \n\t" \ |
"paddusb %%mm0, %%mm1 \n\t" \ |
"paddusb %%mm0, %%mm2 \n\t" \ |
"pmovmskb %%mm1, "reg1" \n\t" \ |
"pmovmskb %%mm2, "reg2" \n\t" |
/// IDCT pass on rows. |
#define iMTX_MULT(src, table, rounder, put) \ |
"movdqa "src", %%xmm3 \n\t" \ |
"movdqa %%xmm3, %%xmm0 \n\t" \ |
"pshufd $0x11, %%xmm3, %%xmm1 \n\t" /* 4602 */ \ |
"punpcklqdq %%xmm0, %%xmm0 \n\t" /* 0246 */ \ |
"pmaddwd "table", %%xmm0 \n\t" \ |
"pmaddwd 16+"table", %%xmm1 \n\t" \ |
"pshufd $0xBB, %%xmm3, %%xmm2 \n\t" /* 5713 */ \ |
"punpckhqdq %%xmm3, %%xmm3 \n\t" /* 1357 */ \ |
"pmaddwd 32+"table", %%xmm2 \n\t" \ |
"pmaddwd 48+"table", %%xmm3 \n\t" \ |
"paddd %%xmm1, %%xmm0 \n\t" \ |
"paddd %%xmm3, %%xmm2 \n\t" \ |
rounder", %%xmm0 \n\t" \ |
"movdqa %%xmm2, %%xmm3 \n\t" \ |
"paddd %%xmm0, %%xmm2 \n\t" \ |
"psubd %%xmm3, %%xmm0 \n\t" \ |
"psrad $11, %%xmm2 \n\t" \ |
"psrad $11, %%xmm0 \n\t" \ |
"packssdw %%xmm0, %%xmm2 \n\t" \ |
put \ |
"1: \n\t" |
#define iLLM_HEAD \ |
"movdqa "MANGLE(tan3)", "TAN3" \n\t" \ |
"movdqa "MANGLE(tan1)", "TAN1" \n\t" \ |
///IDCT pass on columns. |
#define iLLM_PASS(dct) \ |
"movdqa "TAN3", %%xmm1 \n\t" \ |
"movdqa "TAN1", %%xmm3 \n\t" \ |
"pmulhw %%xmm4, "TAN3" \n\t" \ |
"pmulhw %%xmm5, %%xmm1 \n\t" \ |
"paddsw %%xmm4, "TAN3" \n\t" \ |
"paddsw %%xmm5, %%xmm1 \n\t" \ |
"psubsw %%xmm5, "TAN3" \n\t" \ |
"paddsw %%xmm4, %%xmm1 \n\t" \ |
"pmulhw %%xmm7, %%xmm3 \n\t" \ |
"pmulhw %%xmm6, "TAN1" \n\t" \ |
"paddsw %%xmm6, %%xmm3 \n\t" \ |
"psubsw %%xmm7, "TAN1" \n\t" \ |
"movdqa %%xmm3, %%xmm7 \n\t" \ |
"movdqa "TAN1", %%xmm6 \n\t" \ |
"psubsw %%xmm1, %%xmm3 \n\t" \ |
"psubsw "TAN3", "TAN1" \n\t" \ |
"paddsw %%xmm7, %%xmm1 \n\t" \ |
"paddsw %%xmm6, "TAN3" \n\t" \ |
"movdqa %%xmm3, %%xmm6 \n\t" \ |
"psubsw "TAN3", %%xmm3 \n\t" \ |
"paddsw %%xmm6, "TAN3" \n\t" \ |
"movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \ |
"pmulhw %%xmm4, %%xmm3 \n\t" \ |
"pmulhw %%xmm4, "TAN3" \n\t" \ |
"paddsw "TAN3", "TAN3" \n\t" \ |
"paddsw %%xmm3, %%xmm3 \n\t" \ |
"movdqa "MANGLE(tan2)", %%xmm7 \n\t" \ |
MOV_32_ONLY ROW2", "REG2" \n\t" \ |
MOV_32_ONLY ROW6", "REG6" \n\t" \ |
"movdqa %%xmm7, %%xmm5 \n\t" \ |
"pmulhw "REG6", %%xmm7 \n\t" \ |
"pmulhw "REG2", %%xmm5 \n\t" \ |
"paddsw "REG2", %%xmm7 \n\t" \ |
"psubsw "REG6", %%xmm5 \n\t" \ |
MOV_32_ONLY ROW0", "REG0" \n\t" \ |
MOV_32_ONLY ROW4", "REG4" \n\t" \ |
MOV_32_ONLY" "TAN1", (%0) \n\t" \ |
"movdqa "REG0", "XMMS" \n\t" \ |
"psubsw "REG4", "REG0" \n\t" \ |
"paddsw "XMMS", "REG4" \n\t" \ |
"movdqa "REG4", "XMMS" \n\t" \ |
"psubsw %%xmm7, "REG4" \n\t" \ |
"paddsw "XMMS", %%xmm7 \n\t" \ |
"movdqa "REG0", "XMMS" \n\t" \ |
"psubsw %%xmm5, "REG0" \n\t" \ |
"paddsw "XMMS", %%xmm5 \n\t" \ |
"movdqa %%xmm5, "XMMS" \n\t" \ |
"psubsw "TAN3", %%xmm5 \n\t" \ |
"paddsw "XMMS", "TAN3" \n\t" \ |
"movdqa "REG0", "XMMS" \n\t" \ |
"psubsw %%xmm3, "REG0" \n\t" \ |
"paddsw "XMMS", %%xmm3 \n\t" \ |
MOV_32_ONLY" (%0), "TAN1" \n\t" \ |
"psraw $6, %%xmm5 \n\t" \ |
"psraw $6, "REG0" \n\t" \ |
"psraw $6, "TAN3" \n\t" \ |
"psraw $6, %%xmm3 \n\t" \ |
"movdqa "TAN3", 1*16("dct") \n\t" \ |
"movdqa %%xmm3, 2*16("dct") \n\t" \ |
"movdqa "REG0", 5*16("dct") \n\t" \ |
"movdqa %%xmm5, 6*16("dct") \n\t" \ |
"movdqa %%xmm7, %%xmm0 \n\t" \ |
"movdqa "REG4", %%xmm4 \n\t" \ |
"psubsw %%xmm1, %%xmm7 \n\t" \ |
"psubsw "TAN1", "REG4" \n\t" \ |
"paddsw %%xmm0, %%xmm1 \n\t" \ |
"paddsw %%xmm4, "TAN1" \n\t" \ |
"psraw $6, %%xmm1 \n\t" \ |
"psraw $6, %%xmm7 \n\t" \ |
"psraw $6, "TAN1" \n\t" \ |
"psraw $6, "REG4" \n\t" \ |
"movdqa %%xmm1, ("dct") \n\t" \ |
"movdqa "TAN1", 3*16("dct") \n\t" \ |
"movdqa "REG4", 4*16("dct") \n\t" \ |
"movdqa %%xmm7, 7*16("dct") \n\t" |
/// IDCT pass on columns, assuming rows 4-7 are zero. |
#define iLLM_PASS_SPARSE(dct) \ |
"pmulhw %%xmm4, "TAN3" \n\t" \ |
"paddsw %%xmm4, "TAN3" \n\t" \ |
"movdqa %%xmm6, %%xmm3 \n\t" \ |
"pmulhw %%xmm6, "TAN1" \n\t" \ |
"movdqa %%xmm4, %%xmm1 \n\t" \ |
"psubsw %%xmm1, %%xmm3 \n\t" \ |
"paddsw %%xmm6, %%xmm1 \n\t" \ |
"movdqa "TAN1", %%xmm6 \n\t" \ |
"psubsw "TAN3", "TAN1" \n\t" \ |
"paddsw %%xmm6, "TAN3" \n\t" \ |
"movdqa %%xmm3, %%xmm6 \n\t" \ |
"psubsw "TAN3", %%xmm3 \n\t" \ |
"paddsw %%xmm6, "TAN3" \n\t" \ |
"movdqa "MANGLE(sqrt2)", %%xmm4 \n\t" \ |
"pmulhw %%xmm4, %%xmm3 \n\t" \ |
"pmulhw %%xmm4, "TAN3" \n\t" \ |
"paddsw "TAN3", "TAN3" \n\t" \ |
"paddsw %%xmm3, %%xmm3 \n\t" \ |
"movdqa "MANGLE(tan2)", %%xmm5 \n\t" \ |
MOV_32_ONLY ROW2", "SREG2" \n\t" \ |
"pmulhw "SREG2", %%xmm5 \n\t" \ |
MOV_32_ONLY ROW0", "REG0" \n\t" \ |
"movdqa "REG0", %%xmm6 \n\t" \ |
"psubsw "SREG2", %%xmm6 \n\t" \ |
"paddsw "REG0", "SREG2" \n\t" \ |
MOV_32_ONLY" "TAN1", (%0) \n\t" \ |
"movdqa "REG0", "XMMS" \n\t" \ |
"psubsw %%xmm5, "REG0" \n\t" \ |
"paddsw "XMMS", %%xmm5 \n\t" \ |
"movdqa %%xmm5, "XMMS" \n\t" \ |
"psubsw "TAN3", %%xmm5 \n\t" \ |
"paddsw "XMMS", "TAN3" \n\t" \ |
"movdqa "REG0", "XMMS" \n\t" \ |
"psubsw %%xmm3, "REG0" \n\t" \ |
"paddsw "XMMS", %%xmm3 \n\t" \ |
MOV_32_ONLY" (%0), "TAN1" \n\t" \ |
"psraw $6, %%xmm5 \n\t" \ |
"psraw $6, "REG0" \n\t" \ |
"psraw $6, "TAN3" \n\t" \ |
"psraw $6, %%xmm3 \n\t" \ |
"movdqa "TAN3", 1*16("dct") \n\t" \ |
"movdqa %%xmm3, 2*16("dct") \n\t" \ |
"movdqa "REG0", 5*16("dct") \n\t" \ |
"movdqa %%xmm5, 6*16("dct") \n\t" \ |
"movdqa "SREG2", %%xmm0 \n\t" \ |
"movdqa %%xmm6, %%xmm4 \n\t" \ |
"psubsw %%xmm1, "SREG2" \n\t" \ |
"psubsw "TAN1", %%xmm6 \n\t" \ |
"paddsw %%xmm0, %%xmm1 \n\t" \ |
"paddsw %%xmm4, "TAN1" \n\t" \ |
"psraw $6, %%xmm1 \n\t" \ |
"psraw $6, "SREG2" \n\t" \ |
"psraw $6, "TAN1" \n\t" \ |
"psraw $6, %%xmm6 \n\t" \ |
"movdqa %%xmm1, ("dct") \n\t" \ |
"movdqa "TAN1", 3*16("dct") \n\t" \ |
"movdqa %%xmm6, 4*16("dct") \n\t" \ |
"movdqa "SREG2", 7*16("dct") \n\t" |
inline void ff_idct_xvid_sse2(short *block) |
{ |
__asm__ volatile( |
"movq "MANGLE(m127)", %%mm0 \n\t" |
iMTX_MULT("(%0)", MANGLE(iTab1), ROUND(walkenIdctRounders), PUT_EVEN(ROW0)) |
iMTX_MULT("1*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+1*16), PUT_ODD(ROW1)) |
iMTX_MULT("2*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+2*16), PUT_EVEN(ROW2)) |
TEST_TWO_ROWS("3*16(%0)", "4*16(%0)", "%%eax", "%%ecx", CLEAR_ODD(ROW3), CLEAR_EVEN(ROW4)) |
JZ("%%eax", "1f") |
iMTX_MULT("3*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+3*16), PUT_ODD(ROW3)) |
TEST_TWO_ROWS("5*16(%0)", "6*16(%0)", "%%eax", "%%edx", CLEAR_ODD(ROW5), CLEAR_EVEN(ROW6)) |
TEST_ONE_ROW("7*16(%0)", "%%esi", CLEAR_ODD(ROW7)) |
iLLM_HEAD |
".p2align 4 \n\t" |
JNZ("%%ecx", "2f") |
JNZ("%%eax", "3f") |
JNZ("%%edx", "4f") |
JNZ("%%esi", "5f") |
iLLM_PASS_SPARSE("%0") |
"jmp 6f \n\t" |
"2: \n\t" |
iMTX_MULT("4*16(%0)", MANGLE(iTab1), "#", PUT_EVEN(ROW4)) |
"3: \n\t" |
iMTX_MULT("5*16(%0)", MANGLE(iTab4), ROUND(walkenIdctRounders+4*16), PUT_ODD(ROW5)) |
JZ("%%edx", "1f") |
"4: \n\t" |
iMTX_MULT("6*16(%0)", MANGLE(iTab3), ROUND(walkenIdctRounders+5*16), PUT_EVEN(ROW6)) |
JZ("%%esi", "1f") |
"5: \n\t" |
iMTX_MULT("7*16(%0)", MANGLE(iTab2), ROUND(walkenIdctRounders+5*16), PUT_ODD(ROW7)) |
#if ARCH_X86_32 |
iLLM_HEAD |
#endif |
iLLM_PASS("%0") |
"6: \n\t" |
: "+r"(block) |
: |
: XMM_CLOBBERS("%xmm0" , "%xmm1" , "%xmm2" , "%xmm3" , |
"%xmm4" , "%xmm5" , "%xmm6" , "%xmm7" ,) |
#if ARCH_X86_64 |
XMM_CLOBBERS("%xmm8" , "%xmm9" , "%xmm10", "%xmm11", |
"%xmm12", "%xmm13", "%xmm14",) |
#endif |
"%eax", "%ecx", "%edx", "%esi", "memory" |
); |
} |
void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block) |
{ |
ff_idct_xvid_sse2(block); |
ff_put_pixels_clamped_mmx(block, dest, line_size); |
} |
void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block) |
{ |
ff_idct_xvid_sse2(block); |
ff_add_pixels_clamped_mmx(block, dest, line_size); |
} |
#endif /* HAVE_SSE2_INLINE */ |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/idct_xvid.h |
---|
0,0 → 1,43 |
/* |
* XVID MPEG-4 VIDEO CODEC |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
/** |
* @file |
* header for Xvid IDCT functions |
*/ |
#ifndef AVCODEC_X86_IDCT_XVID_H |
#define AVCODEC_X86_IDCT_XVID_H |
#include <stdint.h> |
void ff_idct_xvid_mmx(short *block); |
void ff_idct_xvid_mmx_put(uint8_t *dest, int line_size, int16_t *block); |
void ff_idct_xvid_mmx_add(uint8_t *dest, int line_size, int16_t *block); |
void ff_idct_xvid_mmxext(short *block); |
void ff_idct_xvid_mmxext_put(uint8_t *dest, int line_size, int16_t *block); |
void ff_idct_xvid_mmxext_add(uint8_t *dest, int line_size, int16_t *block); |
void ff_idct_xvid_sse2(short *block); |
void ff_idct_xvid_sse2_put(uint8_t *dest, int line_size, short *block); |
void ff_idct_xvid_sse2_add(uint8_t *dest, int line_size, short *block); |
#endif /* AVCODEC_X86_IDCT_XVID_H */ |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/imdct36.asm |
---|
0,0 → 1,724 |
;****************************************************************************** |
;* 36 point SSE-optimized IMDCT transform |
;* Copyright (c) 2011 Vitor Sessak |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
align 16 |
ps_mask: dd 0, ~0, ~0, ~0 |
ps_mask2: dd 0, ~0, 0, ~0 |
ps_mask3: dd 0, 0, 0, ~0 |
ps_mask4: dd 0, ~0, 0, 0 |
ps_val1: dd -0.5, -0.5, -0.8660254038, -0.8660254038 |
ps_val2: dd 1.0, 1.0, 0.8660254038, 0.8660254038 |
ps_val3: dd 0.1736481777, 0.1736481777, 0.3420201433, 0.3420201433 |
ps_val4: dd -0.7660444431, -0.7660444431, 0.8660254038, 0.8660254038 |
ps_val5: dd -0.9396926208, -0.9396926208, -0.9848077530, -0.9848077530 |
ps_val6: dd 0.5, 0.5, -0.6427876097, -0.6427876097 |
ps_val7: dd 1.0, 1.0, -0.6427876097, -0.6427876097 |
ps_p1p1m1m1: dd 0, 0, 0x80000000, 0x80000000 |
ps_p1m1p1m1: dd 0, 0x80000000, 0, 0x80000000 |
ps_cosh: dd 1.0, 0.50190991877167369479, 1.0, 5.73685662283492756461 |
dd 1.0, 0.51763809020504152469, 1.0, 1.93185165257813657349 |
dd 1.0, 0.55168895948124587824, -1.0, -1.18310079157624925896 |
dd 1.0, 0.61038729438072803416, -1.0, -0.87172339781054900991 |
dd 1.0, 0.70710678118654752439, 0.0, 0.0 |
ps_cosh_sse3: dd 1.0, -0.50190991877167369479, 1.0, -5.73685662283492756461 |
dd 1.0, -0.51763809020504152469, 1.0, -1.93185165257813657349 |
dd 1.0, -0.55168895948124587824, -1.0, 1.18310079157624925896 |
dd 1.0, -0.61038729438072803416, -1.0, 0.87172339781054900991 |
dd 1.0, 0.70710678118654752439, 0.0, 0.0 |
costabs: times 4 dd 0.98480773 |
times 4 dd 0.93969262 |
times 4 dd 0.86602539 |
times 4 dd -0.76604444 |
times 4 dd -0.64278764 |
times 4 dd 0.50000000 |
times 4 dd -0.50000000 |
times 4 dd -0.34202015 |
times 4 dd -0.17364818 |
times 4 dd 0.50190992 |
times 4 dd 0.51763808 |
times 4 dd 0.55168896 |
times 4 dd 0.61038726 |
times 4 dd 0.70710677 |
times 4 dd 0.87172341 |
times 4 dd 1.18310082 |
times 4 dd 1.93185163 |
times 4 dd 5.73685646 |
%define SBLIMIT 32 |
SECTION_TEXT |
%macro PSHUFD 3 |
%if cpuflag(sse2) && notcpuflag(avx) |
pshufd %1, %2, %3 |
%else |
shufps %1, %2, %2, %3 |
%endif |
%endmacro |
; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4} |
; output %1={x3,x4,y1,y2} |
%macro BUILDINVHIGHLOW 3 |
%if cpuflag(avx) |
shufps %1, %2, %3, 0x4e |
%else |
movlhps %1, %3 |
movhlps %1, %2 |
%endif |
%endmacro |
; input %2={x1,x2,x3,x4}, %3={y1,y2,y3,y4} |
; output %1={x4,y1,y2,y3} |
%macro ROTLEFT 3 |
%if cpuflag(ssse3) |
palignr %1, %3, %2, 12 |
%else |
BUILDINVHIGHLOW %1, %2, %3 |
shufps %1, %1, %3, 0x99 |
%endif |
%endmacro |
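; With SSSE3, palignr concatenates %3:%2 into a 32-byte value and |
; extracts bytes 12..27, giving {x4, y1, y2, y3} directly; the fallback |
; builds the same result from two shuffles. |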
%macro INVERTHL 2 |
%if cpuflag(sse2) |
PSHUFD %1, %2, 0x4e |
%else |
movhlps %1, %2 |
movlhps %1, %2 |
%endif |
%endmacro |
%macro BUTTERF 3 |
INVERTHL %2, %1 |
xorps %1, [ps_p1p1m1m1] |
addps %1, %2 |
%if cpuflag(sse3) |
mulps %1, %1, [ps_cosh_sse3 + %3] |
PSHUFD %2, %1, 0xb1 |
addsubps %1, %1, %2 |
%else |
mulps %1, [ps_cosh + %3] |
PSHUFD %2, %1, 0xb1 |
xorps %1, [ps_p1m1p1m1] |
addps %1, %2 |
%endif |
%endmacro |
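; BUTTERF does two butterfly stages on %1: INVERTHL mirrors the halves |
; and the xorps/addps with ps_p1p1m1m1 leaves {lo+hi, lo-hi} pairs, |
; which are scaled by the ps_cosh row at byte offset %3 and then |
; combined once more within each pair. On SSE3 the second stage is a |
; single addsubps against the sign-adjusted ps_cosh_sse3 table. |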
%macro STORE 4 |
movhlps %2, %1 |
movss [%3 ], %1 |
movss [%3 + 2*%4], %2 |
shufps %1, %1, 0xb1 |
movss [%3 + %4], %1 |
movhlps %2, %1 |
movss [%3 + 3*%4], %2 |
%endmacro |
%macro LOAD 4 |
movlps %1, [%3 ] |
movhps %1, [%3 + %4] |
movlps %2, [%3 + 2*%4] |
movhps %2, [%3 + 3*%4] |
shufps %1, %2, 0x88 |
%endmacro |
%macro LOADA64 2 |
%if cpuflag(avx) |
movu %1, [%2] |
%else |
movlps %1, [%2] |
movhps %1, [%2 + 8] |
%endif |
%endmacro |
%macro DEFINE_IMDCT 0 |
cglobal imdct36_float, 4,4,9, out, buf, in, win |
; for(i=17;i>=1;i--) in[i] += in[i-1]; |
LOADA64 m0, inq |
LOADA64 m1, inq + 16 |
ROTLEFT m5, m0, m1 |
PSHUFD m6, m0, 0x93 |
andps m6, m6, [ps_mask] |
addps m0, m0, m6 |
LOADA64 m2, inq + 32 |
ROTLEFT m7, m1, m2 |
addps m1, m1, m5 |
LOADA64 m3, inq + 48 |
ROTLEFT m5, m2, m3 |
xorps m4, m4, m4 |
movlps m4, [inq+64] |
BUILDINVHIGHLOW m6, m3, m4 |
shufps m6, m6, m4, 0xa9 |
addps m4, m4, m6 |
addps m2, m2, m7 |
addps m3, m3, m5 |
; for(i=17;i>=3;i-=2) in[i] += in[i-2]; |
movlhps m5, m5, m0 |
andps m5, m5, [ps_mask3] |
BUILDINVHIGHLOW m7, m0, m1 |
andps m7, m7, [ps_mask2] |
addps m0, m0, m5 |
BUILDINVHIGHLOW m6, m1, m2 |
andps m6, m6, [ps_mask2] |
addps m1, m1, m7 |
BUILDINVHIGHLOW m7, m2, m3 |
andps m7, m7, [ps_mask2] |
addps m2, m2, m6 |
movhlps m6, m6, m3 |
andps m6, m6, [ps_mask4] |
addps m3, m3, m7 |
addps m4, m4, m6 |
; Populate tmp[] |
movlhps m6, m1, m5 ; zero out high values |
subps m6, m6, m4 |
subps m5, m0, m3 |
%if ARCH_X86_64 |
SWAP m5, m8 |
%endif |
mulps m7, m2, [ps_val1] |
%if ARCH_X86_64 |
mulps m5, m8, [ps_val2] |
%else |
mulps m5, m5, [ps_val2] |
%endif |
addps m7, m7, m5 |
mulps m5, m6, [ps_val1] |
subps m7, m7, m5 |
%if ARCH_X86_64 |
SWAP m5, m8 |
%else |
subps m5, m0, m3 |
%endif |
subps m5, m5, m6 |
addps m5, m5, m2 |
shufps m6, m4, m3, 0xe4 |
subps m6, m6, m2 |
mulps m6, m6, [ps_val3] |
addps m4, m4, m1 |
mulps m4, m4, [ps_val4] |
shufps m1, m1, m0, 0xe4 |
addps m1, m1, m2 |
mulps m1, m1, [ps_val5] |
mulps m3, m3, [ps_val6] |
mulps m0, m0, [ps_val7] |
addps m0, m0, m3 |
xorps m2, m1, [ps_p1p1m1m1] |
subps m2, m2, m4 |
addps m2, m2, m0 |
addps m3, m4, m0 |
subps m3, m3, m6 |
xorps m3, m3, [ps_p1p1m1m1] |
shufps m0, m0, m4, 0xe4 |
subps m0, m0, m1 |
addps m0, m0, m6 |
BUILDINVHIGHLOW m4, m2, m3 |
shufps m3, m3, m2, 0x4e |
; we have tmp = {SWAPLH(m0), SWAPLH(m7), m3, m4, m5} |
BUTTERF m0, m1, 0 |
BUTTERF m7, m2, 16 |
BUTTERF m3, m6, 32 |
BUTTERF m4, m1, 48 |
mulps m5, m5, [ps_cosh + 64] |
PSHUFD m1, m5, 0xe1 |
xorps m5, m5, [ps_p1m1p1m1] |
addps m5, m5, m1 |
; permutes: |
; m0 0 1 2 3 => 2 6 10 14 m1 |
; m7 4 5 6 7 => 3 7 11 15 m2 |
; m3 8 9 10 11 => 17 13 9 5 m3 |
; m4 12 13 14 15 => 16 12 8 4 m5 |
; m5 16 17 xx xx => 0 1 xx xx m0 |
unpckhps m1, m0, m7 |
unpckhps m6, m3, m4 |
movhlps m2, m6, m1 |
movlhps m1, m1, m6 |
unpcklps m5, m5, m4 |
unpcklps m3, m3, m7 |
movhlps m4, m3, m5 |
movlhps m5, m5, m3 |
SWAP m4, m3 |
; permutation done |
PSHUFD m6, m2, 0xb1 |
movss m4, [bufq + 4*68] |
movss m7, [bufq + 4*64] |
unpcklps m7, m7, m4 |
mulps m6, m6, [winq + 16*4] |
addps m6, m6, m7 |
movss [outq + 64*SBLIMIT], m6 |
shufps m6, m6, m6, 0xb1 |
movss [outq + 68*SBLIMIT], m6 |
mulps m6, m3, [winq + 4*4] |
LOAD m4, m7, bufq + 4*16, 16 |
addps m6, m6, m4 |
STORE m6, m7, outq + 16*SBLIMIT, 4*SBLIMIT |
shufps m4, m0, m3, 0xb5 |
mulps m4, m4, [winq + 8*4] |
LOAD m7, m6, bufq + 4*32, 16 |
addps m4, m4, m7 |
STORE m4, m6, outq + 32*SBLIMIT, 4*SBLIMIT |
shufps m3, m3, m2, 0xb1 |
mulps m3, m3, [winq + 12*4] |
LOAD m7, m6, bufq + 4*48, 16 |
addps m3, m3, m7 |
STORE m3, m7, outq + 48*SBLIMIT, 4*SBLIMIT |
mulps m2, m2, [winq] |
LOAD m6, m7, bufq, 16 |
addps m2, m2, m6 |
STORE m2, m7, outq, 4*SBLIMIT |
mulps m4, m1, [winq + 20*4] |
STORE m4, m7, bufq, 16 |
mulps m3, m5, [winq + 24*4] |
STORE m3, m7, bufq + 4*16, 16 |
shufps m0, m0, m5, 0xb0 |
mulps m0, m0, [winq + 28*4] |
STORE m0, m7, bufq + 4*32, 16 |
shufps m5, m5, m1, 0xb1 |
mulps m5, m5, [winq + 32*4] |
STORE m5, m7, bufq + 4*48, 16 |
shufps m1, m1, m1, 0xb1 |
mulps m1, m1, [winq + 36*4] |
movss [bufq + 4*64], m1 |
shufps m1, m1, 0xb1 |
movss [bufq + 4*68], m1 |
RET |
%endmacro |
INIT_XMM sse |
DEFINE_IMDCT |
INIT_XMM sse2 |
DEFINE_IMDCT |
INIT_XMM sse3 |
DEFINE_IMDCT |
INIT_XMM ssse3 |
DEFINE_IMDCT |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
DEFINE_IMDCT |
%endif |
INIT_XMM sse |
%if ARCH_X86_64 |
%define SPILL SWAP |
%define UNSPILL SWAP |
%define SPILLED(x) m %+ x |
%else |
%define SPILLED(x) [tmpq+(x-8)*16 + 32*4] |
%macro SPILL 2 ; xmm#, mempos |
movaps SPILLED(%2), m%1 |
%endmacro |
%macro UNSPILL 2 |
movaps m%1, SPILLED(%2) |
%endmacro |
%endif |
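; On x86_64 a "spill" is just a SWAP into one of the spare registers |
; m8-m15, so SPILLED(x) stays in a register; on x86_32 the values go |
; through the tmp buffer (one 16-byte slot each, placed after the first |
; 32 floats tmp itself uses). These support the four-at-a-time variant |
; below, which transposes 4x4 blocks so that each SIMD lane carries one |
; of four independent 36-point IMDCTs. |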
%macro DEFINE_FOUR_IMDCT 0 |
cglobal four_imdct36_float, 5,5,16, out, buf, in, win, tmp |
movlps m0, [inq+64] |
movhps m0, [inq+64 + 72] |
movlps m3, [inq+64 + 2*72] |
movhps m3, [inq+64 + 3*72] |
shufps m5, m0, m3, 0xdd |
shufps m0, m0, m3, 0x88 |
mova m1, [inq+48] |
movu m6, [inq+48 + 72] |
mova m7, [inq+48 + 2*72] |
movu m3, [inq+48 + 3*72] |
TRANSPOSE4x4PS 1, 6, 7, 3, 4 |
addps m4, m6, m7 |
mova [tmpq+4*28], m4 |
addps m7, m3 |
addps m6, m1 |
addps m3, m0 |
addps m0, m5 |
addps m0, m7 |
addps m7, m6 |
mova [tmpq+4*12], m7 |
SPILL 3, 12 |
mova m4, [inq+32] |
movu m5, [inq+32 + 72] |
mova m2, [inq+32 + 2*72] |
movu m7, [inq+32 + 3*72] |
TRANSPOSE4x4PS 4, 5, 2, 7, 3 |
addps m1, m7 |
SPILL 1, 11 |
addps m3, m5, m2 |
SPILL 3, 13 |
addps m7, m2 |
addps m5, m4 |
addps m6, m7 |
mova [tmpq], m6 |
addps m7, m5 |
mova [tmpq+4*16], m7 |
mova m2, [inq+16] |
movu m7, [inq+16 + 72] |
mova m1, [inq+16 + 2*72] |
movu m6, [inq+16 + 3*72] |
TRANSPOSE4x4PS 2, 7, 1, 6, 3 |
addps m4, m6 |
addps m6, m1 |
addps m1, m7 |
addps m7, m2 |
addps m5, m6 |
SPILL 5, 15 |
addps m6, m7 |
mulps m6, [costabs + 16*2] |
mova [tmpq+4*8], m6 |
SPILL 1, 10 |
SPILL 0, 14 |
mova m1, [inq] |
movu m6, [inq + 72] |
mova m3, [inq + 2*72] |
movu m5, [inq + 3*72] |
TRANSPOSE4x4PS 1, 6, 3, 5, 0 |
addps m2, m5 |
addps m5, m3 |
addps m7, m5 |
addps m3, m6 |
addps m6, m1 |
SPILL 7, 8 |
addps m5, m6 |
SPILL 6, 9 |
addps m6, m4, SPILLED(12) |
subps m6, m2 |
UNSPILL 7, 11 |
SPILL 5, 11 |
subps m5, m1, m7 |
mulps m7, [costabs + 16*5] |
addps m7, m1 |
mulps m0, m6, [costabs + 16*6] |
addps m0, m5 |
mova [tmpq+4*24], m0 |
addps m6, m5 |
mova [tmpq+4*4], m6 |
addps m6, m4, m2 |
mulps m6, [costabs + 16*1] |
subps m4, SPILLED(12) |
mulps m4, [costabs + 16*8] |
addps m2, SPILLED(12) |
mulps m2, [costabs + 16*3] |
subps m5, m7, m6 |
subps m5, m2 |
addps m6, m7 |
addps m6, m4 |
addps m7, m2 |
subps m7, m4 |
mova [tmpq+4*20], m7 |
mova m2, [tmpq+4*28] |
mova [tmpq+4*28], m5 |
UNSPILL 7, 13 |
subps m5, m7, m2 |
mulps m5, [costabs + 16*7] |
UNSPILL 1, 10 |
mulps m1, [costabs + 16*2] |
addps m4, m3, m2 |
mulps m4, [costabs + 16*4] |
addps m2, m7 |
addps m7, m3 |
mulps m7, [costabs] |
subps m3, m2 |
mulps m3, [costabs + 16*2] |
addps m2, m7, m5 |
addps m2, m1 |
SPILL 2, 10 |
addps m7, m4 |
subps m7, m1 |
SPILL 7, 12 |
subps m5, m4 |
subps m5, m1 |
UNSPILL 0, 14 |
SPILL 5, 13 |
addps m1, m0, SPILLED(15) |
subps m1, SPILLED(8) |
mova m4, [costabs + 16*5] |
mulps m4, [tmpq] |
UNSPILL 2, 9 |
addps m4, m2 |
subps m2, [tmpq] |
mulps m5, m1, [costabs + 16*6] |
addps m5, m2 |
SPILL 5, 9 |
addps m2, m1 |
SPILL 2, 14 |
UNSPILL 5, 15 |
subps m7, m5, m0 |
addps m5, SPILLED(8) |
mulps m5, [costabs + 16*1] |
mulps m7, [costabs + 16*8] |
addps m0, SPILLED(8) |
mulps m0, [costabs + 16*3] |
subps m2, m4, m5 |
subps m2, m0 |
SPILL 2, 15 |
addps m5, m4 |
addps m5, m7 |
addps m4, m0 |
subps m4, m7 |
SPILL 4, 8 |
mova m7, [tmpq+4*16] |
mova m2, [tmpq+4*12] |
addps m0, m7, m2 |
subps m0, SPILLED(11) |
mulps m0, [costabs + 16*2] |
addps m4, m7, SPILLED(11) |
mulps m4, [costabs] |
subps m7, m2 |
mulps m7, [costabs + 16*7] |
addps m2, SPILLED(11) |
mulps m2, [costabs + 16*4] |
addps m1, m7, [tmpq+4*8] |
addps m1, m4 |
addps m4, m2 |
subps m4, [tmpq+4*8] |
SPILL 4, 11 |
subps m7, m2 |
subps m7, [tmpq+4*8] |
addps m4, m6, SPILLED(10) |
subps m6, SPILLED(10) |
addps m2, m5, m1 |
mulps m2, [costabs + 16*9] |
subps m5, m1 |
mulps m5, [costabs + 16*17] |
subps m1, m4, m2 |
addps m4, m2 |
mulps m2, m1, [winq+4*36] |
addps m2, [bufq+4*36] |
mova [outq+1152], m2 |
mulps m1, [winq+4*32] |
addps m1, [bufq+4*32] |
mova [outq+1024], m1 |
mulps m1, m4, [winq+4*116] |
mova [bufq+4*36], m1 |
mulps m4, [winq+4*112] |
mova [bufq+4*32], m4 |
addps m2, m6, m5 |
subps m6, m5 |
mulps m1, m6, [winq+4*68] |
addps m1, [bufq+4*68] |
mova [outq+2176], m1 |
mulps m6, [winq] |
addps m6, [bufq] |
mova [outq], m6 |
mulps m1, m2, [winq+4*148] |
mova [bufq+4*68], m1 |
mulps m2, [winq+4*80] |
mova [bufq], m2 |
addps m5, m3, [tmpq+4*24] |
mova m2, [tmpq+4*24] |
subps m2, m3 |
mova m1, SPILLED(9) |
subps m1, m0 |
mulps m1, [costabs + 16*10] |
addps m0, SPILLED(9) |
mulps m0, [costabs + 16*16] |
addps m6, m5, m1 |
subps m5, m1 |
mulps m3, m5, [winq+4*40] |
addps m3, [bufq+4*40] |
mova [outq+1280], m3 |
mulps m5, [winq+4*28] |
addps m5, [bufq+4*28] |
mova [outq+896], m5 |
mulps m1, m6, [winq+4*120] |
mova [bufq+4*40], m1 |
mulps m6, [winq+4*108] |
mova [bufq+4*28], m6 |
addps m1, m2, m0 |
subps m2, m0 |
mulps m5, m2, [winq+4*64] |
addps m5, [bufq+4*64] |
mova [outq+2048], m5 |
mulps m2, [winq+4*4] |
addps m2, [bufq+4*4] |
mova [outq+128], m2 |
mulps m0, m1, [winq+4*144] |
mova [bufq+4*64], m0 |
mulps m1, [winq+4*84] |
mova [bufq+4*4], m1 |
mova m1, [tmpq+4*28] |
mova m5, m1 |
addps m1, SPILLED(13) |
subps m5, SPILLED(13) |
UNSPILL 3, 15 |
addps m2, m7, m3 |
mulps m2, [costabs + 16*11] |
subps m3, m7 |
mulps m3, [costabs + 16*15] |
addps m0, m2, m1 |
subps m1, m2 |
SWAP m0, m2 |
mulps m6, m1, [winq+4*44] |
addps m6, [bufq+4*44] |
mova [outq+1408], m6 |
mulps m1, [winq+4*24] |
addps m1, [bufq+4*24] |
mova [outq+768], m1 |
mulps m0, m2, [winq+4*124] |
mova [bufq+4*44], m0 |
mulps m2, [winq+4*104] |
mova [bufq+4*24], m2 |
addps m0, m5, m3 |
subps m5, m3 |
mulps m1, m5, [winq+4*60] |
addps m1, [bufq+4*60] |
mova [outq+1920], m1 |
mulps m5, [winq+4*8] |
addps m5, [bufq+4*8] |
mova [outq+256], m5 |
mulps m1, m0, [winq+4*140] |
mova [bufq+4*60], m1 |
mulps m0, [winq+4*88] |
mova [bufq+4*8], m0 |
mova m1, [tmpq+4*20] |
addps m1, SPILLED(12) |
mova m2, [tmpq+4*20] |
subps m2, SPILLED(12) |
UNSPILL 7, 8 |
subps m0, m7, SPILLED(11) |
addps m7, SPILLED(11) |
mulps m4, m7, [costabs + 16*12] |
mulps m0, [costabs + 16*14] |
addps m5, m1, m4 |
subps m1, m4 |
mulps m7, m1, [winq+4*48] |
addps m7, [bufq+4*48] |
mova [outq+1536], m7 |
mulps m1, [winq+4*20] |
addps m1, [bufq+4*20] |
mova [outq+640], m1 |
mulps m1, m5, [winq+4*128] |
mova [bufq+4*48], m1 |
mulps m5, [winq+4*100] |
mova [bufq+4*20], m5 |
addps m6, m2, m0 |
subps m2, m0 |
mulps m1, m2, [winq+4*56] |
addps m1, [bufq+4*56] |
mova [outq+1792], m1 |
mulps m2, [winq+4*12] |
addps m2, [bufq+4*12] |
mova [outq+384], m2 |
mulps m0, m6, [winq+4*136] |
mova [bufq+4*56], m0 |
mulps m6, [winq+4*92] |
mova [bufq+4*12], m6 |
UNSPILL 0, 14 |
mulps m0, [costabs + 16*13] |
mova m3, [tmpq+4*4] |
addps m2, m0, m3 |
subps m3, m0 |
mulps m0, m3, [winq+4*52] |
addps m0, [bufq+4*52] |
mova [outq+1664], m0 |
mulps m3, [winq+4*16] |
addps m3, [bufq+4*16] |
mova [outq+512], m3 |
mulps m0, m2, [winq+4*132] |
mova [bufq+4*52], m0 |
mulps m2, [winq+4*96] |
mova [bufq+4*16], m2 |
RET |
%endmacro |
INIT_XMM sse |
DEFINE_FOUR_IMDCT |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
DEFINE_FOUR_IMDCT |
%endif |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/lpc.c |
---|
0,0 → 1,159 |
/* |
* MMX optimized LPC DSP utils |
* Copyright (c) 2007 Loren Merritt |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/attributes.h" |
#include "libavutil/cpu.h" |
#include "libavutil/mem.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/lpc.h" |
DECLARE_ASM_CONST(16, double, pd_1)[2] = { 1.0, 1.0 }; |
DECLARE_ASM_CONST(16, double, pd_2)[2] = { 2.0, 2.0 }; |
#if HAVE_SSE2_INLINE |
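/* Applies a Welch window, w[i] = 1 - (i*c - 1)^2 with c = 2/(len-1), |
 * working pairwise outwards from the centre and producing two windowed |
 * doubles per iteration. A scalar sketch of the same computation |
 * (illustrative only; the asm picks the movapd/movupd variant of the |
 * WELCH macro depending on the parity of len): |
 *     for (i = 0; i < len; i++) |
 *         w_data[i] = data[i] * (1.0 - (i*c - 1.0) * (i*c - 1.0)); |
 */ |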
static void lpc_apply_welch_window_sse2(const int32_t *data, int len, |
double *w_data) |
{ |
double c = 2.0 / (len-1.0); |
int n2 = len>>1; |
x86_reg i = -n2*sizeof(int32_t); |
x86_reg j = n2*sizeof(int32_t); |
__asm__ volatile( |
"movsd %4, %%xmm7 \n\t" |
"movapd "MANGLE(pd_1)", %%xmm6 \n\t" |
"movapd "MANGLE(pd_2)", %%xmm5 \n\t" |
"movlhps %%xmm7, %%xmm7 \n\t" |
"subpd %%xmm5, %%xmm7 \n\t" |
"addsd %%xmm6, %%xmm7 \n\t" |
"test $1, %5 \n\t" |
"jz 2f \n\t" |
#define WELCH(MOVPD, offset)\ |
"1: \n\t"\ |
"movapd %%xmm7, %%xmm1 \n\t"\ |
"mulpd %%xmm1, %%xmm1 \n\t"\ |
"movapd %%xmm6, %%xmm0 \n\t"\ |
"subpd %%xmm1, %%xmm0 \n\t"\ |
"pshufd $0x4e, %%xmm0, %%xmm1 \n\t"\ |
"cvtpi2pd (%3,%0), %%xmm2 \n\t"\ |
"cvtpi2pd "#offset"*4(%3,%1), %%xmm3 \n\t"\ |
"mulpd %%xmm0, %%xmm2 \n\t"\ |
"mulpd %%xmm1, %%xmm3 \n\t"\ |
"movapd %%xmm2, (%2,%0,2) \n\t"\ |
MOVPD" %%xmm3, "#offset"*8(%2,%1,2) \n\t"\ |
"subpd %%xmm5, %%xmm7 \n\t"\ |
"sub $8, %1 \n\t"\ |
"add $8, %0 \n\t"\ |
"jl 1b \n\t"\ |
WELCH("movupd", -1) |
"jmp 3f \n\t" |
"2: \n\t" |
WELCH("movapd", -2) |
"3: \n\t" |
:"+&r"(i), "+&r"(j) |
:"r"(w_data+n2), "r"(data+n2), "m"(c), "r"(len) |
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", |
"%xmm5", "%xmm6", "%xmm7") |
); |
#undef WELCH |
} |
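/* Computes the autocorrelation autoc[j] = sum_i data[i] * data[i-j] |
 * over the windowed samples, two lags per outer iteration (the |
 * j == lag-2 branch finishes with three at once). A scalar sketch |
 * (illustrative only): |
 *     for (j = 0; j <= lag; j++) { |
 *         double sum = 0.0; |
 *         for (i = j; i < len; i++) |
 *             sum += data[i] * data[i - j]; |
 *         autoc[j] = sum; |
 *     } |
 */ |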
static void lpc_compute_autocorr_sse2(const double *data, int len, int lag, |
double *autoc) |
{ |
int j; |
if((x86_reg)data & 15) |
data++; |
for(j=0; j<lag; j+=2){ |
x86_reg i = -len*sizeof(double); |
if(j == lag-2) { |
__asm__ volatile( |
"movsd "MANGLE(pd_1)", %%xmm0 \n\t" |
"movsd "MANGLE(pd_1)", %%xmm1 \n\t" |
"movsd "MANGLE(pd_1)", %%xmm2 \n\t" |
"1: \n\t" |
"movapd (%2,%0), %%xmm3 \n\t" |
"movupd -8(%3,%0), %%xmm4 \n\t" |
"movapd (%3,%0), %%xmm5 \n\t" |
"mulpd %%xmm3, %%xmm4 \n\t" |
"mulpd %%xmm3, %%xmm5 \n\t" |
"mulpd -16(%3,%0), %%xmm3 \n\t" |
"addpd %%xmm4, %%xmm1 \n\t" |
"addpd %%xmm5, %%xmm0 \n\t" |
"addpd %%xmm3, %%xmm2 \n\t" |
"add $16, %0 \n\t" |
"jl 1b \n\t" |
"movhlps %%xmm0, %%xmm3 \n\t" |
"movhlps %%xmm1, %%xmm4 \n\t" |
"movhlps %%xmm2, %%xmm5 \n\t" |
"addsd %%xmm3, %%xmm0 \n\t" |
"addsd %%xmm4, %%xmm1 \n\t" |
"addsd %%xmm5, %%xmm2 \n\t" |
"movsd %%xmm0, (%1) \n\t" |
"movsd %%xmm1, 8(%1) \n\t" |
"movsd %%xmm2, 16(%1) \n\t" |
:"+&r"(i) |
:"r"(autoc+j), "r"(data+len), "r"(data+len-j) |
:"memory" |
); |
} else { |
__asm__ volatile( |
"movsd "MANGLE(pd_1)", %%xmm0 \n\t" |
"movsd "MANGLE(pd_1)", %%xmm1 \n\t" |
"1: \n\t" |
"movapd (%3,%0), %%xmm3 \n\t" |
"movupd -8(%4,%0), %%xmm4 \n\t" |
"mulpd %%xmm3, %%xmm4 \n\t" |
"mulpd (%4,%0), %%xmm3 \n\t" |
"addpd %%xmm4, %%xmm1 \n\t" |
"addpd %%xmm3, %%xmm0 \n\t" |
"add $16, %0 \n\t" |
"jl 1b \n\t" |
"movhlps %%xmm0, %%xmm3 \n\t" |
"movhlps %%xmm1, %%xmm4 \n\t" |
"addsd %%xmm3, %%xmm0 \n\t" |
"addsd %%xmm4, %%xmm1 \n\t" |
"movsd %%xmm0, %1 \n\t" |
"movsd %%xmm1, %2 \n\t" |
:"+&r"(i), "=m"(autoc[j]), "=m"(autoc[j+1]) |
:"r"(data+len), "r"(data+len-j) |
); |
} |
} |
} |
#endif /* HAVE_SSE2_INLINE */ |
av_cold void ff_lpc_init_x86(LPCContext *c) |
{ |
#if HAVE_SSE2_INLINE |
int cpu_flags = av_get_cpu_flags(); |
if (HAVE_SSE2_INLINE && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) { |
c->lpc_apply_welch_window = lpc_apply_welch_window_sse2; |
c->lpc_compute_autocorr = lpc_compute_autocorr_sse2; |
} |
#endif /* HAVE_SSE2_INLINE */ |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/mathops.h |
---|
0,0 → 1,128 |
/* |
* simple math operations |
* Copyright (c) 2006 Michael Niedermayer <michaelni@gmx.at> et al |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#ifndef AVCODEC_X86_MATHOPS_H |
#define AVCODEC_X86_MATHOPS_H |
#include "config.h" |
#include "libavutil/common.h" |
#if HAVE_INLINE_ASM |
#if ARCH_X86_32 |
#define MULL MULL |
static av_always_inline av_const int MULL(int a, int b, unsigned shift) |
{ |
int rt, dummy; |
__asm__ ( |
"imull %3 \n\t" |
"shrdl %4, %%edx, %%eax \n\t" |
:"=a"(rt), "=d"(dummy) |
:"a"(a), "rm"(b), "ci"((uint8_t)shift) |
); |
return rt; |
} |
#define MULH MULH |
static av_always_inline av_const int MULH(int a, int b) |
{ |
int rt, dummy; |
__asm__ ( |
"imull %3" |
:"=d"(rt), "=a"(dummy) |
:"a"(a), "rm"(b) |
); |
return rt; |
} |
#define MUL64 MUL64 |
static av_always_inline av_const int64_t MUL64(int a, int b) |
{ |
int64_t rt; |
__asm__ ( |
"imull %2" |
:"=A"(rt) |
:"a"(a), "rm"(b) |
); |
return rt; |
} |
#endif /* ARCH_X86_32 */ |
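/* The three helpers above are 32-bit shortcuts for products a portable |
 * build would write as (equivalent sketch): |
 *     MULL(a, b, s) == (int)(((int64_t)a * b) >> s) |
 *     MULH(a, b)    == (int)(((int64_t)a * b) >> 32) |
 *     MUL64(a, b)   == (int64_t)a * b |
 * imull leaves the full 64-bit product in edx:eax, which shrdl or the |
 * high register then reduce without a 64-bit temporary. |
 */ |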
#if HAVE_I686 |
/* median of 3 */ |
#define mid_pred mid_pred |
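/* Branchless median: the cmov chain below computes |
 * min(max(a,b), max(min(a,b), c)), which is the median of the three. */ |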
static inline av_const int mid_pred(int a, int b, int c) |
{ |
int i=b; |
__asm__ ( |
"cmp %2, %1 \n\t" |
"cmovg %1, %0 \n\t" |
"cmovg %2, %1 \n\t" |
"cmp %3, %1 \n\t" |
"cmovl %3, %1 \n\t" |
"cmp %1, %0 \n\t" |
"cmovg %1, %0 \n\t" |
:"+&r"(i), "+&r"(a) |
:"r"(b), "r"(c) |
); |
return i; |
} |
#define COPY3_IF_LT(x, y, a, b, c, d)\ |
__asm__ volatile(\ |
"cmpl %0, %3 \n\t"\ |
"cmovl %3, %0 \n\t"\ |
"cmovl %4, %1 \n\t"\ |
"cmovl %5, %2 \n\t"\ |
: "+&r" (x), "+&r" (a), "+r" (c)\ |
: "r" (y), "r" (b), "r" (d)\ |
); |
#endif /* HAVE_I686 */ |
#define MASK_ABS(mask, level) \ |
__asm__ ("cltd \n\t" \ |
"xorl %1, %0 \n\t" \ |
"subl %1, %0 \n\t" \ |
: "+a"(level), "=&d"(mask)) |
// x86 shift instructions mask the count mod 32, so shifting by -s is |
// the same as shifting by 32-s; this avoids an explicit +32 (gcc should |
// do that optimization itself ...) |
#define NEG_SSR32 NEG_SSR32 |
static inline int32_t NEG_SSR32( int32_t a, int8_t s){ |
__asm__ ("sarl %1, %0\n\t" |
: "+r" (a) |
: "ic" ((uint8_t)(-s)) |
); |
return a; |
} |
#define NEG_USR32 NEG_USR32 |
static inline uint32_t NEG_USR32(uint32_t a, int8_t s){ |
__asm__ ("shrl %1, %0\n\t" |
: "+r" (a) |
: "ic" ((uint8_t)(-s)) |
); |
return a; |
} |
#endif /* HAVE_INLINE_ASM */ |
#endif /* AVCODEC_X86_MATHOPS_H */ |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/mlpdsp.c |
---|
0,0 → 1,186 |
/* |
* MLP DSP functions x86-optimized |
* Copyright (c) 2009 Ramiro Polla |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/attributes.h" |
#include "libavutil/cpu.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/mlpdsp.h" |
#include "libavcodec/mlp.h" |
#if HAVE_7REGS && HAVE_INLINE_ASM |
extern char ff_mlp_firorder_8; |
extern char ff_mlp_firorder_7; |
extern char ff_mlp_firorder_6; |
extern char ff_mlp_firorder_5; |
extern char ff_mlp_firorder_4; |
extern char ff_mlp_firorder_3; |
extern char ff_mlp_firorder_2; |
extern char ff_mlp_firorder_1; |
extern char ff_mlp_firorder_0; |
extern char ff_mlp_iirorder_4; |
extern char ff_mlp_iirorder_3; |
extern char ff_mlp_iirorder_2; |
extern char ff_mlp_iirorder_1; |
extern char ff_mlp_iirorder_0; |
static const void *firtable[9] = { &ff_mlp_firorder_0, &ff_mlp_firorder_1, |
&ff_mlp_firorder_2, &ff_mlp_firorder_3, |
&ff_mlp_firorder_4, &ff_mlp_firorder_5, |
&ff_mlp_firorder_6, &ff_mlp_firorder_7, |
&ff_mlp_firorder_8 }; |
static const void *iirtable[5] = { &ff_mlp_iirorder_0, &ff_mlp_iirorder_1, |
&ff_mlp_iirorder_2, &ff_mlp_iirorder_3, |
&ff_mlp_iirorder_4 }; |
#if ARCH_X86_64 |
#define MLPMUL(label, offset, offs, offc) \ |
LABEL_MANGLE(label)": \n\t" \ |
"movslq "offset"+"offs"(%0), %%rax\n\t" \ |
"movslq "offset"+"offc"(%1), %%rdx\n\t" \ |
"imul %%rdx, %%rax\n\t" \ |
"add %%rax, %%rsi\n\t" |
#define FIRMULREG(label, offset, firc)\ |
LABEL_MANGLE(label)": \n\t" \ |
"movslq "#offset"(%0), %%rax\n\t" \ |
"imul %"#firc", %%rax\n\t" \ |
"add %%rax, %%rsi\n\t" |
#define CLEAR_ACCUM \ |
"xor %%rsi, %%rsi\n\t" |
#define SHIFT_ACCUM \ |
"shr %%cl, %%rsi\n\t" |
#define ACCUM "%%rdx" |
#define RESULT "%%rsi" |
#define RESULT32 "%%esi" |
#else /* if ARCH_X86_32 */ |
#define MLPMUL(label, offset, offs, offc) \ |
LABEL_MANGLE(label)": \n\t" \ |
"mov "offset"+"offs"(%0), %%eax\n\t" \ |
"imull "offset"+"offc"(%1) \n\t" \ |
"add %%eax , %%esi\n\t" \ |
"adc %%edx , %%ecx\n\t" |
#define FIRMULREG(label, offset, firc) \ |
MLPMUL(label, #offset, "0", "0") |
#define CLEAR_ACCUM \ |
"xor %%esi, %%esi\n\t" \ |
"xor %%ecx, %%ecx\n\t" |
#define SHIFT_ACCUM \ |
"mov %%ecx, %%edx\n\t" \ |
"mov %%esi, %%eax\n\t" \ |
"movzbl %7 , %%ecx\n\t" \ |
"shrd %%cl, %%edx, %%eax\n\t" \ |
#define ACCUM "%%edx" |
#define RESULT "%%eax" |
#define RESULT32 "%%eax" |
#endif /* !ARCH_X86_64 */ |
#define BINC AV_STRINGIFY(4* MAX_CHANNELS) |
#define IOFFS AV_STRINGIFY(4*(MAX_FIR_ORDER + MAX_BLOCKSIZE)) |
#define IOFFC AV_STRINGIFY(4* MAX_FIR_ORDER) |
#define FIRMUL(label, offset) MLPMUL(label, #offset, "0", "0") |
#define IIRMUL(label, offset) MLPMUL(label, #offset, IOFFS, IOFFC) |
static void mlp_filter_channel_x86(int32_t *state, const int32_t *coeff, |
int firorder, int iirorder, |
unsigned int filter_shift, int32_t mask, |
int blocksize, int32_t *sample_buffer) |
{ |
const void *firjump = firtable[firorder]; |
const void *iirjump = iirtable[iirorder]; |
blocksize = -blocksize; |
__asm__ volatile( |
"1: \n\t" |
CLEAR_ACCUM |
"jmp *%5 \n\t" |
FIRMUL (ff_mlp_firorder_8, 0x1c ) |
FIRMUL (ff_mlp_firorder_7, 0x18 ) |
FIRMUL (ff_mlp_firorder_6, 0x14 ) |
FIRMUL (ff_mlp_firorder_5, 0x10 ) |
FIRMUL (ff_mlp_firorder_4, 0x0c ) |
FIRMULREG(ff_mlp_firorder_3, 0x08,10) |
FIRMULREG(ff_mlp_firorder_2, 0x04, 9) |
FIRMULREG(ff_mlp_firorder_1, 0x00, 8) |
LABEL_MANGLE(ff_mlp_firorder_0)":\n\t" |
"jmp *%6 \n\t" |
IIRMUL (ff_mlp_iirorder_4, 0x0c ) |
IIRMUL (ff_mlp_iirorder_3, 0x08 ) |
IIRMUL (ff_mlp_iirorder_2, 0x04 ) |
IIRMUL (ff_mlp_iirorder_1, 0x00 ) |
LABEL_MANGLE(ff_mlp_iirorder_0)":\n\t" |
SHIFT_ACCUM |
"mov "RESULT" ,"ACCUM" \n\t" |
"add (%2) ,"RESULT" \n\t" |
"and %4 ,"RESULT" \n\t" |
"sub $4 , %0 \n\t" |
"mov "RESULT32", (%0) \n\t" |
"mov "RESULT32", (%2) \n\t" |
"add $"BINC" , %2 \n\t" |
"sub "ACCUM" ,"RESULT" \n\t" |
"mov "RESULT32","IOFFS"(%0) \n\t" |
"incl %3 \n\t" |
"js 1b \n\t" |
: /* 0*/"+r"(state), |
/* 1*/"+r"(coeff), |
/* 2*/"+r"(sample_buffer), |
#if ARCH_X86_64 |
/* 3*/"+r"(blocksize) |
: /* 4*/"r"((x86_reg)mask), /* 5*/"r"(firjump), |
/* 6*/"r"(iirjump) , /* 7*/"c"(filter_shift) |
, /* 8*/"r"((int64_t)coeff[0]) |
, /* 9*/"r"((int64_t)coeff[1]) |
, /*10*/"r"((int64_t)coeff[2]) |
: "rax", "rdx", "rsi" |
#else /* ARCH_X86_32 */ |
/* 3*/"+m"(blocksize) |
: /* 4*/"m"( mask), /* 5*/"m"(firjump), |
/* 6*/"m"(iirjump) , /* 7*/"m"(filter_shift) |
: "eax", "edx", "esi", "ecx" |
#endif /* !ARCH_X86_64 */ |
); |
} |
#endif /* HAVE_7REGS && HAVE_INLINE_ASM */ |
av_cold void ff_mlpdsp_init_x86(MLPDSPContext *c) |
{ |
#if HAVE_7REGS && HAVE_INLINE_ASM |
int cpu_flags = av_get_cpu_flags(); |
if (INLINE_MMX(cpu_flags)) |
c->mlp_filter_channel = mlp_filter_channel_x86; |
#endif |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/motion_est.c |
---|
0,0 → 1,474 |
/* |
* MMX optimized motion estimation |
* Copyright (c) 2001 Fabrice Bellard |
* Copyright (c) 2002-2004 Michael Niedermayer |
* |
* mostly by Michael Niedermayer <michaelni@gmx.at> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/attributes.h" |
#include "libavutil/avassert.h" |
#include "libavutil/mem.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "dsputil_x86.h" |
#if HAVE_INLINE_ASM |
DECLARE_ASM_CONST(8, uint64_t, round_tab)[3]={ |
0x0000000000000000ULL, |
0x0001000100010001ULL, |
0x0002000200020002ULL, |
}; |
DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL; |
static inline void sad8_1_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
{ |
x86_reg len= -(x86_reg)stride*h; |
__asm__ volatile( |
".p2align 4 \n\t" |
"1: \n\t" |
"movq (%1, %%"REG_a"), %%mm0 \n\t" |
"movq (%2, %%"REG_a"), %%mm2 \n\t" |
"movq (%2, %%"REG_a"), %%mm4 \n\t" |
"add %3, %%"REG_a" \n\t" |
"psubusb %%mm0, %%mm2 \n\t" |
"psubusb %%mm4, %%mm0 \n\t" |
"movq (%1, %%"REG_a"), %%mm1 \n\t" |
"movq (%2, %%"REG_a"), %%mm3 \n\t" |
"movq (%2, %%"REG_a"), %%mm5 \n\t" |
"psubusb %%mm1, %%mm3 \n\t" |
"psubusb %%mm5, %%mm1 \n\t" |
"por %%mm2, %%mm0 \n\t" |
"por %%mm1, %%mm3 \n\t" |
"movq %%mm0, %%mm1 \n\t" |
"movq %%mm3, %%mm2 \n\t" |
"punpcklbw %%mm7, %%mm0 \n\t" |
"punpckhbw %%mm7, %%mm1 \n\t" |
"punpcklbw %%mm7, %%mm3 \n\t" |
"punpckhbw %%mm7, %%mm2 \n\t" |
"paddw %%mm1, %%mm0 \n\t" |
"paddw %%mm3, %%mm2 \n\t" |
"paddw %%mm2, %%mm0 \n\t" |
"paddw %%mm0, %%mm6 \n\t" |
"add %3, %%"REG_a" \n\t" |
" js 1b \n\t" |
: "+a" (len) |
: "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride) |
); |
} |
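/* Plain MMX has no sum-of-absolute-differences instruction, so in |
 * sad8_1_mmx above |a-b| is built from two saturating subtractions, |
 * (a -us b) | (b -us a); the bytes are then widened against the zero |
 * register mm7 and accumulated in mm6, which sum_mmx() reduces at the |
 * end. Scalar sketch of one row (illustrative only): |
 *     for (x = 0; x < 8; x++) |
 *         sum += FFABS(blk1[x] - blk2[x]); |
 */ |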
static inline void sad8_1_mmxext(uint8_t *blk1, uint8_t *blk2, |
int stride, int h) |
{ |
__asm__ volatile( |
".p2align 4 \n\t" |
"1: \n\t" |
"movq (%1), %%mm0 \n\t" |
"movq (%1, %3), %%mm1 \n\t" |
"psadbw (%2), %%mm0 \n\t" |
"psadbw (%2, %3), %%mm1 \n\t" |
"paddw %%mm0, %%mm6 \n\t" |
"paddw %%mm1, %%mm6 \n\t" |
"lea (%1,%3,2), %1 \n\t" |
"lea (%2,%3,2), %2 \n\t" |
"sub $2, %0 \n\t" |
" jg 1b \n\t" |
: "+r" (h), "+r" (blk1), "+r" (blk2) |
: "r" ((x86_reg)stride) |
); |
} |
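/* With MMXEXT and SSE2, psadbw computes the whole bytewise |a-b| sum |
 * in one instruction, so these variants simply accumulate two rows per |
 * loop iteration. */ |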
static int sad16_sse2(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h) |
{ |
int ret; |
__asm__ volatile( |
"pxor %%xmm2, %%xmm2 \n\t" |
".p2align 4 \n\t" |
"1: \n\t" |
"movdqu (%1), %%xmm0 \n\t" |
"movdqu (%1, %4), %%xmm1 \n\t" |
"psadbw (%2), %%xmm0 \n\t" |
"psadbw (%2, %4), %%xmm1 \n\t" |
"paddw %%xmm0, %%xmm2 \n\t" |
"paddw %%xmm1, %%xmm2 \n\t" |
"lea (%1,%4,2), %1 \n\t" |
"lea (%2,%4,2), %2 \n\t" |
"sub $2, %0 \n\t" |
" jg 1b \n\t" |
"movhlps %%xmm2, %%xmm0 \n\t" |
"paddw %%xmm0, %%xmm2 \n\t" |
"movd %%xmm2, %3 \n\t" |
: "+r" (h), "+r" (blk1), "+r" (blk2), "=r"(ret) |
: "r" ((x86_reg)stride) |
); |
return ret; |
} |
static inline void sad8_x2a_mmxext(uint8_t *blk1, uint8_t *blk2, |
int stride, int h) |
{ |
__asm__ volatile( |
".p2align 4 \n\t" |
"1: \n\t" |
"movq (%1), %%mm0 \n\t" |
"movq (%1, %3), %%mm1 \n\t" |
"pavgb 1(%1), %%mm0 \n\t" |
"pavgb 1(%1, %3), %%mm1 \n\t" |
"psadbw (%2), %%mm0 \n\t" |
"psadbw (%2, %3), %%mm1 \n\t" |
"paddw %%mm0, %%mm6 \n\t" |
"paddw %%mm1, %%mm6 \n\t" |
"lea (%1,%3,2), %1 \n\t" |
"lea (%2,%3,2), %2 \n\t" |
"sub $2, %0 \n\t" |
" jg 1b \n\t" |
: "+r" (h), "+r" (blk1), "+r" (blk2) |
: "r" ((x86_reg)stride) |
); |
} |
static inline void sad8_y2a_mmxext(uint8_t *blk1, uint8_t *blk2, |
int stride, int h) |
{ |
__asm__ volatile( |
"movq (%1), %%mm0 \n\t" |
"add %3, %1 \n\t" |
".p2align 4 \n\t" |
"1: \n\t" |
"movq (%1), %%mm1 \n\t" |
"movq (%1, %3), %%mm2 \n\t" |
"pavgb %%mm1, %%mm0 \n\t" |
"pavgb %%mm2, %%mm1 \n\t" |
"psadbw (%2), %%mm0 \n\t" |
"psadbw (%2, %3), %%mm1 \n\t" |
"paddw %%mm0, %%mm6 \n\t" |
"paddw %%mm1, %%mm6 \n\t" |
"movq %%mm2, %%mm0 \n\t" |
"lea (%1,%3,2), %1 \n\t" |
"lea (%2,%3,2), %2 \n\t" |
"sub $2, %0 \n\t" |
" jg 1b \n\t" |
: "+r" (h), "+r" (blk1), "+r" (blk2) |
: "r" ((x86_reg)stride) |
); |
} |
static inline void sad8_4_mmxext(uint8_t *blk1, uint8_t *blk2, |
int stride, int h) |
{ |
__asm__ volatile( |
"movq "MANGLE(bone)", %%mm5 \n\t" |
"movq (%1), %%mm0 \n\t" |
"pavgb 1(%1), %%mm0 \n\t" |
"add %3, %1 \n\t" |
".p2align 4 \n\t" |
"1: \n\t" |
"movq (%1), %%mm1 \n\t" |
"movq (%1,%3), %%mm2 \n\t" |
"pavgb 1(%1), %%mm1 \n\t" |
"pavgb 1(%1,%3), %%mm2 \n\t" |
"psubusb %%mm5, %%mm1 \n\t" |
"pavgb %%mm1, %%mm0 \n\t" |
"pavgb %%mm2, %%mm1 \n\t" |
"psadbw (%2), %%mm0 \n\t" |
"psadbw (%2,%3), %%mm1 \n\t" |
"paddw %%mm0, %%mm6 \n\t" |
"paddw %%mm1, %%mm6 \n\t" |
"movq %%mm2, %%mm0 \n\t" |
"lea (%1,%3,2), %1 \n\t" |
"lea (%2,%3,2), %2 \n\t" |
"sub $2, %0 \n\t" |
" jg 1b \n\t" |
: "+r" (h), "+r" (blk1), "+r" (blk2) |
: "r" ((x86_reg)stride) |
); |
} |
static inline void sad8_2_mmx(uint8_t *blk1a, uint8_t *blk1b, uint8_t *blk2, int stride, int h) |
{ |
x86_reg len= -(x86_reg)stride*h; |
__asm__ volatile( |
".p2align 4 \n\t" |
"1: \n\t" |
"movq (%1, %%"REG_a"), %%mm0 \n\t" |
"movq (%2, %%"REG_a"), %%mm1 \n\t" |
"movq (%1, %%"REG_a"), %%mm2 \n\t" |
"movq (%2, %%"REG_a"), %%mm3 \n\t" |
"punpcklbw %%mm7, %%mm0 \n\t" |
"punpcklbw %%mm7, %%mm1 \n\t" |
"punpckhbw %%mm7, %%mm2 \n\t" |
"punpckhbw %%mm7, %%mm3 \n\t" |
"paddw %%mm0, %%mm1 \n\t" |
"paddw %%mm2, %%mm3 \n\t" |
"movq (%3, %%"REG_a"), %%mm4 \n\t" |
"movq (%3, %%"REG_a"), %%mm2 \n\t" |
"paddw %%mm5, %%mm1 \n\t" |
"paddw %%mm5, %%mm3 \n\t" |
"psrlw $1, %%mm1 \n\t" |
"psrlw $1, %%mm3 \n\t" |
"packuswb %%mm3, %%mm1 \n\t" |
"psubusb %%mm1, %%mm4 \n\t" |
"psubusb %%mm2, %%mm1 \n\t" |
"por %%mm4, %%mm1 \n\t" |
"movq %%mm1, %%mm0 \n\t" |
"punpcklbw %%mm7, %%mm0 \n\t" |
"punpckhbw %%mm7, %%mm1 \n\t" |
"paddw %%mm1, %%mm0 \n\t" |
"paddw %%mm0, %%mm6 \n\t" |
"add %4, %%"REG_a" \n\t" |
" js 1b \n\t" |
: "+a" (len) |
: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride) |
); |
} |
static inline void sad8_4_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
{ |
x86_reg len= -(x86_reg)stride*h; |
__asm__ volatile( |
"movq (%1, %%"REG_a"), %%mm0 \n\t" |
"movq 1(%1, %%"REG_a"), %%mm2 \n\t" |
"movq %%mm0, %%mm1 \n\t" |
"movq %%mm2, %%mm3 \n\t" |
"punpcklbw %%mm7, %%mm0 \n\t" |
"punpckhbw %%mm7, %%mm1 \n\t" |
"punpcklbw %%mm7, %%mm2 \n\t" |
"punpckhbw %%mm7, %%mm3 \n\t" |
"paddw %%mm2, %%mm0 \n\t" |
"paddw %%mm3, %%mm1 \n\t" |
".p2align 4 \n\t" |
"1: \n\t" |
"movq (%2, %%"REG_a"), %%mm2 \n\t" |
"movq 1(%2, %%"REG_a"), %%mm4 \n\t" |
"movq %%mm2, %%mm3 \n\t" |
"movq %%mm4, %%mm5 \n\t" |
"punpcklbw %%mm7, %%mm2 \n\t" |
"punpckhbw %%mm7, %%mm3 \n\t" |
"punpcklbw %%mm7, %%mm4 \n\t" |
"punpckhbw %%mm7, %%mm5 \n\t" |
"paddw %%mm4, %%mm2 \n\t" |
"paddw %%mm5, %%mm3 \n\t" |
"movq 16+"MANGLE(round_tab)", %%mm5 \n\t" |
"paddw %%mm2, %%mm0 \n\t" |
"paddw %%mm3, %%mm1 \n\t" |
"paddw %%mm5, %%mm0 \n\t" |
"paddw %%mm5, %%mm1 \n\t" |
"movq (%3, %%"REG_a"), %%mm4 \n\t" |
"movq (%3, %%"REG_a"), %%mm5 \n\t" |
"psrlw $2, %%mm0 \n\t" |
"psrlw $2, %%mm1 \n\t" |
"packuswb %%mm1, %%mm0 \n\t" |
"psubusb %%mm0, %%mm4 \n\t" |
"psubusb %%mm5, %%mm0 \n\t" |
"por %%mm4, %%mm0 \n\t" |
"movq %%mm0, %%mm4 \n\t" |
"punpcklbw %%mm7, %%mm0 \n\t" |
"punpckhbw %%mm7, %%mm4 \n\t" |
"paddw %%mm0, %%mm6 \n\t" |
"paddw %%mm4, %%mm6 \n\t" |
"movq %%mm2, %%mm0 \n\t" |
"movq %%mm3, %%mm1 \n\t" |
"add %4, %%"REG_a" \n\t" |
" js 1b \n\t" |
: "+a" (len) |
: "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride) |
); |
} |
static inline int sum_mmx(void) |
{ |
int ret; |
__asm__ volatile( |
"movq %%mm6, %%mm0 \n\t" |
"psrlq $32, %%mm6 \n\t" |
"paddw %%mm0, %%mm6 \n\t" |
"movq %%mm6, %%mm0 \n\t" |
"psrlq $16, %%mm6 \n\t" |
"paddw %%mm0, %%mm6 \n\t" |
"movd %%mm6, %0 \n\t" |
: "=r" (ret) |
); |
return ret&0xFFFF; |
} |
static inline int sum_mmxext(void) |
{ |
int ret; |
__asm__ volatile( |
"movd %%mm6, %0 \n\t" |
: "=r" (ret) |
); |
return ret; |
} |
static inline void sad8_x2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
{ |
sad8_2_mmx(blk1, blk1+1, blk2, stride, h); |
} |
static inline void sad8_y2a_mmx(uint8_t *blk1, uint8_t *blk2, int stride, int h) |
{ |
sad8_2_mmx(blk1, blk1+stride, blk2, stride, h); |
} |
#define PIX_SAD(suf)\ |
static int sad8_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ |
{\ |
av_assert2(h==8);\ |
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ |
"pxor %%mm6, %%mm6 \n\t":);\ |
\ |
sad8_1_ ## suf(blk1, blk2, stride, 8);\ |
\ |
return sum_ ## suf();\ |
}\ |
static int sad8_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ |
{\ |
av_assert2(h==8);\ |
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ |
"pxor %%mm6, %%mm6 \n\t"\ |
"movq %0, %%mm5 \n\t"\ |
:: "m"(round_tab[1]) \ |
);\ |
\ |
sad8_x2a_ ## suf(blk1, blk2, stride, 8);\ |
\ |
return sum_ ## suf();\ |
}\ |
\ |
static int sad8_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ |
{\ |
av_assert2(h==8);\ |
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ |
"pxor %%mm6, %%mm6 \n\t"\ |
"movq %0, %%mm5 \n\t"\ |
:: "m"(round_tab[1]) \ |
);\ |
\ |
sad8_y2a_ ## suf(blk1, blk2, stride, 8);\ |
\ |
return sum_ ## suf();\ |
}\ |
\ |
static int sad8_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ |
{\ |
av_assert2(h==8);\ |
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ |
"pxor %%mm6, %%mm6 \n\t"\ |
::);\ |
\ |
sad8_4_ ## suf(blk1, blk2, stride, 8);\ |
\ |
return sum_ ## suf();\ |
}\ |
\ |
static int sad16_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ |
{\ |
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ |
"pxor %%mm6, %%mm6 \n\t":);\ |
\ |
sad8_1_ ## suf(blk1 , blk2 , stride, h);\ |
sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\ |
\ |
return sum_ ## suf();\ |
}\ |
static int sad16_x2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ |
{\ |
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ |
"pxor %%mm6, %%mm6 \n\t"\ |
"movq %0, %%mm5 \n\t"\ |
:: "m"(round_tab[1]) \ |
);\ |
\ |
sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\ |
sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\ |
\ |
return sum_ ## suf();\ |
}\ |
static int sad16_y2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ |
{\ |
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ |
"pxor %%mm6, %%mm6 \n\t"\ |
"movq %0, %%mm5 \n\t"\ |
:: "m"(round_tab[1]) \ |
);\ |
\ |
sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\ |
sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\ |
\ |
return sum_ ## suf();\ |
}\ |
static int sad16_xy2_ ## suf(void *v, uint8_t *blk2, uint8_t *blk1, int stride, int h)\ |
{\ |
__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\ |
"pxor %%mm6, %%mm6 \n\t"\ |
::);\ |
\ |
sad8_4_ ## suf(blk1 , blk2 , stride, h);\ |
sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\ |
\ |
return sum_ ## suf();\ |
} |
PIX_SAD(mmx) |
PIX_SAD(mmxext) |
#endif /* HAVE_INLINE_ASM */ |
av_cold void ff_dsputil_init_pix_mmx(DSPContext *c, AVCodecContext *avctx) |
{ |
#if HAVE_INLINE_ASM |
int cpu_flags = av_get_cpu_flags(); |
if (INLINE_MMX(cpu_flags)) { |
c->pix_abs[0][0] = sad16_mmx; |
c->pix_abs[0][1] = sad16_x2_mmx; |
c->pix_abs[0][2] = sad16_y2_mmx; |
c->pix_abs[0][3] = sad16_xy2_mmx; |
c->pix_abs[1][0] = sad8_mmx; |
c->pix_abs[1][1] = sad8_x2_mmx; |
c->pix_abs[1][2] = sad8_y2_mmx; |
c->pix_abs[1][3] = sad8_xy2_mmx; |
c->sad[0]= sad16_mmx; |
c->sad[1]= sad8_mmx; |
} |
if (INLINE_MMXEXT(cpu_flags)) { |
c->pix_abs[0][0] = sad16_mmxext; |
c->pix_abs[1][0] = sad8_mmxext; |
c->sad[0] = sad16_mmxext; |
c->sad[1] = sad8_mmxext; |
if(!(avctx->flags & CODEC_FLAG_BITEXACT)){ |
c->pix_abs[0][1] = sad16_x2_mmxext; |
c->pix_abs[0][2] = sad16_y2_mmxext; |
c->pix_abs[0][3] = sad16_xy2_mmxext; |
c->pix_abs[1][1] = sad8_x2_mmxext; |
c->pix_abs[1][2] = sad8_y2_mmxext; |
c->pix_abs[1][3] = sad8_xy2_mmxext; |
} |
} |
if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != AV_CODEC_ID_SNOW) { |
c->sad[0]= sad16_sse2; |
} |
#endif /* HAVE_INLINE_ASM */ |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/mpeg4qpel.asm |
---|
0,0 → 1,560 |
;****************************************************************************** |
;* mpeg4 qpel |
;* Copyright (c) 2003 Michael Niedermayer <michaelni@gmx.at> |
;* Copyright (c) 2008 Loren Merritt |
;* Copyright (c) 2013 Daniel Kang |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
cextern pb_1 |
cextern pw_3 |
cextern pw_15 |
cextern pw_16 |
cextern pw_20 |
SECTION_TEXT |
; put_no_rnd_pixels8_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
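; Editor's note (illustrative): the "no rounding" average below is |
; obtained from pavgb, which computes (a+b+1)>>1, by complementing the |
; inputs and the output with m6 = all-ones, since |
;   ~pavgb(~a, ~b) = (a+b)>>1  (floor instead of round-to-nearest-up). |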
%macro PUT_NO_RND_PIXELS8_L2 0 |
cglobal put_no_rnd_pixels8_l2, 6,6 |
movsxdifnidn r4, r4d |
movsxdifnidn r3, r3d |
pcmpeqb m6, m6 |
test r5d, 1 |
je .loop |
mova m0, [r1] |
mova m1, [r2] |
add r1, r4 |
add r2, 8 |
pxor m0, m6 |
pxor m1, m6 |
PAVGB m0, m1 |
pxor m0, m6 |
mova [r0], m0 |
add r0, r3 |
dec r5d |
.loop: |
mova m0, [r1] |
add r1, r4 |
mova m1, [r1] |
add r1, r4 |
mova m2, [r2] |
mova m3, [r2+8] |
pxor m0, m6 |
pxor m1, m6 |
pxor m2, m6 |
pxor m3, m6 |
PAVGB m0, m2 |
PAVGB m1, m3 |
pxor m0, m6 |
pxor m1, m6 |
mova [r0], m0 |
add r0, r3 |
mova [r0], m1 |
add r0, r3 |
mova m0, [r1] |
add r1, r4 |
mova m1, [r1] |
add r1, r4 |
mova m2, [r2+16] |
mova m3, [r2+24] |
pxor m0, m6 |
pxor m1, m6 |
pxor m2, m6 |
pxor m3, m6 |
PAVGB m0, m2 |
PAVGB m1, m3 |
pxor m0, m6 |
pxor m1, m6 |
mova [r0], m0 |
add r0, r3 |
mova [r0], m1 |
add r0, r3 |
add r2, 32 |
sub r5d, 4 |
jne .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PUT_NO_RND_PIXELS8_L2 |
; put_no_rnd_pixels16_l2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
%macro PUT_NO_RND_PIXELS16_l2 0 |
cglobal put_no_rnd_pixels16_l2, 6,6 |
movsxdifnidn r3, r3d |
movsxdifnidn r4, r4d |
pcmpeqb m6, m6 |
test r5d, 1 |
je .loop |
mova m0, [r1] |
mova m1, [r1+8] |
mova m2, [r2] |
mova m3, [r2+8] |
pxor m0, m6 |
pxor m1, m6 |
pxor m2, m6 |
pxor m3, m6 |
PAVGB m0, m2 |
PAVGB m1, m3 |
pxor m0, m6 |
pxor m1, m6 |
add r1, r4 |
add r2, 16 |
mova [r0], m0 |
mova [r0+8], m1 |
add r0, r3 |
dec r5d |
.loop: |
mova m0, [r1] |
mova m1, [r1+8] |
add r1, r4 |
mova m2, [r2] |
mova m3, [r2+8] |
pxor m0, m6 |
pxor m1, m6 |
pxor m2, m6 |
pxor m3, m6 |
PAVGB m0, m2 |
PAVGB m1, m3 |
pxor m0, m6 |
pxor m1, m6 |
mova [r0], m0 |
mova [r0+8], m1 |
add r0, r3 |
mova m0, [r1] |
mova m1, [r1+8] |
add r1, r4 |
mova m2, [r2+16] |
mova m3, [r2+24] |
pxor m0, m6 |
pxor m1, m6 |
pxor m2, m6 |
pxor m3, m6 |
PAVGB m0, m2 |
PAVGB m1, m3 |
pxor m0, m6 |
pxor m1, m6 |
mova [r0], m0 |
mova [r0+8], m1 |
add r0, r3 |
add r2, 32 |
sub r5d, 2 |
jne .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PUT_NO_RND_PIXELS16_l2 |
INIT_MMX 3dnow |
PUT_NO_RND_PIXELS16_l2 |
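; Editor's note (illustrative): the lowpass macros below implement the |
; MPEG-4 quarter-pel 8-tap half-sample filter |
;   out = clip((20*(s0+s1) - 6*(s-1+s2) + 3*(s-2+s3) - (s-3+s4) + R) >> 5) |
; (block-edge mirroring not shown), with R = 16 for the rounding put/avg |
; variants (pw_16) and R = 15 for put_no_rnd (pw_15); only the 20 and 3 |
; multipliers are needed, the 6 and 1 taps being formed by doubling and |
; adding. |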
%macro MPEG4_QPEL16_H_LOWPASS 1 |
cglobal %1_mpeg4_qpel16_h_lowpass, 5, 5, 0, 16 |
movsxdifnidn r2, r2d |
movsxdifnidn r3, r3d |
pxor m7, m7 |
.loop: |
mova m0, [r1] |
mova m1, m0 |
mova m2, m0 |
punpcklbw m0, m7 |
punpckhbw m1, m7 |
pshufw m5, m0, 0x90 |
pshufw m6, m0, 0x41 |
mova m3, m2 |
mova m4, m2 |
psllq m2, 8 |
psllq m3, 16 |
psllq m4, 24 |
punpckhbw m2, m7 |
punpckhbw m3, m7 |
punpckhbw m4, m7 |
paddw m5, m3 |
paddw m6, m2 |
paddw m5, m5 |
psubw m6, m5 |
pshufw m5, m0, 6 |
pmullw m6, [pw_3] |
paddw m0, m4 |
paddw m5, m1 |
pmullw m0, [pw_20] |
psubw m0, m5 |
paddw m6, [PW_ROUND] |
paddw m0, m6 |
psraw m0, 5 |
mova [rsp+8], m0 |
mova m0, [r1+5] |
mova m5, m0 |
mova m6, m0 |
psrlq m0, 8 |
psrlq m5, 16 |
punpcklbw m0, m7 |
punpcklbw m5, m7 |
paddw m2, m0 |
paddw m3, m5 |
paddw m2, m2 |
psubw m3, m2 |
mova m2, m6 |
psrlq m6, 24 |
punpcklbw m2, m7 |
punpcklbw m6, m7 |
pmullw m3, [pw_3] |
paddw m1, m2 |
paddw m4, m6 |
pmullw m1, [pw_20] |
psubw m3, m4 |
paddw m1, [PW_ROUND] |
paddw m3, m1 |
psraw m3, 5 |
mova m1, [rsp+8] |
packuswb m1, m3 |
OP_MOV [r0], m1, m4 |
mova m1, [r1+9] |
mova m4, m1 |
mova m3, m1 |
psrlq m1, 8 |
psrlq m4, 16 |
punpcklbw m1, m7 |
punpcklbw m4, m7 |
paddw m5, m1 |
paddw m0, m4 |
paddw m5, m5 |
psubw m0, m5 |
mova m5, m3 |
psrlq m3, 24 |
pmullw m0, [pw_3] |
punpcklbw m3, m7 |
paddw m2, m3 |
psubw m0, m2 |
mova m2, m5 |
punpcklbw m2, m7 |
punpckhbw m5, m7 |
paddw m6, m2 |
pmullw m6, [pw_20] |
paddw m0, [PW_ROUND] |
paddw m0, m6 |
psraw m0, 5 |
paddw m3, m5 |
pshufw m6, m5, 0xf9 |
paddw m6, m4 |
pshufw m4, m5, 0xbe |
pshufw m5, m5, 0x6f |
paddw m4, m1 |
paddw m5, m2 |
paddw m6, m6 |
psubw m4, m6 |
pmullw m3, [pw_20] |
pmullw m4, [pw_3] |
psubw m3, m5 |
paddw m4, [PW_ROUND] |
paddw m4, m3 |
psraw m4, 5 |
packuswb m0, m4 |
OP_MOV [r0+8], m0, m4 |
add r1, r3 |
add r0, r2 |
dec r4d |
jne .loop |
REP_RET |
%endmacro |
%macro PUT_OP 2-3 |
mova %1, %2 |
%endmacro |
%macro AVG_OP 2-3 |
mova %3, %1 |
pavgb %2, %3 |
mova %1, %2 |
%endmacro |
INIT_MMX mmxext |
%define PW_ROUND pw_16 |
%define OP_MOV PUT_OP |
MPEG4_QPEL16_H_LOWPASS put |
%define PW_ROUND pw_16 |
%define OP_MOV AVG_OP |
MPEG4_QPEL16_H_LOWPASS avg |
%define PW_ROUND pw_15 |
%define OP_MOV PUT_OP |
MPEG4_QPEL16_H_LOWPASS put_no_rnd |
%macro MPEG4_QPEL8_H_LOWPASS 1 |
cglobal %1_mpeg4_qpel8_h_lowpass, 5, 5, 0, 8 |
movsxdifnidn r2, r2d |
movsxdifnidn r3, r3d |
pxor m7, m7 |
.loop: |
mova m0, [r1] |
mova m1, m0 |
mova m2, m0 |
punpcklbw m0, m7 |
punpckhbw m1, m7 |
pshufw m5, m0, 0x90 |
pshufw m6, m0, 0x41 |
mova m3, m2 |
mova m4, m2 |
psllq m2, 8 |
psllq m3, 16 |
psllq m4, 24 |
punpckhbw m2, m7 |
punpckhbw m3, m7 |
punpckhbw m4, m7 |
paddw m5, m3 |
paddw m6, m2 |
paddw m5, m5 |
psubw m6, m5 |
pshufw m5, m0, 0x6 |
pmullw m6, [pw_3] |
paddw m0, m4 |
paddw m5, m1 |
pmullw m0, [pw_20] |
psubw m0, m5 |
paddw m6, [PW_ROUND] |
paddw m0, m6 |
psraw m0, 5 |
movh m5, [r1+5] |
punpcklbw m5, m7 |
pshufw m6, m5, 0xf9 |
paddw m1, m5 |
paddw m2, m6 |
pshufw m6, m5, 0xbe |
pshufw m5, m5, 0x6f |
paddw m3, m6 |
paddw m4, m5 |
paddw m2, m2 |
psubw m3, m2 |
pmullw m1, [pw_20] |
pmullw m3, [pw_3] |
psubw m3, m4 |
paddw m1, [PW_ROUND] |
paddw m3, m1 |
psraw m3, 5 |
packuswb m0, m3 |
OP_MOV [r0], m0, m4 |
add r1, r3 |
add r0, r2 |
dec r4d |
jne .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
%define PW_ROUND pw_16 |
%define OP_MOV PUT_OP |
MPEG4_QPEL8_H_LOWPASS put |
%define PW_ROUND pw_16 |
%define OP_MOV AVG_OP |
MPEG4_QPEL8_H_LOWPASS avg |
%define PW_ROUND pw_15 |
%define OP_MOV PUT_OP |
MPEG4_QPEL8_H_LOWPASS put_no_rnd |
%macro QPEL_V_LOW 5 |
paddw m0, m1 |
mova m4, [pw_20] |
pmullw m4, m0 |
mova m0, %4 |
mova m5, %1 |
paddw m5, m0 |
psubw m4, m5 |
mova m5, %2 |
mova m6, %3 |
paddw m5, m3 |
paddw m6, m2 |
paddw m6, m6 |
psubw m5, m6 |
pmullw m5, [pw_3] |
paddw m4, [PW_ROUND] |
paddw m5, m4 |
psraw m5, 5 |
packuswb m5, m5 |
OP_MOV %5, m5, m7 |
SWAP 0,1,2,3 |
%endmacro |
%macro MPEG4_QPEL16_V_LOWPASS 1 |
cglobal %1_mpeg4_qpel16_v_lowpass, 4, 6, 0, 544 |
movsxdifnidn r2, r2d |
movsxdifnidn r3, r3d |
mov r4d, 17 |
mov r5, rsp |
pxor m7, m7 |
.looph: |
mova m0, [r1] |
mova m1, [r1] |
mova m2, [r1+8] |
mova m3, [r1+8] |
punpcklbw m0, m7 |
punpckhbw m1, m7 |
punpcklbw m2, m7 |
punpckhbw m3, m7 |
mova [r5], m0 |
mova [r5+0x88], m1 |
mova [r5+0x110], m2 |
mova [r5+0x198], m3 |
add r5, 8 |
add r1, r3 |
dec r4d |
jne .looph |
; NOTE: r1 CHANGES VALUES: r1 -> 4 - 14*dstStride |
mov r4d, 4 |
mov r1, 4 |
neg r2 |
lea r1, [r1+r2*8] |
lea r1, [r1+r2*4] |
lea r1, [r1+r2*2] |
neg r2 |
mov r5, rsp |
.loopv: |
pxor m7, m7 |
mova m0, [r5+ 0x0] |
mova m1, [r5+ 0x8] |
mova m2, [r5+0x10] |
mova m3, [r5+0x18] |
QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0] |
QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2] |
lea r0, [r0+r2*2] |
QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0] |
QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2] |
lea r0, [r0+r2*2] |
QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0] |
QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x48], [r0+r2] |
lea r0, [r0+r2*2] |
QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x50], [r0] |
QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x58], [r0+r2] |
lea r0, [r0+r2*2] |
QPEL_V_LOW [r5+0x28], [r5+0x30], [r5+0x38], [r5+0x60], [r0] |
QPEL_V_LOW [r5+0x30], [r5+0x38], [r5+0x40], [r5+0x68], [r0+r2] |
lea r0, [r0+r2*2] |
QPEL_V_LOW [r5+0x38], [r5+0x40], [r5+0x48], [r5+0x70], [r0] |
QPEL_V_LOW [r5+0x40], [r5+0x48], [r5+0x50], [r5+0x78], [r0+r2] |
lea r0, [r0+r2*2] |
QPEL_V_LOW [r5+0x48], [r5+0x50], [r5+0x58], [r5+0x80], [r0] |
QPEL_V_LOW [r5+0x50], [r5+0x58], [r5+0x60], [r5+0x80], [r0+r2] |
lea r0, [r0+r2*2] |
QPEL_V_LOW [r5+0x58], [r5+0x60], [r5+0x68], [r5+0x78], [r0] |
QPEL_V_LOW [r5+0x60], [r5+0x68], [r5+0x70], [r5+0x70], [r0+r2] |
add r5, 0x88 |
add r0, r1 |
dec r4d |
jne .loopv |
REP_RET |
%endmacro |
%macro PUT_OPH 2-3 |
movh %1, %2 |
%endmacro |
%macro AVG_OPH 2-3 |
movh %3, %1 |
pavgb %2, %3 |
movh %1, %2 |
%endmacro |
INIT_MMX mmxext |
%define PW_ROUND pw_16 |
%define OP_MOV PUT_OPH |
MPEG4_QPEL16_V_LOWPASS put |
%define PW_ROUND pw_16 |
%define OP_MOV AVG_OPH |
MPEG4_QPEL16_V_LOWPASS avg |
%define PW_ROUND pw_15 |
%define OP_MOV PUT_OPH |
MPEG4_QPEL16_V_LOWPASS put_no_rnd |
%macro MPEG4_QPEL8_V_LOWPASS 1 |
cglobal %1_mpeg4_qpel8_v_lowpass, 4, 6, 0, 288 |
movsxdifnidn r2, r2d |
movsxdifnidn r3, r3d |
mov r4d, 9 |
mov r5, rsp |
pxor m7, m7 |
.looph: |
mova m0, [r1] |
mova m1, [r1] |
punpcklbw m0, m7 |
punpckhbw m1, m7 |
mova [r5], m0 |
mova [r5+0x48], m1 |
add r5, 8 |
add r1, r3 |
dec r4d |
jne .looph |
; NOTE: r1 CHANGES VALUES: r1 -> 4 - 6*dstStride |
mov r4d, 2 |
mov r1, 4 |
neg r2 |
lea r1, [r1+r2*4] |
lea r1, [r1+r2*2] |
neg r2 |
mov r5, rsp |
.loopv: |
pxor m7, m7 |
mova m0, [r5+ 0x0] |
mova m1, [r5+ 0x8] |
mova m2, [r5+0x10] |
mova m3, [r5+0x18] |
QPEL_V_LOW [r5+0x10], [r5+ 0x8], [r5+ 0x0], [r5+0x20], [r0] |
QPEL_V_LOW [r5+ 0x8], [r5+ 0x0], [r5+ 0x0], [r5+0x28], [r0+r2] |
lea r0, [r0+r2*2] |
QPEL_V_LOW [r5+ 0x0], [r5+ 0x0], [r5+ 0x8], [r5+0x30], [r0] |
QPEL_V_LOW [r5+ 0x0], [r5+ 0x8], [r5+0x10], [r5+0x38], [r0+r2] |
lea r0, [r0+r2*2] |
QPEL_V_LOW [r5+ 0x8], [r5+0x10], [r5+0x18], [r5+0x40], [r0] |
QPEL_V_LOW [r5+0x10], [r5+0x18], [r5+0x20], [r5+0x40], [r0+r2] |
lea r0, [r0+r2*2] |
QPEL_V_LOW [r5+0x18], [r5+0x20], [r5+0x28], [r5+0x38], [r0] |
QPEL_V_LOW [r5+0x20], [r5+0x28], [r5+0x30], [r5+0x30], [r0+r2] |
add r5, 0x48 |
add r0, r1 |
dec r4d |
jne .loopv |
REP_RET |
%endmacro |
INIT_MMX mmxext |
%define PW_ROUND pw_16 |
%define OP_MOV PUT_OPH |
MPEG4_QPEL8_V_LOWPASS put |
%define PW_ROUND pw_16 |
%define OP_MOV AVG_OPH |
MPEG4_QPEL8_V_LOWPASS avg |
%define PW_ROUND pw_15 |
%define OP_MOV PUT_OPH |
MPEG4_QPEL8_V_LOWPASS put_no_rnd |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/mpegaudiodsp.c |
---|
0,0 → 1,277 |
/* |
* MMX optimized MP3 decoding functions |
* Copyright (c) 2010 Vitor Sessak |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/attributes.h" |
#include "libavutil/cpu.h" |
#include "libavutil/internal.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/mpegaudiodsp.h" |
#define DECL(CPU)\ |
static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\ |
void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win); |
DECL(sse) |
DECL(sse2) |
DECL(sse3) |
DECL(ssse3) |
DECL(avx) |
void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win, |
float *tmpbuf); |
void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win, |
float *tmpbuf); |
DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40]; |
#if HAVE_SSE2_INLINE |
#define MACS(rt, ra, rb) rt+=(ra)*(rb) |
#define MLSS(rt, ra, rb) rt-=(ra)*(rb) |
#define SUM8(op, sum, w, p) \ |
{ \ |
op(sum, (w)[0 * 64], (p)[0 * 64]); \ |
op(sum, (w)[1 * 64], (p)[1 * 64]); \ |
op(sum, (w)[2 * 64], (p)[2 * 64]); \ |
op(sum, (w)[3 * 64], (p)[3 * 64]); \ |
op(sum, (w)[4 * 64], (p)[4 * 64]); \ |
op(sum, (w)[5 * 64], (p)[5 * 64]); \ |
op(sum, (w)[6 * 64], (p)[6 * 64]); \ |
op(sum, (w)[7 * 64], (p)[7 * 64]); \ |
} |
static void apply_window(const float *buf, const float *win1, |
const float *win2, float *sum1, float *sum2, int len) |
{ |
x86_reg count = - 4*len; |
const float *win1a = win1+len; |
const float *win2a = win2+len; |
const float *bufa = buf+len; |
float *sum1a = sum1+len; |
float *sum2a = sum2+len; |
#define MULT(a, b) \ |
"movaps " #a "(%1,%0), %%xmm1 \n\t" \ |
"movaps " #a "(%3,%0), %%xmm2 \n\t" \ |
"mulps %%xmm2, %%xmm1 \n\t" \ |
"subps %%xmm1, %%xmm0 \n\t" \ |
"mulps " #b "(%2,%0), %%xmm2 \n\t" \ |
"subps %%xmm2, %%xmm4 \n\t" \ |
__asm__ volatile( |
"1: \n\t" |
"xorps %%xmm0, %%xmm0 \n\t" |
"xorps %%xmm4, %%xmm4 \n\t" |
MULT( 0, 0) |
MULT( 256, 64) |
MULT( 512, 128) |
MULT( 768, 192) |
MULT(1024, 256) |
MULT(1280, 320) |
MULT(1536, 384) |
MULT(1792, 448) |
"movaps %%xmm0, (%4,%0) \n\t" |
"movaps %%xmm4, (%5,%0) \n\t" |
"add $16, %0 \n\t" |
"jl 1b \n\t" |
:"+&r"(count) |
:"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a) |
); |
#undef MULT |
} |
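/* Editor's scalar sketch of the loop above (illustrative; the strides |
 * are read off the MULT() byte offsets, so treat them as an |
 * assumption): buf and win1 advance in steps of 64 floats per tap, |
 * win2 in steps of 16, and both sums accumulate negated products: */ |
#if 0 |
static void apply_window_ref(const float *buf, const float *win1, |
                             const float *win2, float *sum1, |
                             float *sum2, int len) |
{ |
    for (int i = 0; i < len; i++) { |
        float s1 = 0.0f, s2 = 0.0f; |
        for (int j = 0; j < 8; j++) { |
            s1 -= buf[i + 64 * j] * win1[i + 64 * j]; |
            s2 -= buf[i + 64 * j] * win2[i + 16 * j]; |
        } |
        sum1[i] = s1; |
        sum2[i] = s2; |
    } |
} |
#endif |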
static void apply_window_mp3(float *in, float *win, int *unused, float *out, |
int incr) |
{ |
LOCAL_ALIGNED_16(float, suma, [17]); |
LOCAL_ALIGNED_16(float, sumb, [17]); |
LOCAL_ALIGNED_16(float, sumc, [17]); |
LOCAL_ALIGNED_16(float, sumd, [17]); |
float sum; |
/* copy to avoid wrap */ |
__asm__ volatile( |
"movaps 0(%0), %%xmm0 \n\t" \ |
"movaps 16(%0), %%xmm1 \n\t" \ |
"movaps 32(%0), %%xmm2 \n\t" \ |
"movaps 48(%0), %%xmm3 \n\t" \ |
"movaps %%xmm0, 0(%1) \n\t" \ |
"movaps %%xmm1, 16(%1) \n\t" \ |
"movaps %%xmm2, 32(%1) \n\t" \ |
"movaps %%xmm3, 48(%1) \n\t" \ |
"movaps 64(%0), %%xmm0 \n\t" \ |
"movaps 80(%0), %%xmm1 \n\t" \ |
"movaps 96(%0), %%xmm2 \n\t" \ |
"movaps 112(%0), %%xmm3 \n\t" \ |
"movaps %%xmm0, 64(%1) \n\t" \ |
"movaps %%xmm1, 80(%1) \n\t" \ |
"movaps %%xmm2, 96(%1) \n\t" \ |
"movaps %%xmm3, 112(%1) \n\t" |
::"r"(in), "r"(in+512) |
:"memory" |
); |
apply_window(in + 16, win , win + 512, suma, sumc, 16); |
apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16); |
SUM8(MACS, suma[0], win + 32, in + 48); |
sumc[ 0] = 0; |
sumb[16] = 0; |
sumd[16] = 0; |
#define SUMS(suma, sumb, sumc, sumd, out1, out2) \ |
"movups " #sumd "(%4), %%xmm0 \n\t" \ |
"shufps $0x1b, %%xmm0, %%xmm0 \n\t" \ |
"subps " #suma "(%1), %%xmm0 \n\t" \ |
"movaps %%xmm0," #out1 "(%0) \n\t" \ |
\ |
"movups " #sumc "(%3), %%xmm0 \n\t" \ |
"shufps $0x1b, %%xmm0, %%xmm0 \n\t" \ |
"addps " #sumb "(%2), %%xmm0 \n\t" \ |
"movaps %%xmm0," #out2 "(%0) \n\t" |
if (incr == 1) { |
__asm__ volatile( |
SUMS( 0, 48, 4, 52, 0, 112) |
SUMS(16, 32, 20, 36, 16, 96) |
SUMS(32, 16, 36, 20, 32, 80) |
SUMS(48, 0, 52, 4, 48, 64) |
:"+&r"(out) |
:"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0]) |
:"memory" |
); |
out += 16*incr; |
} else { |
int j; |
float *out2 = out + 32 * incr; |
out[0 ] = -suma[ 0]; |
out += incr; |
out2 -= incr; |
for(j=1;j<16;j++) { |
*out = -suma[ j] + sumd[16-j]; |
*out2 = sumb[16-j] + sumc[ j]; |
out += incr; |
out2 -= incr; |
} |
} |
sum = 0; |
SUM8(MLSS, sum, win + 16 + 32, in + 32); |
*out = sum; |
} |
#endif /* HAVE_SSE2_INLINE */ |
#if HAVE_YASM |
#define DECL_IMDCT_BLOCKS(CPU1, CPU2) \ |
static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in, \ |
int count, int switch_point, int block_type) \ |
{ \ |
int align_end = count - (count & 3); \ |
int j; \ |
for (j = 0; j < align_end; j+= 4) { \ |
LOCAL_ALIGNED_16(float, tmpbuf, [1024]); \ |
float *win = mdct_win_sse[switch_point && j < 4][block_type]; \ |
/* apply window & overlap with previous buffer */ \ |
\ |
/* select window */ \ |
ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf); \ |
in += 4*18; \ |
buf += 4*18; \ |
out += 4; \ |
} \ |
for (; j < count; j++) { \ |
/* apply window & overlap with previous buffer */ \ |
\ |
/* select window */ \ |
int win_idx = (switch_point && j < 2) ? 0 : block_type; \ |
float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))]; \ |
\ |
ff_imdct36_float_ ## CPU1(out, buf, in, win); \ |
\ |
in += 18; \ |
buf++; \ |
out++; \ |
} \ |
} |
#if HAVE_SSE |
DECL_IMDCT_BLOCKS(sse,sse) |
DECL_IMDCT_BLOCKS(sse2,sse) |
DECL_IMDCT_BLOCKS(sse3,sse) |
DECL_IMDCT_BLOCKS(ssse3,sse) |
#endif |
#if HAVE_AVX_EXTERNAL |
DECL_IMDCT_BLOCKS(avx,avx) |
#endif |
#endif /* HAVE_YASM */ |
av_cold void ff_mpadsp_init_x86(MPADSPContext *s) |
{ |
int cpu_flags = av_get_cpu_flags(); |
int i, j; |
for (j = 0; j < 4; j++) { |
for (i = 0; i < 40; i ++) { |
mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i]; |
mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i]; |
mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i]; |
mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; |
mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i]; |
mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i]; |
mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i]; |
mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i]; |
} |
} |
#if HAVE_SSE2_INLINE |
if (cpu_flags & AV_CPU_FLAG_SSE2) { |
s->apply_window_float = apply_window_mp3; |
} |
#endif /* HAVE_SSE2_INLINE */ |
#if HAVE_YASM |
if (EXTERNAL_SSE(cpu_flags)) { |
s->imdct36_blocks_float = imdct36_blocks_sse; |
} |
if (EXTERNAL_SSE2(cpu_flags)) { |
s->imdct36_blocks_float = imdct36_blocks_sse2; |
} |
if (EXTERNAL_SSE3(cpu_flags)) { |
s->imdct36_blocks_float = imdct36_blocks_sse3; |
} |
if (EXTERNAL_SSSE3(cpu_flags)) { |
s->imdct36_blocks_float = imdct36_blocks_ssse3; |
} |
if (EXTERNAL_AVX(cpu_flags)) { |
s->imdct36_blocks_float = imdct36_blocks_avx; |
} |
#endif /* HAVE_YASM */ |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/mpegvideo.c |
---|
0,0 → 1,577 |
/* |
* Optimized for ia32 CPUs by Nick Kurshev <nickols_k@mail.ru> |
* h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer <michaelni@gmx.at> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/attributes.h" |
#include "libavutil/cpu.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/avcodec.h" |
#include "libavcodec/mpegvideo.h" |
#include "dsputil_x86.h" |
#if HAVE_MMX_INLINE |
static void dct_unquantize_h263_intra_mmx(MpegEncContext *s, |
int16_t *block, int n, int qscale) |
{ |
x86_reg level, qmul, qadd, nCoeffs; |
qmul = qscale << 1; |
av_assert2(s->block_last_index[n]>=0 || s->h263_aic); |
if (!s->h263_aic) { |
if (n < 4) |
level = block[0] * s->y_dc_scale; |
else |
level = block[0] * s->c_dc_scale; |
qadd = (qscale - 1) | 1; |
}else{ |
qadd = 0; |
level= block[0]; |
} |
if(s->ac_pred) |
nCoeffs=63; |
else |
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; |
__asm__ volatile( |
"movd %1, %%mm6 \n\t" //qmul |
"packssdw %%mm6, %%mm6 \n\t" |
"packssdw %%mm6, %%mm6 \n\t" |
"movd %2, %%mm5 \n\t" //qadd |
"pxor %%mm7, %%mm7 \n\t" |
"packssdw %%mm5, %%mm5 \n\t" |
"packssdw %%mm5, %%mm5 \n\t" |
"psubw %%mm5, %%mm7 \n\t" |
"pxor %%mm4, %%mm4 \n\t" |
".p2align 4 \n\t" |
"1: \n\t" |
"movq (%0, %3), %%mm0 \n\t" |
"movq 8(%0, %3), %%mm1 \n\t" |
"pmullw %%mm6, %%mm0 \n\t" |
"pmullw %%mm6, %%mm1 \n\t" |
"movq (%0, %3), %%mm2 \n\t" |
"movq 8(%0, %3), %%mm3 \n\t" |
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 |
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 |
"pxor %%mm2, %%mm0 \n\t" |
"pxor %%mm3, %%mm1 \n\t" |
"paddw %%mm7, %%mm0 \n\t" |
"paddw %%mm7, %%mm1 \n\t" |
"pxor %%mm0, %%mm2 \n\t" |
"pxor %%mm1, %%mm3 \n\t" |
"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 |
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 |
"pandn %%mm2, %%mm0 \n\t" |
"pandn %%mm3, %%mm1 \n\t" |
"movq %%mm0, (%0, %3) \n\t" |
"movq %%mm1, 8(%0, %3) \n\t" |
"add $16, %3 \n\t" |
"jng 1b \n\t" |
::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) |
: "memory" |
); |
block[0]= level; |
} |
static void dct_unquantize_h263_inter_mmx(MpegEncContext *s, |
int16_t *block, int n, int qscale) |
{ |
x86_reg qmul, qadd, nCoeffs; |
qmul = qscale << 1; |
qadd = (qscale - 1) | 1; |
av_assert2(s->block_last_index[n]>=0 || s->h263_aic); |
nCoeffs= s->inter_scantable.raster_end[ s->block_last_index[n] ]; |
__asm__ volatile( |
"movd %1, %%mm6 \n\t" //qmul |
"packssdw %%mm6, %%mm6 \n\t" |
"packssdw %%mm6, %%mm6 \n\t" |
"movd %2, %%mm5 \n\t" //qadd |
"pxor %%mm7, %%mm7 \n\t" |
"packssdw %%mm5, %%mm5 \n\t" |
"packssdw %%mm5, %%mm5 \n\t" |
"psubw %%mm5, %%mm7 \n\t" |
"pxor %%mm4, %%mm4 \n\t" |
".p2align 4 \n\t" |
"1: \n\t" |
"movq (%0, %3), %%mm0 \n\t" |
"movq 8(%0, %3), %%mm1 \n\t" |
"pmullw %%mm6, %%mm0 \n\t" |
"pmullw %%mm6, %%mm1 \n\t" |
"movq (%0, %3), %%mm2 \n\t" |
"movq 8(%0, %3), %%mm3 \n\t" |
"pcmpgtw %%mm4, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 |
"pcmpgtw %%mm4, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 |
"pxor %%mm2, %%mm0 \n\t" |
"pxor %%mm3, %%mm1 \n\t" |
"paddw %%mm7, %%mm0 \n\t" |
"paddw %%mm7, %%mm1 \n\t" |
"pxor %%mm0, %%mm2 \n\t" |
"pxor %%mm1, %%mm3 \n\t" |
"pcmpeqw %%mm7, %%mm0 \n\t" // block[i] == 0 ? -1 : 0 |
"pcmpeqw %%mm7, %%mm1 \n\t" // block[i] == 0 ? -1 : 0 |
"pandn %%mm2, %%mm0 \n\t" |
"pandn %%mm3, %%mm1 \n\t" |
"movq %%mm0, (%0, %3) \n\t" |
"movq %%mm1, 8(%0, %3) \n\t" |
"add $16, %3 \n\t" |
"jng 1b \n\t" |
::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs)) |
: "memory" |
); |
} |
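/* Editor's scalar sketch (illustrative): both H.263 dequantizers above |
 * apply, to every nonzero coefficient up to nCoeffs, |
 *     level = level * qmul + (level > 0 ? qadd : -qadd) |
 * the intra variant additionally recomputes the DC value separately: */ |
#if 0 |
static void dct_unquantize_h263_ref(int16_t *block, int nCoeffs, |
                                    int qmul, int qadd) |
{ |
    for (int i = 0; i <= nCoeffs; i++) { |
        int level = block[i]; |
        if (level) |
            block[i] = level < 0 ? level * qmul - qadd |
                                 : level * qmul + qadd; |
    } |
} |
#endif |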
static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s, |
int16_t *block, int n, int qscale) |
{ |
x86_reg nCoeffs; |
const uint16_t *quant_matrix; |
int block0; |
av_assert2(s->block_last_index[n]>=0); |
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; |
if (n < 4) |
block0 = block[0] * s->y_dc_scale; |
else |
block0 = block[0] * s->c_dc_scale; |
/* XXX: only mpeg1 */ |
quant_matrix = s->intra_matrix; |
__asm__ volatile( |
"pcmpeqw %%mm7, %%mm7 \n\t" |
"psrlw $15, %%mm7 \n\t" |
"movd %2, %%mm6 \n\t" |
"packssdw %%mm6, %%mm6 \n\t" |
"packssdw %%mm6, %%mm6 \n\t" |
"mov %3, %%"REG_a" \n\t" |
".p2align 4 \n\t" |
"1: \n\t" |
"movq (%0, %%"REG_a"), %%mm0 \n\t" |
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" |
"movq (%1, %%"REG_a"), %%mm4 \n\t" |
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" |
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] |
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] |
"pxor %%mm2, %%mm2 \n\t" |
"pxor %%mm3, %%mm3 \n\t" |
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 |
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 |
"pxor %%mm2, %%mm0 \n\t" |
"pxor %%mm3, %%mm1 \n\t" |
"psubw %%mm2, %%mm0 \n\t" // abs(block[i]) |
"psubw %%mm3, %%mm1 \n\t" // abs(block[i]) |
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q |
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q |
"pxor %%mm4, %%mm4 \n\t" |
"pxor %%mm5, %%mm5 \n\t" // FIXME slow |
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 |
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 |
"psraw $3, %%mm0 \n\t" |
"psraw $3, %%mm1 \n\t" |
"psubw %%mm7, %%mm0 \n\t" |
"psubw %%mm7, %%mm1 \n\t" |
"por %%mm7, %%mm0 \n\t" |
"por %%mm7, %%mm1 \n\t" |
"pxor %%mm2, %%mm0 \n\t" |
"pxor %%mm3, %%mm1 \n\t" |
"psubw %%mm2, %%mm0 \n\t" |
"psubw %%mm3, %%mm1 \n\t" |
"pandn %%mm0, %%mm4 \n\t" |
"pandn %%mm1, %%mm5 \n\t" |
"movq %%mm4, (%0, %%"REG_a") \n\t" |
"movq %%mm5, 8(%0, %%"REG_a") \n\t" |
"add $16, %%"REG_a" \n\t" |
"js 1b \n\t" |
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) |
: "%"REG_a, "memory" |
); |
block[0]= block0; |
} |
static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s, |
int16_t *block, int n, int qscale) |
{ |
x86_reg nCoeffs; |
const uint16_t *quant_matrix; |
av_assert2(s->block_last_index[n]>=0); |
nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]+1; |
quant_matrix = s->inter_matrix; |
__asm__ volatile( |
"pcmpeqw %%mm7, %%mm7 \n\t" |
"psrlw $15, %%mm7 \n\t" |
"movd %2, %%mm6 \n\t" |
"packssdw %%mm6, %%mm6 \n\t" |
"packssdw %%mm6, %%mm6 \n\t" |
"mov %3, %%"REG_a" \n\t" |
".p2align 4 \n\t" |
"1: \n\t" |
"movq (%0, %%"REG_a"), %%mm0 \n\t" |
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" |
"movq (%1, %%"REG_a"), %%mm4 \n\t" |
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" |
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] |
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] |
"pxor %%mm2, %%mm2 \n\t" |
"pxor %%mm3, %%mm3 \n\t" |
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 |
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 |
"pxor %%mm2, %%mm0 \n\t" |
"pxor %%mm3, %%mm1 \n\t" |
"psubw %%mm2, %%mm0 \n\t" // abs(block[i]) |
"psubw %%mm3, %%mm1 \n\t" // abs(block[i]) |
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 |
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 |
"paddw %%mm7, %%mm0 \n\t" // abs(block[i])*2 + 1 |
"paddw %%mm7, %%mm1 \n\t" // abs(block[i])*2 + 1 |
"pmullw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q |
"pmullw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q |
"pxor %%mm4, %%mm4 \n\t" |
"pxor %%mm5, %%mm5 \n\t" // FIXME slow |
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 |
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 |
"psraw $4, %%mm0 \n\t" |
"psraw $4, %%mm1 \n\t" |
"psubw %%mm7, %%mm0 \n\t" |
"psubw %%mm7, %%mm1 \n\t" |
"por %%mm7, %%mm0 \n\t" |
"por %%mm7, %%mm1 \n\t" |
"pxor %%mm2, %%mm0 \n\t" |
"pxor %%mm3, %%mm1 \n\t" |
"psubw %%mm2, %%mm0 \n\t" |
"psubw %%mm3, %%mm1 \n\t" |
"pandn %%mm0, %%mm4 \n\t" |
"pandn %%mm1, %%mm5 \n\t" |
"movq %%mm4, (%0, %%"REG_a") \n\t" |
"movq %%mm5, 8(%0, %%"REG_a") \n\t" |
"add $16, %%"REG_a" \n\t" |
"js 1b \n\t" |
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) |
: "%"REG_a, "memory" |
); |
} |
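/* Editor's scalar sketch (illustrative): with q = qscale * |
 * quant_matrix[i], the two MPEG-1 loops above compute per nonzero |
 * coefficient |
 *     intra: level = (|x| * q) >> 3 |
 *     inter: level = ((2*|x| + 1) * q) >> 4 |
 * then force the result odd and restore the sign (the SIMD pmullw |
 * wraps at 16 bits, so this shows the intended arithmetic rather than |
 * a bit-exact model): */ |
#if 0 |
static int mpeg1_dequant_one(int x, int q, int inter) |
{ |
    int a     = x < 0 ? -x : x; |
    int level = inter ? ((2 * a + 1) * q) >> 4 : (a * q) >> 3; |
    level = (level - 1) | 1;      /* mismatch: force an odd value */ |
    return x < 0 ? -level : level; |
} |
#endif |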
static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s, |
int16_t *block, int n, int qscale) |
{ |
x86_reg nCoeffs; |
const uint16_t *quant_matrix; |
int block0; |
av_assert2(s->block_last_index[n]>=0); |
if(s->alternate_scan) nCoeffs= 63; //FIXME |
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; |
if (n < 4) |
block0 = block[0] * s->y_dc_scale; |
else |
block0 = block[0] * s->c_dc_scale; |
quant_matrix = s->intra_matrix; |
__asm__ volatile( |
"pcmpeqw %%mm7, %%mm7 \n\t" |
"psrlw $15, %%mm7 \n\t" |
"movd %2, %%mm6 \n\t" |
"packssdw %%mm6, %%mm6 \n\t" |
"packssdw %%mm6, %%mm6 \n\t" |
"mov %3, %%"REG_a" \n\t" |
".p2align 4 \n\t" |
"1: \n\t" |
"movq (%0, %%"REG_a"), %%mm0 \n\t" |
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" |
"movq (%1, %%"REG_a"), %%mm4 \n\t" |
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" |
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] |
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] |
"pxor %%mm2, %%mm2 \n\t" |
"pxor %%mm3, %%mm3 \n\t" |
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 |
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 |
"pxor %%mm2, %%mm0 \n\t" |
"pxor %%mm3, %%mm1 \n\t" |
"psubw %%mm2, %%mm0 \n\t" // abs(block[i]) |
"psubw %%mm3, %%mm1 \n\t" // abs(block[i]) |
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*q |
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*q |
"pxor %%mm4, %%mm4 \n\t" |
"pxor %%mm5, %%mm5 \n\t" // FIXME slow |
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 |
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 |
"psraw $3, %%mm0 \n\t" |
"psraw $3, %%mm1 \n\t" |
"pxor %%mm2, %%mm0 \n\t" |
"pxor %%mm3, %%mm1 \n\t" |
"psubw %%mm2, %%mm0 \n\t" |
"psubw %%mm3, %%mm1 \n\t" |
"pandn %%mm0, %%mm4 \n\t" |
"pandn %%mm1, %%mm5 \n\t" |
"movq %%mm4, (%0, %%"REG_a") \n\t" |
"movq %%mm5, 8(%0, %%"REG_a") \n\t" |
"add $16, %%"REG_a" \n\t" |
"jng 1b \n\t" |
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs) |
: "%"REG_a, "memory" |
); |
block[0]= block0; |
//Note: we do not do mismatch control for intra blocks, as errors cannot accumulate there |
} |
static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s, |
int16_t *block, int n, int qscale) |
{ |
x86_reg nCoeffs; |
const uint16_t *quant_matrix; |
av_assert2(s->block_last_index[n]>=0); |
if(s->alternate_scan) nCoeffs= 63; //FIXME |
else nCoeffs= s->intra_scantable.raster_end[ s->block_last_index[n] ]; |
quant_matrix = s->inter_matrix; |
__asm__ volatile( |
"pcmpeqw %%mm7, %%mm7 \n\t" |
"psrlq $48, %%mm7 \n\t" |
"movd %2, %%mm6 \n\t" |
"packssdw %%mm6, %%mm6 \n\t" |
"packssdw %%mm6, %%mm6 \n\t" |
"mov %3, %%"REG_a" \n\t" |
".p2align 4 \n\t" |
"1: \n\t" |
"movq (%0, %%"REG_a"), %%mm0 \n\t" |
"movq 8(%0, %%"REG_a"), %%mm1 \n\t" |
"movq (%1, %%"REG_a"), %%mm4 \n\t" |
"movq 8(%1, %%"REG_a"), %%mm5 \n\t" |
"pmullw %%mm6, %%mm4 \n\t" // q=qscale*quant_matrix[i] |
"pmullw %%mm6, %%mm5 \n\t" // q=qscale*quant_matrix[i] |
"pxor %%mm2, %%mm2 \n\t" |
"pxor %%mm3, %%mm3 \n\t" |
"pcmpgtw %%mm0, %%mm2 \n\t" // block[i] < 0 ? -1 : 0 |
"pcmpgtw %%mm1, %%mm3 \n\t" // block[i] < 0 ? -1 : 0 |
"pxor %%mm2, %%mm0 \n\t" |
"pxor %%mm3, %%mm1 \n\t" |
"psubw %%mm2, %%mm0 \n\t" // abs(block[i]) |
"psubw %%mm3, %%mm1 \n\t" // abs(block[i]) |
"paddw %%mm0, %%mm0 \n\t" // abs(block[i])*2 |
"paddw %%mm1, %%mm1 \n\t" // abs(block[i])*2 |
"pmullw %%mm4, %%mm0 \n\t" // abs(block[i])*2*q |
"pmullw %%mm5, %%mm1 \n\t" // abs(block[i])*2*q |
"paddw %%mm4, %%mm0 \n\t" // (abs(block[i])*2 + 1)*q |
"paddw %%mm5, %%mm1 \n\t" // (abs(block[i])*2 + 1)*q |
"pxor %%mm4, %%mm4 \n\t" |
"pxor %%mm5, %%mm5 \n\t" // FIXME slow |
"pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0 |
"pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0 |
"psrlw $4, %%mm0 \n\t" |
"psrlw $4, %%mm1 \n\t" |
"pxor %%mm2, %%mm0 \n\t" |
"pxor %%mm3, %%mm1 \n\t" |
"psubw %%mm2, %%mm0 \n\t" |
"psubw %%mm3, %%mm1 \n\t" |
"pandn %%mm0, %%mm4 \n\t" |
"pandn %%mm1, %%mm5 \n\t" |
"pxor %%mm4, %%mm7 \n\t" |
"pxor %%mm5, %%mm7 \n\t" |
"movq %%mm4, (%0, %%"REG_a") \n\t" |
"movq %%mm5, 8(%0, %%"REG_a") \n\t" |
"add $16, %%"REG_a" \n\t" |
"jng 1b \n\t" |
"movd 124(%0, %3), %%mm0 \n\t" |
"movq %%mm7, %%mm6 \n\t" |
"psrlq $32, %%mm7 \n\t" |
"pxor %%mm6, %%mm7 \n\t" |
"movq %%mm7, %%mm6 \n\t" |
"psrlq $16, %%mm7 \n\t" |
"pxor %%mm6, %%mm7 \n\t" |
"pslld $31, %%mm7 \n\t" |
"psrlq $15, %%mm7 \n\t" |
"pxor %%mm7, %%mm0 \n\t" |
"movd %%mm0, 124(%0, %3) \n\t" |
::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs) |
: "%"REG_a, "memory" |
); |
} |
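/* Editor's note on the tail above (illustrative): MPEG-2 mismatch |
 * control requires the sum of all 64 dequantized coefficients to be |
 * odd.  %mm7 accumulates the XOR of every output word; the closing |
 * psrlq/pxor sequence folds that into a single parity bit, which is |
 * XORed into the LSB of coefficient 63 (byte offset 124) so that an |
 * even sum is toggled odd. */ |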
static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){ |
const int intra= s->mb_intra; |
int *sum= s->dct_error_sum[intra]; |
uint16_t *offset= s->dct_offset[intra]; |
s->dct_count[intra]++; |
__asm__ volatile( |
"pxor %%mm7, %%mm7 \n\t" |
"1: \n\t" |
"pxor %%mm0, %%mm0 \n\t" |
"pxor %%mm1, %%mm1 \n\t" |
"movq (%0), %%mm2 \n\t" |
"movq 8(%0), %%mm3 \n\t" |
"pcmpgtw %%mm2, %%mm0 \n\t" |
"pcmpgtw %%mm3, %%mm1 \n\t" |
"pxor %%mm0, %%mm2 \n\t" |
"pxor %%mm1, %%mm3 \n\t" |
"psubw %%mm0, %%mm2 \n\t" |
"psubw %%mm1, %%mm3 \n\t" |
"movq %%mm2, %%mm4 \n\t" |
"movq %%mm3, %%mm5 \n\t" |
"psubusw (%2), %%mm2 \n\t" |
"psubusw 8(%2), %%mm3 \n\t" |
"pxor %%mm0, %%mm2 \n\t" |
"pxor %%mm1, %%mm3 \n\t" |
"psubw %%mm0, %%mm2 \n\t" |
"psubw %%mm1, %%mm3 \n\t" |
"movq %%mm2, (%0) \n\t" |
"movq %%mm3, 8(%0) \n\t" |
"movq %%mm4, %%mm2 \n\t" |
"movq %%mm5, %%mm3 \n\t" |
"punpcklwd %%mm7, %%mm4 \n\t" |
"punpckhwd %%mm7, %%mm2 \n\t" |
"punpcklwd %%mm7, %%mm5 \n\t" |
"punpckhwd %%mm7, %%mm3 \n\t" |
"paddd (%1), %%mm4 \n\t" |
"paddd 8(%1), %%mm2 \n\t" |
"paddd 16(%1), %%mm5 \n\t" |
"paddd 24(%1), %%mm3 \n\t" |
"movq %%mm4, (%1) \n\t" |
"movq %%mm2, 8(%1) \n\t" |
"movq %%mm5, 16(%1) \n\t" |
"movq %%mm3, 24(%1) \n\t" |
"add $16, %0 \n\t" |
"add $32, %1 \n\t" |
"add $16, %2 \n\t" |
"cmp %3, %0 \n\t" |
" jb 1b \n\t" |
: "+r" (block), "+r" (sum), "+r" (offset) |
: "r"(block+64) |
); |
} |
static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){ |
const int intra= s->mb_intra; |
int *sum= s->dct_error_sum[intra]; |
uint16_t *offset= s->dct_offset[intra]; |
s->dct_count[intra]++; |
__asm__ volatile( |
"pxor %%xmm7, %%xmm7 \n\t" |
"1: \n\t" |
"pxor %%xmm0, %%xmm0 \n\t" |
"pxor %%xmm1, %%xmm1 \n\t" |
"movdqa (%0), %%xmm2 \n\t" |
"movdqa 16(%0), %%xmm3 \n\t" |
"pcmpgtw %%xmm2, %%xmm0 \n\t" |
"pcmpgtw %%xmm3, %%xmm1 \n\t" |
"pxor %%xmm0, %%xmm2 \n\t" |
"pxor %%xmm1, %%xmm3 \n\t" |
"psubw %%xmm0, %%xmm2 \n\t" |
"psubw %%xmm1, %%xmm3 \n\t" |
"movdqa %%xmm2, %%xmm4 \n\t" |
"movdqa %%xmm3, %%xmm5 \n\t" |
"psubusw (%2), %%xmm2 \n\t" |
"psubusw 16(%2), %%xmm3 \n\t" |
"pxor %%xmm0, %%xmm2 \n\t" |
"pxor %%xmm1, %%xmm3 \n\t" |
"psubw %%xmm0, %%xmm2 \n\t" |
"psubw %%xmm1, %%xmm3 \n\t" |
"movdqa %%xmm2, (%0) \n\t" |
"movdqa %%xmm3, 16(%0) \n\t" |
"movdqa %%xmm4, %%xmm6 \n\t" |
"movdqa %%xmm5, %%xmm0 \n\t" |
"punpcklwd %%xmm7, %%xmm4 \n\t" |
"punpckhwd %%xmm7, %%xmm6 \n\t" |
"punpcklwd %%xmm7, %%xmm5 \n\t" |
"punpckhwd %%xmm7, %%xmm0 \n\t" |
"paddd (%1), %%xmm4 \n\t" |
"paddd 16(%1), %%xmm6 \n\t" |
"paddd 32(%1), %%xmm5 \n\t" |
"paddd 48(%1), %%xmm0 \n\t" |
"movdqa %%xmm4, (%1) \n\t" |
"movdqa %%xmm6, 16(%1) \n\t" |
"movdqa %%xmm5, 32(%1) \n\t" |
"movdqa %%xmm0, 48(%1) \n\t" |
"add $32, %0 \n\t" |
"add $64, %1 \n\t" |
"add $32, %2 \n\t" |
"cmp %3, %0 \n\t" |
" jb 1b \n\t" |
: "+r" (block), "+r" (sum), "+r" (offset) |
: "r"(block+64) |
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", |
"%xmm4", "%xmm5", "%xmm6", "%xmm7") |
); |
} |
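/* Editor's scalar sketch of both denoise_dct versions above |
 * (illustrative): each coefficient magnitude is shrunk toward zero by |
 * a per-position offset, saturating at 0 (psubusw), while the |
 * pre-shrink magnitudes are accumulated into the error sum: */ |
#if 0 |
static void denoise_dct_ref(int16_t *block, int *sum, |
                            const uint16_t *offset) |
{ |
    for (int i = 0; i < 64; i++) { |
        int level = block[i]; |
        int a     = level < 0 ? -level : level; |
        int s     = a - offset[i];      /* shrink toward zero */ |
        if (s < 0) |
            s = 0; |
        block[i] = level < 0 ? -s : s; |
        sum[i]  += a; |
    } |
} |
#endif |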
#endif /* HAVE_MMX_INLINE */ |
av_cold void ff_MPV_common_init_x86(MpegEncContext *s) |
{ |
#if HAVE_MMX_INLINE |
int cpu_flags = av_get_cpu_flags(); |
if (INLINE_MMX(cpu_flags)) { |
s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; |
s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; |
s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; |
s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; |
if(!(s->flags & CODEC_FLAG_BITEXACT)) |
s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; |
s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; |
s->denoise_dct = denoise_dct_mmx; |
} |
if (INLINE_SSE2(cpu_flags)) { |
s->denoise_dct = denoise_dct_sse2; |
} |
#endif /* HAVE_MMX_INLINE */ |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/mpegvideoenc.c |
---|
0,0 → 1,107 |
/* |
* The simplest mpeg encoder (well, it was the simplest!) |
* Copyright (c) 2000,2001 Fabrice Bellard |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/attributes.h" |
#include "libavutil/cpu.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/avcodec.h" |
#include "libavcodec/dct.h" |
#include "libavcodec/mpegvideo.h" |
#include "dsputil_x86.h" |
extern uint16_t ff_inv_zigzag_direct16[64]; |
#if HAVE_MMX_INLINE |
#define COMPILE_TEMPLATE_MMXEXT 0 |
#define COMPILE_TEMPLATE_SSE2 0 |
#define COMPILE_TEMPLATE_SSSE3 0 |
#define RENAME(a) a ## _MMX |
#define RENAMEl(a) a ## _mmx |
#include "mpegvideoenc_template.c" |
#endif /* HAVE_MMX_INLINE */ |
#if HAVE_MMXEXT_INLINE |
#undef COMPILE_TEMPLATE_SSSE3 |
#undef COMPILE_TEMPLATE_SSE2 |
#undef COMPILE_TEMPLATE_MMXEXT |
#define COMPILE_TEMPLATE_MMXEXT 1 |
#define COMPILE_TEMPLATE_SSE2 0 |
#define COMPILE_TEMPLATE_SSSE3 0 |
#undef RENAME |
#undef RENAMEl |
#define RENAME(a) a ## _MMXEXT |
#define RENAMEl(a) a ## _mmxext |
#include "mpegvideoenc_template.c" |
#endif /* HAVE_MMXEXT_INLINE */ |
#if HAVE_SSE2_INLINE |
#undef COMPILE_TEMPLATE_MMXEXT |
#undef COMPILE_TEMPLATE_SSE2 |
#undef COMPILE_TEMPLATE_SSSE3 |
#define COMPILE_TEMPLATE_MMXEXT 0 |
#define COMPILE_TEMPLATE_SSE2 1 |
#define COMPILE_TEMPLATE_SSSE3 0 |
#undef RENAME |
#undef RENAMEl |
#define RENAME(a) a ## _SSE2 |
#define RENAMEl(a) a ## _sse2 |
#include "mpegvideoenc_template.c" |
#endif /* HAVE_SSE2_INLINE */ |
#if HAVE_SSSE3_INLINE |
#undef COMPILE_TEMPLATE_MMXEXT |
#undef COMPILE_TEMPLATE_SSE2 |
#undef COMPILE_TEMPLATE_SSSE3 |
#define COMPILE_TEMPLATE_MMXEXT 0 |
#define COMPILE_TEMPLATE_SSE2 1 |
#define COMPILE_TEMPLATE_SSSE3 1 |
#undef RENAME |
#undef RENAMEl |
#define RENAME(a) a ## _SSSE3 |
#define RENAMEl(a) a ## _sse2 |
#include "mpegvideoenc_template.c" |
#endif /* HAVE_SSSE3_INLINE */ |
av_cold void ff_dct_encode_init_x86(MpegEncContext *s) |
{ |
const int dct_algo = s->avctx->dct_algo; |
if (dct_algo == FF_DCT_AUTO || dct_algo == FF_DCT_MMX) { |
#if HAVE_MMX_INLINE |
int cpu_flags = av_get_cpu_flags(); |
if (INLINE_MMX(cpu_flags)) |
s->dct_quantize = dct_quantize_MMX; |
#endif |
#if HAVE_MMXEXT_INLINE |
if (INLINE_MMXEXT(cpu_flags)) |
s->dct_quantize = dct_quantize_MMXEXT; |
#endif |
#if HAVE_SSE2_INLINE |
if (INLINE_SSE2(cpu_flags)) |
s->dct_quantize = dct_quantize_SSE2; |
#endif |
#if HAVE_SSSE3_INLINE |
if (INLINE_SSSE3(cpu_flags)) |
s->dct_quantize = dct_quantize_SSSE3; |
#endif |
} |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/mpegvideoenc_template.c |
---|
0,0 → 1,364 |
/* |
* MPEG video MMX templates |
* |
* Copyright (c) 2002 Michael Niedermayer <michaelni@gmx.at> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#undef MMREG_WIDTH |
#undef MM |
#undef MOVQ |
#undef SPREADW |
#undef PMAXW |
#undef PMAX |
#undef SAVE_SIGN |
#undef RESTORE_SIGN |
#if COMPILE_TEMPLATE_SSE2 |
#define MMREG_WIDTH "16" |
#define MM "%%xmm" |
#define MOVQ "movdqa" |
#define SPREADW(a) \ |
"pshuflw $0, "a", "a" \n\t"\ |
"punpcklwd "a", "a" \n\t" |
#define PMAXW(a,b) "pmaxsw "a", "b" \n\t" |
#define PMAX(a,b) \ |
"movhlps "a", "b" \n\t"\ |
PMAXW(b, a)\ |
"pshuflw $0x0E, "a", "b" \n\t"\ |
PMAXW(b, a)\ |
"pshuflw $0x01, "a", "b" \n\t"\ |
PMAXW(b, a) |
#else |
#define MMREG_WIDTH "8" |
#define MM "%%mm" |
#define MOVQ "movq" |
#if COMPILE_TEMPLATE_MMXEXT |
#define SPREADW(a) "pshufw $0, "a", "a" \n\t" |
#define PMAXW(a,b) "pmaxsw "a", "b" \n\t" |
#define PMAX(a,b) \ |
"pshufw $0x0E, "a", "b" \n\t"\ |
PMAXW(b, a)\ |
"pshufw $0x01, "a", "b" \n\t"\ |
PMAXW(b, a) |
#else |
#define SPREADW(a) \ |
"punpcklwd "a", "a" \n\t"\ |
"punpcklwd "a", "a" \n\t" |
#define PMAXW(a,b) \ |
"psubusw "a", "b" \n\t"\ |
"paddw "a", "b" \n\t" |
#define PMAX(a,b) \ |
"movq "a", "b" \n\t"\ |
"psrlq $32, "a" \n\t"\ |
PMAXW(b, a)\ |
"movq "a", "b" \n\t"\ |
"psrlq $16, "a" \n\t"\ |
PMAXW(b, a) |
#endif |
#endif |
#if COMPILE_TEMPLATE_SSSE3 |
#define SAVE_SIGN(a,b) \ |
"movdqa "b", "a" \n\t"\ |
"pabsw "b", "b" \n\t" |
#define RESTORE_SIGN(a,b) \ |
"psignw "a", "b" \n\t" |
#else |
#define SAVE_SIGN(a,b) \ |
"pxor "a", "a" \n\t"\ |
"pcmpgtw "b", "a" \n\t" /* block[i] <= 0 ? 0xFF : 0x00 */\ |
"pxor "a", "b" \n\t"\ |
"psubw "a", "b" \n\t" /* ABS(block[i]) */ |
#define RESTORE_SIGN(a,b) \ |
"pxor "a", "b" \n\t"\ |
"psubw "a", "b" \n\t" // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) |
#endif |
static int RENAME(dct_quantize)(MpegEncContext *s, |
int16_t *block, int n, |
int qscale, int *overflow) |
{ |
x86_reg last_non_zero_p1; |
int level=0, q; //=0 is because gcc says uninitialized ... |
const uint16_t *qmat, *bias; |
LOCAL_ALIGNED_16(int16_t, temp_block, [64]); |
av_assert2((7&(int)(&temp_block[0])) == 0); //did gcc align it correctly? |
//s->fdct (block); |
RENAMEl(ff_fdct) (block); //cannot be anything else ... |
if(s->dct_error_sum) |
s->denoise_dct(s, block); |
if (s->mb_intra) { |
int dummy; |
if (n < 4){ |
q = s->y_dc_scale; |
bias = s->q_intra_matrix16[qscale][1]; |
qmat = s->q_intra_matrix16[qscale][0]; |
}else{ |
q = s->c_dc_scale; |
bias = s->q_chroma_intra_matrix16[qscale][1]; |
qmat = s->q_chroma_intra_matrix16[qscale][0]; |
} |
/* note: block[0] is assumed to be positive */ |
if (!s->h263_aic) { |
__asm__ volatile ( |
"mul %%ecx \n\t" |
: "=d" (level), "=a"(dummy) |
: "a" ((block[0]>>2) + q), "c" (ff_inverse[q<<1]) |
); |
} else |
/* For AIC we skip quant/dequant of INTRADC */ |
level = (block[0] + 4)>>3; |
block[0]=0; //avoid fake overflow |
// temp_block[0] = (block[0] + (q >> 1)) / q; |
last_non_zero_p1 = 1; |
} else { |
last_non_zero_p1 = 0; |
bias = s->q_inter_matrix16[qscale][1]; |
qmat = s->q_inter_matrix16[qscale][0]; |
} |
if((s->out_format == FMT_H263 || s->out_format == FMT_H261) && s->mpeg_quant==0){ |
__asm__ volatile( |
"movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1 |
SPREADW(MM"3") |
"pxor "MM"7, "MM"7 \n\t" // 0 |
"pxor "MM"4, "MM"4 \n\t" // 0 |
MOVQ" (%2), "MM"5 \n\t" // qmat[0] |
"pxor "MM"6, "MM"6 \n\t" |
"psubw (%3), "MM"6 \n\t" // -bias[0] |
"mov $-128, %%"REG_a" \n\t" |
".p2align 4 \n\t" |
"1: \n\t" |
MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i] |
SAVE_SIGN(MM"1", MM"0") // ABS(block[i]) |
"psubusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0] |
"pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16 |
"por "MM"0, "MM"4 \n\t" |
RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) |
MOVQ" "MM"0, (%5, %%"REG_a") \n\t" |
"pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00 |
MOVQ" (%4, %%"REG_a"), "MM"1 \n\t" |
MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0 |
"pandn "MM"1, "MM"0 \n\t" |
PMAXW(MM"0", MM"3") |
"add $"MMREG_WIDTH", %%"REG_a" \n\t" |
" js 1b \n\t" |
PMAX(MM"3", MM"0") |
"movd "MM"3, %%"REG_a" \n\t" |
"movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 |
: "+a" (last_non_zero_p1) |
: "r" (block+64), "r" (qmat), "r" (bias), |
"r" (ff_inv_zigzag_direct16+64), "r" (temp_block+64) |
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", |
"%xmm4", "%xmm5", "%xmm6", "%xmm7") |
); |
}else{ // MPEG-style per-coefficient quant matrix |
__asm__ volatile( |
"movd %%"REG_a", "MM"3 \n\t" // last_non_zero_p1 |
SPREADW(MM"3") |
"pxor "MM"7, "MM"7 \n\t" // 0 |
"pxor "MM"4, "MM"4 \n\t" // 0 |
"mov $-128, %%"REG_a" \n\t" |
".p2align 4 \n\t" |
"1: \n\t" |
MOVQ" (%1, %%"REG_a"), "MM"0 \n\t" // block[i] |
SAVE_SIGN(MM"1", MM"0") // ABS(block[i]) |
MOVQ" (%3, %%"REG_a"), "MM"6 \n\t" // bias[0] |
"paddusw "MM"6, "MM"0 \n\t" // ABS(block[i]) + bias[0] |
MOVQ" (%2, %%"REG_a"), "MM"5 \n\t" // qmat[i] |
"pmulhw "MM"5, "MM"0 \n\t" // (ABS(block[i])*qmat[0] + bias[0]*qmat[0])>>16 |
"por "MM"0, "MM"4 \n\t" |
RESTORE_SIGN(MM"1", MM"0") // out=((ABS(block[i])*qmat[0] - bias[0]*qmat[0])>>16)*sign(block[i]) |
MOVQ" "MM"0, (%5, %%"REG_a") \n\t" |
"pcmpeqw "MM"7, "MM"0 \n\t" // out==0 ? 0xFF : 0x00 |
MOVQ" (%4, %%"REG_a"), "MM"1 \n\t" |
MOVQ" "MM"7, (%1, %%"REG_a") \n\t" // 0 |
"pandn "MM"1, "MM"0 \n\t" |
PMAXW(MM"0", MM"3") |
"add $"MMREG_WIDTH", %%"REG_a" \n\t" |
" js 1b \n\t" |
PMAX(MM"3", MM"0") |
"movd "MM"3, %%"REG_a" \n\t" |
"movzb %%al, %%"REG_a" \n\t" // last_non_zero_p1 |
: "+a" (last_non_zero_p1) |
: "r" (block+64), "r" (qmat+64), "r" (bias+64), |
"r" (ff_inv_zigzag_direct16+64), "r" (temp_block+64) |
XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3", |
"%xmm4", "%xmm5", "%xmm6", "%xmm7") |
); |
} |
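/* Editor's sketch of the per-coefficient-matrix (else) branch above |
 * (illustrative; variable names as in this function).  The first |
 * branch is analogous with a single broadcast qmat[0]/bias[0]: */ |
#if 0 |
    for (int i = 0; i < 64; i++) { |
        int x     = block[i]; |
        int a     = x < 0 ? -x : x; |
        int level = ((a + bias[i]) * qmat[i]) >> 16; |
        temp_block[i] = x < 0 ? -level : level; |
        block[i]      = 0; |
        if (level) |
            last_non_zero_p1 = FFMAX(last_non_zero_p1, |
                                     ff_inv_zigzag_direct16[i]); |
    } |
#endif |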
__asm__ volatile( |
"movd %1, "MM"1 \n\t" // max_qcoeff |
SPREADW(MM"1") |
"psubusw "MM"1, "MM"4 \n\t" |
"packuswb "MM"4, "MM"4 \n\t" |
#if COMPILE_TEMPLATE_SSE2 |
"packuswb "MM"4, "MM"4 \n\t" |
#endif |
"movd "MM"4, %0 \n\t" // *overflow |
: "=g" (*overflow) |
: "g" (s->max_qcoeff) |
); |
if(s->mb_intra) block[0]= level; |
else block[0]= temp_block[0]; |
if(s->dsp.idct_permutation_type == FF_SIMPLE_IDCT_PERM){ |
if(last_non_zero_p1 <= 1) goto end; |
block[0x08] = temp_block[0x01]; block[0x10] = temp_block[0x08]; |
block[0x20] = temp_block[0x10]; |
if(last_non_zero_p1 <= 4) goto end; |
block[0x18] = temp_block[0x09]; block[0x04] = temp_block[0x02]; |
block[0x09] = temp_block[0x03]; |
if(last_non_zero_p1 <= 7) goto end; |
block[0x14] = temp_block[0x0A]; block[0x28] = temp_block[0x11]; |
block[0x12] = temp_block[0x18]; block[0x02] = temp_block[0x20]; |
if(last_non_zero_p1 <= 11) goto end; |
block[0x1A] = temp_block[0x19]; block[0x24] = temp_block[0x12]; |
block[0x19] = temp_block[0x0B]; block[0x01] = temp_block[0x04]; |
block[0x0C] = temp_block[0x05]; |
if(last_non_zero_p1 <= 16) goto end; |
block[0x11] = temp_block[0x0C]; block[0x29] = temp_block[0x13]; |
block[0x16] = temp_block[0x1A]; block[0x0A] = temp_block[0x21]; |
block[0x30] = temp_block[0x28]; block[0x22] = temp_block[0x30]; |
block[0x38] = temp_block[0x29]; block[0x06] = temp_block[0x22]; |
if(last_non_zero_p1 <= 24) goto end; |
block[0x1B] = temp_block[0x1B]; block[0x21] = temp_block[0x14]; |
block[0x1C] = temp_block[0x0D]; block[0x05] = temp_block[0x06]; |
block[0x0D] = temp_block[0x07]; block[0x15] = temp_block[0x0E]; |
block[0x2C] = temp_block[0x15]; block[0x13] = temp_block[0x1C]; |
if(last_non_zero_p1 <= 32) goto end; |
block[0x0B] = temp_block[0x23]; block[0x34] = temp_block[0x2A]; |
block[0x2A] = temp_block[0x31]; block[0x32] = temp_block[0x38]; |
block[0x3A] = temp_block[0x39]; block[0x26] = temp_block[0x32]; |
block[0x39] = temp_block[0x2B]; block[0x03] = temp_block[0x24]; |
if(last_non_zero_p1 <= 40) goto end; |
block[0x1E] = temp_block[0x1D]; block[0x25] = temp_block[0x16]; |
block[0x1D] = temp_block[0x0F]; block[0x2D] = temp_block[0x17]; |
block[0x17] = temp_block[0x1E]; block[0x0E] = temp_block[0x25]; |
block[0x31] = temp_block[0x2C]; block[0x2B] = temp_block[0x33]; |
if(last_non_zero_p1 <= 48) goto end; |
block[0x36] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B]; |
block[0x23] = temp_block[0x34]; block[0x3C] = temp_block[0x2D]; |
block[0x07] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; |
block[0x0F] = temp_block[0x27]; block[0x35] = temp_block[0x2E]; |
if(last_non_zero_p1 <= 56) goto end; |
block[0x2E] = temp_block[0x35]; block[0x33] = temp_block[0x3C]; |
block[0x3E] = temp_block[0x3D]; block[0x27] = temp_block[0x36]; |
block[0x3D] = temp_block[0x2F]; block[0x2F] = temp_block[0x37]; |
block[0x37] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; |
}else if(s->dsp.idct_permutation_type == FF_LIBMPEG2_IDCT_PERM){ |
if(last_non_zero_p1 <= 1) goto end; |
block[0x04] = temp_block[0x01]; |
block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; |
if(last_non_zero_p1 <= 4) goto end; |
block[0x0C] = temp_block[0x09]; block[0x01] = temp_block[0x02]; |
block[0x05] = temp_block[0x03]; |
if(last_non_zero_p1 <= 7) goto end; |
block[0x09] = temp_block[0x0A]; block[0x14] = temp_block[0x11]; |
block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20]; |
if(last_non_zero_p1 <= 11) goto end; |
block[0x1C] = temp_block[0x19]; |
block[0x11] = temp_block[0x12]; block[0x0D] = temp_block[0x0B]; |
block[0x02] = temp_block[0x04]; block[0x06] = temp_block[0x05]; |
if(last_non_zero_p1 <= 16) goto end; |
block[0x0A] = temp_block[0x0C]; block[0x15] = temp_block[0x13]; |
block[0x19] = temp_block[0x1A]; block[0x24] = temp_block[0x21]; |
block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30]; |
block[0x2C] = temp_block[0x29]; block[0x21] = temp_block[0x22]; |
if(last_non_zero_p1 <= 24) goto end; |
block[0x1D] = temp_block[0x1B]; block[0x12] = temp_block[0x14]; |
block[0x0E] = temp_block[0x0D]; block[0x03] = temp_block[0x06]; |
block[0x07] = temp_block[0x07]; block[0x0B] = temp_block[0x0E]; |
block[0x16] = temp_block[0x15]; block[0x1A] = temp_block[0x1C]; |
if(last_non_zero_p1 <= 32) goto end; |
block[0x25] = temp_block[0x23]; block[0x29] = temp_block[0x2A]; |
block[0x34] = temp_block[0x31]; block[0x38] = temp_block[0x38]; |
block[0x3C] = temp_block[0x39]; block[0x31] = temp_block[0x32]; |
block[0x2D] = temp_block[0x2B]; block[0x22] = temp_block[0x24]; |
if(last_non_zero_p1 <= 40) goto end; |
block[0x1E] = temp_block[0x1D]; block[0x13] = temp_block[0x16]; |
block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17]; |
block[0x1B] = temp_block[0x1E]; block[0x26] = temp_block[0x25]; |
block[0x2A] = temp_block[0x2C]; block[0x35] = temp_block[0x33]; |
if(last_non_zero_p1 <= 48) goto end; |
block[0x39] = temp_block[0x3A]; block[0x3D] = temp_block[0x3B]; |
block[0x32] = temp_block[0x34]; block[0x2E] = temp_block[0x2D]; |
block[0x23] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; |
block[0x27] = temp_block[0x27]; block[0x2B] = temp_block[0x2E]; |
if(last_non_zero_p1 <= 56) goto end; |
block[0x36] = temp_block[0x35]; block[0x3A] = temp_block[0x3C]; |
block[0x3E] = temp_block[0x3D]; block[0x33] = temp_block[0x36]; |
block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37]; |
block[0x3B] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; |
}else{ |
if(last_non_zero_p1 <= 1) goto end; |
block[0x01] = temp_block[0x01]; |
block[0x08] = temp_block[0x08]; block[0x10] = temp_block[0x10]; |
if(last_non_zero_p1 <= 4) goto end; |
block[0x09] = temp_block[0x09]; block[0x02] = temp_block[0x02]; |
block[0x03] = temp_block[0x03]; |
if(last_non_zero_p1 <= 7) goto end; |
block[0x0A] = temp_block[0x0A]; block[0x11] = temp_block[0x11]; |
block[0x18] = temp_block[0x18]; block[0x20] = temp_block[0x20]; |
if(last_non_zero_p1 <= 11) goto end; |
block[0x19] = temp_block[0x19]; |
block[0x12] = temp_block[0x12]; block[0x0B] = temp_block[0x0B]; |
block[0x04] = temp_block[0x04]; block[0x05] = temp_block[0x05]; |
if(last_non_zero_p1 <= 16) goto end; |
block[0x0C] = temp_block[0x0C]; block[0x13] = temp_block[0x13]; |
block[0x1A] = temp_block[0x1A]; block[0x21] = temp_block[0x21]; |
block[0x28] = temp_block[0x28]; block[0x30] = temp_block[0x30]; |
block[0x29] = temp_block[0x29]; block[0x22] = temp_block[0x22]; |
if(last_non_zero_p1 <= 24) goto end; |
block[0x1B] = temp_block[0x1B]; block[0x14] = temp_block[0x14]; |
block[0x0D] = temp_block[0x0D]; block[0x06] = temp_block[0x06]; |
block[0x07] = temp_block[0x07]; block[0x0E] = temp_block[0x0E]; |
block[0x15] = temp_block[0x15]; block[0x1C] = temp_block[0x1C]; |
if(last_non_zero_p1 <= 32) goto end; |
block[0x23] = temp_block[0x23]; block[0x2A] = temp_block[0x2A]; |
block[0x31] = temp_block[0x31]; block[0x38] = temp_block[0x38]; |
block[0x39] = temp_block[0x39]; block[0x32] = temp_block[0x32]; |
block[0x2B] = temp_block[0x2B]; block[0x24] = temp_block[0x24]; |
if(last_non_zero_p1 <= 40) goto end; |
block[0x1D] = temp_block[0x1D]; block[0x16] = temp_block[0x16]; |
block[0x0F] = temp_block[0x0F]; block[0x17] = temp_block[0x17]; |
block[0x1E] = temp_block[0x1E]; block[0x25] = temp_block[0x25]; |
block[0x2C] = temp_block[0x2C]; block[0x33] = temp_block[0x33]; |
if(last_non_zero_p1 <= 48) goto end; |
block[0x3A] = temp_block[0x3A]; block[0x3B] = temp_block[0x3B]; |
block[0x34] = temp_block[0x34]; block[0x2D] = temp_block[0x2D]; |
block[0x26] = temp_block[0x26]; block[0x1F] = temp_block[0x1F]; |
block[0x27] = temp_block[0x27]; block[0x2E] = temp_block[0x2E]; |
if(last_non_zero_p1 <= 56) goto end; |
block[0x35] = temp_block[0x35]; block[0x3C] = temp_block[0x3C]; |
block[0x3D] = temp_block[0x3D]; block[0x36] = temp_block[0x36]; |
block[0x2F] = temp_block[0x2F]; block[0x37] = temp_block[0x37]; |
block[0x3E] = temp_block[0x3E]; block[0x3F] = temp_block[0x3F]; |
} |
end: |
return last_non_zero_p1 - 1; |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/pngdsp.asm |
---|
0,0 → 1,173 |
;****************************************************************************** |
;* x86 optimizations for PNG decoding |
;* |
;* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu> |
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
cextern pw_255 |
SECTION_TEXT |
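; Editor's note (illustrative): add_bytes_l2 computes, per byte with |
; wraparound,  dst[i] = src1[i] + src2[i]  for i in [0, w), using a |
; two-register vector main loop, an 8-byte qword loop for the sse2 |
; leftover, and a scalar loop for the final remainder. |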
; %1 = nr. of xmm registers used |
%macro ADD_BYTES_FN 1 |
cglobal add_bytes_l2, 4, 6, %1, dst, src1, src2, wa, w, i |
%if ARCH_X86_64 |
movsxd waq, wad |
%endif |
xor iq, iq |
; vector loop |
mov wq, waq |
and waq, ~(mmsize*2-1) |
jmp .end_v |
.loop_v: |
mova m0, [src1q+iq] |
mova m1, [src1q+iq+mmsize] |
paddb m0, [src2q+iq] |
paddb m1, [src2q+iq+mmsize] |
mova [dstq+iq ], m0 |
mova [dstq+iq+mmsize], m1 |
add iq, mmsize*2 |
.end_v: |
cmp iq, waq |
jl .loop_v |
%if mmsize == 16 |
; qword (mmx-width) loop for the sse2 leftover |
mov waq, wq |
and waq, ~7 |
jmp .end_l |
.loop_l: |
movq mm0, [src1q+iq] |
paddb mm0, [src2q+iq] |
movq [dstq+iq ], mm0 |
add iq, 8 |
.end_l: |
cmp iq, waq |
jl .loop_l |
%endif |
; scalar loop for leftover |
jmp .end_s |
.loop_s: |
mov wab, [src1q+iq] |
add wab, [src2q+iq] |
mov [dstq+iq], wab |
inc iq |
.end_s: |
cmp iq, wq |
jl .loop_s |
REP_RET |
%endmacro |
%if ARCH_X86_32 |
INIT_MMX mmx |
ADD_BYTES_FN 0 |
%endif |
INIT_XMM sse2 |
ADD_BYTES_FN 2 |
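; Editor's note: the macro below applies the PNG Paeth predictor (per |
; the PNG specification): with p = left + up - upleft and |
;   pa = |p - left|, pb = |p - up|, pc = |p - upleft| |
; the predictor is the first of left/up/upleft with the smallest |
; distance, and each output byte is |
;   dst[i] = (src[i] + paeth(left, up, upleft)) & 0xFF |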
%macro ADD_PAETH_PRED_FN 1 |
cglobal add_png_paeth_prediction, 5, 7, %1, dst, src, top, w, bpp, end, cntr |
%if ARCH_X86_64 |
movsxd bppq, bppd |
movsxd wq, wd |
%endif |
lea endq, [dstq+wq-(mmsize/2-1)] |
sub topq, dstq |
sub srcq, dstq |
sub dstq, bppq |
pxor m7, m7 |
PUSH dstq |
lea cntrq, [bppq-1] |
shr cntrq, 2 + mmsize/16 |
.bpp_loop: |
lea dstq, [dstq+cntrq*(mmsize/2)] |
movh m0, [dstq] |
movh m1, [topq+dstq] |
punpcklbw m0, m7 |
punpcklbw m1, m7 |
add dstq, bppq |
.loop: |
mova m2, m1 |
movh m1, [topq+dstq] |
mova m3, m2 |
punpcklbw m1, m7 |
mova m4, m2 |
psubw m3, m1 |
psubw m4, m0 |
mova m5, m3 |
paddw m5, m4 |
%if cpuflag(ssse3) |
pabsw m3, m3 |
pabsw m4, m4 |
pabsw m5, m5 |
%else ; !cpuflag(ssse3) |
psubw m7, m5 |
pmaxsw m5, m7 |
pxor m6, m6 |
pxor m7, m7 |
psubw m6, m3 |
psubw m7, m4 |
pmaxsw m3, m6 |
pmaxsw m4, m7 |
pxor m7, m7 |
%endif ; cpuflag(ssse3) |
mova m6, m4 |
pminsw m6, m5 |
pcmpgtw m3, m6 |
pcmpgtw m4, m5 |
mova m6, m4 |
pand m4, m3 |
pandn m6, m3 |
pandn m3, m0 |
movh m0, [srcq+dstq] |
pand m6, m1 |
pand m2, m4 |
punpcklbw m0, m7 |
paddw m0, m6 |
paddw m3, m2 |
paddw m0, m3 |
pand m0, [pw_255] |
mova m3, m0 |
packuswb m3, m3 |
movh [dstq], m3 |
add dstq, bppq |
cmp dstq, endq |
jle .loop |
mov dstq, [rsp] |
dec cntrq |
jge .bpp_loop |
POP dstq |
RET |
%endmacro |
INIT_MMX mmxext |
ADD_PAETH_PRED_FN 0 |
INIT_MMX ssse3 |
ADD_PAETH_PRED_FN 0 |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/pngdsp_init.c |
---|
0,0 → 1,50 |
/* |
* x86 PNG optimizations. |
* Copyright (c) 2008 Loren Merritt <lorenm@u.washington.edu> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/attributes.h" |
#include "libavutil/common.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/pngdsp.h" |
void ff_add_png_paeth_prediction_mmxext(uint8_t *dst, uint8_t *src, |
uint8_t *top, int w, int bpp); |
void ff_add_png_paeth_prediction_ssse3(uint8_t *dst, uint8_t *src, |
uint8_t *top, int w, int bpp); |
void ff_add_bytes_l2_mmx (uint8_t *dst, uint8_t *src1, |
uint8_t *src2, int w); |
void ff_add_bytes_l2_sse2(uint8_t *dst, uint8_t *src1, |
uint8_t *src2, int w); |
av_cold void ff_pngdsp_init_x86(PNGDSPContext *dsp) |
{ |
int cpu_flags = av_get_cpu_flags(); |
#if ARCH_X86_32 |
if (EXTERNAL_MMX(cpu_flags)) |
dsp->add_bytes_l2 = ff_add_bytes_l2_mmx; |
#endif |
if (EXTERNAL_MMXEXT(cpu_flags)) |
dsp->add_paeth_prediction = ff_add_png_paeth_prediction_mmxext; |
if (EXTERNAL_SSE2(cpu_flags)) |
dsp->add_bytes_l2 = ff_add_bytes_l2_sse2; |
if (EXTERNAL_SSSE3(cpu_flags)) |
dsp->add_paeth_prediction = ff_add_png_paeth_prediction_ssse3; |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/proresdsp.asm |
---|
0,0 → 1,326 |
;****************************************************************************** |
;* x86-SIMD-optimized IDCT for prores |
;* this is identical to "simple" IDCT written by Michael Niedermayer |
;* except for the clip range |
;* |
;* Copyright (c) 2011 Ronald S. Bultje <rsbultje@gmail.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
%define W1sh2 22725 ; W1 = 90901 = 22725<<2 + 1 |
%define W2sh2 21407 ; W2 = 85627 = 21407<<2 - 1 |
%define W3sh2 19265 ; W3 = 77062 = 19265<<2 + 2 |
%define W4sh2 16384 ; W4 = 65535 = 16384<<2 - 1 |
%define W5sh2 12873 ; W5 = 51491 = 12873<<2 - 1 |
%define W6sh2 8867 ; W6 = 35468 = 8867<<2 |
%define W7sh2 4520 ; W7 = 18081 = 4520<<2 + 1 |
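; Note (our annotation): several full-precision coefficients Wn would |
; overflow a signed 16-bit word for pmaddwd, so the tables store them |
; pre-shifted right by 2 (hence "sh2"); the per-coefficient comments |
; above record the exact identity Wn = Wnsh2<<2 +/- correction. |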
%if ARCH_X86_64 |
SECTION_RODATA |
w4_plus_w2: times 4 dw W4sh2, +W2sh2 |
w4_min_w2: times 4 dw W4sh2, -W2sh2 |
w4_plus_w6: times 4 dw W4sh2, +W6sh2 |
w4_min_w6: times 4 dw W4sh2, -W6sh2 |
w1_plus_w3: times 4 dw W1sh2, +W3sh2 |
w3_min_w1: times 4 dw W3sh2, -W1sh2 |
w7_plus_w3: times 4 dw W7sh2, +W3sh2 |
w3_min_w7: times 4 dw W3sh2, -W7sh2 |
w1_plus_w5: times 4 dw W1sh2, +W5sh2 |
w5_min_w1: times 4 dw W5sh2, -W1sh2 |
w5_plus_w7: times 4 dw W5sh2, +W7sh2 |
w7_min_w5: times 4 dw W7sh2, -W5sh2 |
pw_88: times 8 dw 0x2008 |
cextern pw_1 |
cextern pw_4 |
cextern pw_512 |
cextern pw_1019 |
section .text align=16 |
; interleave data while maintaining source |
; %1=type, %2=dstlo, %3=dsthi, %4=src, %5=interleave |
%macro SBUTTERFLY3 5 |
punpckl%1 m%2, m%4, m%5 |
punpckh%1 m%3, m%4, m%5 |
%endmacro |
; %1/%2=src1/dst1, %3/%4=dst2, %5/%6=src2, %7=shift |
; action: %3/%4 = %1/%2 - %5/%6; %1/%2 += %5/%6 |
; %1/%2/%3/%4 >>= %7; dword -> word (in %1/%3) |
%macro SUMSUB_SHPK 7 |
psubd %3, %1, %5 ; { a0 - b0 }[0-3] |
psubd %4, %2, %6 ; { a0 - b0 }[4-7] |
paddd %1, %5 ; { a0 + b0 }[0-3] |
paddd %2, %6 ; { a0 + b0 }[4-7] |
psrad %1, %7 |
psrad %2, %7 |
psrad %3, %7 |
psrad %4, %7 |
packssdw %1, %2 ; row[0] |
packssdw %3, %4 ; row[7] |
%endmacro |
; %1 = row or col (for rounding variable) |
; %2 = number of bits to shift at the end |
%macro IDCT_1D 2 |
; a0 = (W4 * row[0]) + (1 << (15 - 1)); |
; a1 = a0; |
; a2 = a0; |
; a3 = a0; |
; a0 += W2 * row[2]; |
; a1 += W6 * row[2]; |
; a2 -= W6 * row[2]; |
; a3 -= W2 * row[2]; |
%ifidn %1, col |
paddw m10,[pw_88] |
%endif |
%ifidn %1, row |
paddw m10,[pw_1] |
%endif |
SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[0], row[2] }[0-3]/[4-7] |
pmaddwd m2, m0, [w4_plus_w6] |
pmaddwd m3, m1, [w4_plus_w6] |
pmaddwd m4, m0, [w4_min_w6] |
pmaddwd m5, m1, [w4_min_w6] |
pmaddwd m6, m0, [w4_min_w2] |
pmaddwd m7, m1, [w4_min_w2] |
pmaddwd m0, [w4_plus_w2] |
pmaddwd m1, [w4_plus_w2] |
; a0: -1*row[0]-1*row[2] |
; a1: -1*row[0] |
; a2: -1*row[0] |
; a3: -1*row[0]+1*row[2] |
; a0 += W4*row[4] + W6*row[6]; i.e. -1*row[4] |
; a1 -= W4*row[4] + W2*row[6]; i.e. -1*row[4]-1*row[6] |
; a2 -= W4*row[4] - W2*row[6]; i.e. -1*row[4]+1*row[6] |
; a3 += W4*row[4] - W6*row[6]; i.e. -1*row[4] |
SBUTTERFLY3 wd, 8, 9, 13, 12 ; { row[4], row[6] }[0-3]/[4-7] |
pmaddwd m10, m8, [w4_plus_w6] |
pmaddwd m11, m9, [w4_plus_w6] |
paddd m0, m10 ; a0[0-3] |
paddd m1, m11 ; a0[4-7] |
pmaddwd m10, m8, [w4_min_w6] |
pmaddwd m11, m9, [w4_min_w6] |
paddd m6, m10 ; a3[0-3] |
paddd m7, m11 ; a3[4-7] |
pmaddwd m10, m8, [w4_min_w2] |
pmaddwd m11, m9, [w4_min_w2] |
pmaddwd m8, [w4_plus_w2] |
pmaddwd m9, [w4_plus_w2] |
psubd m4, m10 ; a2[0-3] intermediate |
psubd m5, m11 ; a2[4-7] intermediate |
psubd m2, m8 ; a1[0-3] intermediate |
psubd m3, m9 ; a1[4-7] intermediate |
; load/store |
mova [r2+ 0], m0 |
mova [r2+ 32], m2 |
mova [r2+ 64], m4 |
mova [r2+ 96], m6 |
mova m10,[r2+ 16] ; { row[1] }[0-7] |
mova m8, [r2+ 48] ; { row[3] }[0-7] |
mova m13,[r2+ 80] ; { row[5] }[0-7] |
mova m14,[r2+112] ; { row[7] }[0-7] |
mova [r2+ 16], m1 |
mova [r2+ 48], m3 |
mova [r2+ 80], m5 |
mova [r2+112], m7 |
%ifidn %1, row |
pmullw m10,[r3+ 16] |
pmullw m8, [r3+ 48] |
pmullw m13,[r3+ 80] |
pmullw m14,[r3+112] |
%endif |
; b0 = MUL(W1, row[1]); |
; MAC(b0, W3, row[3]); |
; b1 = MUL(W3, row[1]); |
; MAC(b1, -W7, row[3]); |
; b2 = MUL(W5, row[1]); |
; MAC(b2, -W1, row[3]); |
; b3 = MUL(W7, row[1]); |
; MAC(b3, -W5, row[3]); |
SBUTTERFLY3 wd, 0, 1, 10, 8 ; { row[1], row[3] }[0-3]/[4-7] |
pmaddwd m2, m0, [w3_min_w7] |
pmaddwd m3, m1, [w3_min_w7] |
pmaddwd m4, m0, [w5_min_w1] |
pmaddwd m5, m1, [w5_min_w1] |
pmaddwd m6, m0, [w7_min_w5] |
pmaddwd m7, m1, [w7_min_w5] |
pmaddwd m0, [w1_plus_w3] |
pmaddwd m1, [w1_plus_w3] |
; b0: +1*row[1]+2*row[3] |
; b1: +2*row[1]-1*row[3] |
; b2: -1*row[1]-1*row[3] |
; b3: +1*row[1]+1*row[3] |
; MAC(b0, W5, row[5]); |
; MAC(b0, W7, row[7]); |
; MAC(b1, -W1, row[5]); |
; MAC(b1, -W5, row[7]); |
; MAC(b2, W7, row[5]); |
; MAC(b2, W3, row[7]); |
; MAC(b3, W3, row[5]); |
; MAC(b3, -W1, row[7]); |
SBUTTERFLY3 wd, 8, 9, 13, 14 ; { row[5], row[7] }[0-3]/[4-7] |
; b0: -1*row[5]+1*row[7] |
; b1: -1*row[5]+1*row[7] |
; b2: +1*row[5]+2*row[7] |
; b3: +2*row[5]-1*row[7] |
pmaddwd m10, m8, [w1_plus_w5] |
pmaddwd m11, m9, [w1_plus_w5] |
pmaddwd m12, m8, [w5_plus_w7] |
pmaddwd m13, m9, [w5_plus_w7] |
psubd m2, m10 ; b1[0-3] |
psubd m3, m11 ; b1[4-7] |
paddd m0, m12 ; b0[0-3] |
paddd m1, m13 ; b0[4-7] |
pmaddwd m12, m8, [w7_plus_w3] |
pmaddwd m13, m9, [w7_plus_w3] |
pmaddwd m8, [w3_min_w1] |
pmaddwd m9, [w3_min_w1] |
paddd m4, m12 ; b2[0-3] |
paddd m5, m13 ; b2[4-7] |
paddd m6, m8 ; b3[0-3] |
paddd m7, m9 ; b3[4-7] |
; row[0] = (a0 + b0) >> 15; |
; row[7] = (a0 - b0) >> 15; |
; row[1] = (a1 + b1) >> 15; |
; row[6] = (a1 - b1) >> 15; |
; row[2] = (a2 + b2) >> 15; |
; row[5] = (a2 - b2) >> 15; |
; row[3] = (a3 + b3) >> 15; |
; row[4] = (a3 - b3) >> 15; |
mova m8, [r2+ 0] ; a0[0-3] |
mova m9, [r2+16] ; a0[4-7] |
SUMSUB_SHPK m8, m9, m10, m11, m0, m1, %2 |
mova m0, [r2+32] ; a1[0-3] |
mova m1, [r2+48] ; a1[4-7] |
SUMSUB_SHPK m0, m1, m9, m11, m2, m3, %2 |
mova m1, [r2+64] ; a2[0-3] |
mova m2, [r2+80] ; a2[4-7] |
SUMSUB_SHPK m1, m2, m11, m3, m4, m5, %2 |
mova m2, [r2+96] ; a3[0-3] |
mova m3, [r2+112] ; a3[4-7] |
SUMSUB_SHPK m2, m3, m4, m5, m6, m7, %2 |
%endmacro |
; void prores_idct_put_10_<opt>(uint8_t *pixels, int stride, |
; int16_t *block, const int16_t *qmat); |
%macro idct_put_fn 1 |
cglobal prores_idct_put_10, 4, 4, %1 |
movsxd r1, r1d |
pxor m15, m15 ; zero |
; for (i = 0; i < 8; i++) |
; idctRowCondDC(block + i*8); |
mova m10,[r2+ 0] ; { row[0] }[0-7] |
mova m8, [r2+32] ; { row[2] }[0-7] |
mova m13,[r2+64] ; { row[4] }[0-7] |
mova m12,[r2+96] ; { row[6] }[0-7] |
pmullw m10,[r3+ 0] |
pmullw m8, [r3+32] |
pmullw m13,[r3+64] |
pmullw m12,[r3+96] |
IDCT_1D row, 15 |
; transpose for second part of IDCT |
TRANSPOSE8x8W 8, 0, 1, 2, 4, 11, 9, 10, 3 |
mova [r2+ 16], m0 |
mova [r2+ 48], m2 |
mova [r2+ 80], m11 |
mova [r2+112], m10 |
SWAP 8, 10 |
SWAP 1, 8 |
SWAP 4, 13 |
SWAP 9, 12 |
; for (i = 0; i < 8; i++) |
; idctSparseColAdd(dest + i, line_size, block + i); |
IDCT_1D col, 18 |
; clip/store |
mova m3, [pw_4] |
mova m5, [pw_1019] |
pmaxsw m8, m3 |
pmaxsw m0, m3 |
pmaxsw m1, m3 |
pmaxsw m2, m3 |
pmaxsw m4, m3 |
pmaxsw m11, m3 |
pmaxsw m9, m3 |
pmaxsw m10, m3 |
pminsw m8, m5 |
pminsw m0, m5 |
pminsw m1, m5 |
pminsw m2, m5 |
pminsw m4, m5 |
pminsw m11, m5 |
pminsw m9, m5 |
pminsw m10, m5 |
lea r2, [r1*3] |
mova [r0 ], m8 |
mova [r0+r1 ], m0 |
mova [r0+r1*2], m1 |
mova [r0+r2 ], m2 |
lea r0, [r0+r1*4] |
mova [r0 ], m4 |
mova [r0+r1 ], m11 |
mova [r0+r1*2], m9 |
mova [r0+r2 ], m10 |
RET |
%endmacro |
%macro SIGNEXTEND 2-3 |
%if cpuflag(sse4) ; dstlow, dsthigh |
movhlps %2, %1 |
pmovsxwd %1, %1 |
pmovsxwd %2, %2 |
%elif cpuflag(sse2) ; dstlow, dsthigh, tmp |
pxor %3, %3 |
pcmpgtw %3, %1 |
mova %2, %1 |
punpcklwd %1, %3 |
punpckhwd %2, %3 |
%endif |
%endmacro |
INIT_XMM sse2 |
idct_put_fn 16 |
INIT_XMM sse4 |
idct_put_fn 16 |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
idct_put_fn 16 |
%endif |
%endif |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/proresdsp_init.c |
---|
0,0 → 1,58 |
/* |
* Apple ProRes compatible decoder |
* |
* Copyright (c) 2010-2011 Maxim Poliakovski |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/attributes.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/dsputil.h" |
#include "libavcodec/proresdsp.h" |
void ff_prores_idct_put_10_sse2(uint16_t *dst, int linesize, |
int16_t *block, const int16_t *qmat); |
void ff_prores_idct_put_10_sse4(uint16_t *dst, int linesize, |
int16_t *block, const int16_t *qmat); |
void ff_prores_idct_put_10_avx (uint16_t *dst, int linesize, |
int16_t *block, const int16_t *qmat); |
av_cold void ff_proresdsp_x86_init(ProresDSPContext *dsp, AVCodecContext *avctx) |
{ |
#if ARCH_X86_64 |
int cpu_flags = av_get_cpu_flags(); |
if (avctx->flags & CODEC_FLAG_BITEXACT) |
return; |
if (EXTERNAL_SSE2(cpu_flags)) { |
dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; |
dsp->idct_put = ff_prores_idct_put_10_sse2; |
} |
if (EXTERNAL_SSE4(cpu_flags)) { |
dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; |
dsp->idct_put = ff_prores_idct_put_10_sse4; |
} |
if (EXTERNAL_AVX(cpu_flags)) { |
dsp->idct_permutation_type = FF_TRANSPOSE_IDCT_PERM; |
dsp->idct_put = ff_prores_idct_put_10_avx; |
} |
#endif /* ARCH_X86_64 */ |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/qpel.asm |
---|
0,0 → 1,176 |
;****************************************************************************** |
;* MMX optimized DSP utils |
;* Copyright (c) 2008 Loren Merritt |
;* Copyright (c) 2003-2013 Michael Niedermayer |
;* Copyright (c) 2013 Daniel Kang |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION .text |
%macro op_avgh 3 |
movh %3, %2 |
pavgb %1, %3 |
movh %2, %1 |
%endmacro |
%macro op_avg 2 |
pavgb %1, %2 |
mova %2, %1 |
%endmacro |
%macro op_puth 2-3 |
movh %2, %1 |
%endmacro |
%macro op_put 2 |
mova %2, %1 |
%endmacro |
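; The *_pixels*_l2 functions below average two byte sources into dst, |
; dst = avg(src1, src2), rounding up via pavgb ("put"); the "avg" |
; variants additionally average that result into the existing dst |
; (our annotation). The loops are unrolled four rows deep (two for the |
; 16-pixel version), with a one-row prologue when h is odd. |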
; void pixels4_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
%macro PIXELS4_L2 1 |
%define OP op_%1h |
cglobal %1_pixels4_l2, 6,6 |
movsxdifnidn r3, r3d |
movsxdifnidn r4, r4d |
test r5d, 1 |
je .loop |
movd m0, [r1] |
movd m1, [r2] |
add r1, r4 |
add r2, 4 |
pavgb m0, m1 |
OP m0, [r0], m3 |
add r0, r3 |
dec r5d |
.loop: |
mova m0, [r1] |
mova m1, [r1+r4] |
lea r1, [r1+2*r4] |
pavgb m0, [r2] |
pavgb m1, [r2+4] |
OP m0, [r0], m3 |
OP m1, [r0+r3], m3 |
lea r0, [r0+2*r3] |
mova m0, [r1] |
mova m1, [r1+r4] |
lea r1, [r1+2*r4] |
pavgb m0, [r2+8] |
pavgb m1, [r2+12] |
OP m0, [r0], m3 |
OP m1, [r0+r3], m3 |
lea r0, [r0+2*r3] |
add r2, 16 |
sub r5d, 4 |
jne .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PIXELS4_L2 put |
PIXELS4_L2 avg |
; void pixels8_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
%macro PIXELS8_L2 1 |
%define OP op_%1 |
cglobal %1_pixels8_l2, 6,6 |
movsxdifnidn r3, r3d |
movsxdifnidn r4, r4d |
test r5d, 1 |
je .loop |
mova m0, [r1] |
mova m1, [r2] |
add r1, r4 |
add r2, 8 |
pavgb m0, m1 |
OP m0, [r0] |
add r0, r3 |
dec r5d |
.loop: |
mova m0, [r1] |
mova m1, [r1+r4] |
lea r1, [r1+2*r4] |
pavgb m0, [r2] |
pavgb m1, [r2+8] |
OP m0, [r0] |
OP m1, [r0+r3] |
lea r0, [r0+2*r3] |
mova m0, [r1] |
mova m1, [r1+r4] |
lea r1, [r1+2*r4] |
pavgb m0, [r2+16] |
pavgb m1, [r2+24] |
OP m0, [r0] |
OP m1, [r0+r3] |
lea r0, [r0+2*r3] |
add r2, 32 |
sub r5d, 4 |
jne .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PIXELS8_L2 put |
PIXELS8_L2 avg |
; void pixels16_l2_mmxext(uint8_t *dst, uint8_t *src1, uint8_t *src2, int dstStride, int src1Stride, int h) |
%macro PIXELS16_L2 1 |
%define OP op_%1 |
cglobal %1_pixels16_l2, 6,6 |
movsxdifnidn r3, r3d |
movsxdifnidn r4, r4d |
test r5d, 1 |
je .loop |
mova m0, [r1] |
mova m1, [r1+8] |
pavgb m0, [r2] |
pavgb m1, [r2+8] |
add r1, r4 |
add r2, 16 |
OP m0, [r0] |
OP m1, [r0+8] |
add r0, r3 |
dec r5d |
.loop: |
mova m0, [r1] |
mova m1, [r1+8] |
add r1, r4 |
pavgb m0, [r2] |
pavgb m1, [r2+8] |
OP m0, [r0] |
OP m1, [r0+8] |
add r0, r3 |
mova m0, [r1] |
mova m1, [r1+8] |
add r1, r4 |
pavgb m0, [r2+16] |
pavgb m1, [r2+24] |
OP m0, [r0] |
OP m1, [r0+8] |
add r0, r3 |
add r2, 32 |
sub r5d, 2 |
jne .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PIXELS16_L2 put |
PIXELS16_L2 avg |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/rnd_mmx.c |
---|
0,0 → 1,35 |
/* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "config.h" |
#include "dsputil_x86.h" |
#if HAVE_INLINE_ASM |
#define DEF(x, y) ff_ ## x ## _ ## y ## _mmx |
#define SET_RND MOVQ_WTWO |
#define PAVGBP(a, b, c, d, e, f) PAVGBP_MMX(a, b, c, d, e, f) |
#define PAVGB(a, b, c, e) PAVGB_MMX(a, b, c, e) |
#define STATIC |
#include "rnd_template.c" |
PIXELS16(, ff_avg, , _xy2, _mmx) |
PIXELS16(, ff_put, , _xy2, _mmx) |
#endif /* HAVE_INLINE_ASM */ |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/rnd_template.c |
---|
0,0 → 1,173 |
/* |
* DSP utils mmx functions are compiled twice for rnd/no_rnd |
* Copyright (c) 2000, 2001 Fabrice Bellard |
* Copyright (c) 2003-2004 Michael Niedermayer <michaelni@gmx.at> |
* |
* MMX optimization by Nick Kurshev <nickols_k@mail.ru> |
* mostly rewritten by Michael Niedermayer <michaelni@gmx.at> |
* and improved by Zdenek Kabelac <kabi@users.sf.net> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include <stddef.h> |
#include <stdint.h> |
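/* Sketch of the per-byte computation both functions below implement |
 * (our annotation): half-pel (x+0.5, y+0.5) interpolation, |
 * |
 *     dst = (p[y][x] + p[y][x+1] + p[y+1][x] + p[y+1][x+1] + r) >> 2 |
 * |
 * where r comes from SET_RND: 2 for the rounding build, 1 for no_rnd. |
 * The avg variant then averages this result into the existing block. */ |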
// put_pixels |
STATIC void DEF(put, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h) |
{ |
MOVQ_ZERO(mm7); |
SET_RND(mm6); // =2 for rnd and =1 for no_rnd version |
__asm__ volatile( |
"movq (%1), %%mm0 \n\t" |
"movq 1(%1), %%mm4 \n\t" |
"movq %%mm0, %%mm1 \n\t" |
"movq %%mm4, %%mm5 \n\t" |
"punpcklbw %%mm7, %%mm0 \n\t" |
"punpcklbw %%mm7, %%mm4 \n\t" |
"punpckhbw %%mm7, %%mm1 \n\t" |
"punpckhbw %%mm7, %%mm5 \n\t" |
"paddusw %%mm0, %%mm4 \n\t" |
"paddusw %%mm1, %%mm5 \n\t" |
"xor %%"REG_a", %%"REG_a" \n\t" |
"add %3, %1 \n\t" |
".p2align 3 \n\t" |
"1: \n\t" |
"movq (%1, %%"REG_a"), %%mm0 \n\t" |
"movq 1(%1, %%"REG_a"), %%mm2 \n\t" |
"movq %%mm0, %%mm1 \n\t" |
"movq %%mm2, %%mm3 \n\t" |
"punpcklbw %%mm7, %%mm0 \n\t" |
"punpcklbw %%mm7, %%mm2 \n\t" |
"punpckhbw %%mm7, %%mm1 \n\t" |
"punpckhbw %%mm7, %%mm3 \n\t" |
"paddusw %%mm2, %%mm0 \n\t" |
"paddusw %%mm3, %%mm1 \n\t" |
"paddusw %%mm6, %%mm4 \n\t" |
"paddusw %%mm6, %%mm5 \n\t" |
"paddusw %%mm0, %%mm4 \n\t" |
"paddusw %%mm1, %%mm5 \n\t" |
"psrlw $2, %%mm4 \n\t" |
"psrlw $2, %%mm5 \n\t" |
"packuswb %%mm5, %%mm4 \n\t" |
"movq %%mm4, (%2, %%"REG_a") \n\t" |
"add %3, %%"REG_a" \n\t" |
"movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 |
"movq 1(%1, %%"REG_a"), %%mm4 \n\t" |
"movq %%mm2, %%mm3 \n\t" |
"movq %%mm4, %%mm5 \n\t" |
"punpcklbw %%mm7, %%mm2 \n\t" |
"punpcklbw %%mm7, %%mm4 \n\t" |
"punpckhbw %%mm7, %%mm3 \n\t" |
"punpckhbw %%mm7, %%mm5 \n\t" |
"paddusw %%mm2, %%mm4 \n\t" |
"paddusw %%mm3, %%mm5 \n\t" |
"paddusw %%mm6, %%mm0 \n\t" |
"paddusw %%mm6, %%mm1 \n\t" |
"paddusw %%mm4, %%mm0 \n\t" |
"paddusw %%mm5, %%mm1 \n\t" |
"psrlw $2, %%mm0 \n\t" |
"psrlw $2, %%mm1 \n\t" |
"packuswb %%mm1, %%mm0 \n\t" |
"movq %%mm0, (%2, %%"REG_a") \n\t" |
"add %3, %%"REG_a" \n\t" |
"subl $2, %0 \n\t" |
"jnz 1b \n\t" |
:"+g"(h), "+S"(pixels) |
:"D"(block), "r"((x86_reg)line_size) |
:REG_a, "memory"); |
} |
// avg_pixels |
// this routine is 'slightly' suboptimal but mostly unused |
STATIC void DEF(avg, pixels8_xy2)(uint8_t *block, const uint8_t *pixels, |
ptrdiff_t line_size, int h) |
{ |
MOVQ_ZERO(mm7); |
SET_RND(mm6); // =2 for rnd and =1 for no_rnd version |
__asm__ volatile( |
"movq (%1), %%mm0 \n\t" |
"movq 1(%1), %%mm4 \n\t" |
"movq %%mm0, %%mm1 \n\t" |
"movq %%mm4, %%mm5 \n\t" |
"punpcklbw %%mm7, %%mm0 \n\t" |
"punpcklbw %%mm7, %%mm4 \n\t" |
"punpckhbw %%mm7, %%mm1 \n\t" |
"punpckhbw %%mm7, %%mm5 \n\t" |
"paddusw %%mm0, %%mm4 \n\t" |
"paddusw %%mm1, %%mm5 \n\t" |
"xor %%"REG_a", %%"REG_a" \n\t" |
"add %3, %1 \n\t" |
".p2align 3 \n\t" |
"1: \n\t" |
"movq (%1, %%"REG_a"), %%mm0 \n\t" |
"movq 1(%1, %%"REG_a"), %%mm2 \n\t" |
"movq %%mm0, %%mm1 \n\t" |
"movq %%mm2, %%mm3 \n\t" |
"punpcklbw %%mm7, %%mm0 \n\t" |
"punpcklbw %%mm7, %%mm2 \n\t" |
"punpckhbw %%mm7, %%mm1 \n\t" |
"punpckhbw %%mm7, %%mm3 \n\t" |
"paddusw %%mm2, %%mm0 \n\t" |
"paddusw %%mm3, %%mm1 \n\t" |
"paddusw %%mm6, %%mm4 \n\t" |
"paddusw %%mm6, %%mm5 \n\t" |
"paddusw %%mm0, %%mm4 \n\t" |
"paddusw %%mm1, %%mm5 \n\t" |
"psrlw $2, %%mm4 \n\t" |
"psrlw $2, %%mm5 \n\t" |
"movq (%2, %%"REG_a"), %%mm3 \n\t" |
"packuswb %%mm5, %%mm4 \n\t" |
"pcmpeqd %%mm2, %%mm2 \n\t" |
"paddb %%mm2, %%mm2 \n\t" |
PAVGB_MMX(%%mm3, %%mm4, %%mm5, %%mm2) |
"movq %%mm5, (%2, %%"REG_a") \n\t" |
"add %3, %%"REG_a" \n\t" |
"movq (%1, %%"REG_a"), %%mm2 \n\t" // 0 <-> 2 1 <-> 3 |
"movq 1(%1, %%"REG_a"), %%mm4 \n\t" |
"movq %%mm2, %%mm3 \n\t" |
"movq %%mm4, %%mm5 \n\t" |
"punpcklbw %%mm7, %%mm2 \n\t" |
"punpcklbw %%mm7, %%mm4 \n\t" |
"punpckhbw %%mm7, %%mm3 \n\t" |
"punpckhbw %%mm7, %%mm5 \n\t" |
"paddusw %%mm2, %%mm4 \n\t" |
"paddusw %%mm3, %%mm5 \n\t" |
"paddusw %%mm6, %%mm0 \n\t" |
"paddusw %%mm6, %%mm1 \n\t" |
"paddusw %%mm4, %%mm0 \n\t" |
"paddusw %%mm5, %%mm1 \n\t" |
"psrlw $2, %%mm0 \n\t" |
"psrlw $2, %%mm1 \n\t" |
"movq (%2, %%"REG_a"), %%mm3 \n\t" |
"packuswb %%mm1, %%mm0 \n\t" |
"pcmpeqd %%mm2, %%mm2 \n\t" |
"paddb %%mm2, %%mm2 \n\t" |
PAVGB_MMX(%%mm3, %%mm0, %%mm1, %%mm2) |
"movq %%mm1, (%2, %%"REG_a") \n\t" |
"add %3, %%"REG_a" \n\t" |
"subl $2, %0 \n\t" |
"jnz 1b \n\t" |
:"+g"(h), "+S"(pixels) |
:"D"(block), "r"((x86_reg)line_size) |
:REG_a, "memory"); |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/rv34dsp.asm |
---|
0,0 → 1,196 |
;****************************************************************************** |
;* MMX/SSE2-optimized functions for the RV30 and RV40 decoders |
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
pw_row_coeffs: times 4 dw 13 |
times 4 dw 17 |
times 4 dw 7 |
pd_512: times 2 dd 0x200 |
pw_col_coeffs: dw 13, 13, 13, -13 |
dw 17, 7, 7, -17 |
dw 13, -13, 13, 13 |
dw -7, 17, -17, -7 |
SECTION .text |
%macro IDCT_DC_NOROUND 1 |
imul %1, 13*13*3 |
sar %1, 11 |
%endmacro |
%macro IDCT_DC_ROUND 1 |
imul %1, 13*13 |
add %1, 0x200 |
sar %1, 10 |
%endmacro |
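; Worked example (our annotation): the RV34 inverse transform scales |
; DC by 13 in each of its two passes, so for dc = 80 IDCT_DC_ROUND |
; yields (80*13*13 + 0x200) >> 10 = 14032 >> 10 = 13; IDCT_DC_NOROUND |
; instead folds in the extra *3 and >>11 of the no-rounding DC path. |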
%macro rv34_idct 1 |
cglobal rv34_idct_%1, 1, 2, 0 |
movsx r1, word [r0] |
IDCT_DC r1 |
movd m0, r1d |
pshufw m0, m0, 0 |
movq [r0+ 0], m0 |
movq [r0+ 8], m0 |
movq [r0+16], m0 |
movq [r0+24], m0 |
REP_RET |
%endmacro |
INIT_MMX mmxext |
%define IDCT_DC IDCT_DC_ROUND |
rv34_idct dc |
%define IDCT_DC IDCT_DC_NOROUND |
rv34_idct dc_noround |
; ff_rv34_idct_dc_add_mmx(uint8_t *dst, int stride, int dc); |
INIT_MMX mmx |
cglobal rv34_idct_dc_add, 3, 3 |
; calculate DC |
IDCT_DC_ROUND r2 |
pxor m1, m1 |
movd m0, r2d |
psubw m1, m0 |
packuswb m0, m0 |
packuswb m1, m1 |
punpcklbw m0, m0 |
punpcklbw m1, m1 |
punpcklwd m0, m0 |
punpcklwd m1, m1 |
; add DC |
lea r2, [r0+r1*2] |
movh m2, [r0] |
movh m3, [r0+r1] |
movh m4, [r2] |
movh m5, [r2+r1] |
paddusb m2, m0 |
paddusb m3, m0 |
paddusb m4, m0 |
paddusb m5, m0 |
psubusb m2, m1 |
psubusb m3, m1 |
psubusb m4, m1 |
psubusb m5, m1 |
movh [r0], m2 |
movh [r0+r1], m3 |
movh [r2], m4 |
movh [r2+r1], m5 |
RET |
; Load coeffs and perform row transform |
; Output: coeffs in mm[0467], rounder in mm5 |
%macro ROW_TRANSFORM 1 |
pxor mm7, mm7 |
mova mm0, [%1+ 0*8] |
mova mm1, [%1+ 1*8] |
mova mm2, [%1+ 2*8] |
mova mm3, [%1+ 3*8] |
mova [%1+ 0*8], mm7 |
mova [%1+ 1*8], mm7 |
mova [%1+ 2*8], mm7 |
mova [%1+ 3*8], mm7 |
mova mm4, mm0 |
mova mm6, [pw_row_coeffs+ 0] |
paddsw mm0, mm2 ; b0 + b2 |
psubsw mm4, mm2 ; b0 - b2 |
pmullw mm0, mm6 ; *13 = z0 |
pmullw mm4, mm6 ; *13 = z1 |
mova mm5, mm1 |
pmullw mm1, [pw_row_coeffs+ 8] ; b1*17 |
pmullw mm5, [pw_row_coeffs+16] ; b1* 7 |
mova mm7, mm3 |
pmullw mm3, [pw_row_coeffs+ 8] ; b3*17 |
pmullw mm7, [pw_row_coeffs+16] ; b3* 7 |
paddsw mm1, mm7 ; z3 = b1*17 + b3* 7 |
psubsw mm5, mm3 ; z2 = b1* 7 - b3*17 |
mova mm7, mm0 |
mova mm6, mm4 |
paddsw mm0, mm1 ; z0 + z3 |
psubsw mm7, mm1 ; z0 - z3 |
paddsw mm4, mm5 ; z1 + z2 |
psubsw mm6, mm5 ; z1 - z2 |
mova mm5, [pd_512] ; 0x200 |
%endmacro |
; ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block); |
%macro COL_TRANSFORM 4 |
pshufw mm3, %2, 0xDD ; col. 1,3,1,3 |
pshufw %2, %2, 0x88 ; col. 0,2,0,2 |
pmaddwd %2, %3 ; 13*c0+13*c2 | 13*c0-13*c2 = z0 | z1 |
pmaddwd mm3, %4 ; 17*c1+ 7*c3 | 7*c1-17*c3 = z3 | z2 |
paddd %2, mm5 |
pshufw mm1, %2, 01001110b ; z1 | z0 |
pshufw mm2, mm3, 01001110b ; z2 | z3 |
paddd %2, mm3 ; z0+z3 | z1+z2 |
psubd mm1, mm2 ; z1-z2 | z0-z3 |
movd mm3, %1 |
psrad %2, 10 |
pxor mm2, mm2 |
psrad mm1, 10 |
punpcklbw mm3, mm2 |
packssdw %2, mm1 |
paddw %2, mm3 |
packuswb %2, %2 |
movd %1, %2 |
%endmacro |
INIT_MMX mmxext |
cglobal rv34_idct_add, 3,3,0, d, s, b |
ROW_TRANSFORM bq |
COL_TRANSFORM [dq], mm0, [pw_col_coeffs+ 0], [pw_col_coeffs+ 8] |
mova mm0, [pw_col_coeffs+ 0] |
COL_TRANSFORM [dq+sq], mm4, mm0, [pw_col_coeffs+ 8] |
mova mm4, [pw_col_coeffs+ 8] |
lea dq, [dq + 2*sq] |
COL_TRANSFORM [dq], mm6, mm0, mm4 |
COL_TRANSFORM [dq+sq], mm7, mm0, mm4 |
ret |
; ff_rv34_idct_dc_add_sse4(uint8_t *dst, int stride, int dc); |
INIT_XMM sse4 |
cglobal rv34_idct_dc_add, 3, 3, 6 |
; load data |
IDCT_DC_ROUND r2 |
pxor m1, m1 |
; calculate DC |
movd m0, r2d |
lea r2, [r0+r1*2] |
movd m2, [r0] |
movd m3, [r0+r1] |
pshuflw m0, m0, 0 |
movd m4, [r2] |
movd m5, [r2+r1] |
punpcklqdq m0, m0 |
punpckldq m2, m3 |
punpckldq m4, m5 |
punpcklbw m2, m1 |
punpcklbw m4, m1 |
paddw m2, m0 |
paddw m4, m0 |
packuswb m2, m4 |
movd [r0], m2 |
pextrd [r0+r1], m2, 1 |
pextrd [r2], m2, 2 |
pextrd [r2+r1], m2, 3 |
RET |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/rv34dsp_init.c |
---|
0,0 → 1,45 |
/* |
* RV30/40 MMX/SSE2 optimizations |
* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/cpu.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/rv34dsp.h" |
void ff_rv34_idct_dc_mmxext(int16_t *block); |
void ff_rv34_idct_dc_noround_mmxext(int16_t *block); |
void ff_rv34_idct_dc_add_mmx(uint8_t *dst, ptrdiff_t stride, int dc); |
void ff_rv34_idct_dc_add_sse4(uint8_t *dst, ptrdiff_t stride, int dc); |
void ff_rv34_idct_add_mmxext(uint8_t *dst, ptrdiff_t stride, int16_t *block); |
av_cold void ff_rv34dsp_init_x86(RV34DSPContext* c) |
{ |
int cpu_flags = av_get_cpu_flags(); |
if (EXTERNAL_MMX(cpu_flags)) |
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_mmx; |
if (EXTERNAL_MMXEXT(cpu_flags)) { |
c->rv34_inv_transform_dc = ff_rv34_idct_dc_noround_mmxext; |
c->rv34_idct_add = ff_rv34_idct_add_mmxext; |
} |
if (EXTERNAL_SSE4(cpu_flags)) |
c->rv34_idct_dc_add = ff_rv34_idct_dc_add_sse4; |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/rv40dsp.asm |
---|
0,0 → 1,505 |
;****************************************************************************** |
;* MMX/SSE2-optimized functions for the RV40 decoder |
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> |
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> |
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
align 16 |
pw_1024: times 8 dw 1 << (16 - 6) ; pw_1024 |
sixtap_filter_hb_m: times 8 db 1, -5 |
times 8 db 52, 20 |
; multiplied by 2 to have the same shift |
times 8 db 2, -10 |
times 8 db 40, 40 |
; back to normal |
times 8 db 1, -5 |
times 8 db 20, 52 |
sixtap_filter_v_m: times 8 dw 1 |
times 8 dw -5 |
times 8 dw 52 |
times 8 dw 20 |
; multiplied by 2 to have the same shift |
times 8 dw 2 |
times 8 dw -10 |
times 8 dw 40 |
times 8 dw 40 |
; back to normal |
times 8 dw 1 |
times 8 dw -5 |
times 8 dw 20 |
times 8 dw 52 |
%ifdef PIC |
%define sixtap_filter_hw picregq |
%define sixtap_filter_hb picregq |
%define sixtap_filter_v picregq |
%define npicregs 1 |
%else |
%define sixtap_filter_hw sixtap_filter_hw_m |
%define sixtap_filter_hb sixtap_filter_hb_m |
%define sixtap_filter_v sixtap_filter_v_m |
%define npicregs 0 |
%endif |
filter_h6_shuf1: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
filter_h6_shuf2: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
filter_h6_shuf3: db 5, 4, 6, 5, 7, 6, 8, 7, 9, 8, 10, 9, 11, 10, 12, 11 |
cextern pw_32 |
cextern pw_16 |
cextern pw_512 |
SECTION .text |
;----------------------------------------------------------------------------- |
; subpel MC functions: |
; |
; void [put|avg]_rv40_qpel_[h|v]_<opt>(uint8_t *dst, int deststride, |
; uint8_t *src, int srcstride, |
; int len, int m); |
;----------------------------------------------------------------------------- |
%macro LOAD 2 |
%if WIN64 |
movsxd %1q, %1d |
%endif |
%ifdef PIC |
add %1q, picregq |
%else |
add %1q, %2 |
%endif |
%endmacro |
%macro STORE 3 |
%ifidn %3, avg |
movh %2, [dstq] |
%endif |
packuswb %1, %1 |
%ifidn %3, avg |
%if cpuflag(3dnow) |
pavgusb %1, %2 |
%else |
pavgb %1, %2 |
%endif |
%endif |
movh [dstq], %1 |
%endmacro |
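; A 6-tap vertical filter needs six source rows per output row, so the |
; FILTER_V variants below preload five rows (starting two rows above |
; the block) and fetch exactly one new row per .nextrow iteration |
; (our annotation). |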
%macro FILTER_V 1 |
cglobal %1_rv40_qpel_v, 6,6+npicregs,12, dst, dststride, src, srcstride, height, my, picreg |
%ifdef PIC |
lea picregq, [sixtap_filter_v_m] |
%endif |
pxor m7, m7 |
LOAD my, sixtap_filter_v |
; read 5 lines |
sub srcq, srcstrideq |
sub srcq, srcstrideq |
movh m0, [srcq] |
movh m1, [srcq+srcstrideq] |
movh m2, [srcq+srcstrideq*2] |
lea srcq, [srcq+srcstrideq*2] |
add srcq, srcstrideq |
movh m3, [srcq] |
movh m4, [srcq+srcstrideq] |
punpcklbw m0, m7 |
punpcklbw m1, m7 |
punpcklbw m2, m7 |
punpcklbw m3, m7 |
punpcklbw m4, m7 |
%ifdef m8 |
mova m8, [myq+ 0] |
mova m9, [myq+16] |
mova m10, [myq+32] |
mova m11, [myq+48] |
%define COEFF05 m8 |
%define COEFF14 m9 |
%define COEFF2 m10 |
%define COEFF3 m11 |
%else |
%define COEFF05 [myq+ 0] |
%define COEFF14 [myq+16] |
%define COEFF2 [myq+32] |
%define COEFF3 [myq+48] |
%endif |
.nextrow: |
mova m6, m1 |
movh m5, [srcq+2*srcstrideq] ; read new row |
paddw m6, m4 |
punpcklbw m5, m7 |
pmullw m6, COEFF14 |
paddw m0, m5 |
pmullw m0, COEFF05 |
paddw m6, m0 |
mova m0, m1 |
paddw m6, [pw_32] |
mova m1, m2 |
pmullw m2, COEFF2 |
paddw m6, m2 |
mova m2, m3 |
pmullw m3, COEFF3 |
paddw m6, m3 |
; round/clip/store |
mova m3, m4 |
psraw m6, 6 |
mova m4, m5 |
STORE m6, m5, %1 |
; go to next line |
add dstq, dststrideq |
add srcq, srcstrideq |
dec heightd ; next row |
jg .nextrow |
REP_RET |
%endmacro |
%macro FILTER_H 1 |
cglobal %1_rv40_qpel_h, 6, 6+npicregs, 12, dst, dststride, src, srcstride, height, mx, picreg |
%ifdef PIC |
lea picregq, [sixtap_filter_v_m] |
%endif |
pxor m7, m7 |
LOAD mx, sixtap_filter_v |
mova m6, [pw_32] |
%ifdef m8 |
mova m8, [mxq+ 0] |
mova m9, [mxq+16] |
mova m10, [mxq+32] |
mova m11, [mxq+48] |
%define COEFF05 m8 |
%define COEFF14 m9 |
%define COEFF2 m10 |
%define COEFF3 m11 |
%else |
%define COEFF05 [mxq+ 0] |
%define COEFF14 [mxq+16] |
%define COEFF2 [mxq+32] |
%define COEFF3 [mxq+48] |
%endif |
.nextrow: |
movq m0, [srcq-2] |
movq m5, [srcq+3] |
movq m1, [srcq-1] |
movq m4, [srcq+2] |
punpcklbw m0, m7 |
punpcklbw m5, m7 |
punpcklbw m1, m7 |
punpcklbw m4, m7 |
movq m2, [srcq-0] |
movq m3, [srcq+1] |
paddw m0, m5 |
paddw m1, m4 |
punpcklbw m2, m7 |
punpcklbw m3, m7 |
pmullw m0, COEFF05 |
pmullw m1, COEFF14 |
pmullw m2, COEFF2 |
pmullw m3, COEFF3 |
paddw m0, m6 |
paddw m1, m2 |
paddw m0, m3 |
paddw m0, m1 |
psraw m0, 6 |
STORE m0, m1, %1 |
; go to next line |
add dstq, dststrideq |
add srcq, srcstrideq |
dec heightd ; next row |
jg .nextrow |
REP_RET |
%endmacro |
%if ARCH_X86_32 |
INIT_MMX mmx |
FILTER_V put |
FILTER_H put |
INIT_MMX mmxext |
FILTER_V avg |
FILTER_H avg |
INIT_MMX 3dnow |
FILTER_V avg |
FILTER_H avg |
%endif |
INIT_XMM sse2 |
FILTER_H put |
FILTER_H avg |
FILTER_V put |
FILTER_V avg |
%macro FILTER_SSSE3 1 |
cglobal %1_rv40_qpel_v, 6,6+npicregs,8, dst, dststride, src, srcstride, height, my, picreg |
%ifdef PIC |
lea picregq, [sixtap_filter_hb_m] |
%endif |
; read 5 lines |
sub srcq, srcstrideq |
LOAD my, sixtap_filter_hb |
sub srcq, srcstrideq |
movh m0, [srcq] |
movh m1, [srcq+srcstrideq] |
movh m2, [srcq+srcstrideq*2] |
lea srcq, [srcq+srcstrideq*2] |
add srcq, srcstrideq |
mova m5, [myq] |
movh m3, [srcq] |
movh m4, [srcq+srcstrideq] |
lea srcq, [srcq+2*srcstrideq] |
.nextrow: |
mova m6, m2 |
punpcklbw m0, m1 |
punpcklbw m6, m3 |
pmaddubsw m0, m5 |
pmaddubsw m6, [myq+16] |
movh m7, [srcq] ; read new row |
paddw m6, m0 |
mova m0, m1 |
mova m1, m2 |
mova m2, m3 |
mova m3, m4 |
mova m4, m7 |
punpcklbw m7, m3 |
pmaddubsw m7, m5 |
paddw m6, m7 |
pmulhrsw m6, [pw_512] |
STORE m6, m7, %1 |
; go to next line |
add dstq, dststrideq |
add srcq, srcstrideq |
dec heightd ; next row |
jg .nextrow |
REP_RET |
cglobal %1_rv40_qpel_h, 6,6+npicregs,8, dst, dststride, src, srcstride, height, mx, picreg |
%ifdef PIC |
lea picregq, [sixtap_filter_hb_m] |
%endif |
mova m3, [filter_h6_shuf2] |
mova m4, [filter_h6_shuf3] |
LOAD mx, sixtap_filter_hb |
mova m5, [mxq] ; set up 6tap filter in bytes |
mova m6, [mxq+16] |
mova m7, [filter_h6_shuf1] |
.nextrow: |
movu m0, [srcq-2] |
mova m1, m0 |
mova m2, m0 |
pshufb m0, m7 |
pshufb m1, m3 |
pshufb m2, m4 |
pmaddubsw m0, m5 |
pmaddubsw m1, m6 |
pmaddubsw m2, m5 |
paddw m0, m1 |
paddw m0, m2 |
pmulhrsw m0, [pw_512] |
STORE m0, m1, %1 |
; go to next line |
add dstq, dststrideq |
add srcq, srcstrideq |
dec heightd ; next row |
jg .nextrow |
REP_RET |
%endmacro |
INIT_XMM ssse3 |
FILTER_SSSE3 put |
FILTER_SSSE3 avg |
; %1 = 5-bit weights?, %2 = dst, %3 = src1, %4 = src2, %5 = stride (sse2 8x8 only) |
%macro RV40_WCORE 4-5 |
movh m4, [%3 + r6 + 0] |
movh m5, [%4 + r6 + 0] |
%if %0 == 4 |
%define OFFSET r6 + mmsize / 2 |
%else |
; 8x8 block and sse2, stride was provided |
%define OFFSET r6 |
add r6, r5 |
%endif |
movh m6, [%3 + OFFSET] |
movh m7, [%4 + OFFSET] |
%if %1 == 0 |
; 14bits weights |
punpcklbw m4, m0 |
punpcklbw m5, m0 |
punpcklbw m6, m0 |
punpcklbw m7, m0 |
psllw m4, 7 |
psllw m5, 7 |
psllw m6, 7 |
psllw m7, 7 |
pmulhw m4, m3 |
pmulhw m5, m2 |
pmulhw m6, m3 |
pmulhw m7, m2 |
paddw m4, m5 |
paddw m6, m7 |
%else |
; 5bits weights |
%if cpuflag(ssse3) |
punpcklbw m4, m5 |
punpcklbw m6, m7 |
pmaddubsw m4, m3 |
pmaddubsw m6, m3 |
%else |
punpcklbw m4, m0 |
punpcklbw m5, m0 |
punpcklbw m6, m0 |
punpcklbw m7, m0 |
pmullw m4, m3 |
pmullw m5, m2 |
pmullw m6, m3 |
pmullw m7, m2 |
paddw m4, m5 |
paddw m6, m7 |
%endif |
%endif |
; bias and shift down |
%if cpuflag(ssse3) |
pmulhrsw m4, m1 |
pmulhrsw m6, m1 |
%else |
paddw m4, m1 |
paddw m6, m1 |
psrlw m4, 5 |
psrlw m6, 5 |
%endif |
packuswb m4, m6 |
%if %0 == 5 |
; Only called for 8x8 blocks and sse2 |
sub r6, r5 |
movh [%2 + r6], m4 |
add r6, r5 |
movhps [%2 + r6], m4 |
%else |
mova [%2 + r6], m4 |
%endif |
%endmacro |
%macro MAIN_LOOP 2 |
%if mmsize == 8 |
RV40_WCORE %2, r0, r1, r2 |
%if %1 == 16 |
RV40_WCORE %2, r0 + 8, r1 + 8, r2 + 8 |
%endif |
; Prepare for next loop |
add r6, r5 |
%else |
%ifidn %1, 8 |
RV40_WCORE %2, r0, r1, r2, r5 |
; Prepare 2 next lines |
add r6, r5 |
%else |
RV40_WCORE %2, r0, r1, r2 |
; Prepare single next line |
add r6, r5 |
%endif |
%endif |
%endmacro |
; rv40_weight_func_%1_%2(uint8_t *dst, uint8_t *src1, uint8_t *src2, int w1, int w2, ptrdiff_t stride) |
; %1 = rnd/nornd, %2 = block size (8 or 16), %3 = log2 of the block size |
; The weights are the FP0.14 representation of fractions that depend on |
; the pts. For timebases without rounding error (e.g. PAL), the |
; fractions can be simplified and several operations avoided, so the |
; caller checks whether both weights are multiples of 2^9 and selects |
; the simplified "nornd" variants when they are. |
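; Scalar sketch of one output pixel (our annotation, mirroring the C |
; fallback in rv34dsp.c): |
;     dst[x] = (((w1 * src1[x]) >> 9) + ((w2 * src2[x]) >> 9) + 16) >> 5 |
; With pre-simplified 5-bit weights (the "nornd" case) the caller has |
; already done the >>9 and pmullw applies the weights directly; the |
; full 14-bit case shifts the pixels left by 7 and uses pmulhw, since |
; ((x << 7) * w) >> 16 == (x * w) >> 9. |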
%macro RV40_WEIGHT 3 |
cglobal rv40_weight_func_%1_%2, 6, 7, 8 |
%if cpuflag(ssse3) |
mova m1, [pw_1024] |
%else |
mova m1, [pw_16] |
%endif |
pxor m0, m0 |
; Set loop counter and increments |
mov r6, r5 |
shl r6, %3 |
add r0, r6 |
add r1, r6 |
add r2, r6 |
neg r6 |
movd m2, r3d |
movd m3, r4d |
%ifidn %1,rnd |
%define RND 0 |
SPLATW m2, m2 |
%else |
%define RND 1 |
%if cpuflag(ssse3) |
punpcklbw m3, m2 |
%else |
SPLATW m2, m2 |
%endif |
%endif |
SPLATW m3, m3 |
.loop: |
MAIN_LOOP %2, RND |
jnz .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
RV40_WEIGHT rnd, 8, 3 |
RV40_WEIGHT rnd, 16, 4 |
RV40_WEIGHT nornd, 8, 3 |
RV40_WEIGHT nornd, 16, 4 |
INIT_XMM sse2 |
RV40_WEIGHT rnd, 8, 3 |
RV40_WEIGHT rnd, 16, 4 |
RV40_WEIGHT nornd, 8, 3 |
RV40_WEIGHT nornd, 16, 4 |
INIT_XMM ssse3 |
RV40_WEIGHT rnd, 8, 3 |
RV40_WEIGHT rnd, 16, 4 |
RV40_WEIGHT nornd, 8, 3 |
RV40_WEIGHT nornd, 16, 4 |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/rv40dsp_init.c |
---|
0,0 → 1,270 |
/* |
* RV40 decoder motion compensation functions x86-optimised |
* Copyright (c) 2008 Konstantin Shishkov |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
/** |
* @file |
* RV40 decoder motion compensation functions x86-optimised |
* 2,0 and 0,2 have h264 equivalents. |
* 3,3 is bugged in the rv40 format and maps to _xy2 version |
*/ |
#include "libavcodec/rv34dsp.h" |
#include "libavutil/attributes.h" |
#include "libavutil/mem.h" |
#include "libavutil/x86/cpu.h" |
#include "dsputil_x86.h" |
#if HAVE_YASM |
void ff_put_rv40_chroma_mc8_mmx (uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_avg_rv40_chroma_mc8_mmxext(uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_avg_rv40_chroma_mc8_3dnow(uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_put_rv40_chroma_mc4_mmx (uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_avg_rv40_chroma_mc4_mmxext(uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_avg_rv40_chroma_mc4_3dnow(uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
#define DECLARE_WEIGHT(opt) \ |
void ff_rv40_weight_func_rnd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \ |
int w1, int w2, ptrdiff_t stride); \ |
void ff_rv40_weight_func_rnd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \ |
int w1, int w2, ptrdiff_t stride); \ |
void ff_rv40_weight_func_nornd_16_##opt(uint8_t *dst, uint8_t *src1, uint8_t *src2, \ |
int w1, int w2, ptrdiff_t stride); \ |
void ff_rv40_weight_func_nornd_8_##opt (uint8_t *dst, uint8_t *src1, uint8_t *src2, \ |
int w1, int w2, ptrdiff_t stride); |
DECLARE_WEIGHT(mmxext) |
DECLARE_WEIGHT(sse2) |
DECLARE_WEIGHT(ssse3) |
/** @{ */ |
/** |
* Define one qpel function. |
* LOOPSIZE must be already set to the number of pixels processed per |
* iteration in the inner loop of the called functions. |
* COFF(x) must be already defined so as to provide the offset into any |
* array of coeffs used by the called function for the qpel position x. |
*/ |
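/* Concretely (our annotation): the ssse3 build below uses LOOPSIZE 8 |
 * with COFF(x) = 32 * (x - 1), since each packed-byte filter set in |
 * sixtap_filter_hb_m spans 32 bytes; the sse2/mmx builds index the |
 * word-sized sixtap_filter_v_m table, whose sets span 64 bytes. */ |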
#define QPEL_FUNC_DECL(OP, SIZE, PH, PV, OPT) \ |
static void OP ## rv40_qpel ##SIZE ##_mc ##PH ##PV ##OPT(uint8_t *dst, \ |
uint8_t *src, \ |
ptrdiff_t stride) \ |
{ \ |
int i; \ |
if (PH && PV) { \ |
DECLARE_ALIGNED(16, uint8_t, tmp)[SIZE * (SIZE + 5)]; \ |
uint8_t *tmpptr = tmp + SIZE * 2; \ |
src -= stride * 2; \ |
\ |
for (i = 0; i < SIZE; i += LOOPSIZE) \ |
ff_put_rv40_qpel_h ##OPT(tmp + i, SIZE, src + i, stride, \ |
SIZE + 5, HCOFF(PH)); \ |
for (i = 0; i < SIZE; i += LOOPSIZE) \ |
ff_ ##OP ##rv40_qpel_v ##OPT(dst + i, stride, tmpptr + i, \ |
SIZE, SIZE, VCOFF(PV)); \ |
} else if (PV) { \ |
for (i = 0; i < SIZE; i += LOOPSIZE) \ |
ff_ ##OP ##rv40_qpel_v ## OPT(dst + i, stride, src + i, \ |
stride, SIZE, VCOFF(PV)); \ |
} else { \ |
for (i = 0; i < SIZE; i += LOOPSIZE) \ |
ff_ ##OP ##rv40_qpel_h ## OPT(dst + i, stride, src + i, \ |
stride, SIZE, HCOFF(PH)); \ |
} \ |
}; |
/** Declare functions for sizes 8 and 16 and given operations |
* and qpel position. */ |
#define QPEL_FUNCS_DECL(OP, PH, PV, OPT) \ |
QPEL_FUNC_DECL(OP, 8, PH, PV, OPT) \ |
QPEL_FUNC_DECL(OP, 16, PH, PV, OPT) |
/** Declare all functions for all sizes and qpel positions */ |
#define QPEL_MC_DECL(OP, OPT) \ |
void ff_ ##OP ##rv40_qpel_h ##OPT(uint8_t *dst, ptrdiff_t dstStride, \ |
const uint8_t *src, \ |
ptrdiff_t srcStride, \ |
int len, int m); \ |
void ff_ ##OP ##rv40_qpel_v ##OPT(uint8_t *dst, ptrdiff_t dstStride, \ |
const uint8_t *src, \ |
ptrdiff_t srcStride, \ |
int len, int m); \ |
QPEL_FUNCS_DECL(OP, 0, 1, OPT) \ |
QPEL_FUNCS_DECL(OP, 0, 3, OPT) \ |
QPEL_FUNCS_DECL(OP, 1, 0, OPT) \ |
QPEL_FUNCS_DECL(OP, 1, 1, OPT) \ |
QPEL_FUNCS_DECL(OP, 1, 2, OPT) \ |
QPEL_FUNCS_DECL(OP, 1, 3, OPT) \ |
QPEL_FUNCS_DECL(OP, 2, 1, OPT) \ |
QPEL_FUNCS_DECL(OP, 2, 2, OPT) \ |
QPEL_FUNCS_DECL(OP, 2, 3, OPT) \ |
QPEL_FUNCS_DECL(OP, 3, 0, OPT) \ |
QPEL_FUNCS_DECL(OP, 3, 1, OPT) \ |
QPEL_FUNCS_DECL(OP, 3, 2, OPT) |
/** @} */ |
#define LOOPSIZE 8 |
#define HCOFF(x) (32 * (x - 1)) |
#define VCOFF(x) (32 * (x - 1)) |
QPEL_MC_DECL(put_, _ssse3) |
QPEL_MC_DECL(avg_, _ssse3) |
#undef LOOPSIZE |
#undef HCOFF |
#undef VCOFF |
#define LOOPSIZE 8 |
#define HCOFF(x) (64 * (x - 1)) |
#define VCOFF(x) (64 * (x - 1)) |
QPEL_MC_DECL(put_, _sse2) |
QPEL_MC_DECL(avg_, _sse2) |
#if ARCH_X86_32 |
#undef LOOPSIZE |
#undef HCOFF |
#undef VCOFF |
#define LOOPSIZE 4 |
#define HCOFF(x) (64 * (x - 1)) |
#define VCOFF(x) (64 * (x - 1)) |
QPEL_MC_DECL(put_, _mmx) |
#define ff_put_rv40_qpel_h_mmxext ff_put_rv40_qpel_h_mmx |
#define ff_put_rv40_qpel_v_mmxext ff_put_rv40_qpel_v_mmx |
QPEL_MC_DECL(avg_, _mmxext) |
#define ff_put_rv40_qpel_h_3dnow ff_put_rv40_qpel_h_mmx |
#define ff_put_rv40_qpel_v_3dnow ff_put_rv40_qpel_v_mmx |
QPEL_MC_DECL(avg_, _3dnow) |
#endif |
/** @{ */ |
/** Set one function */ |
#define QPEL_FUNC_SET(OP, SIZE, PH, PV, OPT) \ |
c-> OP ## pixels_tab[2 - SIZE / 8][4 * PV + PH] = OP ## rv40_qpel ##SIZE ## _mc ##PH ##PV ##OPT; |
/** Set functions put and avg for sizes 8 and 16 and a given qpel position */ |
#define QPEL_FUNCS_SET(OP, PH, PV, OPT) \ |
QPEL_FUNC_SET(OP, 8, PH, PV, OPT) \ |
QPEL_FUNC_SET(OP, 16, PH, PV, OPT) |
/** Set all functions for all sizes and qpel positions */ |
#define QPEL_MC_SET(OP, OPT) \ |
QPEL_FUNCS_SET (OP, 0, 1, OPT) \ |
QPEL_FUNCS_SET (OP, 0, 3, OPT) \ |
QPEL_FUNCS_SET (OP, 1, 0, OPT) \ |
QPEL_FUNCS_SET (OP, 1, 1, OPT) \ |
QPEL_FUNCS_SET (OP, 1, 2, OPT) \ |
QPEL_FUNCS_SET (OP, 1, 3, OPT) \ |
QPEL_FUNCS_SET (OP, 2, 1, OPT) \ |
QPEL_FUNCS_SET (OP, 2, 2, OPT) \ |
QPEL_FUNCS_SET (OP, 2, 3, OPT) \ |
QPEL_FUNCS_SET (OP, 3, 0, OPT) \ |
QPEL_FUNCS_SET (OP, 3, 1, OPT) \ |
QPEL_FUNCS_SET (OP, 3, 2, OPT) |
/** @} */ |
#endif /* HAVE_YASM */ |
#if HAVE_MMX_INLINE |
static void put_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, |
ptrdiff_t stride) |
{ |
ff_put_pixels8_xy2_mmx(dst, src, stride, 8); |
} |
static void put_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, |
ptrdiff_t stride) |
{ |
ff_put_pixels16_xy2_mmx(dst, src, stride, 16); |
} |
static void avg_rv40_qpel8_mc33_mmx(uint8_t *dst, uint8_t *src, |
ptrdiff_t stride) |
{ |
ff_avg_pixels8_xy2_mmx(dst, src, stride, 8); |
} |
static void avg_rv40_qpel16_mc33_mmx(uint8_t *dst, uint8_t *src, |
ptrdiff_t stride) |
{ |
ff_avg_pixels16_xy2_mmx(dst, src, stride, 16); |
} |
#endif /* HAVE_MMX_INLINE */ |
av_cold void ff_rv40dsp_init_x86(RV34DSPContext *c) |
{ |
int cpu_flags = av_get_cpu_flags(); |
#if HAVE_MMX_INLINE |
if (INLINE_MMX(cpu_flags)) { |
c->put_pixels_tab[0][15] = put_rv40_qpel16_mc33_mmx; |
c->put_pixels_tab[1][15] = put_rv40_qpel8_mc33_mmx; |
c->avg_pixels_tab[0][15] = avg_rv40_qpel16_mc33_mmx; |
c->avg_pixels_tab[1][15] = avg_rv40_qpel8_mc33_mmx; |
} |
#endif /* HAVE_MMX_INLINE */ |
#if HAVE_YASM |
if (EXTERNAL_MMX(cpu_flags)) { |
c->put_chroma_pixels_tab[0] = ff_put_rv40_chroma_mc8_mmx; |
c->put_chroma_pixels_tab[1] = ff_put_rv40_chroma_mc4_mmx; |
#if ARCH_X86_32 |
QPEL_MC_SET(put_, _mmx) |
#endif |
} |
if (EXTERNAL_AMD3DNOW(cpu_flags)) { |
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_3dnow; |
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_3dnow; |
#if ARCH_X86_32 |
QPEL_MC_SET(avg_, _3dnow) |
#endif |
} |
if (EXTERNAL_MMXEXT(cpu_flags)) { |
c->avg_chroma_pixels_tab[0] = ff_avg_rv40_chroma_mc8_mmxext; |
c->avg_chroma_pixels_tab[1] = ff_avg_rv40_chroma_mc4_mmxext; |
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_mmxext; |
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_mmxext; |
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_mmxext; |
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_mmxext; |
#if ARCH_X86_32 |
QPEL_MC_SET(avg_, _mmxext) |
#endif |
} |
if (EXTERNAL_SSE2(cpu_flags)) { |
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_sse2; |
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_sse2; |
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_sse2; |
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_sse2; |
QPEL_MC_SET(put_, _sse2) |
QPEL_MC_SET(avg_, _sse2) |
} |
if (EXTERNAL_SSSE3(cpu_flags)) { |
c->rv40_weight_pixels_tab[0][0] = ff_rv40_weight_func_rnd_16_ssse3; |
c->rv40_weight_pixels_tab[0][1] = ff_rv40_weight_func_rnd_8_ssse3; |
c->rv40_weight_pixels_tab[1][0] = ff_rv40_weight_func_nornd_16_ssse3; |
c->rv40_weight_pixels_tab[1][1] = ff_rv40_weight_func_nornd_8_ssse3; |
QPEL_MC_SET(put_, _ssse3) |
QPEL_MC_SET(avg_, _ssse3) |
} |
#endif /* HAVE_YASM */ |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/sbrdsp.asm |
---|
0,0 → 1,425 |
;****************************************************************************** |
;* AAC Spectral Band Replication decoding functions |
;* Copyright (C) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
; mask equivalent for multiply by -1.0 1.0 |
ps_mask times 2 dd 1<<31, 0 |
ps_mask2 times 2 dd 0, 1<<31 |
ps_neg times 4 dd 1<<31 |
ps_noise0 times 2 dd 1.0, 0.0 |
ps_noise2 times 2 dd -1.0, 0.0 |
ps_noise13 dd 0.0, 1.0, 0.0, -1.0 |
dd 0.0, -1.0, 0.0, 1.0 |
dd 0.0, 1.0, 0.0, -1.0 |
cextern sbr_noise_table |
SECTION_TEXT |
INIT_XMM sse |
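; float ff_sbr_sum_square_sse(float (*x)[2], int n) |
; Scalar sketch (our annotation): returns the sum over i < n of |
; x[i][0]^2 + x[i][1]^2; the main loop consumes eight complex pairs |
; per iteration and the tail loop two pairs at a time. |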
cglobal sbr_sum_square, 2, 3, 6 |
mov r2, r1 |
xorps m0, m0 |
xorps m1, m1 |
sar r2, 3 |
jz .prepare |
.loop: |
movu m2, [r0 + 0] |
movu m3, [r0 + 16] |
movu m4, [r0 + 32] |
movu m5, [r0 + 48] |
mulps m2, m2 |
mulps m3, m3 |
mulps m4, m4 |
mulps m5, m5 |
addps m0, m2 |
addps m1, m3 |
addps m0, m4 |
addps m1, m5 |
add r0, 64 |
dec r2 |
jnz .loop |
.prepare: |
and r1, 7 |
sar r1, 1 |
jz .end |
; len is a multiple of 2, thus there are at least 4 elements to process |
.endloop: |
movu m2, [r0] |
add r0, 16 |
mulps m2, m2 |
dec r1 |
addps m0, m2 |
jnz .endloop |
.end: |
addps m0, m1 |
movhlps m2, m0 |
addps m0, m2 |
movss m1, m0 |
shufps m0, m0, 1 |
addss m0, m1 |
%if ARCH_X86_64 == 0 |
movss r0m, m0 |
fld dword r0m |
%endif |
RET |
%define STEP 40*4*2 |
cglobal sbr_hf_g_filt, 5, 6, 5 |
lea r1, [r1 + 8*r4] ; offset by ixh elements into X_high |
mov r5, r3 |
and r3, 0xFC |
lea r2, [r2 + r3*4] |
lea r0, [r0 + r3*8] |
neg r3 |
jz .loop1 |
.loop4: |
movlps m0, [r2 + 4*r3 + 0] |
movlps m1, [r2 + 4*r3 + 8] |
movlps m2, [r1 + 0*STEP] |
movlps m3, [r1 + 2*STEP] |
movhps m2, [r1 + 1*STEP] |
movhps m3, [r1 + 3*STEP] |
unpcklps m0, m0 |
unpcklps m1, m1 |
mulps m0, m2 |
mulps m1, m3 |
movu [r0 + 8*r3 + 0], m0 |
movu [r0 + 8*r3 + 16], m1 |
add r1, 4*STEP |
add r3, 4 |
jnz .loop4 |
and r5, 3 ; number of single element loops |
jz .end |
.loop1: ; element 0 and 1 can be computed at the same time |
movss m0, [r2] |
movlps m2, [r1] |
unpcklps m0, m0 |
mulps m2, m0 |
movlps [r0], m2 |
add r0, 8 |
add r2, 4 |
add r1, STEP |
dec r5 |
jnz .loop1 |
.end: |
RET |
; static void sbr_hf_gen_c(float (*X_high)[2], const float (*X_low)[2], |
; const float alpha0[2], const float alpha1[2], |
; float bw, int start, int end) |
; |
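; Sketch of the recurrence (our annotation, after the scalar version |
; in sbrdsp.c): with <*> denoting complex multiplication, |
;     X_high[i] = X_low[i] + bw   * (alpha0 <*> X_low[i-1]) |
;                          + bw^2 * (alpha1 <*> X_low[i-2]) |
; hence the code below multiplies alpha0 by bw once and alpha1 by bw |
; twice before splatting the real/imaginary parts. |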
cglobal sbr_hf_gen, 4,4,8, X_high, X_low, alpha0, alpha1, BW, S, E |
; load alpha factors |
%define bw m0 |
%if ARCH_X86_64 == 0 || WIN64 |
movss bw, BWm |
%endif |
movlps m2, [alpha1q] |
movlps m1, [alpha0q] |
shufps bw, bw, 0 |
mulps m2, bw ; (a1[0] a1[1])*bw |
mulps m1, bw ; (a0[0] a0[1])*bw = (a2 a3) |
mulps m2, bw ; (a1[0] a1[1])*bw*bw = (a0 a1) |
mova m3, m1 |
mova m4, m2 |
; Set pointers |
%if ARCH_X86_64 == 0 || WIN64 |
; start and end 6th and 7th args on stack |
mov r2d, Sm |
mov r3d, Em |
%define start r2q |
%define end r3q |
%else |
; BW does not actually occupy a register, so shift by 1 |
%define start BWq |
%define end Sq |
%endif |
sub start, end ; neg num of loops |
lea X_highq, [X_highq + end*2*4] |
lea X_lowq, [X_lowq + end*2*4 - 2*2*4] |
shl start, 3 ; offset from num loops |
mova m0, [X_lowq + start] |
shufps m3, m3, q1111 |
shufps m4, m4, q1111 |
xorps m3, [ps_mask] |
shufps m1, m1, q0000 |
shufps m2, m2, q0000 |
xorps m4, [ps_mask] |
.loop2: |
movu m7, [X_lowq + start + 8] ; BbCc |
mova m6, m0 |
mova m5, m7 |
shufps m0, m0, q2301 ; aAbB |
shufps m7, m7, q2301 ; bBcC |
mulps m0, m4 |
mulps m7, m3 |
mulps m6, m2 |
mulps m5, m1 |
addps m7, m0 |
mova m0, [X_lowq + start +16] ; CcDd |
addps m7, m0 |
addps m6, m5 |
addps m7, m6 |
mova [X_highq + start], m7 |
add start, 16 |
jnz .loop2 |
RET |
cglobal sbr_sum64x5, 1,2,4,z |
lea r1q, [zq+ 256] |
.loop: |
mova m0, [zq+ 0] |
mova m2, [zq+ 16] |
mova m1, [zq+ 256] |
mova m3, [zq+ 272] |
addps m0, [zq+ 512] |
addps m2, [zq+ 528] |
addps m1, [zq+ 768] |
addps m3, [zq+ 784] |
addps m0, [zq+1024] |
addps m2, [zq+1040] |
addps m0, m1 |
addps m2, m3 |
mova [zq], m0 |
mova [zq+16], m2 |
add zq, 32 |
cmp zq, r1q |
jne .loop |
REP_RET |
INIT_XMM sse |
cglobal sbr_qmf_post_shuffle, 2,3,4,W,z |
lea r2q, [zq + (64-4)*4] |
mova m3, [ps_neg] |
.loop: |
mova m1, [zq] |
xorps m0, m3, [r2q] |
shufps m0, m0, m0, q0123 |
unpcklps m2, m0, m1 |
unpckhps m0, m0, m1 |
mova [Wq + 0], m2 |
mova [Wq + 16], m0 |
add Wq, 32 |
sub r2q, 16 |
add zq, 16 |
cmp zq, r2q |
jl .loop |
REP_RET |
INIT_XMM sse |
cglobal sbr_neg_odd_64, 1,2,4,z |
lea r1q, [zq+256] |
.loop: |
mova m0, [zq+ 0] |
mova m1, [zq+16] |
mova m2, [zq+32] |
mova m3, [zq+48] |
xorps m0, [ps_mask2] |
xorps m1, [ps_mask2] |
xorps m2, [ps_mask2] |
xorps m3, [ps_mask2] |
mova [zq+ 0], m0 |
mova [zq+16], m1 |
mova [zq+32], m2 |
mova [zq+48], m3 |
add zq, 64 |
cmp zq, r1q |
jne .loop |
REP_RET |
; sbr_qmf_deint_bfly(float *v, const float *src0, const float *src1) |
%macro SBR_QMF_DEINT_BFLY 0 |
cglobal sbr_qmf_deint_bfly, 3,5,8, v,src0,src1,vrev,c |
mov cq, 64*4-2*mmsize |
lea vrevq, [vq + 64*4] |
.loop: |
mova m0, [src0q+cq] |
mova m1, [src1q] |
mova m4, [src0q+cq+mmsize] |
mova m5, [src1q+mmsize] |
%if cpuflag(sse2) |
pshufd m2, m0, q0123 |
pshufd m3, m1, q0123 |
pshufd m6, m4, q0123 |
pshufd m7, m5, q0123 |
%else |
shufps m2, m0, m0, q0123 |
shufps m3, m1, m1, q0123 |
shufps m6, m4, m4, q0123 |
shufps m7, m5, m5, q0123 |
%endif |
addps m5, m2 |
subps m0, m7 |
addps m1, m6 |
subps m4, m3 |
mova [vrevq], m1 |
mova [vrevq+mmsize], m5 |
mova [vq+cq], m0 |
mova [vq+cq+mmsize], m4 |
add src1q, 2*mmsize |
add vrevq, 2*mmsize |
sub cq, 2*mmsize |
jge .loop |
REP_RET |
%endmacro |
INIT_XMM sse |
SBR_QMF_DEINT_BFLY |
INIT_XMM sse2 |
SBR_QMF_DEINT_BFLY |
INIT_XMM sse2 |
cglobal sbr_qmf_pre_shuffle, 1,4,6,z |
%define OFFSET (32*4-2*mmsize) |
mov r3q, OFFSET |
lea r1q, [zq + (32+1)*4] |
lea r2q, [zq + 64*4] |
mova m5, [ps_neg] |
.loop: |
movu m0, [r1q] |
movu m2, [r1q + mmsize] |
movu m1, [zq + r3q + 4 + mmsize] |
movu m3, [zq + r3q + 4] |
pxor m2, m5 |
pxor m0, m5 |
pshufd m2, m2, q0123 |
pshufd m0, m0, q0123 |
SBUTTERFLY dq, 2, 3, 4 |
SBUTTERFLY dq, 0, 1, 4 |
mova [r2q + 2*r3q + 0*mmsize], m2 |
mova [r2q + 2*r3q + 1*mmsize], m3 |
mova [r2q + 2*r3q + 2*mmsize], m0 |
mova [r2q + 2*r3q + 3*mmsize], m1 |
add r1q, 2*mmsize |
sub r3q, 2*mmsize |
jge .loop |
movq m2, [zq] |
movq [r2q], m2 |
REP_RET |
%ifdef PIC |
%define NREGS 1 |
%if UNIX64 |
%define NOISE_TABLE r6q ; r5q is m_max |
%else |
%define NOISE_TABLE r5q |
%endif |
%else |
%define NREGS 0 |
%define NOISE_TABLE sbr_noise_table |
%endif |
%macro LOAD_NST 1 |
%ifdef PIC |
lea NOISE_TABLE, [%1] |
mova m0, [kxq + NOISE_TABLE] |
%else |
mova m0, [kxq + %1] |
%endif |
%endmacro |
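; Under PIC the absolute address of sbr_noise_table cannot be encoded in a |
; memory operand, so LOAD_NST materializes it with lea into a spare GPR |
; (the extra register accounted for by NREGS); otherwise the symbol is |
; referenced directly. |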
INIT_XMM sse2 |
; sbr_hf_apply_noise_0(float (*Y)[2], const float *s_m, |
; const float *q_filt, int noise, |
; int kx, int m_max) |
cglobal sbr_hf_apply_noise_0, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max |
mova m0, [ps_noise0] |
jmp apply_noise_main |
; sbr_hf_apply_noise_1(float (*Y)[2], const float *s_m, |
; const float *q_filt, int noise, |
; int kx, int m_max) |
cglobal sbr_hf_apply_noise_1, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max |
and kxq, 1 |
shl kxq, 4 |
LOAD_NST ps_noise13 |
jmp apply_noise_main |
; sbr_hf_apply_noise_2(float (*Y)[2], const float *s_m, |
; const float *q_filt, int noise, |
; int kx, int m_max) |
cglobal sbr_hf_apply_noise_2, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max |
mova m0, [ps_noise2] |
jmp apply_noise_main |
; sbr_hf_apply_noise_3(float (*Y)[2], const float *s_m, |
; const float *q_filt, int noise, |
; int kx, int m_max) |
cglobal sbr_hf_apply_noise_3, 5,5+NREGS+UNIX64,8, Y,s_m,q_filt,noise,kx,m_max |
and kxq, 1 |
shl kxq, 4 |
LOAD_NST ps_noise13+16 |
apply_noise_main: |
%if ARCH_X86_64 == 0 || WIN64 |
mov kxd, m_maxm |
%define count kxq |
%else |
%define count m_maxq |
%endif |
dec noiseq |
shl count, 2 |
%ifdef PIC |
lea NOISE_TABLE, [sbr_noise_table] |
%endif |
lea Yq, [Yq + 2*count] |
add s_mq, count |
add q_filtq, count |
shl noiseq, 3 |
pxor m5, m5 |
neg count |
.loop: |
mova m1, [q_filtq + count] |
movu m3, [noiseq + NOISE_TABLE + 1*mmsize] |
movu m4, [noiseq + NOISE_TABLE + 2*mmsize] |
add noiseq, 2*mmsize |
and noiseq, 0x1ff<<3 |
punpckhdq m2, m1, m1 |
punpckldq m1, m1 |
mulps m1, m3 ; m1 = q_filt[m] * ff_sbr_noise_table[noise] |
mulps m2, m4 ; m2 = q_filt[m] * ff_sbr_noise_table[noise] |
mova m3, [s_mq + count] |
; TODO: replace by a vpermd in AVX2 |
punpckhdq m4, m3, m3 |
punpckldq m3, m3 |
pcmpeqd m6, m3, m5 ; m6 = all-ones mask where s_m[m] == 0 |
pcmpeqd m7, m4, m5 ; m7 = all-ones mask where s_m[m] == 0 |
mulps m3, m0 ; s_m[m] * phi_sign |
mulps m4, m0 ; s_m[m] * phi_sign |
pand m1, m6 |
pand m2, m7 |
movu m6, [Yq + 2*count] |
movu m7, [Yq + 2*count + mmsize] |
addps m3, m1 |
addps m4, m2 |
addps m6, m3 |
addps m7, m4 |
movu [Yq + 2*count], m6 |
movu [Yq + 2*count + mmsize], m7 |
add count, mmsize |
jl .loop |
RET |
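; Scalar sketch of the loop above, following the generic C implementation; |
; the pcmpeqd masks make the s_m[m] == 0 selection branchless: |
;   for (m = 0; m < m_max; m++) { |
;       noise = (noise + 1) & 0x1ff; |
;       if (s_m[m]) {              /* phi_sign0/1 are the constants in m0 */ |
;           Y[m][0] += s_m[m] * phi_sign0; |
;           Y[m][1] += s_m[m] * phi_sign1; |
;       } else { |
;           Y[m][0] += q_filt[m] * sbr_noise_table[noise][0]; |
;           Y[m][1] += q_filt[m] * sbr_noise_table[noise][1]; |
;       } |
;   } |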
/contrib/sdk/sources/ffmpeg/libavcodec/x86/sbrdsp_init.c |
---|
0,0 → 1,76 |
/* |
* AAC Spectral Band Replication decoding functions |
* Copyright (c) 2012 Christophe Gisquet <christophe.gisquet@gmail.com> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "config.h" |
#include "libavutil/attributes.h" |
#include "libavutil/cpu.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/sbrdsp.h" |
float ff_sbr_sum_square_sse(float (*x)[2], int n); |
void ff_sbr_sum64x5_sse(float *z); |
void ff_sbr_hf_g_filt_sse(float (*Y)[2], const float (*X_high)[40][2], |
const float *g_filt, int m_max, intptr_t ixh); |
void ff_sbr_hf_gen_sse(float (*X_high)[2], const float (*X_low)[2], |
const float alpha0[2], const float alpha1[2], |
float bw, int start, int end); |
void ff_sbr_neg_odd_64_sse(float *z); |
void ff_sbr_qmf_post_shuffle_sse(float W[32][2], const float *z); |
void ff_sbr_qmf_deint_bfly_sse(float *v, const float *src0, const float *src1); |
void ff_sbr_qmf_deint_bfly_sse2(float *v, const float *src0, const float *src1); |
void ff_sbr_qmf_pre_shuffle_sse2(float *z); |
void ff_sbr_hf_apply_noise_0_sse2(float (*Y)[2], const float *s_m, |
const float *q_filt, int noise, |
int kx, int m_max); |
void ff_sbr_hf_apply_noise_1_sse2(float (*Y)[2], const float *s_m, |
const float *q_filt, int noise, |
int kx, int m_max); |
void ff_sbr_hf_apply_noise_2_sse2(float (*Y)[2], const float *s_m, |
const float *q_filt, int noise, |
int kx, int m_max); |
void ff_sbr_hf_apply_noise_3_sse2(float (*Y)[2], const float *s_m, |
const float *q_filt, int noise, |
int kx, int m_max); |
av_cold void ff_sbrdsp_init_x86(SBRDSPContext *s) |
{ |
int cpu_flags = av_get_cpu_flags(); |
if (EXTERNAL_SSE(cpu_flags)) { |
s->neg_odd_64 = ff_sbr_neg_odd_64_sse; |
s->sum_square = ff_sbr_sum_square_sse; |
s->sum64x5 = ff_sbr_sum64x5_sse; |
s->hf_g_filt = ff_sbr_hf_g_filt_sse; |
s->hf_gen = ff_sbr_hf_gen_sse; |
s->qmf_post_shuffle = ff_sbr_qmf_post_shuffle_sse; |
s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse; |
} |
if (EXTERNAL_SSE2(cpu_flags)) { |
s->qmf_deint_bfly = ff_sbr_qmf_deint_bfly_sse2; |
s->qmf_pre_shuffle = ff_sbr_qmf_pre_shuffle_sse2; |
s->hf_apply_noise[0] = ff_sbr_hf_apply_noise_0_sse2; |
s->hf_apply_noise[1] = ff_sbr_hf_apply_noise_1_sse2; |
s->hf_apply_noise[2] = ff_sbr_hf_apply_noise_2_sse2; |
s->hf_apply_noise[3] = ff_sbr_hf_apply_noise_3_sse2; |
} |
} |
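/* A minimal usage sketch (hypothetical driver code; the real hookup lives in |
 * libavcodec/sbrdsp.c, where ff_sbrdsp_init() installs the C defaults and |
 * then calls this function to override them on x86): |
 * |
 *     SBRDSPContext dsp; |
 *     ff_sbrdsp_init(&dsp);  // C defaults, then ff_sbrdsp_init_x86(&dsp) |
 *     dsp.sum64x5(z);        // ff_sbr_sum64x5_sse on SSE-capable CPUs |
 */ |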
/contrib/sdk/sources/ffmpeg/libavcodec/x86/simple_idct.c |
---|
0,0 → 1,1167 |
/* |
* Simple IDCT MMX |
* |
* Copyright (c) 2001, 2002 Michael Niedermayer <michaelni@gmx.at> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavcodec/simple_idct.h" |
#include "libavutil/mem.h" |
#include "dsputil_x86.h" |
#if HAVE_INLINE_ASM |
/* |
23170.475006 |
22725.260826 |
21406.727617 |
19265.545870 |
16384.000000 |
12872.826198 |
8866.956905 |
4520.335430 |
*/ |
#define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
#define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
#define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
#define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
#define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 |
#define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
#define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
#define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
#define ROW_SHIFT 11 |
#define COL_SHIFT 20 // 6 |
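/* The constants above are the cosine table in 2.14 fixed point; e.g. for C1 |
 * (an illustrative check against the float values listed): |
 *     cos(1*M_PI/16) * sqrt(2) * (1 << 14) = 22725.260826  ->  C1 = 22725 |
 * C4 alone is rounded down (16383 rather than 16384), presumably so the |
 * 16-bit pmaddwd products cannot overflow. */ |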
DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL; |
DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL; |
DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= { |
1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, |
// 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0, |
// 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16), |
1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, |
// the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :) |
// 0, 0, 0, 0, |
// 0, 0, 0, 0, |
C4, C4, C4, C4, |
C4, -C4, C4, -C4, |
C2, C6, C2, C6, |
C6, -C2, C6, -C2, |
C1, C3, C1, C3, |
C5, C7, C5, C7, |
C3, -C7, C3, -C7, |
-C1, -C5, -C1, -C5, |
C5, -C1, C5, -C1, |
C7, C3, C7, C3, |
C7, -C5, C7, -C5, |
C3, -C1, C3, -C1 |
}; |
static inline void idct(int16_t *block) |
{ |
LOCAL_ALIGNED_8(int64_t, align_tmp, [16]); |
int16_t * const temp= (int16_t*)align_tmp; |
__asm__ volatile( |
#if 0 // Alternative, simpler variant |
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
#rounder ", %%mm4 \n\t"\ |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ |
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
#rounder ", %%mm0 \n\t"\ |
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
"paddd %%mm0, %%mm0 \n\t" \ |
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ |
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ |
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ |
"psrad $" #shift ", %%mm7 \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ |
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
"psrad $" #shift ", %%mm1 \n\t"\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ |
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ |
"movq %%mm7, " #dst " \n\t"\ |
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
"movq %%mm2, 24+" #dst " \n\t"\ |
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"psrad $" #shift ", %%mm0 \n\t"\ |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
"psrad $" #shift ", %%mm6 \n\t"\ |
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ |
"movq %%mm2, 8+" #dst " \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ |
"movq %%mm4, 16+" #dst " \n\t"\ |
#define COL_IDCT(src0, src4, src1, src5, dst, shift) \ |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ |
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ |
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ |
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ |
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ |
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ |
"psrad $" #shift ", %%mm7 \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
"psrad $" #shift ", %%mm0 \n\t"\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
"movd %%mm7, " #dst " \n\t"\ |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
"movd %%mm0, 16+" #dst " \n\t"\ |
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
"movd %%mm2, 96+" #dst " \n\t"\ |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"movd %%mm4, 112+" #dst " \n\t"\ |
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"psrad $" #shift ", %%mm5 \n\t"\ |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
"psrad $" #shift ", %%mm6 \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"movd %%mm2, 32+" #dst " \n\t"\ |
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
"movd %%mm6, 48+" #dst " \n\t"\ |
"movd %%mm4, 64+" #dst " \n\t"\ |
"movd %%mm5, 80+" #dst " \n\t"\ |
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
"movq "MANGLE(wm1010)", %%mm4 \n\t"\ |
"pand %%mm0, %%mm4 \n\t"\ |
"por %%mm1, %%mm4 \n\t"\ |
"por %%mm2, %%mm4 \n\t"\ |
"por %%mm3, %%mm4 \n\t"\ |
"packssdw %%mm4,%%mm4 \n\t"\ |
"movd %%mm4, %%eax \n\t"\ |
"orl %%eax, %%eax \n\t"\ |
"jz 1f \n\t"\ |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
#rounder ", %%mm4 \n\t"\ |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ |
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
#rounder ", %%mm0 \n\t"\ |
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
"paddd %%mm0, %%mm0 \n\t" \ |
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ |
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ |
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ |
"psrad $" #shift ", %%mm7 \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ |
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
"psrad $" #shift ", %%mm1 \n\t"\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ |
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ |
"movq %%mm7, " #dst " \n\t"\ |
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
"movq %%mm2, 24+" #dst " \n\t"\ |
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"psrad $" #shift ", %%mm0 \n\t"\ |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
"psrad $" #shift ", %%mm6 \n\t"\ |
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ |
"movq %%mm2, 8+" #dst " \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ |
"movq %%mm4, 16+" #dst " \n\t"\ |
"jmp 2f \n\t"\ |
"1: \n\t"\ |
"pslld $16, %%mm0 \n\t"\ |
"#paddd "MANGLE(d40000)", %%mm0 \n\t"\ |
"psrad $13, %%mm0 \n\t"\ |
"packssdw %%mm0, %%mm0 \n\t"\ |
"movq %%mm0, " #dst " \n\t"\ |
"movq %%mm0, 8+" #dst " \n\t"\ |
"movq %%mm0, 16+" #dst " \n\t"\ |
"movq %%mm0, 24+" #dst " \n\t"\ |
"2: \n\t" |
//IDCT( src0, src4, src1, src5, dst, rounder, shift) |
ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) |
/*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11) |
ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11) |
ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/ |
DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11) |
DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11) |
DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11) |
//IDCT( src0, src4, src1, src5, dst, shift) |
COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
#else |
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
"movq "MANGLE(wm1010)", %%mm4 \n\t"\ |
"pand %%mm0, %%mm4 \n\t"\ |
"por %%mm1, %%mm4 \n\t"\ |
"por %%mm2, %%mm4 \n\t"\ |
"por %%mm3, %%mm4 \n\t"\ |
"packssdw %%mm4,%%mm4 \n\t"\ |
"movd %%mm4, %%eax \n\t"\ |
"orl %%eax, %%eax \n\t"\ |
"jz 1f \n\t"\ |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
#rounder ", %%mm4 \n\t"\ |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ |
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
#rounder ", %%mm0 \n\t"\ |
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
"paddd %%mm0, %%mm0 \n\t" \ |
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ |
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ |
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ |
"psrad $" #shift ", %%mm7 \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ |
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
"psrad $" #shift ", %%mm1 \n\t"\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ |
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ |
"movq %%mm7, " #dst " \n\t"\ |
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
"movq %%mm2, 24+" #dst " \n\t"\ |
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"psrad $" #shift ", %%mm0 \n\t"\ |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
"psrad $" #shift ", %%mm6 \n\t"\ |
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ |
"movq %%mm2, 8+" #dst " \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ |
"movq %%mm4, 16+" #dst " \n\t"\ |
"jmp 2f \n\t"\ |
"1: \n\t"\ |
"pslld $16, %%mm0 \n\t"\ |
"paddd "MANGLE(d40000)", %%mm0 \n\t"\ |
"psrad $13, %%mm0 \n\t"\ |
"packssdw %%mm0, %%mm0 \n\t"\ |
"movq %%mm0, " #dst " \n\t"\ |
"movq %%mm0, 8+" #dst " \n\t"\ |
"movq %%mm0, 16+" #dst " \n\t"\ |
"movq %%mm0, 24+" #dst " \n\t"\ |
"2: \n\t" |
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \ |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
"movq %%mm0, %%mm4 \n\t"\ |
"por %%mm1, %%mm4 \n\t"\ |
"por %%mm2, %%mm4 \n\t"\ |
"por %%mm3, %%mm4 \n\t"\ |
"packssdw %%mm4,%%mm4 \n\t"\ |
"movd %%mm4, %%eax \n\t"\ |
"orl %%eax, %%eax \n\t"\ |
"jz " #bt " \n\t"\ |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
#rounder ", %%mm4 \n\t"\ |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ |
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
#rounder ", %%mm0 \n\t"\ |
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
"paddd %%mm0, %%mm0 \n\t" \ |
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ |
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ |
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ |
"psrad $" #shift ", %%mm7 \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ |
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
"psrad $" #shift ", %%mm1 \n\t"\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ |
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ |
"movq %%mm7, " #dst " \n\t"\ |
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
"movq %%mm2, 24+" #dst " \n\t"\ |
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"psrad $" #shift ", %%mm0 \n\t"\ |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
"psrad $" #shift ", %%mm6 \n\t"\ |
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ |
"movq %%mm2, 8+" #dst " \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ |
"movq %%mm4, 16+" #dst " \n\t"\ |
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
#rounder ", %%mm4 \n\t"\ |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
"movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ |
"pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
#rounder ", %%mm0 \n\t"\ |
"paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
"paddd %%mm0, %%mm0 \n\t" \ |
"psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
"paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ |
"movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ |
"pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ |
"psrad $" #shift ", %%mm7 \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
"paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ |
"psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
"psrad $" #shift ", %%mm1 \n\t"\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ |
"packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ |
"movq %%mm7, " #dst " \n\t"\ |
"movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
"movq %%mm2, 24+" #dst " \n\t"\ |
"pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
"pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
"movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
"psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"psrad $" #shift ", %%mm0 \n\t"\ |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
"paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
"psrad $" #shift ", %%mm6 \n\t"\ |
"packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ |
"movq %%mm2, 8+" #dst " \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ |
"movq %%mm4, 16+" #dst " \n\t"\ |
//IDCT( src0, src4, src1, src5, dst, rounder, shift) |
DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) |
Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f) |
Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f) |
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) |
#undef IDCT |
#define IDCT(src0, src4, src1, src5, dst, shift) \ |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ |
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ |
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ |
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ |
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ |
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ |
"psrad $" #shift ", %%mm7 \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
"psrad $" #shift ", %%mm0 \n\t"\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
"movd %%mm7, " #dst " \n\t"\ |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
"movd %%mm0, 16+" #dst " \n\t"\ |
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
"movd %%mm2, 96+" #dst " \n\t"\ |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"movd %%mm4, 112+" #dst " \n\t"\ |
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"psrad $" #shift ", %%mm5 \n\t"\ |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
"psrad $" #shift ", %%mm6 \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"movd %%mm2, 32+" #dst " \n\t"\ |
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
"movd %%mm6, 48+" #dst " \n\t"\ |
"movd %%mm4, 64+" #dst " \n\t"\ |
"movd %%mm5, 80+" #dst " \n\t" |
//IDCT( src0, src4, src1, src5, dst, shift) |
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
"jmp 9f \n\t" |
"# .p2align 4 \n\t"\ |
"4: \n\t" |
Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f) |
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) |
#undef IDCT |
#define IDCT(src0, src4, src1, src5, dst, shift) \ |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ |
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ |
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ |
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
"movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ |
"pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
"paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
"psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"psrad $" #shift ", %%mm1 \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
"paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
"psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
"psrad $" #shift ", %%mm0 \n\t"\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
"movd %%mm1, " #dst " \n\t"\ |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
"movd %%mm0, 16+" #dst " \n\t"\ |
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
"movd %%mm2, 96+" #dst " \n\t"\ |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"movd %%mm4, 112+" #dst " \n\t"\ |
"movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ |
"pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
"paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
"psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"psrad $" #shift ", %%mm5 \n\t"\ |
"movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ |
"psrad $" #shift ", %%mm6 \n\t"\ |
"psrad $" #shift ", %%mm1 \n\t"\ |
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"movd %%mm2, 32+" #dst " \n\t"\ |
"packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
"movd %%mm6, 48+" #dst " \n\t"\ |
"movd %%mm1, 64+" #dst " \n\t"\ |
"movd %%mm5, 80+" #dst " \n\t" |
//IDCT( src0, src4, src1, src5, dst, shift) |
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
"jmp 9f \n\t" |
"# .p2align 4 \n\t"\ |
"6: \n\t" |
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f) |
#undef IDCT |
#define IDCT(src0, src4, src1, src5, dst, shift) \ |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ |
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
"movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ |
"pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
"paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
"psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"psrad $" #shift ", %%mm1 \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
"paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
"psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
"psrad $" #shift ", %%mm0 \n\t"\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
"movd %%mm1, " #dst " \n\t"\ |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
"movd %%mm0, 16+" #dst " \n\t"\ |
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
"movd %%mm2, 96+" #dst " \n\t"\ |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"movd %%mm4, 112+" #dst " \n\t"\ |
"movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ |
"pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
"paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
"psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"psrad $" #shift ", %%mm5 \n\t"\ |
"movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ |
"psrad $" #shift ", %%mm6 \n\t"\ |
"psrad $" #shift ", %%mm1 \n\t"\ |
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"movd %%mm2, 32+" #dst " \n\t"\ |
"packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
"movd %%mm6, 48+" #dst " \n\t"\ |
"movd %%mm1, 64+" #dst " \n\t"\ |
"movd %%mm5, 80+" #dst " \n\t" |
//IDCT( src0, src4, src1, src5, dst, shift) |
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
"jmp 9f \n\t" |
"# .p2align 4 \n\t"\ |
"2: \n\t" |
Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) |
#undef IDCT |
#define IDCT(src0, src4, src1, src5, dst, shift) \ |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
"movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ |
"pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
"pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
"paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ |
"movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ |
"pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ |
"psrad $" #shift ", %%mm7 \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
"psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
"psrad $" #shift ", %%mm0 \n\t"\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
"movd %%mm7, " #dst " \n\t"\ |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
"movd %%mm0, 16+" #dst " \n\t"\ |
"packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
"movd %%mm2, 96+" #dst " \n\t"\ |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"movd %%mm4, 112+" #dst " \n\t"\ |
"movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
"pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
"movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
"pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
"pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
"movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
"pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
"paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
"paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"psrad $" #shift ", %%mm5 \n\t"\ |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
"paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ |
"paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
"psrad $" #shift ", %%mm6 \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"movd %%mm2, 32+" #dst " \n\t"\ |
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
"movd %%mm6, 48+" #dst " \n\t"\ |
"movd %%mm4, 64+" #dst " \n\t"\ |
"movd %%mm5, 80+" #dst " \n\t" |
//IDCT( src0, src4, src1, src5, dst, shift) |
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
"jmp 9f \n\t" |
"# .p2align 4 \n\t"\ |
"3: \n\t" |
#undef IDCT |
#define IDCT(src0, src4, src1, src5, dst, shift) \ |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"movq 64(%2), %%mm3 \n\t"\ |
"pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"psrad $" #shift ", %%mm7 \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"movq %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
"paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
"psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\ |
"psrad $" #shift ", %%mm0 \n\t"\ |
"psrad $" #shift ", %%mm1 \n\t"\ |
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
"movd %%mm7, " #dst " \n\t"\ |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
"movd %%mm0, 16+" #dst " \n\t"\ |
"packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\ |
"movd %%mm1, 96+" #dst " \n\t"\ |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"movd %%mm4, 112+" #dst " \n\t"\ |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
"pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
"pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
"movq %%mm5, %%mm1 \n\t" /* A2 a2 */\ |
"paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\ |
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
"psrad $" #shift ", %%mm1 \n\t"\ |
"psrad $" #shift ", %%mm5 \n\t"\ |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
"paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
"psrad $" #shift ", %%mm6 \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\ |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"movd %%mm1, 32+" #dst " \n\t"\ |
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
"movd %%mm6, 48+" #dst " \n\t"\ |
"movd %%mm4, 64+" #dst " \n\t"\ |
"movd %%mm5, 80+" #dst " \n\t" |
//IDCT( src0, src4, src1, src5, dst, shift) |
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
"jmp 9f \n\t" |
"# .p2align 4 \n\t"\ |
"5: \n\t" |
#undef IDCT |
#define IDCT(src0, src4, src1, src5, dst, shift) \ |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ |
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ |
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ |
"movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\ |
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ |
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ |
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ |
"pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
"pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
"paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\ |
"paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\ |
"psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\ |
"paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\ |
"paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\ |
"psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"psrad $" #shift ", %%mm7 \n\t"\ |
"psrad $" #shift ", %%mm3 \n\t"\ |
"packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\ |
"movq %%mm4, " #dst " \n\t"\ |
"psrad $" #shift ", %%mm0 \n\t"\ |
"packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\ |
"movq %%mm0, 16+" #dst " \n\t"\ |
"movq %%mm0, 96+" #dst " \n\t"\ |
"movq %%mm4, 112+" #dst " \n\t"\ |
"psrad $" #shift ", %%mm5 \n\t"\ |
"psrad $" #shift ", %%mm6 \n\t"\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
"movq %%mm5, 32+" #dst " \n\t"\ |
"psrad $" #shift ", %%mm1 \n\t"\ |
"packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"movq %%mm6, 48+" #dst " \n\t"\ |
"movq %%mm6, 64+" #dst " \n\t"\ |
"movq %%mm5, 80+" #dst " \n\t" |
//IDCT( src0, src4, src1, src5, dst, shift) |
IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
"jmp 9f \n\t" |
"# .p2align 4 \n\t"\ |
"1: \n\t" |
#undef IDCT |
#define IDCT(src0, src4, src1, src5, dst, shift) \ |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
"movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
"movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
"pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
"movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
"pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
"movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
"pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
"paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
"psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
"movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ |
"psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ |
"movq 64(%2), %%mm1 \n\t"\ |
"pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
"paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
"paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
"psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"psrad $" #shift ", %%mm7 \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"movq %%mm0, %%mm3 \n\t" /* A1 a1 */\ |
"paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
"psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\ |
"psrad $" #shift ", %%mm0 \n\t"\ |
"psrad $" #shift ", %%mm3 \n\t"\ |
"packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
"movd %%mm7, " #dst " \n\t"\ |
"packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
"movd %%mm0, 16+" #dst " \n\t"\ |
"packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\ |
"movd %%mm3, 96+" #dst " \n\t"\ |
"packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
"movd %%mm4, 112+" #dst " \n\t"\ |
"movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
"pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
"pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
"movq %%mm5, %%mm3 \n\t" /* A2 a2 */\ |
"paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\ |
"psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
"psrad $" #shift ", %%mm3 \n\t"\ |
"psrad $" #shift ", %%mm5 \n\t"\ |
"movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
"paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
"psrad $" #shift ", %%mm6 \n\t"\ |
"packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\ |
"movd %%mm3, 32+" #dst " \n\t"\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
"movd %%mm6, 48+" #dst " \n\t"\ |
"packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ |
"packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
"movd %%mm4, 64+" #dst " \n\t"\ |
"movd %%mm5, 80+" #dst " \n\t" |
//IDCT( src0, src4, src1, src5, dst, shift) |
IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
"jmp 9f \n\t" |
"# .p2align 4 \n\t" |
"7: \n\t" |
#undef IDCT |
#define IDCT(src0, src4, src1, src5, dst, shift) \ |
"movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
"movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
"pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
"pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"psrad $" #shift ", %%mm4 \n\t"\ |
"psrad $" #shift ", %%mm0 \n\t"\ |
"movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ |
"movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ |
"pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
"movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ |
"pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
"movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ |
"psrad $" #shift ", %%mm1 \n\t"\ |
"packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\ |
"movq %%mm4, " #dst " \n\t"\ |
"psrad $" #shift ", %%mm2 \n\t"\ |
"packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\ |
"movq %%mm0, 16+" #dst " \n\t"\ |
"movq %%mm0, 96+" #dst " \n\t"\ |
"movq %%mm4, 112+" #dst " \n\t"\ |
"movq %%mm0, 32+" #dst " \n\t"\ |
"movq %%mm4, 48+" #dst " \n\t"\ |
"movq %%mm4, 64+" #dst " \n\t"\ |
"movq %%mm0, 80+" #dst " \n\t" |
//IDCT( src0, src4, src1, src5, dst, shift) |
IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
//IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
//IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
#endif |
/* |
Input |
00 40 04 44 20 60 24 64 |
10 30 14 34 50 70 54 74 |
01 41 03 43 21 61 23 63 |
11 31 13 33 51 71 53 73 |
02 42 06 46 22 62 26 66 |
12 32 16 36 52 72 56 76 |
05 45 07 47 25 65 27 67 |
15 35 17 37 55 75 57 77 |
Temp |
00 04 10 14 20 24 30 34 |
40 44 50 54 60 64 70 74 |
01 03 11 13 21 23 31 33 |
41 43 51 53 61 63 71 73 |
02 06 12 16 22 26 32 36 |
42 46 52 56 62 66 72 76 |
05 07 15 17 25 27 35 37 |
45 47 55 57 65 67 75 77 |
*/ |
"9: \n\t" |
:: "r" (block), "r" (temp), "r" (coeffs) |
: "%eax" |
); |
} |
void ff_simple_idct_mmx(int16_t *block) |
{ |
idct(block); |
} |
//FIXME merge add/put into the idct |
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block) |
{ |
idct(block); |
ff_put_pixels_clamped_mmx(block, dest, line_size); |
} |
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block) |
{ |
idct(block); |
ff_add_pixels_clamped_mmx(block, dest, line_size); |
} |
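/* The put/add wrappers clamp the 16-bit IDCT output to 8-bit pixels; a |
 * scalar sketch of what ff_put_pixels_clamped_mmx computes: |
 *     for (i = 0; i < 8; i++) |
 *         for (j = 0; j < 8; j++) |
 *             dest[i * line_size + j] = av_clip_uint8(block[i * 8 + j]); |
 * ff_add_pixels_clamped_mmx adds block[] to dest[] before clamping. */ |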
#endif /* HAVE_INLINE_ASM */ |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/snowdsp.c |
---|
0,0 → 1,902 |
/* |
* MMX and SSE2 optimized snow DSP utils |
* Copyright (c) 2005-2006 Robert Edele <yartrebo@earthlink.net> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/cpu.h" |
#include "libavutil/x86/asm.h" |
#include "libavcodec/avcodec.h" |
#include "libavcodec/snow.h" |
#include "libavcodec/snow_dwt.h" |
#include "dsputil_x86.h" |
#if HAVE_INLINE_ASM |
static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){ |
const int w2= (width+1)>>1; |
const int w_l= (width>>1); |
const int w_r= w2 - 1; |
int i; |
{ // Lift 0 |
IDWTELEM * const ref = b + w2 - 1; |
IDWTELEM b_0 = b[0]; // By allowing the first entry in b[0] to be calculated twice |
// (the first time erroneously), we allow the SSE2 code to run an extra pass. |
// The savings in code and time are well worth having to store this value and |
// calculate b[0] correctly afterwards. |
i = 0; |
__asm__ volatile( |
"pcmpeqd %%xmm7, %%xmm7 \n\t" |
"pcmpeqd %%xmm3, %%xmm3 \n\t" |
"psllw $1, %%xmm3 \n\t" |
"paddw %%xmm7, %%xmm3 \n\t" |
"psllw $13, %%xmm3 \n\t" |
::); |
for(; i<w_l-15; i+=16){ |
__asm__ volatile( |
"movdqu (%1), %%xmm1 \n\t" |
"movdqu 16(%1), %%xmm5 \n\t" |
"movdqu 2(%1), %%xmm2 \n\t" |
"movdqu 18(%1), %%xmm6 \n\t" |
"paddw %%xmm1, %%xmm2 \n\t" |
"paddw %%xmm5, %%xmm6 \n\t" |
"paddw %%xmm7, %%xmm2 \n\t" |
"paddw %%xmm7, %%xmm6 \n\t" |
"pmulhw %%xmm3, %%xmm2 \n\t" |
"pmulhw %%xmm3, %%xmm6 \n\t" |
"paddw (%0), %%xmm2 \n\t" |
"paddw 16(%0), %%xmm6 \n\t" |
"movdqa %%xmm2, (%0) \n\t" |
"movdqa %%xmm6, 16(%0) \n\t" |
:: "r"(&b[i]), "r"(&ref[i]) |
: "memory" |
); |
} |
snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); |
b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); |
} |
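// Scalar form of this lifting step (a sketch; the lead-out helper applies |
// the same formula to the unaligned tail): |
//     b[i] -= (W_DM * (ref[i] + ref[i + 1]) + W_DO) >> W_DS; |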
{ // Lift 1 |
IDWTELEM * const dst = b+w2; |
i = 0; |
for(; (((x86_reg)&dst[i]) & 0x1F) && i<w_r; i++){ |
dst[i] = dst[i] - (b[i] + b[i + 1]); |
} |
for(; i<w_r-15; i+=16){ |
__asm__ volatile( |
"movdqu (%1), %%xmm1 \n\t" |
"movdqu 16(%1), %%xmm5 \n\t" |
"movdqu 2(%1), %%xmm2 \n\t" |
"movdqu 18(%1), %%xmm6 \n\t" |
"paddw %%xmm1, %%xmm2 \n\t" |
"paddw %%xmm5, %%xmm6 \n\t" |
"movdqa (%0), %%xmm0 \n\t" |
"movdqa 16(%0), %%xmm4 \n\t" |
"psubw %%xmm2, %%xmm0 \n\t" |
"psubw %%xmm6, %%xmm4 \n\t" |
"movdqa %%xmm0, (%0) \n\t" |
"movdqa %%xmm4, 16(%0) \n\t" |
:: "r"(&dst[i]), "r"(&b[i]) |
: "memory" |
); |
} |
snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); |
} |
{ // Lift 2 |
IDWTELEM * const ref = b+w2 - 1; |
IDWTELEM b_0 = b[0]; |
i = 0; |
__asm__ volatile( |
"psllw $15, %%xmm7 \n\t" |
"pcmpeqw %%xmm6, %%xmm6 \n\t" |
"psrlw $13, %%xmm6 \n\t" |
"paddw %%xmm7, %%xmm6 \n\t" |
::); |
for(; i<w_l-15; i+=16){ |
__asm__ volatile( |
"movdqu (%1), %%xmm0 \n\t" |
"movdqu 16(%1), %%xmm4 \n\t" |
"movdqu 2(%1), %%xmm1 \n\t" |
"movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts |
"paddw %%xmm6, %%xmm0 \n\t" |
"paddw %%xmm6, %%xmm4 \n\t" |
"paddw %%xmm7, %%xmm1 \n\t" |
"paddw %%xmm7, %%xmm5 \n\t" |
"pavgw %%xmm1, %%xmm0 \n\t" |
"pavgw %%xmm5, %%xmm4 \n\t" |
"psubw %%xmm7, %%xmm0 \n\t" |
"psubw %%xmm7, %%xmm4 \n\t" |
"psraw $1, %%xmm0 \n\t" |
"psraw $1, %%xmm4 \n\t" |
"movdqa (%0), %%xmm1 \n\t" |
"movdqa 16(%0), %%xmm5 \n\t" |
"paddw %%xmm1, %%xmm0 \n\t" |
"paddw %%xmm5, %%xmm4 \n\t" |
"psraw $2, %%xmm0 \n\t" |
"psraw $2, %%xmm4 \n\t" |
"paddw %%xmm1, %%xmm0 \n\t" |
"paddw %%xmm5, %%xmm4 \n\t" |
"movdqa %%xmm0, (%0) \n\t" |
"movdqa %%xmm4, 16(%0) \n\t" |
:: "r"(&b[i]), "r"(&ref[i]) |
: "memory" |
); |
} |
snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); |
b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS); |
} |
{ // Lift 3 |
IDWTELEM * const src = b+w2; |
i = 0; |
for(; (((x86_reg)&temp[i]) & 0x1F) && i<w_r; i++){ |
temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS); |
} |
for(; i<w_r-7; i+=8){ |
__asm__ volatile( |
"movdqu 2(%1), %%xmm2 \n\t" |
"movdqu 18(%1), %%xmm6 \n\t" |
"paddw (%1), %%xmm2 \n\t" |
"paddw 16(%1), %%xmm6 \n\t" |
"movdqu (%0), %%xmm0 \n\t" |
"movdqu 16(%0), %%xmm4 \n\t" |
"paddw %%xmm2, %%xmm0 \n\t" |
"paddw %%xmm6, %%xmm4 \n\t" |
"psraw $1, %%xmm2 \n\t" |
"psraw $1, %%xmm6 \n\t" |
"paddw %%xmm0, %%xmm2 \n\t" |
"paddw %%xmm4, %%xmm6 \n\t" |
"movdqa %%xmm2, (%2) \n\t" |
"movdqa %%xmm6, 16(%2) \n\t" |
:: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) |
: "memory" |
); |
} |
snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); |
} |
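/* Final stage: re-interleave the low-pass half of b with the high-pass
 * coefficients in temp, i.e. b[2*k] = b[k], b[2*k+1] = temp[k], walking
 * backwards so b can be rewritten in place; each punpcklwd/punpckhwd pair
 * interleaves 8 word pairs at a time. */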
{ |
snow_interleave_line_header(&i, width, b, temp); |
for (; (i & 0x3E) != 0x3E; i-=2){ |
b[i+1] = temp[i>>1]; |
b[i] = b[i>>1]; |
} |
for (i-=62; i>=0; i-=64){ |
__asm__ volatile( |
"movdqa (%1), %%xmm0 \n\t" |
"movdqa 16(%1), %%xmm2 \n\t" |
"movdqa 32(%1), %%xmm4 \n\t" |
"movdqa 48(%1), %%xmm6 \n\t" |
"movdqa (%1), %%xmm1 \n\t" |
"movdqa 16(%1), %%xmm3 \n\t" |
"movdqa 32(%1), %%xmm5 \n\t" |
"movdqa 48(%1), %%xmm7 \n\t" |
"punpcklwd (%2), %%xmm0 \n\t" |
"punpcklwd 16(%2), %%xmm2 \n\t" |
"punpcklwd 32(%2), %%xmm4 \n\t" |
"punpcklwd 48(%2), %%xmm6 \n\t" |
"movdqa %%xmm0, (%0) \n\t" |
"movdqa %%xmm2, 32(%0) \n\t" |
"movdqa %%xmm4, 64(%0) \n\t" |
"movdqa %%xmm6, 96(%0) \n\t" |
"punpckhwd (%2), %%xmm1 \n\t" |
"punpckhwd 16(%2), %%xmm3 \n\t" |
"punpckhwd 32(%2), %%xmm5 \n\t" |
"punpckhwd 48(%2), %%xmm7 \n\t" |
"movdqa %%xmm1, 16(%0) \n\t" |
"movdqa %%xmm3, 48(%0) \n\t" |
"movdqa %%xmm5, 80(%0) \n\t" |
"movdqa %%xmm7, 112(%0) \n\t" |
:: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1]) |
: "memory" |
); |
} |
} |
} |
static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){ |
const int w2= (width+1)>>1; |
const int w_l= (width>>1); |
const int w_r= w2 - 1; |
int i; |
{ // Lift 0 |
IDWTELEM * const ref = b + w2 - 1; |
i = 1; |
b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); |
__asm__ volatile( |
"pcmpeqw %%mm7, %%mm7 \n\t" |
"pcmpeqw %%mm3, %%mm3 \n\t" |
"psllw $1, %%mm3 \n\t" |
"paddw %%mm7, %%mm3 \n\t" |
"psllw $13, %%mm3 \n\t" |
::); |
for(; i<w_l-7; i+=8){ |
__asm__ volatile( |
"movq (%1), %%mm2 \n\t" |
"movq 8(%1), %%mm6 \n\t" |
"paddw 2(%1), %%mm2 \n\t" |
"paddw 10(%1), %%mm6 \n\t" |
"paddw %%mm7, %%mm2 \n\t" |
"paddw %%mm7, %%mm6 \n\t" |
"pmulhw %%mm3, %%mm2 \n\t" |
"pmulhw %%mm3, %%mm6 \n\t" |
"paddw (%0), %%mm2 \n\t" |
"paddw 8(%0), %%mm6 \n\t" |
"movq %%mm2, (%0) \n\t" |
"movq %%mm6, 8(%0) \n\t" |
:: "r"(&b[i]), "r"(&ref[i]) |
: "memory" |
); |
} |
snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); |
} |
{ // Lift 1 |
IDWTELEM * const dst = b+w2; |
i = 0; |
for(; i<w_r-7; i+=8){ |
__asm__ volatile( |
"movq (%1), %%mm2 \n\t" |
"movq 8(%1), %%mm6 \n\t" |
"paddw 2(%1), %%mm2 \n\t" |
"paddw 10(%1), %%mm6 \n\t" |
"movq (%0), %%mm0 \n\t" |
"movq 8(%0), %%mm4 \n\t" |
"psubw %%mm2, %%mm0 \n\t" |
"psubw %%mm6, %%mm4 \n\t" |
"movq %%mm0, (%0) \n\t" |
"movq %%mm4, 8(%0) \n\t" |
:: "r"(&dst[i]), "r"(&b[i]) |
: "memory" |
); |
} |
snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); |
} |
{ // Lift 2 |
IDWTELEM * const ref = b+w2 - 1; |
i = 1; |
b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS); |
__asm__ volatile( |
"psllw $15, %%mm7 \n\t" |
"pcmpeqw %%mm6, %%mm6 \n\t" |
"psrlw $13, %%mm6 \n\t" |
"paddw %%mm7, %%mm6 \n\t" |
::); |
for(; i<w_l-7; i+=8){ |
__asm__ volatile( |
"movq (%1), %%mm0 \n\t" |
"movq 8(%1), %%mm4 \n\t" |
"movq 2(%1), %%mm1 \n\t" |
"movq 10(%1), %%mm5 \n\t" |
"paddw %%mm6, %%mm0 \n\t" |
"paddw %%mm6, %%mm4 \n\t" |
"paddw %%mm7, %%mm1 \n\t" |
"paddw %%mm7, %%mm5 \n\t" |
"pavgw %%mm1, %%mm0 \n\t" |
"pavgw %%mm5, %%mm4 \n\t" |
"psubw %%mm7, %%mm0 \n\t" |
"psubw %%mm7, %%mm4 \n\t" |
"psraw $1, %%mm0 \n\t" |
"psraw $1, %%mm4 \n\t" |
"movq (%0), %%mm1 \n\t" |
"movq 8(%0), %%mm5 \n\t" |
"paddw %%mm1, %%mm0 \n\t" |
"paddw %%mm5, %%mm4 \n\t" |
"psraw $2, %%mm0 \n\t" |
"psraw $2, %%mm4 \n\t" |
"paddw %%mm1, %%mm0 \n\t" |
"paddw %%mm5, %%mm4 \n\t" |
"movq %%mm0, (%0) \n\t" |
"movq %%mm4, 8(%0) \n\t" |
:: "r"(&b[i]), "r"(&ref[i]) |
: "memory" |
); |
} |
snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); |
} |
{ // Lift 3 |
IDWTELEM * const src = b+w2; |
i = 0; |
for(; i<w_r-7; i+=8){ |
__asm__ volatile( |
"movq 2(%1), %%mm2 \n\t" |
"movq 10(%1), %%mm6 \n\t" |
"paddw (%1), %%mm2 \n\t" |
"paddw 8(%1), %%mm6 \n\t" |
"movq (%0), %%mm0 \n\t" |
"movq 8(%0), %%mm4 \n\t" |
"paddw %%mm2, %%mm0 \n\t" |
"paddw %%mm6, %%mm4 \n\t" |
"psraw $1, %%mm2 \n\t" |
"psraw $1, %%mm6 \n\t" |
"paddw %%mm0, %%mm2 \n\t" |
"paddw %%mm4, %%mm6 \n\t" |
"movq %%mm2, (%2) \n\t" |
"movq %%mm6, 8(%2) \n\t" |
:: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) |
: "memory" |
); |
} |
snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); |
} |
{ |
snow_interleave_line_header(&i, width, b, temp); |
for (; (i & 0x1E) != 0x1E; i-=2){ |
b[i+1] = temp[i>>1]; |
b[i] = b[i>>1]; |
} |
for (i-=30; i>=0; i-=32){ |
__asm__ volatile( |
"movq (%1), %%mm0 \n\t" |
"movq 8(%1), %%mm2 \n\t" |
"movq 16(%1), %%mm4 \n\t" |
"movq 24(%1), %%mm6 \n\t" |
"movq (%1), %%mm1 \n\t" |
"movq 8(%1), %%mm3 \n\t" |
"movq 16(%1), %%mm5 \n\t" |
"movq 24(%1), %%mm7 \n\t" |
"punpcklwd (%2), %%mm0 \n\t" |
"punpcklwd 8(%2), %%mm2 \n\t" |
"punpcklwd 16(%2), %%mm4 \n\t" |
"punpcklwd 24(%2), %%mm6 \n\t" |
"movq %%mm0, (%0) \n\t" |
"movq %%mm2, 16(%0) \n\t" |
"movq %%mm4, 32(%0) \n\t" |
"movq %%mm6, 48(%0) \n\t" |
"punpckhwd (%2), %%mm1 \n\t" |
"punpckhwd 8(%2), %%mm3 \n\t" |
"punpckhwd 16(%2), %%mm5 \n\t" |
"punpckhwd 24(%2), %%mm7 \n\t" |
"movq %%mm1, 8(%0) \n\t" |
"movq %%mm3, 24(%0) \n\t" |
"movq %%mm5, 40(%0) \n\t" |
"movq %%mm7, 56(%0) \n\t" |
:: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1]) |
: "memory" |
); |
} |
} |
} |
#if HAVE_7REGS |
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\ |
""op" ("r",%%"REG_d"), %%"t0" \n\t"\ |
""op" 16("r",%%"REG_d"), %%"t1" \n\t"\ |
""op" 32("r",%%"REG_d"), %%"t2" \n\t"\ |
""op" 48("r",%%"REG_d"), %%"t3" \n\t" |
#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\ |
snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3) |
#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\ |
snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3) |
#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\ |
"psubw %%"s0", %%"t0" \n\t"\ |
"psubw %%"s1", %%"t1" \n\t"\ |
"psubw %%"s2", %%"t2" \n\t"\ |
"psubw %%"s3", %%"t3" \n\t" |
#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\ |
"movdqa %%"s0", ("w",%%"REG_d") \n\t"\ |
"movdqa %%"s1", 16("w",%%"REG_d") \n\t"\ |
"movdqa %%"s2", 32("w",%%"REG_d") \n\t"\ |
"movdqa %%"s3", 48("w",%%"REG_d") \n\t" |
#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\ |
"psraw $"n", %%"t0" \n\t"\ |
"psraw $"n", %%"t1" \n\t"\ |
"psraw $"n", %%"t2" \n\t"\ |
"psraw $"n", %%"t3" \n\t" |
#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\ |
"paddw %%"s0", %%"t0" \n\t"\ |
"paddw %%"s1", %%"t1" \n\t"\ |
"paddw %%"s2", %%"t2" \n\t"\ |
"paddw %%"s3", %%"t3" \n\t" |
#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\ |
"pmulhw %%"s0", %%"t0" \n\t"\ |
"pmulhw %%"s1", %%"t1" \n\t"\ |
"pmulhw %%"s2", %%"t2" \n\t"\ |
"pmulhw %%"s3", %%"t3" \n\t" |
#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\ |
"movdqa %%"s0", %%"t0" \n\t"\ |
"movdqa %%"s1", %%"t1" \n\t"\ |
"movdqa %%"s2", %%"t2" \n\t"\ |
"movdqa %%"s3", %%"t3" \n\t" |
static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ |
x86_reg i = width; |
while(i & 0x1F) |
{ |
i--; |
b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; |
b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; |
b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; |
b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; |
} |
i+=i; |
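/* After i += i, i is a byte offset (IDWTELEM is 16 bits wide). The asm
 * below applies the same four lifting steps as the scalar loop above,
 * 32 coefficients (64 bytes) per iteration, walking down from the end of
 * the rows. */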
__asm__ volatile ( |
"jmp 2f \n\t" |
"1: \n\t" |
snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") |
snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6") |
"pcmpeqw %%xmm0, %%xmm0 \n\t" |
"pcmpeqw %%xmm2, %%xmm2 \n\t" |
"paddw %%xmm2, %%xmm2 \n\t" |
"paddw %%xmm0, %%xmm2 \n\t" |
"psllw $13, %%xmm2 \n\t" |
snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7") |
snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7") |
snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7") |
snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7") |
snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") |
snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7") |
snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") |
snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6") |
"pcmpeqw %%xmm7, %%xmm7 \n\t" |
"pcmpeqw %%xmm5, %%xmm5 \n\t" |
"psllw $15, %%xmm7 \n\t" |
"psrlw $13, %%xmm5 \n\t" |
"paddw %%xmm7, %%xmm5 \n\t" |
snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6") |
"movq (%2,%%"REG_d"), %%xmm1 \n\t" |
"movq 8(%2,%%"REG_d"), %%xmm3 \n\t" |
"paddw %%xmm7, %%xmm1 \n\t" |
"paddw %%xmm7, %%xmm3 \n\t" |
"pavgw %%xmm1, %%xmm0 \n\t" |
"pavgw %%xmm3, %%xmm2 \n\t" |
"movq 16(%2,%%"REG_d"), %%xmm1 \n\t" |
"movq 24(%2,%%"REG_d"), %%xmm3 \n\t" |
"paddw %%xmm7, %%xmm1 \n\t" |
"paddw %%xmm7, %%xmm3 \n\t" |
"pavgw %%xmm1, %%xmm4 \n\t" |
"pavgw %%xmm3, %%xmm6 \n\t" |
snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6") |
snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") |
snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") |
snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6") |
snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") |
snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6") |
snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6") |
snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") |
snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") |
snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") |
snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6") |
snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6") |
"2: \n\t" |
"sub $64, %%"REG_d" \n\t" |
"jge 1b \n\t" |
:"+d"(i) |
:"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); |
} |
#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\ |
""op" ("r",%%"REG_d"), %%"t0" \n\t"\ |
""op" 8("r",%%"REG_d"), %%"t1" \n\t"\ |
""op" 16("r",%%"REG_d"), %%"t2" \n\t"\ |
""op" 24("r",%%"REG_d"), %%"t3" \n\t" |
#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\ |
snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3) |
#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\ |
snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3) |
#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\ |
"movq %%"s0", ("w",%%"REG_d") \n\t"\ |
"movq %%"s1", 8("w",%%"REG_d") \n\t"\ |
"movq %%"s2", 16("w",%%"REG_d") \n\t"\ |
"movq %%"s3", 24("w",%%"REG_d") \n\t" |
#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\ |
"movq %%"s0", %%"t0" \n\t"\ |
"movq %%"s1", %%"t1" \n\t"\ |
"movq %%"s2", %%"t2" \n\t"\ |
"movq %%"s3", %%"t3" \n\t" |
static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ |
x86_reg i = width; |
while(i & 15) |
{ |
i--; |
b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; |
b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; |
b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; |
b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; |
} |
i+=i; |
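/* Same structure as the SSE2 version above, processing 16 coefficients
 * (32 bytes) per iteration in MMX registers. */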
__asm__ volatile( |
"jmp 2f \n\t" |
"1: \n\t" |
snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7") |
snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7") |
"pcmpeqw %%mm0, %%mm0 \n\t" |
"pcmpeqw %%mm2, %%mm2 \n\t" |
"paddw %%mm2, %%mm2 \n\t" |
"paddw %%mm0, %%mm2 \n\t" |
"psllw $13, %%mm2 \n\t" |
snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7") |
snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7") |
snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7") |
snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7") |
snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6") |
snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7") |
snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") |
snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6") |
"pcmpeqw %%mm7, %%mm7 \n\t" |
"pcmpeqw %%mm5, %%mm5 \n\t" |
"psllw $15, %%mm7 \n\t" |
"psrlw $13, %%mm5 \n\t" |
"paddw %%mm7, %%mm5 \n\t" |
snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6") |
"movq (%2,%%"REG_d"), %%mm1 \n\t" |
"movq 8(%2,%%"REG_d"), %%mm3 \n\t" |
"paddw %%mm7, %%mm1 \n\t" |
"paddw %%mm7, %%mm3 \n\t" |
"pavgw %%mm1, %%mm0 \n\t" |
"pavgw %%mm3, %%mm2 \n\t" |
"movq 16(%2,%%"REG_d"), %%mm1 \n\t" |
"movq 24(%2,%%"REG_d"), %%mm3 \n\t" |
"paddw %%mm7, %%mm1 \n\t" |
"paddw %%mm7, %%mm3 \n\t" |
"pavgw %%mm1, %%mm4 \n\t" |
"pavgw %%mm3, %%mm6 \n\t" |
snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6") |
snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") |
snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") |
snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6") |
snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") |
snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6") |
snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6") |
snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") |
snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") |
snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") |
snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6") |
snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6") |
"2: \n\t" |
"sub $32, %%"REG_d" \n\t" |
"jge 1b \n\t" |
:"+d"(i) |
:"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); |
} |
#endif //HAVE_7REGS |
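/* The inner_add_yblock helpers below implement the OBMC accumulation step:
 * for every output pixel, up to four neighbouring predicted blocks (the
 * pointers in block[], kept in REG_a) are weighted by the obmc window
 * (walked via REG_S) and summed at 16-bit precision, then combined with the
 * IDWT slice-buffer line and, when writing to dst8, rounded back down to
 * 8 bits. See ff_snow_inner_add_yblock for the scalar reference. */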
#define snow_inner_add_yblock_sse2_header \ |
IDWTELEM * * dst_array = sb->line + src_y;\ |
x86_reg tmp;\ |
__asm__ volatile(\ |
"mov %7, %%"REG_c" \n\t"\ |
"mov %6, %2 \n\t"\ |
"mov %4, %%"REG_S" \n\t"\ |
"pxor %%xmm7, %%xmm7 \n\t" /* 0 */\ |
"pcmpeqd %%xmm3, %%xmm3 \n\t"\ |
"psllw $15, %%xmm3 \n\t"\ |
"psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\ |
"1: \n\t"\ |
"mov %1, %%"REG_D" \n\t"\ |
"mov (%%"REG_D"), %%"REG_D" \n\t"\ |
"add %3, %%"REG_D" \n\t" |
#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\ |
"mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ |
"movq (%%"REG_d"), %%"out_reg1" \n\t"\ |
"movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\ |
"punpcklbw %%xmm7, %%"out_reg1" \n\t"\ |
"punpcklbw %%xmm7, %%"out_reg2" \n\t"\ |
"movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ |
"movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\ |
"punpcklbw %%xmm7, %%xmm0 \n\t"\ |
"punpcklbw %%xmm7, %%xmm4 \n\t"\ |
"pmullw %%xmm0, %%"out_reg1" \n\t"\ |
"pmullw %%xmm4, %%"out_reg2" \n\t" |
#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\ |
"mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ |
"movq (%%"REG_d"), %%"out_reg1" \n\t"\ |
"movq 8(%%"REG_d"), %%"out_reg2" \n\t"\ |
"punpcklbw %%xmm7, %%"out_reg1" \n\t"\ |
"punpcklbw %%xmm7, %%"out_reg2" \n\t"\ |
"movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\ |
"movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\ |
"punpcklbw %%xmm7, %%xmm0 \n\t"\ |
"punpcklbw %%xmm7, %%xmm4 \n\t"\ |
"pmullw %%xmm0, %%"out_reg1" \n\t"\ |
"pmullw %%xmm4, %%"out_reg2" \n\t" |
#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \ |
snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\ |
"paddusw %%xmm2, %%xmm1 \n\t"\ |
"paddusw %%xmm6, %%xmm5 \n\t" |
#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \ |
snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\ |
"paddusw %%xmm2, %%xmm1 \n\t"\ |
"paddusw %%xmm6, %%xmm5 \n\t" |
#define snow_inner_add_yblock_sse2_end_common1\ |
"add $32, %%"REG_S" \n\t"\ |
"add %%"REG_c", %0 \n\t"\ |
"add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ |
"add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ |
"add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ |
"add %%"REG_c", (%%"REG_a") \n\t" |
#define snow_inner_add_yblock_sse2_end_common2\ |
"jnz 1b \n\t"\ |
:"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ |
:\ |
"rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\ |
"%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); |
#define snow_inner_add_yblock_sse2_end_8\ |
"sal $1, %%"REG_c" \n\t"\ |
"add"OPSIZE" $"PTR_SIZE"*2, %1 \n\t"\ |
snow_inner_add_yblock_sse2_end_common1\ |
"sar $1, %%"REG_c" \n\t"\ |
"sub $2, %2 \n\t"\ |
snow_inner_add_yblock_sse2_end_common2 |
#define snow_inner_add_yblock_sse2_end_16\ |
"add"OPSIZE" $"PTR_SIZE"*1, %1 \n\t"\ |
snow_inner_add_yblock_sse2_end_common1\ |
"dec %2 \n\t"\ |
snow_inner_add_yblock_sse2_end_common2 |
static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, |
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
snow_inner_add_yblock_sse2_header |
snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0") |
snow_inner_add_yblock_sse2_accum_8("2", "8") |
snow_inner_add_yblock_sse2_accum_8("1", "128") |
snow_inner_add_yblock_sse2_accum_8("0", "136") |
"mov %0, %%"REG_d" \n\t" |
"movdqa (%%"REG_D"), %%xmm0 \n\t" |
"movdqa %%xmm1, %%xmm2 \n\t" |
"punpckhwd %%xmm7, %%xmm1 \n\t" |
"punpcklwd %%xmm7, %%xmm2 \n\t" |
"paddd %%xmm2, %%xmm0 \n\t" |
"movdqa 16(%%"REG_D"), %%xmm2 \n\t" |
"paddd %%xmm1, %%xmm2 \n\t" |
"paddd %%xmm3, %%xmm0 \n\t" |
"paddd %%xmm3, %%xmm2 \n\t" |
"mov %1, %%"REG_D" \n\t" |
"mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t" |
"add %3, %%"REG_D" \n\t" |
"movdqa (%%"REG_D"), %%xmm4 \n\t" |
"movdqa %%xmm5, %%xmm6 \n\t" |
"punpckhwd %%xmm7, %%xmm5 \n\t" |
"punpcklwd %%xmm7, %%xmm6 \n\t" |
"paddd %%xmm6, %%xmm4 \n\t" |
"movdqa 16(%%"REG_D"), %%xmm6 \n\t" |
"paddd %%xmm5, %%xmm6 \n\t" |
"paddd %%xmm3, %%xmm4 \n\t" |
"paddd %%xmm3, %%xmm6 \n\t" |
"psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */ |
"psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */ |
"packssdw %%xmm2, %%xmm0 \n\t" |
"packuswb %%xmm7, %%xmm0 \n\t" |
"movq %%xmm0, (%%"REG_d") \n\t" |
"psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */ |
"psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */ |
"packssdw %%xmm6, %%xmm4 \n\t" |
"packuswb %%xmm7, %%xmm4 \n\t" |
"movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t" |
snow_inner_add_yblock_sse2_end_8 |
} |
static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, |
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
snow_inner_add_yblock_sse2_header |
snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0") |
snow_inner_add_yblock_sse2_accum_16("2", "16") |
snow_inner_add_yblock_sse2_accum_16("1", "512") |
snow_inner_add_yblock_sse2_accum_16("0", "528") |
"mov %0, %%"REG_d" \n\t" |
"psrlw $4, %%xmm1 \n\t" |
"psrlw $4, %%xmm5 \n\t" |
"paddw (%%"REG_D"), %%xmm1 \n\t" |
"paddw 16(%%"REG_D"), %%xmm5 \n\t" |
"paddw %%xmm3, %%xmm1 \n\t" |
"paddw %%xmm3, %%xmm5 \n\t" |
"psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */ |
"psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */ |
"packuswb %%xmm5, %%xmm1 \n\t" |
"movdqu %%xmm1, (%%"REG_d") \n\t" |
snow_inner_add_yblock_sse2_end_16 |
} |
#define snow_inner_add_yblock_mmx_header \ |
IDWTELEM * * dst_array = sb->line + src_y;\ |
x86_reg tmp;\ |
__asm__ volatile(\ |
"mov %7, %%"REG_c" \n\t"\ |
"mov %6, %2 \n\t"\ |
"mov %4, %%"REG_S" \n\t"\ |
"pxor %%mm7, %%mm7 \n\t" /* 0 */\ |
"pcmpeqd %%mm3, %%mm3 \n\t"\ |
"psllw $15, %%mm3 \n\t"\ |
"psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\ |
"1: \n\t"\ |
"mov %1, %%"REG_D" \n\t"\ |
"mov (%%"REG_D"), %%"REG_D" \n\t"\ |
"add %3, %%"REG_D" \n\t" |
#define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\ |
"mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\ |
"movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t"\ |
"movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t"\ |
"punpcklbw %%mm7, %%"out_reg1" \n\t"\ |
"punpcklbw %%mm7, %%"out_reg2" \n\t"\ |
"movd "s_offset"(%%"REG_S"), %%mm0 \n\t"\ |
"movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t"\ |
"punpcklbw %%mm7, %%mm0 \n\t"\ |
"punpcklbw %%mm7, %%mm4 \n\t"\ |
"pmullw %%mm0, %%"out_reg1" \n\t"\ |
"pmullw %%mm4, %%"out_reg2" \n\t" |
#define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \ |
snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\ |
"paddusw %%mm2, %%mm1 \n\t"\ |
"paddusw %%mm6, %%mm5 \n\t" |
#define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\ |
"mov %0, %%"REG_d" \n\t"\ |
"psrlw $4, %%mm1 \n\t"\ |
"psrlw $4, %%mm5 \n\t"\ |
"paddw "read_offset"(%%"REG_D"), %%mm1 \n\t"\ |
"paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t"\ |
"paddw %%mm3, %%mm1 \n\t"\ |
"paddw %%mm3, %%mm5 \n\t"\ |
"psraw $4, %%mm1 \n\t"\ |
"psraw $4, %%mm5 \n\t"\ |
"packuswb %%mm5, %%mm1 \n\t"\ |
"movq %%mm1, "write_offset"(%%"REG_d") \n\t" |
#define snow_inner_add_yblock_mmx_end(s_step)\ |
"add $"s_step", %%"REG_S" \n\t"\ |
"add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\ |
"add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ |
"add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ |
"add %%"REG_c", (%%"REG_a") \n\t"\ |
"add"OPSIZE " $"PTR_SIZE"*1, %1 \n\t"\ |
"add %%"REG_c", %0 \n\t"\ |
"dec %2 \n\t"\ |
"jnz 1b \n\t"\ |
:"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ |
:\ |
"rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\ |
"%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); |
static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, |
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
snow_inner_add_yblock_mmx_header |
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") |
snow_inner_add_yblock_mmx_accum("2", "8", "0") |
snow_inner_add_yblock_mmx_accum("1", "128", "0") |
snow_inner_add_yblock_mmx_accum("0", "136", "0") |
snow_inner_add_yblock_mmx_mix("0", "0") |
snow_inner_add_yblock_mmx_end("16") |
} |
static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, |
int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
snow_inner_add_yblock_mmx_header |
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") |
snow_inner_add_yblock_mmx_accum("2", "16", "0") |
snow_inner_add_yblock_mmx_accum("1", "512", "0") |
snow_inner_add_yblock_mmx_accum("0", "528", "0") |
snow_inner_add_yblock_mmx_mix("0", "0") |
snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8") |
snow_inner_add_yblock_mmx_accum("2", "24", "8") |
snow_inner_add_yblock_mmx_accum("1", "520", "8") |
snow_inner_add_yblock_mmx_accum("0", "536", "8") |
snow_inner_add_yblock_mmx_mix("16", "8") |
snow_inner_add_yblock_mmx_end("32") |
} |
static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, |
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
if (b_w == 16) |
inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
else if (b_w == 8 && obmc_stride == 16) { |
if (!(b_h & 1)) |
inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
else |
inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
} else |
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
} |
static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, |
int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
if (b_w == 16) |
inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
else if (b_w == 8 && obmc_stride == 16) |
inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
else |
ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
} |
#endif /* HAVE_INLINE_ASM */ |
void ff_dwt_init_x86(SnowDWTContext *c) |
{ |
#if HAVE_INLINE_ASM |
int mm_flags = av_get_cpu_flags(); |
if (mm_flags & AV_CPU_FLAG_MMX) { |
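/* Note: the "& 0" below makes the condition always false, so the SSE2
 * compose functions are compiled but deliberately never selected
 * (presumably disabled pending fixes); only the MMX paths are installed. */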
if(mm_flags & AV_CPU_FLAG_SSE2 & 0){ |
c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; |
#if HAVE_7REGS |
c->vertical_compose97i = ff_snow_vertical_compose97i_sse2; |
#endif |
c->inner_add_yblock = ff_snow_inner_add_yblock_sse2; |
} |
else{ |
if (mm_flags & AV_CPU_FLAG_MMXEXT) { |
c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx; |
#if HAVE_7REGS |
c->vertical_compose97i = ff_snow_vertical_compose97i_mmx; |
#endif |
} |
c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; |
} |
} |
#endif /* HAVE_INLINE_ASM */ |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/v210-init.c |
---|
0,0 → 1,48 |
/* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/cpu.h" |
#include "libavcodec/v210dec.h" |
extern void ff_v210_planar_unpack_unaligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); |
extern void ff_v210_planar_unpack_unaligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); |
extern void ff_v210_planar_unpack_aligned_ssse3(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); |
extern void ff_v210_planar_unpack_aligned_avx(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width); |
av_cold void v210_x86_init(V210DecContext *s) |
{ |
int cpu_flags = av_get_cpu_flags(); |
#if HAVE_YASM |
if (s->aligned_input) { |
if (cpu_flags & AV_CPU_FLAG_SSSE3) |
s->unpack_frame = ff_v210_planar_unpack_aligned_ssse3; |
if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX) |
s->unpack_frame = ff_v210_planar_unpack_aligned_avx; |
} |
else { |
if (cpu_flags & AV_CPU_FLAG_SSSE3) |
s->unpack_frame = ff_v210_planar_unpack_unaligned_ssse3; |
if (HAVE_AVX_EXTERNAL && cpu_flags & AV_CPU_FLAG_AVX) |
s->unpack_frame = ff_v210_planar_unpack_unaligned_avx; |
} |
#endif |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/v210.asm |
---|
0,0 → 1,88 |
;****************************************************************************** |
;* V210 SIMD unpack |
;* Copyright (c) 2011 Loren Merritt <lorenm@u.washington.edu> |
;* Copyright (c) 2011 Kieran Kunhya <kieran@kunhya.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
v210_mask: times 4 dd 0x3ff |
v210_mult: dw 64,4,64,4,64,4,64,4 |
v210_luma_shuf: db 8,9,0,1,2,3,12,13,4,5,6,7,-1,-1,-1,-1 |
v210_chroma_shuf: db 0,1,8,9,6,7,-1,-1,2,3,4,5,12,13,-1,-1 |
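; v210 packs three 10-bit components into each little-endian dword
; (bits 0-9, 10-19, 20-29; the top 2 bits are padding), six 4:2:2 pixels
; per 16-byte group. In the loop below, pmullw by v210_mult (x64 on the low
; word, x4 on the high word) followed by psrlw 6 recovers the low and high
; components of every dword, while psrld 10 + pand v210_mask recovers the
; middle one; the pshufb masks then gather luma and chroma into plane order.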
SECTION .text |
%macro v210_planar_unpack 2 |
; v210_planar_unpack(const uint32_t *src, uint16_t *y, uint16_t *u, uint16_t *v, int width) |
cglobal v210_planar_unpack_%1_%2, 5, 5, 7 |
movsxdifnidn r4, r4d |
lea r1, [r1+2*r4] |
add r2, r4 |
add r3, r4 |
neg r4 |
mova m3, [v210_mult] |
mova m4, [v210_mask] |
mova m5, [v210_luma_shuf] |
mova m6, [v210_chroma_shuf] |
.loop |
%ifidn %1, unaligned |
movu m0, [r0] |
%else |
mova m0, [r0] |
%endif |
pmullw m1, m0, m3 |
psrld m0, 10 |
psrlw m1, 6 ; u0 v0 y1 y2 v1 u2 y4 y5 |
pand m0, m4 ; y0 __ u1 __ y3 __ v2 __ |
shufps m2, m1, m0, 0x8d ; y1 y2 y4 y5 y0 __ y3 __ |
pshufb m2, m5 ; y0 y1 y2 y3 y4 y5 __ __ |
movu [r1+2*r4], m2 |
shufps m1, m0, 0xd8 ; u0 v0 v1 u2 u1 __ v2 __ |
pshufb m1, m6 ; u0 u1 u2 __ v0 v1 v2 __ |
movq [r2+r4], m1 |
movhps [r3+r4], m1 |
add r0, mmsize |
add r4, 6 |
jl .loop |
REP_RET |
%endmacro |
INIT_XMM |
v210_planar_unpack unaligned, ssse3 |
%if HAVE_AVX_EXTERNAL |
INIT_AVX |
v210_planar_unpack unaligned, avx |
%endif |
INIT_XMM |
v210_planar_unpack aligned, ssse3 |
%if HAVE_AVX_EXTERNAL |
INIT_AVX |
v210_planar_unpack aligned, avx |
%endif |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vc1dsp.asm |
---|
0,0 → 1,317 |
;****************************************************************************** |
;* VC1 deblocking optimizations |
;* Copyright (c) 2009 David Conrad |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
cextern pw_4 |
cextern pw_5 |
section .text |
; %1: unpack suffix (bw), %2: dst_low, %3: dst_high (src), %4: zero
; zero-extends one vector from 8 to 16 bits |
%macro UNPACK_8TO16 4 |
mova m%2, m%3 |
punpckh%1 m%3, m%4 |
punpckl%1 m%2, m%4 |
%endmacro |
%macro STORE_4_WORDS 6 |
%if cpuflag(sse4) |
pextrw %1, %5, %6+0 |
pextrw %2, %5, %6+1 |
pextrw %3, %5, %6+2 |
pextrw %4, %5, %6+3 |
%else |
movd %6d, %5 |
%if mmsize==16 |
psrldq %5, 4 |
%else |
psrlq %5, 32 |
%endif |
mov %1, %6w |
shr %6, 16 |
mov %2, %6w |
movd %6d, %5 |
mov %3, %6w |
shr %6, 16 |
mov %4, %6w |
%endif |
%endmacro |
; in: p1 p0 q0 q1, clobbers p0 |
; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3 |
%macro VC1_LOOP_FILTER_A0 4 |
psubw %1, %4 |
psubw %2, %3 |
paddw %1, %1 |
pmullw %2, [pw_5] |
psubw %1, %2 |
paddw %1, [pw_4] |
psraw %1, 3 |
%endmacro |
; in: p0 q0 a0 a1 a2 |
; m0 m1 m7 m6 m5 |
; %1: size |
; out: m0=p0' m1=q0' |
%macro VC1_FILTER 1 |
PABSW m4, m7 |
PABSW m3, m6 |
PABSW m2, m5 |
mova m6, m4 |
pminsw m3, m2 |
pcmpgtw m6, m3 ; if (a2 < a0 || a1 < a0) |
psubw m3, m4 |
pmullw m3, [pw_5] ; 5*(a3 - a0) |
PABSW m2, m3 |
psraw m2, 3 ; abs(d/8) |
pxor m7, m3 ; d_sign ^= a0_sign |
pxor m5, m5 |
movd m3, r2d |
%if %1 > 4 |
punpcklbw m3, m3 |
%endif |
punpcklbw m3, m5 |
pcmpgtw m3, m4 ; if (a0 < pq) |
pand m6, m3 |
mova m3, m0 |
psubw m3, m1 |
PABSW m4, m3 |
psraw m4, 1 |
pxor m3, m7 ; d_sign ^ clip_sign |
psraw m3, 15 |
pminsw m2, m4 ; min(d, clip) |
pcmpgtw m4, m5 |
pand m6, m4 ; filt3 (C return value) |
; each set of 4 pixels is not filtered if the 3rd is not |
%if mmsize==16 |
pshuflw m4, m6, 0xaa |
%if %1 > 4 |
pshufhw m4, m4, 0xaa |
%endif |
%else |
pshufw m4, m6, 0xaa |
%endif |
pandn m3, m4 |
pand m2, m6 |
pand m3, m2 ; d final |
psraw m7, 15 |
pxor m3, m7 |
psubw m3, m7 |
psubw m0, m3 |
paddw m1, m3 |
packuswb m0, m0 |
packuswb m1, m1 |
%endmacro |
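; Summary of the decision logic above (a sketch of the scalar reference in
; vc1dsp.c): a pixel pair is filtered only when a0 < pq, min(a1,a2) < a0 and
; clip = |p0 - q0|/2 is non-zero; then d = |5*(min(a1,a2) - a0)| >> 3,
; limited to clip and given the derived sign, is applied as p0 -= d, q0 += d.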
; 1st param: size of filter |
; 2nd param: mov suffix equivalent to the filter size |
%macro VC1_V_LOOP_FILTER 2 |
pxor m5, m5 |
mov%2 m6, [r4] |
mov%2 m4, [r4+r1] |
mov%2 m7, [r4+2*r1] |
mov%2 m0, [r4+r3] |
punpcklbw m6, m5 |
punpcklbw m4, m5 |
punpcklbw m7, m5 |
punpcklbw m0, m5 |
VC1_LOOP_FILTER_A0 m6, m4, m7, m0 |
mov%2 m1, [r0] |
mov%2 m2, [r0+r1] |
punpcklbw m1, m5 |
punpcklbw m2, m5 |
mova m4, m0 |
VC1_LOOP_FILTER_A0 m7, m4, m1, m2 |
mov%2 m3, [r0+2*r1] |
mov%2 m4, [r0+r3] |
punpcklbw m3, m5 |
punpcklbw m4, m5 |
mova m5, m1 |
VC1_LOOP_FILTER_A0 m5, m2, m3, m4 |
VC1_FILTER %1 |
mov%2 [r4+r3], m0 |
mov%2 [r0], m1 |
%endmacro |
; 1st param: size of filter |
; NOTE: UNPACK_8TO16 expects this many 8-bit values in half a register
; 2nd (optional) param: temp register to use for storing words |
%macro VC1_H_LOOP_FILTER 1-2 |
%if %1 == 4 |
movq m0, [r0 -4] |
movq m1, [r0+ r1-4] |
movq m2, [r0+2*r1-4] |
movq m3, [r0+ r3-4] |
TRANSPOSE4x4B 0, 1, 2, 3, 4 |
%else |
movq m0, [r0 -4] |
movq m4, [r0+ r1-4] |
movq m1, [r0+2*r1-4] |
movq m5, [r0+ r3-4] |
movq m2, [r4 -4] |
movq m6, [r4+ r1-4] |
movq m3, [r4+2*r1-4] |
movq m7, [r4+ r3-4] |
punpcklbw m0, m4 |
punpcklbw m1, m5 |
punpcklbw m2, m6 |
punpcklbw m3, m7 |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |
%endif |
pxor m5, m5 |
UNPACK_8TO16 bw, 6, 0, 5 |
UNPACK_8TO16 bw, 7, 1, 5 |
VC1_LOOP_FILTER_A0 m6, m0, m7, m1 |
UNPACK_8TO16 bw, 4, 2, 5 |
mova m0, m1 ; m0 = p0 |
VC1_LOOP_FILTER_A0 m7, m1, m4, m2 |
UNPACK_8TO16 bw, 1, 3, 5 |
mova m5, m4 |
VC1_LOOP_FILTER_A0 m5, m2, m1, m3 |
SWAP 1, 4 ; m1 = q0 |
VC1_FILTER %1 |
punpcklbw m0, m1 |
%if %0 > 1 |
STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2 |
%if %1 > 4 |
psrldq m0, 4 |
STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2 |
%endif |
%else |
STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0 |
STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4 |
%endif |
%endmacro |
%macro START_V_FILTER 0 |
mov r4, r0 |
lea r3, [4*r1] |
sub r4, r3 |
lea r3, [r1+2*r1] |
imul r2, 0x01010101 |
%endmacro |
%macro START_H_FILTER 1 |
lea r3, [r1+2*r1] |
%if %1 > 4 |
lea r4, [r0+4*r1] |
%endif |
imul r2, 0x01010101 |
%endmacro |
%macro VC1_LF 0 |
cglobal vc1_v_loop_filter_internal |
VC1_V_LOOP_FILTER 4, d |
ret |
cglobal vc1_h_loop_filter_internal |
VC1_H_LOOP_FILTER 4, r4 |
ret |
; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq) |
cglobal vc1_v_loop_filter4, 3,5,0 |
START_V_FILTER |
call vc1_v_loop_filter_internal |
RET |
; void ff_vc1_h_loop_filter4_mmxext(uint8_t *src, int stride, int pq) |
cglobal vc1_h_loop_filter4, 3,5,0 |
START_H_FILTER 4 |
call vc1_h_loop_filter_internal |
RET |
; void ff_vc1_v_loop_filter8_mmxext(uint8_t *src, int stride, int pq) |
cglobal vc1_v_loop_filter8, 3,5,0 |
START_V_FILTER |
call vc1_v_loop_filter_internal |
add r4, 4 |
add r0, 4 |
call vc1_v_loop_filter_internal |
RET |
; void ff_vc1_h_loop_filter8_mmxext(uint8_t *src, int stride, int pq) |
cglobal vc1_h_loop_filter8, 3,5,0 |
START_H_FILTER 4 |
call vc1_h_loop_filter_internal |
lea r0, [r0+4*r1] |
call vc1_h_loop_filter_internal |
RET |
%endmacro |
INIT_MMX mmxext |
VC1_LF |
INIT_XMM sse2 |
; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq) |
cglobal vc1_v_loop_filter8, 3,5,8 |
START_V_FILTER |
VC1_V_LOOP_FILTER 8, q |
RET |
; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq) |
cglobal vc1_h_loop_filter8, 3,6,8 |
START_H_FILTER 8 |
VC1_H_LOOP_FILTER 8, r5 |
RET |
INIT_MMX ssse3 |
; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq) |
cglobal vc1_v_loop_filter4, 3,5,0 |
START_V_FILTER |
VC1_V_LOOP_FILTER 4, d |
RET |
; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq) |
cglobal vc1_h_loop_filter4, 3,5,0 |
START_H_FILTER 4 |
VC1_H_LOOP_FILTER 4, r4 |
RET |
INIT_XMM ssse3 |
; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq) |
cglobal vc1_v_loop_filter8, 3,5,8 |
START_V_FILTER |
VC1_V_LOOP_FILTER 8, q |
RET |
; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq) |
cglobal vc1_h_loop_filter8, 3,6,8 |
START_H_FILTER 8 |
VC1_H_LOOP_FILTER 8, r5 |
RET |
INIT_XMM sse4 |
; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq) |
cglobal vc1_h_loop_filter8, 3,5,8 |
START_H_FILTER 8 |
VC1_H_LOOP_FILTER 8 |
RET |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vc1dsp.h |
---|
0,0 → 1,29 |
/* |
* VC-1 and WMV3 decoder - X86 DSP init functions |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#ifndef AVCODEC_X86_VC1DSP_H |
#define AVCODEC_X86_VC1DSP_H |
#include "libavcodec/vc1dsp.h" |
void ff_vc1dsp_init_mmx(VC1DSPContext *dsp); |
void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp); |
#endif /* AVCODEC_X86_VC1DSP_H */ |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vc1dsp_init.c |
---|
0,0 → 1,131 |
/* |
* VC-1 and WMV3 - DSP functions MMX-optimized |
* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr> |
* |
* Permission is hereby granted, free of charge, to any person |
* obtaining a copy of this software and associated documentation |
* files (the "Software"), to deal in the Software without |
* restriction, including without limitation the rights to use, |
* copy, modify, merge, publish, distribute, sublicense, and/or sell |
* copies of the Software, and to permit persons to whom the |
* Software is furnished to do so, subject to the following |
* conditions: |
* |
* The above copyright notice and this permission notice shall be |
* included in all copies or substantial portions of the Software. |
* |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
* OTHER DEALINGS IN THE SOFTWARE. |
*/ |
#include "libavutil/cpu.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/vc1dsp.h" |
#include "dsputil_x86.h" |
#include "vc1dsp.h" |
#include "config.h" |
#define LOOP_FILTER(EXT) \ |
void ff_vc1_v_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \ |
void ff_vc1_h_loop_filter4_ ## EXT(uint8_t *src, int stride, int pq); \ |
void ff_vc1_v_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \ |
void ff_vc1_h_loop_filter8_ ## EXT(uint8_t *src, int stride, int pq); \ |
\ |
static void vc1_v_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \ |
{ \ |
ff_vc1_v_loop_filter8_ ## EXT(src, stride, pq); \ |
ff_vc1_v_loop_filter8_ ## EXT(src+8, stride, pq); \ |
} \ |
\ |
static void vc1_h_loop_filter16_ ## EXT(uint8_t *src, int stride, int pq) \ |
{ \ |
ff_vc1_h_loop_filter8_ ## EXT(src, stride, pq); \ |
ff_vc1_h_loop_filter8_ ## EXT(src+8*stride, stride, pq); \ |
} |
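/* 16-pixel edges are filtered as two 8-pixel halves: side by side (src+8)
 * for the v_loop_filter (horizontal edge) case, 8 rows apart (src+8*stride)
 * for the h_loop_filter (vertical edge) case. */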
#if HAVE_YASM |
LOOP_FILTER(mmxext) |
LOOP_FILTER(sse2) |
LOOP_FILTER(ssse3) |
void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq); |
static void vc1_h_loop_filter16_sse4(uint8_t *src, int stride, int pq) |
{ |
ff_vc1_h_loop_filter8_sse4(src, stride, pq); |
ff_vc1_h_loop_filter8_sse4(src+8*stride, stride, pq); |
} |
static void avg_vc1_mspel_mc00_mmxext(uint8_t *dst, const uint8_t *src, |
ptrdiff_t stride, int rnd) |
{ |
ff_avg_pixels8_mmxext(dst, src, stride, 8); |
} |
#endif /* HAVE_YASM */ |
void ff_put_vc1_chroma_mc8_nornd_mmx (uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_avg_vc1_chroma_mc8_nornd_mmxext(uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_avg_vc1_chroma_mc8_nornd_3dnow(uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_put_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
void ff_avg_vc1_chroma_mc8_nornd_ssse3(uint8_t *dst, uint8_t *src, |
int stride, int h, int x, int y); |
av_cold void ff_vc1dsp_init_x86(VC1DSPContext *dsp) |
{ |
int cpu_flags = av_get_cpu_flags(); |
if (INLINE_MMX(cpu_flags)) |
ff_vc1dsp_init_mmx(dsp); |
if (INLINE_MMXEXT(cpu_flags)) |
ff_vc1dsp_init_mmxext(dsp); |
#define ASSIGN_LF(EXT) \ |
dsp->vc1_v_loop_filter4 = ff_vc1_v_loop_filter4_ ## EXT; \ |
dsp->vc1_h_loop_filter4 = ff_vc1_h_loop_filter4_ ## EXT; \ |
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_ ## EXT; \ |
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_ ## EXT; \ |
dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_ ## EXT; \ |
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_ ## EXT |
#if HAVE_YASM |
if (EXTERNAL_MMX(cpu_flags)) { |
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_mmx; |
} |
if (EXTERNAL_AMD3DNOW(cpu_flags)) { |
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_3dnow; |
} |
if (EXTERNAL_MMXEXT(cpu_flags)) { |
ASSIGN_LF(mmxext); |
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_mmxext; |
dsp->avg_vc1_mspel_pixels_tab[0] = avg_vc1_mspel_mc00_mmxext; |
} |
if (EXTERNAL_SSE2(cpu_flags)) { |
dsp->vc1_v_loop_filter8 = ff_vc1_v_loop_filter8_sse2; |
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse2; |
dsp->vc1_v_loop_filter16 = vc1_v_loop_filter16_sse2; |
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse2; |
} |
if (EXTERNAL_SSSE3(cpu_flags)) { |
ASSIGN_LF(ssse3); |
dsp->put_no_rnd_vc1_chroma_pixels_tab[0] = ff_put_vc1_chroma_mc8_nornd_ssse3; |
dsp->avg_no_rnd_vc1_chroma_pixels_tab[0] = ff_avg_vc1_chroma_mc8_nornd_ssse3; |
} |
if (EXTERNAL_SSE4(cpu_flags)) { |
dsp->vc1_h_loop_filter8 = ff_vc1_h_loop_filter8_sse4; |
dsp->vc1_h_loop_filter16 = vc1_h_loop_filter16_sse4; |
} |
#endif /* HAVE_YASM */ |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vc1dsp_mmx.c |
---|
0,0 → 1,757 |
/* |
* VC-1 and WMV3 - DSP functions MMX-optimized |
* Copyright (c) 2007 Christophe GISQUET <christophe.gisquet@free.fr> |
* |
* Permission is hereby granted, free of charge, to any person |
* obtaining a copy of this software and associated documentation |
* files (the "Software"), to deal in the Software without |
* restriction, including without limitation the rights to use, |
* copy, modify, merge, publish, distribute, sublicense, and/or sell |
* copies of the Software, and to permit persons to whom the |
* Software is furnished to do so, subject to the following |
* conditions: |
* |
* The above copyright notice and this permission notice shall be |
* included in all copies or substantial portions of the Software. |
* |
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, |
* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES |
* OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND |
* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT |
* HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, |
* WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR |
* OTHER DEALINGS IN THE SOFTWARE. |
*/ |
#include "libavutil/cpu.h" |
#include "libavutil/mem.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/vc1dsp.h" |
#include "constants.h" |
#include "dsputil_x86.h" |
#include "vc1dsp.h" |
#if HAVE_INLINE_ASM |
#define OP_PUT(S,D) |
#define OP_AVG(S,D) "pavgb " #S ", " #D " \n\t" |
/** Add rounder from mm7 to mm3 and pack result at destination */ |
#define NORMALIZE_MMX(SHIFT) \ |
"paddw %%mm7, %%mm3 \n\t" /* +bias-r */ \ |
"paddw %%mm7, %%mm4 \n\t" /* +bias-r */ \ |
"psraw "SHIFT", %%mm3 \n\t" \ |
"psraw "SHIFT", %%mm4 \n\t" |
#define TRANSFER_DO_PACK(OP) \ |
"packuswb %%mm4, %%mm3 \n\t" \ |
OP((%2), %%mm3) \ |
"movq %%mm3, (%2) \n\t" |
#define TRANSFER_DONT_PACK(OP) \ |
OP(0(%2), %%mm3) \ |
OP(8(%2), %%mm4) \ |
"movq %%mm3, 0(%2) \n\t" \ |
"movq %%mm4, 8(%2) \n\t" |
/** @see MSPEL_FILTER13_CORE for use as UNPACK macro */ |
#define DO_UNPACK(reg) "punpcklbw %%mm0, " reg "\n\t" |
#define DONT_UNPACK(reg) |
/** Compute the rounder 32-r or 8-r and unpack it to mm7 */
#define LOAD_ROUNDER_MMX(ROUND) \ |
"movd "ROUND", %%mm7 \n\t" \ |
"punpcklwd %%mm7, %%mm7 \n\t" \ |
"punpckldq %%mm7, %%mm7 \n\t" |
#define SHIFT2_LINE(OFF, R0,R1,R2,R3) \ |
"paddw %%mm"#R2", %%mm"#R1" \n\t" \ |
"movd (%0,%3), %%mm"#R0" \n\t" \ |
"pmullw %%mm6, %%mm"#R1" \n\t" \ |
"punpcklbw %%mm0, %%mm"#R0" \n\t" \ |
"movd (%0,%2), %%mm"#R3" \n\t" \ |
"psubw %%mm"#R0", %%mm"#R1" \n\t" \ |
"punpcklbw %%mm0, %%mm"#R3" \n\t" \ |
"paddw %%mm7, %%mm"#R1" \n\t" \ |
"psubw %%mm"#R3", %%mm"#R1" \n\t" \ |
"psraw %4, %%mm"#R1" \n\t" \ |
"movq %%mm"#R1", "#OFF"(%1) \n\t" \ |
"add %2, %0 \n\t" |
/** Sacrificing mm6 allows pipelining loads from src */
static void vc1_put_ver_16b_shift2_mmx(int16_t *dst, |
const uint8_t *src, x86_reg stride, |
int rnd, int64_t shift) |
{ |
__asm__ volatile( |
"mov $3, %%"REG_c" \n\t" |
LOAD_ROUNDER_MMX("%5") |
"movq "MANGLE(ff_pw_9)", %%mm6 \n\t" |
"1: \n\t" |
"movd (%0), %%mm2 \n\t" |
"add %2, %0 \n\t" |
"movd (%0), %%mm3 \n\t" |
"punpcklbw %%mm0, %%mm2 \n\t" |
"punpcklbw %%mm0, %%mm3 \n\t" |
SHIFT2_LINE( 0, 1, 2, 3, 4) |
SHIFT2_LINE( 24, 2, 3, 4, 1) |
SHIFT2_LINE( 48, 3, 4, 1, 2) |
SHIFT2_LINE( 72, 4, 1, 2, 3) |
SHIFT2_LINE( 96, 1, 2, 3, 4) |
SHIFT2_LINE(120, 2, 3, 4, 1) |
SHIFT2_LINE(144, 3, 4, 1, 2) |
SHIFT2_LINE(168, 4, 1, 2, 3) |
"sub %6, %0 \n\t" |
"add $8, %1 \n\t" |
"dec %%"REG_c" \n\t" |
"jnz 1b \n\t" |
: "+r"(src), "+r"(dst) |
: "r"(stride), "r"(-2*stride), |
"m"(shift), "m"(rnd), "r"(9*stride-4) |
: "%"REG_c, "memory" |
); |
} |
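/* A scalar sketch of what the loop above computes: the VC-1 half-pel
 * vertical filter with taps (-1, 9, 9, -1), kept at 16-bit precision for a
 * later horizontal pass:
 *     dst[x] = (9*(src[x] + src[x + stride])
 *               - src[x - stride] - src[x + 2*stride] + rnd_bias) >> shift;
 * three passes of 4 columns each fill 8 rows of 12 int16_t values in dst. */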
/** |
* The data is already unpacked, so some operations can be performed
* directly on memory operands.
*/ |
#define VC1_HOR_16b_SHIFT2(OP, OPNAME)\ |
static void OPNAME ## vc1_hor_16b_shift2_mmx(uint8_t *dst, x86_reg stride,\ |
const int16_t *src, int rnd)\ |
{\ |
int h = 8;\ |
\ |
src -= 1;\ |
rnd -= (-1+9+9-1)*1024; /* Add -1024 bias */\ |
__asm__ volatile(\ |
LOAD_ROUNDER_MMX("%4")\ |
"movq "MANGLE(ff_pw_128)", %%mm6\n\t"\ |
"movq "MANGLE(ff_pw_9)", %%mm5 \n\t"\ |
"1: \n\t"\ |
"movq 2*0+0(%1), %%mm1 \n\t"\ |
"movq 2*0+8(%1), %%mm2 \n\t"\ |
"movq 2*1+0(%1), %%mm3 \n\t"\ |
"movq 2*1+8(%1), %%mm4 \n\t"\ |
"paddw 2*3+0(%1), %%mm1 \n\t"\ |
"paddw 2*3+8(%1), %%mm2 \n\t"\ |
"paddw 2*2+0(%1), %%mm3 \n\t"\ |
"paddw 2*2+8(%1), %%mm4 \n\t"\ |
"pmullw %%mm5, %%mm3 \n\t"\ |
"pmullw %%mm5, %%mm4 \n\t"\ |
"psubw %%mm1, %%mm3 \n\t"\ |
"psubw %%mm2, %%mm4 \n\t"\ |
NORMALIZE_MMX("$7")\ |
/* Remove bias */\ |
"paddw %%mm6, %%mm3 \n\t"\ |
"paddw %%mm6, %%mm4 \n\t"\ |
TRANSFER_DO_PACK(OP)\ |
"add $24, %1 \n\t"\ |
"add %3, %2 \n\t"\ |
"decl %0 \n\t"\ |
"jnz 1b \n\t"\ |
: "+r"(h), "+r" (src), "+r" (dst)\ |
: "r"(stride), "m"(rnd)\ |
: "memory"\ |
);\ |
} |
VC1_HOR_16b_SHIFT2(OP_PUT, put_) |
VC1_HOR_16b_SHIFT2(OP_AVG, avg_) |
/** |
* Purely vertical or horizontal 1/2 shift interpolation. |
* Sacrifice mm6 for the *9 factor.
*/ |
#define VC1_SHIFT2(OP, OPNAME)\ |
static void OPNAME ## vc1_shift2_mmx(uint8_t *dst, const uint8_t *src,\ |
x86_reg stride, int rnd, x86_reg offset)\ |
{\ |
rnd = 8-rnd;\ |
__asm__ volatile(\ |
"mov $8, %%"REG_c" \n\t"\ |
LOAD_ROUNDER_MMX("%5")\ |
"movq "MANGLE(ff_pw_9)", %%mm6\n\t"\ |
"1: \n\t"\ |
"movd 0(%0 ), %%mm3 \n\t"\ |
"movd 4(%0 ), %%mm4 \n\t"\ |
"movd 0(%0,%2), %%mm1 \n\t"\ |
"movd 4(%0,%2), %%mm2 \n\t"\ |
"add %2, %0 \n\t"\ |
"punpcklbw %%mm0, %%mm3 \n\t"\ |
"punpcklbw %%mm0, %%mm4 \n\t"\ |
"punpcklbw %%mm0, %%mm1 \n\t"\ |
"punpcklbw %%mm0, %%mm2 \n\t"\ |
"paddw %%mm1, %%mm3 \n\t"\ |
"paddw %%mm2, %%mm4 \n\t"\ |
"movd 0(%0,%3), %%mm1 \n\t"\ |
"movd 4(%0,%3), %%mm2 \n\t"\ |
"pmullw %%mm6, %%mm3 \n\t" /* 0,9,9,0*/\ |
"pmullw %%mm6, %%mm4 \n\t" /* 0,9,9,0*/\ |
"punpcklbw %%mm0, %%mm1 \n\t"\ |
"punpcklbw %%mm0, %%mm2 \n\t"\ |
"psubw %%mm1, %%mm3 \n\t" /*-1,9,9,0*/\ |
"psubw %%mm2, %%mm4 \n\t" /*-1,9,9,0*/\ |
"movd 0(%0,%2), %%mm1 \n\t"\ |
"movd 4(%0,%2), %%mm2 \n\t"\ |
"punpcklbw %%mm0, %%mm1 \n\t"\ |
"punpcklbw %%mm0, %%mm2 \n\t"\ |
"psubw %%mm1, %%mm3 \n\t" /*-1,9,9,-1*/\ |
"psubw %%mm2, %%mm4 \n\t" /*-1,9,9,-1*/\ |
NORMALIZE_MMX("$4")\ |
"packuswb %%mm4, %%mm3 \n\t"\ |
OP((%1), %%mm3)\ |
"movq %%mm3, (%1) \n\t"\ |
"add %6, %0 \n\t"\ |
"add %4, %1 \n\t"\ |
"dec %%"REG_c" \n\t"\ |
"jnz 1b \n\t"\ |
: "+r"(src), "+r"(dst)\ |
: "r"(offset), "r"(-2*offset), "g"(stride), "m"(rnd),\ |
"g"(stride-offset)\ |
: "%"REG_c, "memory"\ |
);\ |
} |
VC1_SHIFT2(OP_PUT, put_) |
VC1_SHIFT2(OP_AVG, avg_) |
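/* The shift2 functions above apply the same (-1, 9, 9, -1) half-pel filter
 * in a single pass at 8-bit precision, roughly
 *     dst[x] = av_clip_uint8((9*(src[x] + src[x + offset])
 *               - src[x - offset] - src[x + 2*offset] + r) >> 4);
 * with offset being 1 for a horizontal shift or stride for a vertical one
 * (see how vc1_put_shift_8bits[] is invoked further down). */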
/** |
* Core of the 1/4 and 3/4 shift bicubic interpolation. |
* |
* @param UNPACK Macro unpacking arguments from 8 to 16 bits (can be empty).
* @param MOVQ "movd 1", or "movq 2" if the data read is already unpacked.
* @param A1 Address of 1st tap (beware of unpacked/packed). |
* @param A2 Address of 2nd tap |
* @param A3 Address of 3rd tap |
* @param A4 Address of 4th tap |
*/ |
#define MSPEL_FILTER13_CORE(UNPACK, MOVQ, A1, A2, A3, A4) \ |
MOVQ "*0+"A1", %%mm1 \n\t" \ |
MOVQ "*4+"A1", %%mm2 \n\t" \ |
UNPACK("%%mm1") \ |
UNPACK("%%mm2") \ |
"pmullw "MANGLE(ff_pw_3)", %%mm1\n\t" \ |
"pmullw "MANGLE(ff_pw_3)", %%mm2\n\t" \ |
MOVQ "*0+"A2", %%mm3 \n\t" \ |
MOVQ "*4+"A2", %%mm4 \n\t" \ |
UNPACK("%%mm3") \ |
UNPACK("%%mm4") \ |
"pmullw %%mm6, %%mm3 \n\t" /* *18 */ \ |
"pmullw %%mm6, %%mm4 \n\t" /* *18 */ \ |
"psubw %%mm1, %%mm3 \n\t" /* 18,-3 */ \ |
"psubw %%mm2, %%mm4 \n\t" /* 18,-3 */ \ |
MOVQ "*0+"A4", %%mm1 \n\t" \ |
MOVQ "*4+"A4", %%mm2 \n\t" \ |
UNPACK("%%mm1") \ |
UNPACK("%%mm2") \ |
"psllw $2, %%mm1 \n\t" /* 4* */ \ |
"psllw $2, %%mm2 \n\t" /* 4* */ \ |
"psubw %%mm1, %%mm3 \n\t" /* -4,18,-3 */ \ |
"psubw %%mm2, %%mm4 \n\t" /* -4,18,-3 */ \ |
MOVQ "*0+"A3", %%mm1 \n\t" \ |
MOVQ "*4+"A3", %%mm2 \n\t" \ |
UNPACK("%%mm1") \ |
UNPACK("%%mm2") \ |
"pmullw %%mm5, %%mm1 \n\t" /* *53 */ \ |
"pmullw %%mm5, %%mm2 \n\t" /* *53 */ \ |
"paddw %%mm1, %%mm3 \n\t" /* 4,53,18,-3 */ \ |
"paddw %%mm2, %%mm4 \n\t" /* 4,53,18,-3 */ |
/** |
* Macro to build the vertical 16-bit version of vc1_put_shift[13].
* Here, offset=src_stride. Parameters passed A1 to A4 must use |
* %3 (src_stride) and %4 (3*src_stride). |
* |
* @param NAME Either 1 or 3 |
* @see MSPEL_FILTER13_CORE for information on A1->A4 |
*/ |
#define MSPEL_FILTER13_VER_16B(NAME, A1, A2, A3, A4) \ |
static void \ |
vc1_put_ver_16b_ ## NAME ## _mmx(int16_t *dst, const uint8_t *src, \ |
x86_reg src_stride, \ |
int rnd, int64_t shift) \ |
{ \ |
int h = 8; \ |
src -= src_stride; \ |
__asm__ volatile( \ |
LOAD_ROUNDER_MMX("%5") \ |
"movq "MANGLE(ff_pw_53)", %%mm5\n\t" \ |
"movq "MANGLE(ff_pw_18)", %%mm6\n\t" \ |
".p2align 3 \n\t" \ |
"1: \n\t" \ |
MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ |
NORMALIZE_MMX("%6") \ |
TRANSFER_DONT_PACK(OP_PUT) \ |
/* Last 3 (in fact 4) pixels on the line */ \
"movd 8+"A1", %%mm1 \n\t" \ |
DO_UNPACK("%%mm1") \ |
"movq %%mm1, %%mm3 \n\t" \ |
"paddw %%mm1, %%mm1 \n\t" \ |
"paddw %%mm3, %%mm1 \n\t" /* 3* */ \ |
"movd 8+"A2", %%mm3 \n\t" \ |
DO_UNPACK("%%mm3") \ |
"pmullw %%mm6, %%mm3 \n\t" /* *18 */ \ |
"psubw %%mm1, %%mm3 \n\t" /*18,-3 */ \ |
"movd 8+"A3", %%mm1 \n\t" \ |
DO_UNPACK("%%mm1") \ |
"pmullw %%mm5, %%mm1 \n\t" /* *53 */ \ |
"paddw %%mm1, %%mm3 \n\t" /*53,18,-3 */ \ |
"movd 8+"A4", %%mm1 \n\t" \ |
DO_UNPACK("%%mm1") \ |
"psllw $2, %%mm1 \n\t" /* 4* */ \ |
"psubw %%mm1, %%mm3 \n\t" \ |
"paddw %%mm7, %%mm3 \n\t" \ |
"psraw %6, %%mm3 \n\t" \ |
"movq %%mm3, 16(%2) \n\t" \ |
"add %3, %1 \n\t" \ |
"add $24, %2 \n\t" \ |
"decl %0 \n\t" \ |
"jnz 1b \n\t" \ |
: "+r"(h), "+r" (src), "+r" (dst) \ |
: "r"(src_stride), "r"(3*src_stride), \ |
"m"(rnd), "m"(shift) \ |
: "memory" \ |
); \ |
} |
/** |
* Macro to build the horizontal 16-bit version of vc1_put_shift[13].
* Here the offsets are in 16-bit units, so the addresses passed in A1 to A4
* are simple constant offsets.
* |
* @param NAME Either 1 or 3 |
* @see MSPEL_FILTER13_CORE for information on A1->A4 |
*/ |
#define MSPEL_FILTER13_HOR_16B(NAME, A1, A2, A3, A4, OP, OPNAME) \ |
static void \ |
OPNAME ## vc1_hor_16b_ ## NAME ## _mmx(uint8_t *dst, x86_reg stride, \ |
const int16_t *src, int rnd) \ |
{ \ |
int h = 8; \ |
src -= 1; \ |
rnd -= (-4+58+13-3)*256; /* Add -256 bias */ \ |
__asm__ volatile( \ |
LOAD_ROUNDER_MMX("%4") \ |
"movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \ |
"movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \ |
".p2align 3 \n\t" \ |
"1: \n\t" \ |
MSPEL_FILTER13_CORE(DONT_UNPACK, "movq 2", A1, A2, A3, A4) \ |
NORMALIZE_MMX("$7") \ |
/* Remove bias */ \ |
"paddw "MANGLE(ff_pw_128)", %%mm3 \n\t" \ |
"paddw "MANGLE(ff_pw_128)", %%mm4 \n\t" \ |
TRANSFER_DO_PACK(OP) \ |
"add $24, %1 \n\t" \ |
"add %3, %2 \n\t" \ |
"decl %0 \n\t" \ |
"jnz 1b \n\t" \ |
: "+r"(h), "+r" (src), "+r" (dst) \ |
: "r"(stride), "m"(rnd) \ |
: "memory" \ |
); \ |
} |
/** |
* Macro to build the 8-bit, any direction, version of vc1_put_shift[13].
* Here, offset=src_stride. Parameters passed A1 to A4 must use |
* %3 (offset) and %4 (3*offset). |
* |
* @param NAME Either 1 or 3 |
* @see MSPEL_FILTER13_CORE for information on A1->A4 |
*/ |
#define MSPEL_FILTER13_8B(NAME, A1, A2, A3, A4, OP, OPNAME) \ |
static void \ |
OPNAME ## vc1_## NAME ## _mmx(uint8_t *dst, const uint8_t *src, \ |
x86_reg stride, int rnd, x86_reg offset) \ |
{ \ |
int h = 8; \ |
src -= offset; \ |
rnd = 32-rnd; \ |
__asm__ volatile ( \ |
LOAD_ROUNDER_MMX("%6") \ |
"movq "MANGLE(ff_pw_53)", %%mm5 \n\t" \ |
"movq "MANGLE(ff_pw_18)", %%mm6 \n\t" \ |
".p2align 3 \n\t" \ |
"1: \n\t" \ |
MSPEL_FILTER13_CORE(DO_UNPACK, "movd 1", A1, A2, A3, A4) \ |
NORMALIZE_MMX("$6") \ |
TRANSFER_DO_PACK(OP) \ |
"add %5, %1 \n\t" \ |
"add %5, %2 \n\t" \ |
"decl %0 \n\t" \ |
"jnz 1b \n\t" \ |
: "+r"(h), "+r" (src), "+r" (dst) \ |
: "r"(offset), "r"(3*offset), "g"(stride), "m"(rnd) \ |
: "memory" \ |
); \ |
} |
/** 1/4 shift bicubic interpolation */ |
MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_PUT, put_) |
MSPEL_FILTER13_8B (shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )", OP_AVG, avg_) |
MSPEL_FILTER13_VER_16B(shift1, "0(%1,%4 )", "0(%1,%3,2)", "0(%1,%3 )", "0(%1 )") |
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_PUT, put_) |
MSPEL_FILTER13_HOR_16B(shift1, "2*3(%1)", "2*2(%1)", "2*1(%1)", "2*0(%1)", OP_AVG, avg_) |
/** 3/4 shift bicubic interpolation */ |
MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_PUT, put_) |
MSPEL_FILTER13_8B (shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )", OP_AVG, avg_) |
MSPEL_FILTER13_VER_16B(shift3, "0(%1 )", "0(%1,%3 )", "0(%1,%3,2)", "0(%1,%4 )") |
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_PUT, put_) |
MSPEL_FILTER13_HOR_16B(shift3, "2*0(%1)", "2*1(%1)", "2*2(%1)", "2*3(%1)", OP_AVG, avg_) |
typedef void (*vc1_mspel_mc_filter_ver_16bits)(int16_t *dst, const uint8_t *src, x86_reg src_stride, int rnd, int64_t shift); |
typedef void (*vc1_mspel_mc_filter_hor_16bits)(uint8_t *dst, x86_reg dst_stride, const int16_t *src, int rnd); |
typedef void (*vc1_mspel_mc_filter_8bits)(uint8_t *dst, const uint8_t *src, x86_reg stride, int rnd, x86_reg offset); |
/** |
* Interpolate fractional-pel values by applying the proper vertical then |
* horizontal filter. |
* |
* @param dst Destination buffer for interpolated pels. |
* @param src Source buffer. |
* @param stride Stride for both src and dst buffers. |
* @param hmode Horizontal filter (expressed in quarter-pel shifts). |
* @param vmode Vertical filter (expressed in quarter-pel shifts). |
* @param rnd Rounding bias. |
*/ |
#define VC1_MSPEL_MC(OP)\ |
static void OP ## vc1_mspel_mc(uint8_t *dst, const uint8_t *src, int stride,\ |
int hmode, int vmode, int rnd)\ |
{\ |
static const vc1_mspel_mc_filter_ver_16bits vc1_put_shift_ver_16bits[] =\ |
{ NULL, vc1_put_ver_16b_shift1_mmx, vc1_put_ver_16b_shift2_mmx, vc1_put_ver_16b_shift3_mmx };\ |
static const vc1_mspel_mc_filter_hor_16bits vc1_put_shift_hor_16bits[] =\ |
{ NULL, OP ## vc1_hor_16b_shift1_mmx, OP ## vc1_hor_16b_shift2_mmx, OP ## vc1_hor_16b_shift3_mmx };\ |
static const vc1_mspel_mc_filter_8bits vc1_put_shift_8bits[] =\ |
{ NULL, OP ## vc1_shift1_mmx, OP ## vc1_shift2_mmx, OP ## vc1_shift3_mmx };\ |
\ |
__asm__ volatile(\ |
"pxor %%mm0, %%mm0 \n\t"\ |
::: "memory"\ |
);\ |
\ |
if (vmode) { /* Vertical filter to apply */\ |
if (hmode) { /* Horizontal filter to apply, output to tmp */\ |
static const int shift_value[] = { 0, 5, 1, 5 };\ |
int shift = (shift_value[hmode]+shift_value[vmode])>>1;\ |
int r;\ |
DECLARE_ALIGNED(16, int16_t, tmp)[12*8];\ |
\ |
r = (1<<(shift-1)) + rnd-1;\ |
vc1_put_shift_ver_16bits[vmode](tmp, src-1, stride, r, shift);\ |
\ |
vc1_put_shift_hor_16bits[hmode](dst, stride, tmp+1, 64-rnd);\ |
return;\ |
}\ |
else { /* No horizontal filter, output 8 lines to dst */\ |
vc1_put_shift_8bits[vmode](dst, src, stride, 1-rnd, stride);\ |
return;\ |
}\ |
}\ |
\ |
/* Horizontal mode with no vertical mode */\ |
vc1_put_shift_8bits[hmode](dst, src, stride, rnd, 1);\ |
} |
VC1_MSPEL_MC(put_) |
VC1_MSPEL_MC(avg_) |
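/* Editor's note on the dispatch above: when both hmode and vmode are set, |
 * the vertical filter first fills tmp with 8 rows of 16-bit intermediates |
 * (11 samples wide, stored padded to 12, hence tmp[12*8]), and the |
 * horizontal filter then reduces them to the final 8x8 bytes in dst; the |
 * shift_value table splits the total normalization between the passes. |
 * With only one mode set, a single 8-bit pass runs, with offset = stride |
 * for vertical filtering and offset = 1 for horizontal. */ |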
/** Macro to ease the declaration of the bicubic interpolation functions */ |
#define DECLARE_FUNCTION(a, b) \ |
static void put_vc1_mspel_mc ## a ## b ## _mmx(uint8_t *dst, \ |
const uint8_t *src, \ |
ptrdiff_t stride, \ |
int rnd) \ |
{ \ |
put_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ |
}\ |
static void avg_vc1_mspel_mc ## a ## b ## _mmxext(uint8_t *dst, \ |
const uint8_t *src, \ |
ptrdiff_t stride, \ |
int rnd) \ |
{ \ |
avg_vc1_mspel_mc(dst, src, stride, a, b, rnd); \ |
} |
DECLARE_FUNCTION(0, 1) |
DECLARE_FUNCTION(0, 2) |
DECLARE_FUNCTION(0, 3) |
DECLARE_FUNCTION(1, 0) |
DECLARE_FUNCTION(1, 1) |
DECLARE_FUNCTION(1, 2) |
DECLARE_FUNCTION(1, 3) |
DECLARE_FUNCTION(2, 0) |
DECLARE_FUNCTION(2, 1) |
DECLARE_FUNCTION(2, 2) |
DECLARE_FUNCTION(2, 3) |
DECLARE_FUNCTION(3, 0) |
DECLARE_FUNCTION(3, 1) |
DECLARE_FUNCTION(3, 2) |
DECLARE_FUNCTION(3, 3) |
static void vc1_inv_trans_4x4_dc_mmxext(uint8_t *dest, int linesize, |
int16_t *block) |
{ |
int dc = block[0]; |
dc = (17 * dc + 4) >> 3; |
dc = (17 * dc + 64) >> 7; |
__asm__ volatile( |
"movd %0, %%mm0 \n\t" |
"pshufw $0, %%mm0, %%mm0 \n\t" |
"pxor %%mm1, %%mm1 \n\t" |
"psubw %%mm0, %%mm1 \n\t" |
"packuswb %%mm0, %%mm0 \n\t" |
"packuswb %%mm1, %%mm1 \n\t" |
::"r"(dc) |
); |
__asm__ volatile( |
"movd %0, %%mm2 \n\t" |
"movd %1, %%mm3 \n\t" |
"movd %2, %%mm4 \n\t" |
"movd %3, %%mm5 \n\t" |
"paddusb %%mm0, %%mm2 \n\t" |
"paddusb %%mm0, %%mm3 \n\t" |
"paddusb %%mm0, %%mm4 \n\t" |
"paddusb %%mm0, %%mm5 \n\t" |
"psubusb %%mm1, %%mm2 \n\t" |
"psubusb %%mm1, %%mm3 \n\t" |
"psubusb %%mm1, %%mm4 \n\t" |
"psubusb %%mm1, %%mm5 \n\t" |
"movd %%mm2, %0 \n\t" |
"movd %%mm3, %1 \n\t" |
"movd %%mm4, %2 \n\t" |
"movd %%mm5, %3 \n\t" |
:"+m"(*(uint32_t*)(dest+0*linesize)), |
"+m"(*(uint32_t*)(dest+1*linesize)), |
"+m"(*(uint32_t*)(dest+2*linesize)), |
"+m"(*(uint32_t*)(dest+3*linesize)) |
); |
} |
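/* Editor's note: a scalar equivalent of the DC-only transform above, for |
 * reference (it mirrors the portable vc1_inv_trans_4x4_dc_c in vc1dsp.c). |
 * The asm splats clip_uint8(dc) into mm0 and clip_uint8(-dc) into mm1, so |
 * the paddusb/psubusb pair performs a signed, clamped add using only |
 * unsigned-saturating instructions. */ |
static av_unused void vc1_inv_trans_4x4_dc_sketch(uint8_t *dest, int linesize, |
                                                  int16_t *block) |
{ |
    int x, y; |
    int dc = block[0]; |
    dc = (17 * dc +  4) >> 3; |
    dc = (17 * dc + 64) >> 7; |
    for (y = 0; y < 4; y++) { |
        for (x = 0; x < 4; x++) |
            dest[x] = av_clip_uint8(dest[x] + dc); |
        dest += linesize; |
    } |
} |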
static void vc1_inv_trans_4x8_dc_mmxext(uint8_t *dest, int linesize, |
int16_t *block) |
{ |
int dc = block[0]; |
dc = (17 * dc + 4) >> 3; |
dc = (12 * dc + 64) >> 7; |
__asm__ volatile( |
"movd %0, %%mm0 \n\t" |
"pshufw $0, %%mm0, %%mm0 \n\t" |
"pxor %%mm1, %%mm1 \n\t" |
"psubw %%mm0, %%mm1 \n\t" |
"packuswb %%mm0, %%mm0 \n\t" |
"packuswb %%mm1, %%mm1 \n\t" |
::"r"(dc) |
); |
__asm__ volatile( |
"movd %0, %%mm2 \n\t" |
"movd %1, %%mm3 \n\t" |
"movd %2, %%mm4 \n\t" |
"movd %3, %%mm5 \n\t" |
"paddusb %%mm0, %%mm2 \n\t" |
"paddusb %%mm0, %%mm3 \n\t" |
"paddusb %%mm0, %%mm4 \n\t" |
"paddusb %%mm0, %%mm5 \n\t" |
"psubusb %%mm1, %%mm2 \n\t" |
"psubusb %%mm1, %%mm3 \n\t" |
"psubusb %%mm1, %%mm4 \n\t" |
"psubusb %%mm1, %%mm5 \n\t" |
"movd %%mm2, %0 \n\t" |
"movd %%mm3, %1 \n\t" |
"movd %%mm4, %2 \n\t" |
"movd %%mm5, %3 \n\t" |
:"+m"(*(uint32_t*)(dest+0*linesize)), |
"+m"(*(uint32_t*)(dest+1*linesize)), |
"+m"(*(uint32_t*)(dest+2*linesize)), |
"+m"(*(uint32_t*)(dest+3*linesize)) |
); |
dest += 4*linesize; |
__asm__ volatile( |
"movd %0, %%mm2 \n\t" |
"movd %1, %%mm3 \n\t" |
"movd %2, %%mm4 \n\t" |
"movd %3, %%mm5 \n\t" |
"paddusb %%mm0, %%mm2 \n\t" |
"paddusb %%mm0, %%mm3 \n\t" |
"paddusb %%mm0, %%mm4 \n\t" |
"paddusb %%mm0, %%mm5 \n\t" |
"psubusb %%mm1, %%mm2 \n\t" |
"psubusb %%mm1, %%mm3 \n\t" |
"psubusb %%mm1, %%mm4 \n\t" |
"psubusb %%mm1, %%mm5 \n\t" |
"movd %%mm2, %0 \n\t" |
"movd %%mm3, %1 \n\t" |
"movd %%mm4, %2 \n\t" |
"movd %%mm5, %3 \n\t" |
:"+m"(*(uint32_t*)(dest+0*linesize)), |
"+m"(*(uint32_t*)(dest+1*linesize)), |
"+m"(*(uint32_t*)(dest+2*linesize)), |
"+m"(*(uint32_t*)(dest+3*linesize)) |
); |
} |
static void vc1_inv_trans_8x4_dc_mmxext(uint8_t *dest, int linesize, |
int16_t *block) |
{ |
int dc = block[0]; |
dc = ( 3 * dc + 1) >> 1; |
dc = (17 * dc + 64) >> 7; |
__asm__ volatile( |
"movd %0, %%mm0 \n\t" |
"pshufw $0, %%mm0, %%mm0 \n\t" |
"pxor %%mm1, %%mm1 \n\t" |
"psubw %%mm0, %%mm1 \n\t" |
"packuswb %%mm0, %%mm0 \n\t" |
"packuswb %%mm1, %%mm1 \n\t" |
::"r"(dc) |
); |
__asm__ volatile( |
"movq %0, %%mm2 \n\t" |
"movq %1, %%mm3 \n\t" |
"movq %2, %%mm4 \n\t" |
"movq %3, %%mm5 \n\t" |
"paddusb %%mm0, %%mm2 \n\t" |
"paddusb %%mm0, %%mm3 \n\t" |
"paddusb %%mm0, %%mm4 \n\t" |
"paddusb %%mm0, %%mm5 \n\t" |
"psubusb %%mm1, %%mm2 \n\t" |
"psubusb %%mm1, %%mm3 \n\t" |
"psubusb %%mm1, %%mm4 \n\t" |
"psubusb %%mm1, %%mm5 \n\t" |
"movq %%mm2, %0 \n\t" |
"movq %%mm3, %1 \n\t" |
"movq %%mm4, %2 \n\t" |
"movq %%mm5, %3 \n\t" |
:"+m"(*(uint32_t*)(dest+0*linesize)), |
"+m"(*(uint32_t*)(dest+1*linesize)), |
"+m"(*(uint32_t*)(dest+2*linesize)), |
"+m"(*(uint32_t*)(dest+3*linesize)) |
); |
} |
static void vc1_inv_trans_8x8_dc_mmxext(uint8_t *dest, int linesize, |
int16_t *block) |
{ |
int dc = block[0]; |
dc = (3 * dc + 1) >> 1; |
dc = (3 * dc + 16) >> 5; |
__asm__ volatile( |
"movd %0, %%mm0 \n\t" |
"pshufw $0, %%mm0, %%mm0 \n\t" |
"pxor %%mm1, %%mm1 \n\t" |
"psubw %%mm0, %%mm1 \n\t" |
"packuswb %%mm0, %%mm0 \n\t" |
"packuswb %%mm1, %%mm1 \n\t" |
::"r"(dc) |
); |
__asm__ volatile( |
"movq %0, %%mm2 \n\t" |
"movq %1, %%mm3 \n\t" |
"movq %2, %%mm4 \n\t" |
"movq %3, %%mm5 \n\t" |
"paddusb %%mm0, %%mm2 \n\t" |
"paddusb %%mm0, %%mm3 \n\t" |
"paddusb %%mm0, %%mm4 \n\t" |
"paddusb %%mm0, %%mm5 \n\t" |
"psubusb %%mm1, %%mm2 \n\t" |
"psubusb %%mm1, %%mm3 \n\t" |
"psubusb %%mm1, %%mm4 \n\t" |
"psubusb %%mm1, %%mm5 \n\t" |
"movq %%mm2, %0 \n\t" |
"movq %%mm3, %1 \n\t" |
"movq %%mm4, %2 \n\t" |
"movq %%mm5, %3 \n\t" |
:"+m"(*(uint32_t*)(dest+0*linesize)), |
"+m"(*(uint32_t*)(dest+1*linesize)), |
"+m"(*(uint32_t*)(dest+2*linesize)), |
"+m"(*(uint32_t*)(dest+3*linesize)) |
); |
dest += 4*linesize; |
__asm__ volatile( |
"movq %0, %%mm2 \n\t" |
"movq %1, %%mm3 \n\t" |
"movq %2, %%mm4 \n\t" |
"movq %3, %%mm5 \n\t" |
"paddusb %%mm0, %%mm2 \n\t" |
"paddusb %%mm0, %%mm3 \n\t" |
"paddusb %%mm0, %%mm4 \n\t" |
"paddusb %%mm0, %%mm5 \n\t" |
"psubusb %%mm1, %%mm2 \n\t" |
"psubusb %%mm1, %%mm3 \n\t" |
"psubusb %%mm1, %%mm4 \n\t" |
"psubusb %%mm1, %%mm5 \n\t" |
"movq %%mm2, %0 \n\t" |
"movq %%mm3, %1 \n\t" |
"movq %%mm4, %2 \n\t" |
"movq %%mm5, %3 \n\t" |
:"+m"(*(uint32_t*)(dest+0*linesize)), |
"+m"(*(uint32_t*)(dest+1*linesize)), |
"+m"(*(uint32_t*)(dest+2*linesize)), |
"+m"(*(uint32_t*)(dest+3*linesize)) |
); |
} |
static void put_vc1_mspel_mc00_mmx(uint8_t *dst, const uint8_t *src, |
ptrdiff_t stride, int rnd) |
{ |
ff_put_pixels8_mmx(dst, src, stride, 8); |
} |
av_cold void ff_vc1dsp_init_mmx(VC1DSPContext *dsp) |
{ |
dsp->put_vc1_mspel_pixels_tab[ 0] = put_vc1_mspel_mc00_mmx; |
dsp->put_vc1_mspel_pixels_tab[ 4] = put_vc1_mspel_mc01_mmx; |
dsp->put_vc1_mspel_pixels_tab[ 8] = put_vc1_mspel_mc02_mmx; |
dsp->put_vc1_mspel_pixels_tab[12] = put_vc1_mspel_mc03_mmx; |
dsp->put_vc1_mspel_pixels_tab[ 1] = put_vc1_mspel_mc10_mmx; |
dsp->put_vc1_mspel_pixels_tab[ 5] = put_vc1_mspel_mc11_mmx; |
dsp->put_vc1_mspel_pixels_tab[ 9] = put_vc1_mspel_mc12_mmx; |
dsp->put_vc1_mspel_pixels_tab[13] = put_vc1_mspel_mc13_mmx; |
dsp->put_vc1_mspel_pixels_tab[ 2] = put_vc1_mspel_mc20_mmx; |
dsp->put_vc1_mspel_pixels_tab[ 6] = put_vc1_mspel_mc21_mmx; |
dsp->put_vc1_mspel_pixels_tab[10] = put_vc1_mspel_mc22_mmx; |
dsp->put_vc1_mspel_pixels_tab[14] = put_vc1_mspel_mc23_mmx; |
dsp->put_vc1_mspel_pixels_tab[ 3] = put_vc1_mspel_mc30_mmx; |
dsp->put_vc1_mspel_pixels_tab[ 7] = put_vc1_mspel_mc31_mmx; |
dsp->put_vc1_mspel_pixels_tab[11] = put_vc1_mspel_mc32_mmx; |
dsp->put_vc1_mspel_pixels_tab[15] = put_vc1_mspel_mc33_mmx; |
} |
av_cold void ff_vc1dsp_init_mmxext(VC1DSPContext *dsp) |
{ |
dsp->avg_vc1_mspel_pixels_tab[ 4] = avg_vc1_mspel_mc01_mmxext; |
dsp->avg_vc1_mspel_pixels_tab[ 8] = avg_vc1_mspel_mc02_mmxext; |
dsp->avg_vc1_mspel_pixels_tab[12] = avg_vc1_mspel_mc03_mmxext; |
dsp->avg_vc1_mspel_pixels_tab[ 1] = avg_vc1_mspel_mc10_mmxext; |
dsp->avg_vc1_mspel_pixels_tab[ 5] = avg_vc1_mspel_mc11_mmxext; |
dsp->avg_vc1_mspel_pixels_tab[ 9] = avg_vc1_mspel_mc12_mmxext; |
dsp->avg_vc1_mspel_pixels_tab[13] = avg_vc1_mspel_mc13_mmxext; |
dsp->avg_vc1_mspel_pixels_tab[ 2] = avg_vc1_mspel_mc20_mmxext; |
dsp->avg_vc1_mspel_pixels_tab[ 6] = avg_vc1_mspel_mc21_mmxext; |
dsp->avg_vc1_mspel_pixels_tab[10] = avg_vc1_mspel_mc22_mmxext; |
dsp->avg_vc1_mspel_pixels_tab[14] = avg_vc1_mspel_mc23_mmxext; |
dsp->avg_vc1_mspel_pixels_tab[ 3] = avg_vc1_mspel_mc30_mmxext; |
dsp->avg_vc1_mspel_pixels_tab[ 7] = avg_vc1_mspel_mc31_mmxext; |
dsp->avg_vc1_mspel_pixels_tab[11] = avg_vc1_mspel_mc32_mmxext; |
dsp->avg_vc1_mspel_pixels_tab[15] = avg_vc1_mspel_mc33_mmxext; |
dsp->vc1_inv_trans_8x8_dc = vc1_inv_trans_8x8_dc_mmxext; |
dsp->vc1_inv_trans_4x8_dc = vc1_inv_trans_4x8_dc_mmxext; |
dsp->vc1_inv_trans_8x4_dc = vc1_inv_trans_8x4_dc_mmxext; |
dsp->vc1_inv_trans_4x4_dc = vc1_inv_trans_4x4_dc_mmxext; |
} |
#endif /* HAVE_INLINE_ASM */ |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/videodsp.asm |
---|
0,0 → 1,444 |
;****************************************************************************** |
;* Core video DSP functions |
;* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION .text |
; slow vertical extension loop function. Works with variable-width, and |
; does per-line reading/writing of source data |
%macro V_COPY_ROW 2 ; type (top/body/bottom), h |
.%1_y_loop: ; do { |
mov wq, r7mp ; initialize w (r7mp = wmp) |
.%1_x_loop: ; do { |
movu m0, [srcq+wq] ; m0 = read($mmsize) |
movu [dstq+wq], m0 ; write(m0, $mmsize) |
add wq, mmsize ; w -= $mmsize |
cmp wq, -mmsize ; } while (w > $mmsize); |
jl .%1_x_loop |
movu m0, [srcq-mmsize] ; m0 = read($mmsize) |
movu [dstq-mmsize], m0 ; write(m0, $mmsize) |
%ifidn %1, body ; if ($type == body) { |
add srcq, src_strideq ; src += src_stride |
%endif ; } |
add dstq, dst_strideq ; dst += dst_stride |
dec %2 ; } while (--$h); |
jnz .%1_y_loop |
%endmacro |
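; editor's note: wq runs from -w up toward zero, so one register is both |
; index and loop counter, and [srcq+wq]/[dstq+wq] walk the row because |
; srcq/dstq were pre-advanced by w. The trailing copy at [-mmsize] |
; re-covers up to mmsize-1 tail bytes; this is safe because the C |
; dispatcher only calls the vvar functions for widths above 22. |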
%macro vvar_fn 0 |
; .----. <- zero |
; | | <- top is copied from first line in body of source |
; |----| <- start_y |
; | | <- body is copied verbatim (line-by-line) from source |
; |----| <- end_y |
; | | <- bottom is copied from last line in body of source |
; '----' <- bh |
%if ARCH_X86_64 |
cglobal emu_edge_vvar, 7, 8, 1, dst, dst_stride, src, src_stride, \ |
start_y, end_y, bh, w |
%else ; x86-32 |
cglobal emu_edge_vvar, 1, 6, 1, dst, src, start_y, end_y, bh, w |
%define src_strideq r3mp |
%define dst_strideq r1mp |
mov srcq, r2mp |
mov start_yq, r4mp |
mov end_yq, r5mp |
mov bhq, r6mp |
%endif |
sub bhq, end_yq ; bh -= end_y |
sub end_yq, start_yq ; end_y -= start_y |
add srcq, r7mp ; (r7mp = wmp) |
add dstq, r7mp ; (r7mp = wmp) |
neg r7mp ; (r7mp = wmp) |
test start_yq, start_yq ; if (start_q) { |
jz .body |
V_COPY_ROW top, start_yq ; v_copy_row(top, start_yq) |
.body: ; } |
V_COPY_ROW body, end_yq ; v_copy_row(body, end_yq) |
test bhq, bhq ; if (bh) { |
jz .end |
sub srcq, src_strideq ; src -= src_stride |
V_COPY_ROW bottom, bhq ; v_copy_row(bottom, bh) |
.end: ; } |
RET |
%endmacro |
%if ARCH_X86_32 |
INIT_MMX mmx |
vvar_fn |
%endif |
INIT_XMM sse |
vvar_fn |
%macro hvar_fn 0 |
cglobal emu_edge_hvar, 5, 6, 1, dst, dst_stride, start_x, n_words, h, w |
lea dstq, [dstq+n_wordsq*2] |
neg n_wordsq |
lea start_xq, [start_xq+n_wordsq*2] |
.y_loop: ; do { |
; FIXME also write a ssse3 version using pshufb |
movzx wd, byte [dstq+start_xq] ; w = read(1) |
imul wd, 0x01010101 ; w *= 0x01010101 |
movd m0, wd |
mov wq, n_wordsq ; initialize w |
%if cpuflag(sse2) |
pshufd m0, m0, q0000 ; splat |
%else ; mmx |
punpckldq m0, m0 ; splat |
%endif ; mmx/sse |
.x_loop: ; do { |
movu [dstq+wq*2], m0 ; write($reg, $mmsize) |
add wq, mmsize/2 ; w -= $mmsize/2 |
cmp wq, -mmsize/2 ; } while (w > $mmsize/2) |
jl .x_loop |
movu [dstq-mmsize], m0 ; write($reg, $mmsize) |
add dstq, dst_strideq ; dst += dst_stride |
dec hq ; } while (h--) |
jnz .y_loop |
RET |
%endmacro |
%if ARCH_X86_32 |
INIT_MMX mmx |
hvar_fn |
%endif |
INIT_XMM sse2 |
hvar_fn |
; macro to read/write a horizontal number of pixels (%2) to/from registers |
; on sse, - fills xmm0-15 for consecutive sets of 16 pixels |
; - if (%2 & 8) fills 8 bytes into xmm$next |
; - if (%2 & 4) fills 4 bytes into xmm$next |
; - if (%2 & 3) fills 1, 2 or 4 bytes in eax |
; on mmx, - fills mm0-7 for consecutive sets of 8 pixels |
; - if (%2 & 4) fills 4 bytes into mm$next |
; - if (%2 & 3) fills 1, 2 or 4 bytes in eax |
; writing data out is in the same way |
%macro READ_NUM_BYTES 2 |
%assign %%off 0 ; offset in source buffer |
%assign %%mmx_idx 0 ; mmx register index |
%assign %%xmm_idx 0 ; xmm register index |
%rep %2/mmsize |
%if mmsize == 16 |
movu xmm %+ %%xmm_idx, [srcq+%%off] |
%assign %%xmm_idx %%xmm_idx+1 |
%else ; mmx |
movu mm %+ %%mmx_idx, [srcq+%%off] |
%assign %%mmx_idx %%mmx_idx+1 |
%endif |
%assign %%off %%off+mmsize |
%endrep ; %2/mmsize |
%if mmsize == 16 |
%if (%2-%%off) >= 8 |
%if %2 > 16 && (%2-%%off) > 8 |
movu xmm %+ %%xmm_idx, [srcq+%2-16] |
%assign %%xmm_idx %%xmm_idx+1 |
%assign %%off %2 |
%else |
movq mm %+ %%mmx_idx, [srcq+%%off] |
%assign %%mmx_idx %%mmx_idx+1 |
%assign %%off %%off+8 |
%endif |
%endif ; (%2-%%off) >= 8 |
%endif |
%if (%2-%%off) >= 4 |
%if %2 > 8 && (%2-%%off) > 4 |
movq mm %+ %%mmx_idx, [srcq+%2-8] |
%assign %%off %2 |
%else |
movd mm %+ %%mmx_idx, [srcq+%%off] |
%assign %%off %%off+4 |
%endif |
%assign %%mmx_idx %%mmx_idx+1 |
%endif ; (%2-%%off) >= 4 |
%if (%2-%%off) >= 1 |
%if %2 >= 4 |
movd mm %+ %%mmx_idx, [srcq+%2-4] |
%elif (%2-%%off) == 1 |
mov valb, [srcq+%2-1] |
%elif (%2-%%off) == 2 |
mov valw, [srcq+%2-2] |
%elifidn %1, body |
mov vald, [srcq+%2-3] |
%else |
movd mm %+ %%mmx_idx, [srcq+%2-3] |
%endif |
%endif ; (%2-%%off) >= 1 |
%endmacro ; READ_NUM_BYTES |
%macro WRITE_NUM_BYTES 2 |
%assign %%off 0 ; offset in destination buffer |
%assign %%mmx_idx 0 ; mmx register index |
%assign %%xmm_idx 0 ; xmm register index |
%rep %2/mmsize |
%if mmsize == 16 |
movu [dstq+%%off], xmm %+ %%xmm_idx |
%assign %%xmm_idx %%xmm_idx+1 |
%else ; mmx |
movu [dstq+%%off], mm %+ %%mmx_idx |
%assign %%mmx_idx %%mmx_idx+1 |
%endif |
%assign %%off %%off+mmsize |
%endrep ; %2/mmsize |
%if mmsize == 16 |
%if (%2-%%off) >= 8 |
%if %2 > 16 && (%2-%%off) > 8 |
movu [dstq+%2-16], xmm %+ %%xmm_idx |
%assign %%xmm_idx %%xmm_idx+1 |
%assign %%off %2 |
%else |
movq [dstq+%%off], mm %+ %%mmx_idx |
%assign %%mmx_idx %%mmx_idx+1 |
%assign %%off %%off+8 |
%endif |
%endif ; (%2-%%off) >= 8 |
%endif |
%if (%2-%%off) >= 4 |
%if %2 > 8 && (%2-%%off) > 4 |
movq [dstq+%2-8], mm %+ %%mmx_idx |
%assign %%off %2 |
%else |
movd [dstq+%%off], mm %+ %%mmx_idx |
%assign %%off %%off+4 |
%endif |
%assign %%mmx_idx %%mmx_idx+1 |
%endif ; (%2-%%off) >= 4 |
%if (%2-%%off) >= 1 |
%if %2 >= 4 |
movd [dstq+%2-4], mm %+ %%mmx_idx |
%elif (%2-%%off) == 1 |
mov [dstq+%2-1], valb |
%elif (%2-%%off) == 2 |
mov [dstq+%2-2], valw |
%elifidn %1, body |
mov [dstq+%2-3], valw |
shr vald, 16 |
mov [dstq+%2-1], valb |
%else |
movd vald, mm %+ %%mmx_idx |
mov [dstq+%2-3], valw |
shr vald, 16 |
mov [dstq+%2-1], valb |
%endif |
%endif ; (%2-%%off) >= 1 |
%endmacro ; WRITE_NUM_BYTES |
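; editor's note, a worked example of the chunking above: with mmsize == 16 |
; and %2 == 22, the %rep loop copies bytes 0-15 through xmm0, the |
; remaining 6 bytes skip the 8-byte case, and the 4-byte case picks the |
; overlapping "movq [dstq+%2-8]" form, so bytes 14-21 go through one mm |
; register and nothing is left for the 1/2/3-byte tail. |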
; vertical top/bottom extend and body copy fast loops |
; these generate fixed-width line copy functions (reached through |
; function-pointer tables on the C side): each reads a fixed number of |
; pixels into a fixed set of registers and writes them out into the |
; destination buffer |
%macro VERTICAL_EXTEND 2 |
%assign %%n %1 |
%rep 1+%2-%1 |
%if %%n <= 3 |
%if ARCH_X86_64 |
cglobal emu_edge_vfix %+ %%n, 6, 8, 0, dst, dst_stride, src, src_stride, \ |
start_y, end_y, val, bh |
mov bhq, r6mp ; r6mp = bhmp |
%else ; x86-32 |
cglobal emu_edge_vfix %+ %%n, 0, 6, 0, val, dst, src, start_y, end_y, bh |
mov dstq, r0mp |
mov srcq, r2mp |
mov start_yq, r4mp |
mov end_yq, r5mp |
mov bhq, r6mp |
%define dst_strideq r1mp |
%define src_strideq r3mp |
%endif ; x86-64/32 |
%else |
%if ARCH_X86_64 |
cglobal emu_edge_vfix %+ %%n, 7, 7, 1, dst, dst_stride, src, src_stride, \ |
start_y, end_y, bh |
%else ; x86-32 |
cglobal emu_edge_vfix %+ %%n, 1, 5, 1, dst, src, start_y, end_y, bh |
mov srcq, r2mp |
mov start_yq, r4mp |
mov end_yq, r5mp |
mov bhq, r6mp |
%define dst_strideq r1mp |
%define src_strideq r3mp |
%endif ; x86-64/32 |
%endif |
; FIXME move this to c wrapper? |
sub bhq, end_yq ; bh -= end_y |
sub end_yq, start_yq ; end_y -= start_y |
; extend pixels above body |
test start_yq, start_yq ; if (start_y) { |
jz .body_loop |
READ_NUM_BYTES top, %%n ; $variable_regs = read($n) |
.top_loop: ; do { |
WRITE_NUM_BYTES top, %%n ; write($variable_regs, $n) |
add dstq, dst_strideq ; dst += linesize |
dec start_yq ; } while (--start_y) |
jnz .top_loop ; } |
; copy body pixels |
.body_loop: ; do { |
READ_NUM_BYTES body, %%n ; $variable_regs = read($n) |
WRITE_NUM_BYTES body, %%n ; write($variable_regs, $n) |
add dstq, dst_strideq ; dst += dst_stride |
add srcq, src_strideq ; src += src_stride |
dec end_yq ; } while (--end_y) |
jnz .body_loop |
; copy bottom pixels |
test bhq, bhq ; if (block_h) { |
jz .end |
sub srcq, src_strideq ; src -= linesize |
READ_NUM_BYTES bottom, %%n ; $variable_regs = read($n) |
.bottom_loop: ; do { |
WRITE_NUM_BYTES bottom, %%n ; write($variable_regs, $n) |
add dstq, dst_strideq ; dst += linesize |
dec bhq ; } while (--bh) |
jnz .bottom_loop ; } |
.end: |
RET |
%assign %%n %%n+1 |
%endrep ; 1+%2-%1 |
%endmacro ; VERTICAL_EXTEND |
INIT_MMX mmx |
VERTICAL_EXTEND 1, 15 |
%if ARCH_X86_32 |
VERTICAL_EXTEND 16, 22 |
%endif |
INIT_XMM sse |
VERTICAL_EXTEND 16, 22 |
; left/right (horizontal) fast extend functions |
; these are essentially identical to the vertical extend ones above, |
; just left/right separated because the number of pixels to extend is |
; generally not the same on both sides. |
%macro READ_V_PIXEL 2 |
movzx vald, byte %2 |
imul vald, 0x01010101 |
%if %1 >= 8 |
movd m0, vald |
%if mmsize == 16 |
pshufd m0, m0, q0000 |
%else |
punpckldq m0, m0 |
%endif ; mmsize == 16 |
%endif ; %1 >= 8 |
%endmacro ; READ_V_PIXEL |
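; editor's note: multiplying the zero-extended byte by 0x01010101 |
; replicates it into all four bytes of vald (e.g. 0x5A becomes |
; 0x5A5A5A5A); movd plus pshufd/punpckldq then broadcast that dword |
; across the whole vector register. |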
%macro WRITE_V_PIXEL 2 |
%assign %%off 0 |
%if %1 >= 8 |
%rep %1/mmsize |
movu [%2+%%off], m0 |
%assign %%off %%off+mmsize |
%endrep ; %1/mmsize |
%if mmsize == 16 |
%if %1-%%off >= 8 |
%if %1 > 16 && %1-%%off > 8 |
movu [%2+%1-16], m0 |
%assign %%off %1 |
%else |
movq [%2+%%off], m0 |
%assign %%off %%off+8 |
%endif |
%endif ; %1-%%off >= 8 |
%endif ; mmsize == 16 |
%if %1-%%off >= 4 |
%if %1 > 8 && %1-%%off > 4 |
movq [%2+%1-8], m0 |
%assign %%off %1 |
%else |
movd [%2+%%off], m0 |
%assign %%off %%off+4 |
%endif |
%endif ; %1-%%off >= 4 |
%else ; %1 < 8 |
%rep %1/4 |
mov [%2+%%off], vald |
%assign %%off %%off+4 |
%endrep ; %1/4 |
%endif ; %1 >=/< 8 |
%if %1-%%off == 2 |
mov [%2+%%off], valw |
%endif ; %1-%%off == 2 |
%endmacro ; WRITE_V_PIXEL |
%macro H_EXTEND 2 |
%assign %%n %1 |
%rep 1+(%2-%1)/2 |
cglobal emu_edge_hfix %+ %%n, 4, 5, 1, dst, dst_stride, start_x, bh, val |
.loop_y: ; do { |
READ_V_PIXEL %%n, [dstq+start_xq] ; $variable_regs = read($n) |
WRITE_V_PIXEL %%n, dstq ; write($variable_regs, $n) |
add dstq, dst_strideq ; dst += dst_stride |
dec bhq ; } while (--bh) |
jnz .loop_y |
RET |
%assign %%n %%n+2 |
%endrep ; 1+(%2-%1)/2 |
%endmacro ; H_EXTEND |
INIT_MMX mmx |
H_EXTEND 2, 14 |
%if ARCH_X86_32 |
H_EXTEND 16, 22 |
%endif |
INIT_XMM sse2 |
H_EXTEND 16, 22 |
%macro PREFETCH_FN 1 |
cglobal prefetch, 3, 3, 0, buf, stride, h |
.loop: |
%1 [bufq] |
add bufq, strideq |
dec hd |
jg .loop |
REP_RET |
%endmacro |
INIT_MMX mmxext |
PREFETCH_FN prefetcht0 |
%if ARCH_X86_32 |
INIT_MMX 3dnow |
PREFETCH_FN prefetch |
%endif |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/videodsp_init.c |
---|
0,0 → 1,266 |
/* |
* Copyright (C) 2002-2012 Michael Niedermayer |
* Copyright (C) 2012 Ronald S. Bultje |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "config.h" |
#include "libavutil/attributes.h" |
#include "libavutil/avassert.h" |
#include "libavutil/common.h" |
#include "libavutil/cpu.h" |
#include "libavutil/mem.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/videodsp.h" |
#if HAVE_YASM |
typedef void emu_edge_vfix_func(uint8_t *dst, x86_reg dst_stride, |
const uint8_t *src, x86_reg src_stride, |
x86_reg start_y, x86_reg end_y, x86_reg bh); |
typedef void emu_edge_vvar_func(uint8_t *dst, x86_reg dst_stride, |
const uint8_t *src, x86_reg src_stride, |
x86_reg start_y, x86_reg end_y, x86_reg bh, |
x86_reg w); |
extern emu_edge_vfix_func ff_emu_edge_vfix1_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix2_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix3_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix4_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix5_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix6_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix7_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix8_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix9_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix10_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix11_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix12_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix13_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix14_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix15_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix16_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix17_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix18_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix19_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix20_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix21_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix22_mmx; |
#if ARCH_X86_32 |
static emu_edge_vfix_func *vfixtbl_mmx[22] = { |
&ff_emu_edge_vfix1_mmx, &ff_emu_edge_vfix2_mmx, &ff_emu_edge_vfix3_mmx, |
&ff_emu_edge_vfix4_mmx, &ff_emu_edge_vfix5_mmx, &ff_emu_edge_vfix6_mmx, |
&ff_emu_edge_vfix7_mmx, &ff_emu_edge_vfix8_mmx, &ff_emu_edge_vfix9_mmx, |
&ff_emu_edge_vfix10_mmx, &ff_emu_edge_vfix11_mmx, &ff_emu_edge_vfix12_mmx, |
&ff_emu_edge_vfix13_mmx, &ff_emu_edge_vfix14_mmx, &ff_emu_edge_vfix15_mmx, |
&ff_emu_edge_vfix16_mmx, &ff_emu_edge_vfix17_mmx, &ff_emu_edge_vfix18_mmx, |
&ff_emu_edge_vfix19_mmx, &ff_emu_edge_vfix20_mmx, &ff_emu_edge_vfix21_mmx, |
&ff_emu_edge_vfix22_mmx |
}; |
#endif |
extern emu_edge_vvar_func ff_emu_edge_vvar_mmx; |
extern emu_edge_vfix_func ff_emu_edge_vfix16_sse; |
extern emu_edge_vfix_func ff_emu_edge_vfix17_sse; |
extern emu_edge_vfix_func ff_emu_edge_vfix18_sse; |
extern emu_edge_vfix_func ff_emu_edge_vfix19_sse; |
extern emu_edge_vfix_func ff_emu_edge_vfix20_sse; |
extern emu_edge_vfix_func ff_emu_edge_vfix21_sse; |
extern emu_edge_vfix_func ff_emu_edge_vfix22_sse; |
static emu_edge_vfix_func *vfixtbl_sse[22] = { |
ff_emu_edge_vfix1_mmx, ff_emu_edge_vfix2_mmx, ff_emu_edge_vfix3_mmx, |
ff_emu_edge_vfix4_mmx, ff_emu_edge_vfix5_mmx, ff_emu_edge_vfix6_mmx, |
ff_emu_edge_vfix7_mmx, ff_emu_edge_vfix8_mmx, ff_emu_edge_vfix9_mmx, |
ff_emu_edge_vfix10_mmx, ff_emu_edge_vfix11_mmx, ff_emu_edge_vfix12_mmx, |
ff_emu_edge_vfix13_mmx, ff_emu_edge_vfix14_mmx, ff_emu_edge_vfix15_mmx, |
ff_emu_edge_vfix16_sse, ff_emu_edge_vfix17_sse, ff_emu_edge_vfix18_sse, |
ff_emu_edge_vfix19_sse, ff_emu_edge_vfix20_sse, ff_emu_edge_vfix21_sse, |
ff_emu_edge_vfix22_sse |
}; |
extern emu_edge_vvar_func ff_emu_edge_vvar_sse; |
typedef void emu_edge_hfix_func(uint8_t *dst, x86_reg dst_stride, |
x86_reg start_x, x86_reg bh); |
typedef void emu_edge_hvar_func(uint8_t *dst, x86_reg dst_stride, |
x86_reg start_x, x86_reg n_words, x86_reg bh); |
extern emu_edge_hfix_func ff_emu_edge_hfix2_mmx; |
extern emu_edge_hfix_func ff_emu_edge_hfix4_mmx; |
extern emu_edge_hfix_func ff_emu_edge_hfix6_mmx; |
extern emu_edge_hfix_func ff_emu_edge_hfix8_mmx; |
extern emu_edge_hfix_func ff_emu_edge_hfix10_mmx; |
extern emu_edge_hfix_func ff_emu_edge_hfix12_mmx; |
extern emu_edge_hfix_func ff_emu_edge_hfix14_mmx; |
extern emu_edge_hfix_func ff_emu_edge_hfix16_mmx; |
extern emu_edge_hfix_func ff_emu_edge_hfix18_mmx; |
extern emu_edge_hfix_func ff_emu_edge_hfix20_mmx; |
extern emu_edge_hfix_func ff_emu_edge_hfix22_mmx; |
#if ARCH_X86_32 |
static emu_edge_hfix_func *hfixtbl_mmx[11] = { |
ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, |
ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx, |
ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_mmx, ff_emu_edge_hfix18_mmx, |
ff_emu_edge_hfix20_mmx, ff_emu_edge_hfix22_mmx |
}; |
#endif |
extern emu_edge_hvar_func ff_emu_edge_hvar_mmx; |
extern emu_edge_hfix_func ff_emu_edge_hfix16_sse2; |
extern emu_edge_hfix_func ff_emu_edge_hfix18_sse2; |
extern emu_edge_hfix_func ff_emu_edge_hfix20_sse2; |
extern emu_edge_hfix_func ff_emu_edge_hfix22_sse2; |
static emu_edge_hfix_func *hfixtbl_sse2[11] = { |
ff_emu_edge_hfix2_mmx, ff_emu_edge_hfix4_mmx, ff_emu_edge_hfix6_mmx, |
ff_emu_edge_hfix8_mmx, ff_emu_edge_hfix10_mmx, ff_emu_edge_hfix12_mmx, |
ff_emu_edge_hfix14_mmx, ff_emu_edge_hfix16_sse2, ff_emu_edge_hfix18_sse2, |
ff_emu_edge_hfix20_sse2, ff_emu_edge_hfix22_sse2 |
}; |
extern emu_edge_hvar_func ff_emu_edge_hvar_sse2; |
static av_always_inline void emulated_edge_mc(uint8_t *dst, ptrdiff_t dst_stride, |
const uint8_t *src, ptrdiff_t src_stride, |
x86_reg block_w, x86_reg block_h, |
x86_reg src_x, x86_reg src_y, |
x86_reg w, x86_reg h, |
emu_edge_vfix_func **vfix_tbl, |
emu_edge_vvar_func *v_extend_var, |
emu_edge_hfix_func **hfix_tbl, |
emu_edge_hvar_func *h_extend_var) |
{ |
x86_reg start_y, start_x, end_y, end_x, src_y_add = 0, p; |
if(!w || !h) |
return; |
if (src_y >= h) { |
src -= src_y*src_stride; |
src_y_add = h - 1; |
src_y = h - 1; |
} else if (src_y <= -block_h) { |
src -= src_y*src_stride; |
src_y_add = 1 - block_h; |
src_y = 1 - block_h; |
} |
if (src_x >= w) { |
src += w - 1 - src_x; |
src_x = w - 1; |
} else if (src_x <= -block_w) { |
src += 1 - block_w - src_x; |
src_x = 1 - block_w; |
} |
start_y = FFMAX(0, -src_y); |
start_x = FFMAX(0, -src_x); |
end_y = FFMIN(block_h, h-src_y); |
end_x = FFMIN(block_w, w-src_x); |
av_assert2(start_x < end_x && block_w > 0); |
av_assert2(start_y < end_y && block_h > 0); |
// fill in the to-be-copied part plus all above/below |
src += (src_y_add + start_y) * src_stride + start_x; |
w = end_x - start_x; |
if (w <= 22) { |
vfix_tbl[w - 1](dst + start_x, dst_stride, src, src_stride, |
start_y, end_y, block_h); |
} else { |
v_extend_var(dst + start_x, dst_stride, src, src_stride, |
start_y, end_y, block_h, w); |
} |
// fill left |
if (start_x) { |
if (start_x <= 22) { |
hfix_tbl[(start_x - 1) >> 1](dst, dst_stride, start_x, block_h); |
} else { |
h_extend_var(dst, dst_stride, |
start_x, (start_x + 1) >> 1, block_h); |
} |
} |
// fill right |
p = block_w - end_x; |
if (p) { |
if (p <= 22) { |
hfix_tbl[(p - 1) >> 1](dst + end_x - (p & 1), dst_stride, |
-!(p & 1), block_h); |
} else { |
h_extend_var(dst + end_x - (p & 1), dst_stride, |
-!(p & 1), (p + 1) >> 1, block_h); |
} |
} |
} |
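/* Editor's note: a hedged usage sketch. Decoders call this helper when a |
 * motion vector points (partly) outside the decoded frame: the block is |
 * first copied into a padded scratch buffer, which then replaces the |
 * motion-compensation source. edge_buf/edge_stride are hypothetical names: |
 * |
 *     if (src_x < 0 || src_y < 0 || |
 *         src_x + block_w > frame_w || src_y + block_h > frame_h) { |
 *         vdsp->emulated_edge_mc(edge_buf, edge_stride, src, src_stride, |
 *                                block_w, block_h, src_x, src_y, |
 *                                frame_w, frame_h); |
 *         src        = edge_buf; |
 *         src_stride = edge_stride; |
 *     } |
 */ |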
#if ARCH_X86_32 |
static av_noinline void emulated_edge_mc_mmx(uint8_t *buf, ptrdiff_t buf_stride, |
const uint8_t *src, ptrdiff_t src_stride, |
int block_w, int block_h, |
int src_x, int src_y, int w, int h) |
{ |
emulated_edge_mc(buf, buf_stride, src, src_stride, block_w, block_h, |
src_x, src_y, w, h, vfixtbl_mmx, &ff_emu_edge_vvar_mmx, |
hfixtbl_mmx, &ff_emu_edge_hvar_mmx); |
} |
static av_noinline void emulated_edge_mc_sse(uint8_t *buf, ptrdiff_t buf_stride, |
const uint8_t *src, ptrdiff_t src_stride, |
int block_w, int block_h, |
int src_x, int src_y, int w, int h) |
{ |
emulated_edge_mc(buf, buf_stride, src, src_stride, block_w, block_h, |
src_x, src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, |
hfixtbl_mmx, &ff_emu_edge_hvar_mmx); |
} |
#endif |
static av_noinline void emulated_edge_mc_sse2(uint8_t *buf, ptrdiff_t buf_stride, |
const uint8_t *src, ptrdiff_t src_stride, |
int block_w, int block_h, |
int src_x, int src_y, int w, int h) |
{ |
emulated_edge_mc(buf, buf_stride, src, src_stride, block_w, block_h, src_x, |
src_y, w, h, vfixtbl_sse, &ff_emu_edge_vvar_sse, |
hfixtbl_sse2, &ff_emu_edge_hvar_sse2); |
} |
#endif /* HAVE_YASM */ |
void ff_prefetch_mmxext(uint8_t *buf, ptrdiff_t stride, int h); |
void ff_prefetch_3dnow(uint8_t *buf, ptrdiff_t stride, int h); |
av_cold void ff_videodsp_init_x86(VideoDSPContext *ctx, int bpc) |
{ |
#if HAVE_YASM |
int cpu_flags = av_get_cpu_flags(); |
#if ARCH_X86_32 |
if (EXTERNAL_MMX(cpu_flags) && bpc <= 8) { |
ctx->emulated_edge_mc = emulated_edge_mc_mmx; |
} |
if (EXTERNAL_AMD3DNOW(cpu_flags)) { |
ctx->prefetch = ff_prefetch_3dnow; |
} |
#endif /* ARCH_X86_32 */ |
if (EXTERNAL_MMXEXT(cpu_flags)) { |
ctx->prefetch = ff_prefetch_mmxext; |
} |
#if ARCH_X86_32 |
if (EXTERNAL_SSE(cpu_flags) && bpc <= 8) { |
ctx->emulated_edge_mc = emulated_edge_mc_sse; |
} |
#endif /* ARCH_X86_32 */ |
if (EXTERNAL_SSE2(cpu_flags) && bpc <= 8) { |
ctx->emulated_edge_mc = emulated_edge_mc_sse2; |
} |
#endif /* HAVE_YASM */ |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vorbisdsp.asm |
---|
0,0 → 1,83 |
;****************************************************************************** |
;* Vorbis x86 optimizations |
;* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
pdw_80000000: times 4 dd 0x80000000 |
SECTION .text |
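; editor's note: both versions below vectorize the scalar inverse coupling |
; of the C reference (ff_vorbis_inverse_coupling in vorbisdsp.c): |
;     if (mag[i] > 0.0) |
;         if (ang[i] > 0.0) { ang[i] = mag[i] - ang[i]; } |
;         else { t = ang[i]; ang[i] = mag[i]; mag[i] += t; } |
;     else |
;         if (ang[i] > 0.0) { ang[i] += mag[i]; } |
;         else { t = ang[i]; ang[i] = mag[i]; mag[i] -= t; } |
; the branchless form keys everything off the two <= 0 masks and the |
; sign bit of mag. |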
%if ARCH_X86_32 |
INIT_MMX 3dnow |
cglobal vorbis_inverse_coupling, 3, 3, 6, mag, ang, block_size |
pxor m7, m7 |
lea magq, [magq+block_sizeq*4] |
lea angq, [angq+block_sizeq*4] |
neg block_sizeq |
.loop: |
mova m0, [magq+block_sizeq*4] |
mova m1, [angq+block_sizeq*4] |
mova m2, m0 |
mova m3, m1 |
pfcmpge m2, m7 ; m <= 0.0 |
pfcmpge m3, m7 ; a <= 0.0 |
pslld m2, 31 ; keep only the sign bit |
pxor m1, m2 |
mova m4, m3 |
pand m3, m1 |
pandn m4, m1 |
pfadd m3, m0 ; a = m + ((a < 0) & (a ^ sign(m))) |
pfsub m0, m4 ; m = m + ((a > 0) & (a ^ sign(m))) |
mova [angq+block_sizeq*4], m3 |
mova [magq+block_sizeq*4], m0 |
add block_sizeq, 2 |
jl .loop |
femms |
RET |
%endif |
INIT_XMM sse |
cglobal vorbis_inverse_coupling, 3, 4, 6, mag, ang, block_size, cntr |
mova m5, [pdw_80000000] |
xor cntrq, cntrq |
align 16 |
.loop: |
mova m0, [magq+cntrq*4] |
mova m1, [angq+cntrq*4] |
xorps m2, m2 |
xorps m3, m3 |
cmpleps m2, m0 ; m <= 0.0 |
cmpleps m3, m1 ; a <= 0.0 |
andps m2, m5 ; keep only the sign bit |
xorps m1, m2 |
mova m4, m3 |
andps m3, m1 |
andnps m4, m1 |
addps m3, m0 ; a = m + ((a < 0) & (a ^ sign(m))) |
subps m0, m4 ; m = m + ((a > 0) & (a ^ sign(m))) |
mova [angq+cntrq*4], m3 |
mova [magq+cntrq*4], m0 |
add cntrq, 4 |
cmp cntrq, block_sizeq |
jl .loop |
RET |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vorbisdsp_init.c |
---|
0,0 → 1,44 |
/* |
* Copyright (C) 2006 Loren Merritt <lorenm@u.washington.edu> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "config.h" |
#include "libavutil/attributes.h" |
#include "libavutil/cpu.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/vorbisdsp.h" |
void ff_vorbis_inverse_coupling_3dnow(float *mag, float *ang, |
intptr_t blocksize); |
void ff_vorbis_inverse_coupling_sse(float *mag, float *ang, |
intptr_t blocksize); |
av_cold void ff_vorbisdsp_init_x86(VorbisDSPContext *dsp) |
{ |
#if HAVE_YASM |
int cpu_flags = av_get_cpu_flags(); |
#if ARCH_X86_32 |
if (EXTERNAL_AMD3DNOW(cpu_flags)) |
dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_3dnow; |
#endif /* ARCH_X86_32 */ |
if (EXTERNAL_SSE(cpu_flags)) |
dsp->vorbis_inverse_coupling = ff_vorbis_inverse_coupling_sse; |
#endif /* HAVE_YASM */ |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vp3dsp.asm |
---|
0,0 → 1,709 |
;****************************************************************************** |
;* MMX/SSE2-optimized functions for the VP3 decoder |
;* Copyright (c) 2007 Aurelien Jacobs <aurel@gnuage.org> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
; MMX-optimized functions cribbed from the original VP3 source code. |
SECTION_RODATA |
vp3_idct_data: times 8 dw 64277 |
times 8 dw 60547 |
times 8 dw 54491 |
times 8 dw 46341 |
times 8 dw 36410 |
times 8 dw 25080 |
times 8 dw 12785 |
pb_7: times 8 db 0x07 |
pb_1F: times 8 db 0x1f |
pb_81: times 8 db 0x81 |
cextern pb_1 |
cextern pb_3 |
cextern pb_80 |
cextern pw_8 |
SECTION .text |
; this is off by one or two for some cases when filter_limit is greater than 63 |
; in: p0 in mm6, p1 in mm4, p2 in mm2, p3 in mm1 |
; out: p1 in mm4, p2 in mm3 |
%macro VP3_LOOP_FILTER 0 |
movq m7, m6 |
pand m6, [pb_7] ; p0&7 |
psrlw m7, 3 |
pand m7, [pb_1F] ; p0>>3 |
movq m3, m2 ; p2 |
pxor m2, m4 |
pand m2, [pb_1] ; (p2^p1)&1 |
movq m5, m2 |
paddb m2, m2 |
paddb m2, m5 ; 3*(p2^p1)&1 |
paddb m2, m6 ; extra bits lost in shifts |
pcmpeqb m0, m0 |
pxor m1, m0 ; 255 - p3 |
pavgb m1, m2 ; (256 - p3 + extrabits) >> 1 |
pxor m0, m4 ; 255 - p1 |
pavgb m0, m3 ; (256 + p2-p1) >> 1 |
paddb m1, [pb_3] |
pavgb m1, m0 ; 128+2+( p2-p1 - p3) >> 2 |
pavgb m1, m0 ; 128+1+(3*(p2-p1) - p3) >> 3 |
paddusb m7, m1 ; d+128+1 |
movq m6, [pb_81] |
psubusb m6, m7 |
psubusb m7, [pb_81] |
movq m5, [r2+516] ; flim |
pminub m6, m5 |
pminub m7, m5 |
movq m0, m6 |
movq m1, m7 |
paddb m6, m6 |
paddb m7, m7 |
pminub m6, m5 |
pminub m7, m5 |
psubb m6, m0 |
psubb m7, m1 |
paddusb m4, m7 |
psubusb m4, m6 |
psubusb m3, m7 |
paddusb m3, m6 |
%endmacro |
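; editor's note: the scalar form of this filter (see vp3dsp.c) is |
;     d  = bounding_values[(p0 - p3 + 3 * (p2 - p1) + 4) >> 3]; |
;     p1 = av_clip_uint8(p1 + d); |
;     p2 = av_clip_uint8(p2 - d); |
; the pavgb chain reproduces the (x + 4) >> 3 rounding within 8 bits, at |
; the cost of the small error noted above for filter limits over 63. |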
%macro STORE_4_WORDS 1 |
movd r2d, %1 |
mov [r0 -1], r2w |
psrlq %1, 32 |
shr r2, 16 |
mov [r0+r1 -1], r2w |
movd r2d, %1 |
mov [r0+r1*2-1], r2w |
shr r2, 16 |
mov [r0+r3 -1], r2w |
%endmacro |
INIT_MMX mmxext |
cglobal vp3_v_loop_filter, 3, 4 |
%if ARCH_X86_64 |
movsxd r1, r1d |
%endif |
mov r3, r1 |
neg r1 |
movq m6, [r0+r1*2] |
movq m4, [r0+r1 ] |
movq m2, [r0 ] |
movq m1, [r0+r3 ] |
VP3_LOOP_FILTER |
movq [r0+r1], m4 |
movq [r0 ], m3 |
RET |
cglobal vp3_h_loop_filter, 3, 4 |
%if ARCH_X86_64 |
movsxd r1, r1d |
%endif |
lea r3, [r1*3] |
movd m6, [r0 -2] |
movd m4, [r0+r1 -2] |
movd m2, [r0+r1*2-2] |
movd m1, [r0+r3 -2] |
lea r0, [r0+r1*4 ] |
punpcklbw m6, [r0 -2] |
punpcklbw m4, [r0+r1 -2] |
punpcklbw m2, [r0+r1*2-2] |
punpcklbw m1, [r0+r3 -2] |
sub r0, r3 |
sub r0, r1 |
TRANSPOSE4x4B 6, 4, 2, 1, 0 |
VP3_LOOP_FILTER |
SBUTTERFLY bw, 4, 3, 5 |
STORE_4_WORDS m4 |
lea r0, [r0+r1*4 ] |
STORE_4_WORDS m3 |
RET |
; from the original VP3 comments: this macro does the IDCT on four 1-D DCTs |
%macro BeginIDCT 0 |
movq m2, I(3) |
movq m6, C(3) |
movq m4, m2 |
movq m7, J(5) |
pmulhw m4, m6 ; r4 = c3*i3 - i3 |
movq m1, C(5) |
pmulhw m6, m7 ; r6 = c3*i5 - i5 |
movq m5, m1 |
pmulhw m1, m2 ; r1 = c5*i3 - i3 |
movq m3, I(1) |
pmulhw m5, m7 ; r5 = c5*i5 - i5 |
movq m0, C(1) |
paddw m4, m2 ; r4 = c3*i3 |
paddw m6, m7 ; r6 = c3*i5 |
paddw m2, m1 ; r2 = c5*i3 |
movq m1, J(7) |
paddw m7, m5 ; r7 = c5*i5 |
movq m5, m0 ; r5 = c1 |
pmulhw m0, m3 ; r0 = c1*i1 - i1 |
paddsw m4, m7 ; r4 = C = c3*i3 + c5*i5 |
pmulhw m5, m1 ; r5 = c1*i7 - i7 |
movq m7, C(7) |
psubsw m6, m2 ; r6 = D = c3*i5 - c5*i3 |
paddw m0, m3 ; r0 = c1*i1 |
pmulhw m3, m7 ; r3 = c7*i1 |
movq m2, I(2) |
pmulhw m7, m1 ; r7 = c7*i7 |
paddw m5, m1 ; r5 = c1*i7 |
movq m1, m2 ; r1 = i2 |
pmulhw m2, C(2) ; r2 = c2*i2 - i2 |
psubsw m3, m5 ; r3 = B = c7*i1 - c1*i7 |
movq m5, J(6) |
paddsw m0, m7 ; r0 = A = c1*i1 + c7*i7 |
movq m7, m5 ; r7 = i6 |
psubsw m0, m4 ; r0 = A - C |
pmulhw m5, C(2) ; r5 = c2*i6 - i6 |
paddw m2, m1 ; r2 = c2*i2 |
pmulhw m1, C(6) ; r1 = c6*i2 |
paddsw m4, m4 ; r4 = C + C |
paddsw m4, m0 ; r4 = C. = A + C |
psubsw m3, m6 ; r3 = B - D |
paddw m5, m7 ; r5 = c2*i6 |
paddsw m6, m6 ; r6 = D + D |
pmulhw m7, C(6) ; r7 = c6*i6 |
paddsw m6, m3 ; r6 = D. = B + D |
movq I(1), m4 ; save C. at I(1) |
psubsw m1, m5 ; r1 = H = c6*i2 - c2*i6 |
movq m4, C(4) |
movq m5, m3 ; r5 = B - D |
pmulhw m3, m4 ; r3 = (c4 - 1) * (B - D) |
paddsw m7, m2 ; r7 = G = c2*i2 + c6*i6 |
movq I(2), m6 ; save D. at I(2) |
movq m2, m0 ; r2 = A - C |
movq m6, I(0) |
pmulhw m0, m4 ; r0 = (c4 - 1) * (A - C) |
paddw m5, m3 ; r5 = B. = c4 * (B - D) |
movq m3, J(4) |
psubsw m5, m1 ; r5 = B.. = B. - H |
paddw m2, m0 ; r2 = A. = c4 * (A - C) |
psubsw m6, m3 ; r6 = i0 - i4 |
movq m0, m6 |
pmulhw m6, m4 ; r6 = (c4 - 1) * (i0 - i4) |
paddsw m3, m3 ; r3 = i4 + i4 |
paddsw m1, m1 ; r1 = H + H |
paddsw m3, m0 ; r3 = i0 + i4 |
paddsw m1, m5 ; r1 = H. = B + H |
pmulhw m4, m3 ; r4 = (c4 - 1) * (i0 + i4) |
paddsw m6, m0 ; r6 = F = c4 * (i0 - i4) |
psubsw m6, m2 ; r6 = F. = F - A. |
paddsw m2, m2 ; r2 = A. + A. |
movq m0, I(1) ; r0 = C. |
paddsw m2, m6 ; r2 = A.. = F + A. |
paddw m4, m3 ; r4 = E = c4 * (i0 + i4) |
psubsw m2, m1 ; r2 = R2 = A.. - H. |
%endmacro |
; RowIDCT gets ready to transpose |
%macro RowIDCT 0 |
BeginIDCT |
movq m3, I(2) ; r3 = D. |
psubsw m4, m7 ; r4 = E. = E - G |
paddsw m1, m1 ; r1 = H. + H. |
paddsw m7, m7 ; r7 = G + G |
paddsw m1, m2 ; r1 = R1 = A.. + H. |
paddsw m7, m4 ; r7 = G. = E + G |
psubsw m4, m3 ; r4 = R4 = E. - D. |
paddsw m3, m3 |
psubsw m6, m5 ; r6 = R6 = F. - B.. |
paddsw m5, m5 |
paddsw m3, m4 ; r3 = R3 = E. + D. |
paddsw m5, m6 ; r5 = R5 = F. + B.. |
psubsw m7, m0 ; r7 = R7 = G. - C. |
paddsw m0, m0 |
movq I(1), m1 ; save R1 |
paddsw m0, m7 ; r0 = R0 = G. + C. |
%endmacro |
; Column IDCT normalizes and stores final results |
%macro ColumnIDCT 0 |
BeginIDCT |
paddsw m2, OC_8 ; adjust R2 (and R1) for shift |
paddsw m1, m1 ; r1 = H. + H. |
paddsw m1, m2 ; r1 = R1 = A.. + H. |
psraw m2, 4 ; r2 = NR2 |
psubsw m4, m7 ; r4 = E. = E - G |
psraw m1, 4 ; r1 = NR1 |
movq m3, I(2) ; r3 = D. |
paddsw m7, m7 ; r7 = G + G |
movq I(2), m2 ; store NR2 at I2 |
paddsw m7, m4 ; r7 = G. = E + G |
movq I(1), m1 ; store NR1 at I1 |
psubsw m4, m3 ; r4 = R4 = E. - D. |
paddsw m4, OC_8 ; adjust R4 (and R3) for shift |
paddsw m3, m3 ; r3 = D. + D. |
paddsw m3, m4 ; r3 = R3 = E. + D. |
psraw m4, 4 ; r4 = NR4 |
psubsw m6, m5 ; r6 = R6 = F. - B.. |
psraw m3, 4 ; r3 = NR3 |
paddsw m6, OC_8 ; adjust R6 (and R5) for shift |
paddsw m5, m5 ; r5 = B.. + B.. |
paddsw m5, m6 ; r5 = R5 = F. + B.. |
psraw m6, 4 ; r6 = NR6 |
movq J(4), m4 ; store NR4 at J4 |
psraw m5, 4 ; r5 = NR5 |
movq I(3), m3 ; store NR3 at I3 |
psubsw m7, m0 ; r7 = R7 = G. - C. |
paddsw m7, OC_8 ; adjust R7 (and R0) for shift |
paddsw m0, m0 ; r0 = C. + C. |
paddsw m0, m7 ; r0 = R0 = G. + C. |
psraw m7, 4 ; r7 = NR7 |
movq J(6), m6 ; store NR6 at J6 |
psraw m0, 4 ; r0 = NR0 |
movq J(5), m5 ; store NR5 at J5 |
movq J(7), m7 ; store NR7 at J7 |
movq I(0), m0 ; store NR0 at I0 |
%endmacro |
; Following macro does two 4x4 transposes in place. |
; |
; At entry (we assume): |
; |
; r0 = a3 a2 a1 a0 |
; I(1) = b3 b2 b1 b0 |
; r2 = c3 c2 c1 c0 |
; r3 = d3 d2 d1 d0 |
; |
; r4 = e3 e2 e1 e0 |
; r5 = f3 f2 f1 f0 |
; r6 = g3 g2 g1 g0 |
; r7 = h3 h2 h1 h0 |
; |
; At exit, we have: |
; |
; I(0) = d0 c0 b0 a0 |
; I(1) = d1 c1 b1 a1 |
; I(2) = d2 c2 b2 a2 |
; I(3) = d3 c3 b3 a3 |
; |
; J(4) = h0 g0 f0 e0 |
; J(5) = h1 g1 f1 e1 |
; J(6) = h2 g2 f2 e2 |
; J(7) = h3 g3 f3 e3 |
; |
; I(0) I(1) I(2) I(3) is the transpose of r0 I(1) r2 r3. |
; J(4) J(5) J(6) J(7) is the transpose of r4 r5 r6 r7. |
; |
; Since r1 is free at entry, we calculate the Js first. |
%macro Transpose 0 |
movq m1, m4 ; r1 = e3 e2 e1 e0 |
punpcklwd m4, m5 ; r4 = f1 e1 f0 e0 |
movq I(0), m0 ; save a3 a2 a1 a0 |
punpckhwd m1, m5 ; r1 = f3 e3 f2 e2 |
movq m0, m6 ; r0 = g3 g2 g1 g0 |
punpcklwd m6, m7 ; r6 = h1 g1 h0 g0 |
movq m5, m4 ; r5 = f1 e1 f0 e0 |
punpckldq m4, m6 ; r4 = h0 g0 f0 e0 = R4 |
punpckhdq m5, m6 ; r5 = h1 g1 f1 e1 = R5 |
movq m6, m1 ; r6 = f3 e3 f2 e2 |
movq J(4), m4 |
punpckhwd m0, m7 ; r0 = h3 g3 h2 g2 |
movq J(5), m5 |
punpckhdq m6, m0 ; r6 = h3 g3 f3 e3 = R7 |
movq m4, I(0) ; r4 = a3 a2 a1 a0 |
punpckldq m1, m0 ; r1 = h2 g2 f2 e2 = R6 |
movq m5, I(1) ; r5 = b3 b2 b1 b0 |
movq m0, m4 ; r0 = a3 a2 a1 a0 |
movq J(7), m6 |
punpcklwd m0, m5 ; r0 = b1 a1 b0 a0 |
movq J(6), m1 |
punpckhwd m4, m5 ; r4 = b3 a3 b2 a2 |
movq m5, m2 ; r5 = c3 c2 c1 c0 |
punpcklwd m2, m3 ; r2 = d1 c1 d0 c0 |
movq m1, m0 ; r1 = b1 a1 b0 a0 |
punpckldq m0, m2 ; r0 = d0 c0 b0 a0 = R0 |
punpckhdq m1, m2 ; r1 = d1 c1 b1 a1 = R1 |
movq m2, m4 ; r2 = b3 a3 b2 a2 |
movq I(0), m0 |
punpckhwd m5, m3 ; r5 = d3 c3 d2 c2 |
movq I(1), m1 |
punpckhdq m4, m5 ; r4 = d3 c3 b3 a3 = R3 |
punpckldq m2, m5 ; r2 = d2 c2 b2 a2 = R2 |
movq I(3), m4 |
movq I(2), m2 |
%endmacro |
%macro VP3_1D_IDCT_SSE2 0 |
movdqa m2, I(3) ; xmm2 = i3 |
movdqa m6, C(3) ; xmm6 = c3 |
movdqa m4, m2 ; xmm4 = i3 |
movdqa m7, I(5) ; xmm7 = i5 |
pmulhw m4, m6 ; xmm4 = c3 * i3 - i3 |
movdqa m1, C(5) ; xmm1 = c5 |
pmulhw m6, m7 ; xmm6 = c3 * i5 - i5 |
movdqa m5, m1 ; xmm5 = c5 |
pmulhw m1, m2 ; xmm1 = c5 * i3 - i3 |
movdqa m3, I(1) ; xmm3 = i1 |
pmulhw m5, m7 ; xmm5 = c5 * i5 - i5 |
movdqa m0, C(1) ; xmm0 = c1 |
paddw m4, m2 ; xmm4 = c3 * i3 |
paddw m6, m7 ; xmm6 = c3 * i5 |
paddw m2, m1 ; xmm2 = c5 * i3 |
movdqa m1, I(7) ; xmm1 = i7 |
paddw m7, m5 ; xmm7 = c5 * i5 |
movdqa m5, m0 ; xmm5 = c1 |
pmulhw m0, m3 ; xmm0 = c1 * i1 - i1 |
paddsw m4, m7 ; xmm4 = c3 * i3 + c5 * i5 = C |
pmulhw m5, m1 ; xmm5 = c1 * i7 - i7 |
movdqa m7, C(7) ; xmm7 = c7 |
psubsw m6, m2 ; xmm6 = c3 * i5 - c5 * i3 = D |
paddw m0, m3 ; xmm0 = c1 * i1 |
pmulhw m3, m7 ; xmm3 = c7 * i1 |
movdqa m2, I(2) ; xmm2 = i2 |
pmulhw m7, m1 ; xmm7 = c7 * i7 |
paddw m5, m1 ; xmm5 = c1 * i7 |
movdqa m1, m2 ; xmm1 = i2 |
pmulhw m2, C(2) ; xmm2 = i2 * c2 -i2 |
psubsw m3, m5 ; xmm3 = c7 * i1 - c1 * i7 = B |
movdqa m5, I(6) ; xmm5 = i6 |
paddsw m0, m7 ; xmm0 = c1 * i1 + c7 * i7 = A |
movdqa m7, m5 ; xmm7 = i6 |
psubsw m0, m4 ; xmm0 = A - C |
pmulhw m5, C(2) ; xmm5 = c2 * i6 - i6 |
paddw m2, m1 ; xmm2 = i2 * c2 |
pmulhw m1, C(6) ; xmm1 = c6 * i2 |
paddsw m4, m4 ; xmm4 = C + C |
paddsw m4, m0 ; xmm4 = A + C = C. |
psubsw m3, m6 ; xmm3 = B - D |
paddw m5, m7 ; xmm5 = c2 * i6 |
paddsw m6, m6 ; xmm6 = D + D |
pmulhw m7, C(6) ; xmm7 = c6 * i6 |
paddsw m6, m3 ; xmm6 = B + D = D. |
movdqa I(1), m4 ; Save C. at I(1) |
psubsw m1, m5 ; xmm1 = c6 * i2 - c2 * i6 = H |
movdqa m4, C(4) ; xmm4 = C4 |
movdqa m5, m3 ; xmm5 = B - D |
pmulhw m3, m4 ; xmm3 = ( c4 -1 ) * ( B - D ) |
paddsw m7, m2 ; xmm7 = c2 * i2 + c6 * i6 = G |
movdqa I(2), m6 ; save D. at I(2) |
movdqa m2, m0 ; xmm2 = A - C |
movdqa m6, I(0) ; xmm6 = i0 |
pmulhw m0, m4 ; xmm0 = ( c4 - 1 ) * ( A - C ) = A. |
paddw m5, m3 ; xmm5 = c4 * ( B - D ) = B. |
movdqa m3, I(4) ; xmm3 = i4 |
psubsw m5, m1 ; xmm5 = B. - H = B.. |
paddw m2, m0 ; xmm2 = c4 * ( A - C) = A. |
psubsw m6, m3 ; xmm6 = i0 - i4 |
movdqa m0, m6 ; xmm0 = i0 - i4 |
pmulhw m6, m4 ; xmm6 = (c4 - 1) * (i0 - i4) = F |
paddsw m3, m3 ; xmm3 = i4 + i4 |
paddsw m1, m1 ; xmm1 = H + H |
paddsw m3, m0 ; xmm3 = i0 + i4 |
paddsw m1, m5 ; xmm1 = B. + H = H. |
pmulhw m4, m3 ; xmm4 = ( c4 - 1 ) * ( i0 + i4 ) |
paddw m6, m0 ; xmm6 = c4 * ( i0 - i4 ) |
psubsw m6, m2 ; xmm6 = F - A. = F. |
paddsw m2, m2 ; xmm2 = A. + A. |
movdqa m0, I(1) ; Load C. from I(1) |
paddsw m2, m6 ; xmm2 = F + A. = A.. |
paddw m4, m3 ; xmm4 = c4 * ( i0 + i4 ) = E |
psubsw m2, m1 ; xmm2 = A.. - H. = R2 |
ADD(m2) ; Adjust R2 and R1 before shifting |
paddsw m1, m1 ; xmm1 = H. + H. |
paddsw m1, m2 ; xmm1 = A.. + H. = R1 |
SHIFT(m2) ; xmm2 = op2 |
psubsw m4, m7 ; xmm4 = E - G = E. |
SHIFT(m1) ; xmm1 = op1 |
movdqa m3, I(2) ; Load D. from I(2) |
paddsw m7, m7 ; xmm7 = G + G |
paddsw m7, m4 ; xmm7 = E + G = G. |
psubsw m4, m3 ; xmm4 = E. - D. = R4 |
ADD(m4) ; Adjust R4 and R3 before shifting |
paddsw m3, m3 ; xmm3 = D. + D. |
paddsw m3, m4 ; xmm3 = E. + D. = R3 |
SHIFT(m4) ; xmm4 = op4 |
psubsw m6, m5 ; xmm6 = F. - B..= R6 |
SHIFT(m3) ; xmm3 = op3 |
ADD(m6) ; Adjust R6 and R5 before shifting |
paddsw m5, m5 ; xmm5 = B.. + B.. |
paddsw m5, m6 ; xmm5 = F. + B.. = R5 |
SHIFT(m6) ; xmm6 = op6 |
SHIFT(m5) ; xmm5 = op5 |
psubsw m7, m0 ; xmm7 = G. - C. = R7 |
ADD(m7) ; Adjust R7 and R0 before shifting |
paddsw m0, m0 ; xmm0 = C. + C. |
paddsw m0, m7 ; xmm0 = G. + C. |
SHIFT(m7) ; xmm7 = op7 |
SHIFT(m0) ; xmm0 = op0 |
%endmacro |
%macro PUT_BLOCK 8 |
movdqa O(0), m%1 |
movdqa O(1), m%2 |
movdqa O(2), m%3 |
movdqa O(3), m%4 |
movdqa O(4), m%5 |
movdqa O(5), m%6 |
movdqa O(6), m%7 |
movdqa O(7), m%8 |
%endmacro |
%macro VP3_IDCT 1 |
%if mmsize == 16 |
%define I(x) [%1+16*x] |
%define O(x) [%1+16*x] |
%define C(x) [vp3_idct_data+16*(x-1)] |
%define SHIFT(x) |
%define ADD(x) |
VP3_1D_IDCT_SSE2 |
%if ARCH_X86_64 |
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8 |
%else |
TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%1], [%1+16] |
%endif |
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 |
%define SHIFT(x) psraw x, 4 |
%define ADD(x) paddsw x, [pw_8] |
VP3_1D_IDCT_SSE2 |
PUT_BLOCK 0, 1, 2, 3, 4, 5, 6, 7 |
%else ; mmsize == 8 |
; eax = quantized input |
; ebx = dequantizer matrix |
; ecx = IDCT constants |
; M(I) = ecx + MaskOffset(0) + I * 8 |
; C(I) = ecx + CosineOffset(32) + (I-1) * 8 |
; edx = output |
; r0..r7 = mm0..mm7 |
%define OC_8 [pw_8] |
%define C(x) [vp3_idct_data+16*(x-1)] |
; at this point, function has completed dequantization + dezigzag + |
; partial transposition; now do the idct itself |
%define I(x) [%1+16*x] |
%define J(x) [%1+16*x] |
RowIDCT |
Transpose |
%define I(x) [%1+16*x+8] |
%define J(x) [%1+16*x+8] |
RowIDCT |
Transpose |
%define I(x) [%1+16* x] |
%define J(x) [%1+16*(x-4)+8] |
ColumnIDCT |
%define I(x) [%1+16* x +64] |
%define J(x) [%1+16*(x-4)+72] |
ColumnIDCT |
%endif ; mmsize == 16/8 |
%endmacro |
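; editor's note: VP3_IDCT is the usual separable 2-D scheme. On SSE2 it |
; runs one 1-D pass over rows with ADD/SHIFT empty (no rounding), |
; transposes, then redefines ADD/SHIFT so the second pass applies the |
; +8 >> 4 normalization of the VP3/Theora reference; the mmsize == 8 path |
; instead runs the MMX RowIDCT/Transpose pair on both 4x8 halves followed |
; by two ColumnIDCT calls, which normalize internally via OC_8. |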
%macro vp3_idct_funcs 0 |
cglobal vp3_idct_put, 3, 4, 9 |
VP3_IDCT r2 |
movsxdifnidn r1, r1d |
mova m4, [pb_80] |
lea r3, [r1*3] |
%assign %%i 0 |
%rep 16/mmsize |
mova m0, [r2+mmsize*0+%%i] |
mova m1, [r2+mmsize*2+%%i] |
mova m2, [r2+mmsize*4+%%i] |
mova m3, [r2+mmsize*6+%%i] |
%if mmsize == 8 |
packsswb m0, [r2+mmsize*8+%%i] |
packsswb m1, [r2+mmsize*10+%%i] |
packsswb m2, [r2+mmsize*12+%%i] |
packsswb m3, [r2+mmsize*14+%%i] |
%else |
packsswb m0, [r2+mmsize*1+%%i] |
packsswb m1, [r2+mmsize*3+%%i] |
packsswb m2, [r2+mmsize*5+%%i] |
packsswb m3, [r2+mmsize*7+%%i] |
%endif |
paddb m0, m4 |
paddb m1, m4 |
paddb m2, m4 |
paddb m3, m4 |
movq [r0 ], m0 |
%if mmsize == 8 |
movq [r0+r1 ], m1 |
movq [r0+r1*2], m2 |
movq [r0+r3 ], m3 |
%else |
movhps [r0+r1 ], m0 |
movq [r0+r1*2], m1 |
movhps [r0+r3 ], m1 |
%endif |
%if %%i == 0 |
lea r0, [r0+r1*4] |
%endif |
%if mmsize == 16 |
movq [r0 ], m2 |
movhps [r0+r1 ], m2 |
movq [r0+r1*2], m3 |
movhps [r0+r3 ], m3 |
%endif |
%assign %%i %%i+8 |
%endrep |
pxor m0, m0 |
%assign %%offset 0 |
%rep 128/mmsize |
mova [r2+%%offset], m0 |
%assign %%offset %%offset+mmsize |
%endrep |
RET |
cglobal vp3_idct_add, 3, 4, 9 |
VP3_IDCT r2 |
movsxdifnidn r1, r1d |
lea r3, [r1*3] |
pxor m4, m4 |
%if mmsize == 16 |
%assign %%i 0 |
%rep 2 |
movq m0, [r0] |
movq m1, [r0+r1] |
movq m2, [r0+r1*2] |
movq m3, [r0+r3] |
punpcklbw m0, m4 |
punpcklbw m1, m4 |
punpcklbw m2, m4 |
punpcklbw m3, m4 |
paddsw m0, [r2+ 0+%%i] |
paddsw m1, [r2+16+%%i] |
paddsw m2, [r2+32+%%i] |
paddsw m3, [r2+48+%%i] |
packuswb m0, m1 |
packuswb m2, m3 |
movq [r0 ], m0 |
movhps [r0+r1 ], m0 |
movq [r0+r1*2], m2 |
movhps [r0+r3 ], m2 |
%if %%i == 0 |
lea r0, [r0+r1*4] |
%endif |
%assign %%i %%i+64 |
%endrep |
%else |
%assign %%i 0 |
%rep 2 |
movq m0, [r0] |
movq m1, [r0+r1] |
movq m2, [r0+r1*2] |
movq m3, [r0+r3] |
movq m5, m0 |
movq m6, m1 |
movq m7, m2 |
punpcklbw m0, m4 |
punpcklbw m1, m4 |
punpcklbw m2, m4 |
punpckhbw m5, m4 |
punpckhbw m6, m4 |
punpckhbw m7, m4 |
paddsw m0, [r2+ 0+%%i] |
paddsw m1, [r2+16+%%i] |
paddsw m2, [r2+32+%%i] |
paddsw m5, [r2+64+%%i] |
paddsw m6, [r2+80+%%i] |
paddsw m7, [r2+96+%%i] |
packuswb m0, m5 |
movq m5, m3 |
punpcklbw m3, m4 |
punpckhbw m5, m4 |
packuswb m1, m6 |
paddsw m3, [r2+48+%%i] |
paddsw m5, [r2+112+%%i] |
packuswb m2, m7 |
packuswb m3, m5 |
movq [r0 ], m0 |
movq [r0+r1 ], m1 |
movq [r0+r1*2], m2 |
movq [r0+r3 ], m3 |
%if %%i == 0 |
lea r0, [r0+r1*4] |
%endif |
%assign %%i %%i+8 |
%endrep |
%endif |
%assign %%i 0 |
%rep 128/mmsize |
mova [r2+%%i], m4 |
%assign %%i %%i+mmsize |
%endrep |
RET |
%endmacro |
%if ARCH_X86_32 |
INIT_MMX mmx |
vp3_idct_funcs |
%endif |
INIT_XMM sse2 |
vp3_idct_funcs |
%macro DC_ADD 0 |
movq m2, [r0 ] |
movq m3, [r0+r1 ] |
paddusb m2, m0 |
movq m4, [r0+r1*2] |
paddusb m3, m0 |
movq m5, [r0+r2 ] |
paddusb m4, m0 |
paddusb m5, m0 |
psubusb m2, m1 |
psubusb m3, m1 |
movq [r0 ], m2 |
psubusb m4, m1 |
movq [r0+r1 ], m3 |
psubusb m5, m1 |
movq [r0+r1*2], m4 |
movq [r0+r2 ], m5 |
%endmacro |
INIT_MMX mmxext |
cglobal vp3_idct_dc_add, 3, 4 |
%if ARCH_X86_64 |
movsxd r1, r1d |
%endif |
movsx r3, word [r2] |
mov word [r2], 0 |
lea r2, [r1*3] |
add r3, 15 |
sar r3, 5 |
movd m0, r3d |
pshufw m0, m0, 0x0 |
pxor m1, m1 |
psubw m1, m0 |
packuswb m0, m0 |
packuswb m1, m1 |
DC_ADD |
lea r0, [r0+r1*4] |
DC_ADD |
RET |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vp3dsp_init.c |
---|
0,0 → 1,128 |
/* |
* Copyright (c) 2009 David Conrad <lessen42@gmail.com> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include <stdint.h> |
#include "libavutil/attributes.h" |
#include "libavutil/cpu.h" |
#include "libavutil/x86/cpu.h" |
#include "libavutil/x86/asm.h" |
#include "libavcodec/avcodec.h" |
#include "libavcodec/dsputil.h" |
#include "libavcodec/vp3dsp.h" |
#include "config.h" |
void ff_vp3_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block); |
void ff_vp3_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block); |
void ff_vp3_idct_put_sse2(uint8_t *dest, int line_size, int16_t *block); |
void ff_vp3_idct_add_sse2(uint8_t *dest, int line_size, int16_t *block); |
void ff_vp3_idct_dc_add_mmxext(uint8_t *dest, int line_size, |
int16_t *block); |
void ff_vp3_v_loop_filter_mmxext(uint8_t *src, int stride, |
int *bounding_values); |
void ff_vp3_h_loop_filter_mmxext(uint8_t *src, int stride, |
int *bounding_values); |
#if HAVE_MMX_INLINE |
#define MOVQ_BFE(regd) \ |
__asm__ volatile ( \ |
"pcmpeqd %%"#regd", %%"#regd" \n\t" \ |
"paddb %%"#regd", %%"#regd" \n\t" ::) |
#define PAVGBP_MMX_NO_RND(rega, regb, regr, regc, regd, regp) \ |
"movq "#rega", "#regr" \n\t" \ |
"movq "#regc", "#regp" \n\t" \ |
"pand "#regb", "#regr" \n\t" \ |
"pand "#regd", "#regp" \n\t" \ |
"pxor "#rega", "#regb" \n\t" \ |
"pxor "#regc", "#regd" \n\t" \ |
"pand %%mm6, "#regb" \n\t" \ |
"pand %%mm6, "#regd" \n\t" \ |
"psrlq $1, "#regb" \n\t" \ |
"psrlq $1, "#regd" \n\t" \ |
"paddb "#regb", "#regr" \n\t" \ |
"paddb "#regd", "#regp" \n\t" |
static void put_vp_no_rnd_pixels8_l2_mmx(uint8_t *dst, const uint8_t *a, const uint8_t *b, ptrdiff_t stride, int h) |
{ |
// START_TIMER |
MOVQ_BFE(mm6); |
__asm__ volatile( |
"1: \n\t" |
"movq (%1), %%mm0 \n\t" |
"movq (%2), %%mm1 \n\t" |
"movq (%1,%4), %%mm2 \n\t" |
"movq (%2,%4), %%mm3 \n\t" |
PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
"movq %%mm4, (%3) \n\t" |
"movq %%mm5, (%3,%4) \n\t" |
"movq (%1,%4,2), %%mm0 \n\t" |
"movq (%2,%4,2), %%mm1 \n\t" |
"movq (%1,%5), %%mm2 \n\t" |
"movq (%2,%5), %%mm3 \n\t" |
"lea (%1,%4,4), %1 \n\t" |
"lea (%2,%4,4), %2 \n\t" |
PAVGBP_MMX_NO_RND(%%mm0, %%mm1, %%mm4, %%mm2, %%mm3, %%mm5) |
"movq %%mm4, (%3,%4,2) \n\t" |
"movq %%mm5, (%3,%5) \n\t" |
"lea (%3,%4,4), %3 \n\t" |
"subl $4, %0 \n\t" |
"jnz 1b \n\t" |
:"+r"(h), "+r"(a), "+r"(b), "+r"(dst) |
:"r"((x86_reg)stride), "r"((x86_reg)3L*stride) |
:"memory"); |
// STOP_TIMER("put_vp_no_rnd_pixels8_l2_mmx") |
} |
#endif /* HAVE_MMX_INLINE */ |
av_cold void ff_vp3dsp_init_x86(VP3DSPContext *c, int flags) |
{ |
int cpu_flags = av_get_cpu_flags(); |
#if HAVE_MMX_INLINE |
c->put_no_rnd_pixels_l2 = put_vp_no_rnd_pixels8_l2_mmx; |
#endif /* HAVE_MMX_INLINE */ |
#if ARCH_X86_32 |
if (EXTERNAL_MMX(cpu_flags)) { |
c->idct_put = ff_vp3_idct_put_mmx; |
c->idct_add = ff_vp3_idct_add_mmx; |
} |
#endif |
if (EXTERNAL_MMXEXT(cpu_flags)) { |
c->idct_dc_add = ff_vp3_idct_dc_add_mmxext; |
if (!(flags & CODEC_FLAG_BITEXACT)) { |
c->v_loop_filter = ff_vp3_v_loop_filter_mmxext; |
c->h_loop_filter = ff_vp3_h_loop_filter_mmxext; |
} |
} |
if (EXTERNAL_SSE2(cpu_flags)) { |
c->idct_put = ff_vp3_idct_put_sse2; |
c->idct_add = ff_vp3_idct_add_sse2; |
} |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vp56_arith.h |
---|
0,0 → 1,54 |
/** |
* VP5 and VP6 compatible video decoder (arith decoder) |
* |
* Copyright (C) 2006 Aurelien Jacobs <aurel@gnuage.org> |
* Copyright (C) 2010 Eli Friedman |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#ifndef AVCODEC_X86_VP56_ARITH_H |
#define AVCODEC_X86_VP56_ARITH_H |
#if HAVE_INLINE_ASM && HAVE_FAST_CMOV |
#define vp56_rac_get_prob vp56_rac_get_prob |
static av_always_inline int vp56_rac_get_prob(VP56RangeCoder *c, uint8_t prob) |
{ |
unsigned int code_word = vp56_rac_renorm(c); |
unsigned int high = c->high; |
unsigned int low = 1 + (((high - 1) * prob) >> 8); |
unsigned int low_shift = low << 16; |
int bit = 0; |
__asm__( |
"subl %4, %1 \n\t" |
"subl %3, %2 \n\t" |
"leal (%2, %3), %3 \n\t" |
"setae %b0 \n\t" |
"cmovb %4, %1 \n\t" |
"cmovb %3, %2 \n\t" |
: "+q"(bit), "+r"(high), "+r"(code_word), "+r"(low_shift) |
: "r"(low) |
); |
c->high = high; |
c->code_word = code_word; |
return bit; |
} |
#endif |
#endif /* AVCODEC_X86_VP56_ARITH_H */ |
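The inline asm above is a branchless version of the generic bit read: setae captures whether code_word reached the split point, and the two cmovb instructions select the other sub-range without a branch. In portable C the same update reads as follows (a sketch mirroring the generic fallback this header overrides; VP56RangeCoder and vp56_rac_renorm come from the surrounding vp56.h):

#include <stdint.h>

static inline int vp56_rac_get_prob_sketch(VP56RangeCoder *c, uint8_t prob)
{
    unsigned int code_word = vp56_rac_renorm(c);
    unsigned int high      = c->high;
    unsigned int low       = 1 + (((high - 1) * prob) >> 8);
    unsigned int low_shift = low << 16;
    int bit = code_word >= low_shift;              /* setae %b0 */

    c->high      = bit ? high - low : low;         /* cmovb %4, %1 */
    c->code_word = bit ? code_word - low_shift     /* cmovb %3, %2 */
                       : code_word;
    return bit;
}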
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vp6dsp.asm |
---|
0,0 → 1,170 |
;****************************************************************************** |
;* MMX/SSE2-optimized functions for the VP6 decoder |
;* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> |
;* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
cextern pw_64 |
SECTION .text |
%macro DIAG4 6 |
%if mmsize == 8 |
movq m0, [%1+%2] |
movq m1, [%1+%3] |
movq m3, m0 |
movq m4, m1 |
punpcklbw m0, m7 |
punpcklbw m1, m7 |
punpckhbw m3, m7 |
punpckhbw m4, m7 |
pmullw m0, [rsp+8*11] ; src[x-8 ] * biweight [0] |
pmullw m1, [rsp+8*12] ; src[x ] * biweight [1] |
pmullw m3, [rsp+8*11] ; src[x-8 ] * biweight [0] |
pmullw m4, [rsp+8*12] ; src[x ] * biweight [1] |
paddw m0, m1 |
paddw m3, m4 |
movq m1, [%1+%4] |
movq m2, [%1+%5] |
movq m4, m1 |
movq m5, m2 |
punpcklbw m1, m7 |
punpcklbw m2, m7 |
punpckhbw m4, m7 |
punpckhbw m5, m7 |
pmullw m1, [rsp+8*13] ; src[x+8 ] * biweight [2] |
pmullw m2, [rsp+8*14] ; src[x+16] * biweight [3] |
pmullw m4, [rsp+8*13] ; src[x+8 ] * biweight [2] |
pmullw m5, [rsp+8*14] ; src[x+16] * biweight [3] |
paddw m1, m2 |
paddw m4, m5 |
paddsw m0, m1 |
paddsw m3, m4 |
paddsw m0, m6 ; Add 64 |
paddsw m3, m6 ; Add 64 |
psraw m0, 7 |
psraw m3, 7 |
packuswb m0, m3 |
movq [%6], m0 |
%else ; mmsize == 16 |
movq m0, [%1+%2] |
movq m1, [%1+%3] |
punpcklbw m0, m7 |
punpcklbw m1, m7 |
pmullw m0, m4 ; src[x-8 ] * biweight [0] |
pmullw m1, m5 ; src[x ] * biweight [1] |
paddw m0, m1 |
movq m1, [%1+%4] |
movq m2, [%1+%5] |
punpcklbw m1, m7 |
punpcklbw m2, m7 |
pmullw m1, m6 ; src[x+8 ] * biweight [2] |
pmullw m2, m3 ; src[x+16] * biweight [3] |
paddw m1, m2 |
paddsw m0, m1 |
paddsw m0, [pw_64] ; Add 64 |
psraw m0, 7 |
packuswb m0, m0 |
movq [%6], m0 |
%endif ; mmsize == 8/16 |
%endmacro |
%macro SPLAT4REGS 0 |
%if mmsize == 8 |
movq m5, m3 |
punpcklwd m3, m3 |
movq m4, m3 |
punpckldq m3, m3 |
punpckhdq m4, m4 |
punpckhwd m5, m5 |
movq m2, m5 |
punpckhdq m2, m2 |
punpckldq m5, m5 |
movq [rsp+8*11], m3 |
movq [rsp+8*12], m4 |
movq [rsp+8*13], m5 |
movq [rsp+8*14], m2 |
%else ; mmsize == 16 |
pshuflw m4, m3, 0x0 |
pshuflw m5, m3, 0x55 |
pshuflw m6, m3, 0xAA |
pshuflw m3, m3, 0xFF |
punpcklqdq m4, m4 |
punpcklqdq m5, m5 |
punpcklqdq m6, m6 |
punpcklqdq m3, m3 |
%endif ; mmsize == 8/16 |
%endmacro |
%macro vp6_filter_diag4 0 |
; void ff_vp6_filter_diag4_<opt>(uint8_t *dst, uint8_t *src, int stride, |
; const int16_t h_weights[4], const int16_t v_weights[4])
cglobal vp6_filter_diag4, 5, 7, 8 |
mov r5, rsp ; backup stack pointer |
and rsp, ~(mmsize-1) ; align stack |
%if mmsize == 16 |
sub rsp, 8*11 |
%else |
sub rsp, 8*15 |
movq m6, [pw_64] |
%endif |
%if ARCH_X86_64 |
movsxd r2, r2d |
%endif |
sub r1, r2 |
pxor m7, m7 |
movq m3, [r3] |
SPLAT4REGS |
mov r3, rsp |
mov r6, 11 |
.nextrow: |
DIAG4 r1, -1, 0, 1, 2, r3 |
add r3, 8 |
add r1, r2 |
dec r6 |
jnz .nextrow |
movq m3, [r4] |
SPLAT4REGS |
lea r3, [rsp+8] |
mov r6, 8 |
.nextcol: |
DIAG4 r3, -8, 0, 8, 16, r0 |
add r3, 8 |
add r0, r2 |
dec r6 |
jnz .nextcol |
mov rsp, r5 ; restore stack pointer |
RET |
%endmacro |
%if ARCH_X86_32 |
INIT_MMX mmx |
vp6_filter_diag4 |
%endif |
INIT_XMM sse2 |
vp6_filter_diag4 |
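As a reading aid for DIAG4/SPLAT4REGS above, here is the filter in scalar C, a sketch with hypothetical names: an 11-row horizontal 4-tap pass into a temporary buffer, then an 8-row vertical 4-tap pass into dst, each with +64 rounding and a 7-bit shift. Like the packuswb in the asm, the sketch clamps the intermediate rows to bytes; the saturating word adds (paddsw) are assumed not to overflow.

#include <stdint.h>

static inline uint8_t clamp_u8(int v) { return v < 0 ? 0 : v > 255 ? 255 : v; }

static void vp6_filter_diag4_sketch(uint8_t *dst, const uint8_t *src, int stride,
                                    const int16_t *h_weights, const int16_t *v_weights)
{
    uint8_t tmp[8 * 11], *t = tmp;
    int x, y;

    src -= stride;                               /* sub r1, r2 */
    for (y = 0; y < 11; y++) {                   /* .nextrow loop */
        for (x = 0; x < 8; x++)
            t[x] = clamp_u8((src[x-1] * h_weights[0] + src[x]   * h_weights[1] +
                             src[x+1] * h_weights[2] + src[x+2] * h_weights[3] + 64) >> 7);
        t   += 8;
        src += stride;
    }
    t = tmp + 8;                                 /* lea r3, [rsp+8] */
    for (y = 0; y < 8; y++) {                    /* .nextcol loop */
        for (x = 0; x < 8; x++)
            dst[x] = clamp_u8((t[x-8] * v_weights[0] + t[x]    * v_weights[1] +
                               t[x+8] * v_weights[2] + t[x+16] * v_weights[3] + 64) >> 7);
        t   += 8;
        dst += stride;
    }
}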
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vp6dsp_init.c |
---|
0,0 → 1,45 |
/* |
* VP6 MMX/SSE2 optimizations |
* Copyright (C) 2009 Sebastien Lucas <sebastien.lucas@gmail.com> |
* Copyright (C) 2009 Zuxy Meng <zuxy.meng@gmail.com> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/cpu.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/vp56dsp.h" |
void ff_vp6_filter_diag4_mmx(uint8_t *dst, uint8_t *src, int stride, |
const int16_t *h_weights,const int16_t *v_weights); |
void ff_vp6_filter_diag4_sse2(uint8_t *dst, uint8_t *src, int stride, |
const int16_t *h_weights,const int16_t *v_weights); |
av_cold void ff_vp6dsp_init_x86(VP56DSPContext* c, enum AVCodecID codec) |
{ |
int cpu_flags = av_get_cpu_flags(); |
#if ARCH_X86_32 |
if (EXTERNAL_MMX(cpu_flags)) { |
c->vp6_filter_diag4 = ff_vp6_filter_diag4_mmx; |
} |
#endif |
if (EXTERNAL_SSE2(cpu_flags)) { |
c->vp6_filter_diag4 = ff_vp6_filter_diag4_sse2; |
} |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vp8dsp.asm |
---|
0,0 → 1,2780 |
;****************************************************************************** |
;* VP8 MMXEXT optimizations |
;* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> |
;* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
fourtap_filter_hw_m: times 4 dw -6, 123 |
times 4 dw 12, -1 |
times 4 dw -9, 93 |
times 4 dw 50, -6 |
times 4 dw -6, 50 |
times 4 dw 93, -9 |
times 4 dw -1, 12 |
times 4 dw 123, -6 |
sixtap_filter_hw_m: times 4 dw 2, -11 |
times 4 dw 108, 36 |
times 4 dw -8, 1 |
times 4 dw 3, -16 |
times 4 dw 77, 77 |
times 4 dw -16, 3 |
times 4 dw 1, -8 |
times 4 dw 36, 108 |
times 4 dw -11, 2 |
fourtap_filter_hb_m: times 8 db -6, 123 |
times 8 db 12, -1 |
times 8 db -9, 93 |
times 8 db 50, -6 |
times 8 db -6, 50 |
times 8 db 93, -9 |
times 8 db -1, 12 |
times 8 db 123, -6 |
sixtap_filter_hb_m: times 8 db 2, 1 |
times 8 db -11, 108 |
times 8 db 36, -8 |
times 8 db 3, 3 |
times 8 db -16, 77 |
times 8 db 77, -16 |
times 8 db 1, 2 |
times 8 db -8, 36 |
times 8 db 108, -11 |
fourtap_filter_v_m: times 8 dw -6 |
times 8 dw 123 |
times 8 dw 12 |
times 8 dw -1 |
times 8 dw -9 |
times 8 dw 93 |
times 8 dw 50 |
times 8 dw -6 |
times 8 dw -6 |
times 8 dw 50 |
times 8 dw 93 |
times 8 dw -9 |
times 8 dw -1 |
times 8 dw 12 |
times 8 dw 123 |
times 8 dw -6 |
sixtap_filter_v_m: times 8 dw 2 |
times 8 dw -11 |
times 8 dw 108 |
times 8 dw 36 |
times 8 dw -8 |
times 8 dw 1 |
times 8 dw 3 |
times 8 dw -16 |
times 8 dw 77 |
times 8 dw 77 |
times 8 dw -16 |
times 8 dw 3 |
times 8 dw 1 |
times 8 dw -8 |
times 8 dw 36 |
times 8 dw 108 |
times 8 dw -11 |
times 8 dw 2 |
bilinear_filter_vw_m: times 8 dw 1 |
times 8 dw 2 |
times 8 dw 3 |
times 8 dw 4 |
times 8 dw 5 |
times 8 dw 6 |
times 8 dw 7 |
bilinear_filter_vb_m: times 8 db 7, 1 |
times 8 db 6, 2 |
times 8 db 5, 3 |
times 8 db 4, 4 |
times 8 db 3, 5 |
times 8 db 2, 6 |
times 8 db 1, 7 |
%ifdef PIC |
%define fourtap_filter_hw picregq |
%define sixtap_filter_hw picregq |
%define fourtap_filter_hb picregq |
%define sixtap_filter_hb picregq |
%define fourtap_filter_v picregq |
%define sixtap_filter_v picregq |
%define bilinear_filter_vw picregq |
%define bilinear_filter_vb picregq |
%define npicregs 1 |
%else |
%define fourtap_filter_hw fourtap_filter_hw_m |
%define sixtap_filter_hw sixtap_filter_hw_m |
%define fourtap_filter_hb fourtap_filter_hb_m |
%define sixtap_filter_hb sixtap_filter_hb_m |
%define fourtap_filter_v fourtap_filter_v_m |
%define sixtap_filter_v sixtap_filter_v_m |
%define bilinear_filter_vw bilinear_filter_vw_m |
%define bilinear_filter_vb bilinear_filter_vb_m |
%define npicregs 0 |
%endif |
filter_h2_shuf: db 0, 1, 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8 |
filter_h4_shuf: db 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10 |
filter_h6_shuf1: db 0, 5, 1, 6, 2, 7, 3, 8, 4, 9, 5, 10, 6, 11, 7, 12 |
filter_h6_shuf2: db 1, 2, 2, 3, 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9 |
filter_h6_shuf3: db 3, 4, 4, 5, 5, 6, 6, 7, 7, 8, 8, 9, 9, 10, 10, 11 |
pw_27: times 8 dw 27 |
pw_63: times 8 dw 63 |
pw_256: times 8 dw 256 |
pw_20091: times 4 dw 20091 |
pw_17734: times 4 dw 17734 |
pb_4: times 16 db 4 |
pb_F8: times 16 db 0xF8 |
pb_FE: times 16 db 0xFE |
pb_27_63: times 8 db 27, 63 |
pb_18_63: times 8 db 18, 63 |
pb_9_63: times 8 db 9, 63 |
cextern pb_1 |
cextern pw_3 |
cextern pb_3 |
cextern pw_4 |
cextern pw_9 |
cextern pw_18 |
cextern pw_64 |
cextern pb_80 |
SECTION .text |
;----------------------------------------------------------------------------- |
; subpel MC functions: |
; |
; void put_vp8_epel<size>_h<htap>v<vtap>_<opt>(uint8_t *dst, int deststride, |
; uint8_t *src, int srcstride, |
; int height, int mx, int my); |
;----------------------------------------------------------------------------- |
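One output pixel of the 6-tap horizontal case in scalar form may help when reading the macros below. This is a sketch with a hypothetical helper; taps[] stands for one row of the sixtap_filter tables above, and the pmulhrsw-by-pw_256 rounding used in the SSSE3 path is arithmetically identical to adding 64 and shifting by 7.

#include <stdint.h>

static inline uint8_t vp8_epel_h6_pixel(const uint8_t *src, const int16_t taps[6])
{
    int sum = taps[0] * src[-2] + taps[1] * src[-1] + taps[2] * src[0] +
              taps[3] * src[ 1] + taps[4] * src[ 2] + taps[5] * src[3];
    sum = (sum + 64) >> 7;                        /* pw_64 + psraw 7, or pmulhrsw pw_256 */
    return sum < 0 ? 0 : sum > 255 ? 255 : sum;   /* packuswb saturation */
}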
%macro FILTER_SSSE3 1 |
cglobal put_vp8_epel%1_h6, 6, 6 + npicregs, 8, dst, dststride, src, srcstride, height, mx, picreg |
lea mxd, [mxq*3] |
mova m3, [filter_h6_shuf2] |
mova m4, [filter_h6_shuf3] |
%ifdef PIC |
lea picregq, [sixtap_filter_hb_m] |
%endif |
mova m5, [sixtap_filter_hb+mxq*8-48] ; set up 6tap filter in bytes |
mova m6, [sixtap_filter_hb+mxq*8-32] |
mova m7, [sixtap_filter_hb+mxq*8-16] |
.nextrow: |
movu m0, [srcq-2] |
mova m1, m0 |
mova m2, m0 |
%if mmsize == 8 |
; For epel4, we need 9 bytes, but only 8 get loaded; to compensate, do the |
; shuffle with a memory operand |
punpcklbw m0, [srcq+3] |
%else |
pshufb m0, [filter_h6_shuf1] |
%endif |
pshufb m1, m3 |
pshufb m2, m4 |
pmaddubsw m0, m5 |
pmaddubsw m1, m6 |
pmaddubsw m2, m7 |
paddsw m0, m1 |
paddsw m0, m2 |
pmulhrsw m0, [pw_256] |
packuswb m0, m0 |
movh [dstq], m0 ; store |
; go to next line |
add dstq, dststrideq |
add srcq, srcstrideq |
dec heightd ; next row |
jg .nextrow |
REP_RET |
cglobal put_vp8_epel%1_h4, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg |
shl mxd, 4 |
mova m2, [pw_256] |
mova m3, [filter_h2_shuf] |
mova m4, [filter_h4_shuf] |
%ifdef PIC |
lea picregq, [fourtap_filter_hb_m] |
%endif |
mova m5, [fourtap_filter_hb+mxq-16] ; set up 4tap filter in bytes |
mova m6, [fourtap_filter_hb+mxq] |
.nextrow: |
movu m0, [srcq-1] |
mova m1, m0 |
pshufb m0, m3 |
pshufb m1, m4 |
pmaddubsw m0, m5 |
pmaddubsw m1, m6 |
paddsw m0, m1 |
pmulhrsw m0, m2 |
packuswb m0, m0 |
movh [dstq], m0 ; store |
; go to next line |
add dstq, dststrideq |
add srcq, srcstrideq |
dec heightd ; next row |
jg .nextrow |
REP_RET |
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my |
shl myd, 4 |
%ifdef PIC |
lea picregq, [fourtap_filter_hb_m] |
%endif |
mova m5, [fourtap_filter_hb+myq-16] |
mova m6, [fourtap_filter_hb+myq] |
mova m7, [pw_256] |
; read 3 lines |
sub srcq, srcstrideq |
movh m0, [srcq] |
movh m1, [srcq+ srcstrideq] |
movh m2, [srcq+2*srcstrideq] |
add srcq, srcstrideq |
.nextrow: |
movh m3, [srcq+2*srcstrideq] ; read new row |
mova m4, m0 |
mova m0, m1 |
punpcklbw m4, m1 |
mova m1, m2 |
punpcklbw m2, m3 |
pmaddubsw m4, m5 |
pmaddubsw m2, m6 |
paddsw m4, m2 |
mova m2, m3 |
pmulhrsw m4, m7 |
packuswb m4, m4 |
movh [dstq], m4 |
; go to next line |
add dstq, dststrideq |
add srcq, srcstrideq |
dec heightd ; next row |
jg .nextrow |
REP_RET |
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my |
lea myd, [myq*3] |
%ifdef PIC |
lea picregq, [sixtap_filter_hb_m] |
%endif |
lea myq, [sixtap_filter_hb+myq*8] |
; read 5 lines |
sub srcq, srcstrideq |
sub srcq, srcstrideq |
movh m0, [srcq] |
movh m1, [srcq+srcstrideq] |
movh m2, [srcq+srcstrideq*2] |
lea srcq, [srcq+srcstrideq*2] |
add srcq, srcstrideq |
movh m3, [srcq] |
movh m4, [srcq+srcstrideq] |
.nextrow: |
movh m5, [srcq+2*srcstrideq] ; read new row |
mova m6, m0 |
punpcklbw m6, m5 |
mova m0, m1 |
punpcklbw m1, m2 |
mova m7, m3 |
punpcklbw m7, m4 |
pmaddubsw m6, [myq-48] |
pmaddubsw m1, [myq-32] |
pmaddubsw m7, [myq-16] |
paddsw m6, m1 |
paddsw m6, m7 |
mova m1, m2 |
mova m2, m3 |
pmulhrsw m6, [pw_256] |
mova m3, m4 |
packuswb m6, m6 |
mova m4, m5 |
movh [dstq], m6 |
; go to next line |
add dstq, dststrideq |
add srcq, srcstrideq |
dec heightd ; next row |
jg .nextrow |
REP_RET |
%endmacro |
INIT_MMX ssse3 |
FILTER_SSSE3 4 |
INIT_XMM ssse3 |
FILTER_SSSE3 8 |
; 4x4 block, H-only 4-tap filter |
INIT_MMX mmxext |
cglobal put_vp8_epel4_h4, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg |
shl mxd, 4 |
%ifdef PIC |
lea picregq, [fourtap_filter_hw_m] |
%endif |
movq mm4, [fourtap_filter_hw+mxq-16] ; set up 4tap filter in words |
movq mm5, [fourtap_filter_hw+mxq] |
movq mm7, [pw_64] |
pxor mm6, mm6 |
.nextrow: |
movq mm1, [srcq-1] ; (ABCDEFGH) load 8 horizontal pixels |
; first set of 2 pixels |
movq mm2, mm1 ; byte ABCD.. |
punpcklbw mm1, mm6 ; byte->word ABCD |
pshufw mm0, mm2, 9 ; byte CDEF.. |
punpcklbw mm0, mm6 ; byte->word CDEF |
pshufw mm3, mm1, 0x94 ; word ABBC |
pshufw mm1, mm0, 0x94 ; word CDDE |
pmaddwd mm3, mm4 ; multiply 2px with F0/F1 |
movq mm0, mm1 ; backup for second set of pixels |
pmaddwd mm1, mm5 ; multiply 2px with F2/F3 |
paddd mm3, mm1 ; finish 1st 2px |
; second set of 2 pixels, use backup of above |
punpckhbw mm2, mm6 ; byte->word EFGH |
pmaddwd mm0, mm4 ; multiply backed up 2px with F0/F1 |
pshufw mm1, mm2, 0x94 ; word EFFG |
pmaddwd mm1, mm5 ; multiply 2px with F2/F3 |
paddd mm0, mm1 ; finish 2nd 2px |
; merge two sets of 2 pixels into one set of 4, round/clip/store |
packssdw mm3, mm0 ; merge dword->word (4px) |
paddsw mm3, mm7 ; rounding |
psraw mm3, 7 |
packuswb mm3, mm6 ; clip and word->bytes |
movd [dstq], mm3 ; store |
; go to next line |
add dstq, dststrideq |
add srcq, srcstrideq |
dec heightd ; next row |
jg .nextrow |
REP_RET |
; 4x4 block, H-only 6-tap filter |
INIT_MMX mmxext |
cglobal put_vp8_epel4_h6, 6, 6 + npicregs, 0, dst, dststride, src, srcstride, height, mx, picreg |
lea mxd, [mxq*3] |
%ifdef PIC |
lea picregq, [sixtap_filter_hw_m] |
%endif |
movq mm4, [sixtap_filter_hw+mxq*8-48] ; set up 6tap filter in words
movq mm5, [sixtap_filter_hw+mxq*8-32] |
movq mm6, [sixtap_filter_hw+mxq*8-16] |
movq mm7, [pw_64] |
pxor mm3, mm3 |
.nextrow: |
movq mm1, [srcq-2] ; (ABCDEFGH) load 8 horizontal pixels |
; first set of 2 pixels |
movq mm2, mm1 ; byte ABCD.. |
punpcklbw mm1, mm3 ; byte->word ABCD |
pshufw mm0, mm2, 0x9 ; byte CDEF.. |
punpckhbw mm2, mm3 ; byte->word EFGH |
punpcklbw mm0, mm3 ; byte->word CDEF |
pshufw mm1, mm1, 0x94 ; word ABBC |
pshufw mm2, mm2, 0x94 ; word EFFG |
pmaddwd mm1, mm4 ; multiply 2px with F0/F1 |
pshufw mm3, mm0, 0x94 ; word CDDE |
movq mm0, mm3 ; backup for second set of pixels |
pmaddwd mm3, mm5 ; multiply 2px with F2/F3 |
paddd mm1, mm3 ; add to 1st 2px cache |
movq mm3, mm2 ; backup for second set of pixels |
pmaddwd mm2, mm6 ; multiply 2px with F4/F5 |
paddd mm1, mm2 ; finish 1st 2px |
; second set of 2 pixels, use backup of above |
movd mm2, [srcq+3] ; byte FGHI (prevent overreads) |
pmaddwd mm0, mm4 ; multiply 1st backed up 2px with F0/F1 |
pmaddwd mm3, mm5 ; multiply 2nd backed up 2px with F2/F3 |
paddd mm0, mm3 ; add to 2nd 2px cache |
pxor mm3, mm3 |
punpcklbw mm2, mm3 ; byte->word FGHI |
pshufw mm2, mm2, 0xE9 ; word GHHI |
pmaddwd mm2, mm6 ; multiply 2px with F4/F5 |
paddd mm0, mm2 ; finish 2nd 2px |
; merge two sets of 2 pixels into one set of 4, round/clip/store |
packssdw mm1, mm0 ; merge dword->word (4px) |
paddsw mm1, mm7 ; rounding |
psraw mm1, 7 |
packuswb mm1, mm3 ; clip and word->bytes |
movd [dstq], mm1 ; store |
; go to next line |
add dstq, dststrideq |
add srcq, srcstrideq |
dec heightd ; next row |
jg .nextrow |
REP_RET |
INIT_XMM sse2 |
cglobal put_vp8_epel8_h4, 6, 6 + npicregs, 10, dst, dststride, src, srcstride, height, mx, picreg |
shl mxd, 5 |
%ifdef PIC |
lea picregq, [fourtap_filter_v_m] |
%endif |
lea mxq, [fourtap_filter_v+mxq-32] |
pxor m7, m7 |
mova m4, [pw_64] |
mova m5, [mxq+ 0] |
mova m6, [mxq+16] |
%ifdef m8 |
mova m8, [mxq+32] |
mova m9, [mxq+48] |
%endif |
.nextrow: |
movq m0, [srcq-1] |
movq m1, [srcq-0] |
movq m2, [srcq+1] |
movq m3, [srcq+2] |
punpcklbw m0, m7 |
punpcklbw m1, m7 |
punpcklbw m2, m7 |
punpcklbw m3, m7 |
pmullw m0, m5 |
pmullw m1, m6 |
%ifdef m8 |
pmullw m2, m8 |
pmullw m3, m9 |
%else |
pmullw m2, [mxq+32] |
pmullw m3, [mxq+48] |
%endif |
paddsw m0, m1 |
paddsw m2, m3 |
paddsw m0, m2 |
paddsw m0, m4 |
psraw m0, 7 |
packuswb m0, m7 |
movh [dstq], m0 ; store |
; go to next line |
add dstq, dststrideq |
add srcq, srcstrideq |
dec heightd ; next row |
jg .nextrow |
REP_RET |
INIT_XMM sse2 |
cglobal put_vp8_epel8_h6, 6, 6 + npicregs, 14, dst, dststride, src, srcstride, height, mx, picreg |
lea mxd, [mxq*3] |
shl mxd, 4 |
%ifdef PIC |
lea picregq, [sixtap_filter_v_m] |
%endif |
lea mxq, [sixtap_filter_v+mxq-96] |
pxor m7, m7 |
mova m6, [pw_64] |
%ifdef m8 |
mova m8, [mxq+ 0] |
mova m9, [mxq+16] |
mova m10, [mxq+32] |
mova m11, [mxq+48] |
mova m12, [mxq+64] |
mova m13, [mxq+80] |
%endif |
.nextrow: |
movq m0, [srcq-2] |
movq m1, [srcq-1] |
movq m2, [srcq-0] |
movq m3, [srcq+1] |
movq m4, [srcq+2] |
movq m5, [srcq+3] |
punpcklbw m0, m7 |
punpcklbw m1, m7 |
punpcklbw m2, m7 |
punpcklbw m3, m7 |
punpcklbw m4, m7 |
punpcklbw m5, m7 |
%ifdef m8 |
pmullw m0, m8 |
pmullw m1, m9 |
pmullw m2, m10 |
pmullw m3, m11 |
pmullw m4, m12 |
pmullw m5, m13 |
%else |
pmullw m0, [mxq+ 0] |
pmullw m1, [mxq+16] |
pmullw m2, [mxq+32] |
pmullw m3, [mxq+48] |
pmullw m4, [mxq+64] |
pmullw m5, [mxq+80] |
%endif |
paddsw m1, m4 |
paddsw m0, m5 |
paddsw m1, m2 |
paddsw m0, m3 |
paddsw m0, m1 |
paddsw m0, m6 |
psraw m0, 7 |
packuswb m0, m7 |
movh [dstq], m0 ; store |
; go to next line |
add dstq, dststrideq |
add srcq, srcstrideq |
dec heightd ; next row |
jg .nextrow |
REP_RET |
%macro FILTER_V 1 |
; 4x4 block, V-only 4-tap filter |
cglobal put_vp8_epel%1_v4, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my |
shl myd, 5 |
%ifdef PIC |
lea picregq, [fourtap_filter_v_m] |
%endif |
lea myq, [fourtap_filter_v+myq-32] |
mova m6, [pw_64] |
pxor m7, m7 |
mova m5, [myq+48] |
; read 3 lines |
sub srcq, srcstrideq |
movh m0, [srcq] |
movh m1, [srcq+ srcstrideq] |
movh m2, [srcq+2*srcstrideq] |
add srcq, srcstrideq |
punpcklbw m0, m7 |
punpcklbw m1, m7 |
punpcklbw m2, m7 |
.nextrow: |
; first calculate negative taps (to prevent losing positive overflows) |
movh m4, [srcq+2*srcstrideq] ; read new row |
punpcklbw m4, m7 |
mova m3, m4 |
pmullw m0, [myq+0] |
pmullw m4, m5 |
paddsw m4, m0 |
; then calculate positive taps |
mova m0, m1 |
pmullw m1, [myq+16] |
paddsw m4, m1 |
mova m1, m2 |
pmullw m2, [myq+32] |
paddsw m4, m2 |
mova m2, m3 |
; round/clip/store |
paddsw m4, m6 |
psraw m4, 7 |
packuswb m4, m7 |
movh [dstq], m4 |
; go to next line |
add dstq, dststrideq |
add srcq, srcstrideq |
dec heightd ; next row |
jg .nextrow |
REP_RET |
; 4x4 block, V-only 6-tap filter |
cglobal put_vp8_epel%1_v6, 7, 7, 8, dst, dststride, src, srcstride, height, picreg, my |
shl myd, 4 |
lea myq, [myq*3] |
%ifdef PIC |
lea picregq, [sixtap_filter_v_m] |
%endif |
lea myq, [sixtap_filter_v+myq-96] |
pxor m7, m7 |
; read 5 lines |
sub srcq, srcstrideq |
sub srcq, srcstrideq |
movh m0, [srcq] |
movh m1, [srcq+srcstrideq] |
movh m2, [srcq+srcstrideq*2] |
lea srcq, [srcq+srcstrideq*2] |
add srcq, srcstrideq |
movh m3, [srcq] |
movh m4, [srcq+srcstrideq] |
punpcklbw m0, m7 |
punpcklbw m1, m7 |
punpcklbw m2, m7 |
punpcklbw m3, m7 |
punpcklbw m4, m7 |
.nextrow: |
; first calculate negative taps (to prevent losing positive overflows) |
mova m5, m1 |
pmullw m5, [myq+16] |
mova m6, m4 |
pmullw m6, [myq+64] |
paddsw m6, m5 |
; then calculate positive taps |
movh m5, [srcq+2*srcstrideq] ; read new row |
punpcklbw m5, m7 |
pmullw m0, [myq+0] |
paddsw m6, m0 |
mova m0, m1 |
mova m1, m2 |
pmullw m2, [myq+32] |
paddsw m6, m2 |
mova m2, m3 |
pmullw m3, [myq+48] |
paddsw m6, m3 |
mova m3, m4 |
mova m4, m5 |
pmullw m5, [myq+80] |
paddsw m6, m5 |
; round/clip/store |
paddsw m6, [pw_64] |
psraw m6, 7 |
packuswb m6, m7 |
movh [dstq], m6 |
; go to next line |
add dstq, dststrideq |
add srcq, srcstrideq |
dec heightd ; next row |
jg .nextrow |
REP_RET |
%endmacro |
INIT_MMX mmxext |
FILTER_V 4 |
INIT_XMM sse2 |
FILTER_V 8 |
%macro FILTER_BILINEAR 1 |
cglobal put_vp8_bilinear%1_v, 7, 7, 7, dst, dststride, src, srcstride, height, picreg, my |
shl myd, 4 |
%ifdef PIC |
lea picregq, [bilinear_filter_vw_m] |
%endif |
pxor m6, m6 |
mova m5, [bilinear_filter_vw+myq-1*16] |
neg myq |
mova m4, [bilinear_filter_vw+myq+7*16] |
.nextrow: |
movh m0, [srcq+srcstrideq*0] |
movh m1, [srcq+srcstrideq*1] |
movh m3, [srcq+srcstrideq*2] |
punpcklbw m0, m6 |
punpcklbw m1, m6 |
punpcklbw m3, m6 |
mova m2, m1 |
pmullw m0, m4 |
pmullw m1, m5 |
pmullw m2, m4 |
pmullw m3, m5 |
paddsw m0, m1 |
paddsw m2, m3 |
psraw m0, 2 |
psraw m2, 2 |
pavgw m0, m6 |
pavgw m2, m6 |
%if mmsize == 8 |
packuswb m0, m0 |
packuswb m2, m2 |
movh [dstq+dststrideq*0], m0 |
movh [dstq+dststrideq*1], m2 |
%else |
packuswb m0, m2 |
movh [dstq+dststrideq*0], m0 |
movhps [dstq+dststrideq*1], m0 |
%endif |
lea dstq, [dstq+dststrideq*2] |
lea srcq, [srcq+srcstrideq*2] |
sub heightd, 2 |
jg .nextrow |
REP_RET |
cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 7, dst, dststride, src, srcstride, height, mx, picreg |
shl mxd, 4 |
%ifdef PIC |
lea picregq, [bilinear_filter_vw_m] |
%endif |
pxor m6, m6 |
mova m5, [bilinear_filter_vw+mxq-1*16] |
neg mxq |
mova m4, [bilinear_filter_vw+mxq+7*16] |
.nextrow: |
movh m0, [srcq+srcstrideq*0+0] |
movh m1, [srcq+srcstrideq*0+1] |
movh m2, [srcq+srcstrideq*1+0] |
movh m3, [srcq+srcstrideq*1+1] |
punpcklbw m0, m6 |
punpcklbw m1, m6 |
punpcklbw m2, m6 |
punpcklbw m3, m6 |
pmullw m0, m4 |
pmullw m1, m5 |
pmullw m2, m4 |
pmullw m3, m5 |
paddsw m0, m1 |
paddsw m2, m3 |
psraw m0, 2 |
psraw m2, 2 |
pavgw m0, m6 |
pavgw m2, m6 |
%if mmsize == 8 |
packuswb m0, m0 |
packuswb m2, m2 |
movh [dstq+dststrideq*0], m0 |
movh [dstq+dststrideq*1], m2 |
%else |
packuswb m0, m2 |
movh [dstq+dststrideq*0], m0 |
movhps [dstq+dststrideq*1], m0 |
%endif |
lea dstq, [dstq+dststrideq*2] |
lea srcq, [srcq+srcstrideq*2] |
sub heightd, 2 |
jg .nextrow |
REP_RET |
%endmacro |
INIT_MMX mmxext |
FILTER_BILINEAR 4 |
INIT_XMM sse2 |
FILTER_BILINEAR 8 |
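Per pixel, the bilinear kernels above (and the SSSE3 byte variants below) compute one weighted average. A scalar sketch, where f is the 3-bit subpel position mx or my in 1..7; the psraw-by-2 followed by pavgw-against-zero rounds exactly like +4 then a 3-bit shift:

#include <stdint.h>

static inline uint8_t vp8_bilinear_pixel(uint8_t a, uint8_t b, int f)
{
    /* weights (8-f) and f come from the bilinear_filter_vw/vb tables */
    return (uint8_t)((a * (8 - f) + b * f + 4) >> 3);
}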
%macro FILTER_BILINEAR_SSSE3 1 |
cglobal put_vp8_bilinear%1_v, 7, 7, 5, dst, dststride, src, srcstride, height, picreg, my |
shl myd, 4 |
%ifdef PIC |
lea picregq, [bilinear_filter_vb_m] |
%endif |
pxor m4, m4 |
mova m3, [bilinear_filter_vb+myq-16] |
.nextrow: |
movh m0, [srcq+srcstrideq*0] |
movh m1, [srcq+srcstrideq*1] |
movh m2, [srcq+srcstrideq*2] |
punpcklbw m0, m1 |
punpcklbw m1, m2 |
pmaddubsw m0, m3 |
pmaddubsw m1, m3 |
psraw m0, 2 |
psraw m1, 2 |
pavgw m0, m4 |
pavgw m1, m4 |
%if mmsize==8 |
packuswb m0, m0 |
packuswb m1, m1 |
movh [dstq+dststrideq*0], m0 |
movh [dstq+dststrideq*1], m1 |
%else |
packuswb m0, m1 |
movh [dstq+dststrideq*0], m0 |
movhps [dstq+dststrideq*1], m0 |
%endif |
lea dstq, [dstq+dststrideq*2] |
lea srcq, [srcq+srcstrideq*2] |
sub heightd, 2 |
jg .nextrow |
REP_RET |
cglobal put_vp8_bilinear%1_h, 6, 6 + npicregs, 5, dst, dststride, src, srcstride, height, mx, picreg |
shl mxd, 4 |
%ifdef PIC |
lea picregq, [bilinear_filter_vb_m] |
%endif |
pxor m4, m4 |
mova m2, [filter_h2_shuf] |
mova m3, [bilinear_filter_vb+mxq-16] |
.nextrow: |
movu m0, [srcq+srcstrideq*0] |
movu m1, [srcq+srcstrideq*1] |
pshufb m0, m2 |
pshufb m1, m2 |
pmaddubsw m0, m3 |
pmaddubsw m1, m3 |
psraw m0, 2 |
psraw m1, 2 |
pavgw m0, m4 |
pavgw m1, m4 |
%if mmsize==8 |
packuswb m0, m0 |
packuswb m1, m1 |
movh [dstq+dststrideq*0], m0 |
movh [dstq+dststrideq*1], m1 |
%else |
packuswb m0, m1 |
movh [dstq+dststrideq*0], m0 |
movhps [dstq+dststrideq*1], m0 |
%endif |
lea dstq, [dstq+dststrideq*2] |
lea srcq, [srcq+srcstrideq*2] |
sub heightd, 2 |
jg .nextrow |
REP_RET |
%endmacro |
INIT_MMX ssse3 |
FILTER_BILINEAR_SSSE3 4 |
INIT_XMM ssse3 |
FILTER_BILINEAR_SSSE3 8 |
INIT_MMX mmx |
cglobal put_vp8_pixels8, 5, 5, 0, dst, dststride, src, srcstride, height |
.nextrow: |
movq mm0, [srcq+srcstrideq*0] |
movq mm1, [srcq+srcstrideq*1] |
lea srcq, [srcq+srcstrideq*2] |
movq [dstq+dststrideq*0], mm0 |
movq [dstq+dststrideq*1], mm1 |
lea dstq, [dstq+dststrideq*2] |
sub heightd, 2 |
jg .nextrow |
REP_RET |
%if ARCH_X86_32 |
INIT_MMX mmx |
cglobal put_vp8_pixels16, 5, 5, 0, dst, dststride, src, srcstride, height |
.nextrow: |
movq mm0, [srcq+srcstrideq*0+0] |
movq mm1, [srcq+srcstrideq*0+8] |
movq mm2, [srcq+srcstrideq*1+0] |
movq mm3, [srcq+srcstrideq*1+8] |
lea srcq, [srcq+srcstrideq*2] |
movq [dstq+dststrideq*0+0], mm0 |
movq [dstq+dststrideq*0+8], mm1 |
movq [dstq+dststrideq*1+0], mm2 |
movq [dstq+dststrideq*1+8], mm3 |
lea dstq, [dstq+dststrideq*2] |
sub heightd, 2 |
jg .nextrow |
REP_RET |
%endif |
INIT_XMM sse |
cglobal put_vp8_pixels16, 5, 5, 2, dst, dststride, src, srcstride, height |
.nextrow: |
movups xmm0, [srcq+srcstrideq*0] |
movups xmm1, [srcq+srcstrideq*1] |
lea srcq, [srcq+srcstrideq*2] |
movaps [dstq+dststrideq*0], xmm0 |
movaps [dstq+dststrideq*1], xmm1 |
lea dstq, [dstq+dststrideq*2] |
sub heightd, 2 |
jg .nextrow |
REP_RET |
;----------------------------------------------------------------------------- |
; void vp8_idct_dc_add_<opt>(uint8_t *dst, int16_t block[16], int stride); |
;----------------------------------------------------------------------------- |
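All of the DC-add variants below implement the same scalar update (a sketch, helper name ours): round the DC with +4, shift by 3, clear the coefficient, and add the result to a 4x4 block with byte saturation.

#include <stdint.h>

static void vp8_idct_dc_add_sketch(uint8_t *dst, int16_t block[16], int stride)
{
    int dc = (block[0] + 4) >> 3;   /* paddw [pw_4] ; psraw 3 */
    int x, y;

    block[0] = 0;
    for (y = 0; y < 4; y++) {
        for (x = 0; x < 4; x++) {
            int v = dst[x] + dc;
            dst[x] = v < 0 ? 0 : v > 255 ? 255 : v;
        }
        dst += stride;
    }
}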
%macro ADD_DC 4 |
%4 m2, [dst1q+%3] |
%4 m3, [dst1q+strideq+%3] |
%4 m4, [dst2q+%3] |
%4 m5, [dst2q+strideq+%3] |
paddusb m2, %1 |
paddusb m3, %1 |
paddusb m4, %1 |
paddusb m5, %1 |
psubusb m2, %2 |
psubusb m3, %2 |
psubusb m4, %2 |
psubusb m5, %2 |
%4 [dst1q+%3], m2 |
%4 [dst1q+strideq+%3], m3 |
%4 [dst2q+%3], m4 |
%4 [dst2q+strideq+%3], m5 |
%endmacro |
INIT_MMX mmx |
cglobal vp8_idct_dc_add, 3, 3, 0, dst, block, stride |
; load data |
movd m0, [blockq] |
; calculate DC |
paddw m0, [pw_4] |
pxor m1, m1 |
psraw m0, 3 |
movd [blockq], m1 |
psubw m1, m0 |
packuswb m0, m0 |
packuswb m1, m1 |
punpcklbw m0, m0 |
punpcklbw m1, m1 |
punpcklwd m0, m0 |
punpcklwd m1, m1 |
; add DC |
DEFINE_ARGS dst1, dst2, stride |
lea dst2q, [dst1q+strideq*2] |
ADD_DC m0, m1, 0, movh |
RET |
INIT_XMM sse4 |
cglobal vp8_idct_dc_add, 3, 3, 6, dst, block, stride |
; load data |
movd m0, [blockq] |
pxor m1, m1 |
; calculate DC |
paddw m0, [pw_4] |
movd [blockq], m1 |
DEFINE_ARGS dst1, dst2, stride |
lea dst2q, [dst1q+strideq*2] |
movd m2, [dst1q] |
movd m3, [dst1q+strideq] |
movd m4, [dst2q] |
movd m5, [dst2q+strideq] |
psraw m0, 3 |
pshuflw m0, m0, 0 |
punpcklqdq m0, m0 |
punpckldq m2, m3 |
punpckldq m4, m5 |
punpcklbw m2, m1 |
punpcklbw m4, m1 |
paddw m2, m0 |
paddw m4, m0 |
packuswb m2, m4 |
movd [dst1q], m2 |
pextrd [dst1q+strideq], m2, 1 |
pextrd [dst2q], m2, 2 |
pextrd [dst2q+strideq], m2, 3 |
RET |
;----------------------------------------------------------------------------- |
; void vp8_idct_dc_add4y_<opt>(uint8_t *dst, int16_t block[4][16], int stride); |
;----------------------------------------------------------------------------- |
%if ARCH_X86_32 |
INIT_MMX mmx |
cglobal vp8_idct_dc_add4y, 3, 3, 0, dst, block, stride |
; load data |
movd m0, [blockq+32*0] ; A |
movd m1, [blockq+32*2] ; C |
punpcklwd m0, [blockq+32*1] ; A B |
punpcklwd m1, [blockq+32*3] ; C D |
punpckldq m0, m1 ; A B C D |
pxor m6, m6 |
; calculate DC |
paddw m0, [pw_4] |
movd [blockq+32*0], m6 |
movd [blockq+32*1], m6 |
movd [blockq+32*2], m6 |
movd [blockq+32*3], m6 |
psraw m0, 3 |
psubw m6, m0 |
packuswb m0, m0 |
packuswb m6, m6 |
punpcklbw m0, m0 ; AABBCCDD |
punpcklbw m6, m6 ; AABBCCDD |
movq m1, m0 |
movq m7, m6 |
punpcklbw m0, m0 ; AAAABBBB |
punpckhbw m1, m1 ; CCCCDDDD |
punpcklbw m6, m6 ; AAAABBBB |
punpckhbw m7, m7 ; CCCCDDDD |
; add DC |
DEFINE_ARGS dst1, dst2, stride |
lea dst2q, [dst1q+strideq*2] |
ADD_DC m0, m6, 0, mova |
ADD_DC m1, m7, 8, mova |
RET |
%endif |
INIT_XMM sse2 |
cglobal vp8_idct_dc_add4y, 3, 3, 6, dst, block, stride |
; load data |
movd m0, [blockq+32*0] ; A |
movd m1, [blockq+32*2] ; C |
punpcklwd m0, [blockq+32*1] ; A B |
punpcklwd m1, [blockq+32*3] ; C D |
punpckldq m0, m1 ; A B C D |
pxor m1, m1 |
; calculate DC |
paddw m0, [pw_4] |
movd [blockq+32*0], m1 |
movd [blockq+32*1], m1 |
movd [blockq+32*2], m1 |
movd [blockq+32*3], m1 |
psraw m0, 3 |
psubw m1, m0 |
packuswb m0, m0 |
packuswb m1, m1 |
punpcklbw m0, m0 |
punpcklbw m1, m1 |
punpcklbw m0, m0 |
punpcklbw m1, m1 |
; add DC |
DEFINE_ARGS dst1, dst2, stride |
lea dst2q, [dst1q+strideq*2] |
ADD_DC m0, m1, 0, mova |
RET |
;----------------------------------------------------------------------------- |
; void vp8_idct_dc_add4uv_<opt>(uint8_t *dst, int16_t block[4][16], int stride); |
;----------------------------------------------------------------------------- |
INIT_MMX mmx |
cglobal vp8_idct_dc_add4uv, 3, 3, 0, dst, block, stride |
; load data |
movd m0, [blockq+32*0] ; A |
movd m1, [blockq+32*2] ; C |
punpcklwd m0, [blockq+32*1] ; A B |
punpcklwd m1, [blockq+32*3] ; C D |
punpckldq m0, m1 ; A B C D |
pxor m6, m6 |
; calculate DC |
paddw m0, [pw_4] |
movd [blockq+32*0], m6 |
movd [blockq+32*1], m6 |
movd [blockq+32*2], m6 |
movd [blockq+32*3], m6 |
psraw m0, 3 |
psubw m6, m0 |
packuswb m0, m0 |
packuswb m6, m6 |
punpcklbw m0, m0 ; AABBCCDD |
punpcklbw m6, m6 ; AABBCCDD |
movq m1, m0 |
movq m7, m6 |
punpcklbw m0, m0 ; AAAABBBB |
punpckhbw m1, m1 ; CCCCDDDD |
punpcklbw m6, m6 ; AAAABBBB |
punpckhbw m7, m7 ; CCCCDDDD |
; add DC |
DEFINE_ARGS dst1, dst2, stride |
lea dst2q, [dst1q+strideq*2] |
ADD_DC m0, m6, 0, mova |
lea dst1q, [dst1q+strideq*4] |
lea dst2q, [dst2q+strideq*4] |
ADD_DC m1, m7, 0, mova |
RET |
;----------------------------------------------------------------------------- |
; void vp8_idct_add_<opt>(uint8_t *dst, int16_t block[16], int stride); |
;----------------------------------------------------------------------------- |
; calculate %1=mul_35468(%1)-mul_20091(%2); %2=mul_20091(%1)+mul_35468(%2) |
; this macro assumes that m6/m7 have words for 20091/17734 loaded |
%macro VP8_MULTIPLY_SUMSUB 4 |
mova %3, %1 |
mova %4, %2 |
pmulhw %3, m6 ;20091(1) |
pmulhw %4, m6 ;20091(2) |
paddw %3, %1 |
paddw %4, %2 |
paddw %1, %1 |
paddw %2, %2 |
pmulhw %1, m7 ;35468(1) |
pmulhw %2, m7 ;35468(2) |
psubw %1, %4 |
paddw %2, %3 |
%endmacro |
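The constant trick deserves a note: 35468 does not fit a signed 16-bit pmulhw operand, so the macro doubles the input (paddw %1, %1) and multiplies by 17734 = 35468/2 instead, while mul_20091 adds the input back after the high multiply. Per 16-bit lane, ignoring the wraparound corner cases the real input range avoids, a sketch:

#include <stdint.h>

/* a' = mul_35468(a) - mul_20091(b);  b' = mul_20091(a) + mul_35468(b) */
static void vp8_multiply_sumsub_sketch(int16_t *a, int16_t *b)
{
    int t_a = ((*a * 20091) >> 16) + *a;   /* pmulhw m6 then paddw: mul_20091 */
    int t_b = ((*b * 20091) >> 16) + *b;
    int u_a = (*a * 2 * 17734) >> 16;      /* paddw %1,%1 then pmulhw m7: mul_35468 */
    int u_b = (*b * 2 * 17734) >> 16;

    *a = (int16_t)(u_a - t_b);
    *b = (int16_t)(u_b + t_a);
}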
; calculate x0=%1+%3; x1=%1-%3 |
; x2=mul_35468(%2)-mul_20091(%4); x3=mul_20091(%2)+mul_35468(%4) |
; %1=x0+x3 (tmp0); %2=x1+x2 (tmp1); %3=x1-x2 (tmp2); %4=x0-x3 (tmp3) |
; %5/%6 are temporary registers |
; we assume m6/m7 have constant words 20091/17734 loaded in them |
%macro VP8_IDCT_TRANSFORM4x4_1D 6 |
SUMSUB_BA w, %3, %1, %5 ;t0, t1 |
VP8_MULTIPLY_SUMSUB m%2, m%4, m%5,m%6 ;t2, t3 |
SUMSUB_BA w, %4, %3, %5 ;tmp0, tmp3 |
SUMSUB_BA w, %2, %1, %5 ;tmp1, tmp2 |
SWAP %4, %1 |
SWAP %4, %3 |
%endmacro |
%macro VP8_IDCT_ADD 0 |
cglobal vp8_idct_add, 3, 3, 0, dst, block, stride |
; load block data |
movq m0, [blockq+ 0] |
movq m1, [blockq+ 8] |
movq m2, [blockq+16] |
movq m3, [blockq+24] |
movq m6, [pw_20091] |
movq m7, [pw_17734] |
%if cpuflag(sse) |
xorps xmm0, xmm0 |
movaps [blockq+ 0], xmm0 |
movaps [blockq+16], xmm0 |
%else |
pxor m4, m4 |
movq [blockq+ 0], m4 |
movq [blockq+ 8], m4 |
movq [blockq+16], m4 |
movq [blockq+24], m4 |
%endif |
; actual IDCT |
VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |
paddw m0, [pw_4] |
VP8_IDCT_TRANSFORM4x4_1D 0, 1, 2, 3, 4, 5 |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |
; store |
pxor m4, m4 |
DEFINE_ARGS dst1, dst2, stride |
lea dst2q, [dst1q+2*strideq] |
STORE_DIFFx2 m0, m1, m6, m7, m4, 3, dst1q, strideq |
STORE_DIFFx2 m2, m3, m6, m7, m4, 3, dst2q, strideq |
RET |
%endmacro |
%if ARCH_X86_32 |
INIT_MMX mmx |
VP8_IDCT_ADD |
%endif |
INIT_MMX sse |
VP8_IDCT_ADD |
;----------------------------------------------------------------------------- |
; void vp8_luma_dc_wht_<opt>(int16_t block[4][4][16], int16_t dc[16])
;----------------------------------------------------------------------------- |
%macro SCATTER_WHT 3 |
movd dc1d, m%1 |
movd dc2d, m%2 |
mov [blockq+2*16*(0+%3)], dc1w |
mov [blockq+2*16*(1+%3)], dc2w |
shr dc1d, 16 |
shr dc2d, 16 |
psrlq m%1, 32 |
psrlq m%2, 32 |
mov [blockq+2*16*(4+%3)], dc1w |
mov [blockq+2*16*(5+%3)], dc2w |
movd dc1d, m%1 |
movd dc2d, m%2 |
mov [blockq+2*16*(8+%3)], dc1w |
mov [blockq+2*16*(9+%3)], dc2w |
shr dc1d, 16 |
shr dc2d, 16 |
mov [blockq+2*16*(12+%3)], dc1w |
mov [blockq+2*16*(13+%3)], dc2w |
%endmacro |
%macro HADAMARD4_1D 4 |
SUMSUB_BADC w, %2, %1, %4, %3 |
SUMSUB_BADC w, %4, %2, %3, %1 |
SWAP %1, %4, %3 |
%endmacro |
%macro VP8_DC_WHT 0 |
cglobal vp8_luma_dc_wht, 2, 3, 0, block, dc1, dc2 |
movq m0, [dc1q] |
movq m1, [dc1q+8] |
movq m2, [dc1q+16] |
movq m3, [dc1q+24] |
%if cpuflag(sse) |
xorps xmm0, xmm0 |
movaps [dc1q+ 0], xmm0 |
movaps [dc1q+16], xmm0 |
%else |
pxor m4, m4 |
movq [dc1q+ 0], m4 |
movq [dc1q+ 8], m4 |
movq [dc1q+16], m4 |
movq [dc1q+24], m4 |
%endif |
HADAMARD4_1D 0, 1, 2, 3 |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |
paddw m0, [pw_3] |
HADAMARD4_1D 0, 1, 2, 3 |
psraw m0, 3 |
psraw m1, 3 |
psraw m2, 3 |
psraw m3, 3 |
SCATTER_WHT 0, 1, 0 |
SCATTER_WHT 2, 3, 2 |
RET |
%endmacro |
%if ARCH_X86_32 |
INIT_MMX mmx |
VP8_DC_WHT |
%endif |
INIT_MMX sse |
VP8_DC_WHT |
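vp8_luma_dc_wht is the inverse Walsh-Hadamard transform on the 16 second-stage DC values; SCATTER_WHT then writes each result into the DC slot of its 4x4 block, hence the 2*16*n word offsets. A scalar sketch of the same butterflies, with the +3 rounding folded into the second pass exactly as the paddw [pw_3] above does:

#include <stdint.h>

static void vp8_luma_dc_wht_sketch(int16_t block[4][4][16], int16_t dc[16])
{
    int i, t0, t1, t2, t3;

    for (i = 0; i < 4; i++) {                 /* first HADAMARD4_1D, down the columns */
        t0 = dc[0*4 + i] + dc[3*4 + i];
        t1 = dc[1*4 + i] + dc[2*4 + i];
        t2 = dc[1*4 + i] - dc[2*4 + i];
        t3 = dc[0*4 + i] - dc[3*4 + i];
        dc[0*4 + i] = t0 + t1;
        dc[1*4 + i] = t3 + t2;
        dc[2*4 + i] = t0 - t1;
        dc[3*4 + i] = t3 - t2;
    }
    for (i = 0; i < 4; i++) {                 /* second pass, across the rows */
        t0 = dc[i*4 + 0] + dc[i*4 + 3] + 3;   /* paddw [pw_3] rounding */
        t1 = dc[i*4 + 1] + dc[i*4 + 2];
        t2 = dc[i*4 + 1] - dc[i*4 + 2];
        t3 = dc[i*4 + 0] - dc[i*4 + 3] + 3;
        dc[i*4 + 0] = dc[i*4 + 1] = dc[i*4 + 2] = dc[i*4 + 3] = 0;

        block[i][0][0] = (t0 + t1) >> 3;      /* SCATTER_WHT targets */
        block[i][1][0] = (t3 + t2) >> 3;
        block[i][2][0] = (t0 - t1) >> 3;
        block[i][3][0] = (t3 - t2) >> 3;
    }
}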
;----------------------------------------------------------------------------- |
; void vp8_h/v_loop_filter_simple_<opt>(uint8_t *dst, int stride, int flim); |
;----------------------------------------------------------------------------- |
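Per pixel column, the SIMD below evaluates the following scalar decision and update, sketched here with our own clamp helpers; p1 and p0 sit on one side of the edge, q0 and q1 on the other:

#include <stdint.h>

static inline int clamp_s8(int v) { return v < -128 ? -128 : v > 127 ? 127 : v; }
static inline int clamp_u8(int v) { return v <    0 ?    0 : v > 255 ? 255 : v; }

static void vp8_simple_filter_col(uint8_t *p1, uint8_t *p0,
                                  uint8_t *q0, uint8_t *q1, int flim)
{
    int d0 = *p0 - *q0, d1 = *p1 - *q1;

    /* simple_limit: abs(p0-q0)*2 + abs(p1-q1)/2 <= flim */
    if (2 * (d0 < 0 ? -d0 : d0) + ((d1 < 0 ? -d1 : d1) >> 1) <= flim) {
        int a  = clamp_s8(3 * (*q0 - *p0) + clamp_s8(d1));  /* filter_common */
        int f1 = clamp_s8(a + 4) >> 3;                      /* pb_4 branch */
        int f2 = clamp_s8(a + 3) >> 3;                      /* pb_3 branch */
        *q0 = clamp_u8(*q0 - f1);
        *p0 = clamp_u8(*p0 + f2);
    }
}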
; macro called with 7 mm register indexes as argument, and 4 regular registers |
; |
; first 4 mm registers will carry the transposed pixel data |
; the other three are scratchspace (one would be sufficient, but this allows |
; for more spreading/pipelining and thus faster execution on out-of-order CPUs)
; |
; first two regular registers are buf+4*stride and buf+5*stride |
; third is -stride, fourth is +stride |
%macro READ_8x4_INTERLEAVED 11 |
; interleave 8 (A-H) rows of 4 pixels each |
movd m%1, [%8+%10*4] ; A0-3 |
movd m%5, [%9+%10*4] ; B0-3 |
movd m%2, [%8+%10*2] ; C0-3 |
movd m%6, [%8+%10] ; D0-3 |
movd m%3, [%8] ; E0-3 |
movd m%7, [%9] ; F0-3 |
movd m%4, [%9+%11] ; G0-3 |
punpcklbw m%1, m%5 ; A/B interleaved |
movd m%5, [%9+%11*2] ; H0-3 |
punpcklbw m%2, m%6 ; C/D interleaved |
punpcklbw m%3, m%7 ; E/F interleaved |
punpcklbw m%4, m%5 ; G/H interleaved |
%endmacro |
; macro called with 7 mm register indexes as argument, and 5 regular registers |
; first 11 mean the same as READ_8x4_INTERLEAVED above
; fifth regular register is scratchspace to reach the bottom 8 rows, it |
; will be set to second regular register + 8*stride at the end |
%macro READ_16x4_INTERLEAVED 12 |
; transpose 16 (A-P) rows of 4 pixels each |
lea %12, [r0+8*r2] |
; read (and interleave) those addressable by %8 (=r0), A/C/D/E/I/K/L/M |
movd m%1, [%8+%10*4] ; A0-3 |
movd m%3, [%12+%10*4] ; I0-3 |
movd m%2, [%8+%10*2] ; C0-3 |
movd m%4, [%12+%10*2] ; K0-3 |
movd m%6, [%8+%10] ; D0-3 |
movd m%5, [%12+%10] ; L0-3 |
movd m%7, [%12] ; M0-3 |
add %12, %11 |
punpcklbw m%1, m%3 ; A/I |
movd m%3, [%8] ; E0-3 |
punpcklbw m%2, m%4 ; C/K |
punpcklbw m%6, m%5 ; D/L |
punpcklbw m%3, m%7 ; E/M |
punpcklbw m%2, m%6 ; C/D/K/L interleaved |
; read (and interleave) those addressable by %9 (=r4), B/F/G/H/J/N/O/P |
movd m%5, [%9+%10*4] ; B0-3 |
movd m%4, [%12+%10*4] ; J0-3 |
movd m%7, [%9] ; F0-3 |
movd m%6, [%12] ; N0-3 |
punpcklbw m%5, m%4 ; B/J |
punpcklbw m%7, m%6 ; F/N |
punpcklbw m%1, m%5 ; A/B/I/J interleaved |
punpcklbw m%3, m%7 ; E/F/M/N interleaved |
movd m%4, [%9+%11] ; G0-3 |
movd m%6, [%12+%11] ; O0-3 |
movd m%5, [%9+%11*2] ; H0-3 |
movd m%7, [%12+%11*2] ; P0-3 |
punpcklbw m%4, m%6 ; G/O |
punpcklbw m%5, m%7 ; H/P |
punpcklbw m%4, m%5 ; G/H/O/P interleaved |
%endmacro |
; write 4 mm registers of 2 dwords each |
; first four arguments are mm register indexes containing source data |
; last four are registers containing buf+4*stride, buf+5*stride, |
; -stride and +stride |
%macro WRITE_4x2D 8 |
; write out (2 dwords per register) |
movd [%5+%7*4], m%1 |
movd [%5+%7*2], m%2 |
movd [%5], m%3 |
movd [%6+%8], m%4 |
punpckhdq m%1, m%1 |
punpckhdq m%2, m%2 |
punpckhdq m%3, m%3 |
punpckhdq m%4, m%4 |
movd [%6+%7*4], m%1 |
movd [%5+%7], m%2 |
movd [%6], m%3 |
movd [%6+%8*2], m%4 |
%endmacro |
; write 4 xmm registers of 4 dwords each |
; arguments same as WRITE_4x2D, but with an extra register, so that the 5 regular
; registers contain buf+4*stride, buf+5*stride, buf+12*stride, -stride and +stride |
; we add 1*stride to the third regular register in the process
; the 10th argument is 16 if it's a Y filter (i.e. all regular registers cover the |
; same memory region), or 8 if they cover two separate buffers (third one points to |
; a different memory region than the first two), allowing for more optimal code for |
; the 16-width case |
%macro WRITE_4x4D 10 |
; write out (4 dwords per register), start with dwords zero |
movd [%5+%8*4], m%1 |
movd [%5], m%2 |
movd [%7+%8*4], m%3 |
movd [%7], m%4 |
; store dwords 1 |
psrldq m%1, 4 |
psrldq m%2, 4 |
psrldq m%3, 4 |
psrldq m%4, 4 |
movd [%6+%8*4], m%1 |
movd [%6], m%2 |
%if %10 == 16 |
movd [%6+%9*4], m%3 |
%endif |
movd [%7+%9], m%4 |
; write dwords 2 |
psrldq m%1, 4 |
psrldq m%2, 4 |
%if %10 == 8 |
movd [%5+%8*2], m%1 |
movd %5d, m%3 |
%endif |
psrldq m%3, 4 |
psrldq m%4, 4 |
%if %10 == 16 |
movd [%5+%8*2], m%1 |
%endif |
movd [%6+%9], m%2 |
movd [%7+%8*2], m%3 |
movd [%7+%9*2], m%4 |
add %7, %9 |
; store dwords 3 |
psrldq m%1, 4 |
psrldq m%2, 4 |
psrldq m%3, 4 |
psrldq m%4, 4 |
%if %10 == 8 |
mov [%7+%8*4], %5d |
movd [%6+%8*2], m%1 |
%else |
movd [%5+%8], m%1 |
%endif |
movd [%6+%9*2], m%2 |
movd [%7+%8*2], m%3 |
movd [%7+%9*2], m%4 |
%endmacro |
; write 4 or 8 words in the mmx/xmm registers as 8 lines |
; 1 and 2 are the registers to write, this can be the same (for SSE2) |
; for pre-SSE4: |
; 3 is a general-purpose register that we will clobber |
; for SSE4: |
; 3 is a pointer to the destination's 5th line |
; 4 is a pointer to the destination's 4th line |
; 5/6 is -stride and +stride |
%macro WRITE_2x4W 6 |
movd %3d, %1 |
punpckhdq %1, %1 |
mov [%4+%5*4], %3w |
shr %3, 16 |
add %4, %6 |
mov [%4+%5*4], %3w |
movd %3d, %1 |
add %4, %5 |
mov [%4+%5*2], %3w |
shr %3, 16 |
mov [%4+%5 ], %3w |
movd %3d, %2 |
punpckhdq %2, %2 |
mov [%4 ], %3w |
shr %3, 16 |
mov [%4+%6 ], %3w |
movd %3d, %2 |
add %4, %6 |
mov [%4+%6 ], %3w |
shr %3, 16 |
mov [%4+%6*2], %3w |
add %4, %5 |
%endmacro |
%macro WRITE_8W 5 |
%if cpuflag(sse4) |
pextrw [%3+%4*4], %1, 0 |
pextrw [%2+%4*4], %1, 1 |
pextrw [%3+%4*2], %1, 2 |
pextrw [%3+%4 ], %1, 3 |
pextrw [%3 ], %1, 4 |
pextrw [%2 ], %1, 5 |
pextrw [%2+%5 ], %1, 6 |
pextrw [%2+%5*2], %1, 7 |
%else |
movd %2d, %1 |
psrldq %1, 4 |
mov [%3+%4*4], %2w |
shr %2, 16 |
add %3, %5 |
mov [%3+%4*4], %2w |
movd %2d, %1 |
psrldq %1, 4 |
add %3, %4 |
mov [%3+%4*2], %2w |
shr %2, 16 |
mov [%3+%4 ], %2w |
movd %2d, %1 |
psrldq %1, 4 |
mov [%3 ], %2w |
shr %2, 16 |
mov [%3+%5 ], %2w |
movd %2d, %1 |
add %3, %5 |
mov [%3+%5 ], %2w |
shr %2, 16 |
mov [%3+%5*2], %2w |
%endif |
%endmacro |
%macro SIMPLE_LOOPFILTER 2 |
cglobal vp8_%1_loop_filter_simple, 3, %2, 8, dst, stride, flim, cntr |
%if mmsize == 8 ; mmx/mmxext |
mov cntrq, 2 |
%endif |
%if cpuflag(ssse3) |
pxor m0, m0 |
%endif |
SPLATB_REG m7, flim, m0 ; splat "flim" into register |
; set up indexes to address 4 rows |
%if mmsize == 8 |
DEFINE_ARGS dst1, mstride, stride, cntr, dst2 |
%else |
DEFINE_ARGS dst1, mstride, stride, dst3, dst2 |
%endif |
mov strideq, mstrideq |
neg mstrideq |
%ifidn %1, h |
lea dst1q, [dst1q+4*strideq-2] |
%endif |
%if mmsize == 8 ; mmx / mmxext |
.next8px: |
%endif |
%ifidn %1, v |
; read 4 half/full rows of pixels |
mova m0, [dst1q+mstrideq*2] ; p1 |
mova m1, [dst1q+mstrideq] ; p0 |
mova m2, [dst1q] ; q0 |
mova m3, [dst1q+ strideq] ; q1 |
%else ; h |
lea dst2q, [dst1q+ strideq] |
%if mmsize == 8 ; mmx/mmxext |
READ_8x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq |
%else ; sse2 |
READ_16x4_INTERLEAVED 0, 1, 2, 3, 4, 5, 6, dst1q, dst2q, mstrideq, strideq, dst3q |
%endif |
TRANSPOSE4x4W 0, 1, 2, 3, 4 |
%endif |
; simple_limit |
mova m5, m2 ; m5=backup of q0 |
mova m6, m1 ; m6=backup of p0 |
psubusb m1, m2 ; p0-q0 |
psubusb m2, m6 ; q0-p0 |
por m1, m2 ; FFABS(p0-q0) |
paddusb m1, m1 ; m1=FFABS(p0-q0)*2 |
mova m4, m3 |
mova m2, m0 |
psubusb m3, m0 ; q1-p1 |
psubusb m0, m4 ; p1-q1 |
por m3, m0 ; FFABS(p1-q1) |
mova m0, [pb_80] |
pxor m2, m0 |
pxor m4, m0 |
psubsb m2, m4 ; m2=p1-q1 (signed) backup for below |
pand m3, [pb_FE] |
psrlq m3, 1 ; m3=FFABS(p1-q1)/2, this can be used signed |
paddusb m3, m1 |
psubusb m3, m7 |
pxor m1, m1 |
pcmpeqb m3, m1 ; abs(p0-q0)*2+abs(p1-q1)/2<=flim mask(0xff/0x0) |
; filter_common (use m2/p1-q1, m4=q0, m6=p0, m5/q0-p0 and m3/mask) |
mova m4, m5 |
pxor m5, m0 |
pxor m0, m6 |
psubsb m5, m0 ; q0-p0 (signed) |
paddsb m2, m5 |
paddsb m2, m5 |
paddsb m2, m5 ; a=(p1-q1) + 3*(q0-p0) |
pand m2, m3 ; apply filter mask (m3) |
mova m3, [pb_F8] |
mova m1, m2 |
paddsb m2, [pb_4] ; f1<<3=a+4 |
paddsb m1, [pb_3] ; f2<<3=a+3 |
pand m2, m3 |
pand m1, m3 ; cache f2<<3 |
pxor m0, m0 |
pxor m3, m3 |
pcmpgtb m0, m2 ; which values are <0? |
psubb m3, m2 ; -f1<<3 |
psrlq m2, 3 ; +f1 |
psrlq m3, 3 ; -f1 |
pand m3, m0 |
pandn m0, m2 |
psubusb m4, m0 |
paddusb m4, m3 ; q0-f1 |
pxor m0, m0 |
pxor m3, m3 |
pcmpgtb m0, m1 ; which values are <0? |
psubb m3, m1 ; -f2<<3 |
psrlq m1, 3 ; +f2 |
psrlq m3, 3 ; -f2 |
pand m3, m0 |
pandn m0, m1 |
paddusb m6, m0 |
psubusb m6, m3 ; p0+f2 |
; store |
%ifidn %1, v |
mova [dst1q], m4 |
mova [dst1q+mstrideq], m6 |
%else ; h |
inc dst1q |
SBUTTERFLY bw, 6, 4, 0 |
%if mmsize == 16 ; sse2 |
%if cpuflag(sse4) |
inc dst2q |
%endif |
WRITE_8W m6, dst2q, dst1q, mstrideq, strideq |
lea dst2q, [dst3q+mstrideq+1] |
%if cpuflag(sse4) |
inc dst3q |
%endif |
WRITE_8W m4, dst3q, dst2q, mstrideq, strideq |
%else ; mmx/mmxext |
WRITE_2x4W m6, m4, dst2q, dst1q, mstrideq, strideq |
%endif |
%endif |
%if mmsize == 8 ; mmx/mmxext |
; next 8 pixels |
%ifidn %1, v |
add dst1q, 8 ; advance 8 cols = pixels |
%else ; h |
lea dst1q, [dst1q+strideq*8-1] ; advance 8 rows = lines |
%endif |
dec cntrq |
jg .next8px |
REP_RET |
%else ; sse2 |
RET |
%endif |
%endmacro |
%if ARCH_X86_32 |
INIT_MMX mmx |
SIMPLE_LOOPFILTER v, 4 |
SIMPLE_LOOPFILTER h, 5 |
INIT_MMX mmxext |
SIMPLE_LOOPFILTER v, 4 |
SIMPLE_LOOPFILTER h, 5 |
%endif |
INIT_XMM sse2 |
SIMPLE_LOOPFILTER v, 3 |
SIMPLE_LOOPFILTER h, 5 |
INIT_XMM ssse3 |
SIMPLE_LOOPFILTER v, 3 |
SIMPLE_LOOPFILTER h, 5 |
INIT_XMM sse4 |
SIMPLE_LOOPFILTER h, 5 |
;----------------------------------------------------------------------------- |
; void vp8_h/v_loop_filter<size>_inner_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
; int flimE, int flimI, int hev_thr); |
;----------------------------------------------------------------------------- |
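Most of the macro below computes the filter mask. Per pixel column it evaluates the normal_limit and high-edge-variance tests sketched here (helper names ours): E bounds the step across the edge, I bounds the steps between neighbouring pixels, and hev_thr selects between the 4-tap update of p0/q0 only and the variant that also averages p1/q1.

#include <stdlib.h>

/* pixels p3..p0 before the edge, q0..q3 after it */
static int vp8_normal_limit(int p3, int p2, int p1, int p0,
                            int q0, int q1, int q2, int q3, int E, int I)
{
    return 2 * abs(p0 - q0) + (abs(p1 - q1) >> 1) <= E &&   /* simple_limit */
           abs(p3 - p2) <= I && abs(p2 - p1) <= I && abs(p1 - p0) <= I &&
           abs(q3 - q2) <= I && abs(q2 - q1) <= I && abs(q1 - q0) <= I;
}

static int vp8_hev(int p1, int p0, int q0, int q1, int thresh)
{
    return abs(p1 - p0) > thresh || abs(q1 - q0) > thresh;
}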
%macro INNER_LOOPFILTER 2 |
%define stack_size 0 |
%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |
%ifidn %1, v ; [3]=hev() result |
%define stack_size mmsize * -4 |
%else ; h ; extra storage space for transposes |
%define stack_size mmsize * -5 |
%endif |
%endif |
%if %2 == 8 ; chroma |
cglobal vp8_%1_loop_filter8uv_inner, 6, 6, 13, stack_size, dst, dst8, stride, flimE, flimI, hevthr |
%else ; luma |
cglobal vp8_%1_loop_filter16y_inner, 5, 5, 13, stack_size, dst, stride, flimE, flimI, hevthr |
%endif |
%if cpuflag(ssse3) |
pxor m7, m7 |
%endif |
%ifndef m8 |
; splat function arguments |
SPLATB_REG m0, flimEq, m7 ; E |
SPLATB_REG m1, flimIq, m7 ; I |
SPLATB_REG m2, hevthrq, m7 ; hev_thresh |
%define m_flimE [rsp] |
%define m_flimI [rsp+mmsize] |
%define m_hevthr [rsp+mmsize*2] |
%define m_maskres [rsp+mmsize*3] |
%define m_p0backup [rsp+mmsize*3] |
%define m_q0backup [rsp+mmsize*4] |
mova m_flimE, m0 |
mova m_flimI, m1 |
mova m_hevthr, m2 |
%else |
%define m_flimE m9 |
%define m_flimI m10 |
%define m_hevthr m11 |
%define m_maskres m12 |
%define m_p0backup m12 |
%define m_q0backup m8 |
; splat function arguments |
SPLATB_REG m_flimE, flimEq, m7 ; E |
SPLATB_REG m_flimI, flimIq, m7 ; I |
SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh |
%endif |
%if %2 == 8 ; chroma |
DEFINE_ARGS dst1, dst8, mstride, stride, dst2 |
%elif mmsize == 8 |
DEFINE_ARGS dst1, mstride, stride, dst2, cntr |
mov cntrq, 2 |
%else |
DEFINE_ARGS dst1, mstride, stride, dst2, dst8 |
%endif |
mov strideq, mstrideq |
neg mstrideq |
%ifidn %1, h |
lea dst1q, [dst1q+strideq*4-4] |
%if %2 == 8 ; chroma |
lea dst8q, [dst8q+strideq*4-4] |
%endif |
%endif |
%if mmsize == 8 |
.next8px: |
%endif |
; read |
lea dst2q, [dst1q+strideq] |
%ifidn %1, v |
%if %2 == 8 && mmsize == 16 |
%define movrow movh |
%else |
%define movrow mova |
%endif |
movrow m0, [dst1q+mstrideq*4] ; p3 |
movrow m1, [dst2q+mstrideq*4] ; p2 |
movrow m2, [dst1q+mstrideq*2] ; p1 |
movrow m5, [dst2q] ; q1 |
movrow m6, [dst2q+ strideq*1] ; q2 |
movrow m7, [dst2q+ strideq*2] ; q3 |
%if mmsize == 16 && %2 == 8 |
movhps m0, [dst8q+mstrideq*4] |
movhps m2, [dst8q+mstrideq*2] |
add dst8q, strideq |
movhps m1, [dst8q+mstrideq*4] |
movhps m5, [dst8q] |
movhps m6, [dst8q+ strideq ] |
movhps m7, [dst8q+ strideq*2] |
add dst8q, mstrideq |
%endif |
%elif mmsize == 8 ; mmx/mmxext (h) |
; read 8 rows of 8px each |
movu m0, [dst1q+mstrideq*4] |
movu m1, [dst2q+mstrideq*4] |
movu m2, [dst1q+mstrideq*2] |
movu m3, [dst1q+mstrideq ] |
movu m4, [dst1q] |
movu m5, [dst2q] |
movu m6, [dst2q+ strideq ] |
; 8x8 transpose |
TRANSPOSE4x4B 0, 1, 2, 3, 7 |
mova m_q0backup, m1 |
movu m7, [dst2q+ strideq*2] |
TRANSPOSE4x4B 4, 5, 6, 7, 1 |
SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
mova m1, m_q0backup |
mova m_q0backup, m2 ; store q0 |
SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
mova m_p0backup, m5 ; store p0 |
SWAP 1, 4 |
SWAP 2, 4 |
SWAP 6, 3 |
SWAP 5, 3 |
%else ; sse2 (h) |
%if %2 == 16 |
lea dst8q, [dst1q+ strideq*8] |
%endif |
; read 16 rows of 8px each, interleave |
movh m0, [dst1q+mstrideq*4] |
movh m1, [dst8q+mstrideq*4] |
movh m2, [dst1q+mstrideq*2] |
movh m5, [dst8q+mstrideq*2] |
movh m3, [dst1q+mstrideq ] |
movh m6, [dst8q+mstrideq ] |
movh m4, [dst1q] |
movh m7, [dst8q] |
punpcklbw m0, m1 ; A/I |
punpcklbw m2, m5 ; C/K |
punpcklbw m3, m6 ; D/L |
punpcklbw m4, m7 ; E/M |
add dst8q, strideq |
movh m1, [dst2q+mstrideq*4] |
movh m6, [dst8q+mstrideq*4] |
movh m5, [dst2q] |
movh m7, [dst8q] |
punpcklbw m1, m6 ; B/J |
punpcklbw m5, m7 ; F/N |
movh m6, [dst2q+ strideq ] |
movh m7, [dst8q+ strideq ] |
punpcklbw m6, m7 ; G/O |
; 8x16 transpose |
TRANSPOSE4x4B 0, 1, 2, 3, 7 |
%ifdef m8 |
SWAP 1, 8 |
%else |
mova m_q0backup, m1 |
%endif |
movh m7, [dst2q+ strideq*2] |
movh m1, [dst8q+ strideq*2] |
punpcklbw m7, m1 ; H/P |
TRANSPOSE4x4B 4, 5, 6, 7, 1 |
SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
%ifdef m8 |
SWAP 1, 8 |
SWAP 2, 8 |
%else |
mova m1, m_q0backup |
mova m_q0backup, m2 ; store q0 |
%endif |
SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
%ifdef m12 |
SWAP 5, 12 |
%else |
mova m_p0backup, m5 ; store p0 |
%endif |
SWAP 1, 4 |
SWAP 2, 4 |
SWAP 6, 3 |
SWAP 5, 3 |
%endif |
; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 |
mova m4, m1 |
SWAP 4, 1 |
psubusb m4, m0 ; p2-p3 |
psubusb m0, m1 ; p3-p2 |
por m0, m4 ; abs(p3-p2) |
mova m4, m2 |
SWAP 4, 2 |
psubusb m4, m1 ; p1-p2 |
psubusb m1, m2 ; p2-p1 |
por m1, m4 ; abs(p2-p1) |
mova m4, m6 |
SWAP 4, 6 |
psubusb m4, m7 ; q2-q3 |
psubusb m7, m6 ; q3-q2 |
por m7, m4 ; abs(q3-q2) |
mova m4, m5 |
SWAP 4, 5 |
psubusb m4, m6 ; q1-q2 |
psubusb m6, m5 ; q2-q1 |
por m6, m4 ; abs(q2-q1) |
%if notcpuflag(mmxext) |
mova m4, m_flimI |
pxor m3, m3 |
psubusb m0, m4 |
psubusb m1, m4 |
psubusb m7, m4 |
psubusb m6, m4 |
pcmpeqb m0, m3 ; abs(p3-p2) <= I |
pcmpeqb m1, m3 ; abs(p2-p1) <= I |
pcmpeqb m7, m3 ; abs(q3-q2) <= I |
pcmpeqb m6, m3 ; abs(q2-q1) <= I |
pand m0, m1 |
pand m7, m6 |
pand m0, m7 |
%else ; mmxext/sse2 |
pmaxub m0, m1 |
pmaxub m6, m7 |
pmaxub m0, m6 |
%endif |
; normal_limit and high_edge_variance for p1-p0, q1-q0 |
SWAP 7, 3 ; now m7 is zero |
%ifidn %1, v |
movrow m3, [dst1q+mstrideq ] ; p0 |
%if mmsize == 16 && %2 == 8 |
movhps m3, [dst8q+mstrideq ] |
%endif |
%elifdef m12 |
SWAP 3, 12 |
%else |
mova m3, m_p0backup |
%endif |
mova m1, m2 |
SWAP 1, 2 |
mova m6, m3 |
SWAP 3, 6 |
psubusb m1, m3 ; p1-p0 |
psubusb m6, m2 ; p0-p1 |
por m1, m6 ; abs(p1-p0) |
%if notcpuflag(mmxext) |
mova m6, m1 |
psubusb m1, m4 |
psubusb m6, m_hevthr |
pcmpeqb m1, m7 ; abs(p1-p0) <= I |
pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh |
pand m0, m1 |
mova m_maskres, m6 |
%else ; mmxext/sse2 |
pmaxub m0, m1 ; max_I |
SWAP 1, 4 ; max_hev_thresh |
%endif |
SWAP 6, 4 ; now m6 is I |
%ifidn %1, v |
movrow m4, [dst1q] ; q0 |
%if mmsize == 16 && %2 == 8 |
movhps m4, [dst8q] |
%endif |
%elifdef m8 |
SWAP 4, 8 |
%else |
mova m4, m_q0backup |
%endif |
mova m1, m4 |
SWAP 1, 4 |
mova m7, m5 |
SWAP 7, 5 |
psubusb m1, m5 ; q0-q1 |
psubusb m7, m4 ; q1-q0 |
por m1, m7 ; abs(q1-q0) |
%if notcpuflag(mmxext) |
mova m7, m1 |
psubusb m1, m6 |
psubusb m7, m_hevthr |
pxor m6, m6 |
pcmpeqb m1, m6 ; abs(q1-q0) <= I |
pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh |
mova m6, m_maskres |
pand m0, m1 ; abs([pq][321]-[pq][210]) <= I |
pand m6, m7 |
%else ; mmxext/sse2 |
pxor m7, m7 |
pmaxub m0, m1 |
pmaxub m6, m1 |
psubusb m0, m_flimI |
psubusb m6, m_hevthr |
pcmpeqb m0, m7 ; max(abs(..)) <= I |
pcmpeqb m6, m7 ; !(max(abs..) > thresh) |
%endif |
%ifdef m12 |
SWAP 6, 12 |
%else |
mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) |
%endif |
; simple_limit |
mova m1, m3 |
SWAP 1, 3 |
mova m6, m4 ; keep copies of p0/q0 around for later use |
SWAP 6, 4 |
psubusb m1, m4 ; p0-q0 |
psubusb m6, m3 ; q0-p0 |
por m1, m6 ; abs(q0-p0) |
paddusb m1, m1 ; m1=2*abs(q0-p0) |
mova m7, m2 |
SWAP 7, 2 |
mova m6, m5 |
SWAP 6, 5 |
psubusb m7, m5 ; p1-q1 |
psubusb m6, m2 ; q1-p1 |
por m7, m6 ; abs(q1-p1) |
pxor m6, m6 |
pand m7, [pb_FE] |
psrlq m7, 1 ; abs(q1-p1)/2 |
paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 |
psubusb m7, m_flimE |
pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E |
pand m0, m7 ; normal_limit result |
; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask |
%ifdef m8 ; x86-64 && sse2 |
mova m8, [pb_80] |
%define m_pb_80 m8 |
%else ; x86-32 or mmx/mmxext |
%define m_pb_80 [pb_80] |
%endif |
mova m1, m4 |
mova m7, m3 |
pxor m1, m_pb_80 |
pxor m7, m_pb_80 |
psubsb m1, m7 ; (signed) q0-p0 |
mova m6, m2 |
mova m7, m5 |
pxor m6, m_pb_80 |
pxor m7, m_pb_80 |
psubsb m6, m7 ; (signed) p1-q1 |
mova m7, m_maskres |
pandn m7, m6 |
paddsb m7, m1 |
paddsb m7, m1 |
paddsb m7, m1 ; 3*(q0-p0)+is4tap?(p1-q1) |
pand m7, m0 |
mova m1, [pb_F8] |
mova m6, m7 |
paddsb m7, [pb_3] |
paddsb m6, [pb_4] |
pand m7, m1 |
pand m6, m1 |
pxor m1, m1 |
pxor m0, m0 |
pcmpgtb m1, m7 |
psubb m0, m7 |
psrlq m7, 3 ; +f2 |
psrlq m0, 3 ; -f2 |
pand m0, m1 |
pandn m1, m7 |
psubusb m3, m0 |
paddusb m3, m1 ; p0+f2 |
pxor m1, m1 |
pxor m0, m0 |
pcmpgtb m0, m6 |
psubb m1, m6 |
psrlq m6, 3 ; +f1 |
psrlq m1, 3 ; -f1 |
pand m1, m0 |
pandn m0, m6 |
psubusb m4, m0 |
paddusb m4, m1 ; q0-f1 |
%ifdef m12 |
SWAP 6, 12 |
%else |
mova m6, m_maskres |
%endif |
%if notcpuflag(mmxext) |
mova m7, [pb_1] |
%else ; mmxext/sse2 |
pxor m7, m7 |
%endif |
pand m0, m6 |
pand m1, m6 |
%if notcpuflag(mmxext) |
paddusb m0, m7 |
pand m1, [pb_FE] |
pandn m7, m0 |
psrlq m1, 1 |
psrlq m7, 1 |
SWAP 0, 7 |
%else ; mmxext/sse2 |
psubusb m1, [pb_1] |
pavgb m0, m7 ; a |
pavgb m1, m7 ; -a |
%endif |
psubusb m5, m0 |
psubusb m2, m1 |
paddusb m5, m1 ; q1-a |
paddusb m2, m0 ; p1+a |
; store |
%ifidn %1, v |
movrow [dst1q+mstrideq*2], m2 |
movrow [dst1q+mstrideq ], m3 |
movrow [dst1q], m4 |
movrow [dst1q+ strideq ], m5 |
%if mmsize == 16 && %2 == 8 |
movhps [dst8q+mstrideq*2], m2 |
movhps [dst8q+mstrideq ], m3 |
movhps [dst8q], m4 |
movhps [dst8q+ strideq ], m5 |
%endif |
%else ; h |
add dst1q, 2 |
add dst2q, 2 |
; 4x8/16 transpose |
TRANSPOSE4x4B 2, 3, 4, 5, 6 |
%if mmsize == 8 ; mmx/mmxext (h) |
WRITE_4x2D 2, 3, 4, 5, dst1q, dst2q, mstrideq, strideq |
%else ; sse2 (h) |
lea dst8q, [dst8q+mstrideq +2] |
WRITE_4x4D 2, 3, 4, 5, dst1q, dst2q, dst8q, mstrideq, strideq, %2 |
%endif |
%endif |
%if mmsize == 8 |
%if %2 == 8 ; chroma |
%ifidn %1, h |
sub dst1q, 2 |
%endif |
cmp dst1q, dst8q |
mov dst1q, dst8q |
jnz .next8px |
%else |
%ifidn %1, h |
lea dst1q, [dst1q+ strideq*8-2] |
%else ; v |
add dst1q, 8 |
%endif |
dec cntrq |
jg .next8px |
%endif |
REP_RET |
%else ; mmsize == 16 |
RET |
%endif |
%endmacro |
%if ARCH_X86_32 |
INIT_MMX mmx |
INNER_LOOPFILTER v, 16 |
INNER_LOOPFILTER h, 16 |
INNER_LOOPFILTER v, 8 |
INNER_LOOPFILTER h, 8 |
INIT_MMX mmxext |
INNER_LOOPFILTER v, 16 |
INNER_LOOPFILTER h, 16 |
INNER_LOOPFILTER v, 8 |
INNER_LOOPFILTER h, 8 |
%endif |
INIT_XMM sse2 |
INNER_LOOPFILTER v, 16 |
INNER_LOOPFILTER h, 16 |
INNER_LOOPFILTER v, 8 |
INNER_LOOPFILTER h, 8 |
INIT_XMM ssse3 |
INNER_LOOPFILTER v, 16 |
INNER_LOOPFILTER h, 16 |
INNER_LOOPFILTER v, 8 |
INNER_LOOPFILTER h, 8 |
;----------------------------------------------------------------------------- |
; void vp8_h/v_loop_filter<size>_mbedge_<opt>(uint8_t *dst, [uint8_t *v,] int stride, |
; int flimE, int flimI, int hev_thr); |
;----------------------------------------------------------------------------- |
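; A rough sketch of the math implemented below, per the VP8 spec's
; filter_mbedge(): within the filter mask, pixels flagged as high edge
; variance (hev) get the simpler filter_common adjustment of p0/q0,
; while the remaining pixels use a shared correction term
;   w = clamp(clamp(p1 - q1) + 3 * (q0 - p0))
; scaled three ways with rounding:
;   a = (27 * w + 63) >> 7,  p0 += a,  q0 -= a
;   a = (18 * w + 63) >> 7,  p1 += a,  q1 -= a
;   a = ( 9 * w + 63) >> 7,  p2 += a,  q2 -= a
; The SIMD below evaluates this 8/16 pixels at a time: pmaddubsw against
; packed {27,63}/{18,63}/{9,63} byte pairs on ssse3, pmullw by
; pw_27/pw_18/pw_9 plus a pw_63 rounding add otherwise.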
%macro MBEDGE_LOOPFILTER 2 |
%define stack_size 0 |
%ifndef m8 ; stack layout: [0]=E, [1]=I, [2]=hev_thr |
%if mmsize == 16 ; [3]=hev() result |
; [4]=filter tmp result |
; [5]/[6] = p2/q2 backup |
; [7]=lim_res sign result |
%define stack_size mmsize * -7 |
%else ; 8 ; extra storage space for transposes |
%define stack_size mmsize * -8 |
%endif |
%endif |
%if %2 == 8 ; chroma |
cglobal vp8_%1_loop_filter8uv_mbedge, 6, 6, 15, stack_size, dst1, dst8, stride, flimE, flimI, hevthr |
%else ; luma |
cglobal vp8_%1_loop_filter16y_mbedge, 5, 5, 15, stack_size, dst1, stride, flimE, flimI, hevthr |
%endif |
%if cpuflag(ssse3) |
pxor m7, m7 |
%endif |
%ifndef m8 |
; splat function arguments |
SPLATB_REG m0, flimEq, m7 ; E |
SPLATB_REG m1, flimIq, m7 ; I |
SPLATB_REG m2, hevthrq, m7 ; hev_thresh |
%define m_flimE [rsp] |
%define m_flimI [rsp+mmsize] |
%define m_hevthr [rsp+mmsize*2] |
%define m_maskres [rsp+mmsize*3] |
%define m_limres [rsp+mmsize*4] |
%define m_p0backup [rsp+mmsize*3] |
%define m_q0backup [rsp+mmsize*4] |
%define m_p2backup [rsp+mmsize*5] |
%define m_q2backup [rsp+mmsize*6] |
%if mmsize == 16 |
%define m_limsign [rsp] |
%else |
%define m_limsign [rsp+mmsize*7] |
%endif |
mova m_flimE, m0 |
mova m_flimI, m1 |
mova m_hevthr, m2 |
%else ; sse2 on x86-64 |
%define m_flimE m9 |
%define m_flimI m10 |
%define m_hevthr m11 |
%define m_maskres m12 |
%define m_limres m8 |
%define m_p0backup m12 |
%define m_q0backup m8 |
%define m_p2backup m13 |
%define m_q2backup m14 |
%define m_limsign m9 |
; splat function arguments |
SPLATB_REG m_flimE, flimEq, m7 ; E |
SPLATB_REG m_flimI, flimIq, m7 ; I |
SPLATB_REG m_hevthr, hevthrq, m7 ; hev_thresh |
%endif |
%if %2 == 8 ; chroma |
DEFINE_ARGS dst1, dst8, mstride, stride, dst2 |
%elif mmsize == 8 |
DEFINE_ARGS dst1, mstride, stride, dst2, cntr |
mov cntrq, 2 |
%else |
DEFINE_ARGS dst1, mstride, stride, dst2, dst8 |
%endif |
mov strideq, mstrideq |
neg mstrideq |
%ifidn %1, h |
lea dst1q, [dst1q+strideq*4-4] |
%if %2 == 8 ; chroma |
lea dst8q, [dst8q+strideq*4-4] |
%endif |
%endif |
%if mmsize == 8 |
.next8px: |
%endif |
; read |
lea dst2q, [dst1q+ strideq ] |
%ifidn %1, v |
%if %2 == 8 && mmsize == 16 |
%define movrow movh |
%else |
%define movrow mova |
%endif |
movrow m0, [dst1q+mstrideq*4] ; p3 |
movrow m1, [dst2q+mstrideq*4] ; p2 |
movrow m2, [dst1q+mstrideq*2] ; p1 |
movrow m5, [dst2q] ; q1 |
movrow m6, [dst2q+ strideq ] ; q2 |
movrow m7, [dst2q+ strideq*2] ; q3 |
%if mmsize == 16 && %2 == 8 |
movhps m0, [dst8q+mstrideq*4] |
movhps m2, [dst8q+mstrideq*2] |
add dst8q, strideq |
movhps m1, [dst8q+mstrideq*4] |
movhps m5, [dst8q] |
movhps m6, [dst8q+ strideq ] |
movhps m7, [dst8q+ strideq*2] |
add dst8q, mstrideq |
%endif |
%elif mmsize == 8 ; mmx/mmxext (h) |
; read 8 rows of 8px each |
movu m0, [dst1q+mstrideq*4] |
movu m1, [dst2q+mstrideq*4] |
movu m2, [dst1q+mstrideq*2] |
movu m3, [dst1q+mstrideq ] |
movu m4, [dst1q] |
movu m5, [dst2q] |
movu m6, [dst2q+ strideq ] |
; 8x8 transpose |
TRANSPOSE4x4B 0, 1, 2, 3, 7 |
mova m_q0backup, m1 |
movu m7, [dst2q+ strideq*2] |
TRANSPOSE4x4B 4, 5, 6, 7, 1 |
SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
mova m1, m_q0backup |
mova m_q0backup, m2 ; store q0 |
SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
mova m_p0backup, m5 ; store p0 |
SWAP 1, 4 |
SWAP 2, 4 |
SWAP 6, 3 |
SWAP 5, 3 |
%else ; sse2 (h) |
%if %2 == 16 |
lea dst8q, [dst1q+ strideq*8 ] |
%endif |
; read 16 rows of 8px each, interleave |
movh m0, [dst1q+mstrideq*4] |
movh m1, [dst8q+mstrideq*4] |
movh m2, [dst1q+mstrideq*2] |
movh m5, [dst8q+mstrideq*2] |
movh m3, [dst1q+mstrideq ] |
movh m6, [dst8q+mstrideq ] |
movh m4, [dst1q] |
movh m7, [dst8q] |
punpcklbw m0, m1 ; A/I |
punpcklbw m2, m5 ; C/K |
punpcklbw m3, m6 ; D/L |
punpcklbw m4, m7 ; E/M |
add dst8q, strideq |
movh m1, [dst2q+mstrideq*4] |
movh m6, [dst8q+mstrideq*4] |
movh m5, [dst2q] |
movh m7, [dst8q] |
punpcklbw m1, m6 ; B/J |
punpcklbw m5, m7 ; F/N |
movh m6, [dst2q+ strideq ] |
movh m7, [dst8q+ strideq ] |
punpcklbw m6, m7 ; G/O |
; 8x16 transpose |
TRANSPOSE4x4B 0, 1, 2, 3, 7 |
%ifdef m8 |
SWAP 1, 8 |
%else |
mova m_q0backup, m1 |
%endif |
movh m7, [dst2q+ strideq*2] |
movh m1, [dst8q+ strideq*2] |
punpcklbw m7, m1 ; H/P |
TRANSPOSE4x4B 4, 5, 6, 7, 1 |
SBUTTERFLY dq, 0, 4, 1 ; p3/p2 |
SBUTTERFLY dq, 2, 6, 1 ; q0/q1 |
SBUTTERFLY dq, 3, 7, 1 ; q2/q3 |
%ifdef m8 |
SWAP 1, 8 |
SWAP 2, 8 |
%else |
mova m1, m_q0backup |
mova m_q0backup, m2 ; store q0 |
%endif |
SBUTTERFLY dq, 1, 5, 2 ; p1/p0 |
%ifdef m12 |
SWAP 5, 12 |
%else |
mova m_p0backup, m5 ; store p0 |
%endif |
SWAP 1, 4 |
SWAP 2, 4 |
SWAP 6, 3 |
SWAP 5, 3 |
%endif |
; normal_limit for p3-p2, p2-p1, q3-q2 and q2-q1 |
mova m4, m1 |
SWAP 4, 1 |
psubusb m4, m0 ; p2-p3 |
psubusb m0, m1 ; p3-p2 |
por m0, m4 ; abs(p3-p2) |
mova m4, m2 |
SWAP 4, 2 |
psubusb m4, m1 ; p1-p2 |
mova m_p2backup, m1 |
psubusb m1, m2 ; p2-p1 |
por m1, m4 ; abs(p2-p1) |
mova m4, m6 |
SWAP 4, 6 |
psubusb m4, m7 ; q2-q3 |
psubusb m7, m6 ; q3-q2 |
por m7, m4 ; abs(q3-q2) |
mova m4, m5 |
SWAP 4, 5 |
psubusb m4, m6 ; q1-q2 |
mova m_q2backup, m6 |
psubusb m6, m5 ; q2-q1 |
por m6, m4 ; abs(q2-q1) |
%if notcpuflag(mmxext) |
mova m4, m_flimI |
pxor m3, m3 |
psubusb m0, m4 |
psubusb m1, m4 |
psubusb m7, m4 |
psubusb m6, m4 |
pcmpeqb m0, m3 ; abs(p3-p2) <= I |
pcmpeqb m1, m3 ; abs(p2-p1) <= I |
pcmpeqb m7, m3 ; abs(q3-q2) <= I |
pcmpeqb m6, m3 ; abs(q2-q1) <= I |
pand m0, m1 |
pand m7, m6 |
pand m0, m7 |
%else ; mmxext/sse2 |
pmaxub m0, m1 |
pmaxub m6, m7 |
pmaxub m0, m6 |
%endif |
; normal_limit and high_edge_variance for p1-p0, q1-q0 |
SWAP 7, 3 ; now m7 is zero |
%ifidn %1, v |
movrow m3, [dst1q+mstrideq ] ; p0 |
%if mmsize == 16 && %2 == 8 |
movhps m3, [dst8q+mstrideq ] |
%endif |
%elifdef m12 |
SWAP 3, 12 |
%else |
mova m3, m_p0backup |
%endif |
mova m1, m2 |
SWAP 1, 2 |
mova m6, m3 |
SWAP 3, 6 |
psubusb m1, m3 ; p1-p0 |
psubusb m6, m2 ; p0-p1 |
por m1, m6 ; abs(p1-p0) |
%if notcpuflag(mmxext) |
mova m6, m1 |
psubusb m1, m4 |
psubusb m6, m_hevthr |
pcmpeqb m1, m7 ; abs(p1-p0) <= I |
pcmpeqb m6, m7 ; abs(p1-p0) <= hev_thresh |
pand m0, m1 |
mova m_maskres, m6 |
%else ; mmxext/sse2 |
pmaxub m0, m1 ; max_I |
SWAP 1, 4 ; max_hev_thresh |
%endif |
SWAP 6, 4 ; now m6 is I |
%ifidn %1, v |
movrow m4, [dst1q] ; q0 |
%if mmsize == 16 && %2 == 8 |
movhps m4, [dst8q] |
%endif |
%elifdef m8 |
SWAP 4, 8 |
%else |
mova m4, m_q0backup |
%endif |
mova m1, m4 |
SWAP 1, 4 |
mova m7, m5 |
SWAP 7, 5 |
psubusb m1, m5 ; q0-q1 |
psubusb m7, m4 ; q1-q0 |
por m1, m7 ; abs(q1-q0) |
%if notcpuflag(mmxext) |
mova m7, m1 |
psubusb m1, m6 |
psubusb m7, m_hevthr |
pxor m6, m6 |
pcmpeqb m1, m6 ; abs(q1-q0) <= I |
pcmpeqb m7, m6 ; abs(q1-q0) <= hev_thresh |
mova m6, m_maskres |
pand m0, m1 ; abs([pq][321]-[pq][210]) <= I |
pand m6, m7 |
%else ; mmxext/sse2 |
pxor m7, m7 |
pmaxub m0, m1 |
pmaxub m6, m1 |
psubusb m0, m_flimI |
psubusb m6, m_hevthr |
pcmpeqb m0, m7 ; max(abs(..)) <= I |
pcmpeqb m6, m7 ; !(max(abs..) > thresh) |
%endif |
%ifdef m12 |
SWAP 6, 12 |
%else |
mova m_maskres, m6 ; !(abs(p1-p0) > hev_t || abs(q1-q0) > hev_t) |
%endif |
; simple_limit |
mova m1, m3 |
SWAP 1, 3 |
mova m6, m4 ; keep copies of p0/q0 around for later use |
SWAP 6, 4 |
psubusb m1, m4 ; p0-q0 |
psubusb m6, m3 ; q0-p0 |
por m1, m6 ; abs(q0-p0) |
paddusb m1, m1 ; m1=2*abs(q0-p0) |
mova m7, m2 |
SWAP 7, 2 |
mova m6, m5 |
SWAP 6, 5 |
psubusb m7, m5 ; p1-q1 |
psubusb m6, m2 ; q1-p1 |
por m7, m6 ; abs(q1-p1) |
pxor m6, m6 |
pand m7, [pb_FE] |
psrlq m7, 1 ; abs(q1-p1)/2 |
paddusb m7, m1 ; abs(q0-p0)*2+abs(q1-p1)/2 |
psubusb m7, m_flimE |
pcmpeqb m7, m6 ; abs(q0-p0)*2+abs(q1-p1)/2 <= E |
pand m0, m7 ; normal_limit result |
; filter_common; at this point, m2-m5=p1-q1 and m0 is filter_mask |
%ifdef m8 ; x86-64 && sse2 |
mova m8, [pb_80] |
%define m_pb_80 m8 |
%else ; x86-32 or mmx/mmxext |
%define m_pb_80 [pb_80] |
%endif |
mova m1, m4 |
mova m7, m3 |
pxor m1, m_pb_80 |
pxor m7, m_pb_80 |
psubsb m1, m7 ; (signed) q0-p0 |
mova m6, m2 |
mova m7, m5 |
pxor m6, m_pb_80 |
pxor m7, m_pb_80 |
psubsb m6, m7 ; (signed) p1-q1 |
mova m7, m_maskres |
paddsb m6, m1 |
paddsb m6, m1 |
paddsb m6, m1 |
pand m6, m0 |
%ifdef m8 |
mova m_limres, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_mbedge
pand m_limres, m7 |
%else |
mova m0, m6 |
pand m0, m7 |
mova m_limres, m0 |
%endif |
pandn m7, m6 ; 3*(q0-p0)+(p1-q1) masked for filter_common |
mova m1, [pb_F8] |
mova m6, m7 |
paddsb m7, [pb_3] |
paddsb m6, [pb_4] |
pand m7, m1 |
pand m6, m1 |
pxor m1, m1 |
pxor m0, m0 |
pcmpgtb m1, m7 |
psubb m0, m7 |
psrlq m7, 3 ; +f2 |
psrlq m0, 3 ; -f2 |
pand m0, m1 |
pandn m1, m7 |
psubusb m3, m0 |
paddusb m3, m1 ; p0+f2 |
pxor m1, m1 |
pxor m0, m0 |
pcmpgtb m0, m6 |
psubb m1, m6 |
psrlq m6, 3 ; +f1 |
psrlq m1, 3 ; -f1 |
pand m1, m0 |
pandn m0, m6 |
psubusb m4, m0 |
paddusb m4, m1 ; q0-f1 |
; filter_mbedge (m2-m5 = p1-q1; lim_res carries w) |
%if cpuflag(ssse3) |
mova m7, [pb_1] |
%else |
mova m7, [pw_63] |
%endif |
%ifdef m8 |
SWAP 1, 8 |
%else |
mova m1, m_limres |
%endif |
pxor m0, m0 |
mova m6, m1 |
pcmpgtb m0, m1 ; which are negative |
%if cpuflag(ssse3) |
punpcklbw m6, m7 ; interleave with "1" for rounding |
punpckhbw m1, m7 |
%else |
punpcklbw m6, m0 ; signed byte->word |
punpckhbw m1, m0 |
%endif |
mova m_limsign, m0 |
%if cpuflag(ssse3) |
mova m7, [pb_27_63] |
%ifndef m8 |
mova m_limres, m1 |
%endif |
%ifdef m10 |
SWAP 0, 10 ; don't lose lim_sign copy |
%endif |
mova m0, m7 |
pmaddubsw m7, m6 |
SWAP 6, 7 |
pmaddubsw m0, m1 |
SWAP 1, 0 |
%ifdef m10 |
SWAP 0, 10 |
%else |
mova m0, m_limsign |
%endif |
%else |
mova m_maskres, m6 ; backup for later in filter |
mova m_limres, m1 |
pmullw m6, [pw_27] |
pmullw m1, [pw_27] |
paddw m6, m7 |
paddw m1, m7 |
%endif |
psraw m6, 7 |
psraw m1, 7 |
packsswb m6, m1 ; a0 |
pxor m1, m1 |
psubb m1, m6 |
pand m1, m0 ; -a0 |
pandn m0, m6 ; +a0 |
%if cpuflag(ssse3) |
mova m6, [pb_18_63] ; pipelining |
%endif |
psubusb m3, m1 |
paddusb m4, m1 |
paddusb m3, m0 ; p0+a0 |
psubusb m4, m0 ; q0-a0 |
%if cpuflag(ssse3) |
SWAP 6, 7 |
%ifdef m10 |
SWAP 1, 10 |
%else |
mova m1, m_limres |
%endif |
mova m0, m7 |
pmaddubsw m7, m6 |
SWAP 6, 7 |
pmaddubsw m0, m1 |
SWAP 1, 0 |
%ifdef m10 |
SWAP 0, 10 |
%endif |
mova m0, m_limsign |
%else |
mova m6, m_maskres |
mova m1, m_limres |
pmullw m6, [pw_18] |
pmullw m1, [pw_18] |
paddw m6, m7 |
paddw m1, m7 |
%endif |
mova m0, m_limsign |
psraw m6, 7 |
psraw m1, 7 |
packsswb m6, m1 ; a1 |
pxor m1, m1 |
psubb m1, m6 |
pand m1, m0 ; -a1 |
pandn m0, m6 ; +a1 |
%if cpuflag(ssse3) |
mova m6, [pb_9_63] |
%endif |
psubusb m2, m1 |
paddusb m5, m1 |
paddusb m2, m0 ; p1+a1 |
psubusb m5, m0 ; q1-a1 |
%if cpuflag(ssse3) |
SWAP 6, 7 |
%ifdef m10 |
SWAP 1, 10 |
%else |
mova m1, m_limres |
%endif |
mova m0, m7 |
pmaddubsw m7, m6 |
SWAP 6, 7 |
pmaddubsw m0, m1 |
SWAP 1, 0 |
%else |
%ifdef m8 |
SWAP 6, 12 |
SWAP 1, 8 |
%else |
mova m6, m_maskres |
mova m1, m_limres |
%endif |
pmullw m6, [pw_9] |
pmullw m1, [pw_9] |
paddw m6, m7 |
paddw m1, m7 |
%endif |
%ifdef m9 |
SWAP 7, 9 |
%else |
mova m7, m_limsign |
%endif |
psraw m6, 7 |
psraw m1, 7 |
packsswb m6, m1 ; a2
pxor m0, m0
psubb m0, m6
pand m0, m7 ; -a2
pandn m7, m6 ; +a2
%ifdef m8 |
SWAP 1, 13 |
SWAP 6, 14 |
%else |
mova m1, m_p2backup |
mova m6, m_q2backup |
%endif |
psubusb m1, m0 |
paddusb m6, m0 |
paddusb m1, m7 ; p2+a2
psubusb m6, m7 ; q2-a2
; store |
%ifidn %1, v |
movrow [dst2q+mstrideq*4], m1 |
movrow [dst1q+mstrideq*2], m2 |
movrow [dst1q+mstrideq ], m3 |
movrow [dst1q], m4 |
movrow [dst2q], m5 |
movrow [dst2q+ strideq ], m6 |
%if mmsize == 16 && %2 == 8 |
add dst8q, mstrideq |
movhps [dst8q+mstrideq*2], m1 |
movhps [dst8q+mstrideq ], m2 |
movhps [dst8q], m3 |
add dst8q, strideq |
movhps [dst8q], m4 |
movhps [dst8q+ strideq ], m5 |
movhps [dst8q+ strideq*2], m6 |
%endif |
%else ; h |
inc dst1q |
inc dst2q |
; 4x8/16 transpose |
TRANSPOSE4x4B 1, 2, 3, 4, 0 |
SBUTTERFLY bw, 5, 6, 0 |
%if mmsize == 8 ; mmx/mmxext (h) |
WRITE_4x2D 1, 2, 3, 4, dst1q, dst2q, mstrideq, strideq |
add dst1q, 4 |
WRITE_2x4W m5, m6, dst2q, dst1q, mstrideq, strideq |
%else ; sse2 (h) |
lea dst8q, [dst8q+mstrideq+1] |
WRITE_4x4D 1, 2, 3, 4, dst1q, dst2q, dst8q, mstrideq, strideq, %2 |
lea dst1q, [dst2q+mstrideq+4] |
lea dst8q, [dst8q+mstrideq+4] |
%if cpuflag(sse4) |
add dst2q, 4 |
%endif |
WRITE_8W m5, dst2q, dst1q, mstrideq, strideq |
%if cpuflag(sse4) |
lea dst2q, [dst8q+ strideq ] |
%endif |
WRITE_8W m6, dst2q, dst8q, mstrideq, strideq |
%endif |
%endif |
%if mmsize == 8 |
%if %2 == 8 ; chroma |
%ifidn %1, h |
sub dst1q, 5 |
%endif |
cmp dst1q, dst8q |
mov dst1q, dst8q |
jnz .next8px |
%else |
%ifidn %1, h |
lea dst1q, [dst1q+ strideq*8-5] |
%else ; v |
add dst1q, 8 |
%endif |
dec cntrq |
jg .next8px |
%endif |
REP_RET |
%else ; mmsize == 16 |
RET |
%endif |
%endmacro |
%if ARCH_X86_32 |
INIT_MMX mmx |
MBEDGE_LOOPFILTER v, 16 |
MBEDGE_LOOPFILTER h, 16 |
MBEDGE_LOOPFILTER v, 8 |
MBEDGE_LOOPFILTER h, 8 |
INIT_MMX mmxext |
MBEDGE_LOOPFILTER v, 16 |
MBEDGE_LOOPFILTER h, 16 |
MBEDGE_LOOPFILTER v, 8 |
MBEDGE_LOOPFILTER h, 8 |
%endif |
INIT_XMM sse2 |
MBEDGE_LOOPFILTER v, 16 |
MBEDGE_LOOPFILTER h, 16 |
MBEDGE_LOOPFILTER v, 8 |
MBEDGE_LOOPFILTER h, 8 |
INIT_XMM ssse3 |
MBEDGE_LOOPFILTER v, 16 |
MBEDGE_LOOPFILTER h, 16 |
MBEDGE_LOOPFILTER v, 8 |
MBEDGE_LOOPFILTER h, 8 |
INIT_XMM sse4 |
MBEDGE_LOOPFILTER h, 16 |
MBEDGE_LOOPFILTER h, 8 |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vp8dsp_init.c |
---|
0,0 → 1,441 |
/* |
* VP8 DSP functions x86-optimized |
* Copyright (c) 2010 Ronald S. Bultje <rsbultje@gmail.com> |
* Copyright (c) 2010 Jason Garrett-Glaser <darkshikari@gmail.com> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/cpu.h" |
#include "libavutil/mem.h" |
#include "libavutil/x86/asm.h" |
#include "libavutil/x86/cpu.h" |
#include "libavcodec/vp8dsp.h" |
#if HAVE_YASM |
/* |
* MC functions |
*/ |
void ff_put_vp8_epel4_h4_mmxext(uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_epel4_h6_mmxext(uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_epel4_v4_mmxext(uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_epel4_v6_mmxext(uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_epel8_h4_sse2 (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_epel8_h6_sse2 (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_epel8_v4_sse2 (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_epel8_v6_sse2 (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_epel4_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_epel4_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_epel4_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_epel4_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_epel8_h4_ssse3 (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_epel8_h6_ssse3 (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_epel8_v4_ssse3 (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_epel8_v6_ssse3 (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_bilinear4_h_mmxext(uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_bilinear8_h_sse2 (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_bilinear4_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_bilinear8_h_ssse3 (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_bilinear4_v_mmxext(uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_bilinear8_v_sse2 (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_bilinear4_v_ssse3 (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_bilinear8_v_ssse3 (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_pixels8_mmx (uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_pixels16_mmx(uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
void ff_put_vp8_pixels16_sse(uint8_t *dst, ptrdiff_t dststride, |
uint8_t *src, ptrdiff_t srcstride, |
int height, int mx, int my); |
#define TAP_W16(OPT, FILTERTYPE, TAPTYPE) \ |
static void ff_put_vp8_ ## FILTERTYPE ## 16_ ## TAPTYPE ## _ ## OPT( \ |
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ |
ptrdiff_t srcstride, int height, int mx, int my) \ |
{ \ |
ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ |
dst, dststride, src, srcstride, height, mx, my); \ |
ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ |
dst + 8, dststride, src + 8, srcstride, height, mx, my); \ |
} |
#define TAP_W8(OPT, FILTERTYPE, TAPTYPE) \ |
static void ff_put_vp8_ ## FILTERTYPE ## 8_ ## TAPTYPE ## _ ## OPT( \ |
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ |
ptrdiff_t srcstride, int height, int mx, int my) \ |
{ \ |
ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \ |
dst, dststride, src, srcstride, height, mx, my); \ |
ff_put_vp8_ ## FILTERTYPE ## 4_ ## TAPTYPE ## _ ## OPT( \ |
dst + 4, dststride, src + 4, srcstride, height, mx, my); \ |
} |
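/* As an illustration (this is simply the literal macro expansion),
 * TAP_W16(sse2, epel, h6) below defines a 16px-wide wrapper that runs
 * the 8px SSE2 kernel once on each half of the block:
 *
 *   static void ff_put_vp8_epel16_h6_sse2(uint8_t *dst, ptrdiff_t dststride,
 *                                         uint8_t *src, ptrdiff_t srcstride,
 *                                         int height, int mx, int my)
 *   {
 *       ff_put_vp8_epel8_h6_sse2(dst,     dststride, src,     srcstride,
 *                                height, mx, my);
 *       ff_put_vp8_epel8_h6_sse2(dst + 8, dststride, src + 8, srcstride,
 *                                height, mx, my);
 *   }
 */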
#if ARCH_X86_32 |
TAP_W8 (mmxext, epel, h4) |
TAP_W8 (mmxext, epel, h6) |
TAP_W16(mmxext, epel, h6) |
TAP_W8 (mmxext, epel, v4) |
TAP_W8 (mmxext, epel, v6) |
TAP_W16(mmxext, epel, v6) |
TAP_W8 (mmxext, bilinear, h) |
TAP_W16(mmxext, bilinear, h) |
TAP_W8 (mmxext, bilinear, v) |
TAP_W16(mmxext, bilinear, v) |
#endif |
TAP_W16(sse2, epel, h6) |
TAP_W16(sse2, epel, v6) |
TAP_W16(sse2, bilinear, h) |
TAP_W16(sse2, bilinear, v) |
TAP_W16(ssse3, epel, h6) |
TAP_W16(ssse3, epel, v6) |
TAP_W16(ssse3, bilinear, h) |
TAP_W16(ssse3, bilinear, v) |
#define HVTAP(OPT, ALIGN, TAPNUMX, TAPNUMY, SIZE, MAXHEIGHT) \ |
static void ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## v ## TAPNUMY ## _ ## OPT( \ |
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ |
ptrdiff_t srcstride, int height, int mx, int my) \ |
{ \ |
DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + TAPNUMY - 1)]; \ |
uint8_t *tmpptr = tmp + SIZE * (TAPNUMY / 2 - 1); \ |
src -= srcstride * (TAPNUMY / 2 - 1); \ |
ff_put_vp8_epel ## SIZE ## _h ## TAPNUMX ## _ ## OPT( \ |
tmp, SIZE, src, srcstride, height + TAPNUMY - 1, mx, my); \ |
ff_put_vp8_epel ## SIZE ## _v ## TAPNUMY ## _ ## OPT( \ |
dst, dststride, tmpptr, SIZE, height, mx, my); \ |
} |
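/* The h-then-v composition above first runs the horizontal filter over
 * height + TAPNUMY - 1 rows into an aligned on-stack temp buffer (the
 * extra rows supply the vertical filter's taps above and below the
 * block), then runs the vertical filter from tmpptr, which points
 * TAPNUMY / 2 - 1 rows into that buffer, out to the real destination. */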
#if ARCH_X86_32 |
#define HVTAPMMX(x, y) \ |
HVTAP(mmxext, 8, x, y, 4, 8) \ |
HVTAP(mmxext, 8, x, y, 8, 16) |
HVTAP(mmxext, 8, 6, 6, 16, 16) |
#else |
#define HVTAPMMX(x, y) \ |
HVTAP(mmxext, 8, x, y, 4, 8) |
#endif |
HVTAPMMX(4, 4) |
HVTAPMMX(4, 6) |
HVTAPMMX(6, 4) |
HVTAPMMX(6, 6) |
#define HVTAPSSE2(x, y, w) \ |
HVTAP(sse2, 16, x, y, w, 16) \ |
HVTAP(ssse3, 16, x, y, w, 16) |
HVTAPSSE2(4, 4, 8) |
HVTAPSSE2(4, 6, 8) |
HVTAPSSE2(6, 4, 8) |
HVTAPSSE2(6, 6, 8) |
HVTAPSSE2(6, 6, 16) |
HVTAP(ssse3, 16, 4, 4, 4, 8) |
HVTAP(ssse3, 16, 4, 6, 4, 8) |
HVTAP(ssse3, 16, 6, 4, 4, 8) |
HVTAP(ssse3, 16, 6, 6, 4, 8) |
#define HVBILIN(OPT, ALIGN, SIZE, MAXHEIGHT) \ |
static void ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT( \ |
uint8_t *dst, ptrdiff_t dststride, uint8_t *src, \ |
ptrdiff_t srcstride, int height, int mx, int my) \ |
{ \ |
DECLARE_ALIGNED(ALIGN, uint8_t, tmp)[SIZE * (MAXHEIGHT + 2)]; \ |
ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT( \ |
tmp, SIZE, src, srcstride, height + 1, mx, my); \ |
ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT( \ |
dst, dststride, tmp, SIZE, height, mx, my); \ |
} |
HVBILIN(mmxext, 8, 4, 8) |
#if ARCH_X86_32 |
HVBILIN(mmxext, 8, 8, 16) |
HVBILIN(mmxext, 8, 16, 16) |
#endif |
HVBILIN(sse2, 8, 8, 16) |
HVBILIN(sse2, 8, 16, 16) |
HVBILIN(ssse3, 8, 4, 8) |
HVBILIN(ssse3, 8, 8, 16) |
HVBILIN(ssse3, 8, 16, 16) |
void ff_vp8_idct_dc_add_mmx(uint8_t *dst, int16_t block[16], |
ptrdiff_t stride); |
void ff_vp8_idct_dc_add_sse4(uint8_t *dst, int16_t block[16], |
ptrdiff_t stride); |
void ff_vp8_idct_dc_add4y_mmx(uint8_t *dst, int16_t block[4][16], |
ptrdiff_t stride); |
void ff_vp8_idct_dc_add4y_sse2(uint8_t *dst, int16_t block[4][16], |
ptrdiff_t stride); |
void ff_vp8_idct_dc_add4uv_mmx(uint8_t *dst, int16_t block[2][16], |
ptrdiff_t stride); |
void ff_vp8_luma_dc_wht_mmx(int16_t block[4][4][16], int16_t dc[16]); |
void ff_vp8_luma_dc_wht_sse(int16_t block[4][4][16], int16_t dc[16]); |
void ff_vp8_idct_add_mmx(uint8_t *dst, int16_t block[16], ptrdiff_t stride); |
void ff_vp8_idct_add_sse(uint8_t *dst, int16_t block[16], ptrdiff_t stride); |
#define DECLARE_LOOP_FILTER(NAME) \ |
void ff_vp8_v_loop_filter_simple_ ## NAME(uint8_t *dst, \ |
ptrdiff_t stride, \ |
int flim); \ |
void ff_vp8_h_loop_filter_simple_ ## NAME(uint8_t *dst, \ |
ptrdiff_t stride, \ |
int flim); \ |
void ff_vp8_v_loop_filter16y_inner_ ## NAME (uint8_t *dst, \ |
ptrdiff_t stride, \ |
int e, int i, int hvt); \ |
void ff_vp8_h_loop_filter16y_inner_ ## NAME (uint8_t *dst, \ |
ptrdiff_t stride, \ |
int e, int i, int hvt); \ |
void ff_vp8_v_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \ |
uint8_t *dstV, \ |
ptrdiff_t s, \ |
int e, int i, int hvt); \ |
void ff_vp8_h_loop_filter8uv_inner_ ## NAME (uint8_t *dstU, \ |
uint8_t *dstV, \ |
ptrdiff_t s, \ |
int e, int i, int hvt); \ |
void ff_vp8_v_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \ |
ptrdiff_t stride, \ |
int e, int i, int hvt); \ |
void ff_vp8_h_loop_filter16y_mbedge_ ## NAME(uint8_t *dst, \ |
ptrdiff_t stride, \ |
int e, int i, int hvt); \ |
void ff_vp8_v_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \ |
uint8_t *dstV, \ |
ptrdiff_t s, \ |
int e, int i, int hvt); \ |
void ff_vp8_h_loop_filter8uv_mbedge_ ## NAME(uint8_t *dstU, \ |
uint8_t *dstV, \ |
ptrdiff_t s, \ |
int e, int i, int hvt); |
DECLARE_LOOP_FILTER(mmx) |
DECLARE_LOOP_FILTER(mmxext) |
DECLARE_LOOP_FILTER(sse2) |
DECLARE_LOOP_FILTER(ssse3) |
DECLARE_LOOP_FILTER(sse4) |
#endif /* HAVE_YASM */ |
#define VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) \ |
c->put_vp8_epel_pixels_tab[IDX][0][2] = ff_put_vp8_epel ## SIZE ## _h6_ ## OPT; \ |
c->put_vp8_epel_pixels_tab[IDX][2][0] = ff_put_vp8_epel ## SIZE ## _v6_ ## OPT; \ |
c->put_vp8_epel_pixels_tab[IDX][2][2] = ff_put_vp8_epel ## SIZE ## _h6v6_ ## OPT |
#define VP8_MC_FUNC(IDX, SIZE, OPT) \ |
c->put_vp8_epel_pixels_tab[IDX][0][1] = ff_put_vp8_epel ## SIZE ## _h4_ ## OPT; \ |
c->put_vp8_epel_pixels_tab[IDX][1][0] = ff_put_vp8_epel ## SIZE ## _v4_ ## OPT; \ |
c->put_vp8_epel_pixels_tab[IDX][1][1] = ff_put_vp8_epel ## SIZE ## _h4v4_ ## OPT; \ |
c->put_vp8_epel_pixels_tab[IDX][1][2] = ff_put_vp8_epel ## SIZE ## _h6v4_ ## OPT; \ |
c->put_vp8_epel_pixels_tab[IDX][2][1] = ff_put_vp8_epel ## SIZE ## _h4v6_ ## OPT; \ |
VP8_LUMA_MC_FUNC(IDX, SIZE, OPT) |
#define VP8_BILINEAR_MC_FUNC(IDX, SIZE, OPT) \ |
c->put_vp8_bilinear_pixels_tab[IDX][0][1] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \ |
c->put_vp8_bilinear_pixels_tab[IDX][0][2] = ff_put_vp8_bilinear ## SIZE ## _h_ ## OPT; \ |
c->put_vp8_bilinear_pixels_tab[IDX][1][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \ |
c->put_vp8_bilinear_pixels_tab[IDX][1][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \ |
c->put_vp8_bilinear_pixels_tab[IDX][1][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \ |
c->put_vp8_bilinear_pixels_tab[IDX][2][0] = ff_put_vp8_bilinear ## SIZE ## _v_ ## OPT; \ |
c->put_vp8_bilinear_pixels_tab[IDX][2][1] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT; \ |
c->put_vp8_bilinear_pixels_tab[IDX][2][2] = ff_put_vp8_bilinear ## SIZE ## _hv_ ## OPT |
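/* As used here, the MC tables are indexed [size][vfilter][hfilter]: the
 * first index selects block width (0 = 16px, 1 = 8px, 2 = 4px), and the
 * second/third select the per-axis filter (0 = copy, 1 = 4-tap,
 * 2 = 6-tap for the epel table; both subpel slots map to the same
 * bilinear kernel in the bilinear table). */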
av_cold void ff_vp8dsp_init_x86(VP8DSPContext *c)
{ |
#if HAVE_YASM |
int cpu_flags = av_get_cpu_flags(); |
if (EXTERNAL_MMX(cpu_flags)) { |
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_mmx; |
c->vp8_idct_dc_add4uv = ff_vp8_idct_dc_add4uv_mmx; |
#if ARCH_X86_32 |
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_mmx; |
c->vp8_idct_add = ff_vp8_idct_add_mmx; |
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_mmx; |
c->put_vp8_epel_pixels_tab[0][0][0] = |
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_mmx; |
#endif |
c->put_vp8_epel_pixels_tab[1][0][0] = |
c->put_vp8_bilinear_pixels_tab[1][0][0] = ff_put_vp8_pixels8_mmx; |
#if ARCH_X86_32 |
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmx; |
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmx; |
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmx; |
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmx; |
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmx; |
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmx; |
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmx; |
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmx; |
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmx; |
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmx; |
#endif |
} |
/* note that 4-tap width=16 functions are missing because w=16 |
* is only used for luma, and luma is always a copy or sixtap. */ |
if (EXTERNAL_MMXEXT(cpu_flags)) { |
VP8_MC_FUNC(2, 4, mmxext); |
VP8_BILINEAR_MC_FUNC(2, 4, mmxext); |
#if ARCH_X86_32 |
VP8_LUMA_MC_FUNC(0, 16, mmxext); |
VP8_MC_FUNC(1, 8, mmxext); |
VP8_BILINEAR_MC_FUNC(0, 16, mmxext); |
VP8_BILINEAR_MC_FUNC(1, 8, mmxext); |
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_mmxext; |
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_mmxext; |
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_mmxext; |
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_mmxext; |
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_mmxext; |
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_mmxext; |
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_mmxext; |
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_mmxext; |
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_mmxext; |
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_mmxext; |
#endif |
} |
if (EXTERNAL_SSE(cpu_flags)) { |
c->vp8_idct_add = ff_vp8_idct_add_sse; |
c->vp8_luma_dc_wht = ff_vp8_luma_dc_wht_sse; |
c->put_vp8_epel_pixels_tab[0][0][0] = |
c->put_vp8_bilinear_pixels_tab[0][0][0] = ff_put_vp8_pixels16_sse; |
} |
if (HAVE_SSE2_EXTERNAL && cpu_flags & (AV_CPU_FLAG_SSE2 | AV_CPU_FLAG_SSE2SLOW)) { |
VP8_LUMA_MC_FUNC(0, 16, sse2); |
VP8_MC_FUNC(1, 8, sse2); |
VP8_BILINEAR_MC_FUNC(0, 16, sse2); |
VP8_BILINEAR_MC_FUNC(1, 8, sse2); |
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_sse2; |
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_sse2; |
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_sse2; |
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_sse2; |
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_sse2; |
} |
if (EXTERNAL_SSE2(cpu_flags)) { |
c->vp8_idct_dc_add4y = ff_vp8_idct_dc_add4y_sse2; |
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse2; |
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_sse2; |
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_sse2; |
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse2; |
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse2; |
} |
if (EXTERNAL_SSSE3(cpu_flags)) { |
VP8_LUMA_MC_FUNC(0, 16, ssse3); |
VP8_MC_FUNC(1, 8, ssse3); |
VP8_MC_FUNC(2, 4, ssse3); |
VP8_BILINEAR_MC_FUNC(0, 16, ssse3); |
VP8_BILINEAR_MC_FUNC(1, 8, ssse3); |
VP8_BILINEAR_MC_FUNC(2, 4, ssse3); |
c->vp8_v_loop_filter_simple = ff_vp8_v_loop_filter_simple_ssse3; |
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_ssse3; |
c->vp8_v_loop_filter16y_inner = ff_vp8_v_loop_filter16y_inner_ssse3; |
c->vp8_h_loop_filter16y_inner = ff_vp8_h_loop_filter16y_inner_ssse3; |
c->vp8_v_loop_filter8uv_inner = ff_vp8_v_loop_filter8uv_inner_ssse3; |
c->vp8_h_loop_filter8uv_inner = ff_vp8_h_loop_filter8uv_inner_ssse3; |
c->vp8_v_loop_filter16y = ff_vp8_v_loop_filter16y_mbedge_ssse3; |
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_ssse3; |
c->vp8_v_loop_filter8uv = ff_vp8_v_loop_filter8uv_mbedge_ssse3; |
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_ssse3; |
} |
if (EXTERNAL_SSE4(cpu_flags)) { |
c->vp8_idct_dc_add = ff_vp8_idct_dc_add_sse4; |
c->vp8_h_loop_filter_simple = ff_vp8_h_loop_filter_simple_sse4; |
c->vp8_h_loop_filter16y = ff_vp8_h_loop_filter16y_mbedge_sse4; |
c->vp8_h_loop_filter8uv = ff_vp8_h_loop_filter8uv_mbedge_sse4; |
} |
#endif /* HAVE_YASM */ |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vp9dsp.asm |
---|
0,0 → 1,278 |
;****************************************************************************** |
;* VP9 SIMD optimizations |
;* |
;* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA |
; FIXME share with vp8dsp.asm |
pw_256: times 8 dw 256 |
%macro F8_TAPS 8 |
times 8 db %1, %2 |
times 8 db %3, %4 |
times 8 db %5, %6 |
times 8 db %7, %8 |
%endmacro |
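; Each F8_TAPS row stores the eight signed filter taps as four 16-byte
; lanes of splatted (even, odd) tap pairs -- (t0,t1), (t2,t3), (t4,t5),
; (t6,t7) -- so the filter loops below can pmaddubsw each tap pair
; against pixel pairs interleaved with punpcklbw, accumulating two taps
; per instruction.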
; int8_t ff_filters_ssse3[3][15][4][16] |
const filters_ssse3 ; smooth |
F8_TAPS -3, -1, 32, 64, 38, 1, -3, 0 |
F8_TAPS -2, -2, 29, 63, 41, 2, -3, 0 |
F8_TAPS -2, -2, 26, 63, 43, 4, -4, 0 |
F8_TAPS -2, -3, 24, 62, 46, 5, -4, 0 |
F8_TAPS -2, -3, 21, 60, 49, 7, -4, 0 |
F8_TAPS -1, -4, 18, 59, 51, 9, -4, 0 |
F8_TAPS -1, -4, 16, 57, 53, 12, -4, -1 |
F8_TAPS -1, -4, 14, 55, 55, 14, -4, -1 |
F8_TAPS -1, -4, 12, 53, 57, 16, -4, -1 |
F8_TAPS 0, -4, 9, 51, 59, 18, -4, -1 |
F8_TAPS 0, -4, 7, 49, 60, 21, -3, -2 |
F8_TAPS 0, -4, 5, 46, 62, 24, -3, -2 |
F8_TAPS 0, -4, 4, 43, 63, 26, -2, -2 |
F8_TAPS 0, -3, 2, 41, 63, 29, -2, -2 |
F8_TAPS 0, -3, 1, 38, 64, 32, -1, -3 |
; regular |
F8_TAPS 0, 1, -5, 126, 8, -3, 1, 0 |
F8_TAPS -1, 3, -10, 122, 18, -6, 2, 0 |
F8_TAPS -1, 4, -13, 118, 27, -9, 3, -1 |
F8_TAPS -1, 4, -16, 112, 37, -11, 4, -1 |
F8_TAPS -1, 5, -18, 105, 48, -14, 4, -1 |
F8_TAPS -1, 5, -19, 97, 58, -16, 5, -1 |
F8_TAPS -1, 6, -19, 88, 68, -18, 5, -1 |
F8_TAPS -1, 6, -19, 78, 78, -19, 6, -1 |
F8_TAPS -1, 5, -18, 68, 88, -19, 6, -1 |
F8_TAPS -1, 5, -16, 58, 97, -19, 5, -1 |
F8_TAPS -1, 4, -14, 48, 105, -18, 5, -1 |
F8_TAPS -1, 4, -11, 37, 112, -16, 4, -1 |
F8_TAPS -1, 3, -9, 27, 118, -13, 4, -1 |
F8_TAPS 0, 2, -6, 18, 122, -10, 3, -1 |
F8_TAPS 0, 1, -3, 8, 126, -5, 1, 0 |
; sharp |
F8_TAPS -1, 3, -7, 127, 8, -3, 1, 0 |
F8_TAPS -2, 5, -13, 125, 17, -6, 3, -1 |
F8_TAPS -3, 7, -17, 121, 27, -10, 5, -2 |
F8_TAPS -4, 9, -20, 115, 37, -13, 6, -2 |
F8_TAPS -4, 10, -23, 108, 48, -16, 8, -3 |
F8_TAPS -4, 10, -24, 100, 59, -19, 9, -3 |
F8_TAPS -4, 11, -24, 90, 70, -21, 10, -4 |
F8_TAPS -4, 11, -23, 80, 80, -23, 11, -4 |
F8_TAPS -4, 10, -21, 70, 90, -24, 11, -4 |
F8_TAPS -3, 9, -19, 59, 100, -24, 10, -4 |
F8_TAPS -3, 8, -16, 48, 108, -23, 10, -4 |
F8_TAPS -2, 6, -13, 37, 115, -20, 9, -4 |
F8_TAPS -2, 5, -10, 27, 121, -17, 7, -3 |
F8_TAPS -1, 3, -6, 17, 125, -13, 5, -2 |
F8_TAPS 0, 1, -3, 8, 127, -7, 3, -1 |
SECTION .text |
%macro filter_h_fn 1 |
%assign %%px mmsize/2 |
cglobal %1_8tap_1d_h_ %+ %%px, 6, 6, 11, dst, dstride, src, sstride, h, filtery |
mova m6, [pw_256] |
mova m7, [filteryq+ 0] |
%if ARCH_X86_64 && mmsize > 8 |
mova m8, [filteryq+16] |
mova m9, [filteryq+32] |
mova m10, [filteryq+48] |
%endif |
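; Per row: load the 8 horizontal neighbours of each output pixel, pair
; them with punpcklbw, multiply-accumulate each pair against the
; matching tap pair with pmaddubsw, sum the four partial products, then
; round-shift by 7 via pmulhrsw with pw_256 ((x * 256 + 0x4000) >> 15,
; i.e. (x + 64) >> 7).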
.loop: |
movh m0, [srcq-3] |
movh m1, [srcq-2] |
movh m2, [srcq-1] |
movh m3, [srcq+0] |
movh m4, [srcq+1] |
movh m5, [srcq+2] |
punpcklbw m0, m1 |
punpcklbw m2, m3 |
movh m1, [srcq+3] |
movh m3, [srcq+4] |
add srcq, sstrideq |
punpcklbw m4, m5 |
punpcklbw m1, m3 |
pmaddubsw m0, m7 |
%if ARCH_X86_64 && mmsize > 8 |
pmaddubsw m2, m8 |
pmaddubsw m4, m9 |
pmaddubsw m1, m10 |
%else |
pmaddubsw m2, [filteryq+16] |
pmaddubsw m4, [filteryq+32] |
pmaddubsw m1, [filteryq+48] |
%endif |
paddw m0, m2 |
paddw m4, m1 |
paddsw m0, m4 |
pmulhrsw m0, m6 |
%ifidn %1, avg |
movh m1, [dstq] |
%endif |
packuswb m0, m0 |
%ifidn %1, avg |
pavgb m0, m1 |
%endif |
movh [dstq], m0 |
add dstq, dstrideq |
dec hd |
jg .loop |
RET |
%endmacro |
INIT_MMX ssse3 |
filter_h_fn put |
filter_h_fn avg |
INIT_XMM ssse3 |
filter_h_fn put |
filter_h_fn avg |
%macro filter_v_fn 1 |
%assign %%px mmsize/2 |
%if ARCH_X86_64 |
cglobal %1_8tap_1d_v_ %+ %%px, 6, 8, 11, dst, dstride, src, sstride, h, filtery, src4, sstride3 |
%else |
cglobal %1_8tap_1d_v_ %+ %%px, 4, 7, 11, dst, dstride, src, sstride, filtery, src4, sstride3 |
mov filteryq, r5mp |
%define hd r4mp |
%endif |
sub srcq, sstrideq |
lea sstride3q, [sstrideq*3] |
sub srcq, sstrideq |
mova m6, [pw_256] |
sub srcq, sstrideq |
mova m7, [filteryq+ 0] |
lea src4q, [srcq+sstrideq*4] |
%if ARCH_X86_64 && mmsize > 8 |
mova m8, [filteryq+16] |
mova m9, [filteryq+32] |
mova m10, [filteryq+48] |
%endif |
.loop: |
; FIXME maybe reuse loads from previous rows, or just |
; more generally unroll this to prevent multiple loads of |
; the same data? |
movh m0, [srcq] |
movh m1, [srcq+sstrideq] |
movh m2, [srcq+sstrideq*2] |
movh m3, [srcq+sstride3q] |
movh m4, [src4q] |
movh m5, [src4q+sstrideq] |
punpcklbw m0, m1 |
punpcklbw m2, m3 |
movh m1, [src4q+sstrideq*2] |
movh m3, [src4q+sstride3q] |
add srcq, sstrideq |
add src4q, sstrideq |
punpcklbw m4, m5 |
punpcklbw m1, m3 |
pmaddubsw m0, m7 |
%if ARCH_X86_64 && mmsize > 8 |
pmaddubsw m2, m8 |
pmaddubsw m4, m9 |
pmaddubsw m1, m10 |
%else |
pmaddubsw m2, [filteryq+16] |
pmaddubsw m4, [filteryq+32] |
pmaddubsw m1, [filteryq+48] |
%endif |
paddw m0, m2 |
paddw m4, m1 |
paddsw m0, m4 |
pmulhrsw m0, m6 |
%ifidn %1, avg |
movh m1, [dstq] |
%endif |
packuswb m0, m0 |
%ifidn %1, avg |
pavgb m0, m1 |
%endif |
movh [dstq], m0 |
add dstq, dstrideq |
dec hd |
jg .loop |
RET |
%endmacro |
INIT_MMX ssse3 |
filter_v_fn put |
filter_v_fn avg |
INIT_XMM ssse3 |
filter_v_fn put |
filter_v_fn avg |
%macro fpel_fn 6 |
%if %2 == 4 |
%define %%srcfn movh |
%define %%dstfn movh |
%else |
%define %%srcfn movu |
%define %%dstfn mova |
%endif |
%if %2 <= 16 |
cglobal %1%2, 5, 7, 4, dst, dstride, src, sstride, h, dstride3, sstride3 |
lea sstride3q, [sstrideq*3] |
lea dstride3q, [dstrideq*3] |
%else |
cglobal %1%2, 5, 5, 4, dst, dstride, src, sstride, h |
%endif |
.loop: |
%%srcfn m0, [srcq] |
%%srcfn m1, [srcq+s%3] |
%%srcfn m2, [srcq+s%4] |
%%srcfn m3, [srcq+s%5] |
lea srcq, [srcq+sstrideq*%6] |
%ifidn %1, avg |
pavgb m0, [dstq] |
pavgb m1, [dstq+d%3] |
pavgb m2, [dstq+d%4] |
pavgb m3, [dstq+d%5] |
%endif |
%%dstfn [dstq], m0 |
%%dstfn [dstq+d%3], m1 |
%%dstfn [dstq+d%4], m2 |
%%dstfn [dstq+d%5], m3 |
lea dstq, [dstq+dstrideq*%6] |
sub hd, %6 |
jnz .loop |
RET |
%endmacro |
%define d16 16 |
%define s16 16 |
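; The s/d prefixes pasted onto %3..%5 inside fpel_fn turn strideq into
; sstrideq/dstrideq (the source/dest strides). For the 32/64px rows the
; offsets are mmsize multiples instead; since mmsize is 16 under
; INIT_XMM, the pasted tokens become s16/d16, and these defines resolve
; them back to the literal 16 (which, as far as I can tell, is the only
; reason they exist).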
INIT_MMX mmx |
fpel_fn put, 4, strideq, strideq*2, stride3q, 4 |
fpel_fn put, 8, strideq, strideq*2, stride3q, 4 |
INIT_MMX sse |
fpel_fn avg, 4, strideq, strideq*2, stride3q, 4 |
fpel_fn avg, 8, strideq, strideq*2, stride3q, 4 |
INIT_XMM sse |
fpel_fn put, 16, strideq, strideq*2, stride3q, 4 |
fpel_fn put, 32, mmsize, strideq, strideq+mmsize, 2 |
fpel_fn put, 64, mmsize, mmsize*2, mmsize*3, 1 |
INIT_XMM sse2 |
fpel_fn avg, 16, strideq, strideq*2, stride3q, 4 |
fpel_fn avg, 32, mmsize, strideq, strideq+mmsize, 2 |
fpel_fn avg, 64, mmsize, mmsize*2, mmsize*3, 1 |
%undef s16 |
%undef d16 |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/vp9dsp_init.c |
---|
0,0 → 1,214 |
/* |
* VP9 SIMD optimizations |
* |
* Copyright (c) 2013 Ronald S. Bultje <rsbultje gmail com> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/cpu.h" |
#include "libavutil/mem.h" |
#include "libavutil/x86/asm.h" |
#include "libavcodec/vp9dsp.h" |
#if HAVE_YASM |
#define fpel_func(avg, sz, opt) \ |
void ff_##avg##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ |
const uint8_t *src, ptrdiff_t src_stride, \ |
int h, int mx, int my) |
fpel_func(put, 4, mmx); |
fpel_func(put, 8, mmx); |
fpel_func(put, 16, sse); |
fpel_func(put, 32, sse); |
fpel_func(put, 64, sse); |
fpel_func(avg, 4, sse); |
fpel_func(avg, 8, sse); |
fpel_func(avg, 16, sse2); |
fpel_func(avg, 32, sse2); |
fpel_func(avg, 64, sse2); |
#undef fpel_func |
#define mc_func(avg, sz, dir, opt) \ |
void ff_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ |
const uint8_t *src, ptrdiff_t src_stride, \ |
int h, const int8_t (*filter)[16]) |
#define mc_funcs(sz) \ |
mc_func(put, sz, h, ssse3); \ |
mc_func(avg, sz, h, ssse3); \ |
mc_func(put, sz, v, ssse3); \ |
mc_func(avg, sz, v, ssse3) |
mc_funcs(4); |
mc_funcs(8); |
#undef mc_funcs |
#undef mc_func |
#define mc_rep_func(avg, sz, hsz, dir, opt) \ |
static av_always_inline void \ |
ff_##avg##_8tap_1d_##dir##_##sz##_##opt(uint8_t *dst, ptrdiff_t dst_stride, \ |
const uint8_t *src, ptrdiff_t src_stride, \ |
int h, const int8_t (*filter)[16]) \ |
{ \ |
ff_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst, dst_stride, src, \ |
src_stride, h, filter); \ |
ff_##avg##_8tap_1d_##dir##_##hsz##_##opt(dst + hsz, dst_stride, src + hsz, \ |
src_stride, h, filter); \ |
} |
#define mc_rep_funcs(sz, hsz) \ |
mc_rep_func(put, sz, hsz, h, ssse3); \ |
mc_rep_func(avg, sz, hsz, h, ssse3); \ |
mc_rep_func(put, sz, hsz, v, ssse3); \ |
mc_rep_func(avg, sz, hsz, v, ssse3) |
mc_rep_funcs(16, 8); |
mc_rep_funcs(32, 16); |
mc_rep_funcs(64, 32); |
#undef mc_rep_funcs |
#undef mc_rep_func |
extern const int8_t ff_filters_ssse3[3][15][4][16]; |
#define filter_8tap_2d_fn(op, sz, f, fname) \ |
static void op##_8tap_##fname##_##sz##hv_ssse3(uint8_t *dst, ptrdiff_t dst_stride, \ |
const uint8_t *src, ptrdiff_t src_stride, \ |
int h, int mx, int my) \ |
{ \ |
LOCAL_ALIGNED_16(uint8_t, temp, [71 * 64]); \ |
ff_put_8tap_1d_h_##sz##_ssse3(temp, 64, src - 3 * src_stride, src_stride, \ |
h + 7, ff_filters_ssse3[f][mx - 1]); \ |
ff_##op##_8tap_1d_v_##sz##_ssse3(dst, dst_stride, temp + 3 * 64, 64, \ |
h, ff_filters_ssse3[f][my - 1]); \ |
} |
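/* The 2D wrapper above uses a 71 * 64 byte scratch block: the widest
 * block is 64px (stride 64) and the horizontal pass produces h + 7 rows
 * (at most 64 + 7 = 71), starting 3 rows above the block so that the
 * vertical pass, reading from temp + 3 * 64, has its taps centred. */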
#define filters_8tap_2d_fn(op, sz) \ |
filter_8tap_2d_fn(op, sz, FILTER_8TAP_REGULAR, regular) \ |
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SHARP, sharp) \ |
filter_8tap_2d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth) |
#define filters_8tap_2d_fn2(op) \ |
filters_8tap_2d_fn(op, 64) \ |
filters_8tap_2d_fn(op, 32) \ |
filters_8tap_2d_fn(op, 16) \ |
filters_8tap_2d_fn(op, 8) \ |
filters_8tap_2d_fn(op, 4) |
filters_8tap_2d_fn2(put) |
filters_8tap_2d_fn2(avg) |
#undef filters_8tap_2d_fn2 |
#undef filters_8tap_2d_fn |
#undef filter_8tap_2d_fn |
#define filter_8tap_1d_fn(op, sz, f, fname, dir, dvar) \ |
static void op##_8tap_##fname##_##sz##dir##_ssse3(uint8_t *dst, ptrdiff_t dst_stride, \ |
const uint8_t *src, ptrdiff_t src_stride, \ |
int h, int mx, int my) \ |
{ \ |
ff_##op##_8tap_1d_##dir##_##sz##_ssse3(dst, dst_stride, src, src_stride, \ |
h, ff_filters_ssse3[f][dvar - 1]); \ |
} |
#define filters_8tap_1d_fn(op, sz, dir, dvar) \ |
filter_8tap_1d_fn(op, sz, FILTER_8TAP_REGULAR, regular, dir, dvar) \ |
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SHARP, sharp, dir, dvar) \ |
filter_8tap_1d_fn(op, sz, FILTER_8TAP_SMOOTH, smooth, dir, dvar) |
#define filters_8tap_1d_fn2(op, sz) \ |
filters_8tap_1d_fn(op, sz, h, mx) \ |
filters_8tap_1d_fn(op, sz, v, my) |
#define filters_8tap_1d_fn3(op) \ |
filters_8tap_1d_fn2(op, 64) \ |
filters_8tap_1d_fn2(op, 32) \ |
filters_8tap_1d_fn2(op, 16) \ |
filters_8tap_1d_fn2(op, 8) \ |
filters_8tap_1d_fn2(op, 4) |
filters_8tap_1d_fn3(put) |
filters_8tap_1d_fn3(avg) |
#undef filters_8tap_1d_fn |
#undef filters_8tap_1d_fn2 |
#undef filters_8tap_1d_fn3 |
#undef filter_8tap_1d_fn |
#endif /* HAVE_YASM */ |
av_cold void ff_vp9dsp_init_x86(VP9DSPContext *dsp) |
{ |
#if HAVE_YASM |
int cpu_flags = av_get_cpu_flags(); |
#define init_fpel(idx1, idx2, sz, type, opt) \ |
dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][0][0] = \ |
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][0][0] = \ |
dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][0][0] = \ |
dsp->mc[idx1][FILTER_BILINEAR ][idx2][0][0] = ff_##type##sz##_##opt |
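/* All four filter types share one fullpel pointer: with mx == my == 0
 * no subpel filtering happens, so the copy kernel is filter-agnostic. */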
#define init_subpel1(idx1, idx2, idxh, idxv, sz, dir, type, opt) \ |
dsp->mc[idx1][FILTER_8TAP_SMOOTH ][idx2][idxh][idxv] = type##_8tap_smooth_##sz##dir##_##opt; \ |
dsp->mc[idx1][FILTER_8TAP_REGULAR][idx2][idxh][idxv] = type##_8tap_regular_##sz##dir##_##opt; \ |
dsp->mc[idx1][FILTER_8TAP_SHARP ][idx2][idxh][idxv] = type##_8tap_sharp_##sz##dir##_##opt |
#define init_subpel2(idx, idxh, idxv, dir, type, opt) \ |
init_subpel1(0, idx, idxh, idxv, 64, dir, type, opt); \ |
init_subpel1(1, idx, idxh, idxv, 32, dir, type, opt); \ |
init_subpel1(2, idx, idxh, idxv, 16, dir, type, opt); \ |
init_subpel1(3, idx, idxh, idxv, 8, dir, type, opt); \ |
init_subpel1(4, idx, idxh, idxv, 4, dir, type, opt) |
#define init_subpel3(idx, type, opt) \ |
init_subpel2(idx, 1, 1, hv, type, opt); \ |
init_subpel2(idx, 0, 1, v, type, opt); \ |
init_subpel2(idx, 1, 0, h, type, opt) |
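/* mc[] indexing as initialized here: [block size (0 = 64px .. 4 = 4px)]
 * [filter type][0 = put, 1 = avg][has mx subpel][has my subpel]. */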
if (cpu_flags & AV_CPU_FLAG_MMX) { |
init_fpel(4, 0, 4, put, mmx); |
init_fpel(3, 0, 8, put, mmx); |
} |
if (cpu_flags & AV_CPU_FLAG_SSE) { |
init_fpel(2, 0, 16, put, sse); |
init_fpel(1, 0, 32, put, sse); |
init_fpel(0, 0, 64, put, sse); |
init_fpel(4, 1, 4, avg, sse); |
init_fpel(3, 1, 8, avg, sse); |
} |
if (cpu_flags & AV_CPU_FLAG_SSE2) { |
init_fpel(2, 1, 16, avg, sse2); |
init_fpel(1, 1, 32, avg, sse2); |
init_fpel(0, 1, 64, avg, sse2); |
} |
if (cpu_flags & AV_CPU_FLAG_SSSE3) { |
init_subpel3(0, put, ssse3); |
init_subpel3(1, avg, ssse3); |
} |
#undef init_fpel |
#undef init_subpel1 |
#undef init_subpel2 |
#undef init_subpel3 |
#endif /* HAVE_YASM */ |
} |
/contrib/sdk/sources/ffmpeg/libavcodec/x86/w64xmmtest.c |
---|
0,0 → 1,86 |
/* |
* check XMM registers for clobbers on Win64 |
* Copyright (c) 2012 Ronald S. Bultje <rsbultje@gmail.com> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavcodec/avcodec.h" |
#include "libavutil/x86/w64xmmtest.h" |
wrap(avcodec_open2(AVCodecContext *avctx, |
AVCodec *codec, |
AVDictionary **options)) |
{ |
testxmmclobbers(avcodec_open2, avctx, codec, options); |
} |
wrap(avcodec_decode_audio4(AVCodecContext *avctx, |
AVFrame *frame, |
int *got_frame_ptr, |
AVPacket *avpkt)) |
{ |
testxmmclobbers(avcodec_decode_audio4, avctx, frame, |
got_frame_ptr, avpkt); |
} |
wrap(avcodec_decode_video2(AVCodecContext *avctx, |
AVFrame *picture, |
int *got_picture_ptr, |
AVPacket *avpkt)) |
{ |
testxmmclobbers(avcodec_decode_video2, avctx, picture, |
got_picture_ptr, avpkt); |
} |
wrap(avcodec_decode_subtitle2(AVCodecContext *avctx, |
AVSubtitle *sub, |
int *got_sub_ptr, |
AVPacket *avpkt)) |
{ |
testxmmclobbers(avcodec_decode_subtitle2, avctx, sub, |
got_sub_ptr, avpkt); |
} |
wrap(avcodec_encode_audio2(AVCodecContext *avctx, |
AVPacket *avpkt, |
const AVFrame *frame, |
int *got_packet_ptr)) |
{ |
testxmmclobbers(avcodec_encode_audio2, avctx, avpkt, frame, |
got_packet_ptr); |
} |
wrap(avcodec_encode_video(AVCodecContext *avctx, |
uint8_t *buf, int buf_size, |
const AVFrame *pict)) |
{ |
testxmmclobbers(avcodec_encode_video, avctx, buf, buf_size, pict); |
} |
wrap(avcodec_encode_subtitle(AVCodecContext *avctx, |
uint8_t *buf, int buf_size, |
const AVSubtitle *sub)) |
{ |
testxmmclobbers(avcodec_encode_subtitle, avctx, buf, buf_size, sub); |
} |
wrap(avcodec_encode_video2(AVCodecContext *avctx, AVPacket *avpkt, |
const AVFrame *frame, int *got_packet_ptr)) |
{ |
testxmmclobbers(avcodec_encode_video2, avctx, avpkt, frame, got_packet_ptr); |
} |