# ===========================================================================
# File: /contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswresample/x86/Makefile
# (new file, 9 lines)
# ===========================================================================
# Objects assembled from yasm/nasm sources.
# NOTE: the original dump lost the blank lines after each list, so the
# trailing backslash on the last entry spliced the next, unrelated
# assignment into this one; each list is now properly terminated.
YASM-OBJS += x86/audio_convert.o \
             x86/rematrix.o      \
             x86/resample.o

# C-side runtime dispatch / init objects.
OBJS += x86/audio_convert_init.o \
        x86/rematrix_init.o      \
        x86/resample_init.o

# Optional test object verifying XMM registers are not clobbered (win64 ABI).
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
;******************************************************************************
;* File: /contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswresample/x86/audio_convert.asm
;* (new file, 739 lines)
;******************************************************************************
;****************************************************************************** |
;* Copyright (c) 2012 Michael Niedermayer |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA 32 |
; 1/2^31 -- scale factor for int32 -> float conversion
flt2pm31: times 8 dd 4.6566129e-10 |
; 2^31 -- scale factor for float -> int32 conversion
flt2p31 : times 8 dd 2147483648.0 |
; 2^15 -- scale factor for float -> int16 conversion
flt2p15 : times 8 dd 32768.0 |
; pshufb mask: gathers the even-indexed words into the low half of the
; register and the odd-indexed words into the high half (2ch deinterleave)
word_unpack_shuf : db 0, 1, 4, 5, 8, 9,12,13, 2, 3, 6, 7,10,11,14,15 |
SECTION .text |
;------------------------------------------------------------------------------
; PACK_2CH  to, from, a/u, log2_out_size, log2_in_size, [kernel], [init]
; Interleaves two planar channels into one packed stream:
;   dst[2*i] = src[0][i], dst[2*i+1] = src[1][i]
; running the loaded vectors through the optional conversion kernel %6
; (set up once by %7, e.g. loading a scale constant).  dst and src are
; passed as uint8_t** tables; len is in samples per channel.
; The "a" (aligned) variant checks all three pointers and falls through to
; the "u" body when any of them is not mmsize-aligned.
;------------------------------------------------------------------------------
;to, from, a/u, log2_outsize, log_intsize, const |
%macro PACK_2CH 5-7 |
cglobal pack_2ch_%2_to_%1_%3, 3, 4, 6, dst, src, len, src2 |
mov src2q , [srcq+gprsize] |
mov srcq , [srcq] |
mov dstq , [dstq] |
%ifidn %3, a |
; aligned entry: divert to the unaligned body if any pointer is misaligned
test dstq, mmsize-1 |
jne pack_2ch_%2_to_%1_u_int %+ SUFFIX |
test srcq, mmsize-1 |
jne pack_2ch_%2_to_%1_u_int %+ SUFFIX |
test src2q, mmsize-1 |
jne pack_2ch_%2_to_%1_u_int %+ SUFFIX |
%else |
pack_2ch_%2_to_%1_u_int %+ SUFFIX: |
%endif |
; point each pointer one-past-the-end and count lenq up from -len to 0
lea srcq , [srcq + (1<<%5)*lenq] |
lea src2q, [src2q + (1<<%5)*lenq] |
lea dstq , [dstq + (2<<%4)*lenq] |
neg lenq |
%7 m0,m1,m2,m3,m4,m5 |
.next: |
%if %4 >= %5 |
; output sample at least as wide as input: interleave first, then convert
mov%3 m0, [ srcq +(1<<%5)*lenq] |
mova m1, m0 |
mov%3 m2, [ src2q+(1<<%5)*lenq] |
%if %5 == 1 |
punpcklwd m0, m2 |
punpckhwd m1, m2 |
%else |
punpckldq m0, m2 |
punpckhdq m1, m2 |
%endif |
%6 m0,m1,m2,m3,m4,m5 |
%else |
; output narrower than input (int32 -> int16): convert first, then interleave
mov%3 m0, [ srcq +(1<<%5)*lenq] |
mov%3 m1, [mmsize + srcq +(1<<%5)*lenq] |
mov%3 m2, [ src2q+(1<<%5)*lenq] |
mov%3 m3, [mmsize + src2q+(1<<%5)*lenq] |
%6 m0,m1,m2,m3,m4,m5 |
mova m2, m0 |
punpcklwd m0, m1 |
punpckhwd m2, m1 |
SWAP 1,2 |
%endif |
mov%3 [ dstq+(2<<%4)*lenq], m0 |
mov%3 [ mmsize + dstq+(2<<%4)*lenq], m1 |
%if %4 > %5 |
; widening conversion produced four output vectors this iteration
mov%3 [2*mmsize + dstq+(2<<%4)*lenq], m2 |
mov%3 [3*mmsize + dstq+(2<<%4)*lenq], m3 |
add lenq, 4*mmsize/(2<<%4) |
%else |
add lenq, 2*mmsize/(2<<%4) |
%endif |
jl .next |
REP_RET |
%endmacro |
;------------------------------------------------------------------------------
; UNPACK_2CH  to, from, a/u, log2_out_size, log2_in_size, [kernel], [init]
; Inverse of PACK_2CH: splits one packed 2-channel stream into two planar
; buffers (dst[0], dst[1]), applying the optional conversion kernel %6
; (init code %7).  For 16-bit input, an SSSE3 build uses a single pshufb
; with word_unpack_shuf; otherwise a 3-round punpck network performs the
; same word deinterleave.
;------------------------------------------------------------------------------
%macro UNPACK_2CH 5-7 |
cglobal unpack_2ch_%2_to_%1_%3, 3, 4, 7, dst, src, len, dst2 |
mov dst2q , [dstq+gprsize] |
mov srcq , [srcq] |
mov dstq , [dstq] |
%ifidn %3, a |
; aligned entry: divert to the unaligned body if any pointer is misaligned
test dstq, mmsize-1 |
jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX |
test srcq, mmsize-1 |
jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX |
test dst2q, mmsize-1 |
jne unpack_2ch_%2_to_%1_u_int %+ SUFFIX |
%else |
unpack_2ch_%2_to_%1_u_int %+ SUFFIX: |
%endif |
; point each pointer one-past-the-end and count lenq up from -len to 0
lea srcq , [srcq + (2<<%5)*lenq] |
lea dstq , [dstq + (1<<%4)*lenq] |
lea dst2q, [dst2q + (1<<%4)*lenq] |
neg lenq |
%7 m0,m1,m2,m3,m4,m5 |
mova m6, [word_unpack_shuf] |
.next: |
mov%3 m0, [ srcq +(2<<%5)*lenq] |
mov%3 m2, [ mmsize + srcq +(2<<%5)*lenq] |
%if %5 == 1 |
%ifidn SUFFIX, _ssse3 |
; SSSE3: one shuffle per register separates even/odd words directly
pshufb m0, m6 |
mova m1, m0 |
pshufb m2, m6 |
punpcklqdq m0,m2 |
punpckhqdq m1,m2 |
%else |
; generic word deinterleave via three rounds of punpck
mova m1, m0 |
punpcklwd m0,m2 |
punpckhwd m1,m2 |
mova m2, m0 |
punpcklwd m0,m1 |
punpckhwd m2,m1 |
mova m1, m0 |
punpcklwd m0,m2 |
punpckhwd m1,m2 |
%endif |
%else |
; 32-bit samples: shufps picks even dwords into m0, odd dwords into m1
mova m1, m0 |
shufps m0, m2, 10001000b |
shufps m1, m2, 11011101b |
%endif |
%if %4 < %5 |
; narrowing conversion needs four input vectors per iteration
mov%3 m2, [2*mmsize + srcq +(2<<%5)*lenq] |
mova m3, m2 |
mov%3 m4, [3*mmsize + srcq +(2<<%5)*lenq] |
shufps m2, m4, 10001000b |
shufps m3, m4, 11011101b |
SWAP 1,2 |
%endif |
%6 m0,m1,m2,m3,m4,m5 |
mov%3 [ dstq+(1<<%4)*lenq], m0 |
%if %4 > %5 |
mov%3 [ dst2q+(1<<%4)*lenq], m2 |
mov%3 [ mmsize + dstq+(1<<%4)*lenq], m1 |
mov%3 [ mmsize + dst2q+(1<<%4)*lenq], m3 |
add lenq, 2*mmsize/(1<<%4) |
%else |
mov%3 [ dst2q+(1<<%4)*lenq], m1 |
add lenq, mmsize/(1<<%4) |
%endif |
jl .next |
REP_RET |
%endmacro |
;------------------------------------------------------------------------------
; CONV  to, from, a/u, log2_out_size, log2_in_size, [kernel], [init]
; Straight single-stream sample-format conversion: dst[i] = convert(src[i]).
; Processes two (or, when narrowing, four) vectors per loop iteration via
; kernel %6, set up once by %7.  The "a" variant falls through to the "u"
; body when either pointer is not mmsize-aligned.
;------------------------------------------------------------------------------
%macro CONV 5-7 |
cglobal %2_to_%1_%3, 3, 3, 6, dst, src, len |
mov srcq , [srcq] |
mov dstq , [dstq] |
%ifidn %3, a |
; aligned entry: divert to the unaligned body if either pointer is misaligned
test dstq, mmsize-1 |
jne %2_to_%1_u_int %+ SUFFIX |
test srcq, mmsize-1 |
jne %2_to_%1_u_int %+ SUFFIX |
%else |
%2_to_%1_u_int %+ SUFFIX: |
%endif |
; point pointers one-past-the-end and count lenq up from -len to 0
lea srcq , [srcq + (1<<%5)*lenq] |
lea dstq , [dstq + (1<<%4)*lenq] |
neg lenq |
%7 m0,m1,m2,m3,m4,m5 |
.next: |
mov%3 m0, [ srcq +(1<<%5)*lenq] |
mov%3 m1, [ mmsize + srcq +(1<<%5)*lenq] |
%if %4 < %5 |
; narrowing conversion consumes four input vectors per iteration
mov%3 m2, [2*mmsize + srcq +(1<<%5)*lenq] |
mov%3 m3, [3*mmsize + srcq +(1<<%5)*lenq] |
%endif |
%6 m0,m1,m2,m3,m4,m5 |
mov%3 [ dstq+(1<<%4)*lenq], m0 |
mov%3 [ mmsize + dstq+(1<<%4)*lenq], m1 |
%if %4 > %5 |
mov%3 [2*mmsize + dstq+(1<<%4)*lenq], m2 |
mov%3 [3*mmsize + dstq+(1<<%4)*lenq], m3 |
add lenq, 4*mmsize/(1<<%4) |
%else |
add lenq, 2*mmsize/(1<<%4) |
%endif |
jl .next |
%if mmsize == 8 |
; MMX build: leave the FPU/MMX state clean before returning
emms |
RET |
%else |
REP_RET |
%endif |
%endmacro |
;------------------------------------------------------------------------------
; PACK_6CH  to, from, a/u, log2_out_size, log2_in_size, [kernel], [init]
; Interleaves six planar 32-bit channels into one packed stream.  src1..src5
; are converted to byte offsets relative to srcq so only srcq advances in
; the loop.  The SSE path transposes a 6x4 block of floats per iteration
; with SBUTTERFLYPS + movlhps/movhlps (blendps on AVX); the MMX fallback
; transposes 6x2 dwords with SBUTTERFLY.
;------------------------------------------------------------------------------
%macro PACK_6CH 5-7 |
cglobal pack_6ch_%2_to_%1_%3, 2,8,7, dst, src, src1, src2, src3, src4, src5, len |
%if ARCH_X86_64 |
mov lend, r2d |
%else |
; x86_32: not enough registers, keep len in its stack slot
%define lend dword r2m |
%endif |
mov src1q, [srcq+1*gprsize] |
mov src2q, [srcq+2*gprsize] |
mov src3q, [srcq+3*gprsize] |
mov src4q, [srcq+4*gprsize] |
mov src5q, [srcq+5*gprsize] |
mov srcq, [srcq] |
mov dstq, [dstq] |
%ifidn %3, a |
; aligned entry: divert to the unaligned body if any pointer is misaligned
test dstq, mmsize-1 |
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX |
test srcq, mmsize-1 |
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX |
test src1q, mmsize-1 |
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX |
test src2q, mmsize-1 |
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX |
test src3q, mmsize-1 |
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX |
test src4q, mmsize-1 |
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX |
test src5q, mmsize-1 |
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX |
%else |
pack_6ch_%2_to_%1_u_int %+ SUFFIX: |
%endif |
; turn src1..src5 into offsets from srcq; only srcq is advanced per loop
sub src1q, srcq |
sub src2q, srcq |
sub src3q, srcq |
sub src4q, srcq |
sub src5q, srcq |
%7 x,x,x,x,m7,x |
.loop: |
mov%3 m0, [srcq ] |
mov%3 m1, [srcq+src1q] |
mov%3 m2, [srcq+src2q] |
mov%3 m3, [srcq+src3q] |
mov%3 m4, [srcq+src4q] |
mov%3 m5, [srcq+src5q] |
%if cpuflag(sse) |
; transpose the 6x4 float block into 4 interleaved 6-sample frames
SBUTTERFLYPS 0, 1, 6 |
SBUTTERFLYPS 2, 3, 6 |
SBUTTERFLYPS 4, 5, 6 |
%if cpuflag(avx) |
blendps m6, m4, m0, 1100b |
%else |
movaps m6, m4 |
shufps m4, m0, q3210 |
SWAP 4,6 |
%endif |
movlhps m0, m2 |
movhlps m4, m2 |
%if cpuflag(avx) |
blendps m2, m5, m1, 1100b |
%else |
movaps m2, m5 |
shufps m5, m1, q3210 |
SWAP 2,5 |
%endif |
movlhps m1, m3 |
movhlps m5, m3 |
%6 m0,m6,x,x,m7,m3 |
%6 m4,m1,x,x,m7,m3 |
%6 m2,m5,x,x,m7,m3 |
mov %+ %3 %+ ps [dstq ], m0 |
mov %+ %3 %+ ps [dstq+16], m6 |
mov %+ %3 %+ ps [dstq+32], m4 |
mov %+ %3 %+ ps [dstq+48], m1 |
mov %+ %3 %+ ps [dstq+64], m2 |
mov %+ %3 %+ ps [dstq+80], m5 |
%else ; mmx |
; MMX: transpose 6x2 dwords and store two frames per iteration
SBUTTERFLY dq, 0, 1, 6 |
SBUTTERFLY dq, 2, 3, 6 |
SBUTTERFLY dq, 4, 5, 6 |
movq [dstq ], m0 |
movq [dstq+ 8], m2 |
movq [dstq+16], m4 |
movq [dstq+24], m1 |
movq [dstq+32], m3 |
movq [dstq+40], m5 |
%endif |
add srcq, mmsize |
add dstq, mmsize*6 |
sub lend, mmsize/4 |
jg .loop |
%if mmsize == 8 |
emms |
RET |
%else |
REP_RET |
%endif |
%endmacro |
;------------------------------------------------------------------------------
; UNPACK_6CH  to, from, a/u, log2_out_size, log2_in_size, [kernel], [init]
; Inverse of PACK_6CH: deinterleaves one packed 6-channel 32-bit stream into
; six planar buffers.  dst1..dst5 become offsets from dstq; a sequence of
; SBUTTERFLYPS rounds performs the 4x6 -> 6x4 float transpose.
;------------------------------------------------------------------------------
%macro UNPACK_6CH 5-7 |
cglobal unpack_6ch_%2_to_%1_%3, 2, 8, 8, dst, src, dst1, dst2, dst3, dst4, dst5, len |
%if ARCH_X86_64 |
mov lend, r2d |
%else |
; x86_32: not enough registers, keep len in its stack slot
%define lend dword r2m |
%endif |
mov dst1q, [dstq+1*gprsize] |
mov dst2q, [dstq+2*gprsize] |
mov dst3q, [dstq+3*gprsize] |
mov dst4q, [dstq+4*gprsize] |
mov dst5q, [dstq+5*gprsize] |
mov dstq, [dstq] |
mov srcq, [srcq] |
%ifidn %3, a |
; aligned entry: divert to the unaligned body if any pointer is misaligned
test dstq, mmsize-1 |
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX |
test srcq, mmsize-1 |
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX |
test dst1q, mmsize-1 |
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX |
test dst2q, mmsize-1 |
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX |
test dst3q, mmsize-1 |
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX |
test dst4q, mmsize-1 |
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX |
test dst5q, mmsize-1 |
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX |
%else |
unpack_6ch_%2_to_%1_u_int %+ SUFFIX: |
%endif |
; turn dst1..dst5 into offsets from dstq; only dstq is advanced per loop
sub dst1q, dstq |
sub dst2q, dstq |
sub dst3q, dstq |
sub dst4q, dstq |
sub dst5q, dstq |
%7 x,x,x,x,m7,x |
.loop: |
mov%3 m0, [srcq ] |
mov%3 m1, [srcq+16] |
mov%3 m2, [srcq+32] |
mov%3 m3, [srcq+48] |
mov%3 m4, [srcq+64] |
mov%3 m5, [srcq+80] |
; transpose 4 packed 6-sample frames back into 6 planar 4-sample vectors
SBUTTERFLYPS 0, 3, 6 |
SBUTTERFLYPS 1, 4, 6 |
SBUTTERFLYPS 2, 5, 6 |
SBUTTERFLYPS 0, 4, 6 |
SBUTTERFLYPS 3, 2, 6 |
SBUTTERFLYPS 1, 5, 6 |
SWAP 1, 4 |
SWAP 2, 3 |
%6 m0,m1,x,x,m7,m6 |
%6 m2,m3,x,x,m7,m6 |
%6 m4,m5,x,x,m7,m6 |
mov %+ %3 %+ ps [dstq ], m0 |
mov %+ %3 %+ ps [dstq+dst1q], m1 |
mov %+ %3 %+ ps [dstq+dst2q], m2 |
mov %+ %3 %+ ps [dstq+dst3q], m3 |
mov %+ %3 %+ ps [dstq+dst4q], m4 |
mov %+ %3 %+ ps [dstq+dst5q], m5 |
add srcq, mmsize*6 |
add dstq, mmsize |
sub lend, mmsize/4 |
jg .loop |
REP_RET |
%endmacro |
; Number of GPR arguments cglobal should set up: 10 on x86_64; on x86_32
; only 6 or 7 fit, so some source pointers are spilled to stack slots below.
%define PACK_8CH_GPRS (10 * ARCH_X86_64) + ((6 + HAVE_ALIGNED_STACK) * ARCH_X86_32) |
;------------------------------------------------------------------------------
; PACK_8CH  to, from, a/u, log2_out_size, log2_in_size, [kernel], [init]
; Interleaves eight planar 32-bit channels into one packed stream via an
; 8x4 dword transpose (TRANSPOSE8x4D).  On x86_32 there are not enough
; registers: src1, src4 (without an aligned stack) and src7 share r0 and
; live in stack slots [rsp+32..40], reloaded inside the loop; dst is kept
; in its argument slot (dstm).  NOTE(review): heavily order-dependent --
; the interleaving of loads and spills must not be reordered.
;------------------------------------------------------------------------------
%macro PACK_8CH 5-7 |
cglobal pack_8ch_%2_to_%1_%3, 2,PACK_8CH_GPRS,10, ARCH_X86_32*48, dst, src, len, src1, src2, src3, src4, src5, src6, src7 |
mov dstq, [dstq] |
%if ARCH_X86_32 |
DEFINE_ARGS dst, src, src2, src3, src4, src5, src6 |
%define lend dword r2m |
%define src1q r0q |
%define src1m dword [rsp+32] |
%if HAVE_ALIGNED_STACK == 0 |
DEFINE_ARGS dst, src, src2, src3, src5, src6 |
%define src4q r0q |
%define src4m dword [rsp+36] |
%endif |
%define src7q r0q |
%define src7m dword [rsp+40] |
mov dstm, dstq |
%endif |
mov src7q, [srcq+7*gprsize] |
mov src6q, [srcq+6*gprsize] |
%if ARCH_X86_32 |
mov src7m, src7q |
%endif |
mov src5q, [srcq+5*gprsize] |
mov src4q, [srcq+4*gprsize] |
mov src3q, [srcq+3*gprsize] |
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0 |
mov src4m, src4q |
%endif |
mov src2q, [srcq+2*gprsize] |
mov src1q, [srcq+1*gprsize] |
mov srcq, [srcq] |
%ifidn %3, a |
; aligned entry: divert to the unaligned body if any pointer is misaligned
%if ARCH_X86_32 |
test dstmp, mmsize-1 |
%else |
test dstq, mmsize-1 |
%endif |
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX |
test srcq, mmsize-1 |
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX |
test src1q, mmsize-1 |
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX |
test src2q, mmsize-1 |
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX |
test src3q, mmsize-1 |
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX |
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0 |
test src4m, mmsize-1 |
%else |
test src4q, mmsize-1 |
%endif |
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX |
test src5q, mmsize-1 |
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX |
test src6q, mmsize-1 |
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX |
%if ARCH_X86_32 |
test src7m, mmsize-1 |
%else |
test src7q, mmsize-1 |
%endif |
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX |
%else |
pack_8ch_%2_to_%1_u_int %+ SUFFIX: |
%endif |
; turn src1..src7 into offsets from srcq (spilled copies included)
sub src1q, srcq |
sub src2q, srcq |
sub src3q, srcq |
%if ARCH_X86_64 || HAVE_ALIGNED_STACK |
sub src4q, srcq |
%else |
sub src4m, srcq |
%endif |
sub src5q, srcq |
sub src6q, srcq |
%if ARCH_X86_64 |
sub src7q, srcq |
%else |
mov src1m, src1q |
sub src7m, srcq |
%endif |
%if ARCH_X86_64 |
%7 x,x,x,x,m9,x |
%elifidn %1, int32 |
; x86_32 lacks a spare xmm reg: address the scale constant from memory
%define m9 [flt2p31] |
%else |
%define m9 [flt2pm31] |
%endif |
.loop: |
mov%3 m0, [srcq ] |
mov%3 m1, [srcq+src1q] |
mov%3 m2, [srcq+src2q] |
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0 |
mov src4q, src4m |
%endif |
mov%3 m3, [srcq+src3q] |
mov%3 m4, [srcq+src4q] |
mov%3 m5, [srcq+src5q] |
%if ARCH_X86_32 |
mov src7q, src7m |
%endif |
mov%3 m6, [srcq+src6q] |
mov%3 m7, [srcq+src7q] |
%if ARCH_X86_64 |
TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8 |
%6 m0,m1,x,x,m9,m8 |
%6 m2,m3,x,x,m9,m8 |
%6 m4,m5,x,x,m9,m8 |
%6 m6,m7,x,x,m9,m8 |
mov%3 [dstq], m0 |
%else |
; x86_32 transpose uses two stack temporaries instead of a 9th register
mov dstq, dstm |
TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, [rsp], [rsp+16], 1 |
%6 m0,m1,x,x,m9,m2 |
mova m2, [rsp] |
mov%3 [dstq], m0 |
%6 m2,m3,x,x,m9,m0 |
%6 m4,m5,x,x,m9,m0 |
%6 m6,m7,x,x,m9,m0 |
%endif |
mov%3 [dstq+16], m1 |
mov%3 [dstq+32], m2 |
mov%3 [dstq+48], m3 |
mov%3 [dstq+64], m4 |
mov%3 [dstq+80], m5 |
mov%3 [dstq+96], m6 |
mov%3 [dstq+112], m7 |
add srcq, mmsize |
add dstq, mmsize*8 |
%if ARCH_X86_32 |
mov dstm, dstq |
mov src1q, src1m |
%endif |
sub lend, mmsize/4 |
jg .loop |
REP_RET |
%endmacro |
; Widen 2 vectors of int16 (m0,m1) into 4 vectors of int32 (m0..m3).
; punpck against a zero register places each sample in the high 16 bits,
; i.e. int32 = int16 << 16 (full-scale-preserving widening).
%macro INT16_TO_INT32_N 6 |
pxor m2, m2 |
pxor m3, m3 |
punpcklwd m2, m1 |
punpckhwd m3, m1 |
SWAP 4,0 |
pxor m0, m0 |
pxor m1, m1 |
punpcklwd m0, m4 |
punpckhwd m1, m4 |
%endmacro |
; Narrow 4 vectors of int32 (m0..m3) into 2 vectors of int16 (m0,m1):
; keep the top 16 bits of each sample and re-pack with saturation.
%macro INT32_TO_INT16_N 6 |
psrad m0, 16 |
psrad m1, 16 |
psrad m2, 16 |
psrad m3, 16 |
packssdw m0, m1 |
packssdw m2, m3 |
SWAP 1,2 |
%endmacro |
; Load the 1/2^31 scale constant into the caller-designated register %5.
%macro INT32_TO_FLOAT_INIT 6 |
mova %5, [flt2pm31] |
%endmacro |
; int32 -> float: convert then scale to [-1,1). |
%macro INT32_TO_FLOAT_N 6 |
cvtdq2ps %1, %1 |
cvtdq2ps %2, %2 |
mulps %1, %1, %5 |
mulps %2, %2, %5 |
%endmacro |
; Load the 2^31 scale constant into the caller-designated register %5.
%macro FLOAT_TO_INT32_INIT 6 |
mova %5, [flt2p31] |
%endmacro |
; float -> int32: scale by 2^31 and convert.  Out-of-range values make
; cvtps2dq return 0x80000000; the GE(2^31) compare mask (all-ones = -1)
; is then added so positive overflow saturates to 0x7FFFFFFF instead.
%macro FLOAT_TO_INT32_N 6 |
mulps %1, %5 |
mulps %2, %5 |
cvtps2dq %6, %1 |
cmpps %1, %1, %5, 5 |
paddd %1, %6 |
cvtps2dq %6, %2 |
cmpps %2, %2, %5, 5 |
paddd %2, %6 |
%endmacro |
; int16 -> float: widen to int32 (value << 16), convert, scale by 1/2^31.
%macro INT16_TO_FLOAT_INIT 6 |
mova m5, [flt2pm31] |
%endmacro |
%macro INT16_TO_FLOAT_N 6 |
INT16_TO_INT32_N %1,%2,%3,%4,%5,%6 |
cvtdq2ps m0, m0 |
cvtdq2ps m1, m1 |
cvtdq2ps m2, m2 |
cvtdq2ps m3, m3 |
mulps m0, m0, m5 |
mulps m1, m1, m5 |
mulps m2, m2, m5 |
mulps m3, m3, m5 |
%endmacro |
; float -> int16: scale by 2^15, convert, pack with signed saturation.
%macro FLOAT_TO_INT16_INIT 6 |
mova m5, [flt2p15] |
%endmacro |
%macro FLOAT_TO_INT16_N 6 |
mulps m0, m5 |
mulps m1, m5 |
mulps m2, m5 |
mulps m3, m5 |
cvtps2dq m0, m0 |
cvtps2dq m1, m1 |
packssdw m0, m1 |
cvtps2dq m1, m2 |
cvtps2dq m3, m3 |
packssdw m1, m3 |
%endmacro |
; Empty kernel/init placeholder for same-format pack/unpack variants.
%macro NOP_N 0-6 |
%endmacro |
; ---- Instantiate every conversion/pack/unpack variant per ISA level. ----
; Each CONV/PACK/UNPACK line emits both code and the matching C symbol;
; the "u"/"a" pairs share one body via the fall-through label scheme above.
INIT_MMX mmx |
CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N |
CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N |
CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N |
CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N |
PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N |
PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N |
INIT_XMM sse |
PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N |
PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N |
UNPACK_6CH float, float, u, 2, 2, NOP_N, NOP_N |
UNPACK_6CH float, float, a, 2, 2, NOP_N, NOP_N |
INIT_XMM sse2 |
CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N |
CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N |
CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N |
CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N |
PACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N |
PACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N |
PACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N |
PACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N |
PACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N |
PACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N |
PACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N |
PACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N |
UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N |
UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N |
UNPACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N |
UNPACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N |
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N |
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N |
UNPACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N |
UNPACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N |
CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
CONV float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT |
CONV float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT |
CONV int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT |
CONV int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT |
PACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
PACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
PACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
PACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
PACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT |
PACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT |
PACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT |
PACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT |
UNPACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
UNPACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
UNPACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
UNPACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT |
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT |
UNPACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT |
UNPACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT |
PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
UNPACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
UNPACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
UNPACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
UNPACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
PACK_8CH float, float, u, 2, 2, NOP_N, NOP_N |
PACK_8CH float, float, a, 2, 2, NOP_N, NOP_N |
PACK_8CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
PACK_8CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
PACK_8CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
PACK_8CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
INIT_XMM ssse3 |
; SSSE3 only improves 16-bit unpack (single-pshufb deinterleave path)
UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N |
UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N |
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N |
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N |
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT |
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT |
%if HAVE_AVX_EXTERNAL |
INIT_XMM avx |
PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N |
PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N |
UNPACK_6CH float, float, u, 2, 2, NOP_N, NOP_N |
UNPACK_6CH float, float, a, 2, 2, NOP_N, NOP_N |
PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
UNPACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
UNPACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
UNPACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
UNPACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
PACK_8CH float, float, u, 2, 2, NOP_N, NOP_N |
PACK_8CH float, float, a, 2, 2, NOP_N, NOP_N |
PACK_8CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
PACK_8CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
PACK_8CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
PACK_8CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
INIT_YMM avx |
CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT |
%endif |
%if HAVE_AVX2_EXTERNAL |
INIT_YMM avx2 |
CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT |
%endif |
/* -------------------------------------------------------------------------
 * File: /contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswresample/x86/audio_convert_init.c
 * (new file, 181 lines)
 * ------------------------------------------------------------------------- */
/* |
* Copyright (C) 2012 Michael Niedermayer (michaelni@gmx.at) |
* |
* This file is part of libswresample |
* |
* libswresample is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* libswresample is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with libswresample; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/x86/cpu.h" |
#include "libswresample/swresample_internal.h" |
#include "libswresample/audioconvert.h" |
/*
 * Generate prototypes for every asm conversion function:
 *   void ff<pre><in>_to_<out>_a_<cap>(uint8_t **dst, const uint8_t **src, int len);
 * over all in/out sample types and all ISA suffixes.  Prototypes for
 * variants the asm does not build are harmless as long as they are
 * never referenced below.
 */
#define PROTO(pre, in, out, cap) void ff ## pre ## in## _to_ ##out## _a_ ##cap(uint8_t **dst, const uint8_t **src, int len); |
#define PROTO2(pre, out, cap) PROTO(pre, int16, out, cap) PROTO(pre, int32, out, cap) PROTO(pre, float, out, cap) |
#define PROTO3(pre, cap) PROTO2(pre, int16, cap) PROTO2(pre, int32, cap) PROTO2(pre, float, cap) |
#define PROTO4(pre) PROTO3(pre, mmx) PROTO3(pre, sse) PROTO3(pre, sse2) PROTO3(pre, ssse3) PROTO3(pre, sse4) PROTO3(pre, avx) PROTO3(pre, avx2) |
PROTO4(_) |
PROTO4(_pack_2ch_) |
PROTO4(_pack_6ch_) |
PROTO4(_pack_8ch_) |
PROTO4(_unpack_2ch_) |
PROTO4(_unpack_6ch_) |
/*
 * Select the best SIMD sample-format conversion routine for the given
 * (out_fmt, in_fmt, channels) combination and store it in ac->simd_f
 * (left NULL when no asm version applies).
 *
 * NOTE: the ISA sections are ordered slowest-first and later assignments
 * deliberately overwrite earlier ones, so the fastest applicable version
 * wins.  The conditions rely on && binding tighter than || .
 * Packed<->planar pairs of the same width (e.g. FLT/FLTP and S32/S32P)
 * share one bit-copying pack/unpack routine.
 */
av_cold void swri_audio_convert_init_x86(struct AudioConvert *ac, |
enum AVSampleFormat out_fmt, |
enum AVSampleFormat in_fmt, |
int channels){ |
int mm_flags = av_get_cpu_flags(); |
ac->simd_f= NULL; |
//FIXME add memcpy case |
#define MULTI_CAPS_FUNC(flag, cap) \ |
if (EXTERNAL_##flag(mm_flags)) {\ |
if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S16 || out_fmt == AV_SAMPLE_FMT_S32P && in_fmt == AV_SAMPLE_FMT_S16P)\ |
ac->simd_f = ff_int16_to_int32_a_ ## cap;\ |
if( out_fmt == AV_SAMPLE_FMT_S16 && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_S32P)\ |
ac->simd_f = ff_int32_to_int16_a_ ## cap;\ |
} |
MULTI_CAPS_FUNC(MMX, mmx) |
MULTI_CAPS_FUNC(SSE2, sse2) |
if(EXTERNAL_MMX(mm_flags)) { |
if(channels == 6) { |
/* same-width planar->packed is a pure interleave, shared by FLT and S32 */ |
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P) |
ac->simd_f = ff_pack_6ch_float_to_float_a_mmx; |
} |
} |
if(EXTERNAL_SSE(mm_flags)) { |
if(channels == 6) { |
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P) |
ac->simd_f = ff_pack_6ch_float_to_float_a_sse; |
if( out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_FLT || out_fmt == AV_SAMPLE_FMT_S32P && in_fmt == AV_SAMPLE_FMT_S32) |
ac->simd_f = ff_unpack_6ch_float_to_float_a_sse; |
} |
} |
if(EXTERNAL_SSE2(mm_flags)) { |
/* single-stream format conversions (packed and planar use the same loop) */ |
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32P) |
ac->simd_f = ff_int32_to_float_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S16 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S16P) |
ac->simd_f = ff_int16_to_float_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_FLT || out_fmt == AV_SAMPLE_FMT_S32P && in_fmt == AV_SAMPLE_FMT_FLTP) |
ac->simd_f = ff_float_to_int32_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_S16 && in_fmt == AV_SAMPLE_FMT_FLT || out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_FLTP) |
ac->simd_f = ff_float_to_int16_a_sse2; |
if(channels == 2) { |
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P) |
ac->simd_f = ff_pack_2ch_int32_to_int32_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_S16 && in_fmt == AV_SAMPLE_FMT_S16P) |
ac->simd_f = ff_pack_2ch_int16_to_int16_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S16P) |
ac->simd_f = ff_pack_2ch_int16_to_int32_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_S16 && in_fmt == AV_SAMPLE_FMT_S32P) |
ac->simd_f = ff_pack_2ch_int32_to_int16_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_FLT || out_fmt == AV_SAMPLE_FMT_S32P && in_fmt == AV_SAMPLE_FMT_S32) |
ac->simd_f = ff_unpack_2ch_int32_to_int32_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_S16) |
ac->simd_f = ff_unpack_2ch_int16_to_int16_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_S32P && in_fmt == AV_SAMPLE_FMT_S16) |
ac->simd_f = ff_unpack_2ch_int16_to_int32_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_S32) |
ac->simd_f = ff_unpack_2ch_int32_to_int16_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32P) |
ac->simd_f = ff_pack_2ch_int32_to_float_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_FLTP) |
ac->simd_f = ff_pack_2ch_float_to_int32_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S16P) |
ac->simd_f = ff_pack_2ch_int16_to_float_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_S16 && in_fmt == AV_SAMPLE_FMT_FLTP) |
ac->simd_f = ff_pack_2ch_float_to_int16_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32) |
ac->simd_f = ff_unpack_2ch_int32_to_float_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_S32P && in_fmt == AV_SAMPLE_FMT_FLT) |
ac->simd_f = ff_unpack_2ch_float_to_int32_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S16) |
ac->simd_f = ff_unpack_2ch_int16_to_float_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_FLT) |
ac->simd_f = ff_unpack_2ch_float_to_int16_a_sse2; |
} |
if(channels == 6) { |
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32P) |
ac->simd_f = ff_pack_6ch_int32_to_float_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_FLTP) |
ac->simd_f = ff_pack_6ch_float_to_int32_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32) |
ac->simd_f = ff_unpack_6ch_int32_to_float_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_S32P && in_fmt == AV_SAMPLE_FMT_FLT) |
ac->simd_f = ff_unpack_6ch_float_to_int32_a_sse2; |
} |
if(channels == 8) { |
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P) |
ac->simd_f = ff_pack_8ch_float_to_float_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32P) |
ac->simd_f = ff_pack_8ch_int32_to_float_a_sse2; |
if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_FLTP) |
ac->simd_f = ff_pack_8ch_float_to_int32_a_sse2; |
} |
} |
if(EXTERNAL_SSSE3(mm_flags)) { |
if(channels == 2) { |
/* SSSE3 replaces the punpck deinterleave with a single pshufb */ |
if( out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_S16) |
ac->simd_f = ff_unpack_2ch_int16_to_int16_a_ssse3; |
if( out_fmt == AV_SAMPLE_FMT_S32P && in_fmt == AV_SAMPLE_FMT_S16) |
ac->simd_f = ff_unpack_2ch_int16_to_int32_a_ssse3; |
if( out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S16) |
ac->simd_f = ff_unpack_2ch_int16_to_float_a_ssse3; |
} |
} |
if(EXTERNAL_AVX_FAST(mm_flags)) { |
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32 || out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32P) |
ac->simd_f = ff_int32_to_float_a_avx; |
} |
if(EXTERNAL_AVX(mm_flags)) { |
if(channels == 6) { |
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P) |
ac->simd_f = ff_pack_6ch_float_to_float_a_avx; |
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32P) |
ac->simd_f = ff_pack_6ch_int32_to_float_a_avx; |
if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_FLTP) |
ac->simd_f = ff_pack_6ch_float_to_int32_a_avx; |
if( out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_FLT || out_fmt == AV_SAMPLE_FMT_S32P && in_fmt == AV_SAMPLE_FMT_S32) |
ac->simd_f = ff_unpack_6ch_float_to_float_a_avx; |
if( out_fmt == AV_SAMPLE_FMT_FLTP && in_fmt == AV_SAMPLE_FMT_S32) |
ac->simd_f = ff_unpack_6ch_int32_to_float_a_avx; |
if( out_fmt == AV_SAMPLE_FMT_S32P && in_fmt == AV_SAMPLE_FMT_FLT) |
ac->simd_f = ff_unpack_6ch_float_to_int32_a_avx; |
} |
if(channels == 8) { |
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_FLTP || out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_S32P) |
ac->simd_f = ff_pack_8ch_float_to_float_a_avx; |
if( out_fmt == AV_SAMPLE_FMT_FLT && in_fmt == AV_SAMPLE_FMT_S32P) |
ac->simd_f = ff_pack_8ch_int32_to_float_a_avx; |
if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_FLTP) |
ac->simd_f = ff_pack_8ch_float_to_int32_a_avx; |
} |
} |
if(EXTERNAL_AVX2(mm_flags)) { |
if( out_fmt == AV_SAMPLE_FMT_S32 && in_fmt == AV_SAMPLE_FMT_FLT || out_fmt == AV_SAMPLE_FMT_S32P && in_fmt == AV_SAMPLE_FMT_FLTP) |
ac->simd_f = ff_float_to_int32_a_avx2; |
} |
} |
;******************************************************************************
;* File: /contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswresample/x86/rematrix.asm
;* (new file, 250 lines)
;******************************************************************************
;****************************************************************************** |
;* Copyright (c) 2012 Michael Niedermayer |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
SECTION_RODATA 32 |
; dword / word vectors of 1: shifted to build the fixed-point rounding
; terms, and interleaved with samples for the pmaddwd mixing trick below
dw1: times 8 dd 1 |
w1 : times 16 dw 1 |
SECTION .text |
;------------------------------------------------------------------------------
; MIX2_FLT a/u
; Float 2->1 mix: out[i] = coeffp[index1]*in1[i] + coeffp[index2]*in2[i].
; Coefficients are broadcast once; the aligned variant folds the loads
; into mulps memory operands, the unaligned one uses movu first.
;------------------------------------------------------------------------------
%macro MIX2_FLT 1 |
cglobal mix_2_1_%1_float, 7, 7, 6, out, in1, in2, coeffp, index1, index2, len |
%ifidn %1, a |
; aligned entry: divert to the unaligned body if any pointer is misaligned
test in1q, mmsize-1 |
jne mix_2_1_float_u_int %+ SUFFIX |
test in2q, mmsize-1 |
jne mix_2_1_float_u_int %+ SUFFIX |
test outq, mmsize-1 |
jne mix_2_1_float_u_int %+ SUFFIX |
%else |
mix_2_1_float_u_int %+ SUFFIX: |
%endif |
VBROADCASTSS m4, [coeffpq + 4*index1q] |
VBROADCASTSS m5, [coeffpq + 4*index2q] |
; len in samples -> bytes; run lenq from -len*4 up to 0
shl lend , 2 |
add in1q , lenq |
add in2q , lenq |
add outq , lenq |
neg lenq |
.next: |
%ifidn %1, a |
mulps m0, m4, [in1q + lenq ] |
mulps m1, m5, [in2q + lenq ] |
mulps m2, m4, [in1q + lenq + mmsize] |
mulps m3, m5, [in2q + lenq + mmsize] |
%else |
movu m0, [in1q + lenq ] |
movu m1, [in2q + lenq ] |
movu m2, [in1q + lenq + mmsize] |
movu m3, [in2q + lenq + mmsize] |
mulps m0, m0, m4 |
mulps m1, m1, m5 |
mulps m2, m2, m4 |
mulps m3, m3, m5 |
%endif |
addps m0, m0, m1 |
addps m2, m2, m3 |
mov%1 [outq + lenq ], m0 |
mov%1 [outq + lenq + mmsize], m2 |
add lenq, mmsize*2 |
jl .next |
REP_RET |
%endmacro |
;------------------------------------------------------------------------------
; MIX1_FLT a/u
; Float 1->1 scale: out[i] = coeffp[index] * in[i].
;------------------------------------------------------------------------------
%macro MIX1_FLT 1 |
cglobal mix_1_1_%1_float, 5, 5, 3, out, in, coeffp, index, len |
%ifidn %1, a |
; aligned entry: divert to the unaligned body if either pointer is misaligned
test inq, mmsize-1 |
jne mix_1_1_float_u_int %+ SUFFIX |
test outq, mmsize-1 |
jne mix_1_1_float_u_int %+ SUFFIX |
%else |
mix_1_1_float_u_int %+ SUFFIX: |
%endif |
VBROADCASTSS m2, [coeffpq + 4*indexq] |
; len in samples -> bytes; run lenq from -len*4 up to 0
shl lenq , 2 |
add inq , lenq |
add outq , lenq |
neg lenq |
.next: |
%ifidn %1, a |
mulps m0, m2, [inq + lenq ] |
mulps m1, m2, [inq + lenq + mmsize] |
%else |
movu m0, [inq + lenq ] |
movu m1, [inq + lenq + mmsize] |
mulps m0, m0, m2 |
mulps m1, m1, m2 |
%endif |
mov%1 [outq + lenq ], m0 |
mov%1 [outq + lenq + mmsize], m1 |
add lenq, mmsize*2 |
jl .next |
REP_RET |
%endmacro |
;------------------------------------------------------------------------------
; MIX1_INT16 a/u
; Fixed-point int16 1->1 scale.  The coefficient table stores pairs of
; int16 per entry: [0] = quantized coefficient, [1] = right-shift amount
; (see swri_rematrix_init_x86).  Samples are interleaved with the word
; constant 1 and fed to pmaddwd against (coeff, rounding) pairs, so each
; dword result is sample*coeff + (1 << (shift-1)), then >> shift.
;------------------------------------------------------------------------------
%macro MIX1_INT16 1 |
cglobal mix_1_1_%1_int16, 5, 5, 6, out, in, coeffp, index, len |
%ifidn %1, a |
; aligned entry: divert to the unaligned body if either pointer is misaligned
test inq, mmsize-1 |
jne mix_1_1_int16_u_int %+ SUFFIX |
test outq, mmsize-1 |
jne mix_1_1_int16_u_int %+ SUFFIX |
%else |
mix_1_1_int16_u_int %+ SUFFIX: |
%endif |
; load (coeff, shift) pair: m5 = coeff splat, m4 = shift count
movd m4, [coeffpq + 4*indexq] |
SPLATW m5, m4 |
psllq m4, 32 |
psrlq m4, 48 |
; build rounding constant 1 << (shift-1) and pair it with the coefficient
mova m0, [w1] |
psllw m0, m4 |
psrlw m0, 1 |
punpcklwd m5, m0 |
; len in samples -> bytes (x2); run lenq from -len*2 up to 0
add lenq , lenq |
add inq , lenq |
add outq , lenq |
neg lenq |
.next: |
mov%1 m0, [inq + lenq ] |
mov%1 m2, [inq + lenq + mmsize] |
mova m1, m0 |
mova m3, m2 |
; interleave each sample with the constant 1 for the pmaddwd trick
punpcklwd m0, [w1] |
punpckhwd m1, [w1] |
punpcklwd m2, [w1] |
punpckhwd m3, [w1] |
pmaddwd m0, m5 |
pmaddwd m1, m5 |
pmaddwd m2, m5 |
pmaddwd m3, m5 |
psrad m0, m4 |
psrad m1, m4 |
psrad m2, m4 |
psrad m3, m4 |
packssdw m0, m1 |
packssdw m2, m3 |
mov%1 [outq + lenq ], m0 |
mov%1 [outq + lenq + mmsize], m2 |
add lenq, mmsize*2 |
jl .next |
%if mmsize == 8 |
; MMX build: leave the FPU/MMX state clean before returning
emms |
RET |
%else |
REP_RET |
%endif |
%endmacro |
;------------------------------------------------------------------------------
; MIX2_INT16 a/u
; Fixed-point int16 2->1 mix.  Interleaves samples from the two inputs and
; uses pmaddwd against (coeff1, coeff2) word pairs, so each dword result is
; in1*c1 + in2*c2; a dword rounding term 1 << (shift-1) is added before the
; arithmetic right shift.  Coefficient entries are (value, shift) int16
; pairs as written by swri_rematrix_init_x86.
;------------------------------------------------------------------------------
%macro MIX2_INT16 1 |
cglobal mix_2_1_%1_int16, 7, 7, 8, out, in1, in2, coeffp, index1, index2, len |
%ifidn %1, a |
; aligned entry: divert to the unaligned body if any pointer is misaligned
test in1q, mmsize-1 |
jne mix_2_1_int16_u_int %+ SUFFIX |
test in2q, mmsize-1 |
jne mix_2_1_int16_u_int %+ SUFFIX |
test outq, mmsize-1 |
jne mix_2_1_int16_u_int %+ SUFFIX |
%else |
mix_2_1_int16_u_int %+ SUFFIX: |
%endif |
; m5 = (coeff1, coeff2) word pairs; m4 = shift; m7 = dword rounding term
movd m4, [coeffpq + 4*index1q] |
movd m6, [coeffpq + 4*index2q] |
SPLATW m5, m4 |
SPLATW m6, m6 |
psllq m4, 32 |
psrlq m4, 48 |
mova m7, [dw1] |
pslld m7, m4 |
psrld m7, 1 |
punpcklwd m5, m6 |
; len in samples -> bytes (x2); run lenq from -len*2 up to 0
add lend , lend |
add in1q , lenq |
add in2q , lenq |
add outq , lenq |
neg lenq |
.next: |
mov%1 m0, [in1q + lenq ] |
mov%1 m2, [in2q + lenq ] |
mova m1, m0 |
punpcklwd m0, m2 |
punpckhwd m1, m2 |
mov%1 m2, [in1q + lenq + mmsize] |
mov%1 m6, [in2q + lenq + mmsize] |
mova m3, m2 |
punpcklwd m2, m6 |
punpckhwd m3, m6 |
pmaddwd m0, m5 |
pmaddwd m1, m5 |
pmaddwd m2, m5 |
pmaddwd m3, m5 |
paddd m0, m7 |
paddd m1, m7 |
paddd m2, m7 |
paddd m3, m7 |
psrad m0, m4 |
psrad m1, m4 |
psrad m2, m4 |
psrad m3, m4 |
packssdw m0, m1 |
packssdw m2, m3 |
mov%1 [outq + lenq ], m0 |
mov%1 [outq + lenq + mmsize], m2 |
add lenq, mmsize*2 |
jl .next |
%if mmsize == 8 |
; MMX build: leave the FPU/MMX state clean before returning
emms |
RET |
%else |
REP_RET |
%endif |
%endmacro |
; ---- Instantiate the mix kernels per ISA level (u/a pairs share a body). ----
INIT_MMX mmx |
MIX1_INT16 u |
MIX1_INT16 a |
MIX2_INT16 u |
MIX2_INT16 a |
INIT_XMM sse |
MIX2_FLT u |
MIX2_FLT a |
MIX1_FLT u |
MIX1_FLT a |
INIT_XMM sse2 |
MIX1_INT16 u |
MIX1_INT16 a |
MIX2_INT16 u |
MIX2_INT16 a |
%if HAVE_AVX_EXTERNAL |
INIT_YMM avx |
MIX2_FLT u |
MIX2_FLT a |
MIX1_FLT u |
MIX1_FLT a |
%endif |
/* -------------------------------------------------------------------------
 * File: /contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswresample/x86/rematrix_init.c
 * (new file, 90 lines)
 * ------------------------------------------------------------------------- */
/* |
* Copyright (C) 2012 Michael Niedermayer (michaelni@gmx.at) |
* |
* This file is part of libswresample |
* |
* libswresample is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* libswresample is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with libswresample; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libavutil/x86/cpu.h" |
#include "libswresample/swresample_internal.h" |
/* Declare the yasm-implemented one-input and two-input mixing kernels
 * (defined in rematrix.asm) for each (sample type, SIMD flavor) pair.
 * Only the aligned ("_a_") variants are declared/used here. */
#define D(type, simd) \
mix_1_1_func_type ff_mix_1_1_a_## type ## _ ## simd;\
mix_2_1_func_type ff_mix_2_1_a_## type ## _ ## simd;
D(float, sse)
D(float, avx)
D(int16, mmx)
D(int16, sse2)
/*
 * Select SIMD rematrixing kernels for the context's internal sample format
 * and build the SIMD-friendly copies of the mixing matrix.
 *
 * Returns 0 on success, AVERROR(ENOMEM) if the SIMD matrix buffers cannot
 * be allocated.  With HAVE_YASM disabled this is a no-op returning 0.
 */
av_cold int swri_rematrix_init_x86(struct SwrContext *s){
#if HAVE_YASM
int mm_flags = av_get_cpu_flags();
int nb_in = av_get_channel_layout_nb_channels(s->in_ch_layout);
int nb_out = av_get_channel_layout_nb_channels(s->out_ch_layout);
int num = nb_in * nb_out;
int i,j;
/* default to no SIMD; the checks below upgrade in increasing order of
 * capability, so the strongest supported variant wins */
s->mix_1_1_simd = NULL;
s->mix_2_1_simd = NULL;
if (s->midbuf.fmt == AV_SAMPLE_FMT_S16P){
if(EXTERNAL_MMX(mm_flags)) {
s->mix_1_1_simd = ff_mix_1_1_a_int16_mmx;
s->mix_2_1_simd = ff_mix_2_1_a_int16_mmx;
}
if(EXTERNAL_SSE2(mm_flags)) {
s->mix_1_1_simd = ff_mix_1_1_a_int16_sse2;
s->mix_2_1_simd = ff_mix_2_1_a_int16_sse2;
}
/* int16 path stores (coefficient, shift) int16 pairs, hence
 * 2 * sizeof(int16_t) per matrix entry */
s->native_simd_matrix = av_mallocz_array(num, 2 * sizeof(int16_t));
s->native_simd_one = av_mallocz(2 * sizeof(int16_t));
if (!s->native_simd_matrix || !s->native_simd_one)
return AVERROR(ENOMEM);
for(i=0; i<nb_out; i++){
/* sh first accumulates the largest |coefficient| of this output row,
 * then becomes the extra right-shift needed so the quantized
 * coefficient fits in int16 (0 when everything already fits) */
int sh = 0;
for(j=0; j<nb_in; j++)
sh = FFMAX(sh, FFABS(((int*)s->native_matrix)[i * nb_in + j]));
sh = FFMAX(av_log2(sh) - 14, 0);
for(j=0; j<nb_in; j++) {
/* odd index: post-multiply shift consumed by the asm kernels;
 * even index: coefficient rounded (add half an lsb) down by sh */
((int16_t*)s->native_simd_matrix)[2*(i * nb_in + j)+1] = 15 - sh;
((int16_t*)s->native_simd_matrix)[2*(i * nb_in + j)] =
((((int*)s->native_matrix)[i * nb_in + j]) + (1<<sh>>1)) >> sh;
}
}
/* unity coefficient: 16384 with shift 14, i.e. exactly 1.0 */
((int16_t*)s->native_simd_one)[1] = 14;
((int16_t*)s->native_simd_one)[0] = 16384;
} else if(s->midbuf.fmt == AV_SAMPLE_FMT_FLTP){
if(EXTERNAL_SSE(mm_flags)) {
s->mix_1_1_simd = ff_mix_1_1_a_float_sse;
s->mix_2_1_simd = ff_mix_2_1_a_float_sse;
}
if(EXTERNAL_AVX_FAST(mm_flags)) {
s->mix_1_1_simd = ff_mix_1_1_a_float_avx;
s->mix_2_1_simd = ff_mix_2_1_a_float_avx;
}
/* float path: plain copies of the native matrix/one are enough */
s->native_simd_matrix = av_mallocz_array(num, sizeof(float));
s->native_simd_one = av_mallocz(sizeof(float));
if (!s->native_simd_matrix || !s->native_simd_one)
return AVERROR(ENOMEM);
memcpy(s->native_simd_matrix, s->native_matrix, num * sizeof(float));
memcpy(s->native_simd_one, s->native_one, sizeof(float));
}
#endif
return 0;
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswresample/x86/resample.asm |
---|
0,0 → 1,605 |
;****************************************************************************** |
;* Copyright (c) 2012 Michael Niedermayer |
;* Copyright (c) 2014 James Almer <jamrial <at> gmail.com> |
;* Copyright (c) 2014 Ronald S. Bultje <rsbultje@gmail.com> |
;* |
;* This file is part of FFmpeg. |
;* |
;* FFmpeg is free software; you can redistribute it and/or |
;* modify it under the terms of the GNU Lesser General Public |
;* License as published by the Free Software Foundation; either |
;* version 2.1 of the License, or (at your option) any later version. |
;* |
;* FFmpeg is distributed in the hope that it will be useful, |
;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
;* Lesser General Public License for more details. |
;* |
;* You should have received a copy of the GNU Lesser General Public |
;* License along with FFmpeg; if not, write to the Free Software |
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
;****************************************************************************** |
%include "libavutil/x86/x86util.asm" |
; 'pointer' reserves one native-pointer-sized slot in the struc below
%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif
; Mirror of the leading fields of the C ResampleContext
; (libswresample/resample.h); the field order and sizes here must stay in
; sync with the C definition or every [ctxq+ResampleContext.*] load breaks.
struc ResampleContext
.av_class: pointer 1
.filter_bank: pointer 1
.filter_length: resd 1
.filter_alloc: resd 1
.ideal_dst_incr: resd 1
.dst_incr: resd 1
.dst_incr_div: resd 1
.dst_incr_mod: resd 1
.index: resd 1
.frac: resd 1
.src_incr: resd 1
.compensation_distance: resd 1
.phase_shift: resd 1
.phase_mask: resd 1
; there's a few more here but we only care about the first few
endstruc
SECTION_RODATA
; scalar constants used by RESAMPLE_FNS: 1.0 (float/double) for computing
; 1/src_incr, and 0x4000 as the int16 rounding bias (half of the final
; >>15 scaling step)
pf_1: dd 1.0
pdbl_1: dq 1.0
pd_0x4000: dd 0x4000
SECTION .text
%macro RESAMPLE_FNS 3-5 ; format [float or int16], bps, log2_bps, float op suffix [s or d], 1.0 constant
;------------------------------------------------------------------------------
; Emits two functions for one sample format under the current INIT_* cpuflags:
;   resample_common_%1: FIR resampling using the filter phase nearest to the
;                       current fractional position
;   resample_linear_%1: as above, but runs two adjacent filter phases and
;                       blends the results weighted by 'frac'
; Macro parameters:
;   %1 = sample format name (float, double or int16)
;   %2 = bytes per sample
;   %3 = log2(%2)
;   %4 = float op suffix, s or d (float/double variants only)
;   %5 = RODATA label of the 1.0 constant (float/double variants only)
; Return value (rax): source samples consumed — only valid (and only
; computed) when the update_ctx argument is non-zero.
;------------------------------------------------------------------------------
; int resample_common_$format(ResampleContext *ctx, $format *dst,
; const $format *src, int size, int update_ctx)
%if ARCH_X86_64 ; unix64 and win64
cglobal resample_common_%1, 0, 15, 2, ctx, dst, src, phase_shift, index, frac, \
dst_incr_mod, size, min_filter_count_x4, \
min_filter_len_x4, dst_incr_div, src_incr, \
phase_mask, dst_end, filter_bank
; use red-zone for variable storage
%define ctx_stackq [rsp-0x8]
%define src_stackq [rsp-0x10]
%if WIN64
%define update_context_stackd r4m
%else ; unix64
%define update_context_stackd [rsp-0x14]
%endif
; load as many variables in registers as possible; for the rest, store
; on stack so that we have 'ctx' available as one extra register
mov sized, r3d
mov phase_maskd, [ctxq+ResampleContext.phase_mask]
%if UNIX64
mov update_context_stackd, r4d
%endif
mov indexd, [ctxq+ResampleContext.index]
mov fracd, [ctxq+ResampleContext.frac]
mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
mov filter_bankq, [ctxq+ResampleContext.filter_bank]
mov src_incrd, [ctxq+ResampleContext.src_incr]
mov ctx_stackq, ctxq
mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
shl min_filter_len_x4d, %3
lea dst_endq, [dstq+sizeq*%2]
%if UNIX64
mov ecx, [ctxq+ResampleContext.phase_shift]
mov edi, [ctxq+ResampleContext.filter_alloc]
DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \
filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
src_incr, phase_mask, dst_end, filter_bank
%elif WIN64
mov R9d, [ctxq+ResampleContext.filter_alloc]
mov ecx, [ctxq+ResampleContext.phase_shift]
DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \
filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
src_incr, phase_mask, dst_end, filter_bank
%endif
; bias src/filter_bank by the (negated) filter length so the inner loop can
; count min_filter_count_x4 from -len*bps up to 0 and exit on sign flip
neg min_filter_len_x4q
sub filter_bankq, min_filter_len_x4q
sub srcq, min_filter_len_x4q
mov src_stackq, srcq
%else ; x86-32
cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
index, min_filter_length_x4, filter_bank
; push temp variables to stack
%define ctx_stackq r0mp
%define src_stackq r2mp
%define update_context_stackd r4m
mov dstq, r1mp
mov r3, r3mp
lea r3, [dstq+r3*%2]
PUSH dword [ctxq+ResampleContext.dst_incr_div]
PUSH dword [ctxq+ResampleContext.dst_incr_mod]
PUSH dword [ctxq+ResampleContext.filter_alloc]
PUSH r3
PUSH dword [ctxq+ResampleContext.phase_mask]
PUSH dword [ctxq+ResampleContext.src_incr]
mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
mov indexd, [ctxq+ResampleContext.index]
shl min_filter_length_x4d, %3
mov fracd, [ctxq+ResampleContext.frac]
neg min_filter_length_x4q
mov filter_bankq, [ctxq+ResampleContext.filter_bank]
sub r2mp, min_filter_length_x4q
sub filter_bankq, min_filter_length_x4q
PUSH min_filter_length_x4q
PUSH filter_bankq
mov phase_shiftd, [ctxq+ResampleContext.phase_shift]
DEFINE_ARGS src, phase_shift, dst, frac, index, min_filter_count_x4, filter
; stack slot map for the PUSHes above (offsets must match PUSH order)
%define filter_bankq dword [rsp+0x0]
%define min_filter_length_x4q dword [rsp+0x4]
%define src_incrd dword [rsp+0x8]
%define phase_maskd dword [rsp+0xc]
%define dst_endq dword [rsp+0x10]
%define filter_allocd dword [rsp+0x14]
%define dst_incr_modd dword [rsp+0x18]
%define dst_incr_divd dword [rsp+0x1c]
mov srcq, r2mp
%endif
; outer loop: produces one destination sample per iteration
.loop:
; filter = filter_bank + (index * filter_alloc) entries: select the phase
mov filterd, filter_allocd
imul filterd, indexd
%if ARCH_X86_64
mov min_filter_count_x4q, min_filter_len_x4q
lea filterq, [filter_bankq+filterq*%2]
%else ; x86-32
mov min_filter_count_x4q, filter_bankq
lea filterq, [min_filter_count_x4q+filterq*%2]
mov min_filter_count_x4q, min_filter_length_x4q
%endif
%ifidn %1, int16
; int16 accumulator starts at 0x4000 = rounding bias for the final >>15
movd m0, [pd_0x4000]
%else ; float/double
xorps m0, m0, m0
%endif
align 16
; inner loop: multiply-accumulate src against the selected filter phase;
; min_filter_count_x4 runs from -filter_len*bps to 0 (loop ends when the
; counter becomes non-negative)
.inner_loop:
movu m1, [srcq+min_filter_count_x4q*1]
%ifidn %1, int16
%if cpuflag(xop)
vpmadcswd m0, m1, [filterq+min_filter_count_x4q*1], m0
%else
pmaddwd m1, [filterq+min_filter_count_x4q*1]
paddd m0, m1
%endif
%else ; float/double
%if cpuflag(fma4) || cpuflag(fma3)
fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0
%else
mulp%4 m1, m1, [filterq+min_filter_count_x4q*1]
addp%4 m0, m0, m1
%endif ; cpuflag
%endif
add min_filter_count_x4q, mmsize
js .inner_loop
; horizontal sum of the accumulator, scale, store one output sample;
; frac/index stepping is interleaved to hide latency
%ifidn %1, int16
HADDD m0, m1
psrad m0, 15
add fracd, dst_incr_modd
packssdw m0, m0
add indexd, dst_incr_divd
movd [dstq], m0
%else ; float/double
; horizontal sum & store
%if mmsize == 32
vextractf128 xm1, m0, 0x1
addps xm0, xm1
%endif
movhlps xm1, xm0
%ifidn %1, float
addps xm0, xm1
shufps xm1, xm0, xm0, q0001
%endif
add fracd, dst_incr_modd
addp%4 xm0, xm1
add indexd, dst_incr_divd
movs%4 [dstq], xm0
%endif
; carry the fractional position into index when frac >= src_incr
cmp fracd, src_incrd
jl .skip
sub fracd, src_incrd
inc indexd
%if UNIX64
DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \
index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
src_incr, phase_mask, dst_end, filter_bank
%elif WIN64
DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \
index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
src_incr, phase_mask, dst_end, filter_bank
%else ; x86-32
DEFINE_ARGS src, phase_shift, dst, frac, index, index_incr
%endif
.skip:
; advance src by (index >> phase_shift) whole input samples and keep only
; the sub-phase bits of index
mov index_incrd, indexd
add dstq, %2
and indexd, phase_maskd
sar index_incrd, phase_shiftb
lea srcq, [srcq+index_incrq*%2]
cmp dstq, dst_endq
jne .loop
%if ARCH_X86_64
DEFINE_ARGS ctx, dst, src, phase_shift, index, frac
%else ; x86-32
DEFINE_ARGS src, ctx, update_context, frac, index
%endif
cmp dword update_context_stackd, 0
jz .skip_store
; strictly speaking, the function should always return the consumed
; number of bytes; however, we only use the value if update_context
; is true, so let's just leave it uninitialized otherwise
mov ctxq, ctx_stackq
movifnidn rax, srcq
mov [ctxq+ResampleContext.frac ], fracd
sub rax, src_stackq
mov [ctxq+ResampleContext.index], indexd
shr rax, %3
.skip_store:
%if ARCH_X86_32
ADD rsp, 0x20
%endif
RET
; int resample_linear_$format(ResampleContext *ctx, float *dst,
; const float *src, int size, int update_ctx)
%if ARCH_X86_64 ; unix64 and win64
%if UNIX64
cglobal resample_linear_%1, 0, 15, 5, ctx, dst, phase_mask, phase_shift, index, frac, \
size, dst_incr_mod, min_filter_count_x4, \
min_filter_len_x4, dst_incr_div, src_incr, \
src, dst_end, filter_bank
mov srcq, r2mp
%else ; win64
cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_shift, index, frac, \
size, dst_incr_mod, min_filter_count_x4, \
min_filter_len_x4, dst_incr_div, src_incr, \
dst, dst_end, filter_bank
mov dstq, r1mp
%endif
; use red-zone for variable storage
%define ctx_stackq [rsp-0x8]
%define src_stackq [rsp-0x10]
%define phase_mask_stackd [rsp-0x14]
%if WIN64
%define update_context_stackd r4m
%else ; unix64
%define update_context_stackd [rsp-0x18]
%endif
; load as many variables in registers as possible; for the rest, store
; on stack so that we have 'ctx' available as one extra register
mov sized, r3d
mov phase_maskd, [ctxq+ResampleContext.phase_mask]
%if UNIX64
mov update_context_stackd, r4d
%endif
mov indexd, [ctxq+ResampleContext.index]
mov fracd, [ctxq+ResampleContext.frac]
mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
mov filter_bankq, [ctxq+ResampleContext.filter_bank]
mov src_incrd, [ctxq+ResampleContext.src_incr]
mov ctx_stackq, ctxq
mov phase_mask_stackd, phase_maskd
mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
%ifidn %1, int16
; m4 = 0x4000 rounding bias, preloaded into both accumulators per sample
movd m4, [pd_0x4000]
%else ; float/double
; m4 = 1.0 / src_incr, the interpolation weight scale for frac
cvtsi2s%4 xm0, src_incrd
movs%4 xm4, [%5]
divs%4 xm4, xm0
%endif
mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
shl min_filter_len_x4d, %3
lea dst_endq, [dstq+sizeq*%2]
%if UNIX64
mov ecx, [ctxq+ResampleContext.phase_shift]
mov edi, [ctxq+ResampleContext.filter_alloc]
DEFINE_ARGS filter_alloc, dst, filter2, phase_shift, index, frac, filter1, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, src, dst_end, filter_bank
%elif WIN64
mov R9d, [ctxq+ResampleContext.filter_alloc]
mov ecx, [ctxq+ResampleContext.phase_shift]
DEFINE_ARGS phase_shift, filter2, src, filter_alloc, index, frac, filter1, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, dst, dst_end, filter_bank
%endif
neg min_filter_len_x4q
sub filter_bankq, min_filter_len_x4q
sub srcq, min_filter_len_x4q
mov src_stackq, srcq
%else ; x86-32
cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
frac, index, dst, filter_bank
; push temp variables to stack
%define ctx_stackq r0mp
%define src_stackq r2mp
%define update_context_stackd r4m
mov dstq, r1mp
mov r3, r3mp
lea r3, [dstq+r3*%2]
PUSH dword [ctxq+ResampleContext.dst_incr_div]
PUSH r3
mov r3, dword [ctxq+ResampleContext.filter_alloc]
PUSH dword [ctxq+ResampleContext.dst_incr_mod]
PUSH r3
shl r3, %3
PUSH r3
mov r3, dword [ctxq+ResampleContext.src_incr]
PUSH dword [ctxq+ResampleContext.phase_mask]
PUSH r3d
%ifidn %1, int16
movd m4, [pd_0x4000]
%else ; float/double
cvtsi2s%4 xm0, r3d
movs%4 xm4, [%5]
divs%4 xm4, xm0
%endif
mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
mov indexd, [ctxq+ResampleContext.index]
shl min_filter_length_x4d, %3
mov fracd, [ctxq+ResampleContext.frac]
neg min_filter_length_x4q
mov filter_bankq, [ctxq+ResampleContext.filter_bank]
sub r2mp, min_filter_length_x4q
sub filter_bankq, min_filter_length_x4q
PUSH min_filter_length_x4q
PUSH filter_bankq
PUSH dword [ctxq+ResampleContext.phase_shift]
DEFINE_ARGS filter1, min_filter_count_x4, filter2, frac, index, dst, src
; stack slot map for the PUSHes above (offsets must match PUSH order)
%define phase_shift_stackd dword [rsp+0x0]
%define filter_bankq dword [rsp+0x4]
%define min_filter_length_x4q dword [rsp+0x8]
%define src_incrd dword [rsp+0xc]
%define phase_mask_stackd dword [rsp+0x10]
%define filter_alloc_x4q dword [rsp+0x14]
%define filter_allocd dword [rsp+0x18]
%define dst_incr_modd dword [rsp+0x1c]
%define dst_endq dword [rsp+0x20]
%define dst_incr_divd dword [rsp+0x24]
mov srcq, r2mp
%endif
; outer loop: one destination sample per iteration, running two adjacent
; filter phases (filter1 = phase 'index', filter2 = next phase)
.loop:
mov filter1d, filter_allocd
imul filter1d, indexd
%if ARCH_X86_64
mov min_filter_count_x4q, min_filter_len_x4q
lea filter1q, [filter_bankq+filter1q*%2]
lea filter2q, [filter1q+filter_allocq*%2]
%else ; x86-32
mov min_filter_count_x4q, filter_bankq
lea filter1q, [min_filter_count_x4q+filter1q*%2]
mov min_filter_count_x4q, min_filter_length_x4q
mov filter2q, filter1q
add filter2q, filter_alloc_x4q
%endif
%ifidn %1, int16
mova m0, m4
mova m2, m4
%else ; float/double
xorps m0, m0, m0
xorps m2, m2, m2
%endif
align 16
; inner loop: m0 accumulates src*filter1, m2 accumulates src*filter2
.inner_loop:
movu m1, [srcq+min_filter_count_x4q*1]
%ifidn %1, int16
%if cpuflag(xop)
vpmadcswd m2, m1, [filter2q+min_filter_count_x4q*1], m2
vpmadcswd m0, m1, [filter1q+min_filter_count_x4q*1], m0
%else
pmaddwd m3, m1, [filter2q+min_filter_count_x4q*1]
pmaddwd m1, [filter1q+min_filter_count_x4q*1]
paddd m2, m3
paddd m0, m1
%endif ; cpuflag
%else ; float/double
%if cpuflag(fma4) || cpuflag(fma3)
fmaddp%4 m2, m1, [filter2q+min_filter_count_x4q*1], m2
fmaddp%4 m0, m1, [filter1q+min_filter_count_x4q*1], m0
%else
mulp%4 m3, m1, [filter2q+min_filter_count_x4q*1]
mulp%4 m1, m1, [filter1q+min_filter_count_x4q*1]
addp%4 m2, m2, m3
addp%4 m0, m0, m1
%endif ; cpuflag
%endif
add min_filter_count_x4q, mmsize
js .inner_loop
%ifidn %1, int16
; horizontal sums of both accumulators, then
; out = val1 + (val2 - val1) * frac / src_incr (integer path via imul/idiv)
%if mmsize == 16
%if cpuflag(xop)
vphadddq m2, m2
vphadddq m0, m0
%endif
pshufd m3, m2, q0032
pshufd m1, m0, q0032
paddd m2, m3
paddd m0, m1
%endif
%if notcpuflag(xop)
PSHUFLW m3, m2, q0032
PSHUFLW m1, m0, q0032
paddd m2, m3
paddd m0, m1
%endif
psubd m2, m0
; This is probably a really bad idea on atom and other machines with a
; long transfer latency between GPRs and XMMs (atom). However, it does
; make the clip a lot simpler...
movd eax, m2
add indexd, dst_incr_divd
imul fracd
idiv src_incrd
movd m1, eax
add fracd, dst_incr_modd
paddd m0, m1
psrad m0, 15
packssdw m0, m0
movd [dstq], m0
; note that for imul/idiv, I need to move filter to edx/eax for each:
; - 32bit: eax=r0[filter1], edx=r2[filter2]
; - win64: eax=r6[filter1], edx=r1[todo]
; - unix64: eax=r6[filter1], edx=r2[todo]
%else ; float/double
; val += (v2 - val) * (FELEML) frac / c->src_incr;
%if mmsize == 32
vextractf128 xm1, m0, 0x1
vextractf128 xm3, m2, 0x1
addps xm0, xm1
addps xm2, xm3
%endif
cvtsi2s%4 xm1, fracd
subp%4 xm2, xm0
mulp%4 xm1, xm4
shufp%4 xm1, xm1, q0000
%if cpuflag(fma4) || cpuflag(fma3)
fmaddp%4 xm0, xm2, xm1, xm0
%else
mulp%4 xm2, xm1
addp%4 xm0, xm2
%endif ; cpuflag
; horizontal sum & store
movhlps xm1, xm0
%ifidn %1, float
addps xm0, xm1
shufps xm1, xm0, xm0, q0001
%endif
add fracd, dst_incr_modd
addp%4 xm0, xm1
add indexd, dst_incr_divd
movs%4 [dstq], xm0
%endif
; carry the fractional position into index when frac >= src_incr
cmp fracd, src_incrd
jl .skip
sub fracd, src_incrd
inc indexd
%if UNIX64
DEFINE_ARGS filter_alloc, dst, filter2, phase_shift, index, frac, index_incr, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, src, dst_end, filter_bank
%elif WIN64
DEFINE_ARGS phase_shift, filter2, src, filter_alloc, index, frac, index_incr, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, dst, dst_end, filter_bank
%else ; x86-32
DEFINE_ARGS filter1, phase_shift, index_incr, frac, index, dst, src
%endif
.skip:
%if ARCH_X86_32
mov phase_shiftd, phase_shift_stackd
%endif
; advance src by (index >> phase_shift) samples; keep sub-phase bits of index
mov index_incrd, indexd
add dstq, %2
and indexd, phase_mask_stackd
sar index_incrd, phase_shiftb
lea srcq, [srcq+index_incrq*%2]
cmp dstq, dst_endq
jne .loop
%if UNIX64
DEFINE_ARGS ctx, dst, filter2, phase_shift, index, frac, index_incr, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, src, dst_end, filter_bank
%elif WIN64
DEFINE_ARGS ctx, filter2, src, phase_shift, index, frac, index_incr, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, dst, dst_end, filter_bank
%else ; x86-32
DEFINE_ARGS filter1, ctx, update_context, frac, index, dst, src
%endif
cmp dword update_context_stackd, 0
jz .skip_store
; strictly speaking, the function should always return the consumed
; number of bytes; however, we only use the value if update_context
; is true, so let's just leave it uninitialized otherwise
mov ctxq, ctx_stackq
movifnidn rax, srcq
mov [ctxq+ResampleContext.frac ], fracd
sub rax, src_stackq
mov [ctxq+ResampleContext.index], indexd
shr rax, %3
.skip_store:
%if ARCH_X86_32
ADD rsp, 0x28
%endif
RET
%endmacro
; Instantiate resample_common_*/resample_linear_* for every supported
; (format, instruction set) combination.  Arguments: format, bytes/sample,
; log2(bytes/sample), and for float/double the op suffix plus 1.0 constant.
INIT_XMM sse
RESAMPLE_FNS float, 4, 2, s, pf_1
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
%if HAVE_FMA4_EXTERNAL
INIT_XMM fma4
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
; mmxext int16 variant is only built (and only selected) on 32-bit x86
%if ARCH_X86_32
INIT_MMX mmxext
RESAMPLE_FNS int16, 2, 1
%endif
INIT_XMM sse2
RESAMPLE_FNS int16, 2, 1
%if HAVE_XOP_EXTERNAL
INIT_XMM xop
RESAMPLE_FNS int16, 2, 1
%endif
INIT_XMM sse2
RESAMPLE_FNS double, 8, 3, d, pdbl_1
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswresample/x86/resample_init.c |
---|
0,0 → 1,90 |
/* |
* audio resampling |
* Copyright (c) 2004-2012 Michael Niedermayer <michaelni@gmx.at> |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
/** |
* @file |
* audio resampling |
* @author Michael Niedermayer <michaelni@gmx.at> |
*/ |
#include "libavutil/x86/cpu.h" |
#include "libswresample/resample.h" |
/* Declare the yasm-implemented resampler entry points (resample.asm) for
 * one sample type and one instruction-set suffix; both the common and the
 * linear-interpolation variants share the same signature. */
#define RESAMPLE_FUNCS(type, opt) \
int ff_resample_common_##type##_##opt(ResampleContext *c, void *dst, \
const void *src, int sz, int upd); \
int ff_resample_linear_##type##_##opt(ResampleContext *c, void *dst, \
const void *src, int sz, int upd)
RESAMPLE_FUNCS(int16, mmxext);
RESAMPLE_FUNCS(int16, sse2);
RESAMPLE_FUNCS(int16, xop);
RESAMPLE_FUNCS(float, sse);
RESAMPLE_FUNCS(float, avx);
RESAMPLE_FUNCS(float, fma3);
RESAMPLE_FUNCS(float, fma4);
RESAMPLE_FUNCS(double, sse2);
/*
 * Install the best available x86 SIMD resampling kernel for the context's
 * sample format.  The capability checks run from least to most preferred;
 * each successful check overwrites the previous choice, so the strongest
 * supported variant is the one left installed.  Formats without a SIMD
 * implementation leave c->dsp.resample untouched.
 */
av_cold void swri_resample_dsp_x86_init(ResampleContext *c)
{
    int av_unused mm_flags = av_get_cpu_flags();
    /* every kernel comes in a common and a linear-interpolation flavor */
    const int linear = c->linear;

    switch (c->format) {
    case AV_SAMPLE_FMT_S16P:
        /* the mmxext variant is only assembled on 32-bit builds */
        if (ARCH_X86_32 && EXTERNAL_MMXEXT(mm_flags))
            c->dsp.resample = linear ? ff_resample_linear_int16_mmxext
                                     : ff_resample_common_int16_mmxext;
        if (EXTERNAL_SSE2(mm_flags))
            c->dsp.resample = linear ? ff_resample_linear_int16_sse2
                                     : ff_resample_common_int16_sse2;
        if (EXTERNAL_XOP(mm_flags))
            c->dsp.resample = linear ? ff_resample_linear_int16_xop
                                     : ff_resample_common_int16_xop;
        break;
    case AV_SAMPLE_FMT_FLTP:
        if (EXTERNAL_SSE(mm_flags))
            c->dsp.resample = linear ? ff_resample_linear_float_sse
                                     : ff_resample_common_float_sse;
        if (EXTERNAL_AVX_FAST(mm_flags))
            c->dsp.resample = linear ? ff_resample_linear_float_avx
                                     : ff_resample_common_float_avx;
        /* FMA3 uses 256-bit registers; skip it on AVX-slow CPUs */
        if (EXTERNAL_FMA3(mm_flags) && !(mm_flags & AV_CPU_FLAG_AVXSLOW))
            c->dsp.resample = linear ? ff_resample_linear_float_fma3
                                     : ff_resample_common_float_fma3;
        if (EXTERNAL_FMA4(mm_flags))
            c->dsp.resample = linear ? ff_resample_linear_float_fma4
                                     : ff_resample_common_float_fma4;
        break;
    case AV_SAMPLE_FMT_DBLP:
        if (EXTERNAL_SSE2(mm_flags))
            c->dsp.resample = linear ? ff_resample_linear_double_sse2
                                     : ff_resample_common_double_sse2;
        break;
    }
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswresample/x86/w64xmmtest.c |
---|
0,0 → 1,29 |
/* |
* check XMM registers for clobbers on Win64 |
* Copyright (c) 2013 Martin Storsjo |
* |
* This file is part of FFmpeg. |
* |
* FFmpeg is free software; you can redistribute it and/or |
* modify it under the terms of the GNU Lesser General Public |
* License as published by the Free Software Foundation; either |
* version 2.1 of the License, or (at your option) any later version. |
* |
* FFmpeg is distributed in the hope that it will be useful, |
* but WITHOUT ANY WARRANTY; without even the implied warranty of |
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
* Lesser General Public License for more details. |
* |
* You should have received a copy of the GNU Lesser General Public |
* License along with FFmpeg; if not, write to the Free Software |
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
*/ |
#include "libswresample/swresample.h" |
#include "libavutil/x86/w64xmmtest.h" |
/* Win64 XMM-clobber test shim: 'wrap' emits a replacement swr_convert that
 * forwards to the real implementation via testxmmclobbers, which verifies
 * the callee preserved the non-volatile xmm6-xmm15 registers (see
 * libavutil/x86/w64xmmtest.h).  Built only with CONFIG_XMM_CLOBBER_TEST. */
wrap(swr_convert(struct SwrContext *s, uint8_t **out, int out_count,
const uint8_t **in , int in_count))
{
testxmmclobbers(swr_convert, s, out, out_count, in, in_count);
}