Subversion Repositories Kolibri OS

Compare Revisions

Regard whitespace Rev 6146 → Rev 6147

/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswresample/x86/Makefile
0,0 → 1,9
# Objects appended to the YASM-OBJS list: audio conversion, rematrixing and
# resampling for x86.
YASM-OBJS += x86/audio_convert.o\
x86/rematrix.o\
x86/resample.o\
 
# Objects for the matching *_init files (audio_convert_init.c and
# rematrix_init.c live next to this Makefile).
OBJS += x86/audio_convert_init.o\
x86/rematrix_init.o\
x86/resample_init.o\
 
# Appended to the OBJS-<value> list chosen by the value of
# CONFIG_XMM_CLOBBER_TEST.
OBJS-$(CONFIG_XMM_CLOBBER_TEST) += x86/w64xmmtest.o
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswresample/x86/audio_convert.asm
0,0 → 1,739
;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
SECTION_RODATA 32
; Scale factors, each stored as 8 identical dwords (32 bytes) so that a single
; load fills a whole vector register.
; 2^-31: multiplier used by the int32 -> float (and int16 -> float) paths
flt2pm31: times 8 dd 4.6566129e-10
; 2^31: multiplier used by the float -> int32 path
flt2p31 : times 8 dd 2147483648.0
; 2^15: multiplier used by the float -> int16 path
flt2p15 : times 8 dd 32768.0
 
; Byte shuffle mask used with pshufb in UNPACK_2CH: moves the even-numbered
; 16-bit words to bytes 0-7 and the odd-numbered words to bytes 8-15.
word_unpack_shuf : db 0, 1, 4, 5, 8, 9,12,13, 2, 3, 6, 7,10,11,14,15
 
SECTION .text
 
 
;to, from, a/u, log2_outsize, log2_insize, conversion macro, init macro
;------------------------------------------------------------------------------
; PACK_2CH to, from, a/u, log2_outsize, log2_insize, conv_macro, init_macro
;
; Emits pack_2ch_<from>_to_<to>_<a|u>(uint8_t **dst, const uint8_t **src,
;                                     int len)
; which interleaves the two planes src[0] and src[1] into dst[0], converting
; every sample with conv_macro. init_macro runs once before the loop.
;
; The "a" variant uses aligned loads/stores; if any of the three pointers is
; not a multiple of mmsize it jumps into the body of the "u" variant.
;
; The loop always runs at least once and handles a fixed number of samples
; per iteration, so there is no tail handling here.
;------------------------------------------------------------------------------
%macro PACK_2CH 5-7
cglobal pack_2ch_%2_to_%1_%3, 3, 4, 6, dst, src, len, src2
    mov         src2q, [srcq+gprsize]       ; second input plane
    mov         srcq , [srcq]               ; first input plane
    mov         dstq , [dstq]               ; interleaved output buffer
%ifidn %3, a
    test        dstq, mmsize-1
    jne         pack_2ch_%2_to_%1_u_int %+ SUFFIX
    test        srcq, mmsize-1
    jne         pack_2ch_%2_to_%1_u_int %+ SUFFIX
    test        src2q, mmsize-1
    jne         pack_2ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_2ch_%2_to_%1_u_int %+ SUFFIX:
%endif
%if ARCH_X86_64
    ; len is a 32-bit C int; the upper half of the register is unspecified,
    ; so widen it before it is used in 64-bit address arithmetic.
    movsxd      lenq, lend
%endif
    ; Point every buffer at its end and count a negative index up to zero.
    lea         srcq , [srcq + (1<<%5)*lenq]
    lea         src2q, [src2q + (1<<%5)*lenq]
    lea         dstq , [dstq + (2<<%4)*lenq]
    neg         lenq
    %7 m0,m1,m2,m3,m4,m5
.next:
%if %4 >= %5
    ; Output samples are at least as wide as input samples:
    ; interleave the two planes first, then convert.
    mov%3       m0, [ srcq +(1<<%5)*lenq]
    mova        m1, m0
    mov%3       m2, [ src2q+(1<<%5)*lenq]
%if %5 == 1
    punpcklwd   m0, m2
    punpckhwd   m1, m2
%else
    punpckldq   m0, m2
    punpckhdq   m1, m2
%endif
    %6 m0,m1,m2,m3,m4,m5
%else
    ; Output samples are narrower than input samples:
    ; convert two vectors per plane first, then interleave the 16-bit results.
    mov%3       m0, [ srcq +(1<<%5)*lenq]
    mov%3       m1, [mmsize + srcq +(1<<%5)*lenq]
    mov%3       m2, [ src2q+(1<<%5)*lenq]
    mov%3       m3, [mmsize + src2q+(1<<%5)*lenq]
    %6 m0,m1,m2,m3,m4,m5
    mova        m2, m0
    punpcklwd   m0, m1
    punpckhwd   m2, m1
    SWAP 1,2
%endif
    mov%3       [ dstq+(2<<%4)*lenq], m0
    mov%3       [ mmsize + dstq+(2<<%4)*lenq], m1
%if %4 > %5
    ; Widening conversions produce four output vectors per iteration.
    mov%3       [2*mmsize + dstq+(2<<%4)*lenq], m2
    mov%3       [3*mmsize + dstq+(2<<%4)*lenq], m3
    add         lenq, 4*mmsize/(2<<%4)
%else
    add         lenq, 2*mmsize/(2<<%4)
%endif
    jl          .next
    REP_RET
%endmacro
 
;------------------------------------------------------------------------------
; UNPACK_2CH to, from, a/u, log2_outsize, log2_insize, conv_macro, init_macro
;
; Emits unpack_2ch_<from>_to_<to>_<a|u>(uint8_t **dst, const uint8_t **src,
;                                       int len)
; which splits the interleaved stereo buffer src[0] into the two planes
; dst[0] and dst[1], converting every sample with conv_macro. init_macro runs
; once before the loop.
;
; The "a" variant uses aligned loads/stores; if any of the three pointers is
; not a multiple of mmsize it jumps into the body of the "u" variant.
;
; The loop always runs at least once and handles a fixed number of samples
; per iteration, so there is no tail handling here.
;------------------------------------------------------------------------------
%macro UNPACK_2CH 5-7
cglobal unpack_2ch_%2_to_%1_%3, 3, 4, 7, dst, src, len, dst2
    mov         dst2q, [dstq+gprsize]       ; second output plane
    mov         srcq , [srcq]               ; interleaved input buffer
    mov         dstq , [dstq]               ; first output plane
%ifidn %3, a
    test        dstq, mmsize-1
    jne         unpack_2ch_%2_to_%1_u_int %+ SUFFIX
    test        srcq, mmsize-1
    jne         unpack_2ch_%2_to_%1_u_int %+ SUFFIX
    test        dst2q, mmsize-1
    jne         unpack_2ch_%2_to_%1_u_int %+ SUFFIX
%else
unpack_2ch_%2_to_%1_u_int %+ SUFFIX:
%endif
%if ARCH_X86_64
    ; len is a 32-bit C int; the upper half of the register is unspecified,
    ; so widen it before it is used in 64-bit address arithmetic.
    movsxd      lenq, lend
%endif
    ; Point every buffer at its end and count a negative index up to zero.
    lea         srcq , [srcq + (2<<%5)*lenq]
    lea         dstq , [dstq + (1<<%4)*lenq]
    lea         dst2q, [dst2q + (1<<%4)*lenq]
    neg         lenq
    %7 m0,m1,m2,m3,m4,m5
%if %5 == 1
%ifidn SUFFIX, _ssse3
    ; The shuffle mask is only read by the pshufb path below.
    mova        m6, [word_unpack_shuf]
%endif
%endif
.next:
    mov%3       m0, [ srcq +(2<<%5)*lenq]
    mov%3       m2, [ mmsize + srcq +(2<<%5)*lenq]
%if %5 == 1
    ; 16-bit input: separate even (channel 0) and odd (channel 1) words.
%ifidn SUFFIX, _ssse3
    pshufb      m0, m6
    mova        m1, m0
    pshufb      m2, m6
    punpcklqdq  m0,m2
    punpckhqdq  m1,m2
%else
    ; Without pshufb, three rounds of word interleaving give the same split.
    mova        m1, m0
    punpcklwd   m0,m2
    punpckhwd   m1,m2

    mova        m2, m0
    punpcklwd   m0,m1
    punpckhwd   m2,m1

    mova        m1, m0
    punpcklwd   m0,m2
    punpckhwd   m1,m2
%endif
%else
    ; 32-bit input: pick even dwords into m0 and odd dwords into m1.
    mova        m1, m0
    shufps      m0, m2, 10001000b
    shufps      m1, m2, 11011101b
%endif
%if %4 < %5
    ; Narrowing conversion: fetch and split two more input vectors so the
    ; conversion macro gets two vectors per channel.
    mov%3       m2, [2*mmsize + srcq +(2<<%5)*lenq]
    mova        m3, m2
    mov%3       m4, [3*mmsize + srcq +(2<<%5)*lenq]
    shufps      m2, m4, 10001000b
    shufps      m3, m4, 11011101b
    SWAP 1,2
%endif
    %6 m0,m1,m2,m3,m4,m5
    mov%3       [ dstq+(1<<%4)*lenq], m0
%if %4 > %5
    ; Widening conversion: two output vectors per channel.
    mov%3       [ dst2q+(1<<%4)*lenq], m2
    mov%3       [ mmsize + dstq+(1<<%4)*lenq], m1
    mov%3       [ mmsize + dst2q+(1<<%4)*lenq], m3
    add         lenq, 2*mmsize/(1<<%4)
%else
    mov%3       [ dst2q+(1<<%4)*lenq], m1
    add         lenq, mmsize/(1<<%4)
%endif
    jl          .next
    REP_RET
%endmacro
 
;------------------------------------------------------------------------------
; CONV to, from, a/u, log2_outsize, log2_insize, conv_macro, init_macro
;
; Emits <from>_to_<to>_<a|u>(uint8_t **dst, const uint8_t **src, int len)
; which converts the samples of src[0] into dst[0] with conv_macro, without
; changing the channel layout. init_macro runs once before the loop.
;
; The "a" variant uses aligned loads/stores; if dst or src is not a multiple
; of mmsize it jumps into the body of the "u" variant.
;
; The loop always runs at least once and handles a fixed number of samples
; per iteration, so there is no tail handling here.
;------------------------------------------------------------------------------
%macro CONV 5-7
cglobal %2_to_%1_%3, 3, 3, 6, dst, src, len
    mov         srcq , [srcq]
    mov         dstq , [dstq]
%ifidn %3, a
    test        dstq, mmsize-1
    jne         %2_to_%1_u_int %+ SUFFIX
    test        srcq, mmsize-1
    jne         %2_to_%1_u_int %+ SUFFIX
%else
%2_to_%1_u_int %+ SUFFIX:
%endif
%if ARCH_X86_64
    ; len is a 32-bit C int; the upper half of the register is unspecified,
    ; so widen it before it is used in 64-bit address arithmetic.
    movsxd      lenq, lend
%endif
    ; Point both buffers at their end and count a negative index up to zero.
    lea         srcq , [srcq + (1<<%5)*lenq]
    lea         dstq , [dstq + (1<<%4)*lenq]
    neg         lenq
    %7 m0,m1,m2,m3,m4,m5
.next:
    mov%3       m0, [ srcq +(1<<%5)*lenq]
    mov%3       m1, [ mmsize + srcq +(1<<%5)*lenq]
%if %4 < %5
    ; Narrowing conversion: four input vectors become two output vectors.
    mov%3       m2, [2*mmsize + srcq +(1<<%5)*lenq]
    mov%3       m3, [3*mmsize + srcq +(1<<%5)*lenq]
%endif
    %6 m0,m1,m2,m3,m4,m5
    mov%3       [ dstq+(1<<%4)*lenq], m0
    mov%3       [ mmsize + dstq+(1<<%4)*lenq], m1
%if %4 > %5
    ; Widening conversion: two input vectors become four output vectors.
    mov%3       [2*mmsize + dstq+(1<<%4)*lenq], m2
    mov%3       [3*mmsize + dstq+(1<<%4)*lenq], m3
    add         lenq, 4*mmsize/(1<<%4)
%else
    add         lenq, 2*mmsize/(1<<%4)
%endif
    jl          .next
%if mmsize == 8
    ; MMX registers were used: leave MMX state before returning.
    emms
    RET
%else
    REP_RET
%endif
%endmacro
 
;------------------------------------------------------------------------------
; PACK_6CH to, from, a/u, log2_outsize, log2_insize, conv_macro, init_macro
;
; Emits pack_6ch_<from>_to_<to>_<a|u>, which interleaves the six planes
; src[0..5] into dst[0], applying conv_macro to the data in the SSE/AVX path.
; The MMX path only moves data (no conversion macro is invoked there).
; The "a" variant jumps into the "u" body if any pointer is not a multiple of
; mmsize. Each iteration reads mmsize bytes from every plane, writes
; mmsize*6 bytes and subtracts mmsize/4 from len.
;------------------------------------------------------------------------------
%macro PACK_6CH 5-7
cglobal pack_6ch_%2_to_%1_%3, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
; len is the third C argument (r2): copied into its own register on x86-64,
; used straight from its memory slot on x86-32.
%if ARCH_X86_64
mov lend, r2d
%else
%define lend dword r2m
%endif
; Fetch the six plane pointers; srcq is overwritten last because it holds the
; pointer array until then.
mov src1q, [srcq+1*gprsize]
mov src2q, [srcq+2*gprsize]
mov src3q, [srcq+3*gprsize]
mov src4q, [srcq+4*gprsize]
mov src5q, [srcq+5*gprsize]
mov srcq, [srcq]
mov dstq, [dstq]
%ifidn %3, a
test dstq, mmsize-1
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
test srcq, mmsize-1
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
test src1q, mmsize-1
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
test src2q, mmsize-1
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
test src3q, mmsize-1
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
test src4q, mmsize-1
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
test src5q, mmsize-1
jne pack_6ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_6ch_%2_to_%1_u_int %+ SUFFIX:
%endif
; Turn planes 1..5 into offsets relative to plane 0 so that only srcq has to
; be advanced inside the loop.
sub src1q, srcq
sub src2q, srcq
sub src3q, srcq
sub src4q, srcq
sub src5q, srcq
; The init macro gets m7 as the register for its constant.
%7 x,x,x,x,m7,x
.loop:
mov%3 m0, [srcq ]
mov%3 m1, [srcq+src1q]
mov%3 m2, [srcq+src2q]
mov%3 m3, [srcq+src3q]
mov%3 m4, [srcq+src4q]
mov%3 m5, [srcq+src5q]
%if cpuflag(sse)
; Rearrange the six per-channel vectors into six vectors of interleaved
; samples (SBUTTERFLYPS comes from the included x86util.asm).
SBUTTERFLYPS 0, 1, 6
SBUTTERFLYPS 2, 3, 6
SBUTTERFLYPS 4, 5, 6
 
%if cpuflag(avx)
blendps m6, m4, m0, 1100b
%else
movaps m6, m4
shufps m4, m0, q3210
SWAP 4,6
%endif
movlhps m0, m2
movhlps m4, m2
%if cpuflag(avx)
blendps m2, m5, m1, 1100b
%else
movaps m2, m5
shufps m5, m1, q3210
SWAP 2,5
%endif
movlhps m1, m3
movhlps m5, m3
 
; Convert two vectors per call; m7 is the constant, m3 a scratch register.
%6 m0,m6,x,x,m7,m3
%6 m4,m1,x,x,m7,m3
%6 m2,m5,x,x,m7,m3
 
mov %+ %3 %+ ps [dstq ], m0
mov %+ %3 %+ ps [dstq+16], m6
mov %+ %3 %+ ps [dstq+32], m4
mov %+ %3 %+ ps [dstq+48], m1
mov %+ %3 %+ ps [dstq+64], m2
mov %+ %3 %+ ps [dstq+80], m5
%else ; mmx
SBUTTERFLY dq, 0, 1, 6
SBUTTERFLY dq, 2, 3, 6
SBUTTERFLY dq, 4, 5, 6
 
movq [dstq ], m0
movq [dstq+ 8], m2
movq [dstq+16], m4
movq [dstq+24], m1
movq [dstq+32], m3
movq [dstq+40], m5
%endif
add srcq, mmsize
add dstq, mmsize*6
; mmsize/4 = number of 4-byte samples handled per plane in one iteration.
sub lend, mmsize/4
jg .loop
%if mmsize == 8
emms
RET
%else
REP_RET
%endif
%endmacro
 
;------------------------------------------------------------------------------
; UNPACK_6CH to, from, a/u, log2_outsize, log2_insize, conv_macro, init_macro
;
; Emits unpack_6ch_<from>_to_<to>_<a|u>, which splits the interleaved buffer
; src[0] into the six planes dst[0..5], applying conv_macro to the data.
; The "a" variant jumps into the "u" body if any pointer is not a multiple of
; mmsize. Each iteration reads mmsize*6 bytes, writes mmsize bytes to every
; plane and subtracts mmsize/4 from len.
;------------------------------------------------------------------------------
%macro UNPACK_6CH 5-7
cglobal unpack_6ch_%2_to_%1_%3, 2, 8, 8, dst, src, dst1, dst2, dst3, dst4, dst5, len
; len is the third C argument (r2): copied into its own register on x86-64,
; used straight from its memory slot on x86-32.
%if ARCH_X86_64
mov lend, r2d
%else
%define lend dword r2m
%endif
; Fetch the six plane pointers; dstq is overwritten last because it holds the
; pointer array until then.
mov dst1q, [dstq+1*gprsize]
mov dst2q, [dstq+2*gprsize]
mov dst3q, [dstq+3*gprsize]
mov dst4q, [dstq+4*gprsize]
mov dst5q, [dstq+5*gprsize]
mov dstq, [dstq]
mov srcq, [srcq]
%ifidn %3, a
test dstq, mmsize-1
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
test srcq, mmsize-1
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
test dst1q, mmsize-1
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
test dst2q, mmsize-1
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
test dst3q, mmsize-1
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
test dst4q, mmsize-1
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
test dst5q, mmsize-1
jne unpack_6ch_%2_to_%1_u_int %+ SUFFIX
%else
unpack_6ch_%2_to_%1_u_int %+ SUFFIX:
%endif
; Turn planes 1..5 into offsets relative to plane 0 so that only dstq has to
; be advanced inside the loop.
sub dst1q, dstq
sub dst2q, dstq
sub dst3q, dstq
sub dst4q, dstq
sub dst5q, dstq
; The init macro gets m7 as the register for its constant.
%7 x,x,x,x,m7,x
.loop:
; Six consecutive 16-byte vectors of interleaved samples.
mov%3 m0, [srcq ]
mov%3 m1, [srcq+16]
mov%3 m2, [srcq+32]
mov%3 m3, [srcq+48]
mov%3 m4, [srcq+64]
mov%3 m5, [srcq+80]
 
; Rearrange so that each of m0..m5 ends up holding one channel
; (SBUTTERFLYPS comes from the included x86util.asm; SWAP only renames
; registers).
SBUTTERFLYPS 0, 3, 6
SBUTTERFLYPS 1, 4, 6
SBUTTERFLYPS 2, 5, 6
SBUTTERFLYPS 0, 4, 6
SBUTTERFLYPS 3, 2, 6
SBUTTERFLYPS 1, 5, 6
SWAP 1, 4
SWAP 2, 3
 
; Convert two vectors per call; m7 is the constant, m6 a scratch register.
%6 m0,m1,x,x,m7,m6
%6 m2,m3,x,x,m7,m6
%6 m4,m5,x,x,m7,m6
 
mov %+ %3 %+ ps [dstq ], m0
mov %+ %3 %+ ps [dstq+dst1q], m1
mov %+ %3 %+ ps [dstq+dst2q], m2
mov %+ %3 %+ ps [dstq+dst3q], m3
mov %+ %3 %+ ps [dstq+dst4q], m4
mov %+ %3 %+ ps [dstq+dst5q], m5
 
add srcq, mmsize*6
add dstq, mmsize
; mmsize/4 = number of 4-byte samples handled per plane in one iteration.
sub lend, mmsize/4
jg .loop
REP_RET
%endmacro
 
; Number of general-purpose registers requested by PACK_8CH:
; 10 on x86-64; 6 on x86-32, or 7 there when HAVE_ALIGNED_STACK is set.
%define PACK_8CH_GPRS (10 * ARCH_X86_64) + ((6 + HAVE_ALIGNED_STACK) * ARCH_X86_32)
 
;------------------------------------------------------------------------------
; PACK_8CH to, from, a/u, log2_outsize, log2_insize, conv_macro, init_macro
;
; Emits pack_8ch_<from>_to_<to>_<a|u>, which interleaves the eight planes
; src[0..7] into dst[0], applying conv_macro to the data.
; The "a" variant jumps into the "u" body if any pointer is not a multiple of
; mmsize. Each iteration reads mmsize bytes from every plane, writes
; mmsize*8 bytes and subtracts mmsize/4 from len.
;
; x86-32 does not have enough registers for all pointers, so 48 bytes of stack
; are reserved there:
;   [rsp], [rsp+16]  scratch passed to TRANSPOSE8x4D
;   [rsp+32]         src1 offset
;   [rsp+36]         src4 (only without HAVE_ALIGNED_STACK)
;   [rsp+40]         src7
; and src1q/src4q/src7q share register r0 with dst, whose value is kept in its
; argument slot (dstm) while r0 is borrowed.
;------------------------------------------------------------------------------
%macro PACK_8CH 5-7
cglobal pack_8ch_%2_to_%1_%3, 2,PACK_8CH_GPRS,10, ARCH_X86_32*48, dst, src, len, src1, src2, src3, src4, src5, src6, src7
mov dstq, [dstq]
%if ARCH_X86_32
DEFINE_ARGS dst, src, src2, src3, src4, src5, src6
%define lend dword r2m
%define src1q r0q
%define src1m dword [rsp+32]
%if HAVE_ALIGNED_STACK == 0
DEFINE_ARGS dst, src, src2, src3, src5, src6
%define src4q r0q
%define src4m dword [rsp+36]
%endif
%define src7q r0q
%define src7m dword [rsp+40]
; Save the output pointer: r0 is about to be reused for plane pointers.
mov dstm, dstq
%endif
; Fetch the plane pointers from last to first; on x86-32 the ones that share
; r0 are spilled to the stack right after being loaded.
mov src7q, [srcq+7*gprsize]
mov src6q, [srcq+6*gprsize]
%if ARCH_X86_32
mov src7m, src7q
%endif
mov src5q, [srcq+5*gprsize]
mov src4q, [srcq+4*gprsize]
mov src3q, [srcq+3*gprsize]
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
mov src4m, src4q
%endif
mov src2q, [srcq+2*gprsize]
mov src1q, [srcq+1*gprsize]
mov srcq, [srcq]
%ifidn %3, a
%if ARCH_X86_32
test dstmp, mmsize-1
%else
test dstq, mmsize-1
%endif
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
test srcq, mmsize-1
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
test src1q, mmsize-1
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
test src2q, mmsize-1
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
test src3q, mmsize-1
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
test src4m, mmsize-1
%else
test src4q, mmsize-1
%endif
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
test src5q, mmsize-1
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
test src6q, mmsize-1
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
%if ARCH_X86_32
test src7m, mmsize-1
%else
test src7q, mmsize-1
%endif
jne pack_8ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_8ch_%2_to_%1_u_int %+ SUFFIX:
%endif
; Turn planes 1..7 into offsets relative to plane 0 so that only srcq has to
; be advanced inside the loop.
sub src1q, srcq
sub src2q, srcq
sub src3q, srcq
%if ARCH_X86_64 || HAVE_ALIGNED_STACK
sub src4q, srcq
%else
sub src4m, srcq
%endif
sub src5q, srcq
sub src6q, srcq
%if ARCH_X86_64
sub src7q, srcq
%else
mov src1m, src1q
sub src7m, srcq
%endif
 
; Conversion constant: on x86-64 the init macro loads it into m9; on x86-32
; m9 is defined as a memory operand instead (2^31 when the output is int32,
; 2^-31 otherwise) and the init macro is not invoked.
%if ARCH_X86_64
%7 x,x,x,x,m9,x
%elifidn %1, int32
%define m9 [flt2p31]
%else
%define m9 [flt2pm31]
%endif
 
.loop:
mov%3 m0, [srcq ]
mov%3 m1, [srcq+src1q]
mov%3 m2, [srcq+src2q]
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
mov src4q, src4m
%endif
mov%3 m3, [srcq+src3q]
mov%3 m4, [srcq+src4q]
mov%3 m5, [srcq+src5q]
%if ARCH_X86_32
mov src7q, src7m
%endif
mov%3 m6, [srcq+src6q]
mov%3 m7, [srcq+src7q]
 
%if ARCH_X86_64
; m8 serves as the scratch register for the transpose and the conversion.
TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8
 
%6 m0,m1,x,x,m9,m8
%6 m2,m3,x,x,m9,m8
%6 m4,m5,x,x,m9,m8
%6 m6,m7,x,x,m9,m8
 
mov%3 [dstq], m0
%else
; Bring the output pointer back into r0 now that all planes are loaded.
mov dstq, dstm
 
; Only eight vector registers exist here, so the transpose uses two stack
; slots as scratch.
TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, [rsp], [rsp+16], 1
 
; m2 is used as conversion scratch first, then reloaded from [rsp];
; m0 is stored early so that it can serve as scratch for the remaining
; conversions.
%6 m0,m1,x,x,m9,m2
mova m2, [rsp]
mov%3 [dstq], m0
%6 m2,m3,x,x,m9,m0
%6 m4,m5,x,x,m9,m0
%6 m6,m7,x,x,m9,m0
 
%endif
 
mov%3 [dstq+16], m1
mov%3 [dstq+32], m2
mov%3 [dstq+48], m3
mov%3 [dstq+64], m4
mov%3 [dstq+80], m5
mov%3 [dstq+96], m6
mov%3 [dstq+112], m7
 
add srcq, mmsize
add dstq, mmsize*8
%if ARCH_X86_32
; Save the advanced output pointer and put the src1 offset back in r0 for
; the next iteration.
mov dstm, dstq
mov src1q, src1m
%endif
; mmsize/4 = number of 4-byte samples handled per plane in one iteration.
sub lend, mmsize/4
jg .loop
REP_RET
%endmacro
 
; INT16_TO_INT32_N: widens the 16-bit samples in m0 and m1 to 32 bits.
; Each word is placed in the upper half of a dword whose lower half is zero,
; i.e. the sample value is multiplied by 65536.
; In:  m0, m1     Out: m0, m1 (from the old m0), m2, m3 (from m1)
; Clobbers m4. The six macro arguments are accepted but not used.
%macro INT16_TO_INT32_N 6
pxor m2, m2
pxor m3, m3
punpcklwd m2, m1
punpckhwd m3, m1
; SWAP renames the registers: the old m0 is addressed as m4 from here on.
SWAP 4,0
pxor m0, m0
pxor m1, m1
punpcklwd m0, m4
punpckhwd m1, m4
%endmacro
 
; INT32_TO_INT16_N: narrows the 32-bit samples in m0..m3 to 16 bits by keeping
; the upper half of every dword (arithmetic shift right by 16).
; In:  m0, m1, m2, m3     Out: m0 (from m0, m1), m1 (from m2, m3)
; The six macro arguments are accepted but not used.
%macro INT32_TO_INT16_N 6
psrad m0, 16
psrad m1, 16
psrad m2, 16
psrad m3, 16
packssdw m0, m1
packssdw m2, m3
; Rename so that the second result is addressed as m1.
SWAP 1,2
%endmacro
 
; INT32_TO_FLOAT_INIT: loads the 2^-31 scale factor into the register given as
; argument 5.
%macro INT32_TO_FLOAT_INIT 6
mova %5, [flt2pm31]
%endmacro
; INT32_TO_FLOAT_N: converts the int32 samples in arguments 1 and 2 to float
; in place and multiplies them by the constant in argument 5.
%macro INT32_TO_FLOAT_N 6
cvtdq2ps %1, %1
cvtdq2ps %2, %2
mulps %1, %1, %5
mulps %2, %2, %5
%endmacro
 
; FLOAT_TO_INT32_INIT: loads the 2^31 scale factor into the register given as
; argument 5.
%macro FLOAT_TO_INT32_INIT 6
mova %5, [flt2p31]
%endmacro
; FLOAT_TO_INT32_N: converts the float samples in arguments 1 and 2 to int32
; in place. Argument 5 holds the 2^31 constant, argument 6 is a scratch
; register.
%macro FLOAT_TO_INT32_N 6
mulps %1, %5
mulps %2, %5
cvtps2dq %6, %1
; Comparison predicate 5 is "not less than": lanes where the scaled value is
; not below 2^31 become all-ones (-1). Adding that mask to the converted
; integer turns the 0x80000000 produced by cvtps2dq for out-of-range input
; into 0x7FFFFFFF, clamping positive overflow.
cmpps %1, %1, %5, 5
paddd %1, %6
cvtps2dq %6, %2
cmpps %2, %2, %5, 5
paddd %2, %6
%endmacro
 
; INT16_TO_FLOAT_INIT: loads the 2^-31 scale factor into m5 (the register is
; fixed; the macro arguments are not used).
%macro INT16_TO_FLOAT_INIT 6
mova m5, [flt2pm31]
%endmacro
; INT16_TO_FLOAT_N: converts the int16 samples in m0 and m1 to float.
; The samples are first widened with INT16_TO_INT32_N (which leaves them in
; the upper half of each dword), then converted and scaled by m5.
; In:  m0, m1     Out: m0, m1, m2, m3     Clobbers m4; expects m5 = 2^-31.
%macro INT16_TO_FLOAT_N 6
INT16_TO_INT32_N %1,%2,%3,%4,%5,%6
cvtdq2ps m0, m0
cvtdq2ps m1, m1
cvtdq2ps m2, m2
cvtdq2ps m3, m3
mulps m0, m0, m5
mulps m1, m1, m5
mulps m2, m2, m5
mulps m3, m3, m5
%endmacro
 
; FLOAT_TO_INT16_INIT: loads the 2^15 scale factor into m5 (the register is
; fixed; the macro arguments are not used).
%macro FLOAT_TO_INT16_INIT 6
mova m5, [flt2p15]
%endmacro
; FLOAT_TO_INT16_N: converts the float samples in m0..m3 to int16.
; The values are scaled by m5, converted to int32 and packed to words with
; signed saturation.
; In:  m0, m1, m2, m3     Out: m0 (from m0, m1), m1 (from m2, m3)
; Expects m5 = 2^15. Leaves m3 modified.
%macro FLOAT_TO_INT16_N 6
mulps m0, m5
mulps m1, m5
mulps m2, m5
mulps m3, m5
cvtps2dq m0, m0
cvtps2dq m1, m1
packssdw m0, m1
cvtps2dq m1, m2
cvtps2dq m3, m3
packssdw m1, m3
%endmacro
 
; NOP_N: empty macro taking up to six arguments. Passed as the conversion or
; init macro when no sample conversion or setup is needed.
%macro NOP_N 0-6
%endmacro
 
; ---------------------------------------------------------------------------
; Function instantiations. Every macro call emits one function; each routine
; is generated twice, as an unaligned ("u") and an aligned ("a") variant.
; The "u" line must come first because the "a" variant jumps to a label
; defined inside the "u" body.
; Arguments: to, from, a/u, log2 output size, log2 input size,
;            conversion macro, init macro.
; ---------------------------------------------------------------------------

; MMX
INIT_MMX mmx
CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N
 
PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N
 
; SSE: six-channel interleave/deinterleave without sample conversion
INIT_XMM sse
PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N
 
UNPACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
UNPACK_6CH float, float, a, 2, 2, NOP_N, NOP_N
 
; SSE2: integer conversions
INIT_XMM sse2
CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N
 
; SSE2: stereo interleave, integer formats
PACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
PACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
PACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
PACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
PACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
PACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
PACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
PACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N
 
; SSE2: stereo deinterleave, integer formats
UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
UNPACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
UNPACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
UNPACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N
 
; SSE2: conversions between integer and float
CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
CONV float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
CONV int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
CONV int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
 
; SSE2: stereo interleave with integer/float conversion
PACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
PACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
PACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
PACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
 
; SSE2: stereo deinterleave with integer/float conversion
UNPACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
UNPACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
 
; SSE2: six-channel interleave/deinterleave with int32/float conversion
PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
 
UNPACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
 
; SSE2: eight-channel interleave
PACK_8CH float, float, u, 2, 2, NOP_N, NOP_N
PACK_8CH float, float, a, 2, 2, NOP_N, NOP_N
 
PACK_8CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_8CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
 
; SSSE3: stereo deinterleave of 16-bit input (takes the pshufb path in
; UNPACK_2CH)
INIT_XMM ssse3
UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
 
; AVX versions, assembled only when HAVE_AVX_EXTERNAL is set
%if HAVE_AVX_EXTERNAL
; 128-bit registers
INIT_XMM avx
PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N
 
UNPACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
UNPACK_6CH float, float, a, 2, 2, NOP_N, NOP_N
 
PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
 
UNPACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
 
PACK_8CH float, float, u, 2, 2, NOP_N, NOP_N
PACK_8CH float, float, a, 2, 2, NOP_N, NOP_N
 
PACK_8CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_8CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
 
; 256-bit registers
INIT_YMM avx
CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
%endif
 
; AVX2 versions (256-bit registers), assembled only when HAVE_AVX2_EXTERNAL
; is set
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
%endif
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswresample/x86/audio_convert_init.c
0,0 → 1,181
/*
* Copyright (C) 2012 Michael Niedermayer (michaelni@gmx.at)
*
* This file is part of libswresample
*
* libswresample is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* libswresample is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with libswresample; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/x86/cpu.h"
#include "libswresample/swresample_internal.h"
#include "libswresample/audioconvert.h"
 
/*
 * Prototype generators for the assembly conversion routines.
 *
 * PROTO declares one function named ff<prefix><src>_to_<dst>_a_<isa>; PROTO2,
 * PROTO3 and PROTO4 fan that out over every source format, destination format
 * and instruction-set suffix. Many of the names declared this way have no
 * implementation; a declaration that is never referenced is harmless.
 */
#define PROTO(prefix, src_fmt, dst_fmt, isa) \
    void ff ## prefix ## src_fmt ## _to_ ## dst_fmt ## _a_ ## isa(uint8_t **dst, const uint8_t **src, int len);

/* One prototype per source sample format. */
#define PROTO2(prefix, dst_fmt, isa) \
    PROTO(prefix, int16, dst_fmt, isa) \
    PROTO(prefix, int32, dst_fmt, isa) \
    PROTO(prefix, float, dst_fmt, isa)

/* ... for every destination sample format. */
#define PROTO3(prefix, isa) \
    PROTO2(prefix, int16, isa) \
    PROTO2(prefix, int32, isa) \
    PROTO2(prefix, float, isa)

/* ... for every instruction-set suffix. */
#define PROTO4(prefix) \
    PROTO3(prefix, mmx)   \
    PROTO3(prefix, sse)   \
    PROTO3(prefix, sse2)  \
    PROTO3(prefix, ssse3) \
    PROTO3(prefix, sse4)  \
    PROTO3(prefix, avx)   \
    PROTO3(prefix, avx2)

PROTO4(_)             /* plain format conversion       */
PROTO4(_pack_2ch_)    /* planar -> packed, 2 channels  */
PROTO4(_pack_6ch_)    /* planar -> packed, 6 channels  */
PROTO4(_pack_8ch_)    /* planar -> packed, 8 channels  */
PROTO4(_unpack_2ch_)  /* packed -> planar, 2 channels  */
PROTO4(_unpack_6ch_)  /* packed -> planar, 6 channels  */
 
/**
 * Select an x86 SIMD routine for the given sample format conversion.
 *
 * ac->simd_f is first reset to NULL and then overwritten by every matching
 * rule below. Rules are ordered from the oldest to the newest instruction
 * set, so the last match (the most capable routine the CPU supports) wins.
 * If nothing matches, simd_f stays NULL.
 *
 * @param ac       converter context whose simd_f member is set
 * @param out_fmt  destination sample format
 * @param in_fmt   source sample format
 * @param channels channel count; selects the 2/6/8 channel pack and unpack
 *                 routines
 */
av_cold void swri_audio_convert_init_x86(struct AudioConvert *ac,
                                         enum AVSampleFormat out_fmt,
                                         enum AVSampleFormat in_fmt,
                                         int channels){
    int cpu_flags = av_get_cpu_flags();

    ac->simd_f= NULL;

//FIXME add memcpy case

/* True when converting from AV_SAMPLE_FMT_<from> to AV_SAMPLE_FMT_<to>. */
#define FMT_PAIR(to, from) \
    (out_fmt == AV_SAMPLE_FMT_ ## to && in_fmt == AV_SAMPLE_FMT_ ## from)

/* int16 <-> int32 conversions exist for more than one instruction set. */
#define MULTI_CAPS_FUNC(flag, cap) \
    if (EXTERNAL_##flag(cpu_flags)) {\
        if ((out_fmt == AV_SAMPLE_FMT_S32  && in_fmt == AV_SAMPLE_FMT_S16) ||\
            (out_fmt == AV_SAMPLE_FMT_S32P && in_fmt == AV_SAMPLE_FMT_S16P))\
            ac->simd_f = ff_int16_to_int32_a_ ## cap;\
        if ((out_fmt == AV_SAMPLE_FMT_S16  && in_fmt == AV_SAMPLE_FMT_S32) ||\
            (out_fmt == AV_SAMPLE_FMT_S16P && in_fmt == AV_SAMPLE_FMT_S32P))\
            ac->simd_f = ff_int32_to_int16_a_ ## cap;\
    }

    MULTI_CAPS_FUNC(MMX, mmx)
    MULTI_CAPS_FUNC(SSE2, sse2)

    /* MMX */
    if (EXTERNAL_MMX(cpu_flags)) {
        if (channels == 6) {
            if (FMT_PAIR(FLT, FLTP) || FMT_PAIR(S32, S32P))
                ac->simd_f = ff_pack_6ch_float_to_float_a_mmx;
        }
    }

    /* SSE */
    if (EXTERNAL_SSE(cpu_flags)) {
        if (channels == 6) {
            if (FMT_PAIR(FLT, FLTP) || FMT_PAIR(S32, S32P))
                ac->simd_f = ff_pack_6ch_float_to_float_a_sse;

            if (FMT_PAIR(FLTP, FLT) || FMT_PAIR(S32P, S32))
                ac->simd_f = ff_unpack_6ch_float_to_float_a_sse;
        }
    }

    /* SSE2 */
    if (EXTERNAL_SSE2(cpu_flags)) {
        /* same layout on both sides, only the sample type changes */
        if (FMT_PAIR(FLT, S32) || FMT_PAIR(FLTP, S32P))
            ac->simd_f = ff_int32_to_float_a_sse2;
        if (FMT_PAIR(FLT, S16) || FMT_PAIR(FLTP, S16P))
            ac->simd_f = ff_int16_to_float_a_sse2;
        if (FMT_PAIR(S32, FLT) || FMT_PAIR(S32P, FLTP))
            ac->simd_f = ff_float_to_int32_a_sse2;
        if (FMT_PAIR(S16, FLT) || FMT_PAIR(S16P, FLTP))
            ac->simd_f = ff_float_to_int16_a_sse2;

        switch (channels) {
        case 2:
            /* 32-bit float and int32 share the plain copy routines */
            if (FMT_PAIR(FLT, FLTP) || FMT_PAIR(S32, S32P))
                ac->simd_f = ff_pack_2ch_int32_to_int32_a_sse2;
            if (FMT_PAIR(S16, S16P))
                ac->simd_f = ff_pack_2ch_int16_to_int16_a_sse2;
            if (FMT_PAIR(S32, S16P))
                ac->simd_f = ff_pack_2ch_int16_to_int32_a_sse2;
            if (FMT_PAIR(S16, S32P))
                ac->simd_f = ff_pack_2ch_int32_to_int16_a_sse2;

            if (FMT_PAIR(FLTP, FLT) || FMT_PAIR(S32P, S32))
                ac->simd_f = ff_unpack_2ch_int32_to_int32_a_sse2;
            if (FMT_PAIR(S16P, S16))
                ac->simd_f = ff_unpack_2ch_int16_to_int16_a_sse2;
            if (FMT_PAIR(S32P, S16))
                ac->simd_f = ff_unpack_2ch_int16_to_int32_a_sse2;
            if (FMT_PAIR(S16P, S32))
                ac->simd_f = ff_unpack_2ch_int32_to_int16_a_sse2;

            if (FMT_PAIR(FLT, S32P))
                ac->simd_f = ff_pack_2ch_int32_to_float_a_sse2;
            if (FMT_PAIR(S32, FLTP))
                ac->simd_f = ff_pack_2ch_float_to_int32_a_sse2;
            if (FMT_PAIR(FLT, S16P))
                ac->simd_f = ff_pack_2ch_int16_to_float_a_sse2;
            if (FMT_PAIR(S16, FLTP))
                ac->simd_f = ff_pack_2ch_float_to_int16_a_sse2;
            if (FMT_PAIR(FLTP, S32))
                ac->simd_f = ff_unpack_2ch_int32_to_float_a_sse2;
            if (FMT_PAIR(S32P, FLT))
                ac->simd_f = ff_unpack_2ch_float_to_int32_a_sse2;
            if (FMT_PAIR(FLTP, S16))
                ac->simd_f = ff_unpack_2ch_int16_to_float_a_sse2;
            if (FMT_PAIR(S16P, FLT))
                ac->simd_f = ff_unpack_2ch_float_to_int16_a_sse2;
            break;
        case 6:
            if (FMT_PAIR(FLT, S32P))
                ac->simd_f = ff_pack_6ch_int32_to_float_a_sse2;
            if (FMT_PAIR(S32, FLTP))
                ac->simd_f = ff_pack_6ch_float_to_int32_a_sse2;

            if (FMT_PAIR(FLTP, S32))
                ac->simd_f = ff_unpack_6ch_int32_to_float_a_sse2;
            if (FMT_PAIR(S32P, FLT))
                ac->simd_f = ff_unpack_6ch_float_to_int32_a_sse2;
            break;
        case 8:
            if (FMT_PAIR(FLT, FLTP) || FMT_PAIR(S32, S32P))
                ac->simd_f = ff_pack_8ch_float_to_float_a_sse2;
            if (FMT_PAIR(FLT, S32P))
                ac->simd_f = ff_pack_8ch_int32_to_float_a_sse2;
            if (FMT_PAIR(S32, FLTP))
                ac->simd_f = ff_pack_8ch_float_to_int32_a_sse2;
            break;
        }
    }

    /* SSSE3 */
    if (EXTERNAL_SSSE3(cpu_flags)) {
        if (channels == 2) {
            if (FMT_PAIR(S16P, S16))
                ac->simd_f = ff_unpack_2ch_int16_to_int16_a_ssse3;
            if (FMT_PAIR(S32P, S16))
                ac->simd_f = ff_unpack_2ch_int16_to_int32_a_ssse3;
            if (FMT_PAIR(FLTP, S16))
                ac->simd_f = ff_unpack_2ch_int16_to_float_a_ssse3;
        }
    }

    /* AVX, guarded by the AVX_FAST check */
    if (EXTERNAL_AVX_FAST(cpu_flags)) {
        if (FMT_PAIR(FLT, S32) || FMT_PAIR(FLTP, S32P))
            ac->simd_f = ff_int32_to_float_a_avx;
    }

    /* AVX */
    if (EXTERNAL_AVX(cpu_flags)) {
        switch (channels) {
        case 6:
            if (FMT_PAIR(FLT, FLTP) || FMT_PAIR(S32, S32P))
                ac->simd_f = ff_pack_6ch_float_to_float_a_avx;
            if (FMT_PAIR(FLT, S32P))
                ac->simd_f = ff_pack_6ch_int32_to_float_a_avx;
            if (FMT_PAIR(S32, FLTP))
                ac->simd_f = ff_pack_6ch_float_to_int32_a_avx;

            if (FMT_PAIR(FLTP, FLT) || FMT_PAIR(S32P, S32))
                ac->simd_f = ff_unpack_6ch_float_to_float_a_avx;
            if (FMT_PAIR(FLTP, S32))
                ac->simd_f = ff_unpack_6ch_int32_to_float_a_avx;
            if (FMT_PAIR(S32P, FLT))
                ac->simd_f = ff_unpack_6ch_float_to_int32_a_avx;
            break;
        case 8:
            if (FMT_PAIR(FLT, FLTP) || FMT_PAIR(S32, S32P))
                ac->simd_f = ff_pack_8ch_float_to_float_a_avx;
            if (FMT_PAIR(FLT, S32P))
                ac->simd_f = ff_pack_8ch_int32_to_float_a_avx;
            if (FMT_PAIR(S32, FLTP))
                ac->simd_f = ff_pack_8ch_float_to_int32_a_avx;
            break;
        }
    }

    /* AVX2 */
    if (EXTERNAL_AVX2(cpu_flags)) {
        if (FMT_PAIR(S32, FLT) || FMT_PAIR(S32P, FLTP))
            ac->simd_f = ff_float_to_int32_a_avx2;
    }

#undef FMT_PAIR
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswresample/x86/rematrix.asm
0,0 → 1,250
;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
 
SECTION_RODATA 32
; Eight dwords of 1: base value for the rounding term in MIX2_INT16.
dw1: times 8 dd 1
; Sixteen words of 1: base value for the rounding term in MIX1_INT16, and the
; partner operand that is interleaved with the samples there.
w1 : times 16 dw 1
 
SECTION .text
 
;------------------------------------------------------------------------------
; MIX2_FLT a/u
;
; Emits mix_2_1_<a|u>_float(out, in1, in2, coeffp, index1, index2, len):
;   out[i] = in1[i] * coeffp[index1] + in2[i] * coeffp[index2]
; for float samples. The "a" variant jumps into the "u" body if in1, in2 or
; out is not a multiple of mmsize. Two vectors per input are handled in each
; iteration; the loop always runs at least once and has no tail handling.
;------------------------------------------------------------------------------
%macro MIX2_FLT 1
cglobal mix_2_1_%1_float, 7, 7, 6, out, in1, in2, coeffp, index1, index2, len
%ifidn %1, a
test in1q, mmsize-1
jne mix_2_1_float_u_int %+ SUFFIX
test in2q, mmsize-1
jne mix_2_1_float_u_int %+ SUFFIX
test outq, mmsize-1
jne mix_2_1_float_u_int %+ SUFFIX
%else
mix_2_1_float_u_int %+ SUFFIX:
%endif
; Broadcast both coefficients to all lanes.
VBROADCASTSS m4, [coeffpq + 4*index1q]
VBROADCASTSS m5, [coeffpq + 4*index2q]
; Sample count -> byte count; then point all buffers at their end and count a
; negative byte offset up to zero.
shl lend , 2
add in1q , lenq
add in2q , lenq
add outq , lenq
neg lenq
.next:
%ifidn %1, a
; Aligned input can be used directly as a memory operand.
mulps m0, m4, [in1q + lenq ]
mulps m1, m5, [in2q + lenq ]
mulps m2, m4, [in1q + lenq + mmsize]
mulps m3, m5, [in2q + lenq + mmsize]
%else
movu m0, [in1q + lenq ]
movu m1, [in2q + lenq ]
movu m2, [in1q + lenq + mmsize]
movu m3, [in2q + lenq + mmsize]
mulps m0, m0, m4
mulps m1, m1, m5
mulps m2, m2, m4
mulps m3, m3, m5
%endif
addps m0, m0, m1
addps m2, m2, m3
mov%1 [outq + lenq ], m0
mov%1 [outq + lenq + mmsize], m2
add lenq, mmsize*2
jl .next
REP_RET
%endmacro
 
;------------------------------------------------------------------------------
; MIX1_FLT a/u
;
; Emits mix_1_1_<a|u>_float(out, in, coeffp, index, len):
;   out[i] = in[i] * coeffp[index]
; for float samples. The "a" variant jumps into the "u" body if in or out is
; not a multiple of mmsize. Two vectors are handled in each iteration; the
; loop always runs at least once and has no tail handling.
;------------------------------------------------------------------------------
%macro MIX1_FLT 1
cglobal mix_1_1_%1_float, 5, 5, 3, out, in, coeffp, index, len
%ifidn %1, a
test inq, mmsize-1
jne mix_1_1_float_u_int %+ SUFFIX
test outq, mmsize-1
jne mix_1_1_float_u_int %+ SUFFIX
%else
mix_1_1_float_u_int %+ SUFFIX:
%endif
; Broadcast the coefficient to all lanes.
VBROADCASTSS m2, [coeffpq + 4*indexq]
; Sample count -> byte count; then point both buffers at their end and count
; a negative byte offset up to zero.
shl lenq , 2
add inq , lenq
add outq , lenq
neg lenq
.next:
%ifidn %1, a
; Aligned input can be used directly as a memory operand.
mulps m0, m2, [inq + lenq ]
mulps m1, m2, [inq + lenq + mmsize]
%else
movu m0, [inq + lenq ]
movu m1, [inq + lenq + mmsize]
mulps m0, m0, m2
mulps m1, m1, m2
%endif
mov%1 [outq + lenq ], m0
mov%1 [outq + lenq + mmsize], m1
add lenq, mmsize*2
jl .next
REP_RET
%endmacro
 
;------------------------------------------------------------------------------
; MIX1_INT16 a/u
;
; Emits mix_1_1_<a|u>_int16(out, in, coeffp, index, len):
;   out[i] = saturate16((in[i] * coeff + round) >> shift)
; The 32-bit entry coeffp[index] holds the coefficient in its low word and the
; shift amount in its high word; round is (1 << shift) >> 1.
; The "a" variant jumps into the "u" body if in or out is not a multiple of
; mmsize. Two vectors are handled in each iteration; the loop always runs at
; least once and has no tail handling.
;------------------------------------------------------------------------------
%macro MIX1_INT16 1
cglobal mix_1_1_%1_int16, 5, 5, 6, out, in, coeffp, index, len
%ifidn %1, a
test inq, mmsize-1
jne mix_1_1_int16_u_int %+ SUFFIX
test outq, mmsize-1
jne mix_1_1_int16_u_int %+ SUFFIX
%else
mix_1_1_int16_u_int %+ SUFFIX:
%endif
movd m4, [coeffpq + 4*indexq]
; m5 = coefficient word replicated to all lanes
SPLATW m5, m4
; m4 = shift amount (bits 16..31 of the entry moved down to bit 0)
psllq m4, 32
psrlq m4, 48
; m0 = rounding term in every word: (1 << shift) >> 1
mova m0, [w1]
psllw m0, m4
psrlw m0, 1
; m5 = (coeff, round) word pairs, the multiplier for pmaddwd below
punpcklwd m5, m0
; Sample count -> byte count (2 bytes per sample); then point both buffers at
; their end and count a negative byte offset up to zero.
add lenq , lenq
add inq , lenq
add outq , lenq
neg lenq
.next:
mov%1 m0, [inq + lenq ]
mov%1 m2, [inq + lenq + mmsize]
mova m1, m0
mova m3, m2
; Pair every sample with the constant 1 so that pmaddwd computes
; sample * coeff + 1 * round per dword.
punpcklwd m0, [w1]
punpckhwd m1, [w1]
punpcklwd m2, [w1]
punpckhwd m3, [w1]
pmaddwd m0, m5
pmaddwd m1, m5
pmaddwd m2, m5
pmaddwd m3, m5
psrad m0, m4
psrad m1, m4
psrad m2, m4
psrad m3, m4
; Back to 16 bits with signed saturation.
packssdw m0, m1
packssdw m2, m3
mov%1 [outq + lenq ], m0
mov%1 [outq + lenq + mmsize], m2
add lenq, mmsize*2
jl .next
%if mmsize == 8
emms
RET
%else
REP_RET
%endif
%endmacro
 
;------------------------------------------------------------------------------
; MIX2_INT16 a/u
;
; Emits mix_2_1_<a|u>_int16(out, in1, in2, coeffp, index1, index2, len):
;   out[i] = saturate16((in1[i] * coeff1 + in2[i] * coeff2 + round) >> shift)
; Each 32-bit coeffp entry holds the coefficient in its low word. The shift
; amount is taken from the high word of coeffp[index1] only, and round is
; (1 << shift) >> 1.
; The "a" variant jumps into the "u" body if in1, in2 or out is not a multiple
; of mmsize. Two vectors per input are handled in each iteration; the loop
; always runs at least once and has no tail handling.
;------------------------------------------------------------------------------
%macro MIX2_INT16 1
cglobal mix_2_1_%1_int16, 7, 7, 8, out, in1, in2, coeffp, index1, index2, len
%ifidn %1, a
test in1q, mmsize-1
jne mix_2_1_int16_u_int %+ SUFFIX
test in2q, mmsize-1
jne mix_2_1_int16_u_int %+ SUFFIX
test outq, mmsize-1
jne mix_2_1_int16_u_int %+ SUFFIX
%else
mix_2_1_int16_u_int %+ SUFFIX:
%endif
movd m4, [coeffpq + 4*index1q]
movd m6, [coeffpq + 4*index2q]
; Replicate both coefficient words to all lanes.
SPLATW m5, m4
SPLATW m6, m6
; m4 = shift amount (bits 16..31 of the first entry moved down to bit 0)
psllq m4, 32
psrlq m4, 48
; m7 = rounding term in every dword: (1 << shift) >> 1
mova m7, [dw1]
pslld m7, m4
psrld m7, 1
; m5 = (coeff1, coeff2) word pairs, the multiplier for pmaddwd below
punpcklwd m5, m6
; Sample count -> byte count (2 bytes per sample); then point all buffers at
; their end and count a negative byte offset up to zero.
add lend , lend
add in1q , lenq
add in2q , lenq
add outq , lenq
neg lenq
.next:
; Interleave the two inputs so that each dword holds one sample from in1 and
; the matching sample from in2.
mov%1 m0, [in1q + lenq ]
mov%1 m2, [in2q + lenq ]
mova m1, m0
punpcklwd m0, m2
punpckhwd m1, m2
 
; m6 is free after the setup above and is reused as a temporary here.
mov%1 m2, [in1q + lenq + mmsize]
mov%1 m6, [in2q + lenq + mmsize]
mova m3, m2
punpcklwd m2, m6
punpckhwd m3, m6
 
pmaddwd m0, m5
pmaddwd m1, m5
pmaddwd m2, m5
pmaddwd m3, m5
paddd m0, m7
paddd m1, m7
paddd m2, m7
paddd m3, m7
psrad m0, m4
psrad m1, m4
psrad m2, m4
psrad m3, m4
; Back to 16 bits with signed saturation.
packssdw m0, m1
packssdw m2, m3
mov%1 [outq + lenq ], m0
mov%1 [outq + lenq + mmsize], m2
add lenq, mmsize*2
jl .next
%if mmsize == 8
emms
RET
%else
REP_RET
%endif
%endmacro
 
 
; Function instantiations. Every macro call emits one function; the unaligned
; ("u") variant must come first because the aligned ("a") variant jumps to a
; label defined inside the "u" body.

; MMX: 16-bit integer mixing
INIT_MMX mmx
MIX1_INT16 u
MIX1_INT16 a
MIX2_INT16 u
MIX2_INT16 a
 
; SSE: float mixing
INIT_XMM sse
MIX2_FLT u
MIX2_FLT a
MIX1_FLT u
MIX1_FLT a
 
; SSE2: 16-bit integer mixing
INIT_XMM sse2
MIX1_INT16 u
MIX1_INT16 a
MIX2_INT16 u
MIX2_INT16 a
 
; AVX (256-bit registers): float mixing, assembled only when
; HAVE_AVX_EXTERNAL is set
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX2_FLT u
MIX2_FLT a
MIX1_FLT u
MIX1_FLT a
%endif
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswresample/x86/rematrix_init.c
0,0 → 1,90
/*
* Copyright (C) 2012 Michael Niedermayer (michaelni@gmx.at)
*
* This file is part of libswresample
*
* libswresample is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* libswresample is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with libswresample; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libavutil/x86/cpu.h"
#include "libswresample/swresample_internal.h"
 
/*
 * Declare the "_a_" variants of the 1->1 and 2->1 mixing kernels for one
 * (sample format, instruction set) pair. The definitions are not in this
 * file; presumably they are the assembly kernels from rematrix.asm.
 */
#define D(sample_fmt, isa)                                        \
    mix_1_1_func_type ff_mix_1_1_a_ ## sample_fmt ## _ ## isa;    \
    mix_2_1_func_type ff_mix_2_1_a_ ## sample_fmt ## _ ## isa;

D(int16, mmx)
D(int16, sse2)
D(float, sse)
D(float, avx)
 
/**
 * Pick x86 SIMD mixing kernels for the rematrix stage and build the
 * coefficient tables those kernels read.
 *
 * Both kernel pointers are reset to NULL first. For S16P mid-buffers the
 * integer matrix is repacked as (coefficient, shift) int16 pairs; for FLTP
 * mid-buffers the float matrix and the "one" constant are copied verbatim.
 * Any other mid-buffer format leaves the kernel pointers NULL.
 *
 * @param s  resampler context; in/out channel layouts, midbuf.fmt,
 *           native_matrix and native_one are read
 * @return   0 on success (also when built without HAVE_YASM),
 *           AVERROR(ENOMEM) if a table allocation fails
 */
av_cold int swri_rematrix_init_x86(struct SwrContext *s){
#if HAVE_YASM
    int cpu_flags   = av_get_cpu_flags();
    int in_count    = av_get_channel_layout_nb_channels(s->in_ch_layout);
    int out_count   = av_get_channel_layout_nb_channels(s->out_ch_layout);
    int coeff_count = in_count * out_count;
    int row, col;

    s->mix_1_1_simd = NULL;
    s->mix_2_1_simd = NULL;

    if (s->midbuf.fmt == AV_SAMPLE_FMT_S16P) {
        const int *coeffs;
        int16_t   *packed;
        int16_t   *one;

        /* Later checks override earlier ones, so the widest ISA wins. */
        if (EXTERNAL_MMX(cpu_flags)) {
            s->mix_1_1_simd = ff_mix_1_1_a_int16_mmx;
            s->mix_2_1_simd = ff_mix_2_1_a_int16_mmx;
        }
        if (EXTERNAL_SSE2(cpu_flags)) {
            s->mix_1_1_simd = ff_mix_1_1_a_int16_sse2;
            s->mix_2_1_simd = ff_mix_2_1_a_int16_sse2;
        }

        s->native_simd_matrix = av_mallocz_array(coeff_count, 2 * sizeof(int16_t));
        s->native_simd_one    = av_mallocz(2 * sizeof(int16_t));
        if (!s->native_simd_matrix || !s->native_simd_one)
            return AVERROR(ENOMEM);

        coeffs = (const int *)s->native_matrix;
        packed = (int16_t *)s->native_simd_matrix;
        one    = (int16_t *)s->native_simd_one;

        for (row = 0; row < out_count; row++) {
            const int *in_row  = coeffs + row * in_count;
            int16_t   *out_row = packed + 2 * (row * in_count);
            int peak = 0;
            int shift;

            /* Largest coefficient magnitude feeding this output channel. */
            for (col = 0; col < in_count; col++)
                peak = FFMAX(peak, FFABS(in_row[col]));

            /* Bits to drop; never negative. */
            shift = FFMAX(av_log2(peak) - 14, 0);

            /* Each entry is a pair: [0] rounded, down-shifted coefficient,
             * [1] the matching shift value 15 - shift. */
            for (col = 0; col < in_count; col++) {
                out_row[2 * col + 1] = 15 - shift;
                out_row[2 * col]     = (in_row[col] + ((1 << shift) >> 1)) >> shift;
            }
        }

        one[1] = 14;
        one[0] = 16384;
    } else if (s->midbuf.fmt == AV_SAMPLE_FMT_FLTP) {
        if (EXTERNAL_SSE(cpu_flags)) {
            s->mix_1_1_simd = ff_mix_1_1_a_float_sse;
            s->mix_2_1_simd = ff_mix_2_1_a_float_sse;
        }
        if (EXTERNAL_AVX_FAST(cpu_flags)) {
            s->mix_1_1_simd = ff_mix_1_1_a_float_avx;
            s->mix_2_1_simd = ff_mix_2_1_a_float_avx;
        }

        s->native_simd_matrix = av_mallocz_array(coeff_count, sizeof(float));
        s->native_simd_one    = av_mallocz(sizeof(float));
        if (!s->native_simd_matrix || !s->native_simd_one)
            return AVERROR(ENOMEM);

        memcpy(s->native_simd_matrix, s->native_matrix, coeff_count * sizeof(float));
        memcpy(s->native_simd_one, s->native_one, sizeof(float));
    }
#endif

    return 0;
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswresample/x86/resample.asm
0,0 → 1,605
;******************************************************************************
;* Copyright (c) 2012 Michael Niedermayer
;* Copyright (c) 2014 James Almer <jamrial <at> gmail.com>
;* Copyright (c) 2014 Ronald S. Bultje <rsbultje@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
 
%include "libavutil/x86/x86util.asm"
 
; 'pointer' reserves one pointer-sized field: a qword on x86-64, a dword on
; x86-32.
%if ARCH_X86_64
%define pointer resq
%else
%define pointer resd
%endif

; Field offsets used as [ctxq+ResampleContext.<field>] by the kernels below.
; Presumably this mirrors the leading members of the C ResampleContext and
; must stay in the same order and size as that definition - TODO confirm
; against libswresample/resample.h.
struc ResampleContext
.av_class: pointer 1
.filter_bank: pointer 1
.filter_length: resd 1
.filter_alloc: resd 1
.ideal_dst_incr: resd 1
.dst_incr: resd 1
.dst_incr_div: resd 1
.dst_incr_mod: resd 1
.index: resd 1
.frac: resd 1
.src_incr: resd 1
.compensation_distance: resd 1
.phase_shift: resd 1
.phase_mask: resd 1

; there's a few more here but we only care about the first few
endstruc
 
SECTION_RODATA

; pf_1 / pdbl_1: the constant 1.0 in single / double precision. They are
; passed as the fifth RESAMPLE_FNS argument and divided by src_incr in the
; linear kernels.
; pd_0x4000: 1 << 14, the initial accumulator value of the int16 kernels,
; which later shift the sum right by 15.
pf_1: dd 1.0
pdbl_1: dq 1.0
pd_0x4000: dd 0x4000
 
SECTION .text
 
%macro RESAMPLE_FNS 3-5 ; format [float or int16], bps, log2_bps, float op suffix [s or d], 1.0 constant
; Macro parameters as used in the body:
;   %1  sample format name, pasted into the function names
;   %2  bytes per sample (scale factor in address computations)
;   %3  log2 of %2 (shift count)
;   %4  float instruction suffix (s or d); not used for int16
;   %5  label of the 1.0 constant; only read by the linear kernel
;
; int resample_common_$format(ResampleContext *ctx, $format *dst,
; const $format *src, int size, int update_ctx)
;
; For each of the 'size' output samples:
;   filter = filter_bank + index * filter_alloc * bps
;   *dst++ = sum over filter_length taps of src[i] * filter[i]
;   index += dst_incr_div; frac += dst_incr_mod
;   if (frac >= src_incr) { frac -= src_incr; index++ }
;   src   += index >> phase_shift;  index &= phase_mask
; When update_ctx is non-zero, frac and index are written back to ctx and
; the number of consumed input samples is returned in rax.
%if ARCH_X86_64 ; unix64 and win64
cglobal resample_common_%1, 0, 15, 2, ctx, dst, src, phase_shift, index, frac, \
dst_incr_mod, size, min_filter_count_x4, \
min_filter_len_x4, dst_incr_div, src_incr, \
phase_mask, dst_end, filter_bank

; use red-zone for variable storage
; NOTE(review): these slots sit below rsp on both 64-bit targets, but the
; Win64 ABI defines no red zone - confirm this is safe there.
%define ctx_stackq [rsp-0x8]
%define src_stackq [rsp-0x10]
%if WIN64
%define update_context_stackd r4m
%else ; unix64
%define update_context_stackd [rsp-0x14]
%endif

; load as many variables in registers as possible; for the rest, store
; on stack so that we have 'ctx' available as one extra register
mov sized, r3d
mov phase_maskd, [ctxq+ResampleContext.phase_mask]
%if UNIX64
mov update_context_stackd, r4d
%endif
mov indexd, [ctxq+ResampleContext.index]
mov fracd, [ctxq+ResampleContext.frac]
mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
mov filter_bankq, [ctxq+ResampleContext.filter_bank]
mov src_incrd, [ctxq+ResampleContext.src_incr]
mov ctx_stackq, ctxq
mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
; filter length in bytes, and one-past-the-end output pointer
shl min_filter_len_x4d, %3
lea dst_endq, [dstq+sizeq*%2]

; phase_shift has to end up in ecx because the loop shifts by phase_shiftb
; (cl). The register that still holds ctx is overwritten by the last load,
; after which the arguments are renamed to match their new contents.
%if UNIX64
mov ecx, [ctxq+ResampleContext.phase_shift]
mov edi, [ctxq+ResampleContext.filter_alloc]

DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \
filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
src_incr, phase_mask, dst_end, filter_bank
%elif WIN64
mov R9d, [ctxq+ResampleContext.filter_alloc]
mov ecx, [ctxq+ResampleContext.phase_shift]

DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \
filter, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
src_incr, phase_mask, dst_end, filter_bank
%endif

; Negate the byte length and advance filter_bank and src by the positive
; length, so the inner loop can index both with a negative offset that
; counts up towards zero. The biased src is saved for the return value.
neg min_filter_len_x4q
sub filter_bankq, min_filter_len_x4q
sub srcq, min_filter_len_x4q
mov src_stackq, srcq
%else ; x86-32
cglobal resample_common_%1, 1, 7, 2, ctx, phase_shift, dst, frac, \
index, min_filter_length_x4, filter_bank

; push temp variables to stack
; (ctx, src and update_context stay in their incoming argument slots)
%define ctx_stackq r0mp
%define src_stackq r2mp
%define update_context_stackd r4m

; r3 is used as scratch here to form dst_end = dst + size * bps
mov dstq, r1mp
mov r3, r3mp
lea r3, [dstq+r3*%2]
PUSH dword [ctxq+ResampleContext.dst_incr_div]
PUSH dword [ctxq+ResampleContext.dst_incr_mod]
PUSH dword [ctxq+ResampleContext.filter_alloc]
PUSH r3
PUSH dword [ctxq+ResampleContext.phase_mask]
PUSH dword [ctxq+ResampleContext.src_incr]
mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
mov indexd, [ctxq+ResampleContext.index]
shl min_filter_length_x4d, %3
mov fracd, [ctxq+ResampleContext.frac]
; same negative-offset biasing as on x86-64; the src argument slot itself
; is biased in place
neg min_filter_length_x4q
mov filter_bankq, [ctxq+ResampleContext.filter_bank]
sub r2mp, min_filter_length_x4q
sub filter_bankq, min_filter_length_x4q
PUSH min_filter_length_x4q
PUSH filter_bankq
mov phase_shiftd, [ctxq+ResampleContext.phase_shift]

DEFINE_ARGS src, phase_shift, dst, frac, index, min_filter_count_x4, filter

; the eight values pushed above, addressed from the new stack top
%define filter_bankq dword [rsp+0x0]
%define min_filter_length_x4q dword [rsp+0x4]
%define src_incrd dword [rsp+0x8]
%define phase_maskd dword [rsp+0xc]
%define dst_endq dword [rsp+0x10]
%define filter_allocd dword [rsp+0x14]
%define dst_incr_modd dword [rsp+0x18]
%define dst_incr_divd dword [rsp+0x1c]

mov srcq, r2mp
%endif

; one iteration per output sample
.loop:
; filter = filter_bank + filter_alloc * index * bps
mov filterd, filter_allocd
imul filterd, indexd
%if ARCH_X86_64
mov min_filter_count_x4q, min_filter_len_x4q
lea filterq, [filter_bankq+filterq*%2]
%else ; x86-32
; filter_bank lives on the stack here, so it is loaded through the counter
; register before that register receives the (negative) byte count
mov min_filter_count_x4q, filter_bankq
lea filterq, [min_filter_count_x4q+filterq*%2]
mov min_filter_count_x4q, min_filter_length_x4q
%endif
%ifidn %1, int16
; start from 1 << 14 so the later >> 15 rounds
movd m0, [pd_0x4000]
%else ; float/double
xorps m0, m0, m0
%endif

align 16
; multiply-accumulate mmsize bytes of src and filter per iteration; the
; offset is negative and the loop ends once it is no longer negative
.inner_loop:
movu m1, [srcq+min_filter_count_x4q*1]
%ifidn %1, int16
%if cpuflag(xop)
vpmadcswd m0, m1, [filterq+min_filter_count_x4q*1], m0
%else
pmaddwd m1, [filterq+min_filter_count_x4q*1]
paddd m0, m1
%endif
%else ; float/double
%if cpuflag(fma4) || cpuflag(fma3)
fmaddp%4 m0, m1, [filterq+min_filter_count_x4q*1], m0
%else
mulp%4 m1, m1, [filterq+min_filter_count_x4q*1]
addp%4 m0, m0, m1
%endif ; cpuflag
%endif
add min_filter_count_x4q, mmsize
js .inner_loop

%ifidn %1, int16
; reduce the dword lanes, scale by >> 15, saturate to int16 and store; the
; index/frac updates are interleaved with the vector work
HADDD m0, m1
psrad m0, 15
add fracd, dst_incr_modd
packssdw m0, m0
add indexd, dst_incr_divd
; NOTE(review): movd writes 32 bits while dst advances by %2 bytes per
; sample, so for int16 the store also covers the following sample slot -
; confirm the caller's buffer allows this for the last sample.
movd [dstq], m0
%else ; float/double
; horizontal sum & store
%if mmsize == 32
vextractf128 xm1, m0, 0x1
addps xm0, xm1
%endif
movhlps xm1, xm0
%ifidn %1, float
addps xm0, xm1
shufps xm1, xm0, xm0, q0001
%endif
add fracd, dst_incr_modd
addp%4 xm0, xm1
add indexd, dst_incr_divd
movs%4 [dstq], xm0
%endif
; carry the fractional position into index once it reaches src_incr
cmp fracd, src_incrd
jl .skip
sub fracd, src_incrd
inc indexd

; assembly-time rename only: the register that held 'filter' is reused as
; 'index_incr' for the rest of the iteration
%if UNIX64
DEFINE_ARGS filter_alloc, dst, src, phase_shift, index, frac, dst_incr_mod, \
index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
src_incr, phase_mask, dst_end, filter_bank
%elif WIN64
DEFINE_ARGS phase_shift, dst, src, filter_alloc, index, frac, dst_incr_mod, \
index_incr, min_filter_count_x4, min_filter_len_x4, dst_incr_div, \
src_incr, phase_mask, dst_end, filter_bank
%else ; x86-32
DEFINE_ARGS src, phase_shift, dst, frac, index, index_incr
%endif

; advance dst by one sample and src by (index >> phase_shift) samples, and
; keep only the phase bits of index
.skip:
mov index_incrd, indexd
add dstq, %2
and indexd, phase_maskd
sar index_incrd, phase_shiftb
lea srcq, [srcq+index_incrq*%2]
cmp dstq, dst_endq
jne .loop

%if ARCH_X86_64
DEFINE_ARGS ctx, dst, src, phase_shift, index, frac
%else ; x86-32
DEFINE_ARGS src, ctx, update_context, frac, index
%endif

cmp dword update_context_stackd, 0
jz .skip_store
; strictly speaking, the function should always return the number of
; consumed input samples; however, we only use the value if update_context
; is true, so let's just leave it uninitialized otherwise
; rax = (src - saved src) >> log2_bps; frac and index go back into ctx
mov ctxq, ctx_stackq
movifnidn rax, srcq
mov [ctxq+ResampleContext.frac ], fracd
sub rax, src_stackq
mov [ctxq+ResampleContext.index], indexd
shr rax, %3

.skip_store:
%if ARCH_X86_32
; drop the eight dwords pushed during setup
ADD rsp, 0x20
%endif
RET
 
; int resample_linear_$format(ResampleContext *ctx, $format *dst,
; const $format *src, int size, int update_ctx)
;
; Same outer loop as the common kernel, but every output sample is computed
; from two filters: filter1 at index * filter_alloc and filter2 one
; filter_alloc further on. With val / v2 being the two dot products, the
; stored sample is val + (v2 - val) * frac / src_incr.
; edx is clobbered by imul/idiv in the int16 path, so the argument arriving
; in edx's register (src on unix64, dst on win64) is moved to another
; register first.
%if ARCH_X86_64 ; unix64 and win64
%if UNIX64
cglobal resample_linear_%1, 0, 15, 5, ctx, dst, phase_mask, phase_shift, index, frac, \
size, dst_incr_mod, min_filter_count_x4, \
min_filter_len_x4, dst_incr_div, src_incr, \
src, dst_end, filter_bank

mov srcq, r2mp
%else ; win64
cglobal resample_linear_%1, 0, 15, 5, ctx, phase_mask, src, phase_shift, index, frac, \
size, dst_incr_mod, min_filter_count_x4, \
min_filter_len_x4, dst_incr_div, src_incr, \
dst, dst_end, filter_bank

mov dstq, r1mp
%endif

; use red-zone for variable storage
; NOTE(review): these slots sit below rsp on both 64-bit targets, but the
; Win64 ABI defines no red zone - confirm this is safe there.
%define ctx_stackq [rsp-0x8]
%define src_stackq [rsp-0x10]
%define phase_mask_stackd [rsp-0x14]
%if WIN64
%define update_context_stackd r4m
%else ; unix64
%define update_context_stackd [rsp-0x18]
%endif

; load as many variables in registers as possible; for the rest, store
; on stack so that we have 'ctx' available as one extra register
mov sized, r3d
mov phase_maskd, [ctxq+ResampleContext.phase_mask]
%if UNIX64
mov update_context_stackd, r4d
%endif
mov indexd, [ctxq+ResampleContext.index]
mov fracd, [ctxq+ResampleContext.frac]
mov dst_incr_modd, [ctxq+ResampleContext.dst_incr_mod]
mov filter_bankq, [ctxq+ResampleContext.filter_bank]
mov src_incrd, [ctxq+ResampleContext.src_incr]
mov ctx_stackq, ctxq
; phase_mask is spilled because its register is renamed to filter2 below
mov phase_mask_stackd, phase_maskd
mov min_filter_len_x4d, [ctxq+ResampleContext.filter_length]
%ifidn %1, int16
; m4 = 1 << 14, the starting value of both accumulators
movd m4, [pd_0x4000]
%else ; float/double
; xm4 = 1.0 / src_incr, reused for every output sample
cvtsi2s%4 xm0, src_incrd
movs%4 xm4, [%5]
divs%4 xm4, xm0
%endif
mov dst_incr_divd, [ctxq+ResampleContext.dst_incr_div]
; filter length in bytes, and one-past-the-end output pointer
shl min_filter_len_x4d, %3
lea dst_endq, [dstq+sizeq*%2]

; phase_shift has to end up in ecx because the loop shifts by phase_shiftb
; (cl). The register that still holds ctx is overwritten by the last load,
; after which the arguments are renamed to match their new contents.
%if UNIX64
mov ecx, [ctxq+ResampleContext.phase_shift]
mov edi, [ctxq+ResampleContext.filter_alloc]

DEFINE_ARGS filter_alloc, dst, filter2, phase_shift, index, frac, filter1, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, src, dst_end, filter_bank
%elif WIN64
mov R9d, [ctxq+ResampleContext.filter_alloc]
mov ecx, [ctxq+ResampleContext.phase_shift]

DEFINE_ARGS phase_shift, filter2, src, filter_alloc, index, frac, filter1, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, dst, dst_end, filter_bank
%endif

; Negate the byte length and advance filter_bank and src by the positive
; length, so the inner loop can index with a negative offset that counts up
; towards zero. The biased src is saved for the return value.
neg min_filter_len_x4q
sub filter_bankq, min_filter_len_x4q
sub srcq, min_filter_len_x4q
mov src_stackq, srcq
%else ; x86-32
cglobal resample_linear_%1, 1, 7, 5, ctx, min_filter_length_x4, filter2, \
frac, index, dst, filter_bank

; push temp variables to stack
; (ctx, src and update_context stay in their incoming argument slots)
%define ctx_stackq r0mp
%define src_stackq r2mp
%define update_context_stackd r4m

; r3 is used as scratch: first dst_end, then filter_alloc, then
; filter_alloc << log2_bps, then src_incr
mov dstq, r1mp
mov r3, r3mp
lea r3, [dstq+r3*%2]
PUSH dword [ctxq+ResampleContext.dst_incr_div]
PUSH r3
mov r3, dword [ctxq+ResampleContext.filter_alloc]
PUSH dword [ctxq+ResampleContext.dst_incr_mod]
PUSH r3
shl r3, %3
PUSH r3
mov r3, dword [ctxq+ResampleContext.src_incr]
PUSH dword [ctxq+ResampleContext.phase_mask]
PUSH r3d
%ifidn %1, int16
; m4 = 1 << 14, the starting value of both accumulators
movd m4, [pd_0x4000]
%else ; float/double
; xm4 = 1.0 / src_incr, reused for every output sample
cvtsi2s%4 xm0, r3d
movs%4 xm4, [%5]
divs%4 xm4, xm0
%endif
mov min_filter_length_x4d, [ctxq+ResampleContext.filter_length]
mov indexd, [ctxq+ResampleContext.index]
shl min_filter_length_x4d, %3
mov fracd, [ctxq+ResampleContext.frac]
; same negative-offset biasing as on x86-64; the src argument slot itself
; is biased in place
neg min_filter_length_x4q
mov filter_bankq, [ctxq+ResampleContext.filter_bank]
sub r2mp, min_filter_length_x4q
sub filter_bankq, min_filter_length_x4q
PUSH min_filter_length_x4q
PUSH filter_bankq
PUSH dword [ctxq+ResampleContext.phase_shift]

DEFINE_ARGS filter1, min_filter_count_x4, filter2, frac, index, dst, src

; the ten values pushed above, addressed from the new stack top
%define phase_shift_stackd dword [rsp+0x0]
%define filter_bankq dword [rsp+0x4]
%define min_filter_length_x4q dword [rsp+0x8]
%define src_incrd dword [rsp+0xc]
%define phase_mask_stackd dword [rsp+0x10]
%define filter_alloc_x4q dword [rsp+0x14]
%define filter_allocd dword [rsp+0x18]
%define dst_incr_modd dword [rsp+0x1c]
%define dst_endq dword [rsp+0x20]
%define dst_incr_divd dword [rsp+0x24]

mov srcq, r2mp
%endif

; one iteration per output sample
.loop:
; filter1 = filter_bank + filter_alloc * index * bps
; filter2 = filter1 + filter_alloc * bps
mov filter1d, filter_allocd
imul filter1d, indexd
%if ARCH_X86_64
mov min_filter_count_x4q, min_filter_len_x4q
lea filter1q, [filter_bankq+filter1q*%2]
lea filter2q, [filter1q+filter_allocq*%2]
%else ; x86-32
; filter_bank lives on the stack here, so it is loaded through the counter
; register before that register receives the (negative) byte count
mov min_filter_count_x4q, filter_bankq
lea filter1q, [min_filter_count_x4q+filter1q*%2]
mov min_filter_count_x4q, min_filter_length_x4q
mov filter2q, filter1q
add filter2q, filter_alloc_x4q
%endif
; m0 accumulates src * filter1, m2 accumulates src * filter2
%ifidn %1, int16
mova m0, m4
mova m2, m4
%else ; float/double
xorps m0, m0, m0
xorps m2, m2, m2
%endif

align 16
; one load of src feeds both accumulators; the offset is negative and the
; loop ends once it is no longer negative
.inner_loop:
movu m1, [srcq+min_filter_count_x4q*1]
%ifidn %1, int16
%if cpuflag(xop)
vpmadcswd m2, m1, [filter2q+min_filter_count_x4q*1], m2
vpmadcswd m0, m1, [filter1q+min_filter_count_x4q*1], m0
%else
pmaddwd m3, m1, [filter2q+min_filter_count_x4q*1]
pmaddwd m1, [filter1q+min_filter_count_x4q*1]
paddd m2, m3
paddd m0, m1
%endif ; cpuflag
%else ; float/double
%if cpuflag(fma4) || cpuflag(fma3)
fmaddp%4 m2, m1, [filter2q+min_filter_count_x4q*1], m2
fmaddp%4 m0, m1, [filter1q+min_filter_count_x4q*1], m0
%else
mulp%4 m3, m1, [filter2q+min_filter_count_x4q*1]
mulp%4 m1, m1, [filter1q+min_filter_count_x4q*1]
addp%4 m2, m2, m3
addp%4 m0, m0, m1
%endif ; cpuflag
%endif
add min_filter_count_x4q, mmsize
js .inner_loop

%ifidn %1, int16
; horizontal reduction of both accumulators into their low dword
%if mmsize == 16
%if cpuflag(xop)
vphadddq m2, m2
vphadddq m0, m0
%endif
pshufd m3, m2, q0032
pshufd m1, m0, q0032
paddd m2, m3
paddd m0, m1
%endif
%if notcpuflag(xop)
PSHUFLW m3, m2, q0032
PSHUFLW m1, m0, q0032
paddd m2, m3
paddd m0, m1
%endif
; m2 = v2 - val (both sums started from the same m4 value, so it cancels)
psubd m2, m0
; This is probably a really bad idea on atom and other machines with a
; long transfer latency between GPRs and XMMs (atom). However, it does
; make the clip a lot simpler...
; eax = (v2 - val) * frac / src_incr, via the edx:eax product of imul
movd eax, m2
add indexd, dst_incr_divd
imul fracd
idiv src_incrd
movd m1, eax
add fracd, dst_incr_modd
; val += interpolation term, then >> 15, saturate to int16 and store
paddd m0, m1
psrad m0, 15
packssdw m0, m0
; NOTE(review): movd writes 32 bits while dst advances by %2 bytes per
; sample, so for int16 the store also covers the following sample slot -
; confirm the caller's buffer allows this for the last sample.
movd [dstq], m0

; note that for imul/idiv, I need to move filter to edx/eax for each:
; - 32bit: eax=r0[filter1], edx=r2[filter2]
; - win64: eax=r6[filter1], edx=r1[todo]
; - unix64: eax=r6[filter1], edx=r2[todo]
%else ; float/double
; val += (v2 - val) * (FELEML) frac / c->src_incr;
%if mmsize == 32
vextractf128 xm1, m0, 0x1
vextractf128 xm3, m2, 0x1
addps xm0, xm1
addps xm2, xm3
%endif
; xm1 = frac * (1.0 / src_incr), broadcast to every lane
cvtsi2s%4 xm1, fracd
subp%4 xm2, xm0
mulp%4 xm1, xm4
shufp%4 xm1, xm1, q0000
%if cpuflag(fma4) || cpuflag(fma3)
fmaddp%4 xm0, xm2, xm1, xm0
%else
mulp%4 xm2, xm1
addp%4 xm0, xm2
%endif ; cpuflag

; horizontal sum & store
movhlps xm1, xm0
%ifidn %1, float
addps xm0, xm1
shufps xm1, xm0, xm0, q0001
%endif
add fracd, dst_incr_modd
addp%4 xm0, xm1
add indexd, dst_incr_divd
movs%4 [dstq], xm0
%endif
; carry the fractional position into index once it reaches src_incr
cmp fracd, src_incrd
jl .skip
sub fracd, src_incrd
inc indexd

; assembly-time rename only: the filter1 register (filter2 on x86-32) is
; reused as 'index_incr' for the rest of the iteration
%if UNIX64
DEFINE_ARGS filter_alloc, dst, filter2, phase_shift, index, frac, index_incr, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, src, dst_end, filter_bank
%elif WIN64
DEFINE_ARGS phase_shift, filter2, src, filter_alloc, index, frac, index_incr, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, dst, dst_end, filter_bank
%else ; x86-32
DEFINE_ARGS filter1, phase_shift, index_incr, frac, index, dst, src
%endif

; advance dst by one sample and src by (index >> phase_shift) samples, and
; keep only the phase bits of index
.skip:
%if ARCH_X86_32
; the loop counter register doubles as phase_shift here; reload it
mov phase_shiftd, phase_shift_stackd
%endif
mov index_incrd, indexd
add dstq, %2
and indexd, phase_mask_stackd
sar index_incrd, phase_shiftb
lea srcq, [srcq+index_incrq*%2]
cmp dstq, dst_endq
jne .loop

%if UNIX64
DEFINE_ARGS ctx, dst, filter2, phase_shift, index, frac, index_incr, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, src, dst_end, filter_bank
%elif WIN64
DEFINE_ARGS ctx, filter2, src, phase_shift, index, frac, index_incr, \
dst_incr_mod, min_filter_count_x4, min_filter_len_x4, \
dst_incr_div, src_incr, dst, dst_end, filter_bank
%else ; x86-32
DEFINE_ARGS filter1, ctx, update_context, frac, index, dst, src
%endif

cmp dword update_context_stackd, 0
jz .skip_store
; strictly speaking, the function should always return the number of
; consumed input samples; however, we only use the value if update_context
; is true, so let's just leave it uninitialized otherwise
; rax = (src - saved src) >> log2_bps; frac and index go back into ctx
mov ctxq, ctx_stackq
movifnidn rax, srcq
mov [ctxq+ResampleContext.frac ], fracd
sub rax, src_stackq
mov [ctxq+ResampleContext.index], indexd
shr rax, %3

.skip_store:
%if ARCH_X86_32
; drop the ten dwords pushed during setup
ADD rsp, 0x28
%endif
RET
%endmacro
 
; Kernel instantiations. RESAMPLE_FNS arguments are: format name, bytes per
; sample, log2(bytes per sample), and for floating point the instruction
; suffix (s/d) plus the label of the matching 1.0 constant.

; single-precision float: SSE always; AVX / FMA3 / FMA4 only when the build
; enables the corresponding external-assembly support
INIT_XMM sse
RESAMPLE_FNS float, 4, 2, s, pf_1

%if HAVE_AVX_EXTERNAL
INIT_YMM avx
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif
%if HAVE_FMA4_EXTERNAL
INIT_XMM fma4
RESAMPLE_FNS float, 4, 2, s, pf_1
%endif

; int16: the MMXEXT version is only assembled for x86-32
%if ARCH_X86_32
INIT_MMX mmxext
RESAMPLE_FNS int16, 2, 1
%endif

INIT_XMM sse2
RESAMPLE_FNS int16, 2, 1
%if HAVE_XOP_EXTERNAL
INIT_XMM xop
RESAMPLE_FNS int16, 2, 1
%endif

; double precision
INIT_XMM sse2
RESAMPLE_FNS double, 8, 3, d, pdbl_1
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswresample/x86/resample_init.c
0,0 → 1,90
/*
* audio resampling
* Copyright (c) 2004-2012 Michael Niedermayer <michaelni@gmx.at>
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
/**
* @file
* audio resampling
* @author Michael Niedermayer <michaelni@gmx.at>
*/
 
#include "libavutil/x86/cpu.h"
#include "libswresample/resample.h"
 
/*
 * Declare the "common" and "linear" resampling kernels for one
 * (sample format, instruction set) pair. The use site supplies the
 * trailing semicolon.
 */
#define RESAMPLE_FUNCS(fmt, isa)                                              \
    int ff_resample_common_##fmt##_##isa(ResampleContext *ctx, void *out,     \
                                         const void *in, int size,            \
                                         int update_ctx);                     \
    int ff_resample_linear_##fmt##_##isa(ResampleContext *ctx, void *out,     \
                                         const void *in, int size,            \
                                         int update_ctx)

/* 16-bit integer kernels */
RESAMPLE_FUNCS(int16,  mmxext);
RESAMPLE_FUNCS(int16,  sse2);
RESAMPLE_FUNCS(int16,  xop);
/* single-precision float kernels */
RESAMPLE_FUNCS(float,  sse);
RESAMPLE_FUNCS(float,  avx);
RESAMPLE_FUNCS(float,  fma3);
RESAMPLE_FUNCS(float,  fma4);
/* double-precision float kernel */
RESAMPLE_FUNCS(double, sse2);
 
/**
 * Install the x86 resampling kernel matching the context's sample format,
 * its linear-interpolation setting and the CPU flags reported by
 * av_get_cpu_flags().
 *
 * Within a format the checks run in a fixed order and each match overwrites
 * the previous choice, so the last matching instruction set wins. If nothing
 * matches, or the format is not one of S16P / FLTP / DBLP, c->dsp.resample
 * is left untouched.
 *
 * @param c  resample context; c->format and c->linear are read,
 *           c->dsp.resample may be written
 */
av_cold void swri_resample_dsp_x86_init(ResampleContext *c)
{
    int av_unused cpu_flags = av_get_cpu_flags();

/* Linear or common kernel for the given format / instruction set. */
#define PICK_KERNEL(fmt, isa)                              \
    (c->linear ? ff_resample_linear_##fmt##_##isa          \
               : ff_resample_common_##fmt##_##isa)

    if (c->format == AV_SAMPLE_FMT_S16P) {
        /* The MMXEXT kernel only exists in 32-bit builds. */
        if (ARCH_X86_32 && EXTERNAL_MMXEXT(cpu_flags))
            c->dsp.resample = PICK_KERNEL(int16, mmxext);
        if (EXTERNAL_SSE2(cpu_flags))
            c->dsp.resample = PICK_KERNEL(int16, sse2);
        if (EXTERNAL_XOP(cpu_flags))
            c->dsp.resample = PICK_KERNEL(int16, xop);
    } else if (c->format == AV_SAMPLE_FMT_FLTP) {
        if (EXTERNAL_SSE(cpu_flags))
            c->dsp.resample = PICK_KERNEL(float, sse);
        if (EXTERNAL_AVX_FAST(cpu_flags))
            c->dsp.resample = PICK_KERNEL(float, avx);
        /* FMA3 is skipped on CPUs flagged as having slow AVX. */
        if (EXTERNAL_FMA3(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_AVXSLOW))
            c->dsp.resample = PICK_KERNEL(float, fma3);
        if (EXTERNAL_FMA4(cpu_flags))
            c->dsp.resample = PICK_KERNEL(float, fma4);
    } else if (c->format == AV_SAMPLE_FMT_DBLP) {
        if (EXTERNAL_SSE2(cpu_flags))
            c->dsp.resample = PICK_KERNEL(double, sse2);
    }

#undef PICK_KERNEL
}
/contrib/sdk/sources/ffmpeg/ffmpeg-2.8/libswresample/x86/w64xmmtest.c
0,0 → 1,29
/*
* check XMM registers for clobbers on Win64
* Copyright (c) 2013 Martin Storsjo
*
* This file is part of FFmpeg.
*
* FFmpeg is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2.1 of the License, or (at your option) any later version.
*
* FFmpeg is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with FFmpeg; if not, write to the Free Software
* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/
 
#include "libswresample/swresample.h"
#include "libavutil/x86/w64xmmtest.h"
 
/*
 * Wrapper around swr_convert(), built from the wrap() and testxmmclobbers()
 * helpers that come from libavutil/x86/w64xmmtest.h. Per this file's header
 * its purpose is to check XMM registers for clobbers on Win64. All five
 * arguments are forwarded unchanged.
 */
wrap(swr_convert(struct SwrContext *s, uint8_t **out, int out_count,
const uint8_t **in , int in_count))
{
testxmmclobbers(swr_convert, s, out, out_count, in, in_count);
}