WebSVN – Kolibri OS – Blame – /contrib/sdk/sources/ffmpeg/libavcodec/x86/mpegaudiodsp.c

Rev	Author	Line No.	Line
4349	Serge	1	/*
		2	* MMX optimized MP3 decoding functions
		3	* Copyright (c) 2010 Vitor Sessak
		4	*
		5	* This file is part of FFmpeg.
		6	*
		7	* FFmpeg is free software; you can redistribute it and/or
		8	* modify it under the terms of the GNU Lesser General Public
		9	* License as published by the Free Software Foundation; either
		10	* version 2.1 of the License, or (at your option) any later version.
		11	*
		12	* FFmpeg is distributed in the hope that it will be useful,
		13	* but WITHOUT ANY WARRANTY; without even the implied warranty of
		14	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
		15	* Lesser General Public License for more details.
		16	*
		17	* You should have received a copy of the GNU Lesser General Public
		18	* License along with FFmpeg; if not, write to the Free Software
		19	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
		20	*/
		21
		22	#include "libavutil/attributes.h"
		23	#include "libavutil/cpu.h"
		24	#include "libavutil/internal.h"
		25	#include "libavutil/x86/asm.h"
		26	#include "libavutil/x86/cpu.h"
		27	#include "libavcodec/mpegaudiodsp.h"
		28
		/*
		 * For one instruction-set suffix CPU, declare:
		 *   - imdct36_blocks_<CPU>:  the static C wrapper that DECL_IMDCT_BLOCKS
		 *                            defines further down in this file;
		 *   - ff_imdct36_float_<CPU>: the single-block routine that wrapper calls.
		 *                            It is not defined in this file (presumably it
		 *                            lives in the assembler sources -- confirm).
		 *
		 * Every buffer argument is a pointer to float.
		 */
		#define DECL(CPU)\
		static void imdct36_blocks_ ## CPU(float *out, float *buf, float *in, int count, int switch_point, int block_type);\
		void ff_imdct36_float_ ## CPU(float *out, float *buf, float *in, float *win);

		DECL(sse)
		DECL(sse2)
		DECL(sse3)
		DECL(ssse3)
		DECL(avx)
		38
		/*
		 * Four-blocks-per-call routines used by the imdct36_blocks_* wrappers.
		 * They are only declared here; the definitions are outside this file.
		 * All five arguments are float pointers; tmpbuf is scratch space supplied
		 * by the caller (the wrappers pass a 16-byte aligned array of 1024 floats).
		 */
		void ff_four_imdct36_float_sse(float *out, float *buf, float *in, float *win,
		                               float *tmpbuf);
		void ff_four_imdct36_float_avx(float *out, float *buf, float *in, float *win,
		                               float *tmpbuf);
		43
		44	DECLARE_ALIGNED(16, static float, mdct_win_sse)[2][4][4*40];
		45
		46	#if HAVE_SSE2_INLINE
		47
		/* Multiply-accumulate: rt += ra * rb. */
		#define MACS(rt, ra, rb) rt += (ra) * (rb)
		/* Multiply-subtract: rt -= ra * rb. */
		#define MLSS(rt, ra, rb) rt -= (ra) * (rb)

		/*
		 * Apply op (MACS or MLSS) to sum for eight (w, p) element pairs taken at
		 * a stride of 64 elements: indices 0, 64, 128, ..., 448, in that order.
		 * w and p are evaluated once per pair.
		 */
		#define SUM8(op, sum, w, p)          \
		{                                    \
		    op(sum, (w)[  0], (p)[  0]);     \
		    op(sum, (w)[ 64], (p)[ 64]);     \
		    op(sum, (w)[128], (p)[128]);     \
		    op(sum, (w)[192], (p)[192]);     \
		    op(sum, (w)[256], (p)[256]);     \
		    op(sum, (w)[320], (p)[320]);     \
		    op(sum, (w)[384], (p)[384]);     \
		    op(sum, (w)[448], (p)[448]);     \
		}
		62
		63	static void apply_window(const float buf, const float win1,
		64	const float win2, float sum1, float *sum2, int len)
		65	{
		66	x86_reg count = - 4*len;
		67	const float *win1a = win1+len;
		68	const float *win2a = win2+len;
		69	const float *bufa = buf+len;
		70	float *sum1a = sum1+len;
		71	float *sum2a = sum2+len;
		72
		73
		74	#define MULT(a, b) \
		75	"movaps " #a "(%1,%0), %%xmm1 \n\t" \
		76	"movaps " #a "(%3,%0), %%xmm2 \n\t" \
		77	"mulps %%xmm2, %%xmm1 \n\t" \
		78	"subps %%xmm1, %%xmm0 \n\t" \
		79	"mulps " #b "(%2,%0), %%xmm2 \n\t" \
		80	"subps %%xmm2, %%xmm4 \n\t" \
		81
		82	__asm__ volatile(
		83	"1: \n\t"
		84	"xorps %%xmm0, %%xmm0 \n\t"
		85	"xorps %%xmm4, %%xmm4 \n\t"
		86
		87	MULT( 0, 0)
		88	MULT( 256, 64)
		89	MULT( 512, 128)
		90	MULT( 768, 192)
		91	MULT(1024, 256)
		92	MULT(1280, 320)
		93	MULT(1536, 384)
		94	MULT(1792, 448)
		95
		96	"movaps %%xmm0, (%4,%0) \n\t"
		97	"movaps %%xmm4, (%5,%0) \n\t"
		98	"add $16, %0 \n\t"
		99	"jl 1b \n\t"
		100	:"+&r"(count)
		101	:"r"(win1a), "r"(win2a), "r"(bufa), "r"(sum1a), "r"(sum2a)
		102	);
		103
		104	#undef MULT
		105	}
		106
		107	static void apply_window_mp3(float in, float win, int unused, float out,
		108	int incr)
		109	{
		110	LOCAL_ALIGNED_16(float, suma, [17]);
		111	LOCAL_ALIGNED_16(float, sumb, [17]);
		112	LOCAL_ALIGNED_16(float, sumc, [17]);
		113	LOCAL_ALIGNED_16(float, sumd, [17]);
		114
		115	float sum;
		116
		117	/* copy to avoid wrap */
		118	__asm__ volatile(
		119	"movaps 0(%0), %%xmm0 \n\t" \
		120	"movaps 16(%0), %%xmm1 \n\t" \
		121	"movaps 32(%0), %%xmm2 \n\t" \
		122	"movaps 48(%0), %%xmm3 \n\t" \
		123	"movaps %%xmm0, 0(%1) \n\t" \
		124	"movaps %%xmm1, 16(%1) \n\t" \
		125	"movaps %%xmm2, 32(%1) \n\t" \
		126	"movaps %%xmm3, 48(%1) \n\t" \
		127	"movaps 64(%0), %%xmm0 \n\t" \
		128	"movaps 80(%0), %%xmm1 \n\t" \
		129	"movaps 96(%0), %%xmm2 \n\t" \
		130	"movaps 112(%0), %%xmm3 \n\t" \
		131	"movaps %%xmm0, 64(%1) \n\t" \
		132	"movaps %%xmm1, 80(%1) \n\t" \
		133	"movaps %%xmm2, 96(%1) \n\t" \
		134	"movaps %%xmm3, 112(%1) \n\t"
		135	::"r"(in), "r"(in+512)
		136	:"memory"
		137	);
		138
		139	apply_window(in + 16, win , win + 512, suma, sumc, 16);
		140	apply_window(in + 32, win + 48, win + 640, sumb, sumd, 16);
		141
		142	SUM8(MACS, suma[0], win + 32, in + 48);
		143
		144	sumc[ 0] = 0;
		145	sumb[16] = 0;
		146	sumd[16] = 0;
		147
		148	#define SUMS(suma, sumb, sumc, sumd, out1, out2) \
		149	"movups " #sumd "(%4), %%xmm0 \n\t" \
		150	"shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
		151	"subps " #suma "(%1), %%xmm0 \n\t" \
		152	"movaps %%xmm0," #out1 "(%0) \n\t" \
		153	\
		154	"movups " #sumc "(%3), %%xmm0 \n\t" \
		155	"shufps $0x1b, %%xmm0, %%xmm0 \n\t" \
		156	"addps " #sumb "(%2), %%xmm0 \n\t" \
		157	"movaps %%xmm0," #out2 "(%0) \n\t"
		158
		159	if (incr == 1) {
		160	__asm__ volatile(
		161	SUMS( 0, 48, 4, 52, 0, 112)
		162	SUMS(16, 32, 20, 36, 16, 96)
		163	SUMS(32, 16, 36, 20, 32, 80)
		164	SUMS(48, 0, 52, 4, 48, 64)
		165
		166	:"+&r"(out)
		167	:"r"(&suma[0]), "r"(&sumb[0]), "r"(&sumc[0]), "r"(&sumd[0])
		168	:"memory"
		169	);
		170	out += 16*incr;
		171	} else {
		172	int j;
		173	float out2 = out + 32 incr;
		174	out[0 ] = -suma[ 0];
		175	out += incr;
		176	out2 -= incr;
		177	for(j=1;j<16;j++) {
		178	*out = -suma[ j] + sumd[16-j];
		179	*out2 = sumb[16-j] + sumc[ j];
		180	out += incr;
		181	out2 -= incr;
		182	}
		183	}
		184
		185	sum = 0;
		186	SUM8(MLSS, sum, win + 16 + 32, in + 32);
		187	*out = sum;
		188	}
		189
		190	#endif /* HAVE_SSE2_INLINE */
		191
		192	#if HAVE_YASM
		/*
		 * Define imdct36_blocks_<CPU1>(out, buf, in, count, switch_point, block_type).
		 *
		 * The generated function processes count blocks of 18 input floats:
		 *   - as many groups of four blocks as fit go through
		 *     ff_four_imdct36_float_<CPU2> with a window from mdct_win_sse;
		 *   - the remaining 0..3 blocks go one at a time through
		 *     ff_imdct36_float_<CPU1> with a window from ff_mdct_win_float.
		 *
		 * The pointer advances are exactly those of the code below: the four-block
		 * path moves in and buf by 4 * 18 and out by 4; the single-block path moves
		 * in by 18 and buf/out by 1.
		 */
		#define DECL_IMDCT_BLOCKS(CPU1, CPU2)                                       \
		static void imdct36_blocks_ ## CPU1(float *out, float *buf, float *in,      \
		                               int count, int switch_point, int block_type) \
		{                                                                           \
		    int align_end = count - (count & 3);                                    \
		    int j;                                                                  \
		    for (j = 0; j < align_end; j+= 4) {                                     \
		        LOCAL_ALIGNED_16(float, tmpbuf, [1024]);                            \
		        /* select window: set [1] only for the first group (j < 4) when */  \
		        /* switch_point is non-zero, otherwise set [0]                  */  \
		        float *win = mdct_win_sse[switch_point && j < 4][block_type];       \
		                                                                            \
		        /* apply window & overlap with previous buffer */                   \
		        ff_four_imdct36_float_ ## CPU2(out, buf, in, win, tmpbuf);          \
		        in      += 4*18;                                                    \
		        buf     += 4*18;                                                    \
		        out     += 4;                                                       \
		    }                                                                       \
		    for (; j < count; j++) {                                                \
		        /* select window: type 0 for the first two blocks when          */  \
		        /* switch_point is non-zero; odd blocks use the entry 4 further */  \
		        int win_idx = (switch_point && j < 2) ? 0 : block_type;             \
		        float *win = ff_mdct_win_float[win_idx + (4 & -(j & 1))];           \
		                                                                            \
		        /* apply window & overlap with previous buffer */                   \
		        ff_imdct36_float_ ## CPU1(out, buf, in, win);                       \
		                                                                            \
		        in  += 18;                                                          \
		        buf++;                                                              \
		        out++;                                                              \
		    }                                                                       \
		}

		/* All SSE-family wrappers share the SSE four-block routine; only the
		 * single-block routine differs per instruction set. */
		#if HAVE_SSE
		DECL_IMDCT_BLOCKS(sse,sse)
		DECL_IMDCT_BLOCKS(sse2,sse)
		DECL_IMDCT_BLOCKS(sse3,sse)
		DECL_IMDCT_BLOCKS(ssse3,sse)
		#endif
		#if HAVE_AVX_EXTERNAL
		DECL_IMDCT_BLOCKS(avx,avx)
		#endif
		234	#endif /* HAVE_YASM */
		235
		236	av_cold void ff_mpadsp_init_x86(MPADSPContext *s)
		237	{
		238	int cpu_flags = av_get_cpu_flags();
		239
		240	int i, j;
		241	for (j = 0; j < 4; j++) {
		242	for (i = 0; i < 40; i ++) {
		243	mdct_win_sse[0][j][4*i ] = ff_mdct_win_float[j ][i];
		244	mdct_win_sse[0][j][4*i + 1] = ff_mdct_win_float[j + 4][i];
		245	mdct_win_sse[0][j][4*i + 2] = ff_mdct_win_float[j ][i];
		246	mdct_win_sse[0][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
		247	mdct_win_sse[1][j][4*i ] = ff_mdct_win_float[0 ][i];
		248	mdct_win_sse[1][j][4*i + 1] = ff_mdct_win_float[4 ][i];
		249	mdct_win_sse[1][j][4*i + 2] = ff_mdct_win_float[j ][i];
		250	mdct_win_sse[1][j][4*i + 3] = ff_mdct_win_float[j + 4][i];
		251	}
		252	}
		253
		254	#if HAVE_SSE2_INLINE
		255	if (cpu_flags & AV_CPU_FLAG_SSE2) {
		256	s->apply_window_float = apply_window_mp3;
		257	}
		258	#endif /* HAVE_SSE2_INLINE */
		259
		260	#if HAVE_YASM
		261	if (EXTERNAL_SSE(cpu_flags)) {
		262	s->imdct36_blocks_float = imdct36_blocks_sse;
		263	}
		264	if (EXTERNAL_SSE2(cpu_flags)) {
		265	s->imdct36_blocks_float = imdct36_blocks_sse2;
		266	}
		267	if (EXTERNAL_SSE3(cpu_flags)) {
		268	s->imdct36_blocks_float = imdct36_blocks_sse3;
		269	}
		270	if (EXTERNAL_SSSE3(cpu_flags)) {
		271	s->imdct36_blocks_float = imdct36_blocks_ssse3;
		272	}
		273	if (EXTERNAL_AVX(cpu_flags)) {
		274	s->imdct36_blocks_float = imdct36_blocks_avx;
		275	}
		276	#endif /* HAVE_YASM */
		277	}

Subversion Repositories Kolibri OS

(root)/contrib/sdk/sources/ffmpeg/libavcodec/x86/mpegaudiodsp.c – Rev 4871