WebSVN – Kolibri OS – Blame – /contrib/sdk/sources/ffmpeg/libavcodec/x86/fdct.c

Rev	Author	Line No.	Line
4349	Serge	1	/*
		2	* MMX optimized forward DCT
		3	* The gcc porting is Copyright (c) 2001 Fabrice Bellard.
		4	* cleanup/optimizations are Copyright (c) 2002-2004 Michael Niedermayer
		5	* SSE2 optimization is Copyright (c) 2004 Denes Balatoni.
		6	*
		7	* from fdctam32.c - AP922 MMX(3D-Now) forward-DCT
		8	*
		9	* Intel Application Note AP-922 - fast, precise implementation of DCT
		10	* http://developer.intel.com/vtune/cbts/appnotes.htm
		11	*
		12	* Also of inspiration:
		13	* a page about fdct at http://www.geocities.com/ssavekar/dct.htm
		14	* Skal's fdct at http://skal.planet-d.net/coding/dct.html
		15	*
		16	* This file is part of FFmpeg.
		17	*
		18	* FFmpeg is free software; you can redistribute it and/or
		19	* modify it under the terms of the GNU Lesser General Public
		20	* License as published by the Free Software Foundation; either
		21	* version 2.1 of the License, or (at your option) any later version.
		22	*
		23	* FFmpeg is distributed in the hope that it will be useful,
		24	* but WITHOUT ANY WARRANTY; without even the implied warranty of
		25	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
		26	* Lesser General Public License for more details.
		27	*
		28	* You should have received a copy of the GNU Lesser General Public
		29	* License along with FFmpeg; if not, write to the Free Software
		30	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
		31	*/
		32
		33	#include "libavutil/common.h"
		34	#include "libavutil/x86/asm.h"
		35	#include "libavcodec/dct.h"
		36
		37	#if HAVE_MMX_INLINE
		38
		39	//////////////////////////////////////////////////////////////////////
		40	//
		41	// constants for the forward DCT
		42	// -----------------------------
		43	//
		44	// Be sure to check that your compiler is aligning all constants to QWORD
		45	// (8-byte) memory boundaries! Otherwise the unaligned memory access will
		46	// severely stall MMX execution.
		47	//
		48	//////////////////////////////////////////////////////////////////////
		49
		50	#define BITS_FRW_ACC 3 //; 2 or 3 for accuracy
		51	#define SHIFT_FRW_COL BITS_FRW_ACC
		52	#define SHIFT_FRW_ROW (BITS_FRW_ACC + 17 - 3)
		53	#define RND_FRW_ROW (1 << (SHIFT_FRW_ROW-1))
		54	//#define RND_FRW_COL (1 << (SHIFT_FRW_COL-1))
		55
		56	#define X8(x) x,x,x,x,x,x,x,x
		57
		58	//concatenated table, for forward DCT transformation
		59	DECLARE_ALIGNED(16, static const int16_t, fdct_tg_all_16)[24] = {
		60	X8(13036), // tg * (2<<16) + 0.5
		61	X8(27146), // tg * (2<<16) + 0.5
		62	X8(-21746) // tg * (2<<16) + 0.5
		63	};
		64
		65	DECLARE_ALIGNED(16, static const int16_t, ocos_4_16)[8] = {
		66	X8(23170) //cos * (2<<15) + 0.5
		67	};
		68
		69	DECLARE_ALIGNED(16, static const int16_t, fdct_one_corr)[8] = { X8(1) };
		70
		71	DECLARE_ALIGNED(8, static const int32_t, fdct_r_row)[2] = {RND_FRW_ROW, RND_FRW_ROW };
		72
		73	static const struct
		74	{
		75	DECLARE_ALIGNED(16, const int32_t, fdct_r_row_sse2)[4];
		76	} fdct_r_row_sse2 =
		77	{{
		78	RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW
		79	}};
		80	//DECLARE_ALIGNED(16, static const long, fdct_r_row_sse2)[4] = {RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW, RND_FRW_ROW};
		81
		82	DECLARE_ALIGNED(8, static const int16_t, tab_frw_01234567)[] = { // forward_dct coeff table
		83	16384, 16384, 22725, 19266,
		84	16384, 16384, 12873, 4520,
		85	21407, 8867, 19266, -4520,
		86	-8867, -21407, -22725, -12873,
		87	16384, -16384, 12873, -22725,
		88	-16384, 16384, 4520, 19266,
		89	8867, -21407, 4520, -12873,
		90	21407, -8867, 19266, -22725,
		91
		92	22725, 22725, 31521, 26722,
		93	22725, 22725, 17855, 6270,
		94	29692, 12299, 26722, -6270,
		95	-12299, -29692, -31521, -17855,
		96	22725, -22725, 17855, -31521,
		97	-22725, 22725, 6270, 26722,
		98	12299, -29692, 6270, -17855,
		99	29692, -12299, 26722, -31521,
		100
		101	21407, 21407, 29692, 25172,
		102	21407, 21407, 16819, 5906,
		103	27969, 11585, 25172, -5906,
		104	-11585, -27969, -29692, -16819,
		105	21407, -21407, 16819, -29692,
		106	-21407, 21407, 5906, 25172,
		107	11585, -27969, 5906, -16819,
		108	27969, -11585, 25172, -29692,
		109
		110	19266, 19266, 26722, 22654,
		111	19266, 19266, 15137, 5315,
		112	25172, 10426, 22654, -5315,
		113	-10426, -25172, -26722, -15137,
		114	19266, -19266, 15137, -26722,
		115	-19266, 19266, 5315, 22654,
		116	10426, -25172, 5315, -15137,
		117	25172, -10426, 22654, -26722,
		118
		119	16384, 16384, 22725, 19266,
		120	16384, 16384, 12873, 4520,
		121	21407, 8867, 19266, -4520,
		122	-8867, -21407, -22725, -12873,
		123	16384, -16384, 12873, -22725,
		124	-16384, 16384, 4520, 19266,
		125	8867, -21407, 4520, -12873,
		126	21407, -8867, 19266, -22725,
		127
		128	19266, 19266, 26722, 22654,
		129	19266, 19266, 15137, 5315,
		130	25172, 10426, 22654, -5315,
		131	-10426, -25172, -26722, -15137,
		132	19266, -19266, 15137, -26722,
		133	-19266, 19266, 5315, 22654,
		134	10426, -25172, 5315, -15137,
		135	25172, -10426, 22654, -26722,
		136
		137	21407, 21407, 29692, 25172,
		138	21407, 21407, 16819, 5906,
		139	27969, 11585, 25172, -5906,
		140	-11585, -27969, -29692, -16819,
		141	21407, -21407, 16819, -29692,
		142	-21407, 21407, 5906, 25172,
		143	11585, -27969, 5906, -16819,
		144	27969, -11585, 25172, -29692,
		145
		146	22725, 22725, 31521, 26722,
		147	22725, 22725, 17855, 6270,
		148	29692, 12299, 26722, -6270,
		149	-12299, -29692, -31521, -17855,
		150	22725, -22725, 17855, -31521,
		151	-22725, 22725, 6270, 26722,
		152	12299, -29692, 6270, -17855,
		153	29692, -12299, 26722, -31521,
		154	};
		155
		156	static const struct
		157	{
		158	DECLARE_ALIGNED(16, const int16_t, tab_frw_01234567_sse2)[256];
		159	} tab_frw_01234567_sse2 =
		160	{{
		161	//DECLARE_ALIGNED(16, static const int16_t, tab_frw_01234567_sse2)[] = { // forward_dct coeff table
		162	#define TABLE_SSE2 C4, C4, C1, C3, -C6, -C2, -C1, -C5, \
		163	C4, C4, C5, C7, C2, C6, C3, -C7, \
		164	-C4, C4, C7, C3, C6, -C2, C7, -C5, \
		165	C4, -C4, C5, -C1, C2, -C6, C3, -C1,
		166	// c1..c7 * cos(pi/4) * 2^15
		167	#define C1 22725
		168	#define C2 21407
		169	#define C3 19266
		170	#define C4 16384
		171	#define C5 12873
		172	#define C6 8867
		173	#define C7 4520
		174	TABLE_SSE2
		175
		176	#undef C1
		177	#undef C2
		178	#undef C3
		179	#undef C4
		180	#undef C5
		181	#undef C6
		182	#undef C7
		183	#define C1 31521
		184	#define C2 29692
		185	#define C3 26722
		186	#define C4 22725
		187	#define C5 17855
		188	#define C6 12299
		189	#define C7 6270
		190	TABLE_SSE2
		191
		192	#undef C1
		193	#undef C2
		194	#undef C3
		195	#undef C4
		196	#undef C5
		197	#undef C6
		198	#undef C7
		199	#define C1 29692
		200	#define C2 27969
		201	#define C3 25172
		202	#define C4 21407
		203	#define C5 16819
		204	#define C6 11585
		205	#define C7 5906
		206	TABLE_SSE2
		207
		208	#undef C1
		209	#undef C2
		210	#undef C3
		211	#undef C4
		212	#undef C5
		213	#undef C6
		214	#undef C7
		215	#define C1 26722
		216	#define C2 25172
		217	#define C3 22654
		218	#define C4 19266
		219	#define C5 15137
		220	#define C6 10426
		221	#define C7 5315
		222	TABLE_SSE2
		223
		224	#undef C1
		225	#undef C2
		226	#undef C3
		227	#undef C4
		228	#undef C5
		229	#undef C6
		230	#undef C7
		231	#define C1 22725
		232	#define C2 21407
		233	#define C3 19266
		234	#define C4 16384
		235	#define C5 12873
		236	#define C6 8867
		237	#define C7 4520
		238	TABLE_SSE2
		239
		240	#undef C1
		241	#undef C2
		242	#undef C3
		243	#undef C4
		244	#undef C5
		245	#undef C6
		246	#undef C7
		247	#define C1 26722
		248	#define C2 25172
		249	#define C3 22654
		250	#define C4 19266
		251	#define C5 15137
		252	#define C6 10426
		253	#define C7 5315
		254	TABLE_SSE2
		255
		256	#undef C1
		257	#undef C2
		258	#undef C3
		259	#undef C4
		260	#undef C5
		261	#undef C6
		262	#undef C7
		263	#define C1 29692
		264	#define C2 27969
		265	#define C3 25172
		266	#define C4 21407
		267	#define C5 16819
		268	#define C6 11585
		269	#define C7 5906
		270	TABLE_SSE2
		271
		272	#undef C1
		273	#undef C2
		274	#undef C3
		275	#undef C4
		276	#undef C5
		277	#undef C6
		278	#undef C7
		279	#define C1 31521
		280	#define C2 29692
		281	#define C3 26722
		282	#define C4 22725
		283	#define C5 17855
		284	#define C6 12299
		285	#define C7 6270
		286	TABLE_SSE2
		287	}};
		288
		289	#define S(s) AV_TOSTRING(s) //AV_STRINGIFY is too long
		290
		291	#define FDCT_COL(cpu, mm, mov)\
		292	static av_always_inline void fdct_col_##cpu(const int16_t in, int16_t out, int offset)\
		293	{\
		294	__asm__ volatile (\
		295	#mov" 16(%0), %%"#mm"0 \n\t" \
		296	#mov" 96(%0), %%"#mm"1 \n\t" \
		297	#mov" %%"#mm"0, %%"#mm"2 \n\t" \
		298	#mov" 32(%0), %%"#mm"3 \n\t" \
		299	"paddsw %%"#mm"1, %%"#mm"0 \n\t" \
		300	#mov" 80(%0), %%"#mm"4 \n\t" \
		301	"psllw $"S(SHIFT_FRW_COL)", %%"#mm"0 \n\t" \
		302	#mov" (%0), %%"#mm"5 \n\t" \
		303	"paddsw %%"#mm"3, %%"#mm"4 \n\t" \
		304	"paddsw 112(%0), %%"#mm"5 \n\t" \
		305	"psllw $"S(SHIFT_FRW_COL)", %%"#mm"4 \n\t" \
		306	#mov" %%"#mm"0, %%"#mm"6 \n\t" \
		307	"psubsw %%"#mm"1, %%"#mm"2 \n\t" \
		308	#mov" 16(%1), %%"#mm"1 \n\t" \
		309	"psubsw %%"#mm"4, %%"#mm"0 \n\t" \
		310	#mov" 48(%0), %%"#mm"7 \n\t" \
		311	"pmulhw %%"#mm"0, %%"#mm"1 \n\t" \
		312	"paddsw 64(%0), %%"#mm"7 \n\t" \
		313	"psllw $"S(SHIFT_FRW_COL)", %%"#mm"5 \n\t" \
		314	"paddsw %%"#mm"4, %%"#mm"6 \n\t" \
		315	"psllw $"S(SHIFT_FRW_COL)", %%"#mm"7 \n\t" \
		316	#mov" %%"#mm"5, %%"#mm"4 \n\t" \
		317	"psubsw %%"#mm"7, %%"#mm"5 \n\t" \
		318	"paddsw %%"#mm"5, %%"#mm"1 \n\t" \
		319	"paddsw %%"#mm"7, %%"#mm"4 \n\t" \
		320	"por (%2), %%"#mm"1 \n\t" \
		321	"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"2 \n\t" \
		322	"pmulhw 16(%1), %%"#mm"5 \n\t" \
		323	#mov" %%"#mm"4, %%"#mm"7 \n\t" \
		324	"psubsw 80(%0), %%"#mm"3 \n\t" \
		325	"psubsw %%"#mm"6, %%"#mm"4 \n\t" \
		326	#mov" %%"#mm"1, 32(%3) \n\t" \
		327	"paddsw %%"#mm"6, %%"#mm"7 \n\t" \
		328	#mov" 48(%0), %%"#mm"1 \n\t" \
		329	"psllw $"S(SHIFT_FRW_COL)"+1, %%"#mm"3 \n\t" \
		330	"psubsw 64(%0), %%"#mm"1 \n\t" \
		331	#mov" %%"#mm"2, %%"#mm"6 \n\t" \
		332	#mov" %%"#mm"4, 64(%3) \n\t" \
		333	"paddsw %%"#mm"3, %%"#mm"2 \n\t" \
		334	"pmulhw (%4), %%"#mm"2 \n\t" \
		335	"psubsw %%"#mm"3, %%"#mm"6 \n\t" \
		336	"pmulhw (%4), %%"#mm"6 \n\t" \
		337	"psubsw %%"#mm"0, %%"#mm"5 \n\t" \
		338	"por (%2), %%"#mm"5 \n\t" \
		339	"psllw $"S(SHIFT_FRW_COL)", %%"#mm"1 \n\t" \
		340	"por (%2), %%"#mm"2 \n\t" \
		341	#mov" %%"#mm"1, %%"#mm"4 \n\t" \
		342	#mov" (%0), %%"#mm"3 \n\t" \
		343	"paddsw %%"#mm"6, %%"#mm"1 \n\t" \
		344	"psubsw 112(%0), %%"#mm"3 \n\t" \
		345	"psubsw %%"#mm"6, %%"#mm"4 \n\t" \
		346	#mov" (%1), %%"#mm"0 \n\t" \
		347	"psllw $"S(SHIFT_FRW_COL)", %%"#mm"3 \n\t" \
		348	#mov" 32(%1), %%"#mm"6 \n\t" \
		349	"pmulhw %%"#mm"1, %%"#mm"0 \n\t" \
		350	#mov" %%"#mm"7, (%3) \n\t" \
		351	"pmulhw %%"#mm"4, %%"#mm"6 \n\t" \
		352	#mov" %%"#mm"5, 96(%3) \n\t" \
		353	#mov" %%"#mm"3, %%"#mm"7 \n\t" \
		354	#mov" 32(%1), %%"#mm"5 \n\t" \
		355	"psubsw %%"#mm"2, %%"#mm"7 \n\t" \
		356	"paddsw %%"#mm"2, %%"#mm"3 \n\t" \
		357	"pmulhw %%"#mm"7, %%"#mm"5 \n\t" \
		358	"paddsw %%"#mm"3, %%"#mm"0 \n\t" \
		359	"paddsw %%"#mm"4, %%"#mm"6 \n\t" \
		360	"pmulhw (%1), %%"#mm"3 \n\t" \
		361	"por (%2), %%"#mm"0 \n\t" \
		362	"paddsw %%"#mm"7, %%"#mm"5 \n\t" \
		363	"psubsw %%"#mm"6, %%"#mm"7 \n\t" \
		364	#mov" %%"#mm"0, 16(%3) \n\t" \
		365	"paddsw %%"#mm"4, %%"#mm"5 \n\t" \
		366	#mov" %%"#mm"7, 48(%3) \n\t" \
		367	"psubsw %%"#mm"1, %%"#mm"3 \n\t" \
		368	#mov" %%"#mm"5, 80(%3) \n\t" \
		369	#mov" %%"#mm"3, 112(%3) \n\t" \
		370	: \
		371	: "r" (in + offset), "r" (fdct_tg_all_16), "r" (fdct_one_corr), \
		372	"r" (out + offset), "r" (ocos_4_16)); \
		373	}
		374
		375	FDCT_COL(mmx, mm, movq)
		376	FDCT_COL(sse2, xmm, movdqa)
		377
		378	static av_always_inline void fdct_row_sse2(const int16_t in, int16_t out)
		379	{
		380	__asm__ volatile(
		381	#define FDCT_ROW_SSE2_H1(i,t) \
		382	"movq " #i "(%0), %%xmm2 \n\t" \
		383	"movq " #i "+8(%0), %%xmm0 \n\t" \
		384	"movdqa " #t "+32(%1), %%xmm3 \n\t" \
		385	"movdqa " #t "+48(%1), %%xmm7 \n\t" \
		386	"movdqa " #t "(%1), %%xmm4 \n\t" \
		387	"movdqa " #t "+16(%1), %%xmm5 \n\t"
		388
		389	#define FDCT_ROW_SSE2_H2(i,t) \
		390	"movq " #i "(%0), %%xmm2 \n\t" \
		391	"movq " #i "+8(%0), %%xmm0 \n\t" \
		392	"movdqa " #t "+32(%1), %%xmm3 \n\t" \
		393	"movdqa " #t "+48(%1), %%xmm7 \n\t"
		394
		395	#define FDCT_ROW_SSE2(i) \
		396	"movq %%xmm2, %%xmm1 \n\t" \
		397	"pshuflw $27, %%xmm0, %%xmm0 \n\t" \
		398	"paddsw %%xmm0, %%xmm1 \n\t" \
		399	"psubsw %%xmm0, %%xmm2 \n\t" \
		400	"punpckldq %%xmm2, %%xmm1 \n\t" \
		401	"pshufd $78, %%xmm1, %%xmm2 \n\t" \
		402	"pmaddwd %%xmm2, %%xmm3 \n\t" \
		403	"pmaddwd %%xmm1, %%xmm7 \n\t" \
		404	"pmaddwd %%xmm5, %%xmm2 \n\t" \
		405	"pmaddwd %%xmm4, %%xmm1 \n\t" \
		406	"paddd %%xmm7, %%xmm3 \n\t" \
		407	"paddd %%xmm2, %%xmm1 \n\t" \
		408	"paddd %%xmm6, %%xmm3 \n\t" \
		409	"paddd %%xmm6, %%xmm1 \n\t" \
		410	"psrad %3, %%xmm3 \n\t" \
		411	"psrad %3, %%xmm1 \n\t" \
		412	"packssdw %%xmm3, %%xmm1 \n\t" \
		413	"movdqa %%xmm1, " #i "(%4) \n\t"
		414
		415	"movdqa (%2), %%xmm6 \n\t"
		416	FDCT_ROW_SSE2_H1(0,0)
		417	FDCT_ROW_SSE2(0)
		418	FDCT_ROW_SSE2_H2(64,0)
		419	FDCT_ROW_SSE2(64)
		420
		421	FDCT_ROW_SSE2_H1(16,64)
		422	FDCT_ROW_SSE2(16)
		423	FDCT_ROW_SSE2_H2(112,64)
		424	FDCT_ROW_SSE2(112)
		425
		426	FDCT_ROW_SSE2_H1(32,128)
		427	FDCT_ROW_SSE2(32)
		428	FDCT_ROW_SSE2_H2(96,128)
		429	FDCT_ROW_SSE2(96)
		430
		431	FDCT_ROW_SSE2_H1(48,192)
		432	FDCT_ROW_SSE2(48)
		433	FDCT_ROW_SSE2_H2(80,192)
		434	FDCT_ROW_SSE2(80)
		435	:
		436	: "r" (in), "r" (tab_frw_01234567_sse2.tab_frw_01234567_sse2),
		437	"r" (fdct_r_row_sse2.fdct_r_row_sse2), "i" (SHIFT_FRW_ROW), "r" (out)
		438	XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
		439	"%xmm4", "%xmm5", "%xmm6", "%xmm7")
		440	);
		441	}
		442
		443	static av_always_inline void fdct_row_mmxext(const int16_t in, int16_t out,
		444	const int16_t *table)
		445	{
		446	__asm__ volatile (
		447	"pshufw $0x1B, 8(%0), %%mm5 \n\t"
		448	"movq (%0), %%mm0 \n\t"
		449	"movq %%mm0, %%mm1 \n\t"
		450	"paddsw %%mm5, %%mm0 \n\t"
		451	"psubsw %%mm5, %%mm1 \n\t"
		452	"movq %%mm0, %%mm2 \n\t"
		453	"punpckldq %%mm1, %%mm0 \n\t"
		454	"punpckhdq %%mm1, %%mm2 \n\t"
		455	"movq (%1), %%mm1 \n\t"
		456	"movq 8(%1), %%mm3 \n\t"
		457	"movq 16(%1), %%mm4 \n\t"
		458	"movq 24(%1), %%mm5 \n\t"
		459	"movq 32(%1), %%mm6 \n\t"
		460	"movq 40(%1), %%mm7 \n\t"
		461	"pmaddwd %%mm0, %%mm1 \n\t"
		462	"pmaddwd %%mm2, %%mm3 \n\t"
		463	"pmaddwd %%mm0, %%mm4 \n\t"
		464	"pmaddwd %%mm2, %%mm5 \n\t"
		465	"pmaddwd %%mm0, %%mm6 \n\t"
		466	"pmaddwd %%mm2, %%mm7 \n\t"
		467	"pmaddwd 48(%1), %%mm0 \n\t"
		468	"pmaddwd 56(%1), %%mm2 \n\t"
		469	"paddd %%mm1, %%mm3 \n\t"
		470	"paddd %%mm4, %%mm5 \n\t"
		471	"paddd %%mm6, %%mm7 \n\t"
		472	"paddd %%mm0, %%mm2 \n\t"
		473	"movq (%2), %%mm0 \n\t"
		474	"paddd %%mm0, %%mm3 \n\t"
		475	"paddd %%mm0, %%mm5 \n\t"
		476	"paddd %%mm0, %%mm7 \n\t"
		477	"paddd %%mm0, %%mm2 \n\t"
		478	"psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
		479	"psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
		480	"psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
		481	"psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
		482	"packssdw %%mm5, %%mm3 \n\t"
		483	"packssdw %%mm2, %%mm7 \n\t"
		484	"movq %%mm3, (%3) \n\t"
		485	"movq %%mm7, 8(%3) \n\t"
		486	:
		487	: "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
		488	}
		489
		490	static av_always_inline void fdct_row_mmx(const int16_t in, int16_t out, const int16_t *table)
		491	{
		492	//FIXME reorder (I do not have an old MMX-only CPU here to benchmark ...)
		493	__asm__ volatile(
		494	"movd 12(%0), %%mm1 \n\t"
		495	"punpcklwd 8(%0), %%mm1 \n\t"
		496	"movq %%mm1, %%mm2 \n\t"
		497	"psrlq $0x20, %%mm1 \n\t"
		498	"movq 0(%0), %%mm0 \n\t"
		499	"punpcklwd %%mm2, %%mm1 \n\t"
		500	"movq %%mm0, %%mm5 \n\t"
		501	"paddsw %%mm1, %%mm0 \n\t"
		502	"psubsw %%mm1, %%mm5 \n\t"
		503	"movq %%mm0, %%mm2 \n\t"
		504	"punpckldq %%mm5, %%mm0 \n\t"
		505	"punpckhdq %%mm5, %%mm2 \n\t"
		506	"movq 0(%1), %%mm1 \n\t"
		507	"movq 8(%1), %%mm3 \n\t"
		508	"movq 16(%1), %%mm4 \n\t"
		509	"movq 24(%1), %%mm5 \n\t"
		510	"movq 32(%1), %%mm6 \n\t"
		511	"movq 40(%1), %%mm7 \n\t"
		512	"pmaddwd %%mm0, %%mm1 \n\t"
		513	"pmaddwd %%mm2, %%mm3 \n\t"
		514	"pmaddwd %%mm0, %%mm4 \n\t"
		515	"pmaddwd %%mm2, %%mm5 \n\t"
		516	"pmaddwd %%mm0, %%mm6 \n\t"
		517	"pmaddwd %%mm2, %%mm7 \n\t"
		518	"pmaddwd 48(%1), %%mm0 \n\t"
		519	"pmaddwd 56(%1), %%mm2 \n\t"
		520	"paddd %%mm1, %%mm3 \n\t"
		521	"paddd %%mm4, %%mm5 \n\t"
		522	"paddd %%mm6, %%mm7 \n\t"
		523	"paddd %%mm0, %%mm2 \n\t"
		524	"movq (%2), %%mm0 \n\t"
		525	"paddd %%mm0, %%mm3 \n\t"
		526	"paddd %%mm0, %%mm5 \n\t"
		527	"paddd %%mm0, %%mm7 \n\t"
		528	"paddd %%mm0, %%mm2 \n\t"
		529	"psrad $"S(SHIFT_FRW_ROW)", %%mm3 \n\t"
		530	"psrad $"S(SHIFT_FRW_ROW)", %%mm5 \n\t"
		531	"psrad $"S(SHIFT_FRW_ROW)", %%mm7 \n\t"
		532	"psrad $"S(SHIFT_FRW_ROW)", %%mm2 \n\t"
		533	"packssdw %%mm5, %%mm3 \n\t"
		534	"packssdw %%mm2, %%mm7 \n\t"
		535	"movq %%mm3, 0(%3) \n\t"
		536	"movq %%mm7, 8(%3) \n\t"
		537	:
		538	: "r" (in), "r" (table), "r" (fdct_r_row), "r" (out));
		539	}
		540
		541	void ff_fdct_mmx(int16_t *block)
		542	{
		543	DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
		544	int16_t * block1= (int16_t*)align_tmp;
		545	const int16_t *table= tab_frw_01234567;
		546	int i;
		547
		548	fdct_col_mmx(block, block1, 0);
		549	fdct_col_mmx(block, block1, 4);
		550
		551	for(i=8;i>0;i--) {
		552	fdct_row_mmx(block1, block, table);
		553	block1 += 8;
		554	table += 32;
		555	block += 8;
		556	}
		557	}
		558
		559	#endif /* HAVE_MMX_INLINE */
		560
		561	#if HAVE_MMXEXT_INLINE
		562
		563	void ff_fdct_mmxext(int16_t *block)
		564	{
		565	DECLARE_ALIGNED(8, int64_t, align_tmp)[16];
		566	int16_t block1= (int16_t)align_tmp;
		567	const int16_t *table= tab_frw_01234567;
		568	int i;
		569
		570	fdct_col_mmx(block, block1, 0);
		571	fdct_col_mmx(block, block1, 4);
		572
		573	for(i=8;i>0;i--) {
		574	fdct_row_mmxext(block1, block, table);
		575	block1 += 8;
		576	table += 32;
		577	block += 8;
		578	}
		579	}
		580
		581	#endif /* HAVE_MMXEXT_INLINE */
		582
		583	#if HAVE_SSE2_INLINE
		584
		585	void ff_fdct_sse2(int16_t *block)
		586	{
		587	DECLARE_ALIGNED(16, int64_t, align_tmp)[16];
		588	int16_t * const block1= (int16_t*)align_tmp;
		589
		590	fdct_col_sse2(block, block1, 0);
		591	fdct_row_sse2(block1, block);
		592	}
		593
		594	#endif /* HAVE_SSE2_INLINE */

Subversion Repositories Kolibri OS

(root)/contrib/sdk/sources/ffmpeg/libavcodec/x86/fdct.c – Rev 4349