WebSVN – Kolibri OS – Blame – /contrib/sdk/sources/ffmpeg/libavcodec/x86/motion_est.c

Rev	Author	Line No.	Line
4349	Serge	1	/*
		2	* MMX optimized motion estimation
		3	* Copyright (c) 2001 Fabrice Bellard
		4	* Copyright (c) 2002-2004 Michael Niedermayer
		5	*
		6	* mostly by Michael Niedermayer
		7	*
		8	* This file is part of FFmpeg.
		9	*
		10	* FFmpeg is free software; you can redistribute it and/or
		11	* modify it under the terms of the GNU Lesser General Public
		12	* License as published by the Free Software Foundation; either
		13	* version 2.1 of the License, or (at your option) any later version.
		14	*
		15	* FFmpeg is distributed in the hope that it will be useful,
		16	* but WITHOUT ANY WARRANTY; without even the implied warranty of
		17	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
		18	* Lesser General Public License for more details.
		19	*
		20	* You should have received a copy of the GNU Lesser General Public
		21	* License along with FFmpeg; if not, write to the Free Software
		22	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
		23	*/
		24
		25	#include "libavutil/attributes.h"
		26	#include "libavutil/avassert.h"
		27	#include "libavutil/mem.h"
		28	#include "libavutil/x86/asm.h"
		29	#include "libavutil/x86/cpu.h"
		30	#include "dsputil_x86.h"
		31
		32	#if HAVE_INLINE_ASM
		33
		34	DECLARE_ASM_CONST(8, uint64_t, round_tab)[3]={
		35	0x0000000000000000ULL,
		36	0x0001000100010001ULL,
		37	0x0002000200020002ULL,
		38	};
		39
		40	DECLARE_ASM_CONST(8, uint64_t, bone)= 0x0101010101010101LL;
		41
		42	static inline void sad8_1_mmx(uint8_t blk1, uint8_t blk2, int stride, int h)
		43	{
		44	x86_reg len= -(x86_reg)stride*h;
		45	__asm__ volatile(
		46	".p2align 4 \n\t"
		47	"1: \n\t"
		48	"movq (%1, %%"REG_a"), %%mm0 \n\t"
		49	"movq (%2, %%"REG_a"), %%mm2 \n\t"
		50	"movq (%2, %%"REG_a"), %%mm4 \n\t"
		51	"add %3, %%"REG_a" \n\t"
		52	"psubusb %%mm0, %%mm2 \n\t"
		53	"psubusb %%mm4, %%mm0 \n\t"
		54	"movq (%1, %%"REG_a"), %%mm1 \n\t"
		55	"movq (%2, %%"REG_a"), %%mm3 \n\t"
		56	"movq (%2, %%"REG_a"), %%mm5 \n\t"
		57	"psubusb %%mm1, %%mm3 \n\t"
		58	"psubusb %%mm5, %%mm1 \n\t"
		59	"por %%mm2, %%mm0 \n\t"
		60	"por %%mm1, %%mm3 \n\t"
		61	"movq %%mm0, %%mm1 \n\t"
		62	"movq %%mm3, %%mm2 \n\t"
		63	"punpcklbw %%mm7, %%mm0 \n\t"
		64	"punpckhbw %%mm7, %%mm1 \n\t"
		65	"punpcklbw %%mm7, %%mm3 \n\t"
		66	"punpckhbw %%mm7, %%mm2 \n\t"
		67	"paddw %%mm1, %%mm0 \n\t"
		68	"paddw %%mm3, %%mm2 \n\t"
		69	"paddw %%mm2, %%mm0 \n\t"
		70	"paddw %%mm0, %%mm6 \n\t"
		71	"add %3, %%"REG_a" \n\t"
		72	" js 1b \n\t"
		73	: "+a" (len)
		74	: "r" (blk1 - len), "r" (blk2 - len), "r" ((x86_reg)stride)
		75	);
		76	}
		77
		78	static inline void sad8_1_mmxext(uint8_t blk1, uint8_t blk2,
		79	int stride, int h)
		80	{
		81	__asm__ volatile(
		82	".p2align 4 \n\t"
		83	"1: \n\t"
		84	"movq (%1), %%mm0 \n\t"
		85	"movq (%1, %3), %%mm1 \n\t"
		86	"psadbw (%2), %%mm0 \n\t"
		87	"psadbw (%2, %3), %%mm1 \n\t"
		88	"paddw %%mm0, %%mm6 \n\t"
		89	"paddw %%mm1, %%mm6 \n\t"
		90	"lea (%1,%3,2), %1 \n\t"
		91	"lea (%2,%3,2), %2 \n\t"
		92	"sub $2, %0 \n\t"
		93	" jg 1b \n\t"
		94	: "+r" (h), "+r" (blk1), "+r" (blk2)
		95	: "r" ((x86_reg)stride)
		96	);
		97	}
		98
		99	static int sad16_sse2(void v, uint8_t blk2, uint8_t *blk1, int stride, int h)
		100	{
		101	int ret;
		102	__asm__ volatile(
		103	"pxor %%xmm2, %%xmm2 \n\t"
		104	".p2align 4 \n\t"
		105	"1: \n\t"
		106	"movdqu (%1), %%xmm0 \n\t"
		107	"movdqu (%1, %4), %%xmm1 \n\t"
		108	"psadbw (%2), %%xmm0 \n\t"
		109	"psadbw (%2, %4), %%xmm1 \n\t"
		110	"paddw %%xmm0, %%xmm2 \n\t"
		111	"paddw %%xmm1, %%xmm2 \n\t"
		112	"lea (%1,%4,2), %1 \n\t"
		113	"lea (%2,%4,2), %2 \n\t"
		114	"sub $2, %0 \n\t"
		115	" jg 1b \n\t"
		116	"movhlps %%xmm2, %%xmm0 \n\t"
		117	"paddw %%xmm0, %%xmm2 \n\t"
		118	"movd %%xmm2, %3 \n\t"
		119	: "+r" (h), "+r" (blk1), "+r" (blk2), "=r"(ret)
		120	: "r" ((x86_reg)stride)
		121	);
		122	return ret;
		123	}
		124
		125	static inline void sad8_x2a_mmxext(uint8_t blk1, uint8_t blk2,
		126	int stride, int h)
		127	{
		128	__asm__ volatile(
		129	".p2align 4 \n\t"
		130	"1: \n\t"
		131	"movq (%1), %%mm0 \n\t"
		132	"movq (%1, %3), %%mm1 \n\t"
		133	"pavgb 1(%1), %%mm0 \n\t"
		134	"pavgb 1(%1, %3), %%mm1 \n\t"
		135	"psadbw (%2), %%mm0 \n\t"
		136	"psadbw (%2, %3), %%mm1 \n\t"
		137	"paddw %%mm0, %%mm6 \n\t"
		138	"paddw %%mm1, %%mm6 \n\t"
		139	"lea (%1,%3,2), %1 \n\t"
		140	"lea (%2,%3,2), %2 \n\t"
		141	"sub $2, %0 \n\t"
		142	" jg 1b \n\t"
		143	: "+r" (h), "+r" (blk1), "+r" (blk2)
		144	: "r" ((x86_reg)stride)
		145	);
		146	}
		147
		148	static inline void sad8_y2a_mmxext(uint8_t blk1, uint8_t blk2,
		149	int stride, int h)
		150	{
		151	__asm__ volatile(
		152	"movq (%1), %%mm0 \n\t"
		153	"add %3, %1 \n\t"
		154	".p2align 4 \n\t"
		155	"1: \n\t"
		156	"movq (%1), %%mm1 \n\t"
		157	"movq (%1, %3), %%mm2 \n\t"
		158	"pavgb %%mm1, %%mm0 \n\t"
		159	"pavgb %%mm2, %%mm1 \n\t"
		160	"psadbw (%2), %%mm0 \n\t"
		161	"psadbw (%2, %3), %%mm1 \n\t"
		162	"paddw %%mm0, %%mm6 \n\t"
		163	"paddw %%mm1, %%mm6 \n\t"
		164	"movq %%mm2, %%mm0 \n\t"
		165	"lea (%1,%3,2), %1 \n\t"
		166	"lea (%2,%3,2), %2 \n\t"
		167	"sub $2, %0 \n\t"
		168	" jg 1b \n\t"
		169	: "+r" (h), "+r" (blk1), "+r" (blk2)
		170	: "r" ((x86_reg)stride)
		171	);
		172	}
		173
		174	static inline void sad8_4_mmxext(uint8_t blk1, uint8_t blk2,
		175	int stride, int h)
		176	{
		177	__asm__ volatile(
		178	"movq "MANGLE(bone)", %%mm5 \n\t"
		179	"movq (%1), %%mm0 \n\t"
		180	"pavgb 1(%1), %%mm0 \n\t"
		181	"add %3, %1 \n\t"
		182	".p2align 4 \n\t"
		183	"1: \n\t"
		184	"movq (%1), %%mm1 \n\t"
		185	"movq (%1,%3), %%mm2 \n\t"
		186	"pavgb 1(%1), %%mm1 \n\t"
		187	"pavgb 1(%1,%3), %%mm2 \n\t"
		188	"psubusb %%mm5, %%mm1 \n\t"
		189	"pavgb %%mm1, %%mm0 \n\t"
		190	"pavgb %%mm2, %%mm1 \n\t"
		191	"psadbw (%2), %%mm0 \n\t"
		192	"psadbw (%2,%3), %%mm1 \n\t"
		193	"paddw %%mm0, %%mm6 \n\t"
		194	"paddw %%mm1, %%mm6 \n\t"
		195	"movq %%mm2, %%mm0 \n\t"
		196	"lea (%1,%3,2), %1 \n\t"
		197	"lea (%2,%3,2), %2 \n\t"
		198	"sub $2, %0 \n\t"
		199	" jg 1b \n\t"
		200	: "+r" (h), "+r" (blk1), "+r" (blk2)
		201	: "r" ((x86_reg)stride)
		202	);
		203	}
		204
		205	static inline void sad8_2_mmx(uint8_t blk1a, uint8_t blk1b, uint8_t *blk2, int stride, int h)
		206	{
		207	x86_reg len= -(x86_reg)stride*h;
		208	__asm__ volatile(
		209	".p2align 4 \n\t"
		210	"1: \n\t"
		211	"movq (%1, %%"REG_a"), %%mm0 \n\t"
		212	"movq (%2, %%"REG_a"), %%mm1 \n\t"
		213	"movq (%1, %%"REG_a"), %%mm2 \n\t"
		214	"movq (%2, %%"REG_a"), %%mm3 \n\t"
		215	"punpcklbw %%mm7, %%mm0 \n\t"
		216	"punpcklbw %%mm7, %%mm1 \n\t"
		217	"punpckhbw %%mm7, %%mm2 \n\t"
		218	"punpckhbw %%mm7, %%mm3 \n\t"
		219	"paddw %%mm0, %%mm1 \n\t"
		220	"paddw %%mm2, %%mm3 \n\t"
		221	"movq (%3, %%"REG_a"), %%mm4 \n\t"
		222	"movq (%3, %%"REG_a"), %%mm2 \n\t"
		223	"paddw %%mm5, %%mm1 \n\t"
		224	"paddw %%mm5, %%mm3 \n\t"
		225	"psrlw $1, %%mm1 \n\t"
		226	"psrlw $1, %%mm3 \n\t"
		227	"packuswb %%mm3, %%mm1 \n\t"
		228	"psubusb %%mm1, %%mm4 \n\t"
		229	"psubusb %%mm2, %%mm1 \n\t"
		230	"por %%mm4, %%mm1 \n\t"
		231	"movq %%mm1, %%mm0 \n\t"
		232	"punpcklbw %%mm7, %%mm0 \n\t"
		233	"punpckhbw %%mm7, %%mm1 \n\t"
		234	"paddw %%mm1, %%mm0 \n\t"
		235	"paddw %%mm0, %%mm6 \n\t"
		236	"add %4, %%"REG_a" \n\t"
		237	" js 1b \n\t"
		238	: "+a" (len)
		239	: "r" (blk1a - len), "r" (blk1b -len), "r" (blk2 - len), "r" ((x86_reg)stride)
		240	);
		241	}
		242
		243	static inline void sad8_4_mmx(uint8_t blk1, uint8_t blk2, int stride, int h)
		244	{
		245	x86_reg len= -(x86_reg)stride*h;
		246	__asm__ volatile(
		247	"movq (%1, %%"REG_a"), %%mm0 \n\t"
		248	"movq 1(%1, %%"REG_a"), %%mm2 \n\t"
		249	"movq %%mm0, %%mm1 \n\t"
		250	"movq %%mm2, %%mm3 \n\t"
		251	"punpcklbw %%mm7, %%mm0 \n\t"
		252	"punpckhbw %%mm7, %%mm1 \n\t"
		253	"punpcklbw %%mm7, %%mm2 \n\t"
		254	"punpckhbw %%mm7, %%mm3 \n\t"
		255	"paddw %%mm2, %%mm0 \n\t"
		256	"paddw %%mm3, %%mm1 \n\t"
		257	".p2align 4 \n\t"
		258	"1: \n\t"
		259	"movq (%2, %%"REG_a"), %%mm2 \n\t"
		260	"movq 1(%2, %%"REG_a"), %%mm4 \n\t"
		261	"movq %%mm2, %%mm3 \n\t"
		262	"movq %%mm4, %%mm5 \n\t"
		263	"punpcklbw %%mm7, %%mm2 \n\t"
		264	"punpckhbw %%mm7, %%mm3 \n\t"
		265	"punpcklbw %%mm7, %%mm4 \n\t"
		266	"punpckhbw %%mm7, %%mm5 \n\t"
		267	"paddw %%mm4, %%mm2 \n\t"
		268	"paddw %%mm5, %%mm3 \n\t"
		269	"movq 16+"MANGLE(round_tab)", %%mm5 \n\t"
		270	"paddw %%mm2, %%mm0 \n\t"
		271	"paddw %%mm3, %%mm1 \n\t"
		272	"paddw %%mm5, %%mm0 \n\t"
		273	"paddw %%mm5, %%mm1 \n\t"
		274	"movq (%3, %%"REG_a"), %%mm4 \n\t"
		275	"movq (%3, %%"REG_a"), %%mm5 \n\t"
		276	"psrlw $2, %%mm0 \n\t"
		277	"psrlw $2, %%mm1 \n\t"
		278	"packuswb %%mm1, %%mm0 \n\t"
		279	"psubusb %%mm0, %%mm4 \n\t"
		280	"psubusb %%mm5, %%mm0 \n\t"
		281	"por %%mm4, %%mm0 \n\t"
		282	"movq %%mm0, %%mm4 \n\t"
		283	"punpcklbw %%mm7, %%mm0 \n\t"
		284	"punpckhbw %%mm7, %%mm4 \n\t"
		285	"paddw %%mm0, %%mm6 \n\t"
		286	"paddw %%mm4, %%mm6 \n\t"
		287	"movq %%mm2, %%mm0 \n\t"
		288	"movq %%mm3, %%mm1 \n\t"
		289	"add %4, %%"REG_a" \n\t"
		290	" js 1b \n\t"
		291	: "+a" (len)
		292	: "r" (blk1 - len), "r" (blk1 -len + stride), "r" (blk2 - len), "r" ((x86_reg)stride)
		293	);
		294	}
		295
		296	static inline int sum_mmx(void)
		297	{
		298	int ret;
		299	__asm__ volatile(
		300	"movq %%mm6, %%mm0 \n\t"
		301	"psrlq $32, %%mm6 \n\t"
		302	"paddw %%mm0, %%mm6 \n\t"
		303	"movq %%mm6, %%mm0 \n\t"
		304	"psrlq $16, %%mm6 \n\t"
		305	"paddw %%mm0, %%mm6 \n\t"
		306	"movd %%mm6, %0 \n\t"
		307	: "=r" (ret)
		308	);
		309	return ret&0xFFFF;
		310	}
		311
		312	static inline int sum_mmxext(void)
		313	{
		314	int ret;
		315	__asm__ volatile(
		316	"movd %%mm6, %0 \n\t"
		317	: "=r" (ret)
		318	);
		319	return ret;
		320	}
		321
		322	static inline void sad8_x2a_mmx(uint8_t blk1, uint8_t blk2, int stride, int h)
		323	{
		324	sad8_2_mmx(blk1, blk1+1, blk2, stride, h);
		325	}
		326	static inline void sad8_y2a_mmx(uint8_t blk1, uint8_t blk2, int stride, int h)
		327	{
		328	sad8_2_mmx(blk1, blk1+stride, blk2, stride, h);
		329	}
		330
		331
		332	#define PIX_SAD(suf)\
		333	static int sad8_ ## suf(void v, uint8_t blk2, uint8_t *blk1, int stride, int h)\
		334	{\
		335	av_assert2(h==8);\
		336	__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
		337	"pxor %%mm6, %%mm6 \n\t":);\
		338	\
		339	sad8_1_ ## suf(blk1, blk2, stride, 8);\
		340	\
		341	return sum_ ## suf();\
		342	}\
		343	static int sad8_x2_ ## suf(void v, uint8_t blk2, uint8_t *blk1, int stride, int h)\
		344	{\
		345	av_assert2(h==8);\
		346	__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
		347	"pxor %%mm6, %%mm6 \n\t"\
		348	"movq %0, %%mm5 \n\t"\
		349	:: "m"(round_tab[1]) \
		350	);\
		351	\
		352	sad8_x2a_ ## suf(blk1, blk2, stride, 8);\
		353	\
		354	return sum_ ## suf();\
		355	}\
		356	\
		357	static int sad8_y2_ ## suf(void v, uint8_t blk2, uint8_t *blk1, int stride, int h)\
		358	{\
		359	av_assert2(h==8);\
		360	__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
		361	"pxor %%mm6, %%mm6 \n\t"\
		362	"movq %0, %%mm5 \n\t"\
		363	:: "m"(round_tab[1]) \
		364	);\
		365	\
		366	sad8_y2a_ ## suf(blk1, blk2, stride, 8);\
		367	\
		368	return sum_ ## suf();\
		369	}\
		370	\
		371	static int sad8_xy2_ ## suf(void v, uint8_t blk2, uint8_t *blk1, int stride, int h)\
		372	{\
		373	av_assert2(h==8);\
		374	__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
		375	"pxor %%mm6, %%mm6 \n\t"\
		376	::);\
		377	\
		378	sad8_4_ ## suf(blk1, blk2, stride, 8);\
		379	\
		380	return sum_ ## suf();\
		381	}\
		382	\
		383	static int sad16_ ## suf(void v, uint8_t blk2, uint8_t *blk1, int stride, int h)\
		384	{\
		385	__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
		386	"pxor %%mm6, %%mm6 \n\t":);\
		387	\
		388	sad8_1_ ## suf(blk1 , blk2 , stride, h);\
		389	sad8_1_ ## suf(blk1+8, blk2+8, stride, h);\
		390	\
		391	return sum_ ## suf();\
		392	}\
		393	static int sad16_x2_ ## suf(void v, uint8_t blk2, uint8_t *blk1, int stride, int h)\
		394	{\
		395	__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
		396	"pxor %%mm6, %%mm6 \n\t"\
		397	"movq %0, %%mm5 \n\t"\
		398	:: "m"(round_tab[1]) \
		399	);\
		400	\
		401	sad8_x2a_ ## suf(blk1 , blk2 , stride, h);\
		402	sad8_x2a_ ## suf(blk1+8, blk2+8, stride, h);\
		403	\
		404	return sum_ ## suf();\
		405	}\
		406	static int sad16_y2_ ## suf(void v, uint8_t blk2, uint8_t *blk1, int stride, int h)\
		407	{\
		408	__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
		409	"pxor %%mm6, %%mm6 \n\t"\
		410	"movq %0, %%mm5 \n\t"\
		411	:: "m"(round_tab[1]) \
		412	);\
		413	\
		414	sad8_y2a_ ## suf(blk1 , blk2 , stride, h);\
		415	sad8_y2a_ ## suf(blk1+8, blk2+8, stride, h);\
		416	\
		417	return sum_ ## suf();\
		418	}\
		419	static int sad16_xy2_ ## suf(void v, uint8_t blk2, uint8_t *blk1, int stride, int h)\
		420	{\
		421	__asm__ volatile("pxor %%mm7, %%mm7 \n\t"\
		422	"pxor %%mm6, %%mm6 \n\t"\
		423	::);\
		424	\
		425	sad8_4_ ## suf(blk1 , blk2 , stride, h);\
		426	sad8_4_ ## suf(blk1+8, blk2+8, stride, h);\
		427	\
		428	return sum_ ## suf();\
		429	}\
		430
		431	PIX_SAD(mmx)
		432	PIX_SAD(mmxext)
		433
		434	#endif /* HAVE_INLINE_ASM */
		435
		436	av_cold void ff_dsputil_init_pix_mmx(DSPContext c, AVCodecContext avctx)
		437	{
		438	#if HAVE_INLINE_ASM
		439	int cpu_flags = av_get_cpu_flags();
		440
		441	if (INLINE_MMX(cpu_flags)) {
		442	c->pix_abs[0][0] = sad16_mmx;
		443	c->pix_abs[0][1] = sad16_x2_mmx;
		444	c->pix_abs[0][2] = sad16_y2_mmx;
		445	c->pix_abs[0][3] = sad16_xy2_mmx;
		446	c->pix_abs[1][0] = sad8_mmx;
		447	c->pix_abs[1][1] = sad8_x2_mmx;
		448	c->pix_abs[1][2] = sad8_y2_mmx;
		449	c->pix_abs[1][3] = sad8_xy2_mmx;
		450
		451	c->sad[0]= sad16_mmx;
		452	c->sad[1]= sad8_mmx;
		453	}
		454	if (INLINE_MMXEXT(cpu_flags)) {
		455	c->pix_abs[0][0] = sad16_mmxext;
		456	c->pix_abs[1][0] = sad8_mmxext;
		457
		458	c->sad[0] = sad16_mmxext;
		459	c->sad[1] = sad8_mmxext;
		460
		461	if(!(avctx->flags & CODEC_FLAG_BITEXACT)){
		462	c->pix_abs[0][1] = sad16_x2_mmxext;
		463	c->pix_abs[0][2] = sad16_y2_mmxext;
		464	c->pix_abs[0][3] = sad16_xy2_mmxext;
		465	c->pix_abs[1][1] = sad8_x2_mmxext;
		466	c->pix_abs[1][2] = sad8_y2_mmxext;
		467	c->pix_abs[1][3] = sad8_xy2_mmxext;
		468	}
		469	}
		470	if (INLINE_SSE2(cpu_flags) && !(cpu_flags & AV_CPU_FLAG_3DNOW) && avctx->codec_id != AV_CODEC_ID_SNOW) {
		471	c->sad[0]= sad16_sse2;
		472	}
		473	#endif /* HAVE_INLINE_ASM */
		474	}

Subversion Repositories Kolibri OS

(root)/contrib/sdk/sources/ffmpeg/libavcodec/x86/motion_est.c – Rev 4349