WebSVN – Kolibri OS – Blame – /contrib/sdk/sources/ffmpeg/libavcodec/ppc/idct_altivec.c

Rev	Author	Line No.	Line
4349	Serge	1	/*
		2	* Copyright (c) 2001 Michel Lespinasse
		3	*
		4	* This file is part of FFmpeg.
		5	*
		6	* FFmpeg is free software; you can redistribute it and/or
		7	* modify it under the terms of the GNU Lesser General Public
		8	* License as published by the Free Software Foundation; either
		9	* version 2.1 of the License, or (at your option) any later version.
		10	*
		11	* FFmpeg is distributed in the hope that it will be useful,
		12	* but WITHOUT ANY WARRANTY; without even the implied warranty of
		13	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
		14	* Lesser General Public License for more details.
		15	*
		16	* You should have received a copy of the GNU Lesser General Public
		17	* License along with FFmpeg; if not, write to the Free Software
		18	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
		19	*/
		20
		21	/*
		22	* NOTE: This code is based on GPL code from the libmpeg2 project. The
		23	* author, Michel Lespinasse, has given explicit permission to release
		24	* under LGPL as part of FFmpeg.
		25	*/
		26
		27	/*
		28	* FFmpeg integration by Dieter Shirley
		29	*
		30	* This file is a direct copy of the AltiVec IDCT module from the libmpeg2
		31	* project. I've deleted all of the libmpeg2-specific code, renamed the
		32	* functions and reordered the function parameters. The only change to the
		33	* IDCT function itself was to factor out the partial transposition, and to
		34	* perform a full transpose at the end of the function.
		35	*/
		36
		37
#include <stdlib.h> /* malloc(), free() */
#include <string.h>
#include "config.h"
#if HAVE_ALTIVEC_H
#include <altivec.h>
#endif
#include "libavutil/ppc/types_altivec.h"
#include "dsputil_altivec.h"
		46
		47	#define IDCT_HALF \
		48	/* 1st stage */ \
		49	t1 = vec_mradds (a1, vx7, vx1 ); \
		50	t8 = vec_mradds (a1, vx1, vec_subs (zero, vx7)); \
		51	t7 = vec_mradds (a2, vx5, vx3); \
		52	t3 = vec_mradds (ma2, vx3, vx5); \
		53	\
		54	/* 2nd stage */ \
		55	t5 = vec_adds (vx0, vx4); \
		56	t0 = vec_subs (vx0, vx4); \
		57	t2 = vec_mradds (a0, vx6, vx2); \
		58	t4 = vec_mradds (a0, vx2, vec_subs (zero, vx6)); \
		59	t6 = vec_adds (t8, t3); \
		60	t3 = vec_subs (t8, t3); \
		61	t8 = vec_subs (t1, t7); \
		62	t1 = vec_adds (t1, t7); \
		63	\
		64	/* 3rd stage */ \
		65	t7 = vec_adds (t5, t2); \
		66	t2 = vec_subs (t5, t2); \
		67	t5 = vec_adds (t0, t4); \
		68	t0 = vec_subs (t0, t4); \
		69	t4 = vec_subs (t8, t3); \
		70	t3 = vec_adds (t8, t3); \
		71	\
		72	/* 4th stage */ \
		73	vy0 = vec_adds (t7, t1); \
		74	vy7 = vec_subs (t7, t1); \
		75	vy1 = vec_mradds (c4, t3, t5); \
		76	vy6 = vec_mradds (mc4, t3, t5); \
		77	vy2 = vec_mradds (c4, t4, t0); \
		78	vy5 = vec_mradds (mc4, t4, t0); \
		79	vy3 = vec_adds (t2, t6); \
		80	vy4 = vec_subs (t2, t6);
		81
		82
		83	#define IDCT \
		84	vec_s16 vx0, vx1, vx2, vx3, vx4, vx5, vx6, vx7; \
		85	vec_s16 vy0, vy1, vy2, vy3, vy4, vy5, vy6, vy7; \
		86	vec_s16 a0, a1, a2, ma2, c4, mc4, zero, bias; \
		87	vec_s16 t0, t1, t2, t3, t4, t5, t6, t7, t8; \
		88	vec_u16 shift; \
		89	\
		90	c4 = vec_splat (constants[0], 0); \
		91	a0 = vec_splat (constants[0], 1); \
		92	a1 = vec_splat (constants[0], 2); \
		93	a2 = vec_splat (constants[0], 3); \
		94	mc4 = vec_splat (constants[0], 4); \
		95	ma2 = vec_splat (constants[0], 5); \
		96	bias = (vec_s16)vec_splat ((vec_s32)constants[0], 3); \
		97	\
		98	zero = vec_splat_s16 (0); \
		99	shift = vec_splat_u16 (4); \
		100	\
		101	vx0 = vec_mradds (vec_sl (block[0], shift), constants[1], zero); \
		102	vx1 = vec_mradds (vec_sl (block[1], shift), constants[2], zero); \
		103	vx2 = vec_mradds (vec_sl (block[2], shift), constants[3], zero); \
		104	vx3 = vec_mradds (vec_sl (block[3], shift), constants[4], zero); \
		105	vx4 = vec_mradds (vec_sl (block[4], shift), constants[1], zero); \
		106	vx5 = vec_mradds (vec_sl (block[5], shift), constants[4], zero); \
		107	vx6 = vec_mradds (vec_sl (block[6], shift), constants[3], zero); \
		108	vx7 = vec_mradds (vec_sl (block[7], shift), constants[2], zero); \
		109	\
		110	IDCT_HALF \
		111	\
		112	vx0 = vec_mergeh (vy0, vy4); \
		113	vx1 = vec_mergel (vy0, vy4); \
		114	vx2 = vec_mergeh (vy1, vy5); \
		115	vx3 = vec_mergel (vy1, vy5); \
		116	vx4 = vec_mergeh (vy2, vy6); \
		117	vx5 = vec_mergel (vy2, vy6); \
		118	vx6 = vec_mergeh (vy3, vy7); \
		119	vx7 = vec_mergel (vy3, vy7); \
		120	\
		121	vy0 = vec_mergeh (vx0, vx4); \
		122	vy1 = vec_mergel (vx0, vx4); \
		123	vy2 = vec_mergeh (vx1, vx5); \
		124	vy3 = vec_mergel (vx1, vx5); \
		125	vy4 = vec_mergeh (vx2, vx6); \
		126	vy5 = vec_mergel (vx2, vx6); \
		127	vy6 = vec_mergeh (vx3, vx7); \
		128	vy7 = vec_mergel (vx3, vx7); \
		129	\
		130	vx0 = vec_adds (vec_mergeh (vy0, vy4), bias); \
		131	vx1 = vec_mergel (vy0, vy4); \
		132	vx2 = vec_mergeh (vy1, vy5); \
		133	vx3 = vec_mergel (vy1, vy5); \
		134	vx4 = vec_mergeh (vy2, vy6); \
		135	vx5 = vec_mergel (vy2, vy6); \
		136	vx6 = vec_mergeh (vy3, vy7); \
		137	vx7 = vec_mergel (vy3, vy7); \
		138	\
		139	IDCT_HALF \
		140	\
		141	shift = vec_splat_u16 (6); \
		142	vx0 = vec_sra (vy0, shift); \
		143	vx1 = vec_sra (vy1, shift); \
		144	vx2 = vec_sra (vy2, shift); \
		145	vx3 = vec_sra (vy3, shift); \
		146	vx4 = vec_sra (vy4, shift); \
		147	vx5 = vec_sra (vy5, shift); \
		148	vx6 = vec_sra (vy6, shift); \
		149	vx7 = vec_sra (vy7, shift);
		150
		151
		152	static const vec_s16 constants[5] = {
		153	{23170, 13573, 6518, 21895, -23170, -21895, 32, 31},
		154	{16384, 22725, 21407, 19266, 16384, 19266, 21407, 22725},
		155	{22725, 31521, 29692, 26722, 22725, 26722, 29692, 31521},
		156	{21407, 29692, 27969, 25172, 21407, 25172, 27969, 29692},
		157	{19266, 26722, 25172, 22654, 19266, 22654, 25172, 26722}
		158	};
		159
		160	void ff_idct_put_altivec(uint8_t* dest, int stride, int16_t *blk)
		161	{
		162	vec_s16 block = (vec_s16)blk;
		163	vec_u8 tmp;
		164
		165	IDCT
		166
		167	#define COPY(dest,src) \
		168	tmp = vec_packsu (src, src); \
		169	vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \
		170	vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);
		171
		172	COPY (dest, vx0) dest += stride;
		173	COPY (dest, vx1) dest += stride;
		174	COPY (dest, vx2) dest += stride;
		175	COPY (dest, vx3) dest += stride;
		176	COPY (dest, vx4) dest += stride;
		177	COPY (dest, vx5) dest += stride;
		178	COPY (dest, vx6) dest += stride;
		179	COPY (dest, vx7)
		180	}
		181
		182	void ff_idct_add_altivec(uint8_t* dest, int stride, int16_t *blk)
		183	{
		184	vec_s16 block = (vec_s16)blk;
		185	vec_u8 tmp;
		186	vec_s16 tmp2, tmp3;
		187	vec_u8 perm0;
		188	vec_u8 perm1;
		189	vec_u8 p0, p1, p;
		190
		191	IDCT
		192
		193	p0 = vec_lvsl (0, dest);
		194	p1 = vec_lvsl (stride, dest);
		195	p = vec_splat_u8 (-1);
		196	perm0 = vec_mergeh (p, p0);
		197	perm1 = vec_mergeh (p, p1);
		198
		199	#define ADD(dest,src,perm) \
		200	/* (uint64_t )&tmp = (uint64_t )dest; */ \
		201	tmp = vec_ld (0, dest); \
		202	tmp2 = (vec_s16)vec_perm (tmp, (vec_u8)zero, perm); \
		203	tmp3 = vec_adds (tmp2, src); \
		204	tmp = vec_packsu (tmp3, tmp3); \
		205	vec_ste ((vec_u32)tmp, 0, (unsigned int *)dest); \
		206	vec_ste ((vec_u32)tmp, 4, (unsigned int *)dest);
		207
		208	ADD (dest, vx0, perm0) dest += stride;
		209	ADD (dest, vx1, perm1) dest += stride;
		210	ADD (dest, vx2, perm0) dest += stride;
		211	ADD (dest, vx3, perm1) dest += stride;
		212	ADD (dest, vx4, perm0) dest += stride;
		213	ADD (dest, vx5, perm1) dest += stride;
		214	ADD (dest, vx6, perm0) dest += stride;
		215	ADD (dest, vx7, perm1)
		216	}

Subversion Repositories Kolibri OS

(root)/contrib/sdk/sources/ffmpeg/libavcodec/ppc/idct_altivec.c – Rev 4871