WebSVN – Kolibri OS – Blame – /contrib/sdk/sources/ffmpeg/libavcodec/bfin/fdct_bfin.S

Rev	Author	Line No.	Line
4349	Serge	1	/*
		2	* fdct BlackFin
		3	*
		4	* Copyright (C) 2007 Marc Hoffman
		5	*
		6	* This file is part of FFmpeg.
		7	*
		8	* FFmpeg is free software; you can redistribute it and/or
		9	* modify it under the terms of the GNU Lesser General Public
		10	* License as published by the Free Software Foundation; either
		11	* version 2.1 of the License, or (at your option) any later version.
		12	*
		13	* FFmpeg is distributed in the hope that it will be useful,
		14	* but WITHOUT ANY WARRANTY; without even the implied warranty of
		15	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
		16	* Lesser General Public License for more details.
		17	*
		18	* You should have received a copy of the GNU Lesser General Public
		19	* License along with FFmpeg; if not, write to the Free Software
		20	* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
		21	*/
		22	/*
		23	void ff_bfin_fdct (int16_t *buf);
		24
		25	This implementation works only for 8x8 input. The range of input
		26	must be -256 to 255 i.e. 8bit input represented in a 16bit data
		27	word. The original data must be sign extended into the 16bit data
		28	words.
		29
		30
		31	Chen factorization of
		32
		33	7
		34	X(m) = sum (x(n) * cos ((2n+1)*m*pi/16))
		35	n=0
		36
		37	C4
		38
		39	\ / \ / X S4,S4
		40	1 ---\---------/-1+6----\-/-1+2---------------------------> 4
		41	\ / \ -C4 C3
		42	2 -----\-----/---2+5----/-\-1-2---------------------------> 2
		43	\ / / \ X S3,-S3
		44	3 -------\-/-----3+4--------0-3---------------------------> 6
		45	/ C7 C3
		46	4 -------/-\-----3-4-------------4+5----------------------> 1
		47	/ \ -C4 X \ /S7 C3
		48	5 -----/-----\---2-5----------=4-5----\-/---------------> 5
		49	/ \ X S4,S4 / X S3,-S3
		50	6 ---/---------\-1-6----------=7-6----/-\---------------> 3
		51	/ \ C4 X / \-S7 C3
		52	---------------0-7-------------7+6----------------------> 7
		53	C7
		54
		55	Notation
		56	Cn = cos(n*pi/16) used throughout the code.
		57
		58
		59	Registers used:
		60	R0, R1, R2, R3, R4, R5, R6,R7, P0, P1, P2, P3, P4, P5, A0, A1.
		61	Other registers used:
		62	I0, I1, I2, I3, B0, B1, B2, B3, M0, M1, M2, L3 registers, LC0 and LC1.
		63
		64	Input - r0 - pointer to start of int16_t *block
		65
		66	Output - The DCT output coefficients in the int16_t *block
		67
		68	Register constraint:
		69	This code is called from jpeg_encode.
		70	R6, R5, R4 if modified should be stored and restored.
		71
		72
		73	Performance: (Timer version 0.6.33)
		74	Code Size : 240 Bytes.
		75	Memory Required :
		76	Input Matrix : 8 * 8 * 2 Bytes.
		77	Coefficients : 16 Bytes
		78	Temporary matrix: 8 * 8 * 2 Bytes.
		79	Cycle Count :26+{18+8*(14+2*S)}*2 where S -> Stalls
		80	(7.45 c/pel)
		81	-----------------------------------------
		82	\| Size \| Forward DCT \| Inverse DCT \|
		83	-----------------------------------------
		84	\| 8x8 \| 284 Cycles \| 311 Cycles \|
		85	-----------------------------------------
		86
		87	Ck = int16(cos(k/16*pi)*32767+.5)/2
		88	#define C4 23170
		89	#define C3 13623
		90	#define C6 6270
		91	#define C7 3196
		92
		93	Sk = int16(sin(k/16*pi)*32767+.5)/2
		94	#define S4 11585
		95	#define S3 9102
		96	#define S6 15137
		97	#define S7 16069
		98
		99	the coefficients are ordered as follows:
		100	short dct_coef[]
		101	C4,S4,
		102	C6,S6,
		103	C7,S7,
		104	S3,C3,
		105
		106	-----------------------------------------------------------
		107	FFMPEG conformance testing results
		108	-----------------------------------------------------------
		109	dct-test: modified with the following
		110	dct_error("BFINfdct", 0, ff_bfin_fdct, fdct, test);
		111	produces the following output:
		112
		113	root:/u/ffmpeg/bhead/libavcodec> ./dct-test
		114	ffmpeg DCT/IDCT test
		115
		116	2 -131 -6 -48 -36 33 -83 24
		117	34 52 -24 -15 5 92 57 143
		118	-67 -43 -1 74 -16 5 -71 32
		119	-78 106 92 -34 -38 81 20 -18
		120	7 -62 40 2 -15 90 -62 -83
		121	-83 1 -104 -13 43 -19 7 11
		122	-63 31 12 -29 83 72 21 10
		123	-17 -63 -15 73 50 -91 159 -14
		124	DCT BFINfdct: err_inf=2 err2=0.16425938 syserr=0.00795000 maxout=2098 blockSumErr=27
		125	DCT BFINfdct: 92.1 kdct/s
		126	root:/u/ffmpeg/bhead/libavcodec>
		127
		128	*/
		129
		130	#include "config.h"
		131	#include "config_bfin.h"
		132
		133	#if defined(__FDPIC__) && CONFIG_SRAM
		134	.section .l1.data.B,"aw",@progbits
		135	#else
		136	.data
		137	#endif
		138	.align 4;
		139	dct_coeff: // Eight 16-bit constants (16 bytes) that fdct reads through I3.
		140	.short 0x5a82, 0x2d41, 0x187e, 0x3b21, 0x0c7c, 0x3ec5, 0x238e, 0x3537; // = 23170, 11585, 6270, 15137, 3196, 16069, 9102, 13623, i.e. C4,S4, C6,S6, C7,S7, S3,C3 from the header table
		141	// vtmp: 128-byte scratch area (8 * 8 * 2 bytes) that fdct loads into B2 as its temporary matrix.
		142	#if defined(__FDPIC__) && CONFIG_SRAM
		143	.section .l1.data.A,"aw",@progbits
		144	#endif
		145	.align 4
		146	vtmp: .space 128
		147
		148	.text
		149	DEFUN(fdct,mL1,
		150	(int16_t *block)):
		151	[--SP] = (R7:4, P5:3); // Push the registers onto the stack.
		152	// B0 = caller's block (from r0), B3 = dct_coeff, B2 = vtmp. RELOC is a macro defined elsewhere; presumably it leaves the symbol's address in r0 - confirm.
		153	b0 = r0;
		154	RELOC(r0, P3, dct_coeff);
		155	b3 = r0;
		156	RELOC(r0, P3, vtmp);
		157	b2 = r0;
		158
		159	L3 = 16; // L3 is set to 16 to make the coefficient
		160	// array Circular.
		161	// 16 bytes = the eight 16-bit entries of dct_coeff; the inner loop issues four 32-bit R7=[I3++] reads per iteration, i.e. one full trip around the table.
		162
		163	//----------------------------------------------------------------------------
		164
		165	/*
		166	* I0, I1, and I2 registers are used to read the input data. I3 register is used
		167	* to read the coefficients. P0 and P1 registers are used for writing the output
		168	* data.
		169	*/
		170	M0 = 12 (X); // All these initializations are used in the
		171	M1 = 16 (X); // modification of address offsets.
		172	// NOTE(review): M2 is loaded below but is not referenced again in this routine as shown - confirm before relying on or removing it.
		173	M2 = 128 (X);
		174	// P2 = one row in bytes (8 x int16); P3 = two rows (also the prescale loop count); P4/P5 rewind P0/P1 so each advances 2 bytes net per inner iteration (32+32+32+16-110 and 32+32-62); P0 = 2 is the outer pass count.
		175	P2 = 16;
		176	P3 = 32 (X);
		177	P4 = -110 (X);
		178	P5 = -62 (X);
		179	P0 = 2(X);
		180
		181
		182	// Prescale the input to get the correct precision.
		183	i0=b0;
		184	i1=b0;
		185	// 32 iterations x one 32-bit word covers the 128-byte block; every 16-bit half is shifted left by 3 (multiplied by 8) and stored back in place.
		186	lsetup (.0, .1) LC0 = P3;
		187	r0=[i0++];
		188	.0: r1=r0<<3 (v) \|\| r0=[i0++] ;
		189	.1: [i1++]=r1;
		190	// NOTE(review): the load runs one word ahead of the store, so the last iteration appears to read 4 bytes past the end of the block (the value is overwritten before use) - confirm this is acceptable for all callers.
		191	/*
		192	* B0 points to the "in" buffer.
		193	* B2 points to "temp" buffer in the first iteration.
		194	*/
		195	// Outer loop: two passes (LC0 = P0 = 2). Each pass reads rows from B0 and writes columns into B2; the buffers are exchanged at the bottom of the loop.
		196	lsetup (.2, .3) LC0 = P0;
		197	.2:
		198	I0 = B0; // I0 points to Input Element (0, 0).
		199	I1 = B0; // Element 1 and 0 is read in R0.
		200	I1 += M0 \|\| R0 = [I0++]; // I1 points to Input Element (0, 6).
		201	I2 = I1; // Element 6 is read into R3.H.
		202	I2 -= 4 \|\| R3.H = W[I1++]; // I2 points to Input Element (0, 4).
		203
		204	I3 = B3; // I3 points to Coefficients.
		205	P0 = B2; // P0 points to temporary array Element
		206	// (0, 0).
		207	P1 = B2; // P1 points to temporary array.
		208	R7 = [P1++P2] \|\| R2 = [I2++]; // P1 points to temporary array
		209	// Element (1, 0).
		210	// R7 is a dummy read. X4,X5
		211	// are read into R2.
		212	R3.L = W[I1--]; // X7 is read into R3.L.
		213	R1.H = W[I0++]; // X2 is read into R1.H.
		214
		215
		216	/*
		217	* X0 = (X0 + X7) / 2.
		218	* X1 = (X1 + X6) / 2.
		219	* X6 = (X1 - X6) / 2.
		220	* X7 = (X0 - X7) / 2.
		221	* It reads the data 3 in R1.L.
		222	*/
		223
		224	R0 = R0 +\|+ R3, R3 = R0 -\|- R3 \|\| R1.L = W[I0++] \|\| NOP;
		225
		226	/*
		227	* X2 = (X2 + X5) / 2.
		228	* X3 = (X3 + X4) / 2.
		229	* X4 = (X3 - X4) / 2.
		230	* X5 = (X2 - X5) / 2.
		231	* R7 = C4 = cos(4*pi/16)
		232	*/
		233
		234	R1 = R1 +\|+ R2, R2 = R1 -\|- R2 (CO) \|\| NOP \|\| R7 = [I3++];
		235
		236	/*
		237	* At the end of stage 1 R0 has (1,0), R1 has (2,3), R2 has (4, 5) and
		238	* R3 has (6,7).
		239	* Where the notation (x, y) represents upper/lower half pairs.
		240	*/
		241
		242	/*
		243	* X0 = X0 + X3.
		244	* X1 = X1 + X2.
		245	* X2 = X1 - X2.
		246	* X3 = X0 - X3.
		247	*/
		248	R0 = R0 +\|+ R1, R1 = R0 -\|- R1;
		249	// Inner loop: LC1 = P2 >> 1 = 8 iterations. Each one finishes the 8-point DCT whose first stage is already in R0-R3, stores its results, and starts the first stage for the next input row.
		250	lsetup (.row0, .row1) LC1 = P2 >> 1; // 1d dct, loops 8x
		251	.row0:
		252
		253	/*
		254	* This is part 2 computation continued.....
		255	* A1 = X6 * cos(pi/4)
		256	* A0 = X6 * cos(pi/4)
		257	* A1 = A1 - X5 * cos(pi/4)
		258	* A0 = A0 + X5 * cos(pi/4).
		259	* The instruction W[I0] = R3.L is used for packing it to R2.L.
		260	*/
		261	// NOTE(review): in this listing the '*' of each multiply appears to be missing (e.g. R3.HR7.l is presumably R3.H*R7.l) and the parallel-issue separator is shown with backslashes - confirm against the upstream file before assembling.
		262	A1=R3.HR7.l, A0=R3.HR7.l \|\| I1+=M1 \|\| W[I0] = R3.L;
		263	R4.H=(A1-=R2.LR7.l), R4.L=(A0+=R2.LR7.l) \|\| I2+=M0 \|\| NOP;
		264
		265	/* R0 = (X1,X0) R1 = (X2,X3) R4 = (X5, X6). */
		266
		267	/*
		268	* A1 = X0 * cos(pi/4)
		269	* A0 = X0 * cos(pi/4)
		270	* A1 = A1 - X1 * cos(pi/4)
		271	* A0 = A0 + X1 * cos(pi/4)
		272	* R7 = (C2,C6)
		273	*/
		274	A1=R0.LR7.h, A0=R0.LR7.h \|\| NOP \|\| R3.H=W[I1++];
		275	R5.H=(A1-=R0.HR7.h),R5.L=(A0+=R0.HR7.h) \|\| R7=[I3++] \|\| NOP;
		276
		277	/*
		278	* A1 = X2 * cos(3pi/8)
		279	* A0 = X3 * cos(3pi/8)
		280	* A1 = A1 + X3 * cos(pi/8)
		281	* A0 = A0 - X2 * cos(pi/8)
		282	* R3 = cos(pi/4)
		283	* R7 = (cos(7pi/8),cos(pi/8))
		284	* X4 = X4 + X5.
		285	* X5 = X4 - X5.
		286	* X6 = X7 - X6.
		287	* X7 = X7 + X6.
		288	*/
		289	A1=R1.HR7.L, A0=R1.LR7.L \|\| W[P0++P3]=R5.L \|\| R2.L=W[I0];
		290	R2=R2+\|+R4, R4=R2-\|-R4 \|\| I0+=4 \|\| R3.L=W[I1--];
		291	R6.H=(A1+=R1.LR7.H),R6.L=(A0 -= R1.H R7.H) \|\| I0+=4 \|\| R7=[I3++];
		292
		293	/* R2 = (X4, X7) R4 = (X5,X6) R5 = (X1, X0) R6 = (X2,X3). */
		294
		295	/*
		296	* A1 = X4 * cos(7pi/16)
		297	* A0 = X7 * cos(7pi/16)
		298	* A1 = A1 + X7 * cos(pi/16)
		299	* A0 = A0 - X4 * cos(pi/16)
		300	*/
		301
		302	A1=R2.HR7.L, A0=R2.LR7.L \|\| W[P0++P3]=R6.H \|\| R0=[I0++];
		303	R2.H=(A1+=R2.LR7.H),R2.L=(A0-=R2.HR7.H) \|\| W[P0++P3]=R5.H \|\| R7=[I3++];
		304
		305	/*
		306	* A1 = X5 * cos(3pi/16)
		307	* A0 = X6 * cos(3pi/16)
		308	* A1 = A1 + X6 * cos(5pi/16)
		309	* A0 = A0 - X5 * cos(5pi/16)
		310	* The output values are written.
		311	*/
		312
		313	A1=R4.HR7.H, A0=R4.LR7.H \|\| W[P0++P2]=R6.L \|\| R1.H=W[I0++];
		314	R4.H=(A1+=R4.LR7.L),R4.L=(A0-=R4.HR7.L) \|\| W[P0++P4]=R2.L \|\| R1.L=W[I0++];
		315	// Store map for one iteration: P0 writes at byte offsets 0, 32, 64, 96, 112 (rows 0, 2, 4, 6, 7) and P1 at 16, 48, 80 (rows 1, 3, 5) of the current output column.
		316
		317	/* Beginning of next stage, pipelined + drain and store the
		318	rest of the column store. */
		319
		320	R0=R0+\|+R3,R3=R0-\|-R3 \|\| W[P1++P3]=R2.H \|\| R2=[I2++];
		321	R1=R1+\|+R2,R2=R1-\|-R2 (CO) \|\| W[P1++P3]=R4.L \|\| R7=[I3++];
		322	.row1: R0=R0+\|+R1,R1=R0-\|-R1 \|\| W[P1++P5]=R4.H \|\| NOP;
		323	// After pass 1 this makes pass 2 read vtmp and write into the caller's block; after pass 2 it puts B0/B2 back as they were before the outer loop.
		324	// Exchange input with output.
		325	B1 = B0;
		326	B0 = B2;
		327	.3: B2 = B1;
		328	// Clear L3 (set to 16 on entry for the circular coefficient reads), pop the registers pushed on entry, and return.
		329	L3=0;
		330	(r7:4,p5:3) = [sp++];
		331	RTS;
		332	DEFUN_END(fdct)

Subversion Repositories Kolibri OS

(root)/contrib/sdk/sources/ffmpeg/libavcodec/bfin/fdct_bfin.S – Rev 4349