Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4349 | Serge | 1 | /* |
2 | * fdct BlackFin |
||
3 | * |
||
4 | * Copyright (C) 2007 Marc Hoffman |
||
5 | * |
||
6 | * This file is part of FFmpeg. |
||
7 | * |
||
8 | * FFmpeg is free software; you can redistribute it and/or |
||
9 | * modify it under the terms of the GNU Lesser General Public |
||
10 | * License as published by the Free Software Foundation; either |
||
11 | * version 2.1 of the License, or (at your option) any later version. |
||
12 | * |
||
13 | * FFmpeg is distributed in the hope that it will be useful, |
||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
16 | * Lesser General Public License for more details. |
||
17 | * |
||
18 | * You should have received a copy of the GNU Lesser General Public |
||
19 | * License along with FFmpeg; if not, write to the Free Software |
||
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
21 | */ |
||
22 | /* |
||
23 | void ff_bfin_fdct (int16_t *buf); |
||
24 | |||
25 | This implementation works only for 8x8 input. The range of input |
||
26 | must be -256 to 255 i.e. 8bit input represented in a 16bit data |
||
27 | word. The original data must be sign extended into the 16bit data |
||
28 | words. |
||
29 | |||
30 | |||
31 | Chen factorization of |
||
32 | |||
33 | 7 |
||
34 | X(m) = sum (x(n) * cos ((2n+1)*m*pi/16)) |
||
35 | n=0 |
||
36 | |||
37 | C4 |
||
38 | |||
39 | \ / \ / X S4,S4 |
||
40 | 1 --*-\---------/-*1+6---*-\-/-*1+2-------*-*-------------------> 4 |
||
41 | \ / \ -C4 C3 |
||
42 | 2 --*---\-----/---*2+5---*-/-\-*1-2---------------*-*-----------> 2 |
||
43 | \ / / \ X S3,-S3 |
||
44 | 3 --*-----\-/-----*3+4---*-----*0-3---------------*-*-----------> 6 |
||
45 | / C7 C3 |
||
46 | 4 --*-----/-\-----*3-4------------*-*4+5--*-----*---------------> 1 |
||
47 | / \ -C4 X \ /S7 C3 |
||
48 | 5 --*---/-----\---*2-5---*-*------*=*4-5----\-/------*-*--------> 5 |
||
49 | / \ X S4,S4 / X S3,-S3 |
||
50 | 6 --*-/---------\-*1-6---*-*------*=*7-6----/-\------*-*--------> 3 |
||
51 | / \ C4 X / \-S7 C3 |
||
52 | --*-------------*0-7------------*-*7+6--*-----*---------------> 7 |
||
53 | C7 |
||
54 | |||
55 | Notation |
||
56 | Cn = cos(n*pi/16) used throughout the code. |
||
57 | |||
58 | |||
59 | Registers used: |
||
60 | R0, R1, R2, R3, R4, R5, R6,R7, P0, P1, P2, P3, P4, P5, A0, A1. |
||
61 | Other registers used: |
||
62 | I0, I1, I2, I3, B0, B2, B3, M0, M1, L3 registers and LC0. |
||
63 | |||
64 | Input - r0 - pointer to start of int16_t *block |
||
65 | |||
66 | Output - The DCT output coefficients in the int16_t *block |
||
67 | |||
68 | Register constraint: |
||
69 | This code is called from jpeg_encode. |
||
70 | R6, R5, R4 if modified should be stored and restored. |
||
71 | |||
72 | |||
73 | Performance: (Timer version 0.6.33) |
||
74 | Code Size : 240 Bytes. |
||
75 | Memory Required : |
||
76 | Input Matrix : 8 * 8 * 2 Bytes. |
||
77 | Coefficients : 16 Bytes |
||
78 | Temporary matrix: 8 * 8 * 2 Bytes. |
||
79 | Cycle Count :26+{18+8*(14+2S)}*2 where S -> Stalls |
||
80 | (7.45 c/pel) |
||
81 | ----------------------------------------- |
||
82 | | Size | Forward DCT | Inverse DCT | |
||
83 | ----------------------------------------- |
||
84 | | 8x8 | 284 Cycles | 311 Cycles | |
||
85 | ----------------------------------------- |
||
86 | |||
87 | Ck = int16(cos(k/16*pi)*32767+.5)/2 |
||
88 | #define C4 23170 |
||
89 | #define C3 13623 |
||
90 | #define C6 6270 |
||
91 | #define C7 3196 |
||
92 | |||
93 | Sk = int16(sin(k/16*pi)*32767+.5)/2 |
||
94 | #define S4 11585 |
||
95 | #define S3 9102 |
||
96 | #define S6 15137 |
||
97 | #define S7 16069 |
||
98 | |||
99 | the coefficients are ordered as follows: |
||
100 | short dct_coef[] |
||
101 | C4,S4, |
||
102 | C6,S6, |
||
103 | C7,S7, |
||
104 | S3,C3, |
||
105 | |||
106 | ----------------------------------------------------------- |
||
107 | FFMPEG conformance testing results |
||
108 | ----------------------------------------------------------- |
||
109 | dct-test: modified with the following |
||
110 | dct_error("BFINfdct", 0, ff_bfin_fdct, fdct, test); |
||
111 | produces the following output: |
||
112 | |||
113 | root:/u/ffmpeg/bhead/libavcodec> ./dct-test |
||
114 | ffmpeg DCT/IDCT test |
||
115 | |||
116 | 2 -131 -6 -48 -36 33 -83 24 |
||
117 | 34 52 -24 -15 5 92 57 143 |
||
118 | -67 -43 -1 74 -16 5 -71 32 |
||
119 | -78 106 92 -34 -38 81 20 -18 |
||
120 | 7 -62 40 2 -15 90 -62 -83 |
||
121 | -83 1 -104 -13 43 -19 7 11 |
||
122 | -63 31 12 -29 83 72 21 10 |
||
123 | -17 -63 -15 73 50 -91 159 -14 |
||
124 | DCT BFINfdct: err_inf=2 err2=0.16425938 syserr=0.00795000 maxout=2098 blockSumErr=27 |
||
125 | DCT BFINfdct: 92.1 kdct/s |
||
126 | root:/u/ffmpeg/bhead/libavcodec> |
||
127 | |||
128 | */ |
||
129 | |||
130 | #include "config.h" |
||
131 | #include "config_bfin.h" |
||
132 | |||
133 | #if defined(__FDPIC__) && CONFIG_SRAM |
||
134 | .section .l1.data.B,"aw",@progbits |
||
135 | #else |
||
136 | .data |
||
137 | #endif |
||
138 | .align 4; |
||
139 | dct_coeff: |
||
140 | .short 0x5a82, 0x2d41, 0x187e, 0x3b21, 0x0c7c, 0x3ec5, 0x238e, 0x3537; |
||
141 | |||
142 | #if defined(__FDPIC__) && CONFIG_SRAM |
||
143 | .section .l1.data.A,"aw",@progbits |
||
144 | #endif |
||
145 | .align 4 |
||
146 | vtmp: .space 128 |
||
147 | |||
148 | .text |
||
149 | DEFUN(fdct,mL1, |
||
150 | (int16_t *block)): |
||
151 | [--SP] = (R7:4, P5:3); // Push the registers onto the stack. |
||
152 | |||
153 | b0 = r0; |
||
154 | RELOC(r0, P3, dct_coeff); |
||
155 | b3 = r0; |
||
156 | RELOC(r0, P3, vtmp); |
||
157 | b2 = r0; |
||
158 | |||
159 | L3 = 16; // L3 is set to 16 to make the coefficient |
||
160 | // array Circular. |
||
161 | |||
162 | |||
163 | //---------------------------------------------------------------------------- |
||
164 | |||
165 | /* |
||
166 | * I0, I1, and I2 registers are used to read the input data. I3 register is used |
||
167 | * to read the coefficients. P0 and P1 registers are used for writing the output |
||
168 | * data. |
||
169 | */ |
||
170 | M0 = 12 (X); // All these initializations are used in the |
||
171 | M1 = 16 (X); // modification of address offsets. |
||
172 | |||
173 | M2 = 128 (X); |
||
174 | |||
175 | P2 = 16; |
||
176 | P3 = 32 (X); |
||
177 | P4 = -110 (X); |
||
178 | P5 = -62 (X); |
||
179 | P0 = 2(X); |
||
180 | |||
181 | |||
182 | // Prescale the input to get the correct precision. |
||
183 | i0=b0; |
||
184 | i1=b0; |
||
185 | |||
186 | lsetup (.0, .1) LC0 = P3; |
||
187 | r0=[i0++]; |
||
188 | .0: r1=r0<<3 (v) || r0=[i0++] ; |
||
189 | .1: [i1++]=r1; |
||
190 | |||
191 | /* |
||
192 | * B0 points to the "in" buffer. |
||
193 | * B2 points to "temp" buffer in the first iteration. |
||
194 | */ |
||
195 | |||
196 | lsetup (.2, .3) LC0 = P0; |
||
197 | .2: |
||
198 | I0 = B0; // I0 points to Input Element (0, 0). |
||
199 | I1 = B0; // Element 1 and 0 is read in R0. |
||
200 | I1 += M0 || R0 = [I0++]; // I1 points to Input Element (0, 6). |
||
201 | I2 = I1; // Element 6 is read into R3.H. |
||
202 | I2 -= 4 || R3.H = W[I1++]; // I2 points to Input Element (0, 4). |
||
203 | |||
204 | I3 = B3; // I3 points to Coefficients. |
||
205 | P0 = B2; // P0 points to temporary array Element |
||
206 | // (0, 0). |
||
207 | P1 = B2; // P1 points to temporary array. |
||
208 | R7 = [P1++P2] || R2 = [I2++]; // P1 points to temporary array |
||
209 | // Element (1, 0). |
||
210 | // R7 is a dummy read. X4,X5 |
||
211 | // are read into R2. |
||
212 | R3.L = W[I1--]; // X7 is read into R3.L. |
||
213 | R1.H = W[I0++]; // X2 is read into R1.H. |
||
214 | |||
215 | |||
216 | /* |
||
217 | * X0 = (X0 + X7) / 2. |
||
218 | * X1 = (X1 + X6) / 2. |
||
219 | * X6 = (X1 - X6) / 2. |
||
220 | * X7 = (X0 - X7) / 2. |
||
221 | * X3 is also read into R1.L. |
||
222 | */ |
||
223 | |||
224 | R0 = R0 +|+ R3, R3 = R0 -|- R3 || R1.L = W[I0++] || NOP; |
||
225 | |||
226 | /* |
||
227 | * X2 = (X2 + X5) / 2. |
||
228 | * X3 = (X3 + X4) / 2. |
||
229 | * X4 = (X3 - X4) / 2. |
||
230 | * X5 = (X2 - X5) / 2. |
||
231 | * R7 = C4 = cos(4*pi/16) |
||
232 | */ |
||
233 | |||
234 | R1 = R1 +|+ R2, R2 = R1 -|- R2 (CO) || NOP || R7 = [I3++]; |
||
235 | |||
236 | /* |
||
237 | * At the end of stage 1 R0 has (1,0), R1 has (2,3), R2 has (4, 5) and |
||
238 | * R3 has (6,7). |
||
239 | * Where the notation (x, y) represents upper/lower half pairs. |
||
240 | */ |
||
241 | |||
242 | /* |
||
243 | * X0 = X0 + X3. |
||
244 | * X1 = X1 + X2. |
||
245 | * X2 = X1 - X2. |
||
246 | * X3 = X0 - X3. |
||
247 | */ |
||
248 | R0 = R0 +|+ R1, R1 = R0 -|- R1; |
||
249 | |||
250 | lsetup (.row0, .row1) LC1 = P2 >> 1; // 1d dct, loops 8x |
||
251 | .row0: |
||
252 | |||
253 | /* |
||
254 | * This is part 2 computation continued..... |
||
255 | * A1 = X6 * cos(pi/4) |
||
256 | * A0 = X6 * cos(pi/4) |
||
257 | * A1 = A1 - X5 * cos(pi/4) |
||
258 | * A0 = A0 + X5 * cos(pi/4). |
||
259 | * The instruction W[I0] = R3.L is used for packing it to R2.L. |
||
260 | */ |
||
261 | |||
262 | A1=R3.H*R7.l, A0=R3.H*R7.l || I1+=M1 || W[I0] = R3.L; |
||
263 | R4.H=(A1-=R2.L*R7.l), R4.L=(A0+=R2.L*R7.l) || I2+=M0 || NOP; |
||
264 | |||
265 | /* R0 = (X1,X0) R1 = (X2,X3) R4 = (X5, X6). */ |
||
266 | |||
267 | /* |
||
268 | * A1 = X0 * cos(pi/4) |
||
269 | * A0 = X0 * cos(pi/4) |
||
270 | * A1 = A1 - X1 * cos(pi/4) |
||
271 | * A0 = A0 + X1 * cos(pi/4) |
||
272 | * R7 = (C2,C6) |
||
273 | */ |
||
274 | A1=R0.L*R7.h, A0=R0.L*R7.h || NOP || R3.H=W[I1++]; |
||
275 | R5.H=(A1-=R0.H*R7.h),R5.L=(A0+=R0.H*R7.h) || R7=[I3++] || NOP; |
||
276 | |||
277 | /* |
||
278 | * A1 = X2 * cos(3pi/8) |
||
279 | * A0 = X3 * cos(3pi/8) |
||
280 | * A1 = A1 + X3 * cos(pi/8) |
||
281 | * A0 = A0 - X2 * cos(pi/8) |
||
282 | * R3 = cos(pi/4) |
||
283 | * R7 = (cos(pi/16),cos(7pi/16)) |
||
284 | * X4 = X4 + X5. |
||
285 | * X5 = X4 - X5. |
||
286 | * X6 = X7 - X6. |
||
287 | * X7 = X7 + X6. |
||
288 | */ |
||
289 | A1=R1.H*R7.L, A0=R1.L*R7.L || W[P0++P3]=R5.L || R2.L=W[I0]; |
||
290 | R2=R2+|+R4, R4=R2-|-R4 || I0+=4 || R3.L=W[I1--]; |
||
291 | R6.H=(A1+=R1.L*R7.H),R6.L=(A0 -= R1.H * R7.H) || I0+=4 || R7=[I3++]; |
||
292 | |||
293 | /* R2 = (X4, X7) R4 = (X5,X6) R5 = (X1, X0) R6 = (X2,X3). */ |
||
294 | |||
295 | /* |
||
296 | * A1 = X4 * cos(7pi/16) |
||
297 | * A0 = X7 * cos(7pi/16) |
||
298 | * A1 = A1 + X7 * cos(pi/16) |
||
299 | * A0 = A0 - X4 * cos(pi/16) |
||
300 | */ |
||
301 | |||
302 | A1=R2.H*R7.L, A0=R2.L*R7.L || W[P0++P3]=R6.H || R0=[I0++]; |
||
303 | R2.H=(A1+=R2.L*R7.H),R2.L=(A0-=R2.H*R7.H) || W[P0++P3]=R5.H || R7=[I3++]; |
||
304 | |||
305 | /* |
||
306 | * A1 = X5 * cos(3pi/16) |
||
307 | * A0 = X6 * cos(3pi/16) |
||
308 | * A1 = A1 + X6 * cos(5pi/16) |
||
309 | * A0 = A0 - X5 * cos(5pi/16) |
||
310 | * The output values are written. |
||
311 | */ |
||
312 | |||
313 | A1=R4.H*R7.H, A0=R4.L*R7.H || W[P0++P2]=R6.L || R1.H=W[I0++]; |
||
314 | R4.H=(A1+=R4.L*R7.L),R4.L=(A0-=R4.H*R7.L) || W[P0++P4]=R2.L || R1.L=W[I0++]; |
||
315 | |||
316 | |||
317 | /* Beginning of next stage, **pipelined** + drain and store the |
||
318 | rest of the column store. */ |
||
319 | |||
320 | R0=R0+|+R3,R3=R0-|-R3 || W[P1++P3]=R2.H || R2=[I2++]; |
||
321 | R1=R1+|+R2,R2=R1-|-R2 (CO) || W[P1++P3]=R4.L || R7=[I3++]; |
||
322 | .row1: R0=R0+|+R1,R1=R0-|-R1 || W[P1++P5]=R4.H || NOP; |
||
323 | |||
324 | // Exchange input with output. |
||
325 | B1 = B0; |
||
326 | B0 = B2; |
||
327 | .3: B2 = B1; |
||
328 | |||
329 | L3=0; |
||
330 | (r7:4,p5:3) = [sp++]; |
||
331 | RTS; |
||
332 | DEFUN_END(fdct)3><3> |