Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4349 | Serge | 1 | /* |
2 | * Simple IDCT (Alpha optimized) |
||
3 | * |
||
4 | * Copyright (c) 2001 Michael Niedermayer |
||
5 | * |
||
6 | * based upon some commented-out C code from mpeg2dec (idct_mmx.c |
||
7 | * written by Aaron Holtzman) |
||
8 | * |
||
9 | * Alpha optimizations by Måns Rullgård |
||
10 | * and Falk Hueffner |
||
11 | * |
||
12 | * This file is part of FFmpeg. |
||
13 | * |
||
14 | * FFmpeg is free software; you can redistribute it and/or |
||
15 | * modify it under the terms of the GNU Lesser General Public |
||
16 | * License as published by the Free Software Foundation; either |
||
17 | * version 2.1 of the License, or (at your option) any later version. |
||
18 | * |
||
19 | * FFmpeg is distributed in the hope that it will be useful, |
||
20 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
21 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
22 | * Lesser General Public License for more details. |
||
23 | * |
||
24 | * You should have received a copy of the GNU Lesser General Public |
||
25 | * License along with FFmpeg; if not, write to the Free Software |
||
26 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
27 | */ |
||
28 | |||
29 | #include "dsputil_alpha.h" |
||
30 | #include "asm.h" |
||
31 | |||
32 | // Wi = cos(i * M_PI / 16) * sqrt(2) * (1 << 14), rounded to the nearest integer |
||
33 | // W4 is actually exactly 16384, but using 16383 works around |
||
34 | // accumulating rounding errors for some encoders |
||
35 | #define W1 22725 |
||
36 | #define W2 21407 |
||
37 | #define W3 19266 |
||
38 | #define W4 16383 |
||
39 | #define W5 12873 |
||
40 | #define W6 8867 |
||
41 | #define W7 4520 |
||
42 | #define ROW_SHIFT 11 |
||
43 | #define COL_SHIFT 20 |
||
44 | |||
45 | /* 0: all entries 0, 1: only first entry nonzero, 2: otherwise */ |
||
46 | static inline int idct_row(int16_t *row) |
||
47 | { |
||
48 | int a0, a1, a2, a3, b0, b1, b2, b3, t; |
||
49 | uint64_t l, r, t2; |
||
50 | l = ldq(row); |
||
51 | r = ldq(row + 4); |
||
52 | |||
53 | if (l == 0 && r == 0) |
||
54 | return 0; |
||
55 | |||
56 | a0 = W4 * sextw(l) + (1 << (ROW_SHIFT - 1)); |
||
57 | |||
58 | if (((l & ~0xffffUL) | r) == 0) { |
||
59 | a0 >>= ROW_SHIFT; |
||
60 | t2 = (uint16_t) a0; |
||
61 | t2 |= t2 << 16; |
||
62 | t2 |= t2 << 32; |
||
63 | |||
64 | stq(t2, row); |
||
65 | stq(t2, row + 4); |
||
66 | return 1; |
||
67 | } |
||
68 | |||
69 | a1 = a0; |
||
70 | a2 = a0; |
||
71 | a3 = a0; |
||
72 | |||
73 | t = extwl(l, 4); /* row[2] */ |
||
74 | if (t != 0) { |
||
75 | t = sextw(t); |
||
76 | a0 += W2 * t; |
||
77 | a1 += W6 * t; |
||
78 | a2 -= W6 * t; |
||
79 | a3 -= W2 * t; |
||
80 | } |
||
81 | |||
82 | t = extwl(r, 0); /* row[4] */ |
||
83 | if (t != 0) { |
||
84 | t = sextw(t); |
||
85 | a0 += W4 * t; |
||
86 | a1 -= W4 * t; |
||
87 | a2 -= W4 * t; |
||
88 | a3 += W4 * t; |
||
89 | } |
||
90 | |||
91 | t = extwl(r, 4); /* row[6] */ |
||
92 | if (t != 0) { |
||
93 | t = sextw(t); |
||
94 | a0 += W6 * t; |
||
95 | a1 -= W2 * t; |
||
96 | a2 += W2 * t; |
||
97 | a3 -= W6 * t; |
||
98 | } |
||
99 | |||
100 | t = extwl(l, 2); /* row[1] */ |
||
101 | if (t != 0) { |
||
102 | t = sextw(t); |
||
103 | b0 = W1 * t; |
||
104 | b1 = W3 * t; |
||
105 | b2 = W5 * t; |
||
106 | b3 = W7 * t; |
||
107 | } else { |
||
108 | b0 = 0; |
||
109 | b1 = 0; |
||
110 | b2 = 0; |
||
111 | b3 = 0; |
||
112 | } |
||
113 | |||
114 | t = extwl(l, 6); /* row[3] */ |
||
115 | if (t) { |
||
116 | t = sextw(t); |
||
117 | b0 += W3 * t; |
||
118 | b1 -= W7 * t; |
||
119 | b2 -= W1 * t; |
||
120 | b3 -= W5 * t; |
||
121 | } |
||
122 | |||
123 | |||
124 | t = extwl(r, 2); /* row[5] */ |
||
125 | if (t) { |
||
126 | t = sextw(t); |
||
127 | b0 += W5 * t; |
||
128 | b1 -= W1 * t; |
||
129 | b2 += W7 * t; |
||
130 | b3 += W3 * t; |
||
131 | } |
||
132 | |||
133 | t = extwl(r, 6); /* row[7] */ |
||
134 | if (t) { |
||
135 | t = sextw(t); |
||
136 | b0 += W7 * t; |
||
137 | b1 -= W5 * t; |
||
138 | b2 += W3 * t; |
||
139 | b3 -= W1 * t; |
||
140 | } |
||
141 | |||
142 | row[0] = (a0 + b0) >> ROW_SHIFT; |
||
143 | row[1] = (a1 + b1) >> ROW_SHIFT; |
||
144 | row[2] = (a2 + b2) >> ROW_SHIFT; |
||
145 | row[3] = (a3 + b3) >> ROW_SHIFT; |
||
146 | row[4] = (a3 - b3) >> ROW_SHIFT; |
||
147 | row[5] = (a2 - b2) >> ROW_SHIFT; |
||
148 | row[6] = (a1 - b1) >> ROW_SHIFT; |
||
149 | row[7] = (a0 - b0) >> ROW_SHIFT; |
||
150 | |||
151 | return 2; |
||
152 | } |
||
153 | |||
154 | static inline void idct_col(int16_t *col) |
||
155 | { |
||
156 | int a0, a1, a2, a3, b0, b1, b2, b3; |
||
157 | |||
158 | col[0] += (1 << (COL_SHIFT - 1)) / W4; |
||
159 | |||
160 | a0 = W4 * col[8 * 0]; |
||
161 | a1 = W4 * col[8 * 0]; |
||
162 | a2 = W4 * col[8 * 0]; |
||
163 | a3 = W4 * col[8 * 0]; |
||
164 | |||
165 | if (col[8 * 2]) { |
||
166 | a0 += W2 * col[8 * 2]; |
||
167 | a1 += W6 * col[8 * 2]; |
||
168 | a2 -= W6 * col[8 * 2]; |
||
169 | a3 -= W2 * col[8 * 2]; |
||
170 | } |
||
171 | |||
172 | if (col[8 * 4]) { |
||
173 | a0 += W4 * col[8 * 4]; |
||
174 | a1 -= W4 * col[8 * 4]; |
||
175 | a2 -= W4 * col[8 * 4]; |
||
176 | a3 += W4 * col[8 * 4]; |
||
177 | } |
||
178 | |||
179 | if (col[8 * 6]) { |
||
180 | a0 += W6 * col[8 * 6]; |
||
181 | a1 -= W2 * col[8 * 6]; |
||
182 | a2 += W2 * col[8 * 6]; |
||
183 | a3 -= W6 * col[8 * 6]; |
||
184 | } |
||
185 | |||
186 | if (col[8 * 1]) { |
||
187 | b0 = W1 * col[8 * 1]; |
||
188 | b1 = W3 * col[8 * 1]; |
||
189 | b2 = W5 * col[8 * 1]; |
||
190 | b3 = W7 * col[8 * 1]; |
||
191 | } else { |
||
192 | b0 = 0; |
||
193 | b1 = 0; |
||
194 | b2 = 0; |
||
195 | b3 = 0; |
||
196 | } |
||
197 | |||
198 | if (col[8 * 3]) { |
||
199 | b0 += W3 * col[8 * 3]; |
||
200 | b1 -= W7 * col[8 * 3]; |
||
201 | b2 -= W1 * col[8 * 3]; |
||
202 | b3 -= W5 * col[8 * 3]; |
||
203 | } |
||
204 | |||
205 | if (col[8 * 5]) { |
||
206 | b0 += W5 * col[8 * 5]; |
||
207 | b1 -= W1 * col[8 * 5]; |
||
208 | b2 += W7 * col[8 * 5]; |
||
209 | b3 += W3 * col[8 * 5]; |
||
210 | } |
||
211 | |||
212 | if (col[8 * 7]) { |
||
213 | b0 += W7 * col[8 * 7]; |
||
214 | b1 -= W5 * col[8 * 7]; |
||
215 | b2 += W3 * col[8 * 7]; |
||
216 | b3 -= W1 * col[8 * 7]; |
||
217 | } |
||
218 | |||
219 | col[8 * 0] = (a0 + b0) >> COL_SHIFT; |
||
220 | col[8 * 7] = (a0 - b0) >> COL_SHIFT; |
||
221 | col[8 * 1] = (a1 + b1) >> COL_SHIFT; |
||
222 | col[8 * 6] = (a1 - b1) >> COL_SHIFT; |
||
223 | col[8 * 2] = (a2 + b2) >> COL_SHIFT; |
||
224 | col[8 * 5] = (a2 - b2) >> COL_SHIFT; |
||
225 | col[8 * 3] = (a3 + b3) >> COL_SHIFT; |
||
226 | col[8 * 4] = (a3 - b3) >> COL_SHIFT; |
||
227 | } |
||
228 | |||
229 | /* If all rows but the first one are zero after row transformation, |
||
230 | all rows will be identical after column transformation. */ |
||
231 | static inline void idct_col2(int16_t *col) |
||
232 | { |
||
233 | int i; |
||
234 | uint64_t l, r; |
||
235 | |||
236 | for (i = 0; i < 8; ++i) { |
||
237 | int a0 = col[i] + (1 << (COL_SHIFT - 1)) / W4; |
||
238 | |||
239 | a0 *= W4; |
||
240 | col[i] = a0 >> COL_SHIFT; |
||
241 | } |
||
242 | |||
243 | l = ldq(col + 0 * 4); r = ldq(col + 1 * 4); |
||
244 | stq(l, col + 2 * 4); stq(r, col + 3 * 4); |
||
245 | stq(l, col + 4 * 4); stq(r, col + 5 * 4); |
||
246 | stq(l, col + 6 * 4); stq(r, col + 7 * 4); |
||
247 | stq(l, col + 8 * 4); stq(r, col + 9 * 4); |
||
248 | stq(l, col + 10 * 4); stq(r, col + 11 * 4); |
||
249 | stq(l, col + 12 * 4); stq(r, col + 13 * 4); |
||
250 | stq(l, col + 14 * 4); stq(r, col + 15 * 4); |
||
251 | } |
||
252 | |||
/**
 * In-place two-dimensional inverse DCT: a row pass over all eight rows,
 * then a column pass chosen according to how sparse the rows were.
 *
 * @param block  eight rows of eight coefficients, overwritten with the result
 */
void ff_simple_idct_axp(int16_t *block)
{
    int only_first_row = 1; /* every row except row 0 was all zero */
    int all_rows_flat  = 1; /* no row needed the full row transform */
    int16_t *pair;
    int i;

    for (i = 0; i < 8; i++) {
        const int sparseness = idct_row(block + 8 * i);

        if (sparseness > 0 && i > 0)
            only_first_row = 0;
        if (sparseness == 2)
            all_rows_flat = 0;
    }

    /* Only row 0 carries data: scale it and copy it to the other rows. */
    if (only_first_row) {
        idct_col2(block);
        return;
    }

    /* General case: transform each of the eight columns. */
    if (!all_rows_flat) {
        for (i = 0; i < 8; i++)
            idct_col(block + i);
        return;
    }

    /* Each row holds one repeated value, so all columns are identical:
     * transform column 0 only, then spread each row's first entry across
     * that row, handling two rows per iteration. */
    idct_col(block);
    for (pair = block; pair < block + 8 * 8; pair += 2 * 8) {
        uint64_t upper = (uint16_t) pair[0];
        uint64_t lower = (uint16_t) pair[8];

        upper |= upper << 16;
        lower |= lower << 16;
        upper |= upper << 32;
        lower |= lower << 32;

        stq(upper, pair + 0 * 4);
        stq(upper, pair + 1 * 4);
        stq(lower, pair + 2 * 4);
        stq(lower, pair + 3 * 4);
    }
}
||
292 | |||
/**
 * Inverse-transform a block in place, then hand the result to
 * put_pixels_clamped_axp_p() together with the destination.
 *
 * NOTE(review): presumably this stores the clamped result into dest,
 * replacing what was there -- confirm against put_pixels_clamped_axp_p.
 *
 * @param dest       destination pixel buffer
 * @param line_size  passed through unchanged; presumably the byte stride
 *                   between destination rows
 * @param block      coefficient block; overwritten by the IDCT
 */
void ff_simple_idct_put_axp(uint8_t *dest, int line_size, int16_t *block)
{
    ff_simple_idct_axp(block);
    put_pixels_clamped_axp_p(block, dest, line_size);
}
||
298 | |||
/**
 * Inverse-transform a block in place, then hand the result to
 * add_pixels_clamped_axp_p() together with the destination.
 *
 * NOTE(review): presumably this adds the result to the pixels already in
 * dest, with clamping -- confirm against add_pixels_clamped_axp_p.
 *
 * @param dest       destination pixel buffer
 * @param line_size  passed through unchanged; presumably the byte stride
 *                   between destination rows
 * @param block      coefficient block; overwritten by the IDCT
 */
void ff_simple_idct_add_axp(uint8_t *dest, int line_size, int16_t *block)
{
    ff_simple_idct_axp(block);
    add_pixels_clamped_axp_p(block, dest, line_size);
}