Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
6148 | serge | 1 | /* |
2 | * Copyright (C) 2001-2002 Michael Niedermayer (michaelni@gmx.at) |
||
3 | * |
||
4 | * This file is part of FFmpeg. |
||
5 | * |
||
6 | * FFmpeg is free software; you can redistribute it and/or modify |
||
7 | * it under the terms of the GNU General Public License as published by |
||
8 | * the Free Software Foundation; either version 2 of the License, or |
||
9 | * (at your option) any later version. |
||
10 | * |
||
11 | * FFmpeg is distributed in the hope that it will be useful, |
||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
14 | * GNU General Public License for more details. |
||
15 | * |
||
16 | * You should have received a copy of the GNU General Public License |
||
17 | * along with FFmpeg; if not, write to the Free Software |
||
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
19 | */ |
||
20 | |||
21 | /** |
||
22 | * @file |
||
23 | * mmx/mmx2/3dnow postprocess code. |
||
24 | */ |
||
25 | |||
26 | #include "libavutil/x86/asm.h" |
||
27 | |||
/*
 * Template configuration.
 *
 * Exactly one TEMPLATE_PP_* macro should be defined (to 1) by the file that
 * includes this template.  The section below
 *   - selects the RENAME() suffix belonging to that variant,
 *   - forces the variants it depends on to 1 (MMXEXT and 3DNOW imply MMX,
 *     SSE2 implies MMX and MMXEXT),
 *   - defines every TEMPLATE_PP_* that was not requested to 0, so that all of
 *     them can be tested with plain #if afterwards.
 * Every TEMPLATE_PP_* needs to be undefined again at the end of the template.
 *
 * The order of the sections matters: a later section may #undef and redefine
 * a macro that an earlier section has just set to 0.
 */

#ifndef TEMPLATE_PP_C
#   define TEMPLATE_PP_C 0
#else
#   define RENAME(a) a ## _C
#endif

#ifndef TEMPLATE_PP_ALTIVEC
#   define TEMPLATE_PP_ALTIVEC 0
#else
#   define RENAME(a) a ## _altivec
#endif

#ifndef TEMPLATE_PP_MMX
#   define TEMPLATE_PP_MMX 0
#else
#   define RENAME(a) a ## _MMX
#endif

#ifndef TEMPLATE_PP_MMXEXT
#   define TEMPLATE_PP_MMXEXT 0
#else
#   undef  TEMPLATE_PP_MMX
#   define TEMPLATE_PP_MMX 1
#   define RENAME(a) a ## _MMX2
#endif

#ifndef TEMPLATE_PP_3DNOW
#   define TEMPLATE_PP_3DNOW 0
#else
#   undef  TEMPLATE_PP_MMX
#   define TEMPLATE_PP_MMX 1
#   define RENAME(a) a ## _3DNow
#endif

#ifndef TEMPLATE_PP_SSE2
#   define TEMPLATE_PP_SSE2 0
#else
#   undef  TEMPLATE_PP_MMX
#   define TEMPLATE_PP_MMX 1
#   undef  TEMPLATE_PP_MMXEXT
#   define TEMPLATE_PP_MMXEXT 1
#   define RENAME(a) a ## _SSE2
#endif

/* Drop the helper macros of any variant that was instantiated before. */
#undef REAL_PAVGB
#undef PAVGB
#undef PMINUB
#undef PMAXUB

/*
 * PAVGB(src, dst): emit the per-byte unsigned average instruction of the
 * selected variant ("pavgb" for MMXEXT, "pavgusb" for 3DNow!).
 * The extra REAL_PAVGB level makes the preprocessor macro-expand the
 * arguments before they are turned into string literals.
 * REAL_PAVGB stays undefined when neither variant is selected.
 */
#if TEMPLATE_PP_MMXEXT
#   define REAL_PAVGB(src, dst) "pavgb " #src ", " #dst " \n\t"
#elif TEMPLATE_PP_3DNOW
#   define REAL_PAVGB(src, dst) "pavgusb " #src ", " #dst " \n\t"
#endif
#define PAVGB(a,b) REAL_PAVGB(a,b)

/*
 * PMINUB(src, dst, tmp): dst = per-byte unsigned minimum of src and dst.
 * PMAXUB(src, dst):      dst = per-byte unsigned maximum of src and dst.
 * MMXEXT has native instructions for both.  Plain MMX emulates them with
 * saturating subtraction; tmp is a scratch register that is only touched by
 * that emulation of PMINUB.
 */
#if TEMPLATE_PP_MMXEXT
#   define PMINUB(src, dst, tmp) "pminub " #src ", " #dst " \n\t"
#   define PMAXUB(src, dst) "pmaxub " #src ", " #dst " \n\t"
#elif TEMPLATE_PP_MMX
    /* tmp = max(dst - src, 0);  dst -= tmp  ->  min(src, dst) */
#   define PMINUB(src, dst, tmp) \
    "movq " #dst ", " #tmp " \n\t"\
    "psubusb " #src ", " #tmp " \n\t"\
    "psubb " #tmp ", " #dst " \n\t"
    /* dst = max(dst - src, 0) + src  ->  max(src, dst) */
#   define PMAXUB(src, dst) \
    "psubusb " #src ", " #dst " \n\t"\
    "paddb " #src ", " #dst " \n\t"
#endif
||
105 | |||
//FIXME? |255-0| = 1 (should not be a problem ...)
#if TEMPLATE_PP_MMX
/**
 * Check if the middle 8x8 Block in the given 8x16 block is flat.
 *
 * The eight lines starting at src + 4*stride are examined, eight bytes per
 * line.  For each of the seven pairs of neighbouring lines and each column,
 * the byte difference plus c->mmxDcOffset is compared (signed) against
 * c->mmxDcThreshold; the number of comparisons that come out "greater" is
 * counted.  In parallel the per-column maximum and minimum over the eight
 * lines are tracked.
 *
 * @param src    top-left byte of the 8x16 block
 * @param stride distance between two consecutive lines, as used for addressing
 * @param c      context supplying mmxDcOffset / mmxDcThreshold (indexed by
 *               nonBQP), pQPb and ppMode.flatnessThreshold
 * @return 2 if the count is not above c->ppMode.flatnessThreshold;
 *         otherwise 0 if in at least one column (max - min) exceeds 2*QP,
 *         and 1 if it does not in any column
 */
static inline int RENAME(vertClassify)(uint8_t src[], int stride, PPContext *c){
    int eqCount= 0;   // receives the (negated, modulo 256) comparison count
    int rangeFlag;    // nonzero if some column has max - min > 2*QP

    src+= stride*4; // src points to begin of the 8x8 Block

    // mm7 = DC offset, mm6 = DC threshold; both stay live for the next asm block
    __asm__ volatile(
        "movq %0, %%mm7 \n\t"
        "movq %1, %%mm6 \n\t"
        : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP])
        );

    __asm__ volatile(
        "lea (%2, %3), %%"REG_a" \n\t"
        // line:  0    1     2        3         4       5     6        7
        //        %2   eax   eax+%3   eax+2%3   %2+4%3  eax'  eax'+%3  eax'+2%3
        // (eax' = eax after it has been advanced by 4 lines below)

        // lines 0/1: start the running max (mm4) and min (mm3) and the counter (mm0)
        "movq (%2), %%mm0 \n\t"
        "movq (%%"REG_a"), %%mm1 \n\t"
        "movq %%mm0, %%mm3 \n\t"
        "movq %%mm0, %%mm4 \n\t"
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm0 \n\t" // mm0 = difference
        "paddb %%mm7, %%mm0 \n\t"
        "pcmpgtb %%mm6, %%mm0 \n\t" // -1 where difference + offset > threshold

        // lines 1/2
        "movq (%%"REG_a",%3), %%mm2 \n\t"
        PMAXUB(%%mm2, %%mm4)
        PMINUB(%%mm2, %%mm3, %%mm5)
        "psubb %%mm2, %%mm1 \n\t"
        "paddb %%mm7, %%mm1 \n\t"
        "pcmpgtb %%mm6, %%mm1 \n\t"
        "paddb %%mm1, %%mm0 \n\t" // accumulate the -1 results

        // lines 2/3
        "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm2 \n\t"
        "paddb %%mm7, %%mm2 \n\t"
        "pcmpgtb %%mm6, %%mm2 \n\t"
        "paddb %%mm2, %%mm0 \n\t"

        "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" // eax now addresses line 5

        // lines 3/4
        "movq (%2, %3, 4), %%mm2 \n\t"
        PMAXUB(%%mm2, %%mm4)
        PMINUB(%%mm2, %%mm3, %%mm5)
        "psubb %%mm2, %%mm1 \n\t"
        "paddb %%mm7, %%mm1 \n\t"
        "pcmpgtb %%mm6, %%mm1 \n\t"
        "paddb %%mm1, %%mm0 \n\t"

        // lines 4/5
        "movq (%%"REG_a"), %%mm1 \n\t"
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm2 \n\t"
        "paddb %%mm7, %%mm2 \n\t"
        "pcmpgtb %%mm6, %%mm2 \n\t"
        "paddb %%mm2, %%mm0 \n\t"

        // lines 5/6
        "movq (%%"REG_a", %3), %%mm2 \n\t"
        PMAXUB(%%mm2, %%mm4)
        PMINUB(%%mm2, %%mm3, %%mm5)
        "psubb %%mm2, %%mm1 \n\t"
        "paddb %%mm7, %%mm1 \n\t"
        "pcmpgtb %%mm6, %%mm1 \n\t"
        "paddb %%mm1, %%mm0 \n\t"

        // lines 6/7
        "movq (%%"REG_a", %3, 2), %%mm1 \n\t"
        PMAXUB(%%mm1, %%mm4)
        PMINUB(%%mm1, %%mm3, %%mm5)
        "psubb %%mm1, %%mm2 \n\t"
        "paddb %%mm7, %%mm2 \n\t"
        "pcmpgtb %%mm6, %%mm2 \n\t"
        "paddb %%mm2, %%mm0 \n\t"
        "psubusb %%mm3, %%mm4 \n\t" // mm4 = max - min per column

        " \n\t"
        // add the eight per-column counters of mm0 together
#if TEMPLATE_PP_MMXEXT
        "pxor %%mm7, %%mm7 \n\t"
        "psadbw %%mm7, %%mm0 \n\t"
#else
        "movq %%mm0, %%mm1 \n\t"
        "psrlw $8, %%mm0 \n\t"
        "paddb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlq $16, %%mm0 \n\t"
        "paddb %%mm1, %%mm0 \n\t"
        "movq %%mm0, %%mm1 \n\t"
        "psrlq $32, %%mm0 \n\t"
        "paddb %%mm1, %%mm0 \n\t"
#endif
        "movq %4, %%mm7 \n\t" // QP,..., QP
        "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP
        "psubusb %%mm7, %%mm4 \n\t" // Diff <= 2QP -> 0
        "packssdw %%mm4, %%mm4 \n\t" // fold all 64 bits into the low 32
        "movd %%mm0, %0 \n\t"
        "movd %%mm4, %1 \n\t"

        : "=r" (eqCount), "=r" (rangeFlag)
        : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb)
        : "%"REG_a
        );

    // every hit was accumulated as -1, so negate; only the low byte is meaningful
    eqCount= (-eqCount) &0xFF;

    if(eqCount <= c->ppMode.flatnessThreshold)
        return 2;

    return rangeFlag ? 0 : 1;
}
#endif //TEMPLATE_PP_MMX
||
222 | |||
223 | /** |
||
224 | * Do a vertical low pass filter on the 8x16 block (only write to the 8x8 block in the middle) |
||
225 | * using the 9-Tap Filter (1,1,2,2,4,2,2,1,1)/16 |
||
226 | */ |
||
227 | #if !TEMPLATE_PP_ALTIVEC |
||
228 | static inline void RENAME(doVertLowPass)(uint8_t *src, int stride, PPContext *c) |
||
229 | { |
||
230 | #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
231 | src+= stride*3; |
||
232 | __asm__ volatile( //"movv %0 %1 %2\n\t" |
||
233 | "movq %2, %%mm0 \n\t" // QP,..., QP |
||
234 | "pxor %%mm4, %%mm4 \n\t" |
||
235 | |||
236 | "movq (%0), %%mm6 \n\t" |
||
237 | "movq (%0, %1), %%mm5 \n\t" |
||
238 | "movq %%mm5, %%mm1 \n\t" |
||
239 | "movq %%mm6, %%mm2 \n\t" |
||
240 | "psubusb %%mm6, %%mm5 \n\t" |
||
241 | "psubusb %%mm1, %%mm2 \n\t" |
||
242 | "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
||
243 | "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
||
244 | "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF |
||
245 | |||
246 | "pand %%mm2, %%mm6 \n\t" |
||
247 | "pandn %%mm1, %%mm2 \n\t" |
||
248 | "por %%mm2, %%mm6 \n\t"// First Line to Filter |
||
249 | |||
250 | "movq (%0, %1, 8), %%mm5 \n\t" |
||
251 | "lea (%0, %1, 4), %%"REG_a" \n\t" |
||
252 | "lea (%0, %1, 8), %%"REG_c" \n\t" |
||
253 | "sub %1, %%"REG_c" \n\t" |
||
254 | "add %1, %0 \n\t" // %0 points to line 1 not 0 |
||
255 | "movq (%0, %1, 8), %%mm7 \n\t" |
||
256 | "movq %%mm5, %%mm1 \n\t" |
||
257 | "movq %%mm7, %%mm2 \n\t" |
||
258 | "psubusb %%mm7, %%mm5 \n\t" |
||
259 | "psubusb %%mm1, %%mm2 \n\t" |
||
260 | "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
||
261 | "psubusb %%mm0, %%mm2 \n\t" // diff <= QP -> 0 |
||
262 | "pcmpeqb %%mm4, %%mm2 \n\t" // diff <= QP -> FF |
||
263 | |||
264 | "pand %%mm2, %%mm7 \n\t" |
||
265 | "pandn %%mm1, %%mm2 \n\t" |
||
266 | "por %%mm2, %%mm7 \n\t" // First Line to Filter |
||
267 | |||
268 | |||
269 | // 1 2 3 4 5 6 7 8 |
||
270 | // %0 %0+%1 %0+2%1 eax %0+4%1 eax+2%1 ecx eax+4%1 |
||
271 | // 6 4 2 2 1 1 |
||
272 | // 6 4 4 2 |
||
273 | // 6 8 2 |
||
274 | |||
275 | "movq (%0, %1), %%mm0 \n\t" // 1 |
||
276 | "movq %%mm0, %%mm1 \n\t" // 1 |
||
277 | PAVGB(%%mm6, %%mm0) //1 1 /2 |
||
278 | PAVGB(%%mm6, %%mm0) //3 1 /4 |
||
279 | |||
280 | "movq (%0, %1, 4), %%mm2 \n\t" // 1 |
||
281 | "movq %%mm2, %%mm5 \n\t" // 1 |
||
282 | PAVGB((%%REGa), %%mm2) // 11 /2 |
||
283 | PAVGB((%0, %1, 2), %%mm2) // 211 /4 |
||
284 | "movq %%mm2, %%mm3 \n\t" // 211 /4 |
||
285 | "movq (%0), %%mm4 \n\t" // 1 |
||
286 | PAVGB(%%mm4, %%mm3) // 4 211 /8 |
||
287 | PAVGB(%%mm0, %%mm3) //642211 /16 |
||
288 | "movq %%mm3, (%0) \n\t" // X |
||
289 | // mm1=2 mm2=3(211) mm4=1 mm5=5 mm6=0 mm7=9 |
||
290 | "movq %%mm1, %%mm0 \n\t" // 1 |
||
291 | PAVGB(%%mm6, %%mm0) //1 1 /2 |
||
292 | "movq %%mm4, %%mm3 \n\t" // 1 |
||
293 | PAVGB((%0,%1,2), %%mm3) // 1 1 /2 |
||
294 | PAVGB((%%REGa,%1,2), %%mm5) // 11 /2 |
||
295 | PAVGB((%%REGa), %%mm5) // 211 /4 |
||
296 | PAVGB(%%mm5, %%mm3) // 2 2211 /8 |
||
297 | PAVGB(%%mm0, %%mm3) //4242211 /16 |
||
298 | "movq %%mm3, (%0,%1) \n\t" // X |
||
299 | // mm1=2 mm2=3(211) mm4=1 mm5=4(211) mm6=0 mm7=9 |
||
300 | PAVGB(%%mm4, %%mm6) //11 /2 |
||
301 | "movq (%%"REG_c"), %%mm0 \n\t" // 1 |
||
302 | PAVGB((%%REGa, %1, 2), %%mm0) // 11/2 |
||
303 | "movq %%mm0, %%mm3 \n\t" // 11/2 |
||
304 | PAVGB(%%mm1, %%mm0) // 2 11/4 |
||
305 | PAVGB(%%mm6, %%mm0) //222 11/8 |
||
306 | PAVGB(%%mm2, %%mm0) //22242211/16 |
||
307 | "movq (%0, %1, 2), %%mm2 \n\t" // 1 |
||
308 | "movq %%mm0, (%0, %1, 2) \n\t" // X |
||
309 | // mm1=2 mm2=3 mm3=6(11) mm4=1 mm5=4(211) mm6=0(11) mm7=9 |
||
310 | "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 |
||
311 | PAVGB((%%REGc), %%mm0) // 11 /2 |
||
312 | PAVGB(%%mm0, %%mm6) //11 11 /4 |
||
313 | PAVGB(%%mm1, %%mm4) // 11 /2 |
||
314 | PAVGB(%%mm2, %%mm1) // 11 /2 |
||
315 | PAVGB(%%mm1, %%mm6) //1122 11 /8 |
||
316 | PAVGB(%%mm5, %%mm6) //112242211 /16 |
||
317 | "movq (%%"REG_a"), %%mm5 \n\t" // 1 |
||
318 | "movq %%mm6, (%%"REG_a") \n\t" // X |
||
319 | // mm0=7(11) mm1=2(11) mm2=3 mm3=6(11) mm4=1(11) mm5=4 mm7=9 |
||
320 | "movq (%%"REG_a", %1, 4), %%mm6 \n\t" // 1 |
||
321 | PAVGB(%%mm7, %%mm6) // 11 /2 |
||
322 | PAVGB(%%mm4, %%mm6) // 11 11 /4 |
||
323 | PAVGB(%%mm3, %%mm6) // 11 2211 /8 |
||
324 | PAVGB(%%mm5, %%mm2) // 11 /2 |
||
325 | "movq (%0, %1, 4), %%mm4 \n\t" // 1 |
||
326 | PAVGB(%%mm4, %%mm2) // 112 /4 |
||
327 | PAVGB(%%mm2, %%mm6) // 112242211 /16 |
||
328 | "movq %%mm6, (%0, %1, 4) \n\t" // X |
||
329 | // mm0=7(11) mm1=2(11) mm2=3(112) mm3=6(11) mm4=5 mm5=4 mm7=9 |
||
330 | PAVGB(%%mm7, %%mm1) // 11 2 /4 |
||
331 | PAVGB(%%mm4, %%mm5) // 11 /2 |
||
332 | PAVGB(%%mm5, %%mm0) // 11 11 /4 |
||
333 | "movq (%%"REG_a", %1, 2), %%mm6 \n\t" // 1 |
||
334 | PAVGB(%%mm6, %%mm1) // 11 4 2 /8 |
||
335 | PAVGB(%%mm0, %%mm1) // 11224222 /16 |
||
336 | "movq %%mm1, (%%"REG_a", %1, 2) \n\t" // X |
||
337 | // mm2=3(112) mm3=6(11) mm4=5 mm5=4(11) mm6=6 mm7=9 |
||
338 | PAVGB((%%REGc), %%mm2) // 112 4 /8 |
||
339 | "movq (%%"REG_a", %1, 4), %%mm0 \n\t" // 1 |
||
340 | PAVGB(%%mm0, %%mm6) // 1 1 /2 |
||
341 | PAVGB(%%mm7, %%mm6) // 1 12 /4 |
||
342 | PAVGB(%%mm2, %%mm6) // 1122424 /4 |
||
343 | "movq %%mm6, (%%"REG_c") \n\t" // X |
||
344 | // mm0=8 mm3=6(11) mm4=5 mm5=4(11) mm7=9 |
||
345 | PAVGB(%%mm7, %%mm5) // 11 2 /4 |
||
346 | PAVGB(%%mm7, %%mm5) // 11 6 /8 |
||
347 | |||
348 | PAVGB(%%mm3, %%mm0) // 112 /4 |
||
349 | PAVGB(%%mm0, %%mm5) // 112246 /16 |
||
350 | "movq %%mm5, (%%"REG_a", %1, 4) \n\t" // X |
||
351 | "sub %1, %0 \n\t" |
||
352 | |||
353 | : |
||
354 | : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) |
||
355 | : "%"REG_a, "%"REG_c |
||
356 | ); |
||
357 | #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
358 | const int l1= stride; |
||
359 | const int l2= stride + l1; |
||
360 | const int l3= stride + l2; |
||
361 | const int l4= stride + l3; |
||
362 | const int l5= stride + l4; |
||
363 | const int l6= stride + l5; |
||
364 | const int l7= stride + l6; |
||
365 | const int l8= stride + l7; |
||
366 | const int l9= stride + l8; |
||
367 | int x; |
||
368 | src+= stride*3; |
||
369 | for(x=0; x |
||
370 | const int first= FFABS(src[0] - src[l1]) < c->QP ? src[0] : src[l1]; |
||
371 | const int last= FFABS(src[l8] - src[l9]) < c->QP ? src[l9] : src[l8]; |
||
372 | |||
373 | int sums[10]; |
||
374 | sums[0] = 4*first + src[l1] + src[l2] + src[l3] + 4; |
||
375 | sums[1] = sums[0] - first + src[l4]; |
||
376 | sums[2] = sums[1] - first + src[l5]; |
||
377 | sums[3] = sums[2] - first + src[l6]; |
||
378 | sums[4] = sums[3] - first + src[l7]; |
||
379 | sums[5] = sums[4] - src[l1] + src[l8]; |
||
380 | sums[6] = sums[5] - src[l2] + last; |
||
381 | sums[7] = sums[6] - src[l3] + last; |
||
382 | sums[8] = sums[7] - src[l4] + last; |
||
383 | sums[9] = sums[8] - src[l5] + last; |
||
384 | |||
385 | src[l1]= (sums[0] + sums[2] + 2*src[l1])>>4; |
||
386 | src[l2]= (sums[1] + sums[3] + 2*src[l2])>>4; |
||
387 | src[l3]= (sums[2] + sums[4] + 2*src[l3])>>4; |
||
388 | src[l4]= (sums[3] + sums[5] + 2*src[l4])>>4; |
||
389 | src[l5]= (sums[4] + sums[6] + 2*src[l5])>>4; |
||
390 | src[l6]= (sums[5] + sums[7] + 2*src[l6])>>4; |
||
391 | src[l7]= (sums[6] + sums[8] + 2*src[l7])>>4; |
||
392 | src[l8]= (sums[7] + sums[9] + 2*src[l8])>>4; |
||
393 | |||
394 | src++; |
||
395 | } |
||
396 | #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
397 | } |
||
398 | #endif //TEMPLATE_PP_ALTIVEC |
||
399 | |||
400 | /** |
||
401 | * Experimental Filter 1 |
||
402 | * will not damage linear gradients |
||
403 | * Flat blocks should look like they were passed through the (1,1,2,2,4,2,2,1,1) 9-Tap filter |
||
404 | * can only smooth blocks at the expected locations (it cannot smooth them if they did move) |
||
405 | * MMX2 version does correct clipping C version does not |
||
406 | */ |
||
407 | static inline void RENAME(vertX1Filter)(uint8_t *src, int stride, PPContext *co) |
||
408 | { |
||
409 | #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
410 | src+= stride*3; |
||
411 | |||
412 | __asm__ volatile( |
||
413 | "pxor %%mm7, %%mm7 \n\t" // 0 |
||
414 | "lea (%0, %1), %%"REG_a" \n\t" |
||
415 | "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" |
||
416 | // 0 1 2 3 4 5 6 7 8 9 |
||
417 | // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
||
418 | "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 |
||
419 | "movq (%0, %1, 4), %%mm1 \n\t" // line 4 |
||
420 | "movq %%mm1, %%mm2 \n\t" // line 4 |
||
421 | "psubusb %%mm0, %%mm1 \n\t" |
||
422 | "psubusb %%mm2, %%mm0 \n\t" |
||
423 | "por %%mm1, %%mm0 \n\t" // |l2 - l3| |
||
424 | "movq (%%"REG_c"), %%mm3 \n\t" // line 5 |
||
425 | "movq (%%"REG_c", %1), %%mm4 \n\t" // line 6 |
||
426 | "movq %%mm3, %%mm5 \n\t" // line 5 |
||
427 | "psubusb %%mm4, %%mm3 \n\t" |
||
428 | "psubusb %%mm5, %%mm4 \n\t" |
||
429 | "por %%mm4, %%mm3 \n\t" // |l5 - l6| |
||
430 | PAVGB(%%mm3, %%mm0) // (|l2 - l3| + |l5 - l6|)/2 |
||
431 | "movq %%mm2, %%mm1 \n\t" // line 4 |
||
432 | "psubusb %%mm5, %%mm2 \n\t" |
||
433 | "movq %%mm2, %%mm4 \n\t" |
||
434 | "pcmpeqb %%mm7, %%mm2 \n\t" // (l4 - l5) <= 0 ? -1 : 0 |
||
435 | "psubusb %%mm1, %%mm5 \n\t" |
||
436 | "por %%mm5, %%mm4 \n\t" // |l4 - l5| |
||
437 | "psubusb %%mm0, %%mm4 \n\t" //d = MAX(0, |l4-l5| - (|l2-l3| + |l5-l6|)/2) |
||
438 | "movq %%mm4, %%mm3 \n\t" // d |
||
439 | "movq %2, %%mm0 \n\t" |
||
440 | "paddusb %%mm0, %%mm0 \n\t" |
||
441 | "psubusb %%mm0, %%mm4 \n\t" |
||
442 | "pcmpeqb %%mm7, %%mm4 \n\t" // d <= QP ? -1 : 0 |
||
443 | "psubusb "MANGLE(b01)", %%mm3 \n\t" |
||
444 | "pand %%mm4, %%mm3 \n\t" // d <= QP ? d : 0 |
||
445 | |||
446 | PAVGB(%%mm7, %%mm3) // d/2 |
||
447 | "movq %%mm3, %%mm1 \n\t" // d/2 |
||
448 | PAVGB(%%mm7, %%mm3) // d/4 |
||
449 | PAVGB(%%mm1, %%mm3) // 3*d/8 |
||
450 | |||
451 | "movq (%0, %1, 4), %%mm0 \n\t" // line 4 |
||
452 | "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
||
453 | "psubusb %%mm3, %%mm0 \n\t" |
||
454 | "pxor %%mm2, %%mm0 \n\t" |
||
455 | "movq %%mm0, (%0, %1, 4) \n\t" // line 4 |
||
456 | |||
457 | "movq (%%"REG_c"), %%mm0 \n\t" // line 5 |
||
458 | "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
||
459 | "paddusb %%mm3, %%mm0 \n\t" |
||
460 | "pxor %%mm2, %%mm0 \n\t" |
||
461 | "movq %%mm0, (%%"REG_c") \n\t" // line 5 |
||
462 | |||
463 | PAVGB(%%mm7, %%mm1) // d/4 |
||
464 | |||
465 | "movq (%%"REG_a", %1, 2), %%mm0 \n\t" // line 3 |
||
466 | "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l4-1 : l4 |
||
467 | "psubusb %%mm1, %%mm0 \n\t" |
||
468 | "pxor %%mm2, %%mm0 \n\t" |
||
469 | "movq %%mm0, (%%"REG_a", %1, 2) \n\t" // line 3 |
||
470 | |||
471 | "movq (%%"REG_c", %1), %%mm0 \n\t" // line 6 |
||
472 | "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l5-1 : l5 |
||
473 | "paddusb %%mm1, %%mm0 \n\t" |
||
474 | "pxor %%mm2, %%mm0 \n\t" |
||
475 | "movq %%mm0, (%%"REG_c", %1) \n\t" // line 6 |
||
476 | |||
477 | PAVGB(%%mm7, %%mm1) // d/8 |
||
478 | |||
479 | "movq (%%"REG_a", %1), %%mm0 \n\t" // line 2 |
||
480 | "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l2-1 : l2 |
||
481 | "psubusb %%mm1, %%mm0 \n\t" |
||
482 | "pxor %%mm2, %%mm0 \n\t" |
||
483 | "movq %%mm0, (%%"REG_a", %1) \n\t" // line 2 |
||
484 | |||
485 | "movq (%%"REG_c", %1, 2), %%mm0 \n\t" // line 7 |
||
486 | "pxor %%mm2, %%mm0 \n\t" //(l4 - l5) <= 0 ? -l7-1 : l7 |
||
487 | "paddusb %%mm1, %%mm0 \n\t" |
||
488 | "pxor %%mm2, %%mm0 \n\t" |
||
489 | "movq %%mm0, (%%"REG_c", %1, 2) \n\t" // line 7 |
||
490 | |||
491 | : |
||
492 | : "r" (src), "r" ((x86_reg)stride), "m" (co->pQPb) |
||
493 | : "%"REG_a, "%"REG_c |
||
494 | ); |
||
495 | #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
496 | |||
497 | const int l1= stride; |
||
498 | const int l2= stride + l1; |
||
499 | const int l3= stride + l2; |
||
500 | const int l4= stride + l3; |
||
501 | const int l5= stride + l4; |
||
502 | const int l6= stride + l5; |
||
503 | const int l7= stride + l6; |
||
504 | // const int l8= stride + l7; |
||
505 | // const int l9= stride + l8; |
||
506 | int x; |
||
507 | |||
508 | src+= stride*3; |
||
509 | for(x=0; x |
||
510 | int a= src[l3] - src[l4]; |
||
511 | int b= src[l4] - src[l5]; |
||
512 | int c= src[l5] - src[l6]; |
||
513 | |||
514 | int d= FFABS(b) - ((FFABS(a) + FFABS(c))>>1); |
||
515 | d= FFMAX(d, 0); |
||
516 | |||
517 | if(d < co->QP*2){ |
||
518 | int v = d * FFSIGN(-b); |
||
519 | |||
520 | src[l2] +=v>>3; |
||
521 | src[l3] +=v>>2; |
||
522 | src[l4] +=(3*v)>>3; |
||
523 | src[l5] -=(3*v)>>3; |
||
524 | src[l6] -=v>>2; |
||
525 | src[l7] -=v>>3; |
||
526 | } |
||
527 | src++; |
||
528 | } |
||
529 | #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
530 | } |
||
531 | |||
532 | #if !TEMPLATE_PP_ALTIVEC |
||
533 | static inline void RENAME(doVertDefFilter)(uint8_t src[], int stride, PPContext *c) |
||
534 | { |
||
535 | #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
536 | /* |
||
537 | uint8_t tmp[16]; |
||
538 | const int l1= stride; |
||
539 | const int l2= stride + l1; |
||
540 | const int l3= stride + l2; |
||
541 | const int l4= (int)tmp - (int)src - stride*3; |
||
542 | const int l5= (int)tmp - (int)src - stride*3 + 8; |
||
543 | const int l6= stride*3 + l3; |
||
544 | const int l7= stride + l6; |
||
545 | const int l8= stride + l7; |
||
546 | |||
547 | memcpy(tmp, src+stride*7, 8); |
||
548 | memcpy(tmp+8, src+stride*8, 8); |
||
549 | */ |
||
550 | src+= stride*4; |
||
551 | __asm__ volatile( |
||
552 | |||
553 | #if 0 //slightly more accurate and slightly slower |
||
554 | "pxor %%mm7, %%mm7 \n\t" // 0 |
||
555 | "lea (%0, %1), %%"REG_a" \n\t" |
||
556 | "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" |
||
557 | // 0 1 2 3 4 5 6 7 |
||
558 | // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 |
||
559 | // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 |
||
560 | |||
561 | |||
562 | "movq (%0, %1, 2), %%mm0 \n\t" // l2 |
||
563 | "movq (%0), %%mm1 \n\t" // l0 |
||
564 | "movq %%mm0, %%mm2 \n\t" // l2 |
||
565 | PAVGB(%%mm7, %%mm0) // ~l2/2 |
||
566 | PAVGB(%%mm1, %%mm0) // ~(l2 + 2l0)/4 |
||
567 | PAVGB(%%mm2, %%mm0) // ~(5l2 + 2l0)/8 |
||
568 | |||
569 | "movq (%%"REG_a"), %%mm1 \n\t" // l1 |
||
570 | "movq (%%"REG_a", %1, 2), %%mm3 \n\t" // l3 |
||
571 | "movq %%mm1, %%mm4 \n\t" // l1 |
||
572 | PAVGB(%%mm7, %%mm1) // ~l1/2 |
||
573 | PAVGB(%%mm3, %%mm1) // ~(l1 + 2l3)/4 |
||
574 | PAVGB(%%mm4, %%mm1) // ~(5l1 + 2l3)/8 |
||
575 | |||
576 | "movq %%mm0, %%mm4 \n\t" // ~(5l2 + 2l0)/8 |
||
577 | "psubusb %%mm1, %%mm0 \n\t" |
||
578 | "psubusb %%mm4, %%mm1 \n\t" |
||
579 | "por %%mm0, %%mm1 \n\t" // ~|2l0 - 5l1 + 5l2 - 2l3|/8 |
||
580 | // mm1= |lenergy|, mm2= l2, mm3= l3, mm7=0 |
||
581 | |||
582 | "movq (%0, %1, 4), %%mm0 \n\t" // l4 |
||
583 | "movq %%mm0, %%mm4 \n\t" // l4 |
||
584 | PAVGB(%%mm7, %%mm0) // ~l4/2 |
||
585 | PAVGB(%%mm2, %%mm0) // ~(l4 + 2l2)/4 |
||
586 | PAVGB(%%mm4, %%mm0) // ~(5l4 + 2l2)/8 |
||
587 | |||
588 | "movq (%%"REG_c"), %%mm2 \n\t" // l5 |
||
589 | "movq %%mm3, %%mm5 \n\t" // l3 |
||
590 | PAVGB(%%mm7, %%mm3) // ~l3/2 |
||
591 | PAVGB(%%mm2, %%mm3) // ~(l3 + 2l5)/4 |
||
592 | PAVGB(%%mm5, %%mm3) // ~(5l3 + 2l5)/8 |
||
593 | |||
594 | "movq %%mm0, %%mm6 \n\t" // ~(5l4 + 2l2)/8 |
||
595 | "psubusb %%mm3, %%mm0 \n\t" |
||
596 | "psubusb %%mm6, %%mm3 \n\t" |
||
597 | "por %%mm0, %%mm3 \n\t" // ~|2l2 - 5l3 + 5l4 - 2l5|/8 |
||
598 | "pcmpeqb %%mm7, %%mm0 \n\t" // SIGN(2l2 - 5l3 + 5l4 - 2l5) |
||
599 | // mm0= SIGN(menergy), mm1= |lenergy|, mm2= l5, mm3= |menergy|, mm4=l4, mm5= l3, mm7=0 |
||
600 | |||
601 | "movq (%%"REG_c", %1), %%mm6 \n\t" // l6 |
||
602 | "movq %%mm6, %%mm5 \n\t" // l6 |
||
603 | PAVGB(%%mm7, %%mm6) // ~l6/2 |
||
604 | PAVGB(%%mm4, %%mm6) // ~(l6 + 2l4)/4 |
||
605 | PAVGB(%%mm5, %%mm6) // ~(5l6 + 2l4)/8 |
||
606 | |||
607 | "movq (%%"REG_c", %1, 2), %%mm5 \n\t" // l7 |
||
608 | "movq %%mm2, %%mm4 \n\t" // l5 |
||
609 | PAVGB(%%mm7, %%mm2) // ~l5/2 |
||
610 | PAVGB(%%mm5, %%mm2) // ~(l5 + 2l7)/4 |
||
611 | PAVGB(%%mm4, %%mm2) // ~(5l5 + 2l7)/8 |
||
612 | |||
613 | "movq %%mm6, %%mm4 \n\t" // ~(5l6 + 2l4)/8 |
||
614 | "psubusb %%mm2, %%mm6 \n\t" |
||
615 | "psubusb %%mm4, %%mm2 \n\t" |
||
616 | "por %%mm6, %%mm2 \n\t" // ~|2l4 - 5l5 + 5l6 - 2l7|/8 |
||
617 | // mm0= SIGN(menergy), mm1= |lenergy|/8, mm2= |renergy|/8, mm3= |menergy|/8, mm7=0 |
||
618 | |||
619 | |||
620 | PMINUB(%%mm2, %%mm1, %%mm4) // MIN(|lenergy|,|renergy|)/8 |
||
621 | "movq %2, %%mm4 \n\t" // QP //FIXME QP+1 ? |
||
622 | "paddusb "MANGLE(b01)", %%mm4 \n\t" |
||
623 | "pcmpgtb %%mm3, %%mm4 \n\t" // |menergy|/8 < QP |
||
624 | "psubusb %%mm1, %%mm3 \n\t" // d=|menergy|/8-MIN(|lenergy|,|renergy|)/8 |
||
625 | "pand %%mm4, %%mm3 \n\t" |
||
626 | |||
627 | "movq %%mm3, %%mm1 \n\t" |
||
628 | // "psubusb "MANGLE(b01)", %%mm3 \n\t" |
||
629 | PAVGB(%%mm7, %%mm3) |
||
630 | PAVGB(%%mm7, %%mm3) |
||
631 | "paddusb %%mm1, %%mm3 \n\t" |
||
632 | // "paddusb "MANGLE(b01)", %%mm3 \n\t" |
||
633 | |||
634 | "movq (%%"REG_a", %1, 2), %%mm6 \n\t" //l3 |
||
635 | "movq (%0, %1, 4), %%mm5 \n\t" //l4 |
||
636 | "movq (%0, %1, 4), %%mm4 \n\t" //l4 |
||
637 | "psubusb %%mm6, %%mm5 \n\t" |
||
638 | "psubusb %%mm4, %%mm6 \n\t" |
||
639 | "por %%mm6, %%mm5 \n\t" // |l3-l4| |
||
640 | "pcmpeqb %%mm7, %%mm6 \n\t" // SIGN(l3-l4) |
||
641 | "pxor %%mm6, %%mm0 \n\t" |
||
642 | "pand %%mm0, %%mm3 \n\t" |
||
643 | PMINUB(%%mm5, %%mm3, %%mm0) |
||
644 | |||
645 | "psubusb "MANGLE(b01)", %%mm3 \n\t" |
||
646 | PAVGB(%%mm7, %%mm3) |
||
647 | |||
648 | "movq (%%"REG_a", %1, 2), %%mm0 \n\t" |
||
649 | "movq (%0, %1, 4), %%mm2 \n\t" |
||
650 | "pxor %%mm6, %%mm0 \n\t" |
||
651 | "pxor %%mm6, %%mm2 \n\t" |
||
652 | "psubb %%mm3, %%mm0 \n\t" |
||
653 | "paddb %%mm3, %%mm2 \n\t" |
||
654 | "pxor %%mm6, %%mm0 \n\t" |
||
655 | "pxor %%mm6, %%mm2 \n\t" |
||
656 | "movq %%mm0, (%%"REG_a", %1, 2) \n\t" |
||
657 | "movq %%mm2, (%0, %1, 4) \n\t" |
||
658 | #endif //0 |
||
659 | |||
660 | "lea (%0, %1), %%"REG_a" \n\t" |
||
661 | "pcmpeqb %%mm6, %%mm6 \n\t" // -1 |
||
662 | // 0 1 2 3 4 5 6 7 |
||
663 | // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 ecx+%1 ecx+2%1 |
||
664 | // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 |
||
665 | |||
666 | |||
667 | "movq (%%"REG_a", %1, 2), %%mm1 \n\t" // l3 |
||
668 | "movq (%0, %1, 4), %%mm0 \n\t" // l4 |
||
669 | "pxor %%mm6, %%mm1 \n\t" // -l3-1 |
||
670 | PAVGB(%%mm1, %%mm0) // -q+128 = (l4-l3+256)/2 |
||
671 | // mm1=-l3-1, mm0=128-q |
||
672 | |||
673 | "movq (%%"REG_a", %1, 4), %%mm2 \n\t" // l5 |
||
674 | "movq (%%"REG_a", %1), %%mm3 \n\t" // l2 |
||
675 | "pxor %%mm6, %%mm2 \n\t" // -l5-1 |
||
676 | "movq %%mm2, %%mm5 \n\t" // -l5-1 |
||
677 | "movq "MANGLE(b80)", %%mm4 \n\t" // 128 |
||
678 | "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" |
||
679 | PAVGB(%%mm3, %%mm2) // (l2-l5+256)/2 |
||
680 | PAVGB(%%mm0, %%mm4) // ~(l4-l3)/4 + 128 |
||
681 | PAVGB(%%mm2, %%mm4) // ~(l2-l5)/4 +(l4-l3)/8 + 128 |
||
682 | PAVGB(%%mm0, %%mm4) // ~(l2-l5)/8 +5(l4-l3)/16 + 128 |
||
683 | // mm1=-l3-1, mm0=128-q, mm3=l2, mm4=menergy/16 + 128, mm5= -l5-1 |
||
684 | |||
685 | "movq (%%"REG_a"), %%mm2 \n\t" // l1 |
||
686 | "pxor %%mm6, %%mm2 \n\t" // -l1-1 |
||
687 | PAVGB(%%mm3, %%mm2) // (l2-l1+256)/2 |
||
688 | PAVGB((%0), %%mm1) // (l0-l3+256)/2 |
||
689 | "movq "MANGLE(b80)", %%mm3 \n\t" // 128 |
||
690 | PAVGB(%%mm2, %%mm3) // ~(l2-l1)/4 + 128 |
||
691 | PAVGB(%%mm1, %%mm3) // ~(l0-l3)/4 +(l2-l1)/8 + 128 |
||
692 | PAVGB(%%mm2, %%mm3) // ~(l0-l3)/8 +5(l2-l1)/16 + 128 |
||
693 | // mm0=128-q, mm3=lenergy/16 + 128, mm4= menergy/16 + 128, mm5= -l5-1 |
||
694 | |||
695 | PAVGB((%%REGc, %1), %%mm5) // (l6-l5+256)/2 |
||
696 | "movq (%%"REG_c", %1, 2), %%mm1 \n\t" // l7 |
||
697 | "pxor %%mm6, %%mm1 \n\t" // -l7-1 |
||
698 | PAVGB((%0, %1, 4), %%mm1) // (l4-l7+256)/2 |
||
699 | "movq "MANGLE(b80)", %%mm2 \n\t" // 128 |
||
700 | PAVGB(%%mm5, %%mm2) // ~(l6-l5)/4 + 128 |
||
701 | PAVGB(%%mm1, %%mm2) // ~(l4-l7)/4 +(l6-l5)/8 + 128 |
||
702 | PAVGB(%%mm5, %%mm2) // ~(l4-l7)/8 +5(l6-l5)/16 + 128 |
||
703 | // mm0=128-q, mm2=renergy/16 + 128, mm3=lenergy/16 + 128, mm4= menergy/16 + 128 |
||
704 | |||
705 | "movq "MANGLE(b00)", %%mm1 \n\t" // 0 |
||
706 | "movq "MANGLE(b00)", %%mm5 \n\t" // 0 |
||
707 | "psubb %%mm2, %%mm1 \n\t" // 128 - renergy/16 |
||
708 | "psubb %%mm3, %%mm5 \n\t" // 128 - lenergy/16 |
||
709 | PMAXUB(%%mm1, %%mm2) // 128 + |renergy/16| |
||
710 | PMAXUB(%%mm5, %%mm3) // 128 + |lenergy/16| |
||
711 | PMINUB(%%mm2, %%mm3, %%mm1) // 128 + MIN(|lenergy|,|renergy|)/16 |
||
712 | |||
713 | // mm0=128-q, mm3=128 + MIN(|lenergy|,|renergy|)/16, mm4= menergy/16 + 128 |
||
714 | |||
715 | "movq "MANGLE(b00)", %%mm7 \n\t" // 0 |
||
716 | "movq %2, %%mm2 \n\t" // QP |
||
717 | PAVGB(%%mm6, %%mm2) // 128 + QP/2 |
||
718 | "psubb %%mm6, %%mm2 \n\t" |
||
719 | |||
720 | "movq %%mm4, %%mm1 \n\t" |
||
721 | "pcmpgtb %%mm7, %%mm1 \n\t" // SIGN(menergy) |
||
722 | "pxor %%mm1, %%mm4 \n\t" |
||
723 | "psubb %%mm1, %%mm4 \n\t" // 128 + |menergy|/16 |
||
724 | "pcmpgtb %%mm4, %%mm2 \n\t" // |menergy|/16 < QP/2 |
||
725 | "psubusb %%mm3, %%mm4 \n\t" //d=|menergy|/16 - MIN(|lenergy|,|renergy|)/16 |
||
726 | // mm0=128-q, mm1= SIGN(menergy), mm2= |menergy|/16 < QP/2, mm4= d/16 |
||
727 | |||
728 | "movq %%mm4, %%mm3 \n\t" // d |
||
729 | "psubusb "MANGLE(b01)", %%mm4 \n\t" |
||
730 | PAVGB(%%mm7, %%mm4) // d/32 |
||
731 | PAVGB(%%mm7, %%mm4) // (d + 32)/64 |
||
732 | "paddb %%mm3, %%mm4 \n\t" // 5d/64 |
||
733 | "pand %%mm2, %%mm4 \n\t" |
||
734 | |||
735 | "movq "MANGLE(b80)", %%mm5 \n\t" // 128 |
||
736 | "psubb %%mm0, %%mm5 \n\t" // q |
||
737 | "paddsb %%mm6, %%mm5 \n\t" // fix bad rounding |
||
738 | "pcmpgtb %%mm5, %%mm7 \n\t" // SIGN(q) |
||
739 | "pxor %%mm7, %%mm5 \n\t" |
||
740 | |||
741 | PMINUB(%%mm5, %%mm4, %%mm3) // MIN(|q|, 5d/64) |
||
742 | "pxor %%mm1, %%mm7 \n\t" // SIGN(d*q) |
||
743 | |||
744 | "pand %%mm7, %%mm4 \n\t" |
||
745 | "movq (%%"REG_a", %1, 2), %%mm0 \n\t" |
||
746 | "movq (%0, %1, 4), %%mm2 \n\t" |
||
747 | "pxor %%mm1, %%mm0 \n\t" |
||
748 | "pxor %%mm1, %%mm2 \n\t" |
||
749 | "paddb %%mm4, %%mm0 \n\t" |
||
750 | "psubb %%mm4, %%mm2 \n\t" |
||
751 | "pxor %%mm1, %%mm0 \n\t" |
||
752 | "pxor %%mm1, %%mm2 \n\t" |
||
753 | "movq %%mm0, (%%"REG_a", %1, 2) \n\t" |
||
754 | "movq %%mm2, (%0, %1, 4) \n\t" |
||
755 | |||
756 | : |
||
757 | : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb) |
||
758 | : "%"REG_a, "%"REG_c |
||
759 | ); |
||
760 | |||
761 | /* |
||
762 | { |
||
763 | int x; |
||
764 | src-= stride; |
||
765 | for(x=0; x |
||
766 | const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); |
||
767 | if(FFABS(middleEnergy)< 8*QP){ |
||
768 | const int q=(src[l4] - src[l5])/2; |
||
769 | const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); |
||
770 | const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); |
||
771 | |||
772 | int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); |
||
773 | d= FFMAX(d, 0); |
||
774 | |||
775 | d= (5*d + 32) >> 6; |
||
776 | d*= FFSIGN(-middleEnergy); |
||
777 | |||
778 | if(q>0){ |
||
779 | d= d<0 ? 0 : d; |
||
780 | d= d>q ? q : d; |
||
781 | }else{ |
||
782 | d= d>0 ? 0 : d; |
||
783 | d= d |
||
784 | } |
||
785 | |||
786 | src[l4]-= d; |
||
787 | src[l5]+= d; |
||
788 | } |
||
789 | src++; |
||
790 | } |
||
791 | src-=8; |
||
792 | for(x=0; x<8; x++){ |
||
793 | int y; |
||
794 | for(y=4; y<6; y++){ |
||
795 | int d= src[x+y*stride] - tmp[x+(y-4)*8]; |
||
796 | int ad= FFABS(d); |
||
797 | static int max=0; |
||
798 | static int sum=0; |
||
799 | static int num=0; |
||
800 | static int bias=0; |
||
801 | |||
802 | if(max |
||
803 | sum+= ad>3 ? 1 : 0; |
||
804 | if(ad>3){ |
||
805 | src[0] = src[7] = src[stride*7] = src[(stride+1)*7]=255; |
||
806 | } |
||
807 | if(y==4) bias+=d; |
||
808 | num++; |
||
809 | if(num%1000000 == 0){ |
||
810 | av_log(c, AV_LOG_INFO, " %d %d %d %d\n", num, sum, max, bias); |
||
811 | } |
||
812 | } |
||
813 | } |
||
814 | } |
||
815 | */ |
||
816 | #elif TEMPLATE_PP_MMX |
||
817 | DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars |
||
818 | src+= stride*4; |
||
819 | __asm__ volatile( |
||
820 | "pxor %%mm7, %%mm7 \n\t" |
||
821 | // 0 1 2 3 4 5 6 7 |
||
822 | // %0 %0+%1 %0+2%1 eax+2%1 %0+4%1 eax+4%1 edx+%1 edx+2%1 |
||
823 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 |
||
824 | |||
825 | "movq (%0), %%mm0 \n\t" |
||
826 | "movq %%mm0, %%mm1 \n\t" |
||
827 | "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 |
||
828 | "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 |
||
829 | |||
830 | "movq (%0, %1), %%mm2 \n\t" |
||
831 | "lea (%0, %1, 2), %%"REG_a" \n\t" |
||
832 | "movq %%mm2, %%mm3 \n\t" |
||
833 | "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 |
||
834 | "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 |
||
835 | |||
836 | "movq (%%"REG_a"), %%mm4 \n\t" |
||
837 | "movq %%mm4, %%mm5 \n\t" |
||
838 | "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 |
||
839 | "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 |
||
840 | |||
841 | "paddw %%mm0, %%mm0 \n\t" // 2L0 |
||
842 | "paddw %%mm1, %%mm1 \n\t" // 2H0 |
||
843 | "psubw %%mm4, %%mm2 \n\t" // L1 - L2 |
||
844 | "psubw %%mm5, %%mm3 \n\t" // H1 - H2 |
||
845 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 |
||
846 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 |
||
847 | |||
848 | "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 |
||
849 | "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 |
||
850 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 |
||
851 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 |
||
852 | |||
853 | "movq (%%"REG_a", %1), %%mm2 \n\t" |
||
854 | "movq %%mm2, %%mm3 \n\t" |
||
855 | "punpcklbw %%mm7, %%mm2 \n\t" // L3 |
||
856 | "punpckhbw %%mm7, %%mm3 \n\t" // H3 |
||
857 | |||
858 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 |
||
859 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 |
||
860 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
||
861 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
||
862 | "movq %%mm0, (%3) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
||
863 | "movq %%mm1, 8(%3) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
||
864 | |||
865 | "movq (%%"REG_a", %1, 2), %%mm0 \n\t" |
||
866 | "movq %%mm0, %%mm1 \n\t" |
||
867 | "punpcklbw %%mm7, %%mm0 \n\t" // L4 |
||
868 | "punpckhbw %%mm7, %%mm1 \n\t" // H4 |
||
869 | |||
870 | "psubw %%mm0, %%mm2 \n\t" // L3 - L4 |
||
871 | "psubw %%mm1, %%mm3 \n\t" // H3 - H4 |
||
872 | "movq %%mm2, 16(%3) \n\t" // L3 - L4 |
||
873 | "movq %%mm3, 24(%3) \n\t" // H3 - H4 |
||
874 | "paddw %%mm4, %%mm4 \n\t" // 2L2 |
||
875 | "paddw %%mm5, %%mm5 \n\t" // 2H2 |
||
876 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 |
||
877 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 |
||
878 | |||
879 | "lea (%%"REG_a", %1), %0 \n\t" |
||
880 | "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 |
||
881 | "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 |
||
882 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 |
||
883 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 |
||
884 | //50 opcodes so far |
||
885 | "movq (%0, %1, 2), %%mm2 \n\t" |
||
886 | "movq %%mm2, %%mm3 \n\t" |
||
887 | "punpcklbw %%mm7, %%mm2 \n\t" // L5 |
||
888 | "punpckhbw %%mm7, %%mm3 \n\t" // H5 |
||
889 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 |
||
890 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 |
||
891 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 |
||
892 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 |
||
893 | |||
894 | "movq (%%"REG_a", %1, 4), %%mm6 \n\t" |
||
895 | "punpcklbw %%mm7, %%mm6 \n\t" // L6 |
||
896 | "psubw %%mm6, %%mm2 \n\t" // L5 - L6 |
||
897 | "movq (%%"REG_a", %1, 4), %%mm6 \n\t" |
||
898 | "punpckhbw %%mm7, %%mm6 \n\t" // H6 |
||
899 | "psubw %%mm6, %%mm3 \n\t" // H5 - H6 |
||
900 | |||
901 | "paddw %%mm0, %%mm0 \n\t" // 2L4 |
||
902 | "paddw %%mm1, %%mm1 \n\t" // 2H4 |
||
903 | "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 |
||
904 | "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 |
||
905 | |||
906 | "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 |
||
907 | "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 |
||
908 | "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 |
||
909 | "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 |
||
910 | |||
911 | "movq (%0, %1, 4), %%mm2 \n\t" |
||
912 | "movq %%mm2, %%mm3 \n\t" |
||
913 | "punpcklbw %%mm7, %%mm2 \n\t" // L7 |
||
914 | "punpckhbw %%mm7, %%mm3 \n\t" // H7 |
||
915 | |||
916 | "paddw %%mm2, %%mm2 \n\t" // 2L7 |
||
917 | "paddw %%mm3, %%mm3 \n\t" // 2H7 |
||
918 | "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 |
||
919 | "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 |
||
920 | |||
921 | "movq (%3), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
||
922 | "movq 8(%3), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
||
923 | |||
924 | #if TEMPLATE_PP_MMXEXT |
||
925 | "movq %%mm7, %%mm6 \n\t" // 0 |
||
926 | "psubw %%mm0, %%mm6 \n\t" |
||
927 | "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
||
928 | "movq %%mm7, %%mm6 \n\t" // 0 |
||
929 | "psubw %%mm1, %%mm6 \n\t" |
||
930 | "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| |
||
931 | "movq %%mm7, %%mm6 \n\t" // 0 |
||
932 | "psubw %%mm2, %%mm6 \n\t" |
||
933 | "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| |
||
934 | "movq %%mm7, %%mm6 \n\t" // 0 |
||
935 | "psubw %%mm3, %%mm6 \n\t" |
||
936 | "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
||
937 | #else |
||
938 | "movq %%mm7, %%mm6 \n\t" // 0 |
||
939 | "pcmpgtw %%mm0, %%mm6 \n\t" |
||
940 | "pxor %%mm6, %%mm0 \n\t" |
||
941 | "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
||
942 | "movq %%mm7, %%mm6 \n\t" // 0 |
||
943 | "pcmpgtw %%mm1, %%mm6 \n\t" |
||
944 | "pxor %%mm6, %%mm1 \n\t" |
||
945 | "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| |
||
946 | "movq %%mm7, %%mm6 \n\t" // 0 |
||
947 | "pcmpgtw %%mm2, %%mm6 \n\t" |
||
948 | "pxor %%mm6, %%mm2 \n\t" |
||
949 | "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| |
||
950 | "movq %%mm7, %%mm6 \n\t" // 0 |
||
951 | "pcmpgtw %%mm3, %%mm6 \n\t" |
||
952 | "pxor %%mm6, %%mm3 \n\t" |
||
953 | "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
||
954 | #endif |
||
955 | |||
956 | #if TEMPLATE_PP_MMXEXT |
||
957 | "pminsw %%mm2, %%mm0 \n\t" |
||
958 | "pminsw %%mm3, %%mm1 \n\t" |
||
959 | #else |
||
960 | "movq %%mm0, %%mm6 \n\t" |
||
961 | "psubusw %%mm2, %%mm6 \n\t" |
||
962 | "psubw %%mm6, %%mm0 \n\t" |
||
963 | "movq %%mm1, %%mm6 \n\t" |
||
964 | "psubusw %%mm3, %%mm6 \n\t" |
||
965 | "psubw %%mm6, %%mm1 \n\t" |
||
966 | #endif |
||
967 | |||
968 | "movd %2, %%mm2 \n\t" // QP |
||
969 | "punpcklbw %%mm7, %%mm2 \n\t" |
||
970 | |||
971 | "movq %%mm7, %%mm6 \n\t" // 0 |
||
972 | "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) |
||
973 | "pxor %%mm6, %%mm4 \n\t" |
||
974 | "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| |
||
975 | "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) |
||
976 | "pxor %%mm7, %%mm5 \n\t" |
||
977 | "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| |
||
978 | // 100 opcodes |
||
979 | "psllw $3, %%mm2 \n\t" // 8QP |
||
980 | "movq %%mm2, %%mm3 \n\t" // 8QP |
||
981 | "pcmpgtw %%mm4, %%mm2 \n\t" |
||
982 | "pcmpgtw %%mm5, %%mm3 \n\t" |
||
983 | "pand %%mm2, %%mm4 \n\t" |
||
984 | "pand %%mm3, %%mm5 \n\t" |
||
985 | |||
986 | |||
987 | "psubusw %%mm0, %%mm4 \n\t" // hd |
||
988 | "psubusw %%mm1, %%mm5 \n\t" // ld |
||
989 | |||
990 | |||
991 | "movq "MANGLE(w05)", %%mm2 \n\t" // 5 |
||
992 | "pmullw %%mm2, %%mm4 \n\t" |
||
993 | "pmullw %%mm2, %%mm5 \n\t" |
||
994 | "movq "MANGLE(w20)", %%mm2 \n\t" // 32 |
||
995 | "paddw %%mm2, %%mm4 \n\t" |
||
996 | "paddw %%mm2, %%mm5 \n\t" |
||
997 | "psrlw $6, %%mm4 \n\t" |
||
998 | "psrlw $6, %%mm5 \n\t" |
||
999 | |||
1000 | "movq 16(%3), %%mm0 \n\t" // L3 - L4 |
||
1001 | "movq 24(%3), %%mm1 \n\t" // H3 - H4 |
||
1002 | |||
1003 | "pxor %%mm2, %%mm2 \n\t" |
||
1004 | "pxor %%mm3, %%mm3 \n\t" |
||
1005 | |||
1006 | "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) |
||
1007 | "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) |
||
1008 | "pxor %%mm2, %%mm0 \n\t" |
||
1009 | "pxor %%mm3, %%mm1 \n\t" |
||
1010 | "psubw %%mm2, %%mm0 \n\t" // |L3-L4| |
||
1011 | "psubw %%mm3, %%mm1 \n\t" // |H3-H4| |
||
1012 | "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 |
||
1013 | "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 |
||
1014 | |||
1015 | "pxor %%mm6, %%mm2 \n\t" |
||
1016 | "pxor %%mm7, %%mm3 \n\t" |
||
1017 | "pand %%mm2, %%mm4 \n\t" |
||
1018 | "pand %%mm3, %%mm5 \n\t" |
||
1019 | |||
1020 | #if TEMPLATE_PP_MMXEXT |
||
1021 | "pminsw %%mm0, %%mm4 \n\t" |
||
1022 | "pminsw %%mm1, %%mm5 \n\t" |
||
1023 | #else |
||
1024 | "movq %%mm4, %%mm2 \n\t" |
||
1025 | "psubusw %%mm0, %%mm2 \n\t" |
||
1026 | "psubw %%mm2, %%mm4 \n\t" |
||
1027 | "movq %%mm5, %%mm2 \n\t" |
||
1028 | "psubusw %%mm1, %%mm2 \n\t" |
||
1029 | "psubw %%mm2, %%mm5 \n\t" |
||
1030 | #endif |
||
1031 | "pxor %%mm6, %%mm4 \n\t" |
||
1032 | "pxor %%mm7, %%mm5 \n\t" |
||
1033 | "psubw %%mm6, %%mm4 \n\t" |
||
1034 | "psubw %%mm7, %%mm5 \n\t" |
||
1035 | "packsswb %%mm5, %%mm4 \n\t" |
||
1036 | "movq (%0), %%mm0 \n\t" |
||
1037 | "paddb %%mm4, %%mm0 \n\t" |
||
1038 | "movq %%mm0, (%0) \n\t" |
||
1039 | "movq (%0, %1), %%mm0 \n\t" |
||
1040 | "psubb %%mm4, %%mm0 \n\t" |
||
1041 | "movq %%mm0, (%0, %1) \n\t" |
||
1042 | |||
1043 | : "+r" (src) |
||
1044 | : "r" ((x86_reg)stride), "m" (c->pQPb), "r"(tmp) |
||
1045 | : "%"REG_a |
||
1046 | ); |
||
1047 | #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
1048 | const int l1= stride; |
||
1049 | const int l2= stride + l1; |
||
1050 | const int l3= stride + l2; |
||
1051 | const int l4= stride + l3; |
||
1052 | const int l5= stride + l4; |
||
1053 | const int l6= stride + l5; |
||
1054 | const int l7= stride + l6; |
||
1055 | const int l8= stride + l7; |
||
1056 | // const int l9= stride + l8; |
||
1057 | int x; |
||
1058 | src+= stride*3; |
||
1059 | for(x=0; x |
||
1060 | const int middleEnergy= 5*(src[l5] - src[l4]) + 2*(src[l3] - src[l6]); |
||
1061 | if(FFABS(middleEnergy) < 8*c->QP){ |
||
1062 | const int q=(src[l4] - src[l5])/2; |
||
1063 | const int leftEnergy= 5*(src[l3] - src[l2]) + 2*(src[l1] - src[l4]); |
||
1064 | const int rightEnergy= 5*(src[l7] - src[l6]) + 2*(src[l5] - src[l8]); |
||
1065 | |||
1066 | int d= FFABS(middleEnergy) - FFMIN( FFABS(leftEnergy), FFABS(rightEnergy) ); |
||
1067 | d= FFMAX(d, 0); |
||
1068 | |||
1069 | d= (5*d + 32) >> 6; |
||
1070 | d*= FFSIGN(-middleEnergy); |
||
1071 | |||
1072 | if(q>0){ |
||
1073 | d= d<0 ? 0 : d; |
||
1074 | d= d>q ? q : d; |
||
1075 | }else{ |
||
1076 | d= d>0 ? 0 : d; |
||
1077 | d= d |
||
1078 | } |
||
1079 | |||
1080 | src[l4]-= d; |
||
1081 | src[l5]+= d; |
||
1082 | } |
||
1083 | src++; |
||
1084 | } |
||
1085 | #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
1086 | } |
||
1087 | #endif //TEMPLATE_PP_ALTIVEC |
||
1088 | |||
1089 | #if !TEMPLATE_PP_ALTIVEC |
||
1090 | static inline void RENAME(dering)(uint8_t src[], int stride, PPContext *c) |
||
1091 | { |
||
1092 | #if HAVE_7REGS && (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) |
||
1093 | DECLARE_ALIGNED(8, uint64_t, tmp)[3]; |
||
1094 | __asm__ volatile( |
||
1095 | "pxor %%mm6, %%mm6 \n\t" |
||
1096 | "pcmpeqb %%mm7, %%mm7 \n\t" |
||
1097 | "movq %2, %%mm0 \n\t" |
||
1098 | "punpcklbw %%mm6, %%mm0 \n\t" |
||
1099 | "psrlw $1, %%mm0 \n\t" |
||
1100 | "psubw %%mm7, %%mm0 \n\t" |
||
1101 | "packuswb %%mm0, %%mm0 \n\t" |
||
1102 | "movq %%mm0, %3 \n\t" |
||
1103 | |||
1104 | "lea (%0, %1), %%"REG_a" \n\t" |
||
1105 | "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
||
1106 | |||
1107 | // 0 1 2 3 4 5 6 7 8 9 |
||
1108 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
||
1109 | |||
1110 | #undef REAL_FIND_MIN_MAX |
||
1111 | #undef FIND_MIN_MAX |
||
1112 | #if TEMPLATE_PP_MMXEXT |
||
1113 | #define REAL_FIND_MIN_MAX(addr)\ |
||
1114 | "movq " #addr ", %%mm0 \n\t"\ |
||
1115 | "pminub %%mm0, %%mm7 \n\t"\ |
||
1116 | "pmaxub %%mm0, %%mm6 \n\t" |
||
1117 | #else |
||
1118 | #define REAL_FIND_MIN_MAX(addr)\ |
||
1119 | "movq " #addr ", %%mm0 \n\t"\ |
||
1120 | "movq %%mm7, %%mm1 \n\t"\ |
||
1121 | "psubusb %%mm0, %%mm6 \n\t"\ |
||
1122 | "paddb %%mm0, %%mm6 \n\t"\ |
||
1123 | "psubusb %%mm0, %%mm1 \n\t"\ |
||
1124 | "psubb %%mm1, %%mm7 \n\t" |
||
1125 | #endif |
||
1126 | #define FIND_MIN_MAX(addr) REAL_FIND_MIN_MAX(addr) |
||
1127 | |||
1128 | FIND_MIN_MAX((%%REGa)) |
||
1129 | FIND_MIN_MAX((%%REGa, %1)) |
||
1130 | FIND_MIN_MAX((%%REGa, %1, 2)) |
||
1131 | FIND_MIN_MAX((%0, %1, 4)) |
||
1132 | FIND_MIN_MAX((%%REGd)) |
||
1133 | FIND_MIN_MAX((%%REGd, %1)) |
||
1134 | FIND_MIN_MAX((%%REGd, %1, 2)) |
||
1135 | FIND_MIN_MAX((%0, %1, 8)) |
||
1136 | |||
1137 | "movq %%mm7, %%mm4 \n\t" |
||
1138 | "psrlq $8, %%mm7 \n\t" |
||
1139 | #if TEMPLATE_PP_MMXEXT |
||
1140 | "pminub %%mm4, %%mm7 \n\t" // min of pixels |
||
1141 | "pshufw $0xF9, %%mm7, %%mm4 \n\t" |
||
1142 | "pminub %%mm4, %%mm7 \n\t" // min of pixels |
||
1143 | "pshufw $0xFE, %%mm7, %%mm4 \n\t" |
||
1144 | "pminub %%mm4, %%mm7 \n\t" |
||
1145 | #else |
||
1146 | "movq %%mm7, %%mm1 \n\t" |
||
1147 | "psubusb %%mm4, %%mm1 \n\t" |
||
1148 | "psubb %%mm1, %%mm7 \n\t" |
||
1149 | "movq %%mm7, %%mm4 \n\t" |
||
1150 | "psrlq $16, %%mm7 \n\t" |
||
1151 | "movq %%mm7, %%mm1 \n\t" |
||
1152 | "psubusb %%mm4, %%mm1 \n\t" |
||
1153 | "psubb %%mm1, %%mm7 \n\t" |
||
1154 | "movq %%mm7, %%mm4 \n\t" |
||
1155 | "psrlq $32, %%mm7 \n\t" |
||
1156 | "movq %%mm7, %%mm1 \n\t" |
||
1157 | "psubusb %%mm4, %%mm1 \n\t" |
||
1158 | "psubb %%mm1, %%mm7 \n\t" |
||
1159 | #endif |
||
1160 | |||
1161 | |||
1162 | "movq %%mm6, %%mm4 \n\t" |
||
1163 | "psrlq $8, %%mm6 \n\t" |
||
1164 | #if TEMPLATE_PP_MMXEXT |
||
1165 | "pmaxub %%mm4, %%mm6 \n\t" // max of pixels |
||
1166 | "pshufw $0xF9, %%mm6, %%mm4 \n\t" |
||
1167 | "pmaxub %%mm4, %%mm6 \n\t" |
||
1168 | "pshufw $0xFE, %%mm6, %%mm4 \n\t" |
||
1169 | "pmaxub %%mm4, %%mm6 \n\t" |
||
1170 | #else |
||
1171 | "psubusb %%mm4, %%mm6 \n\t" |
||
1172 | "paddb %%mm4, %%mm6 \n\t" |
||
1173 | "movq %%mm6, %%mm4 \n\t" |
||
1174 | "psrlq $16, %%mm6 \n\t" |
||
1175 | "psubusb %%mm4, %%mm6 \n\t" |
||
1176 | "paddb %%mm4, %%mm6 \n\t" |
||
1177 | "movq %%mm6, %%mm4 \n\t" |
||
1178 | "psrlq $32, %%mm6 \n\t" |
||
1179 | "psubusb %%mm4, %%mm6 \n\t" |
||
1180 | "paddb %%mm4, %%mm6 \n\t" |
||
1181 | #endif |
||
1182 | "movq %%mm6, %%mm0 \n\t" // max |
||
1183 | "psubb %%mm7, %%mm6 \n\t" // max - min |
||
1184 | "push %4 \n\t" |
||
1185 | "movd %%mm6, %k4 \n\t" |
||
1186 | "cmpb "MANGLE(deringThreshold)", %b4 \n\t" |
||
1187 | "pop %4 \n\t" |
||
1188 | " jb 1f \n\t" |
||
1189 | PAVGB(%%mm0, %%mm7) // a=(max + min)/2 |
||
1190 | "punpcklbw %%mm7, %%mm7 \n\t" |
||
1191 | "punpcklbw %%mm7, %%mm7 \n\t" |
||
1192 | "punpcklbw %%mm7, %%mm7 \n\t" |
||
1193 | "movq %%mm7, (%4) \n\t" |
||
1194 | |||
1195 | "movq (%0), %%mm0 \n\t" // L10 |
||
1196 | "movq %%mm0, %%mm1 \n\t" // L10 |
||
1197 | "movq %%mm0, %%mm2 \n\t" // L10 |
||
1198 | "psllq $8, %%mm1 \n\t" |
||
1199 | "psrlq $8, %%mm2 \n\t" |
||
1200 | "movd -4(%0), %%mm3 \n\t" |
||
1201 | "movd 8(%0), %%mm4 \n\t" |
||
1202 | "psrlq $24, %%mm3 \n\t" |
||
1203 | "psllq $56, %%mm4 \n\t" |
||
1204 | "por %%mm3, %%mm1 \n\t" // L00 |
||
1205 | "por %%mm4, %%mm2 \n\t" // L20 |
||
1206 | "movq %%mm1, %%mm3 \n\t" // L00 |
||
1207 | PAVGB(%%mm2, %%mm1) // (L20 + L00)/2 |
||
1208 | PAVGB(%%mm0, %%mm1) // (L20 + L00 + 2L10)/4 |
||
1209 | "psubusb %%mm7, %%mm0 \n\t" |
||
1210 | "psubusb %%mm7, %%mm2 \n\t" |
||
1211 | "psubusb %%mm7, %%mm3 \n\t" |
||
1212 | "pcmpeqb "MANGLE(b00)", %%mm0 \n\t" // L10 > a ? 0 : -1 |
||
1213 | "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L20 > a ? 0 : -1 |
||
1214 | "pcmpeqb "MANGLE(b00)", %%mm3 \n\t" // L00 > a ? 0 : -1 |
||
1215 | "paddb %%mm2, %%mm0 \n\t" |
||
1216 | "paddb %%mm3, %%mm0 \n\t" |
||
1217 | |||
1218 | "movq (%%"REG_a"), %%mm2 \n\t" // L11 |
||
1219 | "movq %%mm2, %%mm3 \n\t" // L11 |
||
1220 | "movq %%mm2, %%mm4 \n\t" // L11 |
||
1221 | "psllq $8, %%mm3 \n\t" |
||
1222 | "psrlq $8, %%mm4 \n\t" |
||
1223 | "movd -4(%%"REG_a"), %%mm5 \n\t" |
||
1224 | "movd 8(%%"REG_a"), %%mm6 \n\t" |
||
1225 | "psrlq $24, %%mm5 \n\t" |
||
1226 | "psllq $56, %%mm6 \n\t" |
||
1227 | "por %%mm5, %%mm3 \n\t" // L01 |
||
1228 | "por %%mm6, %%mm4 \n\t" // L21 |
||
1229 | "movq %%mm3, %%mm5 \n\t" // L01 |
||
1230 | PAVGB(%%mm4, %%mm3) // (L21 + L01)/2 |
||
1231 | PAVGB(%%mm2, %%mm3) // (L21 + L01 + 2L11)/4 |
||
1232 | "psubusb %%mm7, %%mm2 \n\t" |
||
1233 | "psubusb %%mm7, %%mm4 \n\t" |
||
1234 | "psubusb %%mm7, %%mm5 \n\t" |
||
1235 | "pcmpeqb "MANGLE(b00)", %%mm2 \n\t" // L11 > a ? 0 : -1 |
||
1236 | "pcmpeqb "MANGLE(b00)", %%mm4 \n\t" // L21 > a ? 0 : -1 |
||
1237 | "pcmpeqb "MANGLE(b00)", %%mm5 \n\t" // L01 > a ? 0 : -1 |
||
1238 | "paddb %%mm4, %%mm2 \n\t" |
||
1239 | "paddb %%mm5, %%mm2 \n\t" |
||
1240 | // 0, 2, 3, 1 |
||
1241 | #define REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ |
||
1242 | "movq " #src ", " #sx " \n\t" /* src[0] */\ |
||
1243 | "movq " #sx ", " #lx " \n\t" /* src[0] */\ |
||
1244 | "movq " #sx ", " #t0 " \n\t" /* src[0] */\ |
||
1245 | "psllq $8, " #lx " \n\t"\ |
||
1246 | "psrlq $8, " #t0 " \n\t"\ |
||
1247 | "movd -4" #src ", " #t1 " \n\t"\ |
||
1248 | "psrlq $24, " #t1 " \n\t"\ |
||
1249 | "por " #t1 ", " #lx " \n\t" /* src[-1] */\ |
||
1250 | "movd 8" #src ", " #t1 " \n\t"\ |
||
1251 | "psllq $56, " #t1 " \n\t"\ |
||
1252 | "por " #t1 ", " #t0 " \n\t" /* src[+1] */\ |
||
1253 | "movq " #lx ", " #t1 " \n\t" /* src[-1] */\ |
||
1254 | PAVGB(t0, lx) /* (src[-1] + src[+1])/2 */\ |
||
1255 | PAVGB(sx, lx) /* (src[-1] + 2src[0] + src[+1])/4 */\ |
||
1256 | PAVGB(lx, pplx) \ |
||
1257 | "movq " #lx ", 8(%4) \n\t"\ |
||
1258 | "movq (%4), " #lx " \n\t"\ |
||
1259 | "psubusb " #lx ", " #t1 " \n\t"\ |
||
1260 | "psubusb " #lx ", " #t0 " \n\t"\ |
||
1261 | "psubusb " #lx ", " #sx " \n\t"\ |
||
1262 | "movq "MANGLE(b00)", " #lx " \n\t"\ |
||
1263 | "pcmpeqb " #lx ", " #t1 " \n\t" /* src[-1] > a ? 0 : -1*/\ |
||
1264 | "pcmpeqb " #lx ", " #t0 " \n\t" /* src[+1] > a ? 0 : -1*/\ |
||
1265 | "pcmpeqb " #lx ", " #sx " \n\t" /* src[0] > a ? 0 : -1*/\ |
||
1266 | "paddb " #t1 ", " #t0 " \n\t"\ |
||
1267 | "paddb " #t0 ", " #sx " \n\t"\ |
||
1268 | \ |
||
1269 | PAVGB(plx, pplx) /* filtered */\ |
||
1270 | "movq " #dst ", " #t0 " \n\t" /* dst */\ |
||
1271 | "movq " #t0 ", " #t1 " \n\t" /* dst */\ |
||
1272 | "psubusb %3, " #t0 " \n\t"\ |
||
1273 | "paddusb %3, " #t1 " \n\t"\ |
||
1274 | PMAXUB(t0, pplx)\ |
||
1275 | PMINUB(t1, pplx, t0)\ |
||
1276 | "paddb " #sx ", " #ppsx " \n\t"\ |
||
1277 | "paddb " #psx ", " #ppsx " \n\t"\ |
||
1278 | "#paddb "MANGLE(b02)", " #ppsx " \n\t"\ |
||
1279 | "pand "MANGLE(b08)", " #ppsx " \n\t"\ |
||
1280 | "pcmpeqb " #lx ", " #ppsx " \n\t"\ |
||
1281 | "pand " #ppsx ", " #pplx " \n\t"\ |
||
1282 | "pandn " #dst ", " #ppsx " \n\t"\ |
||
1283 | "por " #pplx ", " #ppsx " \n\t"\ |
||
1284 | "movq " #ppsx ", " #dst " \n\t"\ |
||
1285 | "movq 8(%4), " #lx " \n\t" |
||
1286 | |||
1287 | #define DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) \ |
||
1288 | REAL_DERING_CORE(dst,src,ppsx,psx,sx,pplx,plx,lx,t0,t1) |
||
1289 | /* |
||
1290 | 0000000 |
||
1291 | 1111111 |
||
1292 | |||
1293 | 1111110 |
||
1294 | 1111101 |
||
1295 | 1111100 |
||
1296 | 1111011 |
||
1297 | 1111010 |
||
1298 | 1111001 |
||
1299 | |||
1300 | 1111000 |
||
1301 | 1110111 |
||
1302 | |||
1303 | */ |
||
1304 | //DERING_CORE(dst ,src ,ppsx ,psx ,sx ,pplx ,plx ,lx ,t0 ,t1) |
||
1305 | DERING_CORE((%%REGa) ,(%%REGa, %1) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
||
1306 | DERING_CORE((%%REGa, %1) ,(%%REGa, %1, 2),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
||
1307 | DERING_CORE((%%REGa, %1, 2),(%0, %1, 4) ,%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) |
||
1308 | DERING_CORE((%0, %1, 4) ,(%%REGd) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
||
1309 | DERING_CORE((%%REGd) ,(%%REGd, %1) ,%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
||
1310 | DERING_CORE((%%REGd, %1) ,(%%REGd, %1, 2),%%mm4,%%mm0,%%mm2,%%mm5,%%mm1,%%mm3,%%mm6,%%mm7) |
||
1311 | DERING_CORE((%%REGd, %1, 2),(%0, %1, 8) ,%%mm0,%%mm2,%%mm4,%%mm1,%%mm3,%%mm5,%%mm6,%%mm7) |
||
1312 | DERING_CORE((%0, %1, 8) ,(%%REGd, %1, 4),%%mm2,%%mm4,%%mm0,%%mm3,%%mm5,%%mm1,%%mm6,%%mm7) |
||
1313 | |||
1314 | "1: \n\t" |
||
1315 | : : "r" (src), "r" ((x86_reg)stride), "m" (c->pQPb), "m"(c->pQPb2), "q"(tmp) |
||
1316 | : "%"REG_a, "%"REG_d, "%"REG_SP |
||
1317 | ); |
||
1318 | #else // HAVE_7REGS && (TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW) |
||
1319 | int y; |
||
1320 | int min=255; |
||
1321 | int max=0; |
||
1322 | int avg; |
||
1323 | uint8_t *p; |
||
1324 | int s[10]; |
||
1325 | const int QP2= c->QP/2 + 1; |
||
1326 | |||
1327 | src --; |
||
1328 | for(y=1; y<9; y++){ |
||
1329 | int x; |
||
1330 | p= src + stride*y; |
||
1331 | for(x=1; x<9; x++){ |
||
1332 | p++; |
||
1333 | if(*p > max) max= *p; |
||
1334 | if(*p < min) min= *p; |
||
1335 | } |
||
1336 | } |
||
1337 | avg= (min + max + 1)>>1; |
||
1338 | |||
1339 | if(max - min |
||
1340 | |||
1341 | for(y=0; y<10; y++){ |
||
1342 | int t = 0; |
||
1343 | |||
1344 | if(src[stride*y + 0] > avg) t+= 1; |
||
1345 | if(src[stride*y + 1] > avg) t+= 2; |
||
1346 | if(src[stride*y + 2] > avg) t+= 4; |
||
1347 | if(src[stride*y + 3] > avg) t+= 8; |
||
1348 | if(src[stride*y + 4] > avg) t+= 16; |
||
1349 | if(src[stride*y + 5] > avg) t+= 32; |
||
1350 | if(src[stride*y + 6] > avg) t+= 64; |
||
1351 | if(src[stride*y + 7] > avg) t+= 128; |
||
1352 | if(src[stride*y + 8] > avg) t+= 256; |
||
1353 | if(src[stride*y + 9] > avg) t+= 512; |
||
1354 | |||
1355 | t |= (~t)<<16; |
||
1356 | t &= (t<<1) & (t>>1); |
||
1357 | s[y] = t; |
||
1358 | } |
||
1359 | |||
1360 | for(y=1; y<9; y++){ |
||
1361 | int t = s[y-1] & s[y] & s[y+1]; |
||
1362 | t|= t>>16; |
||
1363 | s[y-1]= t; |
||
1364 | } |
||
1365 | |||
1366 | for(y=1; y<9; y++){ |
||
1367 | int x; |
||
1368 | int t = s[y-1]; |
||
1369 | |||
1370 | p= src + stride*y; |
||
1371 | for(x=1; x<9; x++){ |
||
1372 | p++; |
||
1373 | if(t & (1< |
||
1374 | int f= (*(p-stride-1)) + 2*(*(p-stride)) + (*(p-stride+1)) |
||
1375 | +2*(*(p -1)) + 4*(*p ) + 2*(*(p +1)) |
||
1376 | +(*(p+stride-1)) + 2*(*(p+stride)) + (*(p+stride+1)); |
||
1377 | f= (f + 8)>>4; |
||
1378 | |||
1379 | #ifdef DEBUG_DERING_THRESHOLD |
||
1380 | __asm__ volatile("emms\n\t":); |
||
1381 | { |
||
1382 | static long long numPixels=0; |
||
1383 | if(x!=1 && x!=8 && y!=1 && y!=8) numPixels++; |
||
1384 | // if((max-min)<20 || (max-min)*QP<200) |
||
1385 | // if((max-min)*QP < 500) |
||
1386 | // if(max-min |
||
1387 | if(max-min < 20){ |
||
1388 | static int numSkipped=0; |
||
1389 | static int errorSum=0; |
||
1390 | static int worstQP=0; |
||
1391 | static int worstRange=0; |
||
1392 | static int worstDiff=0; |
||
1393 | int diff= (f - *p); |
||
1394 | int absDiff= FFABS(diff); |
||
1395 | int error= diff*diff; |
||
1396 | |||
1397 | if(x==1 || x==8 || y==1 || y==8) continue; |
||
1398 | |||
1399 | numSkipped++; |
||
1400 | if(absDiff > worstDiff){ |
||
1401 | worstDiff= absDiff; |
||
1402 | worstQP= QP; |
||
1403 | worstRange= max-min; |
||
1404 | } |
||
1405 | errorSum+= error; |
||
1406 | |||
1407 | if(1024LL*1024LL*1024LL % numSkipped == 0){ |
||
1408 | av_log(c, AV_LOG_INFO, "sum:%1.3f, skip:%d, wQP:%d, " |
||
1409 | "wRange:%d, wDiff:%d, relSkip:%1.3f\n", |
||
1410 | (float)errorSum/numSkipped, numSkipped, worstQP, worstRange, |
||
1411 | worstDiff, (float)numSkipped/numPixels); |
||
1412 | } |
||
1413 | } |
||
1414 | } |
||
1415 | #endif |
||
1416 | if (*p + QP2 < f) *p= *p + QP2; |
||
1417 | else if(*p - QP2 > f) *p= *p - QP2; |
||
1418 | else *p=f; |
||
1419 | } |
||
1420 | } |
||
1421 | } |
||
1422 | #ifdef DEBUG_DERING_THRESHOLD |
||
1423 | if(max-min < 20){ |
||
1424 | for(y=1; y<9; y++){ |
||
1425 | int x; |
||
1426 | int t = 0; |
||
1427 | p= src + stride*y; |
||
1428 | for(x=1; x<9; x++){ |
||
1429 | p++; |
||
1430 | *p = FFMIN(*p + 20, 255); |
||
1431 | } |
||
1432 | } |
||
1433 | // src[0] = src[7]=src[stride*7]=src[stride*7 + 7]=255; |
||
1434 | } |
||
1435 | #endif |
||
1436 | #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
1437 | } |
||
1438 | #endif //TEMPLATE_PP_ALTIVEC |
||
1439 | |||
1440 | /** |
||
1441 | * Deinterlace the given block by linearly interpolating every second line. |
||
1442 | * will be called for every 8x8 block and can read & write from line 4-15 |
||
1443 | * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. |
||
1444 | * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
||
1445 | */ |
||
1446 | static inline void RENAME(deInterlaceInterpolateLinear)(uint8_t src[], int stride) |
||
1447 | { |
||
1448 | #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
1449 | src+= 4*stride; |
||
1450 | __asm__ volatile( |
||
1451 | "lea (%0, %1), %%"REG_a" \n\t" |
||
1452 | "lea (%%"REG_a", %1, 4), %%"REG_c" \n\t" |
||
1453 | // 0 1 2 3 4 5 6 7 8 9 |
||
1454 | // %0 eax eax+%1 eax+2%1 %0+4%1 ecx ecx+%1 ecx+2%1 %0+8%1 ecx+4%1 |
||
1455 | |||
1456 | "movq (%0), %%mm0 \n\t" |
||
1457 | "movq (%%"REG_a", %1), %%mm1 \n\t" |
||
1458 | PAVGB(%%mm1, %%mm0) |
||
1459 | "movq %%mm0, (%%"REG_a") \n\t" |
||
1460 | "movq (%0, %1, 4), %%mm0 \n\t" |
||
1461 | PAVGB(%%mm0, %%mm1) |
||
1462 | "movq %%mm1, (%%"REG_a", %1, 2) \n\t" |
||
1463 | "movq (%%"REG_c", %1), %%mm1 \n\t" |
||
1464 | PAVGB(%%mm1, %%mm0) |
||
1465 | "movq %%mm0, (%%"REG_c") \n\t" |
||
1466 | "movq (%0, %1, 8), %%mm0 \n\t" |
||
1467 | PAVGB(%%mm0, %%mm1) |
||
1468 | "movq %%mm1, (%%"REG_c", %1, 2) \n\t" |
||
1469 | |||
1470 | : : "r" (src), "r" ((x86_reg)stride) |
||
1471 | : "%"REG_a, "%"REG_c |
||
1472 | ); |
||
1473 | #else |
||
1474 | int a, b, x; |
||
1475 | src+= 4*stride; |
||
1476 | |||
1477 | for(x=0; x<2; x++){ |
||
1478 | a= *(uint32_t*)&src[stride*0]; |
||
1479 | b= *(uint32_t*)&src[stride*2]; |
||
1480 | *(uint32_t*)&src[stride*1]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
||
1481 | a= *(uint32_t*)&src[stride*4]; |
||
1482 | *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
||
1483 | b= *(uint32_t*)&src[stride*6]; |
||
1484 | *(uint32_t*)&src[stride*5]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
||
1485 | a= *(uint32_t*)&src[stride*8]; |
||
1486 | *(uint32_t*)&src[stride*7]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
||
1487 | src += 4; |
||
1488 | } |
||
1489 | #endif |
||
1490 | } |
||
1491 | |||
1492 | /** |
||
1493 | * Deinterlace the given block by cubic interpolating every second line. |
||
1494 | * will be called for every 8x8 block and can read & write from line 4-15 |
||
1495 | * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. |
||
1496 | * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
||
1497 | * this filter will read lines 3-15 and write 7-13 |
||
1498 | */ |
||
1499 | static inline void RENAME(deInterlaceInterpolateCubic)(uint8_t src[], int stride) |
||
1500 | { |
||
1501 | #if TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
1502 | src+= stride*3; |
||
1503 | __asm__ volatile( |
||
1504 | "lea (%0, %1), %%"REG_a" \n\t" |
||
1505 | "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
||
1506 | "lea (%%"REG_d", %1, 4), %%"REG_c" \n\t" |
||
1507 | "add %1, %%"REG_c" \n\t" |
||
1508 | #if TEMPLATE_PP_SSE2 |
||
1509 | "pxor %%xmm7, %%xmm7 \n\t" |
||
1510 | #define REAL_DEINT_CUBIC(a,b,c,d,e)\ |
||
1511 | "movq " #a ", %%xmm0 \n\t"\ |
||
1512 | "movq " #b ", %%xmm1 \n\t"\ |
||
1513 | "movq " #d ", %%xmm2 \n\t"\ |
||
1514 | "movq " #e ", %%xmm3 \n\t"\ |
||
1515 | "pavgb %%xmm2, %%xmm1 \n\t"\ |
||
1516 | "pavgb %%xmm3, %%xmm0 \n\t"\ |
||
1517 | "punpcklbw %%xmm7, %%xmm0 \n\t"\ |
||
1518 | "punpcklbw %%xmm7, %%xmm1 \n\t"\ |
||
1519 | "psubw %%xmm1, %%xmm0 \n\t"\ |
||
1520 | "psraw $3, %%xmm0 \n\t"\ |
||
1521 | "psubw %%xmm0, %%xmm1 \n\t"\ |
||
1522 | "packuswb %%xmm1, %%xmm1 \n\t"\ |
||
1523 | "movlps %%xmm1, " #c " \n\t" |
||
1524 | #else //TEMPLATE_PP_SSE2 |
||
1525 | "pxor %%mm7, %%mm7 \n\t" |
||
1526 | // 0 1 2 3 4 5 6 7 8 9 10 |
||
1527 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx |
||
1528 | |||
1529 | #define REAL_DEINT_CUBIC(a,b,c,d,e)\ |
||
1530 | "movq " #a ", %%mm0 \n\t"\ |
||
1531 | "movq " #b ", %%mm1 \n\t"\ |
||
1532 | "movq " #d ", %%mm2 \n\t"\ |
||
1533 | "movq " #e ", %%mm3 \n\t"\ |
||
1534 | PAVGB(%%mm2, %%mm1) /* (b+d) /2 */\ |
||
1535 | PAVGB(%%mm3, %%mm0) /* (a+e) /2 */\ |
||
1536 | "movq %%mm0, %%mm2 \n\t"\ |
||
1537 | "punpcklbw %%mm7, %%mm0 \n\t"\ |
||
1538 | "punpckhbw %%mm7, %%mm2 \n\t"\ |
||
1539 | "movq %%mm1, %%mm3 \n\t"\ |
||
1540 | "punpcklbw %%mm7, %%mm1 \n\t"\ |
||
1541 | "punpckhbw %%mm7, %%mm3 \n\t"\ |
||
1542 | "psubw %%mm1, %%mm0 \n\t" /* L(a+e - (b+d))/2 */\ |
||
1543 | "psubw %%mm3, %%mm2 \n\t" /* H(a+e - (b+d))/2 */\ |
||
1544 | "psraw $3, %%mm0 \n\t" /* L(a+e - (b+d))/16 */\ |
||
1545 | "psraw $3, %%mm2 \n\t" /* H(a+e - (b+d))/16 */\ |
||
1546 | "psubw %%mm0, %%mm1 \n\t" /* L(9b + 9d - a - e)/16 */\ |
||
1547 | "psubw %%mm2, %%mm3 \n\t" /* H(9b + 9d - a - e)/16 */\ |
||
1548 | "packuswb %%mm3, %%mm1 \n\t"\ |
||
1549 | "movq %%mm1, " #c " \n\t" |
||
1550 | #endif //TEMPLATE_PP_SSE2 |
||
1551 | #define DEINT_CUBIC(a,b,c,d,e) REAL_DEINT_CUBIC(a,b,c,d,e) |
||
1552 | |||
1553 | DEINT_CUBIC((%0) , (%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd, %1)) |
||
1554 | DEINT_CUBIC((%%REGa, %1), (%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%0, %1, 8)) |
||
1555 | DEINT_CUBIC((%0, %1, 4) , (%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGc)) |
||
1556 | DEINT_CUBIC((%%REGd, %1), (%0, %1, 8) , (%%REGd, %1, 4), (%%REGc) , (%%REGc, %1, 2)) |
||
1557 | |||
1558 | : : "r" (src), "r" ((x86_reg)stride) |
||
1559 | : |
||
1560 | #if TEMPLATE_PP_SSE2 |
||
1561 | XMM_CLOBBERS("%xmm0", "%xmm1", "%xmm2", "%xmm3", "%xmm7",) |
||
1562 | #endif |
||
1563 | "%"REG_a, "%"REG_d, "%"REG_c |
||
1564 | ); |
||
1565 | #undef REAL_DEINT_CUBIC |
||
1566 | #else //TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
1567 | int x; |
||
1568 | src+= stride*3; |
||
1569 | for(x=0; x<8; x++){ |
||
1570 | src[stride*3] = CLIP((-src[0] + 9*src[stride*2] + 9*src[stride*4] - src[stride*6])>>4); |
||
1571 | src[stride*5] = CLIP((-src[stride*2] + 9*src[stride*4] + 9*src[stride*6] - src[stride*8])>>4); |
||
1572 | src[stride*7] = CLIP((-src[stride*4] + 9*src[stride*6] + 9*src[stride*8] - src[stride*10])>>4); |
||
1573 | src[stride*9] = CLIP((-src[stride*6] + 9*src[stride*8] + 9*src[stride*10] - src[stride*12])>>4); |
||
1574 | src++; |
||
1575 | } |
||
1576 | #endif //TEMPLATE_PP_SSE2 || TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
1577 | } |
||
1578 | |||
1579 | /** |
||
1580 | * Deinterlace the given block by filtering every second line with a (-1 4 2 4 -1) filter. |
||
1581 | * will be called for every 8x8 block and can read & write from line 4-15 |
||
1582 | * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. |
||
1583 | * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
||
1584 | * this filter will read lines 4-13 and write 5-11 |
||
1585 | */ |
||
1586 | static inline void RENAME(deInterlaceFF)(uint8_t src[], int stride, uint8_t *tmp) |
||
1587 | { |
||
1588 | #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
1589 | src+= stride*4; |
||
1590 | __asm__ volatile( |
||
1591 | "lea (%0, %1), %%"REG_a" \n\t" |
||
1592 | "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
||
1593 | "pxor %%mm7, %%mm7 \n\t" |
||
1594 | "movq (%2), %%mm0 \n\t" |
||
1595 | // 0 1 2 3 4 5 6 7 8 9 10 |
||
1596 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx |
||
1597 | |||
1598 | #define REAL_DEINT_FF(a,b,c,d)\ |
||
1599 | "movq " #a ", %%mm1 \n\t"\ |
||
1600 | "movq " #b ", %%mm2 \n\t"\ |
||
1601 | "movq " #c ", %%mm3 \n\t"\ |
||
1602 | "movq " #d ", %%mm4 \n\t"\ |
||
1603 | PAVGB(%%mm3, %%mm1) \ |
||
1604 | PAVGB(%%mm4, %%mm0) \ |
||
1605 | "movq %%mm0, %%mm3 \n\t"\ |
||
1606 | "punpcklbw %%mm7, %%mm0 \n\t"\ |
||
1607 | "punpckhbw %%mm7, %%mm3 \n\t"\ |
||
1608 | "movq %%mm1, %%mm4 \n\t"\ |
||
1609 | "punpcklbw %%mm7, %%mm1 \n\t"\ |
||
1610 | "punpckhbw %%mm7, %%mm4 \n\t"\ |
||
1611 | "psllw $2, %%mm1 \n\t"\ |
||
1612 | "psllw $2, %%mm4 \n\t"\ |
||
1613 | "psubw %%mm0, %%mm1 \n\t"\ |
||
1614 | "psubw %%mm3, %%mm4 \n\t"\ |
||
1615 | "movq %%mm2, %%mm5 \n\t"\ |
||
1616 | "movq %%mm2, %%mm0 \n\t"\ |
||
1617 | "punpcklbw %%mm7, %%mm2 \n\t"\ |
||
1618 | "punpckhbw %%mm7, %%mm5 \n\t"\ |
||
1619 | "paddw %%mm2, %%mm1 \n\t"\ |
||
1620 | "paddw %%mm5, %%mm4 \n\t"\ |
||
1621 | "psraw $2, %%mm1 \n\t"\ |
||
1622 | "psraw $2, %%mm4 \n\t"\ |
||
1623 | "packuswb %%mm4, %%mm1 \n\t"\ |
||
1624 | "movq %%mm1, " #b " \n\t"\ |
||
1625 | |||
1626 | #define DEINT_FF(a,b,c,d) REAL_DEINT_FF(a,b,c,d) |
||
1627 | |||
1628 | DEINT_FF((%0) , (%%REGa) , (%%REGa, %1), (%%REGa, %1, 2)) |
||
1629 | DEINT_FF((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) ) |
||
1630 | DEINT_FF((%0, %1, 4) , (%%REGd) , (%%REGd, %1), (%%REGd, %1, 2)) |
||
1631 | DEINT_FF((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4)) |
||
1632 | |||
1633 | "movq %%mm0, (%2) \n\t" |
||
1634 | : : "r" (src), "r" ((x86_reg)stride), "r"(tmp) |
||
1635 | : "%"REG_a, "%"REG_d |
||
1636 | ); |
||
1637 | #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
1638 | int x; |
||
1639 | src+= stride*4; |
||
1640 | for(x=0; x<8; x++){ |
||
1641 | int t1= tmp[x]; |
||
1642 | int t2= src[stride*1]; |
||
1643 | |||
1644 | src[stride*1]= CLIP((-t1 + 4*src[stride*0] + 2*t2 + 4*src[stride*2] - src[stride*3] + 4)>>3); |
||
1645 | t1= src[stride*4]; |
||
1646 | src[stride*3]= CLIP((-t2 + 4*src[stride*2] + 2*t1 + 4*src[stride*4] - src[stride*5] + 4)>>3); |
||
1647 | t2= src[stride*6]; |
||
1648 | src[stride*5]= CLIP((-t1 + 4*src[stride*4] + 2*t2 + 4*src[stride*6] - src[stride*7] + 4)>>3); |
||
1649 | t1= src[stride*8]; |
||
1650 | src[stride*7]= CLIP((-t2 + 4*src[stride*6] + 2*t1 + 4*src[stride*8] - src[stride*9] + 4)>>3); |
||
1651 | tmp[x]= t1; |
||
1652 | |||
1653 | src++; |
||
1654 | } |
||
1655 | #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
1656 | } |
||
1657 | |||
1658 | /** |
||
1659 | * Deinterlace the given block by filtering every line with a (-1 2 6 2 -1) filter. |
||
1660 | * will be called for every 8x8 block and can read & write from line 4-15 |
||
1661 | * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. |
||
1662 | * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
||
1663 | * this filter will read lines 4-13 and write 4-11 |
||
1664 | */ |
||
1665 | static inline void RENAME(deInterlaceL5)(uint8_t src[], int stride, uint8_t *tmp, uint8_t *tmp2) |
||
1666 | { |
||
1667 | #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
1668 | src+= stride*4; |
||
1669 | __asm__ volatile( |
||
1670 | "lea (%0, %1), %%"REG_a" \n\t" |
||
1671 | "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
||
1672 | "pxor %%mm7, %%mm7 \n\t" |
||
1673 | "movq (%2), %%mm0 \n\t" |
||
1674 | "movq (%3), %%mm1 \n\t" |
||
1675 | // 0 1 2 3 4 5 6 7 8 9 10 |
||
1676 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 ecx |
||
1677 | |||
1678 | #define REAL_DEINT_L5(t1,t2,a,b,c)\ |
||
1679 | "movq " #a ", %%mm2 \n\t"\ |
||
1680 | "movq " #b ", %%mm3 \n\t"\ |
||
1681 | "movq " #c ", %%mm4 \n\t"\ |
||
1682 | PAVGB(t2, %%mm3) \ |
||
1683 | PAVGB(t1, %%mm4) \ |
||
1684 | "movq %%mm2, %%mm5 \n\t"\ |
||
1685 | "movq %%mm2, " #t1 " \n\t"\ |
||
1686 | "punpcklbw %%mm7, %%mm2 \n\t"\ |
||
1687 | "punpckhbw %%mm7, %%mm5 \n\t"\ |
||
1688 | "movq %%mm2, %%mm6 \n\t"\ |
||
1689 | "paddw %%mm2, %%mm2 \n\t"\ |
||
1690 | "paddw %%mm6, %%mm2 \n\t"\ |
||
1691 | "movq %%mm5, %%mm6 \n\t"\ |
||
1692 | "paddw %%mm5, %%mm5 \n\t"\ |
||
1693 | "paddw %%mm6, %%mm5 \n\t"\ |
||
1694 | "movq %%mm3, %%mm6 \n\t"\ |
||
1695 | "punpcklbw %%mm7, %%mm3 \n\t"\ |
||
1696 | "punpckhbw %%mm7, %%mm6 \n\t"\ |
||
1697 | "paddw %%mm3, %%mm3 \n\t"\ |
||
1698 | "paddw %%mm6, %%mm6 \n\t"\ |
||
1699 | "paddw %%mm3, %%mm2 \n\t"\ |
||
1700 | "paddw %%mm6, %%mm5 \n\t"\ |
||
1701 | "movq %%mm4, %%mm6 \n\t"\ |
||
1702 | "punpcklbw %%mm7, %%mm4 \n\t"\ |
||
1703 | "punpckhbw %%mm7, %%mm6 \n\t"\ |
||
1704 | "psubw %%mm4, %%mm2 \n\t"\ |
||
1705 | "psubw %%mm6, %%mm5 \n\t"\ |
||
1706 | "psraw $2, %%mm2 \n\t"\ |
||
1707 | "psraw $2, %%mm5 \n\t"\ |
||
1708 | "packuswb %%mm5, %%mm2 \n\t"\ |
||
1709 | "movq %%mm2, " #a " \n\t"\ |
||
1710 | |||
1711 | #define DEINT_L5(t1,t2,a,b,c) REAL_DEINT_L5(t1,t2,a,b,c) |
||
1712 | |||
1713 | DEINT_L5(%%mm0, %%mm1, (%0) , (%%REGa) , (%%REGa, %1) ) |
||
1714 | DEINT_L5(%%mm1, %%mm0, (%%REGa) , (%%REGa, %1) , (%%REGa, %1, 2)) |
||
1715 | DEINT_L5(%%mm0, %%mm1, (%%REGa, %1) , (%%REGa, %1, 2), (%0, %1, 4) ) |
||
1716 | DEINT_L5(%%mm1, %%mm0, (%%REGa, %1, 2), (%0, %1, 4) , (%%REGd) ) |
||
1717 | DEINT_L5(%%mm0, %%mm1, (%0, %1, 4) , (%%REGd) , (%%REGd, %1) ) |
||
1718 | DEINT_L5(%%mm1, %%mm0, (%%REGd) , (%%REGd, %1) , (%%REGd, %1, 2)) |
||
1719 | DEINT_L5(%%mm0, %%mm1, (%%REGd, %1) , (%%REGd, %1, 2), (%0, %1, 8) ) |
||
1720 | DEINT_L5(%%mm1, %%mm0, (%%REGd, %1, 2), (%0, %1, 8) , (%%REGd, %1, 4)) |
||
1721 | |||
1722 | "movq %%mm0, (%2) \n\t" |
||
1723 | "movq %%mm1, (%3) \n\t" |
||
1724 | : : "r" (src), "r" ((x86_reg)stride), "r"(tmp), "r"(tmp2) |
||
1725 | : "%"REG_a, "%"REG_d |
||
1726 | ); |
||
1727 | #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
1728 | int x; |
||
1729 | src+= stride*4; |
||
1730 | for(x=0; x<8; x++){ |
||
1731 | int t1= tmp[x]; |
||
1732 | int t2= tmp2[x]; |
||
1733 | int t3= src[0]; |
||
1734 | |||
1735 | src[stride*0]= CLIP((-(t1 + src[stride*2]) + 2*(t2 + src[stride*1]) + 6*t3 + 4)>>3); |
||
1736 | t1= src[stride*1]; |
||
1737 | src[stride*1]= CLIP((-(t2 + src[stride*3]) + 2*(t3 + src[stride*2]) + 6*t1 + 4)>>3); |
||
1738 | t2= src[stride*2]; |
||
1739 | src[stride*2]= CLIP((-(t3 + src[stride*4]) + 2*(t1 + src[stride*3]) + 6*t2 + 4)>>3); |
||
1740 | t3= src[stride*3]; |
||
1741 | src[stride*3]= CLIP((-(t1 + src[stride*5]) + 2*(t2 + src[stride*4]) + 6*t3 + 4)>>3); |
||
1742 | t1= src[stride*4]; |
||
1743 | src[stride*4]= CLIP((-(t2 + src[stride*6]) + 2*(t3 + src[stride*5]) + 6*t1 + 4)>>3); |
||
1744 | t2= src[stride*5]; |
||
1745 | src[stride*5]= CLIP((-(t3 + src[stride*7]) + 2*(t1 + src[stride*6]) + 6*t2 + 4)>>3); |
||
1746 | t3= src[stride*6]; |
||
1747 | src[stride*6]= CLIP((-(t1 + src[stride*8]) + 2*(t2 + src[stride*7]) + 6*t3 + 4)>>3); |
||
1748 | t1= src[stride*7]; |
||
1749 | src[stride*7]= CLIP((-(t2 + src[stride*9]) + 2*(t3 + src[stride*8]) + 6*t1 + 4)>>3); |
||
1750 | |||
1751 | tmp[x]= t3; |
||
1752 | tmp2[x]= t1; |
||
1753 | |||
1754 | src++; |
||
1755 | } |
||
1756 | #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
1757 | } |
||
1758 | |||
1759 | /** |
||
1760 | * Deinterlace the given block by filtering all lines with a (1 2 1) filter. |
||
1761 | * will be called for every 8x8 block and can read & write from line 4-15 |
||
1762 | * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. |
||
1763 | * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
||
1764 | * this filter will read lines 4-13 and write 4-11 |
||
1765 | */ |
||
1766 | static inline void RENAME(deInterlaceBlendLinear)(uint8_t src[], int stride, uint8_t *tmp) |
||
1767 | { |
||
1768 | #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
1769 | src+= 4*stride; |
||
1770 | __asm__ volatile( |
||
1771 | "lea (%0, %1), %%"REG_a" \n\t" |
||
1772 | "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
||
1773 | // 0 1 2 3 4 5 6 7 8 9 |
||
1774 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
||
1775 | |||
1776 | "movq (%2), %%mm0 \n\t" // L0 |
||
1777 | "movq (%%"REG_a"), %%mm1 \n\t" // L2 |
||
1778 | PAVGB(%%mm1, %%mm0) // L0+L2 |
||
1779 | "movq (%0), %%mm2 \n\t" // L1 |
||
1780 | PAVGB(%%mm2, %%mm0) |
||
1781 | "movq %%mm0, (%0) \n\t" |
||
1782 | "movq (%%"REG_a", %1), %%mm0 \n\t" // L3 |
||
1783 | PAVGB(%%mm0, %%mm2) // L1+L3 |
||
1784 | PAVGB(%%mm1, %%mm2) // 2L2 + L1 + L3 |
||
1785 | "movq %%mm2, (%%"REG_a") \n\t" |
||
1786 | "movq (%%"REG_a", %1, 2), %%mm2 \n\t" // L4 |
||
1787 | PAVGB(%%mm2, %%mm1) // L2+L4 |
||
1788 | PAVGB(%%mm0, %%mm1) // 2L3 + L2 + L4 |
||
1789 | "movq %%mm1, (%%"REG_a", %1) \n\t" |
||
1790 | "movq (%0, %1, 4), %%mm1 \n\t" // L5 |
||
1791 | PAVGB(%%mm1, %%mm0) // L3+L5 |
||
1792 | PAVGB(%%mm2, %%mm0) // 2L4 + L3 + L5 |
||
1793 | "movq %%mm0, (%%"REG_a", %1, 2) \n\t" |
||
1794 | "movq (%%"REG_d"), %%mm0 \n\t" // L6 |
||
1795 | PAVGB(%%mm0, %%mm2) // L4+L6 |
||
1796 | PAVGB(%%mm1, %%mm2) // 2L5 + L4 + L6 |
||
1797 | "movq %%mm2, (%0, %1, 4) \n\t" |
||
1798 | "movq (%%"REG_d", %1), %%mm2 \n\t" // L7 |
||
1799 | PAVGB(%%mm2, %%mm1) // L5+L7 |
||
1800 | PAVGB(%%mm0, %%mm1) // 2L6 + L5 + L7 |
||
1801 | "movq %%mm1, (%%"REG_d") \n\t" |
||
1802 | "movq (%%"REG_d", %1, 2), %%mm1 \n\t" // L8 |
||
1803 | PAVGB(%%mm1, %%mm0) // L6+L8 |
||
1804 | PAVGB(%%mm2, %%mm0) // 2L7 + L6 + L8 |
||
1805 | "movq %%mm0, (%%"REG_d", %1) \n\t" |
||
1806 | "movq (%0, %1, 8), %%mm0 \n\t" // L9 |
||
1807 | PAVGB(%%mm0, %%mm2) // L7+L9 |
||
1808 | PAVGB(%%mm1, %%mm2) // 2L8 + L7 + L9 |
||
1809 | "movq %%mm2, (%%"REG_d", %1, 2) \n\t" |
||
1810 | "movq %%mm1, (%2) \n\t" |
||
1811 | |||
1812 | : : "r" (src), "r" ((x86_reg)stride), "r" (tmp) |
||
1813 | : "%"REG_a, "%"REG_d |
||
1814 | ); |
||
1815 | #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
1816 | int a, b, c, x; |
||
1817 | src+= 4*stride; |
||
1818 | |||
1819 | for(x=0; x<2; x++){ |
||
1820 | a= *(uint32_t*)&tmp[stride*0]; |
||
1821 | b= *(uint32_t*)&src[stride*0]; |
||
1822 | c= *(uint32_t*)&src[stride*1]; |
||
1823 | a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); |
||
1824 | *(uint32_t*)&src[stride*0]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
||
1825 | |||
1826 | a= *(uint32_t*)&src[stride*2]; |
||
1827 | b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); |
||
1828 | *(uint32_t*)&src[stride*1]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); |
||
1829 | |||
1830 | b= *(uint32_t*)&src[stride*3]; |
||
1831 | c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); |
||
1832 | *(uint32_t*)&src[stride*2]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); |
||
1833 | |||
1834 | c= *(uint32_t*)&src[stride*4]; |
||
1835 | a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); |
||
1836 | *(uint32_t*)&src[stride*3]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
||
1837 | |||
1838 | a= *(uint32_t*)&src[stride*5]; |
||
1839 | b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); |
||
1840 | *(uint32_t*)&src[stride*4]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); |
||
1841 | |||
1842 | b= *(uint32_t*)&src[stride*6]; |
||
1843 | c= (b&c) + (((b^c)&0xFEFEFEFEUL)>>1); |
||
1844 | *(uint32_t*)&src[stride*5]= (c|a) - (((c^a)&0xFEFEFEFEUL)>>1); |
||
1845 | |||
1846 | c= *(uint32_t*)&src[stride*7]; |
||
1847 | a= (a&c) + (((a^c)&0xFEFEFEFEUL)>>1); |
||
1848 | *(uint32_t*)&src[stride*6]= (a|b) - (((a^b)&0xFEFEFEFEUL)>>1); |
||
1849 | |||
1850 | a= *(uint32_t*)&src[stride*8]; |
||
1851 | b= (a&b) + (((a^b)&0xFEFEFEFEUL)>>1); |
||
1852 | *(uint32_t*)&src[stride*7]= (c|b) - (((c^b)&0xFEFEFEFEUL)>>1); |
||
1853 | |||
1854 | *(uint32_t*)&tmp[stride*0]= c; |
||
1855 | src += 4; |
||
1856 | tmp += 4; |
||
1857 | } |
||
1858 | #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
1859 | } |
||
1860 | |||
1861 | /** |
||
1862 | * Deinterlace the given block by applying a median filter to every second line. |
||
1863 | * will be called for every 8x8 block and can read & write from line 4-15, |
||
1864 | * lines 0-3 have been passed through the deblock / dering filters already, but can be read, too. |
||
1865 | * lines 4-12 will be read into the deblocking filter and should be deinterlaced |
||
1866 | */ |
||
1867 | static inline void RENAME(deInterlaceMedian)(uint8_t src[], int stride) |
||
1868 | { |
||
1869 | #if TEMPLATE_PP_MMX |
||
1870 | src+= 4*stride; |
||
1871 | #if TEMPLATE_PP_MMXEXT |
||
1872 | __asm__ volatile( |
||
1873 | "lea (%0, %1), %%"REG_a" \n\t" |
||
1874 | "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
||
1875 | // 0 1 2 3 4 5 6 7 8 9 |
||
1876 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
||
1877 | |||
1878 | "movq (%0), %%mm0 \n\t" |
||
1879 | "movq (%%"REG_a", %1), %%mm2 \n\t" |
||
1880 | "movq (%%"REG_a"), %%mm1 \n\t" |
||
1881 | "movq %%mm0, %%mm3 \n\t" |
||
1882 | "pmaxub %%mm1, %%mm0 \n\t" |
||
1883 | "pminub %%mm3, %%mm1 \n\t" |
||
1884 | "pmaxub %%mm2, %%mm1 \n\t" |
||
1885 | "pminub %%mm1, %%mm0 \n\t" |
||
1886 | "movq %%mm0, (%%"REG_a") \n\t" |
||
1887 | |||
1888 | "movq (%0, %1, 4), %%mm0 \n\t" |
||
1889 | "movq (%%"REG_a", %1, 2), %%mm1 \n\t" |
||
1890 | "movq %%mm2, %%mm3 \n\t" |
||
1891 | "pmaxub %%mm1, %%mm2 \n\t" |
||
1892 | "pminub %%mm3, %%mm1 \n\t" |
||
1893 | "pmaxub %%mm0, %%mm1 \n\t" |
||
1894 | "pminub %%mm1, %%mm2 \n\t" |
||
1895 | "movq %%mm2, (%%"REG_a", %1, 2) \n\t" |
||
1896 | |||
1897 | "movq (%%"REG_d"), %%mm2 \n\t" |
||
1898 | "movq (%%"REG_d", %1), %%mm1 \n\t" |
||
1899 | "movq %%mm2, %%mm3 \n\t" |
||
1900 | "pmaxub %%mm0, %%mm2 \n\t" |
||
1901 | "pminub %%mm3, %%mm0 \n\t" |
||
1902 | "pmaxub %%mm1, %%mm0 \n\t" |
||
1903 | "pminub %%mm0, %%mm2 \n\t" |
||
1904 | "movq %%mm2, (%%"REG_d") \n\t" |
||
1905 | |||
1906 | "movq (%%"REG_d", %1, 2), %%mm2 \n\t" |
||
1907 | "movq (%0, %1, 8), %%mm0 \n\t" |
||
1908 | "movq %%mm2, %%mm3 \n\t" |
||
1909 | "pmaxub %%mm0, %%mm2 \n\t" |
||
1910 | "pminub %%mm3, %%mm0 \n\t" |
||
1911 | "pmaxub %%mm1, %%mm0 \n\t" |
||
1912 | "pminub %%mm0, %%mm2 \n\t" |
||
1913 | "movq %%mm2, (%%"REG_d", %1, 2) \n\t" |
||
1914 | |||
1915 | |||
1916 | : : "r" (src), "r" ((x86_reg)stride) |
||
1917 | : "%"REG_a, "%"REG_d |
||
1918 | ); |
||
1919 | |||
1920 | #else // MMX without MMX2 |
||
1921 | __asm__ volatile( |
||
1922 | "lea (%0, %1), %%"REG_a" \n\t" |
||
1923 | "lea (%%"REG_a", %1, 4), %%"REG_d" \n\t" |
||
1924 | // 0 1 2 3 4 5 6 7 8 9 |
||
1925 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
||
1926 | "pxor %%mm7, %%mm7 \n\t" |
||
1927 | |||
1928 | #define REAL_MEDIAN(a,b,c)\ |
||
1929 | "movq " #a ", %%mm0 \n\t"\ |
||
1930 | "movq " #b ", %%mm2 \n\t"\ |
||
1931 | "movq " #c ", %%mm1 \n\t"\ |
||
1932 | "movq %%mm0, %%mm3 \n\t"\ |
||
1933 | "movq %%mm1, %%mm4 \n\t"\ |
||
1934 | "movq %%mm2, %%mm5 \n\t"\ |
||
1935 | "psubusb %%mm1, %%mm3 \n\t"\ |
||
1936 | "psubusb %%mm2, %%mm4 \n\t"\ |
||
1937 | "psubusb %%mm0, %%mm5 \n\t"\ |
||
1938 | "pcmpeqb %%mm7, %%mm3 \n\t"\ |
||
1939 | "pcmpeqb %%mm7, %%mm4 \n\t"\ |
||
1940 | "pcmpeqb %%mm7, %%mm5 \n\t"\ |
||
1941 | "movq %%mm3, %%mm6 \n\t"\ |
||
1942 | "pxor %%mm4, %%mm3 \n\t"\ |
||
1943 | "pxor %%mm5, %%mm4 \n\t"\ |
||
1944 | "pxor %%mm6, %%mm5 \n\t"\ |
||
1945 | "por %%mm3, %%mm1 \n\t"\ |
||
1946 | "por %%mm4, %%mm2 \n\t"\ |
||
1947 | "por %%mm5, %%mm0 \n\t"\ |
||
1948 | "pand %%mm2, %%mm0 \n\t"\ |
||
1949 | "pand %%mm1, %%mm0 \n\t"\ |
||
1950 | "movq %%mm0, " #b " \n\t" |
||
1951 | #define MEDIAN(a,b,c) REAL_MEDIAN(a,b,c) |
||
1952 | |||
1953 | MEDIAN((%0) , (%%REGa) , (%%REGa, %1)) |
||
1954 | MEDIAN((%%REGa, %1), (%%REGa, %1, 2), (%0, %1, 4)) |
||
1955 | MEDIAN((%0, %1, 4) , (%%REGd) , (%%REGd, %1)) |
||
1956 | MEDIAN((%%REGd, %1), (%%REGd, %1, 2), (%0, %1, 8)) |
||
1957 | |||
1958 | : : "r" (src), "r" ((x86_reg)stride) |
||
1959 | : "%"REG_a, "%"REG_d |
||
1960 | ); |
||
1961 | #endif //TEMPLATE_PP_MMXEXT |
||
1962 | #else //TEMPLATE_PP_MMX |
||
1963 | int x, y; |
||
1964 | src+= 4*stride; |
||
1965 | // FIXME - there should be a way to do a few columns in parallel like w/mmx |
||
1966 | for(x=0; x<8; x++){ |
||
1967 | uint8_t *colsrc = src; |
||
1968 | for (y=0; y<4; y++){ |
||
1969 | int a, b, c, d, e, f; |
||
1970 | a = colsrc[0 ]; |
||
1971 | b = colsrc[stride ]; |
||
1972 | c = colsrc[stride*2]; |
||
1973 | d = (a-b)>>31; |
||
1974 | e = (b-c)>>31; |
||
1975 | f = (c-a)>>31; |
||
1976 | colsrc[stride ] = (a|(d^f)) & (b|(d^e)) & (c|(e^f)); |
||
1977 | colsrc += stride*2; |
||
1978 | } |
||
1979 | src++; |
||
1980 | } |
||
1981 | #endif //TEMPLATE_PP_MMX |
||
1982 | } |
||
1983 | |||
1984 | #if TEMPLATE_PP_MMX |
||
1985 | /** |
||
1986 | * Transpose and shift the given 8x8 Block into dst1 and dst2. |
||
1987 | */ |
||
1988 | static inline void RENAME(transpose1)(uint8_t *dst1, uint8_t *dst2, uint8_t *src, int srcStride) |
||
1989 | { |
||
1990 | __asm__( |
||
1991 | "lea (%0, %1), %%"REG_a" \n\t" |
||
1992 | // 0 1 2 3 4 5 6 7 8 9 |
||
1993 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
||
1994 | "movq (%0), %%mm0 \n\t" // 12345678 |
||
1995 | "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh |
||
1996 | "movq %%mm0, %%mm2 \n\t" // 12345678 |
||
1997 | "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
||
1998 | "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
||
1999 | |||
2000 | "movq (%%"REG_a", %1), %%mm1 \n\t" |
||
2001 | "movq (%%"REG_a", %1, 2), %%mm3 \n\t" |
||
2002 | "movq %%mm1, %%mm4 \n\t" |
||
2003 | "punpcklbw %%mm3, %%mm1 \n\t" |
||
2004 | "punpckhbw %%mm3, %%mm4 \n\t" |
||
2005 | |||
2006 | "movq %%mm0, %%mm3 \n\t" |
||
2007 | "punpcklwd %%mm1, %%mm0 \n\t" |
||
2008 | "punpckhwd %%mm1, %%mm3 \n\t" |
||
2009 | "movq %%mm2, %%mm1 \n\t" |
||
2010 | "punpcklwd %%mm4, %%mm2 \n\t" |
||
2011 | "punpckhwd %%mm4, %%mm1 \n\t" |
||
2012 | |||
2013 | "movd %%mm0, 128(%2) \n\t" |
||
2014 | "psrlq $32, %%mm0 \n\t" |
||
2015 | "movd %%mm0, 144(%2) \n\t" |
||
2016 | "movd %%mm3, 160(%2) \n\t" |
||
2017 | "psrlq $32, %%mm3 \n\t" |
||
2018 | "movd %%mm3, 176(%2) \n\t" |
||
2019 | "movd %%mm3, 48(%3) \n\t" |
||
2020 | "movd %%mm2, 192(%2) \n\t" |
||
2021 | "movd %%mm2, 64(%3) \n\t" |
||
2022 | "psrlq $32, %%mm2 \n\t" |
||
2023 | "movd %%mm2, 80(%3) \n\t" |
||
2024 | "movd %%mm1, 96(%3) \n\t" |
||
2025 | "psrlq $32, %%mm1 \n\t" |
||
2026 | "movd %%mm1, 112(%3) \n\t" |
||
2027 | |||
2028 | "lea (%%"REG_a", %1, 4), %%"REG_a" \n\t" |
||
2029 | |||
2030 | "movq (%0, %1, 4), %%mm0 \n\t" // 12345678 |
||
2031 | "movq (%%"REG_a"), %%mm1 \n\t" // abcdefgh |
||
2032 | "movq %%mm0, %%mm2 \n\t" // 12345678 |
||
2033 | "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
||
2034 | "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
||
2035 | |||
2036 | "movq (%%"REG_a", %1), %%mm1 \n\t" |
||
2037 | "movq (%%"REG_a", %1, 2), %%mm3 \n\t" |
||
2038 | "movq %%mm1, %%mm4 \n\t" |
||
2039 | "punpcklbw %%mm3, %%mm1 \n\t" |
||
2040 | "punpckhbw %%mm3, %%mm4 \n\t" |
||
2041 | |||
2042 | "movq %%mm0, %%mm3 \n\t" |
||
2043 | "punpcklwd %%mm1, %%mm0 \n\t" |
||
2044 | "punpckhwd %%mm1, %%mm3 \n\t" |
||
2045 | "movq %%mm2, %%mm1 \n\t" |
||
2046 | "punpcklwd %%mm4, %%mm2 \n\t" |
||
2047 | "punpckhwd %%mm4, %%mm1 \n\t" |
||
2048 | |||
2049 | "movd %%mm0, 132(%2) \n\t" |
||
2050 | "psrlq $32, %%mm0 \n\t" |
||
2051 | "movd %%mm0, 148(%2) \n\t" |
||
2052 | "movd %%mm3, 164(%2) \n\t" |
||
2053 | "psrlq $32, %%mm3 \n\t" |
||
2054 | "movd %%mm3, 180(%2) \n\t" |
||
2055 | "movd %%mm3, 52(%3) \n\t" |
||
2056 | "movd %%mm2, 196(%2) \n\t" |
||
2057 | "movd %%mm2, 68(%3) \n\t" |
||
2058 | "psrlq $32, %%mm2 \n\t" |
||
2059 | "movd %%mm2, 84(%3) \n\t" |
||
2060 | "movd %%mm1, 100(%3) \n\t" |
||
2061 | "psrlq $32, %%mm1 \n\t" |
||
2062 | "movd %%mm1, 116(%3) \n\t" |
||
2063 | |||
2064 | |||
2065 | :: "r" (src), "r" ((x86_reg)srcStride), "r" (dst1), "r" (dst2) |
||
2066 | : "%"REG_a |
||
2067 | ); |
||
2068 | } |
||
2069 | |||
2070 | /** |
||
2071 | * Transpose the given 8x8 block. |
||
2072 | */ |
||
2073 | static inline void RENAME(transpose2)(uint8_t *dst, int dstStride, uint8_t *src) |
||
2074 | { |
||
2075 | __asm__( |
||
2076 | "lea (%0, %1), %%"REG_a" \n\t" |
||
2077 | "lea (%%"REG_a",%1,4), %%"REG_d" \n\t" |
||
2078 | // 0 1 2 3 4 5 6 7 8 9 |
||
2079 | // %0 eax eax+%1 eax+2%1 %0+4%1 edx edx+%1 edx+2%1 %0+8%1 edx+4%1 |
||
2080 | "movq (%2), %%mm0 \n\t" // 12345678 |
||
2081 | "movq 16(%2), %%mm1 \n\t" // abcdefgh |
||
2082 | "movq %%mm0, %%mm2 \n\t" // 12345678 |
||
2083 | "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
||
2084 | "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
||
2085 | |||
2086 | "movq 32(%2), %%mm1 \n\t" |
||
2087 | "movq 48(%2), %%mm3 \n\t" |
||
2088 | "movq %%mm1, %%mm4 \n\t" |
||
2089 | "punpcklbw %%mm3, %%mm1 \n\t" |
||
2090 | "punpckhbw %%mm3, %%mm4 \n\t" |
||
2091 | |||
2092 | "movq %%mm0, %%mm3 \n\t" |
||
2093 | "punpcklwd %%mm1, %%mm0 \n\t" |
||
2094 | "punpckhwd %%mm1, %%mm3 \n\t" |
||
2095 | "movq %%mm2, %%mm1 \n\t" |
||
2096 | "punpcklwd %%mm4, %%mm2 \n\t" |
||
2097 | "punpckhwd %%mm4, %%mm1 \n\t" |
||
2098 | |||
2099 | "movd %%mm0, (%0) \n\t" |
||
2100 | "psrlq $32, %%mm0 \n\t" |
||
2101 | "movd %%mm0, (%%"REG_a") \n\t" |
||
2102 | "movd %%mm3, (%%"REG_a", %1) \n\t" |
||
2103 | "psrlq $32, %%mm3 \n\t" |
||
2104 | "movd %%mm3, (%%"REG_a", %1, 2) \n\t" |
||
2105 | "movd %%mm2, (%0, %1, 4) \n\t" |
||
2106 | "psrlq $32, %%mm2 \n\t" |
||
2107 | "movd %%mm2, (%%"REG_d") \n\t" |
||
2108 | "movd %%mm1, (%%"REG_d", %1) \n\t" |
||
2109 | "psrlq $32, %%mm1 \n\t" |
||
2110 | "movd %%mm1, (%%"REG_d", %1, 2) \n\t" |
||
2111 | |||
2112 | |||
2113 | "movq 64(%2), %%mm0 \n\t" // 12345678 |
||
2114 | "movq 80(%2), %%mm1 \n\t" // abcdefgh |
||
2115 | "movq %%mm0, %%mm2 \n\t" // 12345678 |
||
2116 | "punpcklbw %%mm1, %%mm0 \n\t" // 1a2b3c4d |
||
2117 | "punpckhbw %%mm1, %%mm2 \n\t" // 5e6f7g8h |
||
2118 | |||
2119 | "movq 96(%2), %%mm1 \n\t" |
||
2120 | "movq 112(%2), %%mm3 \n\t" |
||
2121 | "movq %%mm1, %%mm4 \n\t" |
||
2122 | "punpcklbw %%mm3, %%mm1 \n\t" |
||
2123 | "punpckhbw %%mm3, %%mm4 \n\t" |
||
2124 | |||
2125 | "movq %%mm0, %%mm3 \n\t" |
||
2126 | "punpcklwd %%mm1, %%mm0 \n\t" |
||
2127 | "punpckhwd %%mm1, %%mm3 \n\t" |
||
2128 | "movq %%mm2, %%mm1 \n\t" |
||
2129 | "punpcklwd %%mm4, %%mm2 \n\t" |
||
2130 | "punpckhwd %%mm4, %%mm1 \n\t" |
||
2131 | |||
2132 | "movd %%mm0, 4(%0) \n\t" |
||
2133 | "psrlq $32, %%mm0 \n\t" |
||
2134 | "movd %%mm0, 4(%%"REG_a") \n\t" |
||
2135 | "movd %%mm3, 4(%%"REG_a", %1) \n\t" |
||
2136 | "psrlq $32, %%mm3 \n\t" |
||
2137 | "movd %%mm3, 4(%%"REG_a", %1, 2) \n\t" |
||
2138 | "movd %%mm2, 4(%0, %1, 4) \n\t" |
||
2139 | "psrlq $32, %%mm2 \n\t" |
||
2140 | "movd %%mm2, 4(%%"REG_d") \n\t" |
||
2141 | "movd %%mm1, 4(%%"REG_d", %1) \n\t" |
||
2142 | "psrlq $32, %%mm1 \n\t" |
||
2143 | "movd %%mm1, 4(%%"REG_d", %1, 2) \n\t" |
||
2144 | |||
2145 | :: "r" (dst), "r" ((x86_reg)dstStride), "r" (src) |
||
2146 | : "%"REG_a, "%"REG_d |
||
2147 | ); |
||
2148 | } |
||
2149 | #endif //TEMPLATE_PP_MMX |
||
2150 | //static long test=0; |
||
2151 | |||
2152 | #if !TEMPLATE_PP_ALTIVEC |
||
2153 | static inline void RENAME(tempNoiseReducer)(uint8_t *src, int stride, |
||
2154 | uint8_t *tempBlurred, uint32_t *tempBlurredPast, int *maxNoise) |
||
2155 | { |
||
2156 | // to save a register (FIXME do this outside of the loops) |
||
2157 | tempBlurredPast[127]= maxNoise[0]; |
||
2158 | tempBlurredPast[128]= maxNoise[1]; |
||
2159 | tempBlurredPast[129]= maxNoise[2]; |
||
2160 | |||
2161 | #define FAST_L2_DIFF |
||
2162 | //#define L1_DIFF //u should change the thresholds too if u try that one |
||
2163 | #if TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
2164 | __asm__ volatile( |
||
2165 | "lea (%2, %2, 2), %%"REG_a" \n\t" // 3*stride |
||
2166 | "lea (%2, %2, 4), %%"REG_d" \n\t" // 5*stride |
||
2167 | "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride |
||
2168 | // 0 1 2 3 4 5 6 7 8 9 |
||
2169 | // %x %x+%2 %x+2%2 %x+eax %x+4%2 %x+edx %x+2eax %x+ecx %x+8%2 |
||
2170 | //FIXME reorder? |
||
2171 | #ifdef L1_DIFF //needs mmx2 |
||
2172 | "movq (%0), %%mm0 \n\t" // L0 |
||
2173 | "psadbw (%1), %%mm0 \n\t" // |L0-R0| |
||
2174 | "movq (%0, %2), %%mm1 \n\t" // L1 |
||
2175 | "psadbw (%1, %2), %%mm1 \n\t" // |L1-R1| |
||
2176 | "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
||
2177 | "psadbw (%1, %2, 2), %%mm2 \n\t" // |L2-R2| |
||
2178 | "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 |
||
2179 | "psadbw (%1, %%"REG_a"), %%mm3 \n\t" // |L3-R3| |
||
2180 | |||
2181 | "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
||
2182 | "paddw %%mm1, %%mm0 \n\t" |
||
2183 | "psadbw (%1, %2, 4), %%mm4 \n\t" // |L4-R4| |
||
2184 | "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 |
||
2185 | "paddw %%mm2, %%mm0 \n\t" |
||
2186 | "psadbw (%1, %%"REG_d"), %%mm5 \n\t" // |L5-R5| |
||
2187 | "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 |
||
2188 | "paddw %%mm3, %%mm0 \n\t" |
||
2189 | "psadbw (%1, %%"REG_a", 2), %%mm6 \n\t" // |L6-R6| |
||
2190 | "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 |
||
2191 | "paddw %%mm4, %%mm0 \n\t" |
||
2192 | "psadbw (%1, %%"REG_c"), %%mm7 \n\t" // |L7-R7| |
||
2193 | "paddw %%mm5, %%mm6 \n\t" |
||
2194 | "paddw %%mm7, %%mm6 \n\t" |
||
2195 | "paddw %%mm6, %%mm0 \n\t" |
||
2196 | #else //L1_DIFF |
||
2197 | #if defined (FAST_L2_DIFF) |
||
2198 | "pcmpeqb %%mm7, %%mm7 \n\t" |
||
2199 | "movq "MANGLE(b80)", %%mm6 \n\t" |
||
2200 | "pxor %%mm0, %%mm0 \n\t" |
||
2201 | #define REAL_L2_DIFF_CORE(a, b)\ |
||
2202 | "movq " #a ", %%mm5 \n\t"\ |
||
2203 | "movq " #b ", %%mm2 \n\t"\ |
||
2204 | "pxor %%mm7, %%mm2 \n\t"\ |
||
2205 | PAVGB(%%mm2, %%mm5)\ |
||
2206 | "paddb %%mm6, %%mm5 \n\t"\ |
||
2207 | "movq %%mm5, %%mm2 \n\t"\ |
||
2208 | "psllw $8, %%mm5 \n\t"\ |
||
2209 | "pmaddwd %%mm5, %%mm5 \n\t"\ |
||
2210 | "pmaddwd %%mm2, %%mm2 \n\t"\ |
||
2211 | "paddd %%mm2, %%mm5 \n\t"\ |
||
2212 | "psrld $14, %%mm5 \n\t"\ |
||
2213 | "paddd %%mm5, %%mm0 \n\t" |
||
2214 | |||
2215 | #else //defined (FAST_L2_DIFF) |
||
2216 | "pxor %%mm7, %%mm7 \n\t" |
||
2217 | "pxor %%mm0, %%mm0 \n\t" |
||
2218 | #define REAL_L2_DIFF_CORE(a, b)\ |
||
2219 | "movq " #a ", %%mm5 \n\t"\ |
||
2220 | "movq " #b ", %%mm2 \n\t"\ |
||
2221 | "movq %%mm5, %%mm1 \n\t"\ |
||
2222 | "movq %%mm2, %%mm3 \n\t"\ |
||
2223 | "punpcklbw %%mm7, %%mm5 \n\t"\ |
||
2224 | "punpckhbw %%mm7, %%mm1 \n\t"\ |
||
2225 | "punpcklbw %%mm7, %%mm2 \n\t"\ |
||
2226 | "punpckhbw %%mm7, %%mm3 \n\t"\ |
||
2227 | "psubw %%mm2, %%mm5 \n\t"\ |
||
2228 | "psubw %%mm3, %%mm1 \n\t"\ |
||
2229 | "pmaddwd %%mm5, %%mm5 \n\t"\ |
||
2230 | "pmaddwd %%mm1, %%mm1 \n\t"\ |
||
2231 | "paddd %%mm1, %%mm5 \n\t"\ |
||
2232 | "paddd %%mm5, %%mm0 \n\t" |
||
2233 | |||
2234 | #endif //defined (FAST_L2_DIFF) |
||
2235 | |||
2236 | #define L2_DIFF_CORE(a, b) REAL_L2_DIFF_CORE(a, b) |
||
2237 | |||
2238 | L2_DIFF_CORE((%0) , (%1)) |
||
2239 | L2_DIFF_CORE((%0, %2) , (%1, %2)) |
||
2240 | L2_DIFF_CORE((%0, %2, 2) , (%1, %2, 2)) |
||
2241 | L2_DIFF_CORE((%0, %%REGa) , (%1, %%REGa)) |
||
2242 | L2_DIFF_CORE((%0, %2, 4) , (%1, %2, 4)) |
||
2243 | L2_DIFF_CORE((%0, %%REGd) , (%1, %%REGd)) |
||
2244 | L2_DIFF_CORE((%0, %%REGa,2), (%1, %%REGa,2)) |
||
2245 | L2_DIFF_CORE((%0, %%REGc) , (%1, %%REGc)) |
||
2246 | |||
2247 | #endif //L1_DIFF |
||
2248 | |||
2249 | "movq %%mm0, %%mm4 \n\t" |
||
2250 | "psrlq $32, %%mm0 \n\t" |
||
2251 | "paddd %%mm0, %%mm4 \n\t" |
||
2252 | "movd %%mm4, %%ecx \n\t" |
||
2253 | "shll $2, %%ecx \n\t" |
||
2254 | "mov %3, %%"REG_d" \n\t" |
||
2255 | "addl -4(%%"REG_d"), %%ecx \n\t" |
||
2256 | "addl 4(%%"REG_d"), %%ecx \n\t" |
||
2257 | "addl -1024(%%"REG_d"), %%ecx \n\t" |
||
2258 | "addl $4, %%ecx \n\t" |
||
2259 | "addl 1024(%%"REG_d"), %%ecx \n\t" |
||
2260 | "shrl $3, %%ecx \n\t" |
||
2261 | "movl %%ecx, (%%"REG_d") \n\t" |
||
2262 | |||
2263 | // "mov %3, %%"REG_c" \n\t" |
||
2264 | // "mov %%"REG_c", test \n\t" |
||
2265 | // "jmp 4f \n\t" |
||
2266 | "cmpl 512(%%"REG_d"), %%ecx \n\t" |
||
2267 | " jb 2f \n\t" |
||
2268 | "cmpl 516(%%"REG_d"), %%ecx \n\t" |
||
2269 | " jb 1f \n\t" |
||
2270 | |||
2271 | "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride |
||
2272 | "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride |
||
2273 | "movq (%0), %%mm0 \n\t" // L0 |
||
2274 | "movq (%0, %2), %%mm1 \n\t" // L1 |
||
2275 | "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
||
2276 | "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 |
||
2277 | "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
||
2278 | "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 |
||
2279 | "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 |
||
2280 | "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 |
||
2281 | "movq %%mm0, (%1) \n\t" // L0 |
||
2282 | "movq %%mm1, (%1, %2) \n\t" // L1 |
||
2283 | "movq %%mm2, (%1, %2, 2) \n\t" // L2 |
||
2284 | "movq %%mm3, (%1, %%"REG_a") \n\t" // L3 |
||
2285 | "movq %%mm4, (%1, %2, 4) \n\t" // L4 |
||
2286 | "movq %%mm5, (%1, %%"REG_d") \n\t" // L5 |
||
2287 | "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // L6 |
||
2288 | "movq %%mm7, (%1, %%"REG_c") \n\t" // L7 |
||
2289 | "jmp 4f \n\t" |
||
2290 | |||
2291 | "1: \n\t" |
||
2292 | "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride |
||
2293 | "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride |
||
2294 | "movq (%0), %%mm0 \n\t" // L0 |
||
2295 | PAVGB((%1), %%mm0) // L0 |
||
2296 | "movq (%0, %2), %%mm1 \n\t" // L1 |
||
2297 | PAVGB((%1, %2), %%mm1) // L1 |
||
2298 | "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
||
2299 | PAVGB((%1, %2, 2), %%mm2) // L2 |
||
2300 | "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 |
||
2301 | PAVGB((%1, %%REGa), %%mm3) // L3 |
||
2302 | "movq (%0, %2, 4), %%mm4 \n\t" // L4 |
||
2303 | PAVGB((%1, %2, 4), %%mm4) // L4 |
||
2304 | "movq (%0, %%"REG_d"), %%mm5 \n\t" // L5 |
||
2305 | PAVGB((%1, %%REGd), %%mm5) // L5 |
||
2306 | "movq (%0, %%"REG_a", 2), %%mm6 \n\t" // L6 |
||
2307 | PAVGB((%1, %%REGa, 2), %%mm6) // L6 |
||
2308 | "movq (%0, %%"REG_c"), %%mm7 \n\t" // L7 |
||
2309 | PAVGB((%1, %%REGc), %%mm7) // L7 |
||
2310 | "movq %%mm0, (%1) \n\t" // R0 |
||
2311 | "movq %%mm1, (%1, %2) \n\t" // R1 |
||
2312 | "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
||
2313 | "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 |
||
2314 | "movq %%mm4, (%1, %2, 4) \n\t" // R4 |
||
2315 | "movq %%mm5, (%1, %%"REG_d") \n\t" // R5 |
||
2316 | "movq %%mm6, (%1, %%"REG_a", 2) \n\t" // R6 |
||
2317 | "movq %%mm7, (%1, %%"REG_c") \n\t" // R7 |
||
2318 | "movq %%mm0, (%0) \n\t" // L0 |
||
2319 | "movq %%mm1, (%0, %2) \n\t" // L1 |
||
2320 | "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
||
2321 | "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 |
||
2322 | "movq %%mm4, (%0, %2, 4) \n\t" // L4 |
||
2323 | "movq %%mm5, (%0, %%"REG_d") \n\t" // L5 |
||
2324 | "movq %%mm6, (%0, %%"REG_a", 2) \n\t" // L6 |
||
2325 | "movq %%mm7, (%0, %%"REG_c") \n\t" // L7 |
||
2326 | "jmp 4f \n\t" |
||
2327 | |||
2328 | "2: \n\t" |
||
2329 | "cmpl 508(%%"REG_d"), %%ecx \n\t" |
||
2330 | " jb 3f \n\t" |
||
2331 | |||
2332 | "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride |
||
2333 | "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride |
||
2334 | "movq (%0), %%mm0 \n\t" // L0 |
||
2335 | "movq (%0, %2), %%mm1 \n\t" // L1 |
||
2336 | "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
||
2337 | "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 |
||
2338 | "movq (%1), %%mm4 \n\t" // R0 |
||
2339 | "movq (%1, %2), %%mm5 \n\t" // R1 |
||
2340 | "movq (%1, %2, 2), %%mm6 \n\t" // R2 |
||
2341 | "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3 |
||
2342 | PAVGB(%%mm4, %%mm0) |
||
2343 | PAVGB(%%mm5, %%mm1) |
||
2344 | PAVGB(%%mm6, %%mm2) |
||
2345 | PAVGB(%%mm7, %%mm3) |
||
2346 | PAVGB(%%mm4, %%mm0) |
||
2347 | PAVGB(%%mm5, %%mm1) |
||
2348 | PAVGB(%%mm6, %%mm2) |
||
2349 | PAVGB(%%mm7, %%mm3) |
||
2350 | "movq %%mm0, (%1) \n\t" // R0 |
||
2351 | "movq %%mm1, (%1, %2) \n\t" // R1 |
||
2352 | "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
||
2353 | "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 |
||
2354 | "movq %%mm0, (%0) \n\t" // L0 |
||
2355 | "movq %%mm1, (%0, %2) \n\t" // L1 |
||
2356 | "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
||
2357 | "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 |
||
2358 | |||
2359 | "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
||
2360 | "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5 |
||
2361 | "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6 |
||
2362 | "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7 |
||
2363 | "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
||
2364 | "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5 |
||
2365 | "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6 |
||
2366 | "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7 |
||
2367 | PAVGB(%%mm4, %%mm0) |
||
2368 | PAVGB(%%mm5, %%mm1) |
||
2369 | PAVGB(%%mm6, %%mm2) |
||
2370 | PAVGB(%%mm7, %%mm3) |
||
2371 | PAVGB(%%mm4, %%mm0) |
||
2372 | PAVGB(%%mm5, %%mm1) |
||
2373 | PAVGB(%%mm6, %%mm2) |
||
2374 | PAVGB(%%mm7, %%mm3) |
||
2375 | "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
||
2376 | "movq %%mm1, (%1, %%"REG_d") \n\t" // R5 |
||
2377 | "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6 |
||
2378 | "movq %%mm3, (%1, %%"REG_c") \n\t" // R7 |
||
2379 | "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
||
2380 | "movq %%mm1, (%0, %%"REG_d") \n\t" // L5 |
||
2381 | "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6 |
||
2382 | "movq %%mm3, (%0, %%"REG_c") \n\t" // L7 |
||
2383 | "jmp 4f \n\t" |
||
2384 | |||
2385 | "3: \n\t" |
||
2386 | "lea (%%"REG_a", %2, 2), %%"REG_d" \n\t" // 5*stride |
||
2387 | "lea (%%"REG_d", %2, 2), %%"REG_c" \n\t" // 7*stride |
||
2388 | "movq (%0), %%mm0 \n\t" // L0 |
||
2389 | "movq (%0, %2), %%mm1 \n\t" // L1 |
||
2390 | "movq (%0, %2, 2), %%mm2 \n\t" // L2 |
||
2391 | "movq (%0, %%"REG_a"), %%mm3 \n\t" // L3 |
||
2392 | "movq (%1), %%mm4 \n\t" // R0 |
||
2393 | "movq (%1, %2), %%mm5 \n\t" // R1 |
||
2394 | "movq (%1, %2, 2), %%mm6 \n\t" // R2 |
||
2395 | "movq (%1, %%"REG_a"), %%mm7 \n\t" // R3 |
||
2396 | PAVGB(%%mm4, %%mm0) |
||
2397 | PAVGB(%%mm5, %%mm1) |
||
2398 | PAVGB(%%mm6, %%mm2) |
||
2399 | PAVGB(%%mm7, %%mm3) |
||
2400 | PAVGB(%%mm4, %%mm0) |
||
2401 | PAVGB(%%mm5, %%mm1) |
||
2402 | PAVGB(%%mm6, %%mm2) |
||
2403 | PAVGB(%%mm7, %%mm3) |
||
2404 | PAVGB(%%mm4, %%mm0) |
||
2405 | PAVGB(%%mm5, %%mm1) |
||
2406 | PAVGB(%%mm6, %%mm2) |
||
2407 | PAVGB(%%mm7, %%mm3) |
||
2408 | "movq %%mm0, (%1) \n\t" // R0 |
||
2409 | "movq %%mm1, (%1, %2) \n\t" // R1 |
||
2410 | "movq %%mm2, (%1, %2, 2) \n\t" // R2 |
||
2411 | "movq %%mm3, (%1, %%"REG_a") \n\t" // R3 |
||
2412 | "movq %%mm0, (%0) \n\t" // L0 |
||
2413 | "movq %%mm1, (%0, %2) \n\t" // L1 |
||
2414 | "movq %%mm2, (%0, %2, 2) \n\t" // L2 |
||
2415 | "movq %%mm3, (%0, %%"REG_a") \n\t" // L3 |
||
2416 | |||
2417 | "movq (%0, %2, 4), %%mm0 \n\t" // L4 |
||
2418 | "movq (%0, %%"REG_d"), %%mm1 \n\t" // L5 |
||
2419 | "movq (%0, %%"REG_a", 2), %%mm2 \n\t" // L6 |
||
2420 | "movq (%0, %%"REG_c"), %%mm3 \n\t" // L7 |
||
2421 | "movq (%1, %2, 4), %%mm4 \n\t" // R4 |
||
2422 | "movq (%1, %%"REG_d"), %%mm5 \n\t" // R5 |
||
2423 | "movq (%1, %%"REG_a", 2), %%mm6 \n\t" // R6 |
||
2424 | "movq (%1, %%"REG_c"), %%mm7 \n\t" // R7 |
||
2425 | PAVGB(%%mm4, %%mm0) |
||
2426 | PAVGB(%%mm5, %%mm1) |
||
2427 | PAVGB(%%mm6, %%mm2) |
||
2428 | PAVGB(%%mm7, %%mm3) |
||
2429 | PAVGB(%%mm4, %%mm0) |
||
2430 | PAVGB(%%mm5, %%mm1) |
||
2431 | PAVGB(%%mm6, %%mm2) |
||
2432 | PAVGB(%%mm7, %%mm3) |
||
2433 | PAVGB(%%mm4, %%mm0) |
||
2434 | PAVGB(%%mm5, %%mm1) |
||
2435 | PAVGB(%%mm6, %%mm2) |
||
2436 | PAVGB(%%mm7, %%mm3) |
||
2437 | "movq %%mm0, (%1, %2, 4) \n\t" // R4 |
||
2438 | "movq %%mm1, (%1, %%"REG_d") \n\t" // R5 |
||
2439 | "movq %%mm2, (%1, %%"REG_a", 2) \n\t" // R6 |
||
2440 | "movq %%mm3, (%1, %%"REG_c") \n\t" // R7 |
||
2441 | "movq %%mm0, (%0, %2, 4) \n\t" // L4 |
||
2442 | "movq %%mm1, (%0, %%"REG_d") \n\t" // L5 |
||
2443 | "movq %%mm2, (%0, %%"REG_a", 2) \n\t" // L6 |
||
2444 | "movq %%mm3, (%0, %%"REG_c") \n\t" // L7 |
||
2445 | |||
2446 | "4: \n\t" |
||
2447 | |||
2448 | :: "r" (src), "r" (tempBlurred), "r"((x86_reg)stride), "m" (tempBlurredPast) |
||
2449 | : "%"REG_a, "%"REG_d, "%"REG_c, "memory" |
||
2450 | ); |
||
2451 | #else //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
2452 | { |
||
2453 | int y; |
||
2454 | int d=0; |
||
2455 | // int sysd=0; |
||
2456 | int i; |
||
2457 | |||
2458 | for(y=0; y<8; y++){ |
||
2459 | int x; |
||
2460 | for(x=0; x<8; x++){ |
||
2461 | int ref= tempBlurred[ x + y*stride ]; |
||
2462 | int cur= src[ x + y*stride ]; |
||
2463 | int d1=ref - cur; |
||
2464 | // if(x==0 || x==7) d1+= d1>>1; |
||
2465 | // if(y==0 || y==7) d1+= d1>>1; |
||
2466 | // d+= FFABS(d1); |
||
2467 | d+= d1*d1; |
||
2468 | // sysd+= d1; |
||
2469 | } |
||
2470 | } |
||
2471 | i=d; |
||
2472 | d= ( |
||
2473 | 4*d |
||
2474 | +(*(tempBlurredPast-256)) |
||
2475 | +(*(tempBlurredPast-1))+ (*(tempBlurredPast+1)) |
||
2476 | +(*(tempBlurredPast+256)) |
||
2477 | +4)>>3; |
||
2478 | *tempBlurredPast=i; |
||
2479 | // ((*tempBlurredPast)*3 + d + 2)>>2; |
||
2480 | |||
2481 | /* |
||
2482 | Switch between |
||
2483 | 1 0 0 0 0 0 0 (0) |
||
2484 | 64 32 16 8 4 2 1 (1) |
||
2485 | 64 48 36 27 20 15 11 (33) (approx) |
||
2486 | 64 56 49 43 37 33 29 (200) (approx) |
||
2487 | */ |
||
2488 | if(d > maxNoise[1]){ |
||
2489 | if(d < maxNoise[2]){ |
||
2490 | for(y=0; y<8; y++){ |
||
2491 | int x; |
||
2492 | for(x=0; x<8; x++){ |
||
2493 | int ref= tempBlurred[ x + y*stride ]; |
||
2494 | int cur= src[ x + y*stride ]; |
||
2495 | tempBlurred[ x + y*stride ]= |
||
2496 | src[ x + y*stride ]= |
||
2497 | (ref + cur + 1)>>1; |
||
2498 | } |
||
2499 | } |
||
2500 | }else{ |
||
2501 | for(y=0; y<8; y++){ |
||
2502 | int x; |
||
2503 | for(x=0; x<8; x++){ |
||
2504 | tempBlurred[ x + y*stride ]= src[ x + y*stride ]; |
||
2505 | } |
||
2506 | } |
||
2507 | } |
||
2508 | }else{ |
||
2509 | if(d < maxNoise[0]){ |
||
2510 | for(y=0; y<8; y++){ |
||
2511 | int x; |
||
2512 | for(x=0; x<8; x++){ |
||
2513 | int ref= tempBlurred[ x + y*stride ]; |
||
2514 | int cur= src[ x + y*stride ]; |
||
2515 | tempBlurred[ x + y*stride ]= |
||
2516 | src[ x + y*stride ]= |
||
2517 | (ref*7 + cur + 4)>>3; |
||
2518 | } |
||
2519 | } |
||
2520 | }else{ |
||
2521 | for(y=0; y<8; y++){ |
||
2522 | int x; |
||
2523 | for(x=0; x<8; x++){ |
||
2524 | int ref= tempBlurred[ x + y*stride ]; |
||
2525 | int cur= src[ x + y*stride ]; |
||
2526 | tempBlurred[ x + y*stride ]= |
||
2527 | src[ x + y*stride ]= |
||
2528 | (ref*3 + cur + 2)>>2; |
||
2529 | } |
||
2530 | } |
||
2531 | } |
||
2532 | } |
||
2533 | } |
||
2534 | #endif //TEMPLATE_PP_MMXEXT || TEMPLATE_PP_3DNOW |
||
2535 | } |
||
2536 | #endif //TEMPLATE_PP_ALTIVEC |
||
2537 | |||
2538 | #if TEMPLATE_PP_MMX |
||
2539 | /** |
||
2540 | * Accurate deblock filter (MMX inline-assembly implementation). src is advanced by 3*step on entry; step is the byte distance between consecutive lines across the filtered edge and is the only address increment used by the asm (the stride parameter is not referenced in this body). Reads c->mmxDcOffset[] / c->mmxDcThreshold[] (indexed by c->nonBQP), c->pQPb and c->ppMode.flatnessThreshold, and modifies the pixels at src in place. If (dc_mask & eq_mask) is non-zero, a first pass builds running sums in sums[] and rewrites 8 lines in the byte lanes selected by that mask; if eq_mask is not all ones, a second pass adds/subtracts a correction to lines 3 and 4 (relative to temp_src) in the byte lanes where eq_mask is clear. |
||
2541 | */ |
||
2542 | static av_always_inline void RENAME(do_a_deblock)(uint8_t *src, int step, int stride, PPContext *c){ |
||
2543 | int64_t dc_mask, eq_mask, both_masks; |
||
2544 | int64_t sums[10*8*2]; /* scratch for the running sums; the asm below stores 10 rows of 16 bytes (byte offsets 0..152) */ |
||
2545 | src+= step*3; // src points to begin of the 8x8 Block |
||
2546 | //{ START_TIMER |
||
2547 | __asm__ volatile( |
||
2548 | "movq %0, %%mm7 \n\t" |
||
2549 | "movq %1, %%mm6 \n\t" |
||
2550 | : : "m" (c->mmxDcOffset[c->nonBQP]), "m" (c->mmxDcThreshold[c->nonBQP]) |
||
2551 | ); |
||
2552 | |||
2553 | __asm__ volatile( |
||
2554 | "lea (%2, %3), %%"REG_a" \n\t" |
||
2555 | // 0 1 2 3 4 5 6 7 8 9 |
||
2556 | // %2 eax eax+%3 eax+2%3 %2+4%3 eax' eax'+%3 eax'+2%3 %2+8%3 eax'+4%3 (operands here: %2=src, %3=step; eax' = eax+4*%3, set by the lea further down) |
||
2557 | |||
2558 | "movq (%2), %%mm0 \n\t" |
||
2559 | "movq (%%"REG_a"), %%mm1 \n\t" |
||
2560 | "movq %%mm1, %%mm3 \n\t" |
||
2561 | "movq %%mm1, %%mm4 \n\t" |
||
2562 | "psubb %%mm1, %%mm0 \n\t" // mm0 = difference |
||
2563 | "paddb %%mm7, %%mm0 \n\t" |
||
2564 | "pcmpgtb %%mm6, %%mm0 \n\t" |
||
2565 | |||
2566 | "movq (%%"REG_a",%3), %%mm2 \n\t" |
||
2567 | PMAXUB(%%mm2, %%mm4) |
||
2568 | PMINUB(%%mm2, %%mm3, %%mm5) |
||
2569 | "psubb %%mm2, %%mm1 \n\t" |
||
2570 | "paddb %%mm7, %%mm1 \n\t" |
||
2571 | "pcmpgtb %%mm6, %%mm1 \n\t" |
||
2572 | "paddb %%mm1, %%mm0 \n\t" |
||
2573 | |||
2574 | "movq (%%"REG_a", %3, 2), %%mm1 \n\t" |
||
2575 | PMAXUB(%%mm1, %%mm4) |
||
2576 | PMINUB(%%mm1, %%mm3, %%mm5) |
||
2577 | "psubb %%mm1, %%mm2 \n\t" |
||
2578 | "paddb %%mm7, %%mm2 \n\t" |
||
2579 | "pcmpgtb %%mm6, %%mm2 \n\t" |
||
2580 | "paddb %%mm2, %%mm0 \n\t" |
||
2581 | |||
2582 | "lea (%%"REG_a", %3, 4), %%"REG_a" \n\t" |
||
2583 | |||
2584 | "movq (%2, %3, 4), %%mm2 \n\t" |
||
2585 | PMAXUB(%%mm2, %%mm4) |
||
2586 | PMINUB(%%mm2, %%mm3, %%mm5) |
||
2587 | "psubb %%mm2, %%mm1 \n\t" |
||
2588 | "paddb %%mm7, %%mm1 \n\t" |
||
2589 | "pcmpgtb %%mm6, %%mm1 \n\t" |
||
2590 | "paddb %%mm1, %%mm0 \n\t" |
||
2591 | |||
2592 | "movq (%%"REG_a"), %%mm1 \n\t" |
||
2593 | PMAXUB(%%mm1, %%mm4) |
||
2594 | PMINUB(%%mm1, %%mm3, %%mm5) |
||
2595 | "psubb %%mm1, %%mm2 \n\t" |
||
2596 | "paddb %%mm7, %%mm2 \n\t" |
||
2597 | "pcmpgtb %%mm6, %%mm2 \n\t" |
||
2598 | "paddb %%mm2, %%mm0 \n\t" |
||
2599 | |||
2600 | "movq (%%"REG_a", %3), %%mm2 \n\t" |
||
2601 | PMAXUB(%%mm2, %%mm4) |
||
2602 | PMINUB(%%mm2, %%mm3, %%mm5) |
||
2603 | "psubb %%mm2, %%mm1 \n\t" |
||
2604 | "paddb %%mm7, %%mm1 \n\t" |
||
2605 | "pcmpgtb %%mm6, %%mm1 \n\t" |
||
2606 | "paddb %%mm1, %%mm0 \n\t" |
||
2607 | |||
2608 | "movq (%%"REG_a", %3, 2), %%mm1 \n\t" |
||
2609 | PMAXUB(%%mm1, %%mm4) |
||
2610 | PMINUB(%%mm1, %%mm3, %%mm5) |
||
2611 | "psubb %%mm1, %%mm2 \n\t" |
||
2612 | "paddb %%mm7, %%mm2 \n\t" |
||
2613 | "pcmpgtb %%mm6, %%mm2 \n\t" |
||
2614 | "paddb %%mm2, %%mm0 \n\t" |
||
2615 | |||
2616 | "movq (%2, %3, 8), %%mm2 \n\t" |
||
2617 | PMAXUB(%%mm2, %%mm4) |
||
2618 | PMINUB(%%mm2, %%mm3, %%mm5) |
||
2619 | "psubb %%mm2, %%mm1 \n\t" |
||
2620 | "paddb %%mm7, %%mm1 \n\t" |
||
2621 | "pcmpgtb %%mm6, %%mm1 \n\t" |
||
2622 | "paddb %%mm1, %%mm0 \n\t" |
||
2623 | |||
2624 | "movq (%%"REG_a", %3, 4), %%mm1 \n\t" |
||
2625 | "psubb %%mm1, %%mm2 \n\t" |
||
2626 | "paddb %%mm7, %%mm2 \n\t" |
||
2627 | "pcmpgtb %%mm6, %%mm2 \n\t" |
||
2628 | "paddb %%mm2, %%mm0 \n\t" |
||
2629 | "psubusb %%mm3, %%mm4 \n\t" |
||
2630 | |||
2631 | "pxor %%mm6, %%mm6 \n\t" |
||
2632 | "movq %4, %%mm7 \n\t" // QP,..., QP |
||
2633 | "paddusb %%mm7, %%mm7 \n\t" // 2QP ... 2QP |
||
2634 | "psubusb %%mm4, %%mm7 \n\t" // Diff >=2QP -> 0 |
||
2635 | "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> 0 |
||
2636 | "pcmpeqb %%mm6, %%mm7 \n\t" // Diff < 2QP -> FF (second compare against 0 inverts the first) |
||
2637 | "movq %%mm7, %1 \n\t" |
||
2638 | |||
2639 | "movq %5, %%mm7 \n\t" |
||
2640 | "punpcklbw %%mm7, %%mm7 \n\t" |
||
2641 | "punpcklbw %%mm7, %%mm7 \n\t" |
||
2642 | "punpcklbw %%mm7, %%mm7 \n\t" |
||
2643 | "psubb %%mm0, %%mm6 \n\t" |
||
2644 | "pcmpgtb %%mm7, %%mm6 \n\t" |
||
2645 | "movq %%mm6, %0 \n\t" |
||
2646 | |||
2647 | : "=m" (eq_mask), "=m" (dc_mask) |
||
2648 | : "r" (src), "r" ((x86_reg)step), "m" (c->pQPb), "m"(c->ppMode.flatnessThreshold) |
||
2649 | : "%"REG_a |
||
2650 | ); |
||
2651 | |||
2652 | both_masks = dc_mask & eq_mask; |
||
2653 | |||
2654 | if(both_masks){ |
||
2655 | x86_reg offset= -8*step; |
||
2656 | int64_t *temp_sums= sums; |
||
2657 | |||
2658 | __asm__ volatile( |
||
2659 | "movq %2, %%mm0 \n\t" // QP,..., QP |
||
2660 | "pxor %%mm4, %%mm4 \n\t" |
||
2661 | |||
2662 | "movq (%0), %%mm6 \n\t" |
||
2663 | "movq (%0, %1), %%mm5 \n\t" |
||
2664 | "movq %%mm5, %%mm1 \n\t" |
||
2665 | "movq %%mm6, %%mm2 \n\t" |
||
2666 | "psubusb %%mm6, %%mm5 \n\t" |
||
2667 | "psubusb %%mm1, %%mm2 \n\t" |
||
2668 | "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
||
2669 | "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 |
||
2670 | "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF |
||
2671 | |||
2672 | "pxor %%mm6, %%mm1 \n\t" |
||
2673 | "pand %%mm0, %%mm1 \n\t" |
||
2674 | "pxor %%mm1, %%mm6 \n\t" |
||
2675 | // 0:QP 6:First |
||
2676 | |||
2677 | "movq (%0, %1, 8), %%mm5 \n\t" |
||
2678 | "add %1, %0 \n\t" // %0 points to line 1 not 0 |
||
2679 | "movq (%0, %1, 8), %%mm7 \n\t" |
||
2680 | "movq %%mm5, %%mm1 \n\t" |
||
2681 | "movq %%mm7, %%mm2 \n\t" |
||
2682 | "psubusb %%mm7, %%mm5 \n\t" |
||
2683 | "psubusb %%mm1, %%mm2 \n\t" |
||
2684 | "por %%mm5, %%mm2 \n\t" // ABS Diff of lines |
||
2685 | "movq %2, %%mm0 \n\t" // QP,..., QP |
||
2686 | "psubusb %%mm2, %%mm0 \n\t" // diff >= QP -> 0 |
||
2687 | "pcmpeqb %%mm4, %%mm0 \n\t" // diff >= QP -> FF |
||
2688 | |||
2689 | "pxor %%mm7, %%mm1 \n\t" |
||
2690 | "pand %%mm0, %%mm1 \n\t" |
||
2691 | "pxor %%mm1, %%mm7 \n\t" |
||
2692 | |||
2693 | "movq %%mm6, %%mm5 \n\t" |
||
2694 | "punpckhbw %%mm4, %%mm6 \n\t" |
||
2695 | "punpcklbw %%mm4, %%mm5 \n\t" |
||
2696 | // 4:0 5/6:First 7:Last |
||
2697 | |||
2698 | "movq %%mm5, %%mm0 \n\t" |
||
2699 | "movq %%mm6, %%mm1 \n\t" |
||
2700 | "psllw $2, %%mm0 \n\t" |
||
2701 | "psllw $2, %%mm1 \n\t" |
||
2702 | "paddw "MANGLE(w04)", %%mm0 \n\t" |
||
2703 | "paddw "MANGLE(w04)", %%mm1 \n\t" |
||
2704 | |||
2705 | #define NEXT\ |
||
2706 | "movq (%0), %%mm2 \n\t"\ |
||
2707 | "movq (%0), %%mm3 \n\t"\ |
||
2708 | "add %1, %0 \n\t"\ |
||
2709 | "punpcklbw %%mm4, %%mm2 \n\t"\ |
||
2710 | "punpckhbw %%mm4, %%mm3 \n\t"\ |
||
2711 | "paddw %%mm2, %%mm0 \n\t"\ |
||
2712 | "paddw %%mm3, %%mm1 \n\t" |
||
2713 | |||
2714 | #define PREV\ |
||
2715 | "movq (%0), %%mm2 \n\t"\ |
||
2716 | "movq (%0), %%mm3 \n\t"\ |
||
2717 | "add %1, %0 \n\t"\ |
||
2718 | "punpcklbw %%mm4, %%mm2 \n\t"\ |
||
2719 | "punpckhbw %%mm4, %%mm3 \n\t"\ |
||
2720 | "psubw %%mm2, %%mm0 \n\t"\ |
||
2721 | "psubw %%mm3, %%mm1 \n\t" |
||
2722 | |||
2723 | |||
2724 | NEXT //0 |
||
2725 | NEXT //1 |
||
2726 | NEXT //2 |
||
2727 | "movq %%mm0, (%3) \n\t" |
||
2728 | "movq %%mm1, 8(%3) \n\t" |
||
2729 | |||
2730 | NEXT //3 |
||
2731 | "psubw %%mm5, %%mm0 \n\t" |
||
2732 | "psubw %%mm6, %%mm1 \n\t" |
||
2733 | "movq %%mm0, 16(%3) \n\t" |
||
2734 | "movq %%mm1, 24(%3) \n\t" |
||
2735 | |||
2736 | NEXT //4 |
||
2737 | "psubw %%mm5, %%mm0 \n\t" |
||
2738 | "psubw %%mm6, %%mm1 \n\t" |
||
2739 | "movq %%mm0, 32(%3) \n\t" |
||
2740 | "movq %%mm1, 40(%3) \n\t" |
||
2741 | |||
2742 | NEXT //5 |
||
2743 | "psubw %%mm5, %%mm0 \n\t" |
||
2744 | "psubw %%mm6, %%mm1 \n\t" |
||
2745 | "movq %%mm0, 48(%3) \n\t" |
||
2746 | "movq %%mm1, 56(%3) \n\t" |
||
2747 | |||
2748 | NEXT //6 |
||
2749 | "psubw %%mm5, %%mm0 \n\t" |
||
2750 | "psubw %%mm6, %%mm1 \n\t" |
||
2751 | "movq %%mm0, 64(%3) \n\t" |
||
2752 | "movq %%mm1, 72(%3) \n\t" |
||
2753 | |||
2754 | "movq %%mm7, %%mm6 \n\t" |
||
2755 | "punpckhbw %%mm4, %%mm7 \n\t" |
||
2756 | "punpcklbw %%mm4, %%mm6 \n\t" |
||
2757 | |||
2758 | NEXT //7 |
||
2759 | "mov %4, %0 \n\t" |
||
2760 | "add %1, %0 \n\t" |
||
2761 | PREV //0 |
||
2762 | "movq %%mm0, 80(%3) \n\t" |
||
2763 | "movq %%mm1, 88(%3) \n\t" |
||
2764 | |||
2765 | PREV //1 |
||
2766 | "paddw %%mm6, %%mm0 \n\t" |
||
2767 | "paddw %%mm7, %%mm1 \n\t" |
||
2768 | "movq %%mm0, 96(%3) \n\t" |
||
2769 | "movq %%mm1, 104(%3) \n\t" |
||
2770 | |||
2771 | PREV //2 |
||
2772 | "paddw %%mm6, %%mm0 \n\t" |
||
2773 | "paddw %%mm7, %%mm1 \n\t" |
||
2774 | "movq %%mm0, 112(%3) \n\t" |
||
2775 | "movq %%mm1, 120(%3) \n\t" |
||
2776 | |||
2777 | PREV //3 |
||
2778 | "paddw %%mm6, %%mm0 \n\t" |
||
2779 | "paddw %%mm7, %%mm1 \n\t" |
||
2780 | "movq %%mm0, 128(%3) \n\t" |
||
2781 | "movq %%mm1, 136(%3) \n\t" |
||
2782 | |||
2783 | PREV //4 |
||
2784 | "paddw %%mm6, %%mm0 \n\t" |
||
2785 | "paddw %%mm7, %%mm1 \n\t" |
||
2786 | "movq %%mm0, 144(%3) \n\t" |
||
2787 | "movq %%mm1, 152(%3) \n\t" |
||
2788 | |||
2789 | "mov %4, %0 \n\t" //FIXME (reloads %0, i.e. src, from the saved copy passed as operand %4) |
||
2790 | |||
2791 | : "+&r"(src) |
||
2792 | : "r" ((x86_reg)step), "m" (c->pQPb), "r"(sums), "g"(src) |
||
2793 | ); |
||
2794 | |||
2795 | src+= step; // src points to begin of the 8x8 Block |
||
2796 | |||
2797 | __asm__ volatile( |
||
2798 | "movq %4, %%mm6 \n\t" |
||
2799 | "pcmpeqb %%mm5, %%mm5 \n\t" |
||
2800 | "pxor %%mm6, %%mm5 \n\t" |
||
2801 | "pxor %%mm7, %%mm7 \n\t" |
||
2802 | |||
2803 | "1: \n\t" |
||
2804 | "movq (%1), %%mm0 \n\t" |
||
2805 | "movq 8(%1), %%mm1 \n\t" |
||
2806 | "paddw 32(%1), %%mm0 \n\t" |
||
2807 | "paddw 40(%1), %%mm1 \n\t" |
||
2808 | "movq (%0, %3), %%mm2 \n\t" |
||
2809 | "movq %%mm2, %%mm3 \n\t" |
||
2810 | "movq %%mm2, %%mm4 \n\t" |
||
2811 | "punpcklbw %%mm7, %%mm2 \n\t" |
||
2812 | "punpckhbw %%mm7, %%mm3 \n\t" |
||
2813 | "paddw %%mm2, %%mm0 \n\t" |
||
2814 | "paddw %%mm3, %%mm1 \n\t" |
||
2815 | "paddw %%mm2, %%mm0 \n\t" |
||
2816 | "paddw %%mm3, %%mm1 \n\t" |
||
2817 | "psrlw $4, %%mm0 \n\t" |
||
2818 | "psrlw $4, %%mm1 \n\t" |
||
2819 | "packuswb %%mm1, %%mm0 \n\t" |
||
2820 | "pand %%mm6, %%mm0 \n\t" |
||
2821 | "pand %%mm5, %%mm4 \n\t" |
||
2822 | "por %%mm4, %%mm0 \n\t" |
||
2823 | "movq %%mm0, (%0, %3) \n\t" |
||
2824 | "add $16, %1 \n\t" |
||
2825 | "add %2, %0 \n\t" |
||
2826 | " js 1b \n\t" |
||
2827 | |||
2828 | : "+r"(offset), "+r"(temp_sums) |
||
2829 | : "r" ((x86_reg)step), "r"(src - offset), "m"(both_masks) |
||
2830 | ); |
||
2831 | }else |
||
2832 | src+= step; // src points to begin of the 8x8 Block |
||
2833 | |||
2834 | if(eq_mask != -1LL){ |
||
2835 | uint8_t *temp_src= src; |
||
2836 | DECLARE_ALIGNED(8, uint64_t, tmp)[4]; // make space for 4 8-byte vars |
||
2837 | __asm__ volatile( |
||
2838 | "pxor %%mm7, %%mm7 \n\t" |
||
2839 | // line addressing used in this block (%0=temp_src, %1=step, eax=%0+2*%1): |
||
2840 | // 0:(%0) 1:(%0,%1) 2:(eax) 3:(eax,%1) 4:(eax,%1,2); then %0 is reset to eax+%1 (line 3): 5:(%0,%1,2) 6:(eax,%1,4) 7:(%0,%1,4) |
||
2841 | |||
2842 | "movq (%0), %%mm0 \n\t" |
||
2843 | "movq %%mm0, %%mm1 \n\t" |
||
2844 | "punpcklbw %%mm7, %%mm0 \n\t" // low part of line 0 |
||
2845 | "punpckhbw %%mm7, %%mm1 \n\t" // high part of line 0 |
||
2846 | |||
2847 | "movq (%0, %1), %%mm2 \n\t" |
||
2848 | "lea (%0, %1, 2), %%"REG_a" \n\t" |
||
2849 | "movq %%mm2, %%mm3 \n\t" |
||
2850 | "punpcklbw %%mm7, %%mm2 \n\t" // low part of line 1 |
||
2851 | "punpckhbw %%mm7, %%mm3 \n\t" // high part of line 1 |
||
2852 | |||
2853 | "movq (%%"REG_a"), %%mm4 \n\t" |
||
2854 | "movq %%mm4, %%mm5 \n\t" |
||
2855 | "punpcklbw %%mm7, %%mm4 \n\t" // low part of line 2 |
||
2856 | "punpckhbw %%mm7, %%mm5 \n\t" // high part of line 2 |
||
2857 | |||
2858 | "paddw %%mm0, %%mm0 \n\t" // 2L0 |
||
2859 | "paddw %%mm1, %%mm1 \n\t" // 2H0 |
||
2860 | "psubw %%mm4, %%mm2 \n\t" // L1 - L2 |
||
2861 | "psubw %%mm5, %%mm3 \n\t" // H1 - H2 |
||
2862 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - L1 + L2 |
||
2863 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - H1 + H2 |
||
2864 | |||
2865 | "psllw $2, %%mm2 \n\t" // 4L1 - 4L2 |
||
2866 | "psllw $2, %%mm3 \n\t" // 4H1 - 4H2 |
||
2867 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 |
||
2868 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 |
||
2869 | |||
2870 | "movq (%%"REG_a", %1), %%mm2 \n\t" |
||
2871 | "movq %%mm2, %%mm3 \n\t" |
||
2872 | "punpcklbw %%mm7, %%mm2 \n\t" // L3 |
||
2873 | "punpckhbw %%mm7, %%mm3 \n\t" // H3 |
||
2874 | |||
2875 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - L3 |
||
2876 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - H3 |
||
2877 | "psubw %%mm2, %%mm0 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
||
2878 | "psubw %%mm3, %%mm1 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
||
2879 | "movq %%mm0, (%4) \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
||
2880 | "movq %%mm1, 8(%4) \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
||
2881 | |||
2882 | "movq (%%"REG_a", %1, 2), %%mm0 \n\t" |
||
2883 | "movq %%mm0, %%mm1 \n\t" |
||
2884 | "punpcklbw %%mm7, %%mm0 \n\t" // L4 |
||
2885 | "punpckhbw %%mm7, %%mm1 \n\t" // H4 |
||
2886 | |||
2887 | "psubw %%mm0, %%mm2 \n\t" // L3 - L4 |
||
2888 | "psubw %%mm1, %%mm3 \n\t" // H3 - H4 |
||
2889 | "movq %%mm2, 16(%4) \n\t" // L3 - L4 |
||
2890 | "movq %%mm3, 24(%4) \n\t" // H3 - H4 |
||
2891 | "paddw %%mm4, %%mm4 \n\t" // 2L2 |
||
2892 | "paddw %%mm5, %%mm5 \n\t" // 2H2 |
||
2893 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - L3 + L4 |
||
2894 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - H3 + H4 |
||
2895 | |||
2896 | "lea (%%"REG_a", %1), %0 \n\t" |
||
2897 | "psllw $2, %%mm2 \n\t" // 4L3 - 4L4 |
||
2898 | "psllw $2, %%mm3 \n\t" // 4H3 - 4H4 |
||
2899 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 |
||
2900 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 |
||
2901 | //50 opcodes so far |
||
2902 | "movq (%0, %1, 2), %%mm2 \n\t" |
||
2903 | "movq %%mm2, %%mm3 \n\t" |
||
2904 | "punpcklbw %%mm7, %%mm2 \n\t" // L5 |
||
2905 | "punpckhbw %%mm7, %%mm3 \n\t" // H5 |
||
2906 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - L5 |
||
2907 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - H5 |
||
2908 | "psubw %%mm2, %%mm4 \n\t" // 2L2 - 5L3 + 5L4 - 2L5 |
||
2909 | "psubw %%mm3, %%mm5 \n\t" // 2H2 - 5H3 + 5H4 - 2H5 |
||
2910 | |||
2911 | "movq (%%"REG_a", %1, 4), %%mm6 \n\t" |
||
2912 | "punpcklbw %%mm7, %%mm6 \n\t" // L6 |
||
2913 | "psubw %%mm6, %%mm2 \n\t" // L5 - L6 |
||
2914 | "movq (%%"REG_a", %1, 4), %%mm6 \n\t" |
||
2915 | "punpckhbw %%mm7, %%mm6 \n\t" // H6 |
||
2916 | "psubw %%mm6, %%mm3 \n\t" // H5 - H6 |
||
2917 | |||
2918 | "paddw %%mm0, %%mm0 \n\t" // 2L4 |
||
2919 | "paddw %%mm1, %%mm1 \n\t" // 2H4 |
||
2920 | "psubw %%mm2, %%mm0 \n\t" // 2L4 - L5 + L6 |
||
2921 | "psubw %%mm3, %%mm1 \n\t" // 2H4 - H5 + H6 |
||
2922 | |||
2923 | "psllw $2, %%mm2 \n\t" // 4L5 - 4L6 |
||
2924 | "psllw $2, %%mm3 \n\t" // 4H5 - 4H6 |
||
2925 | "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 |
||
2926 | "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 |
||
2927 | |||
2928 | "movq (%0, %1, 4), %%mm2 \n\t" |
||
2929 | "movq %%mm2, %%mm3 \n\t" |
||
2930 | "punpcklbw %%mm7, %%mm2 \n\t" // L7 |
||
2931 | "punpckhbw %%mm7, %%mm3 \n\t" // H7 |
||
2932 | |||
2933 | "paddw %%mm2, %%mm2 \n\t" // 2L7 |
||
2934 | "paddw %%mm3, %%mm3 \n\t" // 2H7 |
||
2935 | "psubw %%mm2, %%mm0 \n\t" // 2L4 - 5L5 + 5L6 - 2L7 |
||
2936 | "psubw %%mm3, %%mm1 \n\t" // 2H4 - 5H5 + 5H6 - 2H7 |
||
2937 | |||
2938 | "movq (%4), %%mm2 \n\t" // 2L0 - 5L1 + 5L2 - 2L3 |
||
2939 | "movq 8(%4), %%mm3 \n\t" // 2H0 - 5H1 + 5H2 - 2H3 |
||
2940 | |||
2941 | #if TEMPLATE_PP_MMXEXT |
||
2942 | "movq %%mm7, %%mm6 \n\t" // 0 |
||
2943 | "psubw %%mm0, %%mm6 \n\t" |
||
2944 | "pmaxsw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
||
2945 | "movq %%mm7, %%mm6 \n\t" // 0 |
||
2946 | "psubw %%mm1, %%mm6 \n\t" |
||
2947 | "pmaxsw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| |
||
2948 | "movq %%mm7, %%mm6 \n\t" // 0 |
||
2949 | "psubw %%mm2, %%mm6 \n\t" |
||
2950 | "pmaxsw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| |
||
2951 | "movq %%mm7, %%mm6 \n\t" // 0 |
||
2952 | "psubw %%mm3, %%mm6 \n\t" |
||
2953 | "pmaxsw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
||
2954 | #else |
||
2955 | "movq %%mm7, %%mm6 \n\t" // 0 |
||
2956 | "pcmpgtw %%mm0, %%mm6 \n\t" |
||
2957 | "pxor %%mm6, %%mm0 \n\t" |
||
2958 | "psubw %%mm6, %%mm0 \n\t" // |2L4 - 5L5 + 5L6 - 2L7| |
||
2959 | "movq %%mm7, %%mm6 \n\t" // 0 |
||
2960 | "pcmpgtw %%mm1, %%mm6 \n\t" |
||
2961 | "pxor %%mm6, %%mm1 \n\t" |
||
2962 | "psubw %%mm6, %%mm1 \n\t" // |2H4 - 5H5 + 5H6 - 2H7| |
||
2963 | "movq %%mm7, %%mm6 \n\t" // 0 |
||
2964 | "pcmpgtw %%mm2, %%mm6 \n\t" |
||
2965 | "pxor %%mm6, %%mm2 \n\t" |
||
2966 | "psubw %%mm6, %%mm2 \n\t" // |2L0 - 5L1 + 5L2 - 2L3| |
||
2967 | "movq %%mm7, %%mm6 \n\t" // 0 |
||
2968 | "pcmpgtw %%mm3, %%mm6 \n\t" |
||
2969 | "pxor %%mm6, %%mm3 \n\t" |
||
2970 | "psubw %%mm6, %%mm3 \n\t" // |2H0 - 5H1 + 5H2 - 2H3| |
||
2971 | #endif |
||
2972 | |||
2973 | #if TEMPLATE_PP_MMXEXT |
||
2974 | "pminsw %%mm2, %%mm0 \n\t" |
||
2975 | "pminsw %%mm3, %%mm1 \n\t" |
||
2976 | #else |
||
2977 | "movq %%mm0, %%mm6 \n\t" |
||
2978 | "psubusw %%mm2, %%mm6 \n\t" |
||
2979 | "psubw %%mm6, %%mm0 \n\t" |
||
2980 | "movq %%mm1, %%mm6 \n\t" |
||
2981 | "psubusw %%mm3, %%mm6 \n\t" |
||
2982 | "psubw %%mm6, %%mm1 \n\t" |
||
2983 | #endif |
||
2984 | |||
2985 | "movd %2, %%mm2 \n\t" // QP |
||
2986 | "punpcklbw %%mm7, %%mm2 \n\t" |
||
2987 | |||
2988 | "movq %%mm7, %%mm6 \n\t" // 0 |
||
2989 | "pcmpgtw %%mm4, %%mm6 \n\t" // sign(2L2 - 5L3 + 5L4 - 2L5) |
||
2990 | "pxor %%mm6, %%mm4 \n\t" |
||
2991 | "psubw %%mm6, %%mm4 \n\t" // |2L2 - 5L3 + 5L4 - 2L5| |
||
2992 | "pcmpgtw %%mm5, %%mm7 \n\t" // sign(2H2 - 5H3 + 5H4 - 2H5) |
||
2993 | "pxor %%mm7, %%mm5 \n\t" |
||
2994 | "psubw %%mm7, %%mm5 \n\t" // |2H2 - 5H3 + 5H4 - 2H5| |
||
2995 | // 100 opcodes |
||
2996 | "psllw $3, %%mm2 \n\t" // 8QP |
||
2997 | "movq %%mm2, %%mm3 \n\t" // 8QP |
||
2998 | "pcmpgtw %%mm4, %%mm2 \n\t" |
||
2999 | "pcmpgtw %%mm5, %%mm3 \n\t" |
||
3000 | "pand %%mm2, %%mm4 \n\t" |
||
3001 | "pand %%mm3, %%mm5 \n\t" |
||
3002 | |||
3003 | |||
3004 | "psubusw %%mm0, %%mm4 \n\t" // hd |
||
3005 | "psubusw %%mm1, %%mm5 \n\t" // ld |
||
3006 | |||
3007 | |||
3008 | "movq "MANGLE(w05)", %%mm2 \n\t" // 5 |
||
3009 | "pmullw %%mm2, %%mm4 \n\t" |
||
3010 | "pmullw %%mm2, %%mm5 \n\t" |
||
3011 | "movq "MANGLE(w20)", %%mm2 \n\t" // 32 |
||
3012 | "paddw %%mm2, %%mm4 \n\t" |
||
3013 | "paddw %%mm2, %%mm5 \n\t" |
||
3014 | "psrlw $6, %%mm4 \n\t" |
||
3015 | "psrlw $6, %%mm5 \n\t" |
||
3016 | |||
3017 | "movq 16(%4), %%mm0 \n\t" // L3 - L4 |
||
3018 | "movq 24(%4), %%mm1 \n\t" // H3 - H4 |
||
3019 | |||
3020 | "pxor %%mm2, %%mm2 \n\t" |
||
3021 | "pxor %%mm3, %%mm3 \n\t" |
||
3022 | |||
3023 | "pcmpgtw %%mm0, %%mm2 \n\t" // sign (L3-L4) |
||
3024 | "pcmpgtw %%mm1, %%mm3 \n\t" // sign (H3-H4) |
||
3025 | "pxor %%mm2, %%mm0 \n\t" |
||
3026 | "pxor %%mm3, %%mm1 \n\t" |
||
3027 | "psubw %%mm2, %%mm0 \n\t" // |L3-L4| |
||
3028 | "psubw %%mm3, %%mm1 \n\t" // |H3-H4| |
||
3029 | "psrlw $1, %%mm0 \n\t" // |L3 - L4|/2 |
||
3030 | "psrlw $1, %%mm1 \n\t" // |H3 - H4|/2 |
||
3031 | |||
3032 | "pxor %%mm6, %%mm2 \n\t" |
||
3033 | "pxor %%mm7, %%mm3 \n\t" |
||
3034 | "pand %%mm2, %%mm4 \n\t" |
||
3035 | "pand %%mm3, %%mm5 \n\t" |
||
3036 | |||
3037 | #if TEMPLATE_PP_MMXEXT |
||
3038 | "pminsw %%mm0, %%mm4 \n\t" |
||
3039 | "pminsw %%mm1, %%mm5 \n\t" |
||
3040 | #else |
||
3041 | "movq %%mm4, %%mm2 \n\t" |
||
3042 | "psubusw %%mm0, %%mm2 \n\t" |
||
3043 | "psubw %%mm2, %%mm4 \n\t" |
||
3044 | "movq %%mm5, %%mm2 \n\t" |
||
3045 | "psubusw %%mm1, %%mm2 \n\t" |
||
3046 | "psubw %%mm2, %%mm5 \n\t" |
||
3047 | #endif |
||
3048 | "pxor %%mm6, %%mm4 \n\t" |
||
3049 | "pxor %%mm7, %%mm5 \n\t" |
||
3050 | "psubw %%mm6, %%mm4 \n\t" |
||
3051 | "psubw %%mm7, %%mm5 \n\t" |
||
3052 | "packsswb %%mm5, %%mm4 \n\t" |
||
3053 | "movq %3, %%mm1 \n\t" |
||
3054 | "pandn %%mm4, %%mm1 \n\t" |
||
3055 | "movq (%0), %%mm0 \n\t" |
||
3056 | "paddb %%mm1, %%mm0 \n\t" |
||
3057 | "movq %%mm0, (%0) \n\t" |
||
3058 | "movq (%0, %1), %%mm0 \n\t" |
||
3059 | "psubb %%mm1, %%mm0 \n\t" |
||
3060 | "movq %%mm0, (%0, %1) \n\t" |
||
3061 | |||
3062 | : "+r" (temp_src) |
||
3063 | : "r" ((x86_reg)step), "m" (c->pQPb), "m"(eq_mask), "r"(tmp) |
||
3064 | : "%"REG_a |
||
3065 | ); |
||
3066 | } |
||
3067 | /*if(step==16){ |
||
3068 | STOP_TIMER("step16") |
||
3069 | }else{ |
||
3070 | STOP_TIMER("stepX") |
||
3071 | } |
||
3072 | } */ |
||
3073 | } |
||
3074 | #endif //TEMPLATE_PP_MMX |
||
3075 | |||
3076 | static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
||
3077 | const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c); /* forward declaration; the definition appears further down in this file */ |
||
3078 | |||
3079 | /** |
||
3080 | * Copy an 8-row block from src to dst and, on the MMX path, optionally fix the black level by applying a packed offset and scale to every pixel. The MMX code moves 8 bytes per row; the C code copies BLOCK_SIZE bytes per row. |
||
3081 | * levelFix == 0 -> do not touch the brightness & contrast (plain copy). NOTE: when TEMPLATE_PP_MMX is 0 both branches run the same memcpy loop, so levelFix has no effect in that build. packedOffsetAndScale is only read on the MMX levelFix path: the offset is loaded from byte 0 and the scale from byte 8. The MMXEXT variant scales first and then subtracts the offset, while the plain MMX variant subtracts the offset first, shifts left by 6 and then multiplies by the scale. |
||
3082 | */ |
||
3083 | #undef REAL_SCALED_CPY |
||
3084 | #undef SCALED_CPY |
||
3085 | |||
3086 | static inline void RENAME(blockCopy)(uint8_t dst[], int dstStride, const uint8_t src[], int srcStride, |
||
3087 | int levelFix, int64_t *packedOffsetAndScale) |
||
3088 | { |
||
3089 | #if !TEMPLATE_PP_MMX |
||
3090 | int i; |
||
3091 | #endif |
||
3092 | if(levelFix){ |
||
3093 | #if TEMPLATE_PP_MMX |
||
3094 | __asm__ volatile( |
||
3095 | "movq (%%"REG_a"), %%mm2 \n\t" // packedYOffset |
||
3096 | "movq 8(%%"REG_a"), %%mm3 \n\t" // packedYScale |
||
3097 | "lea (%2,%4), %%"REG_a" \n\t" |
||
3098 | "lea (%3,%5), %%"REG_d" \n\t" |
||
3099 | "pxor %%mm4, %%mm4 \n\t" |
||
3100 | #if TEMPLATE_PP_MMXEXT |
||
3101 | #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ |
||
3102 | "movq " #src1 ", %%mm0 \n\t"\ |
||
3103 | "movq " #src1 ", %%mm5 \n\t"\ |
||
3104 | "movq " #src2 ", %%mm1 \n\t"\ |
||
3105 | "movq " #src2 ", %%mm6 \n\t"\ |
||
3106 | "punpcklbw %%mm0, %%mm0 \n\t"\ |
||
3107 | "punpckhbw %%mm5, %%mm5 \n\t"\ |
||
3108 | "punpcklbw %%mm1, %%mm1 \n\t"\ |
||
3109 | "punpckhbw %%mm6, %%mm6 \n\t"\ |
||
3110 | "pmulhuw %%mm3, %%mm0 \n\t"\ |
||
3111 | "pmulhuw %%mm3, %%mm5 \n\t"\ |
||
3112 | "pmulhuw %%mm3, %%mm1 \n\t"\ |
||
3113 | "pmulhuw %%mm3, %%mm6 \n\t"\ |
||
3114 | "psubw %%mm2, %%mm0 \n\t"\ |
||
3115 | "psubw %%mm2, %%mm5 \n\t"\ |
||
3116 | "psubw %%mm2, %%mm1 \n\t"\ |
||
3117 | "psubw %%mm2, %%mm6 \n\t"\ |
||
3118 | "packuswb %%mm5, %%mm0 \n\t"\ |
||
3119 | "packuswb %%mm6, %%mm1 \n\t"\ |
||
3120 | "movq %%mm0, " #dst1 " \n\t"\ |
||
3121 | "movq %%mm1, " #dst2 " \n\t"\ |
||
3122 | |||
3123 | #else //TEMPLATE_PP_MMXEXT |
||
3124 | #define REAL_SCALED_CPY(src1, src2, dst1, dst2) \ |
||
3125 | "movq " #src1 ", %%mm0 \n\t"\ |
||
3126 | "movq " #src1 ", %%mm5 \n\t"\ |
||
3127 | "punpcklbw %%mm4, %%mm0 \n\t"\ |
||
3128 | "punpckhbw %%mm4, %%mm5 \n\t"\ |
||
3129 | "psubw %%mm2, %%mm0 \n\t"\ |
||
3130 | "psubw %%mm2, %%mm5 \n\t"\ |
||
3131 | "movq " #src2 ", %%mm1 \n\t"\ |
||
3132 | "psllw $6, %%mm0 \n\t"\ |
||
3133 | "psllw $6, %%mm5 \n\t"\ |
||
3134 | "pmulhw %%mm3, %%mm0 \n\t"\ |
||
3135 | "movq " #src2 ", %%mm6 \n\t"\ |
||
3136 | "pmulhw %%mm3, %%mm5 \n\t"\ |
||
3137 | "punpcklbw %%mm4, %%mm1 \n\t"\ |
||
3138 | "punpckhbw %%mm4, %%mm6 \n\t"\ |
||
3139 | "psubw %%mm2, %%mm1 \n\t"\ |
||
3140 | "psubw %%mm2, %%mm6 \n\t"\ |
||
3141 | "psllw $6, %%mm1 \n\t"\ |
||
3142 | "psllw $6, %%mm6 \n\t"\ |
||
3143 | "pmulhw %%mm3, %%mm1 \n\t"\ |
||
3144 | "pmulhw %%mm3, %%mm6 \n\t"\ |
||
3145 | "packuswb %%mm5, %%mm0 \n\t"\ |
||
3146 | "packuswb %%mm6, %%mm1 \n\t"\ |
||
3147 | "movq %%mm0, " #dst1 " \n\t"\ |
||
3148 | "movq %%mm1, " #dst2 " \n\t"\ |
||
3149 | |||
3150 | #endif //TEMPLATE_PP_MMXEXT |
||
3151 | #define SCALED_CPY(src1, src2, dst1, dst2)\ |
||
3152 | REAL_SCALED_CPY(src1, src2, dst1, dst2) |
||
3153 | |||
3154 | SCALED_CPY((%2) , (%2, %4) , (%3) , (%3, %5)) |
||
3155 | SCALED_CPY((%2, %4, 2), (%%REGa, %4, 2), (%3, %5, 2), (%%REGd, %5, 2)) |
||
3156 | SCALED_CPY((%2, %4, 4), (%%REGa, %4, 4), (%3, %5, 4), (%%REGd, %5, 4)) |
||
3157 | "lea (%%"REG_a",%4,4), %%"REG_a" \n\t" |
||
3158 | "lea (%%"REG_d",%5,4), %%"REG_d" \n\t" |
||
3159 | SCALED_CPY((%%REGa, %4), (%%REGa, %4, 2), (%%REGd, %5), (%%REGd, %5, 2)) |
||
3160 | |||
3161 | |||
3162 | : "=&a" (packedOffsetAndScale) |
||
3163 | : "0" (packedOffsetAndScale), |
||
3164 | "r"(src), |
||
3165 | "r"(dst), |
||
3166 | "r" ((x86_reg)srcStride), |
||
3167 | "r" ((x86_reg)dstStride) |
||
3168 | : "%"REG_d |
||
3169 | ); |
||
3170 | #else //TEMPLATE_PP_MMX (C fallback: plain row copy, no offset/scale is applied here) |
||
3171 | for(i=0; i<8; i++) |
||
3172 | memcpy( &(dst[dstStride*i]), |
||
3173 | &(src[srcStride*i]), BLOCK_SIZE); |
||
3174 | #endif //TEMPLATE_PP_MMX |
||
3175 | }else{ |
||
3176 | #if TEMPLATE_PP_MMX |
||
3177 | __asm__ volatile( |
||
3178 | "lea (%0,%2), %%"REG_a" \n\t" |
||
3179 | "lea (%1,%3), %%"REG_d" \n\t" |
||
3180 | |||
3181 | #define REAL_SIMPLE_CPY(src1, src2, dst1, dst2) \ |
||
3182 | "movq " #src1 ", %%mm0 \n\t"\ |
||
3183 | "movq " #src2 ", %%mm1 \n\t"\ |
||
3184 | "movq %%mm0, " #dst1 " \n\t"\ |
||
3185 | "movq %%mm1, " #dst2 " \n\t"\ |
||
3186 | |||
3187 | #define SIMPLE_CPY(src1, src2, dst1, dst2)\ |
||
3188 | REAL_SIMPLE_CPY(src1, src2, dst1, dst2) |
||
3189 | |||
3190 | SIMPLE_CPY((%0) , (%0, %2) , (%1) , (%1, %3)) |
||
3191 | SIMPLE_CPY((%0, %2, 2), (%%REGa, %2, 2), (%1, %3, 2), (%%REGd, %3, 2)) |
||
3192 | SIMPLE_CPY((%0, %2, 4), (%%REGa, %2, 4), (%1, %3, 4), (%%REGd, %3, 4)) |
||
3193 | "lea (%%"REG_a",%2,4), %%"REG_a" \n\t" |
||
3194 | "lea (%%"REG_d",%3,4), %%"REG_d" \n\t" |
||
3195 | SIMPLE_CPY((%%REGa, %2), (%%REGa, %2, 2), (%%REGd, %3), (%%REGd, %3, 2)) |
||
3196 | |||
3197 | : : "r" (src), |
||
3198 | "r" (dst), |
||
3199 | "r" ((x86_reg)srcStride), |
||
3200 | "r" ((x86_reg)dstStride) |
||
3201 | : "%"REG_a, "%"REG_d |
||
3202 | ); |
||
3203 | #else //TEMPLATE_PP_MMX |
||
3204 | for(i=0; i<8; i++) |
||
3205 | memcpy( &(dst[dstStride*i]), |
||
3206 | &(src[srcStride*i]), BLOCK_SIZE); |
||
3207 | #endif //TEMPLATE_PP_MMX |
||
3208 | } |
||
3209 | } |
||
3210 | |||
3211 | /** |
||
3212 | * Duplicate the given 8 src pixels 5 times upward, i.e. into the rows at src - stride ... src - 5*stride (src itself is left unchanged) |
||
3213 | */ |
||
3214 | static inline void RENAME(duplicate)(uint8_t src[], int stride) |
||
3215 | { |
||
3216 | #if TEMPLATE_PP_MMX |
||
3217 | __asm__ volatile( |
||
3218 | "movq (%0), %%mm0 \n\t" |
||
3219 | "movq %%mm0, (%0, %1, 4) \n\t" |
||
3220 | "add %1, %0 \n\t" |
||
3221 | "movq %%mm0, (%0) \n\t" |
||
3222 | "movq %%mm0, (%0, %1) \n\t" |
||
3223 | "movq %%mm0, (%0, %1, 2) \n\t" |
||
3224 | "movq %%mm0, (%0, %1, 4) \n\t" |
||
3225 | : "+r" (src) |
||
3226 | : "r" ((x86_reg)-stride) /* %1 = -stride, so positive multiples of %1 address rows above src */ |
||
3227 | ); |
||
3228 | #else |
||
3229 | int i; |
||
3230 | uint8_t *p=src; |
||
3231 | for(i=0; i<5; i++){ |
||
3232 | p-= stride; |
||
3233 | memcpy(p, src, 8); |
||
3234 | } |
||
3235 | #endif |
||
3236 | } |
||
3237 | |||
3238 | /** |
||
3239 | * Filter array of bytes (Y or U or V values) |
||
3240 | */ |
||
3241 | static void RENAME(postProcess)(const uint8_t src[], int srcStride, uint8_t dst[], int dstStride, int width, int height, |
||
3242 | const QP_STORE_T QPs[], int QPStride, int isColor, PPContext *c2) |
||
3243 | { |
||
3244 | DECLARE_ALIGNED(8, PPContext, c)= *c2; //copy to stack for faster access |
||
3245 | int x,y; |
||
3246 | #ifdef TEMPLATE_PP_TIME_MODE |
||
3247 | const int mode= TEMPLATE_PP_TIME_MODE; |
||
3248 | #else |
||
3249 | const int mode= isColor ? c.ppMode.chromMode : c.ppMode.lumMode; |
||
3250 | #endif |
||
3251 | int black=0, white=255; // blackest black and whitest white in the picture |
||
3252 | int QPCorrecture= 256*256; |
||
3253 | |||
3254 | int copyAhead; |
||
3255 | #if TEMPLATE_PP_MMX |
||
3256 | int i; |
||
3257 | #endif |
||
3258 | |||
3259 | const int qpHShift= isColor ? 4-c.hChromaSubSample : 4; |
||
3260 | const int qpVShift= isColor ? 4-c.vChromaSubSample : 4; |
||
3261 | |||
3262 | //FIXME remove |
||
3263 | uint64_t * const yHistogram= c.yHistogram; |
||
3264 | uint8_t * const tempSrc= srcStride > 0 ? c.tempSrc : c.tempSrc - 23*srcStride; |
||
3265 | uint8_t * const tempDst= (dstStride > 0 ? c.tempDst : c.tempDst - 23*dstStride) + 32; |
||
3266 | //const int mbWidth= isColor ? (width+7)>>3 : (width+15)>>4; |
||
3267 | |||
3268 | #if TEMPLATE_PP_MMX |
||
3269 | for(i=0; i<57; i++){ |
||
3270 | int offset= ((i*c.ppMode.baseDcDiff)>>8) + 1; |
||
3271 | int threshold= offset*2 + 1; |
||
3272 | c.mmxDcOffset[i]= 0x7F - offset; |
||
3273 | c.mmxDcThreshold[i]= 0x7F - threshold; |
||
3274 | c.mmxDcOffset[i]*= 0x0101010101010101LL; |
||
3275 | c.mmxDcThreshold[i]*= 0x0101010101010101LL; |
||
3276 | } |
||
3277 | #endif |
||
3278 | |||
3279 | if(mode & CUBIC_IPOL_DEINT_FILTER) copyAhead=16; |
||
3280 | else if( (mode & LINEAR_BLEND_DEINT_FILTER) |
||
3281 | || (mode & FFMPEG_DEINT_FILTER) |
||
3282 | || (mode & LOWPASS5_DEINT_FILTER)) copyAhead=14; |
||
3283 | else if( (mode & V_DEBLOCK) |
||
3284 | || (mode & LINEAR_IPOL_DEINT_FILTER) |
||
3285 | || (mode & MEDIAN_DEINT_FILTER) |
||
3286 | || (mode & V_A_DEBLOCK)) copyAhead=13; |
||
3287 | else if(mode & V_X1_FILTER) copyAhead=11; |
||
3288 | // else if(mode & V_RK1_FILTER) copyAhead=10; |
||
3289 | else if(mode & DERING) copyAhead=9; |
||
3290 | else copyAhead=8; |
||
3291 | |||
3292 | copyAhead-= 8; |
||
3293 | |||
3294 | if(!isColor){ |
||
3295 | uint64_t sum= 0; |
||
3296 | int i; |
||
3297 | uint64_t maxClipped; |
||
3298 | uint64_t clipped; |
||
3299 | double scale; |
||
3300 | |||
3301 | c.frameNum++; |
||
3302 | // first frame is fscked so we ignore it |
||
3303 | if(c.frameNum == 1) yHistogram[0]= width*(uint64_t)height/64*15/256; |
||
3304 | |||
3305 | for(i=0; i<256; i++){ |
||
3306 | sum+= yHistogram[i]; |
||
3307 | } |
||
3308 | |||
3309 | /* We always get a completely black picture first. */ |
||
3310 | maxClipped= (uint64_t)(sum * c.ppMode.maxClippedThreshold); |
||
3311 | |||
3312 | clipped= sum; |
||
3313 | for(black=255; black>0; black--){ |
||
3314 | if(clipped < maxClipped) break; |
||
3315 | clipped-= yHistogram[black]; |
||
3316 | } |
||
3317 | |||
3318 | clipped= sum; |
||
3319 | for(white=0; white<256; white++){ |
||
3320 | if(clipped < maxClipped) break; |
||
3321 | clipped-= yHistogram[white]; |
||
3322 | } |
||
3323 | |||
3324 | scale= (double)(c.ppMode.maxAllowedY - c.ppMode.minAllowedY) / (double)(white-black); |
||
3325 | |||
3326 | #if TEMPLATE_PP_MMXEXT |
||
3327 | c.packedYScale= (uint16_t)(scale*256.0 + 0.5); |
||
3328 | c.packedYOffset= (((black*c.packedYScale)>>8) - c.ppMode.minAllowedY) & 0xFFFF; |
||
3329 | #else |
||
3330 | c.packedYScale= (uint16_t)(scale*1024.0 + 0.5); |
||
3331 | c.packedYOffset= (black - c.ppMode.minAllowedY) & 0xFFFF; |
||
3332 | #endif |
||
3333 | |||
3334 | c.packedYOffset|= c.packedYOffset<<32; |
||
3335 | c.packedYOffset|= c.packedYOffset<<16; |
||
3336 | |||
3337 | c.packedYScale|= c.packedYScale<<32; |
||
3338 | c.packedYScale|= c.packedYScale<<16; |
||
3339 | |||
3340 | if(mode & LEVEL_FIX) QPCorrecture= (int)(scale*256*256 + 0.5); |
||
3341 | else QPCorrecture= 256*256; |
||
3342 | }else{ |
||
3343 | c.packedYScale= 0x0100010001000100LL; |
||
3344 | c.packedYOffset= 0; |
||
3345 | QPCorrecture= 256*256; |
||
3346 | } |
||
3347 | |||
3348 | /* copy & deinterlace first row of blocks */ |
||
3349 | y=-BLOCK_SIZE; |
||
3350 | { |
||
3351 | const uint8_t *srcBlock= &(src[y*srcStride]); |
||
3352 | uint8_t *dstBlock= tempDst + dstStride; |
||
3353 | |||
3354 | // From this point on it is guaranteed that we can read and write 16 lines downward |
||
3355 | // finish 1 block before the next otherwise we might have a problem |
||
3356 | // with the L1 Cache of the P4 ... or only a few blocks at a time or something |
||
3357 | for(x=0; x |
||
3358 | |||
3359 | #if TEMPLATE_PP_MMXEXT |
||
3360 | /* |
||
3361 | prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); |
||
3362 | prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); |
||
3363 | prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); |
||
3364 | prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); |
||
3365 | */ |
||
3366 | |||
3367 | __asm__( |
||
3368 | "mov %4, %%"REG_a" \n\t" |
||
3369 | "shr $2, %%"REG_a" \n\t" |
||
3370 | "and $6, %%"REG_a" \n\t" |
||
3371 | "add %5, %%"REG_a" \n\t" |
||
3372 | "mov %%"REG_a", %%"REG_d" \n\t" |
||
3373 | "imul %1, %%"REG_a" \n\t" |
||
3374 | "imul %3, %%"REG_d" \n\t" |
||
3375 | "prefetchnta 32(%%"REG_a", %0) \n\t" |
||
3376 | "prefetcht0 32(%%"REG_d", %2) \n\t" |
||
3377 | "add %1, %%"REG_a" \n\t" |
||
3378 | "add %3, %%"REG_d" \n\t" |
||
3379 | "prefetchnta 32(%%"REG_a", %0) \n\t" |
||
3380 | "prefetcht0 32(%%"REG_d", %2) \n\t" |
||
3381 | :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), |
||
3382 | "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) |
||
3383 | : "%"REG_a, "%"REG_d |
||
3384 | ); |
||
3385 | |||
3386 | #elif TEMPLATE_PP_3DNOW |
||
3387 | //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ... |
||
3388 | /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
||
3389 | prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); |
||
3390 | prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
||
3391 | prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
||
3392 | */ |
||
3393 | #endif |
||
3394 | |||
3395 | RENAME(blockCopy)(dstBlock + dstStride*8, dstStride, |
||
3396 | srcBlock + srcStride*8, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
||
3397 | |||
3398 | RENAME(duplicate)(dstBlock + dstStride*8, dstStride); |
||
3399 | |||
3400 | if(mode & LINEAR_IPOL_DEINT_FILTER) |
||
3401 | RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
||
3402 | else if(mode & LINEAR_BLEND_DEINT_FILTER) |
||
3403 | RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x); |
||
3404 | else if(mode & MEDIAN_DEINT_FILTER) |
||
3405 | RENAME(deInterlaceMedian)(dstBlock, dstStride); |
||
3406 | else if(mode & CUBIC_IPOL_DEINT_FILTER) |
||
3407 | RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
||
3408 | else if(mode & FFMPEG_DEINT_FILTER) |
||
3409 | RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); |
||
3410 | else if(mode & LOWPASS5_DEINT_FILTER) |
||
3411 | RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); |
||
3412 | /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
||
3413 | RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
||
3414 | */ |
||
3415 | dstBlock+=8; |
||
3416 | srcBlock+=8; |
||
3417 | } |
||
3418 | if(width==FFABS(dstStride)) |
||
3419 | linecpy(dst, tempDst + 9*dstStride, copyAhead, dstStride); |
||
3420 | else{ |
||
3421 | int i; |
||
3422 | for(i=0; i |
||
3423 | memcpy(dst + i*dstStride, tempDst + (9+i)*dstStride, width); |
||
3424 | } |
||
3425 | } |
||
3426 | } |
||
3427 | |||
3428 | for(y=0; y |
||
3429 | //1% speedup if these are here instead of the inner loop |
||
3430 | const uint8_t *srcBlock= &(src[y*srcStride]); |
||
3431 | uint8_t *dstBlock= &(dst[y*dstStride]); |
||
3432 | #if TEMPLATE_PP_MMX |
||
3433 | uint8_t *tempBlock1= c.tempBlocks; |
||
3434 | uint8_t *tempBlock2= c.tempBlocks + 8; |
||
3435 | #endif |
||
3436 | const int8_t *QPptr= &QPs[(y>>qpVShift)*QPStride]; |
||
3437 | int8_t *nonBQPptr= &c.nonBQPTable[(y>>qpVShift)*FFABS(QPStride)]; |
||
3438 | int QP=0; |
||
3439 | /* can we mess with a 8x16 block from srcBlock/dstBlock downwards and 1 line upwards |
||
3440 | if not than use a temporary buffer */ |
||
3441 | if(y+15 >= height){ |
||
3442 | int i; |
||
3443 | /* copy from line (copyAhead) to (copyAhead+7) of src, these will be copied with |
||
3444 | blockcopy to dst later */ |
||
3445 | linecpy(tempSrc + srcStride*copyAhead, srcBlock + srcStride*copyAhead, |
||
3446 | FFMAX(height-y-copyAhead, 0), srcStride); |
||
3447 | |||
3448 | /* duplicate last line of src to fill the void up to line (copyAhead+7) */ |
||
3449 | for(i=FFMAX(height-y, 8); i |
||
3450 | memcpy(tempSrc + srcStride*i, src + srcStride*(height-1), FFABS(srcStride)); |
||
3451 | |||
3452 | /* copy up to (copyAhead+1) lines of dst (line -1 to (copyAhead-1))*/ |
||
3453 | linecpy(tempDst, dstBlock - dstStride, FFMIN(height-y+1, copyAhead+1), dstStride); |
||
3454 | |||
3455 | /* duplicate last line of dst to fill the void up to line (copyAhead) */ |
||
3456 | for(i=height-y+1; i<=copyAhead; i++) |
||
3457 | memcpy(tempDst + dstStride*i, dst + dstStride*(height-1), FFABS(dstStride)); |
||
3458 | |||
3459 | dstBlock= tempDst + dstStride; |
||
3460 | srcBlock= tempSrc; |
||
3461 | } |
||
3462 | |||
3463 | // From this point on it is guaranteed that we can read and write 16 lines downward |
||
3464 | // finish 1 block before the next otherwise we might have a problem |
||
3465 | // with the L1 Cache of the P4 ... or only a few blocks at a time or something |
||
3466 | for(x=0; x |
||
3467 | const int stride= dstStride; |
||
3468 | #if TEMPLATE_PP_MMX |
||
3469 | uint8_t *tmpXchg; |
||
3470 | #endif |
||
3471 | if(isColor){ |
||
3472 | QP= QPptr[x>>qpHShift]; |
||
3473 | c.nonBQP= nonBQPptr[x>>qpHShift]; |
||
3474 | }else{ |
||
3475 | QP= QPptr[x>>4]; |
||
3476 | QP= (QP* QPCorrecture + 256*128)>>16; |
||
3477 | c.nonBQP= nonBQPptr[x>>4]; |
||
3478 | c.nonBQP= (c.nonBQP* QPCorrecture + 256*128)>>16; |
||
3479 | yHistogram[ srcBlock[srcStride*12 + 4] ]++; |
||
3480 | } |
||
3481 | c.QP= QP; |
||
3482 | #if TEMPLATE_PP_MMX |
||
3483 | __asm__ volatile( |
||
3484 | "movd %1, %%mm7 \n\t" |
||
3485 | "packuswb %%mm7, %%mm7 \n\t" // 0, 0, 0, QP, 0, 0, 0, QP |
||
3486 | "packuswb %%mm7, %%mm7 \n\t" // 0,QP, 0, QP, 0,QP, 0, QP |
||
3487 | "packuswb %%mm7, %%mm7 \n\t" // QP,..., QP |
||
3488 | "movq %%mm7, %0 \n\t" |
||
3489 | : "=m" (c.pQPb) |
||
3490 | : "r" (QP) |
||
3491 | ); |
||
3492 | #endif |
||
3493 | |||
3494 | |||
3495 | #if TEMPLATE_PP_MMXEXT |
||
3496 | /* |
||
3497 | prefetchnta(srcBlock + (((x>>2)&6) + 5)*srcStride + 32); |
||
3498 | prefetchnta(srcBlock + (((x>>2)&6) + 6)*srcStride + 32); |
||
3499 | prefetcht0(dstBlock + (((x>>2)&6) + 5)*dstStride + 32); |
||
3500 | prefetcht0(dstBlock + (((x>>2)&6) + 6)*dstStride + 32); |
||
3501 | */ |
||
3502 | |||
3503 | __asm__( |
||
3504 | "mov %4, %%"REG_a" \n\t" |
||
3505 | "shr $2, %%"REG_a" \n\t" |
||
3506 | "and $6, %%"REG_a" \n\t" |
||
3507 | "add %5, %%"REG_a" \n\t" |
||
3508 | "mov %%"REG_a", %%"REG_d" \n\t" |
||
3509 | "imul %1, %%"REG_a" \n\t" |
||
3510 | "imul %3, %%"REG_d" \n\t" |
||
3511 | "prefetchnta 32(%%"REG_a", %0) \n\t" |
||
3512 | "prefetcht0 32(%%"REG_d", %2) \n\t" |
||
3513 | "add %1, %%"REG_a" \n\t" |
||
3514 | "add %3, %%"REG_d" \n\t" |
||
3515 | "prefetchnta 32(%%"REG_a", %0) \n\t" |
||
3516 | "prefetcht0 32(%%"REG_d", %2) \n\t" |
||
3517 | :: "r" (srcBlock), "r" ((x86_reg)srcStride), "r" (dstBlock), "r" ((x86_reg)dstStride), |
||
3518 | "g" ((x86_reg)x), "g" ((x86_reg)copyAhead) |
||
3519 | : "%"REG_a, "%"REG_d |
||
3520 | ); |
||
3521 | |||
3522 | #elif TEMPLATE_PP_3DNOW |
||
3523 | //FIXME check if this is faster on an 3dnow chip or if it is faster without the prefetch or ... |
||
3524 | /* prefetch(srcBlock + (((x>>3)&3) + 5)*srcStride + 32); |
||
3525 | prefetch(srcBlock + (((x>>3)&3) + 9)*srcStride + 32); |
||
3526 | prefetchw(dstBlock + (((x>>3)&3) + 5)*dstStride + 32); |
||
3527 | prefetchw(dstBlock + (((x>>3)&3) + 9)*dstStride + 32); |
||
3528 | */ |
||
3529 | #endif |
||
3530 | |||
3531 | RENAME(blockCopy)(dstBlock + dstStride*copyAhead, dstStride, |
||
3532 | srcBlock + srcStride*copyAhead, srcStride, mode & LEVEL_FIX, &c.packedYOffset); |
||
3533 | |||
3534 | if(mode & LINEAR_IPOL_DEINT_FILTER) |
||
3535 | RENAME(deInterlaceInterpolateLinear)(dstBlock, dstStride); |
||
3536 | else if(mode & LINEAR_BLEND_DEINT_FILTER) |
||
3537 | RENAME(deInterlaceBlendLinear)(dstBlock, dstStride, c.deintTemp + x); |
||
3538 | else if(mode & MEDIAN_DEINT_FILTER) |
||
3539 | RENAME(deInterlaceMedian)(dstBlock, dstStride); |
||
3540 | else if(mode & CUBIC_IPOL_DEINT_FILTER) |
||
3541 | RENAME(deInterlaceInterpolateCubic)(dstBlock, dstStride); |
||
3542 | else if(mode & FFMPEG_DEINT_FILTER) |
||
3543 | RENAME(deInterlaceFF)(dstBlock, dstStride, c.deintTemp + x); |
||
3544 | else if(mode & LOWPASS5_DEINT_FILTER) |
||
3545 | RENAME(deInterlaceL5)(dstBlock, dstStride, c.deintTemp + x, c.deintTemp + width + x); |
||
3546 | /* else if(mode & CUBIC_BLEND_DEINT_FILTER) |
||
3547 | RENAME(deInterlaceBlendCubic)(dstBlock, dstStride); |
||
3548 | */ |
||
3549 | |||
3550 | /* only deblock if we have 2 blocks */ |
||
3551 | if(y + 8 < height){ |
||
3552 | if(mode & V_X1_FILTER) |
||
3553 | RENAME(vertX1Filter)(dstBlock, stride, &c); |
||
3554 | else if(mode & V_DEBLOCK){ |
||
3555 | const int t= RENAME(vertClassify)(dstBlock, stride, &c); |
||
3556 | |||
3557 | if(t==1) |
||
3558 | RENAME(doVertLowPass)(dstBlock, stride, &c); |
||
3559 | else if(t==2) |
||
3560 | RENAME(doVertDefFilter)(dstBlock, stride, &c); |
||
3561 | }else if(mode & V_A_DEBLOCK){ |
||
3562 | RENAME(do_a_deblock)(dstBlock, stride, 1, &c); |
||
3563 | } |
||
3564 | } |
||
3565 | |||
3566 | #if TEMPLATE_PP_MMX |
||
3567 | RENAME(transpose1)(tempBlock1, tempBlock2, dstBlock, dstStride); |
||
3568 | #endif |
||
3569 | /* check if we have a previous block to deblock it with dstBlock */ |
||
3570 | if(x - 8 >= 0){ |
||
3571 | #if TEMPLATE_PP_MMX |
||
3572 | if(mode & H_X1_FILTER) |
||
3573 | RENAME(vertX1Filter)(tempBlock1, 16, &c); |
||
3574 | else if(mode & H_DEBLOCK){ |
||
3575 | //START_TIMER |
||
3576 | const int t= RENAME(vertClassify)(tempBlock1, 16, &c); |
||
3577 | //STOP_TIMER("dc & minmax") |
||
3578 | if(t==1) |
||
3579 | RENAME(doVertLowPass)(tempBlock1, 16, &c); |
||
3580 | else if(t==2) |
||
3581 | RENAME(doVertDefFilter)(tempBlock1, 16, &c); |
||
3582 | }else if(mode & H_A_DEBLOCK){ |
||
3583 | RENAME(do_a_deblock)(tempBlock1, 16, 1, &c); |
||
3584 | } |
||
3585 | |||
3586 | RENAME(transpose2)(dstBlock-4, dstStride, tempBlock1 + 4*16); |
||
3587 | |||
3588 | #else |
||
3589 | if(mode & H_X1_FILTER) |
||
3590 | horizX1Filter(dstBlock-4, stride, QP); |
||
3591 | else if(mode & H_DEBLOCK){ |
||
3592 | #if TEMPLATE_PP_ALTIVEC |
||
3593 | DECLARE_ALIGNED(16, unsigned char, tempBlock)[272]; |
||
3594 | int t; |
||
3595 | transpose_16x8_char_toPackedAlign_altivec(tempBlock, dstBlock - (4 + 1), stride); |
||
3596 | |||
3597 | t = vertClassify_altivec(tempBlock-48, 16, &c); |
||
3598 | if(t==1) { |
||
3599 | doVertLowPass_altivec(tempBlock-48, 16, &c); |
||
3600 | transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride); |
||
3601 | } |
||
3602 | else if(t==2) { |
||
3603 | doVertDefFilter_altivec(tempBlock-48, 16, &c); |
||
3604 | transpose_8x16_char_fromPackedAlign_altivec(dstBlock - (4 + 1), tempBlock, stride); |
||
3605 | } |
||
3606 | #else |
||
3607 | const int t= RENAME(horizClassify)(dstBlock-4, stride, &c); |
||
3608 | |||
3609 | if(t==1) |
||
3610 | RENAME(doHorizLowPass)(dstBlock-4, stride, &c); |
||
3611 | else if(t==2) |
||
3612 | RENAME(doHorizDefFilter)(dstBlock-4, stride, &c); |
||
3613 | #endif |
||
3614 | }else if(mode & H_A_DEBLOCK){ |
||
3615 | RENAME(do_a_deblock)(dstBlock-8, 1, stride, &c); |
||
3616 | } |
||
3617 | #endif //TEMPLATE_PP_MMX |
||
3618 | if(mode & DERING){ |
||
3619 | //FIXME filter first line |
||
3620 | if(y>0) RENAME(dering)(dstBlock - stride - 8, stride, &c); |
||
3621 | } |
||
3622 | |||
3623 | if(mode & TEMP_NOISE_FILTER) |
||
3624 | { |
||
3625 | RENAME(tempNoiseReducer)(dstBlock-8, stride, |
||
3626 | c.tempBlurred[isColor] + y*dstStride + x, |
||
3627 | c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256, |
||
3628 | c.ppMode.maxTmpNoise); |
||
3629 | } |
||
3630 | } |
||
3631 | |||
3632 | dstBlock+=8; |
||
3633 | srcBlock+=8; |
||
3634 | |||
3635 | #if TEMPLATE_PP_MMX |
||
3636 | tmpXchg= tempBlock1; |
||
3637 | tempBlock1= tempBlock2; |
||
3638 | tempBlock2 = tmpXchg; |
||
3639 | #endif |
||
3640 | } |
||
3641 | |||
3642 | if(mode & DERING){ |
||
3643 | if(y > 0) RENAME(dering)(dstBlock - dstStride - 8, dstStride, &c); |
||
3644 | } |
||
3645 | |||
3646 | if((mode & TEMP_NOISE_FILTER)){ |
||
3647 | RENAME(tempNoiseReducer)(dstBlock-8, dstStride, |
||
3648 | c.tempBlurred[isColor] + y*dstStride + x, |
||
3649 | c.tempBlurredPast[isColor] + (y>>3)*256 + (x>>3) + 256, |
||
3650 | c.ppMode.maxTmpNoise); |
||
3651 | } |
||
3652 | |||
3653 | /* did we use a tmp buffer for the last lines*/ |
||
3654 | if(y+15 >= height){ |
||
3655 | uint8_t *dstBlock= &(dst[y*dstStride]); |
||
3656 | if(width==FFABS(dstStride)) |
||
3657 | linecpy(dstBlock, tempDst + dstStride, height-y, dstStride); |
||
3658 | else{ |
||
3659 | int i; |
||
3660 | for(i=0; i |
||
3661 | memcpy(dstBlock + i*dstStride, tempDst + (i+1)*dstStride, width); |
||
3662 | } |
||
3663 | } |
||
3664 | } |
||
3665 | /* |
||
3666 | for(x=0; x |
||
3667 | volatile int i; |
||
3668 | i+= dstBlock[x + 7*dstStride] + dstBlock[x + 8*dstStride] |
||
3669 | + dstBlock[x + 9*dstStride] + dstBlock[x +10*dstStride] |
||
3670 | + dstBlock[x +11*dstStride] + dstBlock[x +12*dstStride]; |
||
3671 | + dstBlock[x +13*dstStride] |
||
3672 | + dstBlock[x +14*dstStride] + dstBlock[x +15*dstStride]; |
||
3673 | }*/ |
||
3674 | } |
||
3675 | #if TEMPLATE_PP_3DNOW |
||
3676 | __asm__ volatile("femms"); |
||
3677 | #elif TEMPLATE_PP_MMX |
||
3678 | __asm__ volatile("emms"); |
||
3679 | #endif |
||
3680 | |||
3681 | #ifdef DEBUG_BRIGHTNESS |
||
3682 | if(!isColor){ |
||
3683 | int max=1; |
||
3684 | int i; |
||
3685 | for(i=0; i<256; i++) |
||
3686 | if(yHistogram[i] > max) max=yHistogram[i]; |
||
3687 | |||
3688 | for(i=1; i<256; i++){ |
||
3689 | int x; |
||
3690 | int start=yHistogram[i-1]/(max/256+1); |
||
3691 | int end=yHistogram[i]/(max/256+1); |
||
3692 | int inc= end > start ? 1 : -1; |
||
3693 | for(x=start; x!=end+inc; x+=inc) |
||
3694 | dst[ i*dstStride + x]+=128; |
||
3695 | } |
||
3696 | |||
3697 | for(i=0; i<100; i+=2){ |
||
3698 | dst[ (white)*dstStride + i]+=128; |
||
3699 | dst[ (black)*dstStride + i]+=128; |
||
3700 | } |
||
3701 | } |
||
3702 | #endif |
||
3703 | |||
3704 | *c2= c; //copy local context back |
||
3705 | |||
3706 | } |
||
3707 | |||
/* Reset the template selection so that this file can be included again
 * with a different TEMPLATE_PP_* macro defined. */
#undef RENAME
#undef TEMPLATE_PP_C
#undef TEMPLATE_PP_ALTIVEC
#undef TEMPLATE_PP_MMX
#undef TEMPLATE_PP_MMXEXT
#undef TEMPLATE_PP_3DNOW
#undef TEMPLATE_PP_SSE2