Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4349 | Serge | 1 | /* |
2 | * MMX and SSE2 optimized snow DSP utils |
||
3 | * Copyright (c) 2005-2006 Robert Edele |
||
4 | * |
||
5 | * This file is part of FFmpeg. |
||
6 | * |
||
7 | * FFmpeg is free software; you can redistribute it and/or |
||
8 | * modify it under the terms of the GNU Lesser General Public |
||
9 | * License as published by the Free Software Foundation; either |
||
10 | * version 2.1 of the License, or (at your option) any later version. |
||
11 | * |
||
12 | * FFmpeg is distributed in the hope that it will be useful, |
||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
15 | * Lesser General Public License for more details. |
||
16 | * |
||
17 | * You should have received a copy of the GNU Lesser General Public |
||
18 | * License along with FFmpeg; if not, write to the Free Software |
||
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
20 | */ |
||
21 | |||
22 | #include "libavutil/cpu.h" |
||
23 | #include "libavutil/x86/asm.h" |
||
24 | #include "libavcodec/avcodec.h" |
||
25 | #include "libavcodec/snow.h" |
||
26 | #include "libavcodec/snow_dwt.h" |
||
27 | #include "dsputil_x86.h" |
||
28 | |||
29 | #if HAVE_INLINE_ASM |
||
30 | |||
31 | static void ff_snow_horizontal_compose97i_sse2(IDWTELEM *b, IDWTELEM *temp, int width){ |
||
32 | const int w2= (width+1)>>1; |
||
33 | const int w_l= (width>>1); |
||
34 | const int w_r= w2 - 1; |
||
35 | int i; |
||
36 | |||
37 | { // Lift 0 |
||
38 | IDWTELEM * const ref = b + w2 - 1; |
||
39 | IDWTELEM b_0 = b[0]; //By allowing the first entry in b[0] to be calculated twice |
||
40 | // (the first time erroneously), we allow the SSE2 code to run an extra pass. |
||
41 | // The savings in code and time are well worth having to store this value and |
||
42 | // calculate b[0] correctly afterwards. |
||
43 | |||
44 | i = 0; |
||
45 | __asm__ volatile( |
||
46 | "pcmpeqd %%xmm7, %%xmm7 \n\t" |
||
47 | "pcmpeqd %%xmm3, %%xmm3 \n\t" |
||
48 | "psllw $1, %%xmm3 \n\t" |
||
49 | "paddw %%xmm7, %%xmm3 \n\t" |
||
50 | "psllw $13, %%xmm3 \n\t" |
||
51 | ::); |
||
52 | for(; i |
||
53 | __asm__ volatile( |
||
54 | "movdqu (%1), %%xmm1 \n\t" |
||
55 | "movdqu 16(%1), %%xmm5 \n\t" |
||
56 | "movdqu 2(%1), %%xmm2 \n\t" |
||
57 | "movdqu 18(%1), %%xmm6 \n\t" |
||
58 | "paddw %%xmm1, %%xmm2 \n\t" |
||
59 | "paddw %%xmm5, %%xmm6 \n\t" |
||
60 | "paddw %%xmm7, %%xmm2 \n\t" |
||
61 | "paddw %%xmm7, %%xmm6 \n\t" |
||
62 | "pmulhw %%xmm3, %%xmm2 \n\t" |
||
63 | "pmulhw %%xmm3, %%xmm6 \n\t" |
||
64 | "paddw (%0), %%xmm2 \n\t" |
||
65 | "paddw 16(%0), %%xmm6 \n\t" |
||
66 | "movdqa %%xmm2, (%0) \n\t" |
||
67 | "movdqa %%xmm6, 16(%0) \n\t" |
||
68 | :: "r"(&b[i]), "r"(&ref[i]) |
||
69 | : "memory" |
||
70 | ); |
||
71 | } |
||
72 | snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); |
||
73 | b[0] = b_0 - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); |
||
74 | } |
||
75 | |||
76 | { // Lift 1 |
||
77 | IDWTELEM * const dst = b+w2; |
||
78 | |||
79 | i = 0; |
||
80 | for(; (((x86_reg)&dst[i]) & 0x1F) && i |
||
81 | dst[i] = dst[i] - (b[i] + b[i + 1]); |
||
82 | } |
||
83 | for(; i |
||
84 | __asm__ volatile( |
||
85 | "movdqu (%1), %%xmm1 \n\t" |
||
86 | "movdqu 16(%1), %%xmm5 \n\t" |
||
87 | "movdqu 2(%1), %%xmm2 \n\t" |
||
88 | "movdqu 18(%1), %%xmm6 \n\t" |
||
89 | "paddw %%xmm1, %%xmm2 \n\t" |
||
90 | "paddw %%xmm5, %%xmm6 \n\t" |
||
91 | "movdqa (%0), %%xmm0 \n\t" |
||
92 | "movdqa 16(%0), %%xmm4 \n\t" |
||
93 | "psubw %%xmm2, %%xmm0 \n\t" |
||
94 | "psubw %%xmm6, %%xmm4 \n\t" |
||
95 | "movdqa %%xmm0, (%0) \n\t" |
||
96 | "movdqa %%xmm4, 16(%0) \n\t" |
||
97 | :: "r"(&dst[i]), "r"(&b[i]) |
||
98 | : "memory" |
||
99 | ); |
||
100 | } |
||
101 | snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); |
||
102 | } |
||
103 | |||
104 | { // Lift 2 |
||
105 | IDWTELEM * const ref = b+w2 - 1; |
||
106 | IDWTELEM b_0 = b[0]; |
||
107 | |||
108 | i = 0; |
||
109 | __asm__ volatile( |
||
110 | "psllw $15, %%xmm7 \n\t" |
||
111 | "pcmpeqw %%xmm6, %%xmm6 \n\t" |
||
112 | "psrlw $13, %%xmm6 \n\t" |
||
113 | "paddw %%xmm7, %%xmm6 \n\t" |
||
114 | ::); |
||
115 | for(; i |
||
116 | __asm__ volatile( |
||
117 | "movdqu (%1), %%xmm0 \n\t" |
||
118 | "movdqu 16(%1), %%xmm4 \n\t" |
||
119 | "movdqu 2(%1), %%xmm1 \n\t" |
||
120 | "movdqu 18(%1), %%xmm5 \n\t" //FIXME try aligned reads and shifts |
||
121 | "paddw %%xmm6, %%xmm0 \n\t" |
||
122 | "paddw %%xmm6, %%xmm4 \n\t" |
||
123 | "paddw %%xmm7, %%xmm1 \n\t" |
||
124 | "paddw %%xmm7, %%xmm5 \n\t" |
||
125 | "pavgw %%xmm1, %%xmm0 \n\t" |
||
126 | "pavgw %%xmm5, %%xmm4 \n\t" |
||
127 | "psubw %%xmm7, %%xmm0 \n\t" |
||
128 | "psubw %%xmm7, %%xmm4 \n\t" |
||
129 | "psraw $1, %%xmm0 \n\t" |
||
130 | "psraw $1, %%xmm4 \n\t" |
||
131 | "movdqa (%0), %%xmm1 \n\t" |
||
132 | "movdqa 16(%0), %%xmm5 \n\t" |
||
133 | "paddw %%xmm1, %%xmm0 \n\t" |
||
134 | "paddw %%xmm5, %%xmm4 \n\t" |
||
135 | "psraw $2, %%xmm0 \n\t" |
||
136 | "psraw $2, %%xmm4 \n\t" |
||
137 | "paddw %%xmm1, %%xmm0 \n\t" |
||
138 | "paddw %%xmm5, %%xmm4 \n\t" |
||
139 | "movdqa %%xmm0, (%0) \n\t" |
||
140 | "movdqa %%xmm4, 16(%0) \n\t" |
||
141 | :: "r"(&b[i]), "r"(&ref[i]) |
||
142 | : "memory" |
||
143 | ); |
||
144 | } |
||
145 | snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); |
||
146 | b[0] = b_0 + ((2 * ref[1] + W_BO-1 + 4 * b_0) >> W_BS); |
||
147 | } |
||
148 | |||
149 | { // Lift 3 |
||
150 | IDWTELEM * const src = b+w2; |
||
151 | |||
152 | i = 0; |
||
153 | for(; (((x86_reg)&temp[i]) & 0x1F) && i |
||
154 | temp[i] = src[i] - ((-W_AM*(b[i] + b[i+1]))>>W_AS); |
||
155 | } |
||
156 | for(; i |
||
157 | __asm__ volatile( |
||
158 | "movdqu 2(%1), %%xmm2 \n\t" |
||
159 | "movdqu 18(%1), %%xmm6 \n\t" |
||
160 | "paddw (%1), %%xmm2 \n\t" |
||
161 | "paddw 16(%1), %%xmm6 \n\t" |
||
162 | "movdqu (%0), %%xmm0 \n\t" |
||
163 | "movdqu 16(%0), %%xmm4 \n\t" |
||
164 | "paddw %%xmm2, %%xmm0 \n\t" |
||
165 | "paddw %%xmm6, %%xmm4 \n\t" |
||
166 | "psraw $1, %%xmm2 \n\t" |
||
167 | "psraw $1, %%xmm6 \n\t" |
||
168 | "paddw %%xmm0, %%xmm2 \n\t" |
||
169 | "paddw %%xmm4, %%xmm6 \n\t" |
||
170 | "movdqa %%xmm2, (%2) \n\t" |
||
171 | "movdqa %%xmm6, 16(%2) \n\t" |
||
172 | :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) |
||
173 | : "memory" |
||
174 | ); |
||
175 | } |
||
176 | snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); |
||
177 | } |
||
178 | |||
179 | { |
||
180 | snow_interleave_line_header(&i, width, b, temp); |
||
181 | |||
182 | for (; (i & 0x3E) != 0x3E; i-=2){ |
||
183 | b[i+1] = temp[i>>1]; |
||
184 | b[i] = b[i>>1]; |
||
185 | } |
||
186 | for (i-=62; i>=0; i-=64){ |
||
187 | __asm__ volatile( |
||
188 | "movdqa (%1), %%xmm0 \n\t" |
||
189 | "movdqa 16(%1), %%xmm2 \n\t" |
||
190 | "movdqa 32(%1), %%xmm4 \n\t" |
||
191 | "movdqa 48(%1), %%xmm6 \n\t" |
||
192 | "movdqa (%1), %%xmm1 \n\t" |
||
193 | "movdqa 16(%1), %%xmm3 \n\t" |
||
194 | "movdqa 32(%1), %%xmm5 \n\t" |
||
195 | "movdqa 48(%1), %%xmm7 \n\t" |
||
196 | "punpcklwd (%2), %%xmm0 \n\t" |
||
197 | "punpcklwd 16(%2), %%xmm2 \n\t" |
||
198 | "punpcklwd 32(%2), %%xmm4 \n\t" |
||
199 | "punpcklwd 48(%2), %%xmm6 \n\t" |
||
200 | "movdqa %%xmm0, (%0) \n\t" |
||
201 | "movdqa %%xmm2, 32(%0) \n\t" |
||
202 | "movdqa %%xmm4, 64(%0) \n\t" |
||
203 | "movdqa %%xmm6, 96(%0) \n\t" |
||
204 | "punpckhwd (%2), %%xmm1 \n\t" |
||
205 | "punpckhwd 16(%2), %%xmm3 \n\t" |
||
206 | "punpckhwd 32(%2), %%xmm5 \n\t" |
||
207 | "punpckhwd 48(%2), %%xmm7 \n\t" |
||
208 | "movdqa %%xmm1, 16(%0) \n\t" |
||
209 | "movdqa %%xmm3, 48(%0) \n\t" |
||
210 | "movdqa %%xmm5, 80(%0) \n\t" |
||
211 | "movdqa %%xmm7, 112(%0) \n\t" |
||
212 | :: "r"(&(b)[i]), "r"(&(b)[i>>1]), "r"(&(temp)[i>>1]) |
||
213 | : "memory" |
||
214 | ); |
||
215 | } |
||
216 | } |
||
217 | } |
||
218 | |||
219 | static void ff_snow_horizontal_compose97i_mmx(IDWTELEM *b, IDWTELEM *temp, int width){ |
||
220 | const int w2= (width+1)>>1; |
||
221 | const int w_l= (width>>1); |
||
222 | const int w_r= w2 - 1; |
||
223 | int i; |
||
224 | |||
225 | { // Lift 0 |
||
226 | IDWTELEM * const ref = b + w2 - 1; |
||
227 | |||
228 | i = 1; |
||
229 | b[0] = b[0] - ((W_DM * 2 * ref[1]+W_DO)>>W_DS); |
||
230 | __asm__ volatile( |
||
231 | "pcmpeqw %%mm7, %%mm7 \n\t" |
||
232 | "pcmpeqw %%mm3, %%mm3 \n\t" |
||
233 | "psllw $1, %%mm3 \n\t" |
||
234 | "paddw %%mm7, %%mm3 \n\t" |
||
235 | "psllw $13, %%mm3 \n\t" |
||
236 | ::); |
||
237 | for(; i |
||
238 | __asm__ volatile( |
||
239 | "movq (%1), %%mm2 \n\t" |
||
240 | "movq 8(%1), %%mm6 \n\t" |
||
241 | "paddw 2(%1), %%mm2 \n\t" |
||
242 | "paddw 10(%1), %%mm6 \n\t" |
||
243 | "paddw %%mm7, %%mm2 \n\t" |
||
244 | "paddw %%mm7, %%mm6 \n\t" |
||
245 | "pmulhw %%mm3, %%mm2 \n\t" |
||
246 | "pmulhw %%mm3, %%mm6 \n\t" |
||
247 | "paddw (%0), %%mm2 \n\t" |
||
248 | "paddw 8(%0), %%mm6 \n\t" |
||
249 | "movq %%mm2, (%0) \n\t" |
||
250 | "movq %%mm6, 8(%0) \n\t" |
||
251 | :: "r"(&b[i]), "r"(&ref[i]) |
||
252 | : "memory" |
||
253 | ); |
||
254 | } |
||
255 | snow_horizontal_compose_lift_lead_out(i, b, b, ref, width, w_l, 0, W_DM, W_DO, W_DS); |
||
256 | } |
||
257 | |||
258 | { // Lift 1 |
||
259 | IDWTELEM * const dst = b+w2; |
||
260 | |||
261 | i = 0; |
||
262 | for(; i |
||
263 | __asm__ volatile( |
||
264 | "movq (%1), %%mm2 \n\t" |
||
265 | "movq 8(%1), %%mm6 \n\t" |
||
266 | "paddw 2(%1), %%mm2 \n\t" |
||
267 | "paddw 10(%1), %%mm6 \n\t" |
||
268 | "movq (%0), %%mm0 \n\t" |
||
269 | "movq 8(%0), %%mm4 \n\t" |
||
270 | "psubw %%mm2, %%mm0 \n\t" |
||
271 | "psubw %%mm6, %%mm4 \n\t" |
||
272 | "movq %%mm0, (%0) \n\t" |
||
273 | "movq %%mm4, 8(%0) \n\t" |
||
274 | :: "r"(&dst[i]), "r"(&b[i]) |
||
275 | : "memory" |
||
276 | ); |
||
277 | } |
||
278 | snow_horizontal_compose_lift_lead_out(i, dst, dst, b, width, w_r, 1, W_CM, W_CO, W_CS); |
||
279 | } |
||
280 | |||
281 | { // Lift 2 |
||
282 | IDWTELEM * const ref = b+w2 - 1; |
||
283 | |||
284 | i = 1; |
||
285 | b[0] = b[0] + (((2 * ref[1] + W_BO) + 4 * b[0]) >> W_BS); |
||
286 | __asm__ volatile( |
||
287 | "psllw $15, %%mm7 \n\t" |
||
288 | "pcmpeqw %%mm6, %%mm6 \n\t" |
||
289 | "psrlw $13, %%mm6 \n\t" |
||
290 | "paddw %%mm7, %%mm6 \n\t" |
||
291 | ::); |
||
292 | for(; i |
||
293 | __asm__ volatile( |
||
294 | "movq (%1), %%mm0 \n\t" |
||
295 | "movq 8(%1), %%mm4 \n\t" |
||
296 | "movq 2(%1), %%mm1 \n\t" |
||
297 | "movq 10(%1), %%mm5 \n\t" |
||
298 | "paddw %%mm6, %%mm0 \n\t" |
||
299 | "paddw %%mm6, %%mm4 \n\t" |
||
300 | "paddw %%mm7, %%mm1 \n\t" |
||
301 | "paddw %%mm7, %%mm5 \n\t" |
||
302 | "pavgw %%mm1, %%mm0 \n\t" |
||
303 | "pavgw %%mm5, %%mm4 \n\t" |
||
304 | "psubw %%mm7, %%mm0 \n\t" |
||
305 | "psubw %%mm7, %%mm4 \n\t" |
||
306 | "psraw $1, %%mm0 \n\t" |
||
307 | "psraw $1, %%mm4 \n\t" |
||
308 | "movq (%0), %%mm1 \n\t" |
||
309 | "movq 8(%0), %%mm5 \n\t" |
||
310 | "paddw %%mm1, %%mm0 \n\t" |
||
311 | "paddw %%mm5, %%mm4 \n\t" |
||
312 | "psraw $2, %%mm0 \n\t" |
||
313 | "psraw $2, %%mm4 \n\t" |
||
314 | "paddw %%mm1, %%mm0 \n\t" |
||
315 | "paddw %%mm5, %%mm4 \n\t" |
||
316 | "movq %%mm0, (%0) \n\t" |
||
317 | "movq %%mm4, 8(%0) \n\t" |
||
318 | :: "r"(&b[i]), "r"(&ref[i]) |
||
319 | : "memory" |
||
320 | ); |
||
321 | } |
||
322 | snow_horizontal_compose_liftS_lead_out(i, b, b, ref, width, w_l); |
||
323 | } |
||
324 | |||
325 | { // Lift 3 |
||
326 | IDWTELEM * const src = b+w2; |
||
327 | i = 0; |
||
328 | |||
329 | for(; i |
||
330 | __asm__ volatile( |
||
331 | "movq 2(%1), %%mm2 \n\t" |
||
332 | "movq 10(%1), %%mm6 \n\t" |
||
333 | "paddw (%1), %%mm2 \n\t" |
||
334 | "paddw 8(%1), %%mm6 \n\t" |
||
335 | "movq (%0), %%mm0 \n\t" |
||
336 | "movq 8(%0), %%mm4 \n\t" |
||
337 | "paddw %%mm2, %%mm0 \n\t" |
||
338 | "paddw %%mm6, %%mm4 \n\t" |
||
339 | "psraw $1, %%mm2 \n\t" |
||
340 | "psraw $1, %%mm6 \n\t" |
||
341 | "paddw %%mm0, %%mm2 \n\t" |
||
342 | "paddw %%mm4, %%mm6 \n\t" |
||
343 | "movq %%mm2, (%2) \n\t" |
||
344 | "movq %%mm6, 8(%2) \n\t" |
||
345 | :: "r"(&src[i]), "r"(&b[i]), "r"(&temp[i]) |
||
346 | : "memory" |
||
347 | ); |
||
348 | } |
||
349 | snow_horizontal_compose_lift_lead_out(i, temp, src, b, width, w_r, 1, -W_AM, W_AO+1, W_AS); |
||
350 | } |
||
351 | |||
352 | { |
||
353 | snow_interleave_line_header(&i, width, b, temp); |
||
354 | |||
355 | for (; (i & 0x1E) != 0x1E; i-=2){ |
||
356 | b[i+1] = temp[i>>1]; |
||
357 | b[i] = b[i>>1]; |
||
358 | } |
||
359 | for (i-=30; i>=0; i-=32){ |
||
360 | __asm__ volatile( |
||
361 | "movq (%1), %%mm0 \n\t" |
||
362 | "movq 8(%1), %%mm2 \n\t" |
||
363 | "movq 16(%1), %%mm4 \n\t" |
||
364 | "movq 24(%1), %%mm6 \n\t" |
||
365 | "movq (%1), %%mm1 \n\t" |
||
366 | "movq 8(%1), %%mm3 \n\t" |
||
367 | "movq 16(%1), %%mm5 \n\t" |
||
368 | "movq 24(%1), %%mm7 \n\t" |
||
369 | "punpcklwd (%2), %%mm0 \n\t" |
||
370 | "punpcklwd 8(%2), %%mm2 \n\t" |
||
371 | "punpcklwd 16(%2), %%mm4 \n\t" |
||
372 | "punpcklwd 24(%2), %%mm6 \n\t" |
||
373 | "movq %%mm0, (%0) \n\t" |
||
374 | "movq %%mm2, 16(%0) \n\t" |
||
375 | "movq %%mm4, 32(%0) \n\t" |
||
376 | "movq %%mm6, 48(%0) \n\t" |
||
377 | "punpckhwd (%2), %%mm1 \n\t" |
||
378 | "punpckhwd 8(%2), %%mm3 \n\t" |
||
379 | "punpckhwd 16(%2), %%mm5 \n\t" |
||
380 | "punpckhwd 24(%2), %%mm7 \n\t" |
||
381 | "movq %%mm1, 8(%0) \n\t" |
||
382 | "movq %%mm3, 24(%0) \n\t" |
||
383 | "movq %%mm5, 40(%0) \n\t" |
||
384 | "movq %%mm7, 56(%0) \n\t" |
||
385 | :: "r"(&b[i]), "r"(&b[i>>1]), "r"(&temp[i>>1]) |
||
386 | : "memory" |
||
387 | ); |
||
388 | } |
||
389 | } |
||
390 | } |
||
391 | |||
392 | #if HAVE_7REGS |
||
/*
 * Building blocks for the vertical compose inline asm.
 *
 * Every macro expands to four adjacent string literals, i.e. four asm
 * instructions applying the same operation to four registers. All arguments
 * are string literals: register names are given without the '%' prefix,
 * pointer operands as "%N" operand references, shift counts as digits.
 * Memory operands are indexed by REG_d, which holds the byte offset into
 * the lines.
 */

/* "op" from 4 consecutive 16-byte slots at r + REG_d into t0..t3 */
#define snow_vertical_compose_sse2_load_add(op,r,t0,t1,t2,t3)\
        ""op" ("r",%%"REG_d"), %%"t0" \n\t"\
        ""op" 16("r",%%"REG_d"), %%"t1" \n\t"\
        ""op" 32("r",%%"REG_d"), %%"t2" \n\t"\
        ""op" 48("r",%%"REG_d"), %%"t3" \n\t"

/* aligned load of 64 bytes at r + REG_d into t0..t3 */
#define snow_vertical_compose_sse2_load(r,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_load_add("movdqa",r,t0,t1,t2,t3)

/* word-wise add of 64 bytes at r + REG_d to t0..t3 */
#define snow_vertical_compose_sse2_add(r,t0,t1,t2,t3)\
        snow_vertical_compose_sse2_load_add("paddw",r,t0,t1,t2,t3)

/* t_k -= s_k (words), register to register; shared with the MMX code */
#define snow_vertical_compose_r2r_sub(s0,s1,s2,s3,t0,t1,t2,t3)\
        "psubw %%"s0", %%"t0" \n\t"\
        "psubw %%"s1", %%"t1" \n\t"\
        "psubw %%"s2", %%"t2" \n\t"\
        "psubw %%"s3", %%"t3" \n\t"

/* aligned store of s0..s3 to 64 bytes at w + REG_d */
#define snow_vertical_compose_sse2_store(w,s0,s1,s2,s3)\
        "movdqa %%"s0", ("w",%%"REG_d") \n\t"\
        "movdqa %%"s1", 16("w",%%"REG_d") \n\t"\
        "movdqa %%"s2", 32("w",%%"REG_d") \n\t"\
        "movdqa %%"s3", 48("w",%%"REG_d") \n\t"

/* arithmetic right shift of the words in t0..t3 by n */
#define snow_vertical_compose_sra(n,t0,t1,t2,t3)\
        "psraw $"n", %%"t0" \n\t"\
        "psraw $"n", %%"t1" \n\t"\
        "psraw $"n", %%"t2" \n\t"\
        "psraw $"n", %%"t3" \n\t"

/* t_k += s_k (words), register to register */
#define snow_vertical_compose_r2r_add(s0,s1,s2,s3,t0,t1,t2,t3)\
        "paddw %%"s0", %%"t0" \n\t"\
        "paddw %%"s1", %%"t1" \n\t"\
        "paddw %%"s2", %%"t2" \n\t"\
        "paddw %%"s3", %%"t3" \n\t"

/* t_k = high 16 bits of the signed product t_k * s_k */
#define snow_vertical_compose_r2r_pmulhw(s0,s1,s2,s3,t0,t1,t2,t3)\
        "pmulhw %%"s0", %%"t0" \n\t"\
        "pmulhw %%"s1", %%"t1" \n\t"\
        "pmulhw %%"s2", %%"t2" \n\t"\
        "pmulhw %%"s3", %%"t3" \n\t"

/* t_k = s_k, register to register */
#define snow_vertical_compose_sse2_move(s0,s1,s2,s3,t0,t1,t2,t3)\
        "movdqa %%"s0", %%"t0" \n\t"\
        "movdqa %%"s1", %%"t1" \n\t"\
        "movdqa %%"s2", %%"t2" \n\t"\
        "movdqa %%"s3", %%"t3" \n\t"
||
440 | |||
441 | static void ff_snow_vertical_compose97i_sse2(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ |
||
442 | x86_reg i = width; |
||
443 | |||
444 | while(i & 0x1F) |
||
445 | { |
||
446 | i--; |
||
447 | b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; |
||
448 | b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; |
||
449 | b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; |
||
450 | b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; |
||
451 | } |
||
452 | i+=i; |
||
453 | |||
454 | __asm__ volatile ( |
||
455 | "jmp 2f \n\t" |
||
456 | "1: \n\t" |
||
457 | snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") |
||
458 | snow_vertical_compose_sse2_add("%6","xmm0","xmm2","xmm4","xmm6") |
||
459 | |||
460 | |||
461 | "pcmpeqw %%xmm0, %%xmm0 \n\t" |
||
462 | "pcmpeqw %%xmm2, %%xmm2 \n\t" |
||
463 | "paddw %%xmm2, %%xmm2 \n\t" |
||
464 | "paddw %%xmm0, %%xmm2 \n\t" |
||
465 | "psllw $13, %%xmm2 \n\t" |
||
466 | snow_vertical_compose_r2r_add("xmm0","xmm0","xmm0","xmm0","xmm1","xmm3","xmm5","xmm7") |
||
467 | snow_vertical_compose_r2r_pmulhw("xmm2","xmm2","xmm2","xmm2","xmm1","xmm3","xmm5","xmm7") |
||
468 | snow_vertical_compose_sse2_add("%5","xmm1","xmm3","xmm5","xmm7") |
||
469 | snow_vertical_compose_sse2_store("%5","xmm1","xmm3","xmm5","xmm7") |
||
470 | snow_vertical_compose_sse2_load("%4","xmm0","xmm2","xmm4","xmm6") |
||
471 | snow_vertical_compose_sse2_add("%3","xmm1","xmm3","xmm5","xmm7") |
||
472 | snow_vertical_compose_r2r_sub("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") |
||
473 | snow_vertical_compose_sse2_store("%4","xmm0","xmm2","xmm4","xmm6") |
||
474 | |||
475 | "pcmpeqw %%xmm7, %%xmm7 \n\t" |
||
476 | "pcmpeqw %%xmm5, %%xmm5 \n\t" |
||
477 | "psllw $15, %%xmm7 \n\t" |
||
478 | "psrlw $13, %%xmm5 \n\t" |
||
479 | "paddw %%xmm7, %%xmm5 \n\t" |
||
480 | snow_vertical_compose_r2r_add("xmm5","xmm5","xmm5","xmm5","xmm0","xmm2","xmm4","xmm6") |
||
481 | "movq (%2,%%"REG_d"), %%xmm1 \n\t" |
||
482 | "movq 8(%2,%%"REG_d"), %%xmm3 \n\t" |
||
483 | "paddw %%xmm7, %%xmm1 \n\t" |
||
484 | "paddw %%xmm7, %%xmm3 \n\t" |
||
485 | "pavgw %%xmm1, %%xmm0 \n\t" |
||
486 | "pavgw %%xmm3, %%xmm2 \n\t" |
||
487 | "movq 16(%2,%%"REG_d"), %%xmm1 \n\t" |
||
488 | "movq 24(%2,%%"REG_d"), %%xmm3 \n\t" |
||
489 | "paddw %%xmm7, %%xmm1 \n\t" |
||
490 | "paddw %%xmm7, %%xmm3 \n\t" |
||
491 | "pavgw %%xmm1, %%xmm4 \n\t" |
||
492 | "pavgw %%xmm3, %%xmm6 \n\t" |
||
493 | snow_vertical_compose_r2r_sub("xmm7","xmm7","xmm7","xmm7","xmm0","xmm2","xmm4","xmm6") |
||
494 | snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") |
||
495 | snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") |
||
496 | |||
497 | snow_vertical_compose_sra("2","xmm0","xmm2","xmm4","xmm6") |
||
498 | snow_vertical_compose_sse2_add("%3","xmm0","xmm2","xmm4","xmm6") |
||
499 | snow_vertical_compose_sse2_store("%3","xmm0","xmm2","xmm4","xmm6") |
||
500 | snow_vertical_compose_sse2_add("%1","xmm0","xmm2","xmm4","xmm6") |
||
501 | snow_vertical_compose_sse2_move("xmm0","xmm2","xmm4","xmm6","xmm1","xmm3","xmm5","xmm7") |
||
502 | snow_vertical_compose_sra("1","xmm0","xmm2","xmm4","xmm6") |
||
503 | snow_vertical_compose_r2r_add("xmm1","xmm3","xmm5","xmm7","xmm0","xmm2","xmm4","xmm6") |
||
504 | snow_vertical_compose_sse2_add("%2","xmm0","xmm2","xmm4","xmm6") |
||
505 | snow_vertical_compose_sse2_store("%2","xmm0","xmm2","xmm4","xmm6") |
||
506 | |||
507 | "2: \n\t" |
||
508 | "sub $64, %%"REG_d" \n\t" |
||
509 | "jge 1b \n\t" |
||
510 | :"+d"(i) |
||
511 | :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); |
||
512 | } |
||
513 | |||
/*
 * MMX counterparts of the vertical compose asm building blocks.
 *
 * Every macro expands to four adjacent string literals, i.e. four asm
 * instructions working on four 8-byte registers. All arguments are string
 * literals: register names without the '%' prefix, pointer operands as
 * "%N" operand references. Memory operands are indexed by REG_d.
 */

/* "op" from 4 consecutive 8-byte slots at r + REG_d into t0..t3 */
#define snow_vertical_compose_mmx_load_add(op,r,t0,t1,t2,t3)\
        ""op" ("r",%%"REG_d"), %%"t0" \n\t"\
        ""op" 8("r",%%"REG_d"), %%"t1" \n\t"\
        ""op" 16("r",%%"REG_d"), %%"t2" \n\t"\
        ""op" 24("r",%%"REG_d"), %%"t3" \n\t"

/* load 32 bytes at r + REG_d into t0..t3 */
#define snow_vertical_compose_mmx_load(r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_load_add("movq",r,t0,t1,t2,t3)

/* word-wise add of 32 bytes at r + REG_d to t0..t3 */
#define snow_vertical_compose_mmx_add(r,t0,t1,t2,t3)\
        snow_vertical_compose_mmx_load_add("paddw",r,t0,t1,t2,t3)

/* store s0..s3 to 32 bytes at w + REG_d */
#define snow_vertical_compose_mmx_store(w,s0,s1,s2,s3)\
        "movq %%"s0", ("w",%%"REG_d") \n\t"\
        "movq %%"s1", 8("w",%%"REG_d") \n\t"\
        "movq %%"s2", 16("w",%%"REG_d") \n\t"\
        "movq %%"s3", 24("w",%%"REG_d") \n\t"

/* t_k = s_k, register to register */
#define snow_vertical_compose_mmx_move(s0,s1,s2,s3,t0,t1,t2,t3)\
        "movq %%"s0", %%"t0" \n\t"\
        "movq %%"s1", %%"t1" \n\t"\
        "movq %%"s2", %%"t2" \n\t"\
        "movq %%"s3", %%"t3" \n\t"
||
537 | |||
538 | |||
539 | static void ff_snow_vertical_compose97i_mmx(IDWTELEM *b0, IDWTELEM *b1, IDWTELEM *b2, IDWTELEM *b3, IDWTELEM *b4, IDWTELEM *b5, int width){ |
||
540 | x86_reg i = width; |
||
541 | while(i & 15) |
||
542 | { |
||
543 | i--; |
||
544 | b4[i] -= (W_DM*(b3[i] + b5[i])+W_DO)>>W_DS; |
||
545 | b3[i] -= (W_CM*(b2[i] + b4[i])+W_CO)>>W_CS; |
||
546 | b2[i] += (W_BM*(b1[i] + b3[i])+4*b2[i]+W_BO)>>W_BS; |
||
547 | b1[i] += (W_AM*(b0[i] + b2[i])+W_AO)>>W_AS; |
||
548 | } |
||
549 | i+=i; |
||
550 | __asm__ volatile( |
||
551 | "jmp 2f \n\t" |
||
552 | "1: \n\t" |
||
553 | |||
554 | snow_vertical_compose_mmx_load("%4","mm1","mm3","mm5","mm7") |
||
555 | snow_vertical_compose_mmx_add("%6","mm1","mm3","mm5","mm7") |
||
556 | "pcmpeqw %%mm0, %%mm0 \n\t" |
||
557 | "pcmpeqw %%mm2, %%mm2 \n\t" |
||
558 | "paddw %%mm2, %%mm2 \n\t" |
||
559 | "paddw %%mm0, %%mm2 \n\t" |
||
560 | "psllw $13, %%mm2 \n\t" |
||
561 | snow_vertical_compose_r2r_add("mm0","mm0","mm0","mm0","mm1","mm3","mm5","mm7") |
||
562 | snow_vertical_compose_r2r_pmulhw("mm2","mm2","mm2","mm2","mm1","mm3","mm5","mm7") |
||
563 | snow_vertical_compose_mmx_add("%5","mm1","mm3","mm5","mm7") |
||
564 | snow_vertical_compose_mmx_store("%5","mm1","mm3","mm5","mm7") |
||
565 | snow_vertical_compose_mmx_load("%4","mm0","mm2","mm4","mm6") |
||
566 | snow_vertical_compose_mmx_add("%3","mm1","mm3","mm5","mm7") |
||
567 | snow_vertical_compose_r2r_sub("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") |
||
568 | snow_vertical_compose_mmx_store("%4","mm0","mm2","mm4","mm6") |
||
569 | "pcmpeqw %%mm7, %%mm7 \n\t" |
||
570 | "pcmpeqw %%mm5, %%mm5 \n\t" |
||
571 | "psllw $15, %%mm7 \n\t" |
||
572 | "psrlw $13, %%mm5 \n\t" |
||
573 | "paddw %%mm7, %%mm5 \n\t" |
||
574 | snow_vertical_compose_r2r_add("mm5","mm5","mm5","mm5","mm0","mm2","mm4","mm6") |
||
575 | "movq (%2,%%"REG_d"), %%mm1 \n\t" |
||
576 | "movq 8(%2,%%"REG_d"), %%mm3 \n\t" |
||
577 | "paddw %%mm7, %%mm1 \n\t" |
||
578 | "paddw %%mm7, %%mm3 \n\t" |
||
579 | "pavgw %%mm1, %%mm0 \n\t" |
||
580 | "pavgw %%mm3, %%mm2 \n\t" |
||
581 | "movq 16(%2,%%"REG_d"), %%mm1 \n\t" |
||
582 | "movq 24(%2,%%"REG_d"), %%mm3 \n\t" |
||
583 | "paddw %%mm7, %%mm1 \n\t" |
||
584 | "paddw %%mm7, %%mm3 \n\t" |
||
585 | "pavgw %%mm1, %%mm4 \n\t" |
||
586 | "pavgw %%mm3, %%mm6 \n\t" |
||
587 | snow_vertical_compose_r2r_sub("mm7","mm7","mm7","mm7","mm0","mm2","mm4","mm6") |
||
588 | snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") |
||
589 | snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") |
||
590 | |||
591 | snow_vertical_compose_sra("2","mm0","mm2","mm4","mm6") |
||
592 | snow_vertical_compose_mmx_add("%3","mm0","mm2","mm4","mm6") |
||
593 | snow_vertical_compose_mmx_store("%3","mm0","mm2","mm4","mm6") |
||
594 | snow_vertical_compose_mmx_add("%1","mm0","mm2","mm4","mm6") |
||
595 | snow_vertical_compose_mmx_move("mm0","mm2","mm4","mm6","mm1","mm3","mm5","mm7") |
||
596 | snow_vertical_compose_sra("1","mm0","mm2","mm4","mm6") |
||
597 | snow_vertical_compose_r2r_add("mm1","mm3","mm5","mm7","mm0","mm2","mm4","mm6") |
||
598 | snow_vertical_compose_mmx_add("%2","mm0","mm2","mm4","mm6") |
||
599 | snow_vertical_compose_mmx_store("%2","mm0","mm2","mm4","mm6") |
||
600 | |||
601 | "2: \n\t" |
||
602 | "sub $32, %%"REG_d" \n\t" |
||
603 | "jge 1b \n\t" |
||
604 | :"+d"(i) |
||
605 | :"r"(b0),"r"(b1),"r"(b2),"r"(b3),"r"(b4),"r"(b5)); |
||
606 | } |
||
607 | #endif //HAVE_7REGS |
||
608 | |||
/*
 * Pieces of the SSE2 inner_add_yblock functions. A function body is built
 * as  header  start  accum  accum  accum  <mix code>  end_8 / end_16.
 * The header opens the __asm__ statement and end_common2 closes it.
 *
 * Asm operands (declared in end_common2):
 *   %0 dst8 (memory)        %1 dst_array (memory)   %2 tmp (row counter)
 *   %3 src_x<<1             %4 obmc                 %5 block, held in REG_a
 *   %6 b_h                  %7 src_stride
 * REG_c holds src_stride, REG_S walks the obmc table, REG_D points into the
 * current destination line, REG_d is scratch.
 */

/* Declares the locals, opens the asm, loads the loop-invariant registers
 * and, at label 1, points REG_D at dst_array[0] + (src_x<<1). */
#define snow_inner_add_yblock_sse2_header \
    IDWTELEM * * dst_array = sb->line + src_y;\
    x86_reg tmp;\
    __asm__ volatile(\
        "mov %7, %%"REG_c" \n\t"\
        "mov %6, %2 \n\t"\
        "mov %4, %%"REG_S" \n\t"\
        "pxor %%xmm7, %%xmm7 \n\t" /* 0 */\
        "pcmpeqd %%xmm3, %%xmm3 \n\t"\
        "psllw $15, %%xmm3 \n\t"\
        "psrlw $12, %%xmm3 \n\t" /* FRAC_BITS >> 1 */\
        "1: \n\t"\
        "mov %1, %%"REG_D" \n\t"\
        "mov (%%"REG_D"), %%"REG_D" \n\t"\
        "add %3, %%"REG_D" \n\t"

/* 8 pixels from two consecutive rows of block[ptr_offset], each multiplied
 * by 8 obmc weights (second row's weights at s_offset+16). */
#define snow_inner_add_yblock_sse2_start_8(out_reg1, out_reg2, ptr_offset, s_offset)\
        "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
        "movq (%%"REG_d"), %%"out_reg1" \n\t"\
        "movq (%%"REG_d", %%"REG_c"), %%"out_reg2" \n\t"\
        "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
        "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
        "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
        "movq "s_offset"+16(%%"REG_S"), %%xmm4 \n\t"\
        "punpcklbw %%xmm7, %%xmm0 \n\t"\
        "punpcklbw %%xmm7, %%xmm4 \n\t"\
        "pmullw %%xmm0, %%"out_reg1" \n\t"\
        "pmullw %%xmm4, %%"out_reg2" \n\t"

/* 16 pixels from one row of block[ptr_offset] (two 8-pixel halves), each
 * multiplied by the matching obmc weights at s_offset / s_offset+8. */
#define snow_inner_add_yblock_sse2_start_16(out_reg1, out_reg2, ptr_offset, s_offset)\
        "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t"\
        "movq (%%"REG_d"), %%"out_reg1" \n\t"\
        "movq 8(%%"REG_d"), %%"out_reg2" \n\t"\
        "punpcklbw %%xmm7, %%"out_reg1" \n\t"\
        "punpcklbw %%xmm7, %%"out_reg2" \n\t"\
        "movq "s_offset"(%%"REG_S"), %%xmm0 \n\t"\
        "movq "s_offset"+8(%%"REG_S"), %%xmm4 \n\t"\
        "punpcklbw %%xmm7, %%xmm0 \n\t"\
        "punpcklbw %%xmm7, %%xmm4 \n\t"\
        "pmullw %%xmm0, %%"out_reg1" \n\t"\
        "pmullw %%xmm4, %%"out_reg2" \n\t"

/* start_8 into xmm2/xmm6, then accumulate into xmm1/xmm5 */
#define snow_inner_add_yblock_sse2_accum_8(ptr_offset, s_offset) \
    snow_inner_add_yblock_sse2_start_8("xmm2", "xmm6", ptr_offset, s_offset)\
        "paddusw %%xmm2, %%xmm1 \n\t"\
        "paddusw %%xmm6, %%xmm5 \n\t"

/* start_16 into xmm2/xmm6, then accumulate into xmm1/xmm5 */
#define snow_inner_add_yblock_sse2_accum_16(ptr_offset, s_offset) \
    snow_inner_add_yblock_sse2_start_16("xmm2", "xmm6", ptr_offset, s_offset)\
        "paddusw %%xmm2, %%xmm1 \n\t"\
        "paddusw %%xmm6, %%xmm5 \n\t"

/* Advance the obmc pointer by 32 bytes and dst8 plus the four block[]
 * pointers (in memory) by REG_c. */
#define snow_inner_add_yblock_sse2_end_common1\
        "add $32, %%"REG_S" \n\t"\
        "add %%"REG_c", %0 \n\t"\
        "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t"\
        "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\
        "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\
        "add %%"REG_c", (%%"REG_a") \n\t"

/* Loop back while the row counter is non-zero, then close the asm
 * statement. The asm stores to the block[] array and to the pixels behind
 * dst8, hence the "memory" clobber. */
#define snow_inner_add_yblock_sse2_end_common2\
        "jnz 1b \n\t"\
        :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\
        :\
        "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\
        "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d"","memory");

/* Two rows per iteration: REG_c is doubled around the pointer updates,
 * dst_array advances by two entries and the counter drops by 2. */
#define snow_inner_add_yblock_sse2_end_8\
        "sal $1, %%"REG_c" \n\t"\
        "add"OPSIZE" $"PTR_SIZE"*2, %1 \n\t"\
        snow_inner_add_yblock_sse2_end_common1\
        "sar $1, %%"REG_c" \n\t"\
        "sub $2, %2 \n\t"\
        snow_inner_add_yblock_sse2_end_common2

/* One row per iteration: dst_array advances by one entry and the counter
 * drops by 1. */
#define snow_inner_add_yblock_sse2_end_16\
        "add"OPSIZE" $"PTR_SIZE"*1, %1 \n\t"\
        snow_inner_add_yblock_sse2_end_common1\
        "dec %2 \n\t"\
        snow_inner_add_yblock_sse2_end_common2
689 | |||
690 | static void inner_add_yblock_bw_8_obmc_16_bh_even_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, |
||
691 | int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
||
692 | snow_inner_add_yblock_sse2_header |
||
693 | snow_inner_add_yblock_sse2_start_8("xmm1", "xmm5", "3", "0") |
||
694 | snow_inner_add_yblock_sse2_accum_8("2", "8") |
||
695 | snow_inner_add_yblock_sse2_accum_8("1", "128") |
||
696 | snow_inner_add_yblock_sse2_accum_8("0", "136") |
||
697 | |||
698 | "mov %0, %%"REG_d" \n\t" |
||
699 | "movdqa (%%"REG_D"), %%xmm0 \n\t" |
||
700 | "movdqa %%xmm1, %%xmm2 \n\t" |
||
701 | |||
702 | "punpckhwd %%xmm7, %%xmm1 \n\t" |
||
703 | "punpcklwd %%xmm7, %%xmm2 \n\t" |
||
704 | "paddd %%xmm2, %%xmm0 \n\t" |
||
705 | "movdqa 16(%%"REG_D"), %%xmm2 \n\t" |
||
706 | "paddd %%xmm1, %%xmm2 \n\t" |
||
707 | "paddd %%xmm3, %%xmm0 \n\t" |
||
708 | "paddd %%xmm3, %%xmm2 \n\t" |
||
709 | |||
710 | "mov %1, %%"REG_D" \n\t" |
||
711 | "mov "PTR_SIZE"(%%"REG_D"), %%"REG_D";\n\t" |
||
712 | "add %3, %%"REG_D" \n\t" |
||
713 | |||
714 | "movdqa (%%"REG_D"), %%xmm4 \n\t" |
||
715 | "movdqa %%xmm5, %%xmm6 \n\t" |
||
716 | "punpckhwd %%xmm7, %%xmm5 \n\t" |
||
717 | "punpcklwd %%xmm7, %%xmm6 \n\t" |
||
718 | "paddd %%xmm6, %%xmm4 \n\t" |
||
719 | "movdqa 16(%%"REG_D"), %%xmm6 \n\t" |
||
720 | "paddd %%xmm5, %%xmm6 \n\t" |
||
721 | "paddd %%xmm3, %%xmm4 \n\t" |
||
722 | "paddd %%xmm3, %%xmm6 \n\t" |
||
723 | |||
724 | "psrad $8, %%xmm0 \n\t" /* FRAC_BITS. */ |
||
725 | "psrad $8, %%xmm2 \n\t" /* FRAC_BITS. */ |
||
726 | "packssdw %%xmm2, %%xmm0 \n\t" |
||
727 | "packuswb %%xmm7, %%xmm0 \n\t" |
||
728 | "movq %%xmm0, (%%"REG_d") \n\t" |
||
729 | |||
730 | "psrad $8, %%xmm4 \n\t" /* FRAC_BITS. */ |
||
731 | "psrad $8, %%xmm6 \n\t" /* FRAC_BITS. */ |
||
732 | "packssdw %%xmm6, %%xmm4 \n\t" |
||
733 | "packuswb %%xmm7, %%xmm4 \n\t" |
||
734 | "movq %%xmm4, (%%"REG_d",%%"REG_c");\n\t" |
||
735 | snow_inner_add_yblock_sse2_end_8 |
||
736 | } |
||
737 | |||
738 | static void inner_add_yblock_bw_16_obmc_32_sse2(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, |
||
739 | int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
||
740 | snow_inner_add_yblock_sse2_header |
||
741 | snow_inner_add_yblock_sse2_start_16("xmm1", "xmm5", "3", "0") |
||
742 | snow_inner_add_yblock_sse2_accum_16("2", "16") |
||
743 | snow_inner_add_yblock_sse2_accum_16("1", "512") |
||
744 | snow_inner_add_yblock_sse2_accum_16("0", "528") |
||
745 | |||
746 | "mov %0, %%"REG_d" \n\t" |
||
747 | "psrlw $4, %%xmm1 \n\t" |
||
748 | "psrlw $4, %%xmm5 \n\t" |
||
749 | "paddw (%%"REG_D"), %%xmm1 \n\t" |
||
750 | "paddw 16(%%"REG_D"), %%xmm5 \n\t" |
||
751 | "paddw %%xmm3, %%xmm1 \n\t" |
||
752 | "paddw %%xmm3, %%xmm5 \n\t" |
||
753 | "psraw $4, %%xmm1 \n\t" /* FRAC_BITS. */ |
||
754 | "psraw $4, %%xmm5 \n\t" /* FRAC_BITS. */ |
||
755 | "packuswb %%xmm5, %%xmm1 \n\t" |
||
756 | |||
757 | "movdqu %%xmm1, (%%"REG_d") \n\t" |
||
758 | |||
759 | snow_inner_add_yblock_sse2_end_16 |
||
760 | } |
||
761 | |||
762 | #define snow_inner_add_yblock_mmx_header \ |
||
763 | IDWTELEM * * dst_array = sb->line + src_y;\ |
||
764 | x86_reg tmp;\ |
||
765 | __asm__ volatile(\ |
||
766 | "mov %7, %%"REG_c" \n\t"\ |
||
767 | "mov %6, %2 \n\t"\ |
||
768 | "mov %4, %%"REG_S" \n\t"\ |
||
769 | "pxor %%mm7, %%mm7 \n\t" /* 0 */\ |
||
770 | "pcmpeqd %%mm3, %%mm3 \n\t"\ |
||
771 | "psllw $15, %%mm3 \n\t"\ |
||
772 | "psrlw $12, %%mm3 \n\t" /* FRAC_BITS >> 1 */\ |
||
773 | "1: \n\t"\ |
||
774 | "mov %1, %%"REG_D" \n\t"\ |
||
775 | "mov (%%"REG_D"), %%"REG_D" \n\t"\ |
||
776 | "add %3, %%"REG_D" \n\t" |
||
777 | |||
/*
 * Asm fragment: load 8 bytes through one of the block[] pointers, load 8
 * bytes of obmc data, widen both to 16-bit lanes and multiply them.
 *
 *   out_reg1, out_reg2  MMX register names (strings) receiving the products
 *                       of bytes 0-3 and bytes 4-7 respectively
 *   ptr_offset          index (string) of the pointer read from the array
 *                       addressed by REG_a
 *   s_offset            byte offset (string) added to REG_S for the obmc loads
 *   d_offset            byte offset (string) added to the loaded block pointer
 *
 * Relies on state set up by the surrounding macros: REG_a is bound to `block`
 * and REG_S/mm7 are initialised in snow_inner_add_yblock_mmx_header (obmc
 * pointer and zero). Overwrites REG_d, mm0 and mm4.
 */
778 | #define snow_inner_add_yblock_mmx_start(out_reg1, out_reg2, ptr_offset, s_offset, d_offset)\ |
||
779 | "mov "PTR_SIZE"*"ptr_offset"(%%"REG_a"), %%"REG_d"; \n\t" /* REG_d = pointer number ptr_offset of the array at REG_a */\ |
||
780 | "movd "d_offset"(%%"REG_d"), %%"out_reg1" \n\t" /* bytes 0-3 */\ |
||
781 | "movd "d_offset"+4(%%"REG_d"), %%"out_reg2" \n\t" /* bytes 4-7 */\ |
||
782 | "punpcklbw %%mm7, %%"out_reg1" \n\t" /* interleave with zero: bytes -> 16-bit lanes */\ |
||
783 | "punpcklbw %%mm7, %%"out_reg2" \n\t"\ |
||
784 | "movd "s_offset"(%%"REG_S"), %%mm0 \n\t" /* 4 obmc bytes */\ |
||
785 | "movd "s_offset"+4(%%"REG_S"), %%mm4 \n\t" /* next 4 obmc bytes */\ |
||
786 | "punpcklbw %%mm7, %%mm0 \n\t"\ |
||
787 | "punpcklbw %%mm7, %%mm4 \n\t"\ |
||
788 | "pmullw %%mm0, %%"out_reg1" \n\t" /* lane-wise product, low 16 bits kept */\ |
||
789 | "pmullw %%mm4, %%"out_reg2" \n\t" |
||
790 | |||
/*
 * Asm fragment: compute one more set of products with
 * snow_inner_add_yblock_mmx_start into the scratch registers mm2/mm6, then
 * add them with unsigned saturation (paddusw) into the running sums held in
 * mm1 (lanes 0-3) and mm5 (lanes 4-7).
 *
 * Parameters have the same meaning as the like-named parameters of
 * snow_inner_add_yblock_mmx_start.
 */
791 | #define snow_inner_add_yblock_mmx_accum(ptr_offset, s_offset, d_offset) \ |
||
792 | snow_inner_add_yblock_mmx_start("mm2", "mm6", ptr_offset, s_offset, d_offset)\ |
||
793 | "paddusw %%mm2, %%mm1 \n\t"\ |
||
794 | "paddusw %%mm6, %%mm5 \n\t" |
||
795 | |||
/*
 * Asm fragment: combine the accumulated sums in mm1/mm5 with eight 16-bit
 * values read relative to REG_D, round, shift, and store eight bytes
 * relative to the pointer held in operand %0.
 *
 *   read_offset   byte offset (string) from REG_D of the first 16-bit value
 *   write_offset  byte offset (string) from operand %0 of the 8 output bytes
 *
 * mm3 supplies the per-lane rounding term prepared by
 * snow_inner_add_yblock_mmx_header. Overwrites REG_d, mm1 and mm5.
 */
796 | #define snow_inner_add_yblock_mmx_mix(read_offset, write_offset)\ |
||
797 | "mov %0, %%"REG_d" \n\t" /* REG_d = operand %0 (output pointer) */\ |
||
798 | "psrlw $4, %%mm1 \n\t" /* logical shift of the sums right by 4 */\ |
||
799 | "psrlw $4, %%mm5 \n\t"\ |
||
800 | "paddw "read_offset"(%%"REG_D"), %%mm1 \n\t" /* add 4 words read at REG_D + read_offset */\ |
||
801 | "paddw "read_offset"+8(%%"REG_D"), %%mm5 \n\t" /* add the following 4 words */\ |
||
802 | "paddw %%mm3, %%mm1 \n\t" /* add the rounding term */\ |
||
803 | "paddw %%mm3, %%mm5 \n\t"\ |
||
804 | "psraw $4, %%mm1 \n\t" /* arithmetic shift right by 4 */\ |
||
805 | "psraw $4, %%mm5 \n\t"\ |
||
806 | "packuswb %%mm5, %%mm1 \n\t" /* pack 8 words to 8 bytes with unsigned saturation */\ |
||
807 | "movq %%mm1, "write_offset"(%%"REG_d") \n\t" /* store the 8 result bytes */ |
||
808 | |||
/*
 * Closing half of the MMX inner_add_yblock inline-asm statement opened by
 * snow_inner_add_yblock_mmx_header.
 *
 * Emits the per-row pointer updates and the loop branch, then the operand and
 * clobber lists and the closing ");".
 *
 *   s_step  number of bytes (string) added to REG_S after each row
 *
 * Operand numbering defined here and used by the other fragments:
 *   %0 dst8, %1 dst_array, %2 tmp, %3 (x86_reg)(src_x<<1), %4 obmc,
 *   %5 block (pinned to REG_a by the "a" constraint), %6 b_h, %7 src_stride.
 *
 * Side effects visible in this macro: the four pointers stored in block[0..3]
 * are advanced in place, and dst8 and dst_array are modified ("+m" operands).
 *
 * NOTE(review): the clobber list names only REG_c, REG_S, REG_D and REG_d;
 * no MMX register and no "memory" clobber is declared although the asm
 * writes through pointers -- confirm this is intentional.
 */
809 | #define snow_inner_add_yblock_mmx_end(s_step)\ |
||
810 | "add $"s_step", %%"REG_S" \n\t" /* advance REG_S by s_step bytes */\ |
||
811 | "add %%"REG_c", "PTR_SIZE"*3(%%"REG_a");\n\t" /* pointer 3 of the array at REG_a += REG_c */\ |
||
812 | "add %%"REG_c", "PTR_SIZE"*2(%%"REG_a");\n\t"\ |
||
813 | "add %%"REG_c", "PTR_SIZE"*1(%%"REG_a");\n\t"\ |
||
814 | "add %%"REG_c", (%%"REG_a") \n\t"\ |
||
815 | "add"OPSIZE " $"PTR_SIZE"*1, %1 \n\t" /* dst_array advances by one pointer */\ |
||
816 | "add %%"REG_c", %0 \n\t" /* dst8 += REG_c */\ |
||
817 | "dec %2 \n\t" /* one row done */\ |
||
818 | "jnz 1b \n\t" /* loop until the counter reaches zero */\ |
||
819 | :"+m"(dst8),"+m"(dst_array),"=&r"(tmp)\ |
||
820 | :\ |
||
821 | "rm"((x86_reg)(src_x<<1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\ |
||
822 | "%"REG_c"","%"REG_S"","%"REG_D"","%"REG_d""); |
||
823 | |||
/*
 * MMX inner_add_yblock kernel that produces 8 output bytes per row.
 *
 * The whole body is one inline-asm statement assembled from the
 * snow_inner_add_yblock_mmx_* macros; the order of the invocations is
 * significant. For each of b_h rows it multiplies 8 bytes from each of
 * block[3], block[2], block[1], block[0] by 8 obmc bytes taken at byte
 * offsets 0, 8, 128 and 136 from the current obmc position, sums the
 * products, mixes the sum with 8 values of the current slice-buffer line and
 * stores 8 bytes at dst8. The obmc position advances 16 bytes per row.
 *
 * obmc_stride, b_w and add are not referenced by the macro expansions shown
 * in this file. The block[] pointers are advanced in place by the asm.
 */
824 | static void inner_add_yblock_bw_8_obmc_16_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, |
||
825 | int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
||
826 | snow_inner_add_yblock_mmx_header |
||
827 | snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") /* sums start from block[3] */ |
||
828 | snow_inner_add_yblock_mmx_accum("2", "8", "0") |
||
829 | snow_inner_add_yblock_mmx_accum("1", "128", "0") |
||
830 | snow_inner_add_yblock_mmx_accum("0", "136", "0") |
||
831 | snow_inner_add_yblock_mmx_mix("0", "0") |
||
832 | snow_inner_add_yblock_mmx_end("16") /* 16 obmc bytes per row */ |
||
833 | } |
||
834 | |||
/*
 * MMX inner_add_yblock kernel that produces 16 output bytes per row, as two
 * passes of 8.
 *
 * The whole body is one inline-asm statement assembled from the
 * snow_inner_add_yblock_mmx_* macros; the order of the invocations is
 * significant. The first pass uses obmc byte offsets 0, 16, 512 and 528 for
 * block[3], block[2], block[1], block[0] and writes dst8[0..7]. The second
 * pass uses the same offsets plus 8, reads the block data 8 bytes further
 * on, reads the slice-buffer line 16 bytes further on and writes
 * dst8[8..15]. The obmc position advances 32 bytes per row.
 *
 * obmc_stride, b_w and add are not referenced by the macro expansions shown
 * in this file. The block[] pointers are advanced in place by the asm.
 */
835 | static void inner_add_yblock_bw_16_obmc_32_mmx(const uint8_t *obmc, const x86_reg obmc_stride, uint8_t * * block, int b_w, x86_reg b_h, |
||
836 | int src_x, int src_y, x86_reg src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
||
837 | snow_inner_add_yblock_mmx_header |
||
838 | snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "0", "0") /* first 8 bytes of the row */ |
||
839 | snow_inner_add_yblock_mmx_accum("2", "16", "0") |
||
840 | snow_inner_add_yblock_mmx_accum("1", "512", "0") |
||
841 | snow_inner_add_yblock_mmx_accum("0", "528", "0") |
||
842 | snow_inner_add_yblock_mmx_mix("0", "0") |
||
843 | |||
844 | snow_inner_add_yblock_mmx_start("mm1", "mm5", "3", "8", "8") /* second 8 bytes of the row */ |
||
845 | snow_inner_add_yblock_mmx_accum("2", "24", "8") |
||
846 | snow_inner_add_yblock_mmx_accum("1", "520", "8") |
||
847 | snow_inner_add_yblock_mmx_accum("0", "536", "8") |
||
848 | snow_inner_add_yblock_mmx_mix("16", "8") |
||
849 | snow_inner_add_yblock_mmx_end("32") /* 32 obmc bytes per row */ |
||
850 | } |
||
851 | |||
852 | static void ff_snow_inner_add_yblock_sse2(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, |
||
853 | int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
||
854 | |||
855 | if (b_w == 16) |
||
856 | inner_add_yblock_bw_16_obmc_32_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
||
857 | else if (b_w == 8 && obmc_stride == 16) { |
||
858 | if (!(b_h & 1)) |
||
859 | inner_add_yblock_bw_8_obmc_16_bh_even_sse2(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
||
860 | else |
||
861 | inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
||
862 | } else |
||
863 | ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
||
864 | } |
||
865 | |||
866 | static void ff_snow_inner_add_yblock_mmx(const uint8_t *obmc, const int obmc_stride, uint8_t * * block, int b_w, int b_h, |
||
867 | int src_x, int src_y, int src_stride, slice_buffer * sb, int add, uint8_t * dst8){ |
||
868 | if (b_w == 16) |
||
869 | inner_add_yblock_bw_16_obmc_32_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
||
870 | else if (b_w == 8 && obmc_stride == 16) |
||
871 | inner_add_yblock_bw_8_obmc_16_mmx(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
||
872 | else |
||
873 | ff_snow_inner_add_yblock(obmc, obmc_stride, block, b_w, b_h, src_x,src_y, src_stride, sb, add, dst8); |
||
874 | } |
||
875 | |||
876 | #endif /* HAVE_INLINE_ASM */ |
||
877 | |||
878 | void ff_dwt_init_x86(SnowDWTContext *c) |
||
879 | { |
||
880 | #if HAVE_INLINE_ASM |
||
881 | int mm_flags = av_get_cpu_flags(); |
||
882 | |||
883 | if (mm_flags & AV_CPU_FLAG_MMX) { |
||
884 | if(mm_flags & AV_CPU_FLAG_SSE2 & 0){ |
||
885 | c->horizontal_compose97i = ff_snow_horizontal_compose97i_sse2; |
||
886 | #if HAVE_7REGS |
||
887 | c->vertical_compose97i = ff_snow_vertical_compose97i_sse2; |
||
888 | #endif |
||
889 | c->inner_add_yblock = ff_snow_inner_add_yblock_sse2; |
||
890 | } |
||
891 | else{ |
||
892 | if (mm_flags & AV_CPU_FLAG_MMXEXT) { |
||
893 | c->horizontal_compose97i = ff_snow_horizontal_compose97i_mmx; |
||
894 | #if HAVE_7REGS |
||
895 | c->vertical_compose97i = ff_snow_vertical_compose97i_mmx; |
||
896 | #endif |
||
897 | } |
||
898 | c->inner_add_yblock = ff_snow_inner_add_yblock_mmx; |
||
899 | } |
||
900 | } |
||
901 | #endif /* HAVE_INLINE_ASM */ |
||
902 | }1)),"m"(obmc),"a"(block),"m"(b_h),"m"(src_stride):\ |