;*****************************************************************************
;* MMX optimized DSP utils
;*****************************************************************************
;* Copyright (c) 2000, 2001 Fabrice Bellard
;* Copyright (c) 2002-2004 Michael Niedermayer
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION .text

; DIFF_PIXELS_1 %1=dst, %2=tmp, %3=&pix1 row, %4=&pix2 row:
; word-sized pix1-pix2 differences, computed without a zero register
%macro DIFF_PIXELS_1 4
    movh            %1, %3
    movh            %2, %4
    punpcklbw       %2, %1
    punpcklbw       %1, %1
    psubw           %1, %2
%endmacro
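
; The punpcklbw pair above is what makes the zero register unnecessary;
; a C model of one lane (illustrative only, not an FFmpeg API):
;
;   #include <stdint.h>
;   static int16_t diff_pixels_1_lane(uint8_t p1, uint8_t p2)
;   {
;       uint16_t w1 = (uint16_t)(p1 << 8 | p1);   // punpcklbw %1, %1
;       uint16_t w2 = (uint16_t)(p1 << 8 | p2);   // punpcklbw %2, %1
;       return (int16_t)(w1 - w2);                // psubw: equals p1 - p2
;   }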

; %1=uint8_t *pix1, %2=uint8_t *pix2, %3=static offset, %4=stride, %5=stride*3
; %6=temporary storage location
; this macro requires $mmsize stack space (aligned) on %6 (except on SSE+x86-64)
%macro DIFF_PIXELS_8 6
    DIFF_PIXELS_1   m0, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m1, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m2, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    add             %1, %5
    add             %2, %5
    DIFF_PIXELS_1   m3, m7, [%1     +%3], [%2     +%3]
    DIFF_PIXELS_1   m4, m7, [%1+%4  +%3], [%2+%4  +%3]
    DIFF_PIXELS_1   m5, m7, [%1+%4*2+%3], [%2+%4*2+%3]
    DIFF_PIXELS_1   m6, m7, [%1+%5  +%3], [%2+%5  +%3]
%ifdef m8
    DIFF_PIXELS_1   m7, m8, [%1+%4*4+%3], [%2+%4*4+%3]
%else
    mova          [%6], m0
    DIFF_PIXELS_1   m7, m0, [%1+%4*4+%3], [%2+%4*4+%3]
    mova            m0, [%6]
%endif
    sub             %1, %5
    sub             %2, %5
%endmacro
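
; Roughly what one DIFF_PIXELS_8 invocation computes, as a C sketch
; (illustrative helper, not an FFmpeg API; with SSE2 each row covers 8
; pixels, with MMX only mmsize/2 = 4 columns are handled per call):
;
;   #include <stdint.h>
;   static void diff_pixels_8_ref(int16_t diff[8][8],
;                                 const uint8_t *pix1, const uint8_t *pix2,
;                                 int offset, int stride)
;   {
;       for (int y = 0; y < 8; y++)
;           for (int x = 0; x < 8; x++)
;               diff[y][x] = pix1[y * stride + offset + x]
;                          - pix2[y * stride + offset + x];
;   }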

; One in-register 8-point Hadamard transform, applied independently to each
; 16-bit lane of m0..m7: three butterfly stages with strides 1, 2 and 4.
%macro HADAMARD8 0
    SUMSUB_BADC w, 0, 1, 2, 3
    SUMSUB_BADC w, 4, 5, 6, 7
    SUMSUB_BADC w, 0, 2, 1, 3
    SUMSUB_BADC w, 4, 6, 5, 7
    SUMSUB_BADC w, 0, 4, 1, 5
    SUMSUB_BADC w, 2, 6, 3, 7
%endmacro
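
; SUMSUB_BADC w, a, b, c, d (from x86util) performs two butterflies,
; taking (a,b) to (a+b, a-b) and (c,d) to (c+d, c-d), up to which register
; ends up holding the sum. A C sketch of the resulting data flow per lane
; (an unnormalized Hadamard transform; illustrative only):
;
;   #include <stdint.h>
;   static void hadamard8_1d_ref(int16_t v[8])
;   {
;       for (int step = 1; step < 8; step <<= 1)   // stages: 1, 2, 4
;           for (int i = 0; i < 8; i++)
;               if (!(i & step)) {
;                   int16_t a = v[i], b = v[i + step];
;                   v[i]        = a + b;           // butterfly sum
;                   v[i + step] = a - b;           // butterfly difference
;               }
;   }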

%macro ABS1_SUM 3
    ABS1            %1, %2
    paddusw         %3, %1
%endmacro

%macro ABS2_SUM 6
    ABS2            %1, %2, %3, %4
    paddusw         %5, %1
    paddusw         %6, %2
%endmacro

; %1 = unused here (the 32-bit variant below needs it as scratch space)
%macro ABS_SUM_8x8_64 1
    ABS2            m0, m1, m8, m9
    ABS2_SUM        m2, m3, m8, m9, m0, m1
    ABS2_SUM        m4, m5, m8, m9, m0, m1
    ABS2_SUM        m6, m7, m8, m9, m0, m1
    paddusw         m0, m1
%endmacro

%macro ABS_SUM_8x8_32 1
    mova          [%1], m7
    ABS1            m0, m7
    ABS1            m1, m7
    ABS1_SUM        m2, m7, m0
    ABS1_SUM        m3, m7, m1
    ABS1_SUM        m4, m7, m0
    ABS1_SUM        m5, m7, m1
    ABS1_SUM        m6, m7, m0
    mova            m2, [%1]
    ABS1_SUM        m2, m7, m1
    paddusw         m0, m1
%endmacro

; FIXME: HSUM saturates at 64k, while an 8x8 hadamard or dct block can get up to
; about 100k on extreme inputs. But that's very unlikely to occur in natural video,
; and it's even more unlikely to not have any alternative mvs/modes with lower cost.
%macro HSUM 3
%if cpuflag(sse2)
    movhlps         %2, %1
    paddusw         %1, %2
    pshuflw         %2, %1, 0xE
    paddusw         %1, %2
    pshuflw         %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmxext)
    pshufw          %2, %1, 0xE
    paddusw         %1, %2
    pshufw          %2, %1, 0x1
    paddusw         %1, %2
    movd            %3, %1
%elif cpuflag(mmx)
    mova            %2, %1
    psrlq           %1, 32
    paddusw         %1, %2
    mova            %2, %1
    psrlq           %1, 16
    paddusw         %1, %2
    movd            %3, %1
%endif
%endmacro
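
; HSUM as a C model (illustrative): a saturating horizontal add of the
; 16-bit words of %1 into a GPR. The tree reduction above and this
; sequential loop agree: any partial sum exceeds 0xFFFF only if the total
; does, so both produce min(total, 0xFFFF).
;
;   #include <stdint.h>
;   static unsigned hsum_ref(const uint16_t *v, int n)  // n = 4 (MMX), 8 (SSE2)
;   {
;       unsigned s = 0;
;       for (int i = 0; i < n; i++) {
;           s += v[i];
;           if (s > 0xFFFF)
;               s = 0xFFFF;                             // paddusw saturation
;       }
;       return s;
;   }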

%macro STORE4 5
    mova [%1+mmsize*0], %2
    mova [%1+mmsize*1], %3
    mova [%1+mmsize*2], %4
    mova [%1+mmsize*3], %5
%endmacro

%macro LOAD4 5
    mova            %2, [%1+mmsize*0]
    mova            %3, [%1+mmsize*1]
    mova            %4, [%1+mmsize*2]
    mova            %5, [%1+mmsize*3]
%endmacro

%macro hadamard8_16_wrapper 2
cglobal hadamard8_diff, 4, 4, %1
%ifndef m8
    ; no spare mmregs, so reserve (roughly) %2*mmsize bytes of stack
    ; scratch, shaved so that rsp ends up mmsize-aligned
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB             rsp, pad
%endif
    call hadamard8x8_diff %+ SUFFIX
%ifndef m8
    ADD             rsp, pad
%endif
    RET

cglobal hadamard8_diff16, 5, 6, %1
%ifndef m8
    %assign pad %2*mmsize-(4+stack_offset&(mmsize-1))
    SUB             rsp, pad
%endif

    call hadamard8x8_diff %+ SUFFIX
    mov             r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add             r5d, eax

    cmp             r4d, 16
    jne .done

    lea             r1, [r1+r3*8-8]
    lea             r2, [r2+r3*8-8]
    call hadamard8x8_diff %+ SUFFIX
    add             r5d, eax

    add             r1, 8
    add             r2, 8
    call hadamard8x8_diff %+ SUFFIX
    add             r5d, eax

.done:
    mov             eax, r5d
%ifndef m8
    ADD             rsp, pad
%endif
    RET
%endmacro
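
; In C, the 16x16 wrapper amounts to four 8x8 calls (sketch with a
; hypothetical scalar hadamard8x8_diff(src1, src2, stride); the real entry
; points are the cglobal labels above, the unused void *s argument is
; omitted, and r1/r2/r3 surviving each call is what makes this legal):
;
;   #include <stdint.h>
;   int hadamard8_diff16_ref(uint8_t *src1, uint8_t *src2, int stride, int h)
;   {
;       int sum = hadamard8x8_diff(src1,     src2,     stride)
;               + hadamard8x8_diff(src1 + 8, src2 + 8, stride);
;       if (h == 16) {
;           src1 += 8 * stride;              // advance 8 rows for bottom half
;           src2 += 8 * stride;
;           sum += hadamard8x8_diff(src1,     src2,     stride)
;                + hadamard8x8_diff(src1 + 8, src2 + 8, stride);
;       }
;       return sum;
;   }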

%macro HADAMARD8_DIFF 0-1
%if cpuflag(sse2)
hadamard8x8_diff %+ SUFFIX:
    lea             r0, [r3*3]
    DIFF_PIXELS_8   r1, r2, 0, r3, r0, rsp+gprsize
    HADAMARD8
%if ARCH_X86_64
    TRANSPOSE8x8W    0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W    0, 1, 2, 3, 4, 5, 6, 7, [rsp+gprsize], [rsp+mmsize+gprsize]
%endif
    HADAMARD8
    ABS_SUM_8x8     rsp+gprsize
    HSUM            m0, m1, eax
    and             eax, 0xFFFF
    ret

hadamard8_16_wrapper %1, 3
%elif cpuflag(mmx)
ALIGN 16
; int hadamard8_diff_##cpu(void *s, uint8_t *src1, uint8_t *src2,
;                          int stride, int h)
; r0 = void *s = unused, int h = unused (always 8)
; note how r1, r2 and r3 are not clobbered in this function, so the 16x16
; version can simply call this four times (2x2); that is also why we access
; rsp+gprsize everywhere, which is the rsp of the calling function
hadamard8x8_diff %+ SUFFIX:
    lea             r0, [r3*3]

    ; first 4x8 pixels
    DIFF_PIXELS_8   r1, r2, 0, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W    0, 1, 2, 3, 7
    STORE4          rsp+gprsize, m0, m1, m2, m3
    mova            m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W    4, 5, 6, 7, 0
    STORE4          rsp+gprsize+0x40, m4, m5, m6, m7

    ; second 4x8 pixels
    DIFF_PIXELS_8   r1, r2, 4, r3, r0, rsp+gprsize+0x60
    HADAMARD8
    mova [rsp+gprsize+0x60], m7
    TRANSPOSE4x4W    0, 1, 2, 3, 7
    STORE4          rsp+gprsize+0x20, m0, m1, m2, m3
    mova            m7, [rsp+gprsize+0x60]
    TRANSPOSE4x4W    4, 5, 6, 7, 0

    LOAD4           rsp+gprsize+0x40, m0, m1, m2, m3
    HADAMARD8
    ABS_SUM_8x8_32  rsp+gprsize+0x60
    mova [rsp+gprsize+0x60], m0

    LOAD4           rsp+gprsize     , m0, m1, m2, m3
    LOAD4           rsp+gprsize+0x20, m4, m5, m6, m7
    HADAMARD8
    ABS_SUM_8x8_32  rsp+gprsize
    paddusw         m0, [rsp+gprsize+0x60]

    HSUM            m0, m1, eax
    and             rax, 0xFFFF
    ret

hadamard8_16_wrapper 0, 14
%endif
%endmacro

INIT_MMX mmx
HADAMARD8_DIFF

INIT_MMX mmxext
HADAMARD8_DIFF

INIT_XMM sse2
%if ARCH_X86_64
%define ABS_SUM_8x8 ABS_SUM_8x8_64
%else
%define ABS_SUM_8x8 ABS_SUM_8x8_32
%endif
HADAMARD8_DIFF 10

INIT_XMM ssse3
%define ABS_SUM_8x8 ABS_SUM_8x8_64
HADAMARD8_DIFF 9

INIT_XMM sse2
; sse16_sse2(void *v, uint8_t *pix1, uint8_t *pix2, int line_size, int h)
cglobal sse16, 5, 5, 8
    shr             r4d, 1
    pxor            m0, m0          ; m0 = 0
    pxor            m7, m7          ; m7 holds the sum

.next2lines: ; FIXME why are these unaligned movs? pix1[] is aligned
    movu            m1, [r1   ]     ; m1 = pix1[0][0-15]
    movu            m2, [r2   ]     ; m2 = pix2[0][0-15]
    movu            m3, [r1+r3]     ; m3 = pix1[1][0-15]
    movu            m4, [r2+r3]     ; m4 = pix2[1][0-15]

    ; todo: m1-m2, m3-m4
    ; algo: subtract m1 from m2 with saturation and vice versa,
    ;       then OR the results to get the absolute difference
    mova            m5, m1
    mova            m6, m3
    psubusb         m1, m2
    psubusb         m3, m4
    psubusb         m2, m5
    psubusb         m4, m6

    por             m2, m1
    por             m4, m3

    ; now convert to 16-bit vectors so we can square them
    mova            m1, m2
    mova            m3, m4

    punpckhbw       m2, m0
    punpckhbw       m4, m0
    punpcklbw       m1, m0          ; m1 not spread over (m1, m2)
    punpcklbw       m3, m0          ; m4 not spread over (m3, m4)

    pmaddwd         m2, m2
    pmaddwd         m4, m4
    pmaddwd         m1, m1
    pmaddwd         m3, m3

    lea             r1, [r1+r3*2]   ; pix1 += 2*line_size
    lea             r2, [r2+r3*2]   ; pix2 += 2*line_size

    paddd           m1, m2
    paddd           m3, m4
    paddd           m7, m1
    paddd           m7, m3

    dec             r4
    jnz .next2lines

    mova            m1, m7
    psrldq          m7, 8           ; shift hi qword to lo
    paddd           m7, m1
    mova            m1, m7
    psrldq          m7, 4           ; shift hi dword to lo
    paddd           m7, m1
    movd            eax, m7         ; return value
    RET
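
; What sse16 computes, as a C sketch (helper name is illustrative and the
; unused void *v argument is dropped): the sum of squared errors over h
; rows of 16 pixels, with |a-b| formed exactly like the psubusb/por trick
; above — each saturating subtraction yields max(a-b, 0), and since one of
; the two is always zero, their OR is the absolute difference.
;
;   #include <stdint.h>
;   static int sse16_ref(const uint8_t *pix1, const uint8_t *pix2,
;                        int line_size, int h)
;   {
;       int sum = 0;
;       for (int y = 0; y < h; y++) {
;           for (int x = 0; x < 16; x++) {
;               int d = (pix1[x] > pix2[x] ? pix1[x] - pix2[x] : 0)   // psubusb
;                     | (pix2[x] > pix1[x] ? pix2[x] - pix1[x] : 0);  // por
;               sum += d * d;               // pmaddwd + paddd accumulate this
;           }
;           pix1 += line_size;
;           pix2 += line_size;
;       }
;       return sum;
;   }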

INIT_MMX mmx
; get_pixels_mmx(int16_t *block, const uint8_t *pixels, int line_size)
cglobal get_pixels, 3,4
    movsxdifnidn    r2, r2d
    add             r0, 128
    mov             r3, -128
    pxor            m7, m7
.loop:
    mova            m0, [r1]
    mova            m2, [r1+r2]
    mova            m1, m0
    mova            m3, m2
    punpcklbw       m0, m7
    punpckhbw       m1, m7
    punpcklbw       m2, m7
    punpckhbw       m3, m7
    mova [r0+r3+ 0], m0
    mova [r0+r3+ 8], m1
    mova [r0+r3+16], m2
    mova [r0+r3+24], m3
    lea             r1, [r1+r2*2]
    add             r3, 32
    js .loop
    REP_RET

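; C model of get_pixels (zero-extend an 8x8 block of bytes to int16_t;
; sketch only, not dsputil.c verbatim):
;
;   #include <stdint.h>
;   static void get_pixels_ref(int16_t block[64], const uint8_t *pixels,
;                              int line_size)
;   {
;       for (int y = 0; y < 8; y++) {
;           for (int x = 0; x < 8; x++)
;               block[8 * y + x] = pixels[x];  // punpck{l,h}bw against zero
;           pixels += line_size;
;       }
;   }
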
INIT_XMM sse2
cglobal get_pixels, 3, 4
    movsxdifnidn    r2, r2d
    lea             r3, [r2*3]
    pxor            m4, m4
    movh            m0, [r1]
    movh            m1, [r1+r2]
    movh            m2, [r1+r2*2]
    movh            m3, [r1+r3]
    lea             r1, [r1+r2*4]
    punpcklbw       m0, m4
    punpcklbw       m1, m4
    punpcklbw       m2, m4
    punpcklbw       m3, m4
    mova          [r0], m0
    mova     [r0+0x10], m1
    mova     [r0+0x20], m2
    mova     [r0+0x30], m3
    movh            m0, [r1]
    movh            m1, [r1+r2*1]
    movh            m2, [r1+r2*2]
    movh            m3, [r1+r3]
    punpcklbw       m0, m4
    punpcklbw       m1, m4
    punpcklbw       m2, m4
    punpcklbw       m3, m4
    mova     [r0+0x40], m0
    mova     [r0+0x50], m1
    mova     [r0+0x60], m2
    mova     [r0+0x70], m3
    RET

INIT_MMX mmx
; diff_pixels_mmx(int16_t *block, const uint8_t *s1, const uint8_t *s2, int stride)
cglobal diff_pixels, 4,5
    movsxdifnidn    r3, r3d
    pxor            m7, m7
    add             r0, 128
    mov             r4, -128
.loop:
    mova            m0, [r1]
    mova            m2, [r2]
    mova            m1, m0
    mova            m3, m2
    punpcklbw       m0, m7
    punpckhbw       m1, m7
    punpcklbw       m2, m7
    punpckhbw       m3, m7
    psubw           m0, m2
    psubw           m1, m3
    mova    [r0+r4+0], m0
    mova    [r0+r4+8], m1
    add             r1, r3
    add             r2, r3
    add             r4, 16
    jne .loop
    REP_RET

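; C model of diff_pixels (8x8 byte differences widened to int16_t;
; sketch only, names illustrative):
;
;   #include <stdint.h>
;   static void diff_pixels_ref(int16_t block[64], const uint8_t *s1,
;                               const uint8_t *s2, int stride)
;   {
;       for (int y = 0; y < 8; y++) {
;           for (int x = 0; x < 8; x++)
;               block[8 * y + x] = s1[x] - s2[x];
;           s1 += stride;
;           s2 += stride;
;       }
;   }
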
INIT_MMX mmx
; pix_sum16_mmx(uint8_t *pix, int line_size)
cglobal pix_sum16, 2, 3
    movsxdifnidn    r1, r1d
    mov             r2, r1
    neg             r2
    shl             r2, 4
    sub             r0, r2
    pxor            m7, m7
    pxor            m6, m6
.loop:
    mova            m0, [r0+r2+0]
    mova            m1, [r0+r2+0]
    mova            m2, [r0+r2+8]
    mova            m3, [r0+r2+8]
    punpcklbw       m0, m7
    punpckhbw       m1, m7
    punpcklbw       m2, m7
    punpckhbw       m3, m7
    paddw           m1, m0
    paddw           m3, m2
    paddw           m3, m1
    paddw           m6, m3
    add             r2, r1
    js .loop
    mova            m5, m6
    psrlq           m6, 32
    paddw           m6, m5
    mova            m5, m6
    psrlq           m6, 16
    paddw           m6, m5
    movd            eax, m6
    and             eax, 0xffff
    RET

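; C model of pix_sum16 (sum of all pixels in a 16x16 block; sketch only).
; The final 'and eax, 0xffff' is lossless: the maximum sum is
; 16*16*255 = 65280, which fits in one 16-bit word, so the mask merely
; discards leftovers from the word-wise folding above.
;
;   #include <stdint.h>
;   static int pix_sum16_ref(const uint8_t *pix, int line_size)
;   {
;       int sum = 0;
;       for (int y = 0; y < 16; y++) {
;           for (int x = 0; x < 16; x++)
;               sum += pix[x];
;           pix += line_size;
;       }
;       return sum;
;   }
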
INIT_MMX mmx
; pix_norm1_mmx(uint8_t *pix, int line_size)
cglobal pix_norm1, 2, 4
    movsxdifnidn    r1, r1d
    mov             r2, 16
    pxor            m0, m0
    pxor            m7, m7
.loop:
    mova            m2, [r0+0]
    mova            m3, [r0+8]
    mova            m1, m2
    punpckhbw       m1, m0
    punpcklbw       m2, m0
    mova            m4, m3
    punpckhbw       m3, m0
    punpcklbw       m4, m0
    pmaddwd         m1, m1
    pmaddwd         m2, m2
    pmaddwd         m3, m3
    pmaddwd         m4, m4
    paddd           m2, m1
    paddd           m4, m3
    paddd           m7, m2
    add             r0, r1
    paddd           m7, m4
    dec             r2
    jne .loop
    mova            m1, m7
    psrlq           m7, 32
    paddd           m1, m7
    movd            eax, m1
    RET
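
; C model of pix_norm1 (sum of squared pixel values over a 16x16 block;
; sketch only, name illustrative):
;
;   #include <stdint.h>
;   static int pix_norm1_ref(const uint8_t *pix, int line_size)
;   {
;       int sum = 0;                      // 256 * 255^2 fits in 32 bits
;       for (int y = 0; y < 16; y++) {
;           for (int x = 0; x < 16; x++)
;               sum += pix[x] * pix[x];   // pmaddwd squares and pairs up
;           pix += line_size;
;       }
;       return sum;
;   }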