;******************************************************************************
;* MMX/SSSE3-optimized functions for H264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi
;*               2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
22 | |||
23 | %include "libavutil/x86/x86util.asm" |
||
24 | |||
25 | SECTION_RODATA |
||
26 | |||
27 | rnd_rv40_2d_tbl: times 4 dw 0 |
||
28 | times 4 dw 16 |
||
29 | times 4 dw 32 |
||
30 | times 4 dw 16 |
||
31 | times 4 dw 32 |
||
32 | times 4 dw 28 |
||
33 | times 4 dw 32 |
||
34 | times 4 dw 28 |
||
35 | times 4 dw 0 |
||
36 | times 4 dw 32 |
||
37 | times 4 dw 16 |
||
38 | times 4 dw 32 |
||
39 | times 4 dw 32 |
||
40 | times 4 dw 28 |
||
41 | times 4 dw 32 |
||
42 | times 4 dw 28 |
||
43 | rnd_rv40_1d_tbl: times 4 dw 0 |
||
44 | times 4 dw 2 |
||
45 | times 4 dw 4 |
||
46 | times 4 dw 2 |
||
47 | times 4 dw 4 |
||
48 | times 4 dw 3 |
||
49 | times 4 dw 4 |
||
50 | times 4 dw 3 |
||
51 | times 4 dw 0 |
||
52 | times 4 dw 4 |
||
53 | times 4 dw 2 |
||
54 | times 4 dw 4 |
||
55 | times 4 dw 4 |
||
56 | times 4 dw 3 |
||
57 | times 4 dw 4 |
||
58 | times 4 dw 3 |
||
59 | |||
60 | cextern pw_3 |
||
61 | cextern pw_4 |
||
62 | cextern pw_8 |
||
63 | pw_28: times 8 dw 28 |
||
64 | cextern pw_32 |
||
65 | cextern pw_64 |
||
66 | |||
67 | SECTION .text |
||
68 | |||
69 | %macro mv0_pixels_mc8 0 |
||
70 | lea r4, [r2*2 ] |
||
71 | .next4rows: |
||
72 | movq mm0, [r1 ] |
||
73 | movq mm1, [r1+r2] |
||
74 | add r1, r4 |
||
75 | CHROMAMC_AVG mm0, [r0 ] |
||
76 | CHROMAMC_AVG mm1, [r0+r2] |
||
77 | movq [r0 ], mm0 |
||
78 | movq [r0+r2], mm1 |
||
79 | add r0, r4 |
||
80 | movq mm0, [r1 ] |
||
81 | movq mm1, [r1+r2] |
||
82 | add r1, r4 |
||
83 | CHROMAMC_AVG mm0, [r0 ] |
||
84 | CHROMAMC_AVG mm1, [r0+r2] |
||
85 | movq [r0 ], mm0 |
||
86 | movq [r0+r2], mm1 |
||
87 | add r0, r4 |
||
88 | sub r3d, 4 |
||
89 | jne .next4rows |
||
90 | %endmacro |
||
91 | |||
92 | %macro chroma_mc8_mmx_func 2-3 |
||
93 | %ifidn %2, rv40 |
||
94 | %ifdef PIC |
||
95 | %define rnd_1d_rv40 r8 |
||
96 | %define rnd_2d_rv40 r8 |
||
97 | %define extra_regs 2 |
||
98 | %else ; no-PIC |
||
99 | %define rnd_1d_rv40 rnd_rv40_1d_tbl |
||
100 | %define rnd_2d_rv40 rnd_rv40_2d_tbl |
||
101 | %define extra_regs 1 |
||
102 | %endif ; PIC |
||
103 | %else |
||
104 | %define extra_regs 0 |
||
105 | %endif ; rv40 |
||
106 | ; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/, |
||
107 | ; int stride, int h, int mx, int my) |
||
108 | cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0 |
||
109 | %if ARCH_X86_64 |
||
110 | movsxd r2, r2d |
||
111 | %endif |
||
112 | mov r6d, r5d |
||
113 | or r6d, r4d |
||
114 | jne .at_least_one_non_zero |
||
115 | ; mx == 0 AND my == 0 - no filter needed |
||
116 | mv0_pixels_mc8 |
||
117 | REP_RET |
||
118 | |||
119 | .at_least_one_non_zero: |
||
120 | %ifidn %2, rv40 |
||
121 | %if ARCH_X86_64 |
||
122 | mov r7, r5 |
||
123 | and r7, 6 ; &~1 for mx/my=[0,7] |
||
124 | lea r7, [r7*4+r4] |
||
125 | sar r7d, 1 |
||
126 | %define rnd_bias r7 |
||
127 | %define dest_reg r0 |
||
128 | %else ; x86-32 |
||
129 | mov r0, r5 |
||
130 | and r0, 6 ; &~1 for mx/my=[0,7] |
||
131 | lea r0, [r0*4+r4] |
||
132 | sar r0d, 1 |
||
133 | %define rnd_bias r0 |
||
134 | %define dest_reg r5 |
||
135 | %endif |
||
136 | %else ; vc1, h264 |
||
137 | %define rnd_bias 0 |
||
138 | %define dest_reg r0 |
||
139 | %endif |
||
140 | |||
141 | test r5d, r5d |
||
142 | mov r6, 1 |
||
143 | je .my_is_zero |
||
144 | test r4d, r4d |
||
145 | mov r6, r2 ; dxy = x ? 1 : stride |
||
146 | jne .both_non_zero |
||
147 | .my_is_zero: |
||
148 | ; mx == 0 XOR my == 0 - 1 dimensional filter only |
||
149 | or r4d, r5d ; x + y |
||
150 | |||
151 | %ifidn %2, rv40 |
||
152 | %ifdef PIC |
||
153 | lea r8, [rnd_rv40_1d_tbl] |
||
154 | %endif |
||
155 | %if ARCH_X86_64 == 0 |
||
156 | mov r5, r0m |
||
157 | %endif |
||
158 | %endif |
||
159 | |||
160 | movd m5, r4d |
||
161 | movq m4, [pw_8] |
||
162 | movq m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3 |
||
163 | punpcklwd m5, m5 |
||
164 | punpckldq m5, m5 ; mm5 = B = x |
||
165 | pxor m7, m7 |
||
166 | psubw m4, m5 ; mm4 = A = 8-x |
||
167 | |||
168 | .next1drow: |
||
169 | movq m0, [r1 ] ; mm0 = src[0..7] |
||
170 | movq m2, [r1+r6] ; mm1 = src[1..8] |
||
171 | |||
172 | movq m1, m0 |
||
173 | movq m3, m2 |
||
174 | punpcklbw m0, m7 |
||
175 | punpckhbw m1, m7 |
||
176 | punpcklbw m2, m7 |
||
177 | punpckhbw m3, m7 |
||
178 | pmullw m0, m4 ; [mm0,mm1] = A * src[0..7] |
||
179 | pmullw m1, m4 |
||
180 | pmullw m2, m5 ; [mm2,mm3] = B * src[1..8] |
||
181 | pmullw m3, m5 |
||
182 | |||
183 | paddw m0, m6 |
||
184 | paddw m1, m6 |
||
185 | paddw m0, m2 |
||
186 | paddw m1, m3 |
||
187 | psrlw m0, 3 |
||
188 | psrlw m1, 3 |
||
189 | packuswb m0, m1 |
||
190 | CHROMAMC_AVG m0, [dest_reg] |
||
191 | movq [dest_reg], m0 ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3 |
||
192 | |||
193 | add dest_reg, r2 |
||
194 | add r1, r2 |
||
195 | dec r3d |
||
196 | jne .next1drow |
||
197 | REP_RET |
||
198 | |||
199 | .both_non_zero: ; general case, bilinear |
||
200 | movd m4, r4d ; x |
||
201 | movd m6, r5d ; y |
||
202 | %ifidn %2, rv40 |
||
203 | %ifdef PIC |
||
204 | lea r8, [rnd_rv40_2d_tbl] |
||
205 | %endif |
||
206 | %if ARCH_X86_64 == 0 |
||
207 | mov r5, r0m |
||
208 | %endif |
||
209 | %endif |
||
210 | mov r6, rsp ; backup stack pointer |
||
211 | and rsp, ~(mmsize-1) ; align stack |
||
212 | sub rsp, 16 ; AA and DD |
||
213 | |||
214 | punpcklwd m4, m4 |
||
215 | punpcklwd m6, m6 |
||
216 | punpckldq m4, m4 ; mm4 = x words |
||
217 | punpckldq m6, m6 ; mm6 = y words |
||
218 | movq m5, m4 |
||
219 | pmullw m4, m6 ; mm4 = x * y |
||
220 | psllw m5, 3 |
||
221 | psllw m6, 3 |
||
222 | movq m7, m5 |
||
223 | paddw m7, m6 |
||
224 | movq [rsp+8], m4 ; DD = x * y |
||
225 | psubw m5, m4 ; mm5 = B = 8x - xy |
||
226 | psubw m6, m4 ; mm6 = C = 8y - xy |
||
227 | paddw m4, [pw_64] |
||
228 | psubw m4, m7 ; mm4 = A = xy - (8x+8y) + 64 |
||
229 | pxor m7, m7 |
||
230 | movq [rsp ], m4 |
||
231 | |||
232 | movq m0, [r1 ] ; mm0 = src[0..7] |
||
233 | movq m1, [r1+1] ; mm1 = src[1..8] |
||
234 | .next2drow: |
||
235 | add r1, r2 |
||
236 | |||
237 | movq m2, m0 |
||
238 | movq m3, m1 |
||
239 | punpckhbw m0, m7 |
||
240 | punpcklbw m1, m7 |
||
241 | punpcklbw m2, m7 |
||
242 | punpckhbw m3, m7 |
||
243 | pmullw m0, [rsp] |
||
244 | pmullw m2, [rsp] |
||
245 | pmullw m1, m5 |
||
246 | pmullw m3, m5 |
||
247 | paddw m2, m1 ; mm2 = A * src[0..3] + B * src[1..4] |
||
248 | paddw m3, m0 ; mm3 = A * src[4..7] + B * src[5..8] |
||
249 | |||
250 | movq m0, [r1] |
||
251 | movq m1, m0 |
||
252 | punpcklbw m0, m7 |
||
253 | punpckhbw m1, m7 |
||
254 | pmullw m0, m6 |
||
255 | pmullw m1, m6 |
||
256 | paddw m2, m0 |
||
257 | paddw m3, m1 ; [mm2,mm3] += C * src[0..7] |
||
258 | |||
259 | movq m1, [r1+1] |
||
260 | movq m0, m1 |
||
261 | movq m4, m1 |
||
262 | punpcklbw m0, m7 |
||
263 | punpckhbw m4, m7 |
||
264 | pmullw m0, [rsp+8] |
||
265 | pmullw m4, [rsp+8] |
||
266 | paddw m2, m0 |
||
267 | paddw m3, m4 ; [mm2,mm3] += D * src[1..8] |
||
268 | movq m0, [r1] |
||
269 | |||
270 | paddw m2, [rnd_2d_%2+rnd_bias*8] |
||
271 | paddw m3, [rnd_2d_%2+rnd_bias*8] |
||
272 | psrlw m2, 6 |
||
273 | psrlw m3, 6 |
||
274 | packuswb m2, m3 |
||
275 | CHROMAMC_AVG m2, [dest_reg] |
||
276 | movq [dest_reg], m2 ; dst[0..7] = ([mm2,mm3] + rnd) >> 6 |
||
277 | |||
278 | add dest_reg, r2 |
||
279 | dec r3d |
||
280 | jne .next2drow |
||
281 | mov rsp, r6 ; restore stack pointer |
||
282 | RET |
||
283 | %endmacro |
||
284 | |||
285 | %macro chroma_mc4_mmx_func 2 |
||
286 | %define extra_regs 0 |
||
287 | %ifidn %2, rv40 |
||
288 | %ifdef PIC |
||
289 | %define extra_regs 1 |
||
290 | %endif ; PIC |
||
291 | %endif ; rv40 |
||
292 | cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0 |
||
293 | %if ARCH_X86_64 |
||
294 | movsxd r2, r2d |
||
295 | %endif |
||
296 | pxor m7, m7 |
||
297 | movd m2, r4d ; x |
||
298 | movd m3, r5d ; y |
||
299 | movq m4, [pw_8] |
||
300 | movq m5, [pw_8] |
||
301 | punpcklwd m2, m2 |
||
302 | punpcklwd m3, m3 |
||
303 | punpcklwd m2, m2 |
||
304 | punpcklwd m3, m3 |
||
305 | psubw m4, m2 |
||
306 | psubw m5, m3 |
||
307 | |||
308 | %ifidn %2, rv40 |
||
309 | %ifdef PIC |
||
310 | lea r6, [rnd_rv40_2d_tbl] |
||
311 | %define rnd_2d_rv40 r6 |
||
312 | %else |
||
313 | %define rnd_2d_rv40 rnd_rv40_2d_tbl |
||
314 | %endif |
||
315 | and r5, 6 ; &~1 for mx/my=[0,7] |
||
316 | lea r5, [r5*4+r4] |
||
317 | sar r5d, 1 |
||
318 | %define rnd_bias r5 |
||
319 | %else ; vc1, h264 |
||
320 | %define rnd_bias 0 |
||
321 | %endif |
||
322 | |||
323 | movd m0, [r1 ] |
||
324 | movd m6, [r1+1] |
||
325 | add r1, r2 |
||
326 | punpcklbw m0, m7 |
||
327 | punpcklbw m6, m7 |
||
328 | pmullw m0, m4 |
||
329 | pmullw m6, m2 |
||
330 | paddw m6, m0 |
||
331 | |||
332 | .next2rows: |
||
333 | movd m0, [r1 ] |
||
334 | movd m1, [r1+1] |
||
335 | add r1, r2 |
||
336 | punpcklbw m0, m7 |
||
337 | punpcklbw m1, m7 |
||
338 | pmullw m0, m4 |
||
339 | pmullw m1, m2 |
||
340 | paddw m1, m0 |
||
341 | movq m0, m1 |
||
342 | |||
343 | pmullw m6, m5 |
||
344 | pmullw m1, m3 |
||
345 | paddw m6, [rnd_2d_%2+rnd_bias*8] |
||
346 | paddw m1, m6 |
||
347 | psrlw m1, 6 |
||
348 | packuswb m1, m1 |
||
349 | CHROMAMC_AVG4 m1, m6, [r0] |
||
350 | movd [r0], m1 |
||
351 | add r0, r2 |
||
352 | |||
353 | movd m6, [r1 ] |
||
354 | movd m1, [r1+1] |
||
355 | add r1, r2 |
||
356 | punpcklbw m6, m7 |
||
357 | punpcklbw m1, m7 |
||
358 | pmullw m6, m4 |
||
359 | pmullw m1, m2 |
||
360 | paddw m1, m6 |
||
361 | movq m6, m1 |
||
362 | pmullw m0, m5 |
||
363 | pmullw m1, m3 |
||
364 | paddw m0, [rnd_2d_%2+rnd_bias*8] |
||
365 | paddw m1, m0 |
||
366 | psrlw m1, 6 |
||
367 | packuswb m1, m1 |
||
368 | CHROMAMC_AVG4 m1, m0, [r0] |
||
369 | movd [r0], m1 |
||
370 | add r0, r2 |
||
371 | sub r3d, 2 |
||
372 | jnz .next2rows |
||
373 | REP_RET |
||
374 | %endmacro |
||
375 | |||
376 | %macro chroma_mc2_mmx_func 2 |
||
377 | cglobal %1_%2_chroma_mc2, 6, 7, 0 |
||
378 | %if ARCH_X86_64 |
||
379 | movsxd r2, r2d |
||
380 | %endif |
||
381 | |||
382 | mov r6d, r4d |
||
383 | shl r4d, 16 |
||
384 | sub r4d, r6d |
||
385 | add r4d, 8 |
||
386 | imul r5d, r4d ; x*y<<16 | y*(8-x) |
||
387 | shl r4d, 3 |
||
388 | sub r4d, r5d ; x*(8-y)<<16 | (8-x)*(8-y) |
||
389 | |||
390 | movd m5, r4d |
||
391 | movd m6, r5d |
||
392 | punpckldq m5, m5 ; mm5 = {A,B,A,B} |
||
393 | punpckldq m6, m6 ; mm6 = {C,D,C,D} |
||
394 | pxor m7, m7 |
||
395 | movd m2, [r1] |
||
396 | punpcklbw m2, m7 |
||
397 | pshufw m2, m2, 0x94 ; mm0 = src[0,1,1,2] |
||
398 | |||
399 | .nextrow: |
||
400 | add r1, r2 |
||
401 | movq m1, m2 |
||
402 | pmaddwd m1, m5 ; mm1 = A * src[0,1] + B * src[1,2] |
||
403 | movd m0, [r1] |
||
404 | punpcklbw m0, m7 |
||
405 | pshufw m0, m0, 0x94 ; mm0 = src[0,1,1,2] |
||
406 | movq m2, m0 |
||
407 | pmaddwd m0, m6 |
||
408 | paddw m1, [rnd_2d_%2] |
||
409 | paddw m1, m0 ; mm1 += C * src[0,1] + D * src[1,2] |
||
410 | psrlw m1, 6 |
||
411 | packssdw m1, m7 |
||
412 | packuswb m1, m7 |
||
413 | CHROMAMC_AVG4 m1, m3, [r0] |
||
414 | movd r5d, m1 |
||
415 | mov [r0], r5w |
||
416 | add r0, r2 |
||
417 | sub r3d, 1 |
||
418 | jnz .nextrow |
||
419 | REP_RET |
||
420 | %endmacro |
||
421 | |||
422 | %define rnd_1d_h264 pw_4 |
||
423 | %define rnd_2d_h264 pw_32 |
||
424 | %define rnd_1d_vc1 pw_3 |
||
425 | %define rnd_2d_vc1 pw_28 |
||
426 | |||
427 | %macro NOTHING 2-3 |
||
428 | %endmacro |
||
429 | %macro DIRECT_AVG 2 |
||
430 | PAVGB %1, %2 |
||
431 | %endmacro |
||
432 | %macro COPY_AVG 3 |
||
433 | movd %2, %3 |
||
434 | PAVGB %1, %2 |
||
435 | %endmacro |
||
436 | |||
437 | INIT_MMX mmx |
||
438 | %define CHROMAMC_AVG NOTHING |
||
439 | %define CHROMAMC_AVG4 NOTHING |
||
440 | chroma_mc8_mmx_func put, h264, _rnd |
||
441 | chroma_mc8_mmx_func put, vc1, _nornd |
||
442 | chroma_mc8_mmx_func put, rv40 |
||
443 | chroma_mc4_mmx_func put, h264 |
||
444 | chroma_mc4_mmx_func put, rv40 |
||
445 | |||
446 | INIT_MMX mmxext |
||
447 | chroma_mc2_mmx_func put, h264 |
||
448 | |||
449 | %define CHROMAMC_AVG DIRECT_AVG |
||
450 | %define CHROMAMC_AVG4 COPY_AVG |
||
451 | chroma_mc8_mmx_func avg, h264, _rnd |
||
452 | chroma_mc8_mmx_func avg, vc1, _nornd |
||
453 | chroma_mc8_mmx_func avg, rv40 |
||
454 | chroma_mc4_mmx_func avg, h264 |
||
455 | chroma_mc4_mmx_func avg, rv40 |
||
456 | chroma_mc2_mmx_func avg, h264 |
||
457 | |||
458 | INIT_MMX 3dnow |
||
459 | chroma_mc8_mmx_func avg, h264, _rnd |
||
460 | chroma_mc8_mmx_func avg, vc1, _nornd |
||
461 | chroma_mc8_mmx_func avg, rv40 |
||
462 | chroma_mc4_mmx_func avg, h264 |
||
463 | chroma_mc4_mmx_func avg, rv40 |
||
464 | |||
465 | %macro chroma_mc8_ssse3_func 2-3 |
||
466 | cglobal %1_%2_chroma_mc8%3, 6, 7, 8 |
||
467 | %if ARCH_X86_64 |
||
468 | movsxd r2, r2d |
||
469 | %endif |
||
470 | mov r6d, r5d |
||
471 | or r6d, r4d |
||
472 | jne .at_least_one_non_zero |
||
473 | ; mx == 0 AND my == 0 - no filter needed |
||
474 | mv0_pixels_mc8 |
||
475 | REP_RET |
||
476 | |||
477 | .at_least_one_non_zero: |
||
478 | test r5d, r5d |
||
479 | je .my_is_zero |
||
480 | test r4d, r4d |
||
481 | je .mx_is_zero |
||
482 | |||
483 | ; general case, bilinear |
||
484 | mov r6d, r4d |
||
485 | shl r4d, 8 |
||
486 | sub r4, r6 |
||
487 | mov r6, 8 |
||
488 | add r4, 8 ; x*288+8 = x<<8 | (8-x) |
||
489 | sub r6d, r5d |
||
490 | imul r6, r4 ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) |
||
491 | imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x) |
||
492 | |||
493 | movd m7, r6d |
||
494 | movd m6, r4d |
||
495 | movdqa m5, [rnd_2d_%2] |
||
496 | movq m0, [r1 ] |
||
497 | movq m1, [r1+1] |
||
498 | pshuflw m7, m7, 0 |
||
499 | pshuflw m6, m6, 0 |
||
500 | punpcklbw m0, m1 |
||
501 | movlhps m7, m7 |
||
502 | movlhps m6, m6 |
||
503 | |||
504 | .next2rows: |
||
505 | movq m1, [r1+r2*1 ] |
||
506 | movq m2, [r1+r2*1+1] |
||
507 | movq m3, [r1+r2*2 ] |
||
508 | movq m4, [r1+r2*2+1] |
||
509 | lea r1, [r1+r2*2] |
||
510 | punpcklbw m1, m2 |
||
511 | movdqa m2, m1 |
||
512 | punpcklbw m3, m4 |
||
513 | movdqa m4, m3 |
||
514 | pmaddubsw m0, m7 |
||
515 | pmaddubsw m1, m6 |
||
516 | pmaddubsw m2, m7 |
||
517 | pmaddubsw m3, m6 |
||
518 | paddw m0, m5 |
||
519 | paddw m2, m5 |
||
520 | paddw m1, m0 |
||
521 | paddw m3, m2 |
||
522 | psrlw m1, 6 |
||
523 | movdqa m0, m4 |
||
524 | psrlw m3, 6 |
||
525 | %ifidn %1, avg |
||
526 | movq m2, [r0 ] |
||
527 | movhps m2, [r0+r2] |
||
528 | %endif |
||
529 | packuswb m1, m3 |
||
530 | CHROMAMC_AVG m1, m2 |
||
531 | movq [r0 ], m1 |
||
532 | movhps [r0+r2], m1 |
||
533 | sub r3d, 2 |
||
534 | lea r0, [r0+r2*2] |
||
535 | jg .next2rows |
||
536 | REP_RET |
||
537 | |||
538 | .my_is_zero: |
||
539 | mov r5d, r4d |
||
540 | shl r4d, 8 |
||
541 | add r4, 8 |
||
542 | sub r4, r5 ; 255*x+8 = x<<8 | (8-x) |
||
543 | movd m7, r4d |
||
544 | movdqa m6, [rnd_1d_%2] |
||
545 | pshuflw m7, m7, 0 |
||
546 | movlhps m7, m7 |
||
547 | |||
548 | .next2xrows: |
||
549 | movq m0, [r1 ] |
||
550 | movq m1, [r1 +1] |
||
551 | movq m2, [r1+r2 ] |
||
552 | movq m3, [r1+r2+1] |
||
553 | punpcklbw m0, m1 |
||
554 | punpcklbw m2, m3 |
||
555 | pmaddubsw m0, m7 |
||
556 | pmaddubsw m2, m7 |
||
557 | %ifidn %1, avg |
||
558 | movq m4, [r0 ] |
||
559 | movhps m4, [r0+r2] |
||
560 | %endif |
||
561 | paddw m0, m6 |
||
562 | paddw m2, m6 |
||
563 | psrlw m0, 3 |
||
564 | psrlw m2, 3 |
||
565 | packuswb m0, m2 |
||
566 | CHROMAMC_AVG m0, m4 |
||
567 | movq [r0 ], m0 |
||
568 | movhps [r0+r2], m0 |
||
569 | sub r3d, 2 |
||
570 | lea r0, [r0+r2*2] |
||
571 | lea r1, [r1+r2*2] |
||
572 | jg .next2xrows |
||
573 | REP_RET |
||
574 | |||
575 | .mx_is_zero: |
||
576 | mov r4d, r5d |
||
577 | shl r5d, 8 |
||
578 | add r5, 8 |
||
579 | sub r5, r4 ; 255*y+8 = y<<8 | (8-y) |
||
580 | movd m7, r5d |
||
581 | movdqa m6, [rnd_1d_%2] |
||
582 | pshuflw m7, m7, 0 |
||
583 | movlhps m7, m7 |
||
584 | |||
585 | .next2yrows: |
||
586 | movq m0, [r1 ] |
||
587 | movq m1, [r1+r2 ] |
||
588 | movdqa m2, m1 |
||
589 | movq m3, [r1+r2*2] |
||
590 | lea r1, [r1+r2*2] |
||
591 | punpcklbw m0, m1 |
||
592 | punpcklbw m2, m3 |
||
593 | pmaddubsw m0, m7 |
||
594 | pmaddubsw m2, m7 |
||
595 | %ifidn %1, avg |
||
596 | movq m4, [r0 ] |
||
597 | movhps m4, [r0+r2] |
||
598 | %endif |
||
599 | paddw m0, m6 |
||
600 | paddw m2, m6 |
||
601 | psrlw m0, 3 |
||
602 | psrlw m2, 3 |
||
603 | packuswb m0, m2 |
||
604 | CHROMAMC_AVG m0, m4 |
||
605 | movq [r0 ], m0 |
||
606 | movhps [r0+r2], m0 |
||
607 | sub r3d, 2 |
||
608 | lea r0, [r0+r2*2] |
||
609 | jg .next2yrows |
||
610 | REP_RET |
||
611 | %endmacro |
||
612 | |||
613 | %macro chroma_mc4_ssse3_func 2 |
||
614 | cglobal %1_%2_chroma_mc4, 6, 7, 0 |
||
615 | %if ARCH_X86_64 |
||
616 | movsxd r2, r2d |
||
617 | %endif |
||
618 | mov r6, r4 |
||
619 | shl r4d, 8 |
||
620 | sub r4d, r6d |
||
621 | mov r6, 8 |
||
622 | add r4d, 8 ; x*288+8 |
||
623 | sub r6d, r5d |
||
624 | imul r6d, r4d ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x) |
||
625 | imul r4d, r5d ; y *(x*255+8) = y *x<<8 | y *(8-x) |
||
626 | |||
627 | movd m7, r6d |
||
628 | movd m6, r4d |
||
629 | movq m5, [pw_32] |
||
630 | movd m0, [r1 ] |
||
631 | pshufw m7, m7, 0 |
||
632 | punpcklbw m0, [r1+1] |
||
633 | pshufw m6, m6, 0 |
||
634 | |||
635 | .next2rows: |
||
636 | movd m1, [r1+r2*1 ] |
||
637 | movd m3, [r1+r2*2 ] |
||
638 | punpcklbw m1, [r1+r2*1+1] |
||
639 | punpcklbw m3, [r1+r2*2+1] |
||
640 | lea r1, [r1+r2*2] |
||
641 | movq m2, m1 |
||
642 | movq m4, m3 |
||
643 | pmaddubsw m0, m7 |
||
644 | pmaddubsw m1, m6 |
||
645 | pmaddubsw m2, m7 |
||
646 | pmaddubsw m3, m6 |
||
647 | paddw m0, m5 |
||
648 | paddw m2, m5 |
||
649 | paddw m1, m0 |
||
650 | paddw m3, m2 |
||
651 | psrlw m1, 6 |
||
652 | movq m0, m4 |
||
653 | psrlw m3, 6 |
||
654 | packuswb m1, m1 |
||
655 | packuswb m3, m3 |
||
656 | CHROMAMC_AVG m1, [r0 ] |
||
657 | CHROMAMC_AVG m3, [r0+r2] |
||
658 | movd [r0 ], m1 |
||
659 | movd [r0+r2], m3 |
||
660 | sub r3d, 2 |
||
661 | lea r0, [r0+r2*2] |
||
662 | jg .next2rows |
||
663 | REP_RET |
||
664 | %endmacro |
||
665 | |||
; Instantiate the SSSE3 functions. mc8 uses XMM registers, mc4 uses MMX.
; Fix: the final instantiation's codec argument was corrupted to
; "h2648><8>..." — it must be h264 to pair with the put variant above and
; to resolve the rnd_1d_h264/rnd_2d_h264 defines inside the macro.
%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264