Subversion Repositories Kolibri OS

Rev 4349 | Author: Serge

;******************************************************************************
;* MMX/SSSE3-optimized functions for H264 chroma MC
;* Copyright (c) 2005 Zoltan Hidvegi,
;*               2005-2008 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

rnd_rv40_2d_tbl: times 4 dw  0
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw  0
                 times 4 dw 32
                 times 4 dw 16
                 times 4 dw 32
                 times 4 dw 32
                 times 4 dw 28
                 times 4 dw 32
                 times 4 dw 28
rnd_rv40_1d_tbl: times 4 dw  0
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  0
                 times 4 dw  4
                 times 4 dw  2
                 times 4 dw  4
                 times 4 dw  4
                 times 4 dw  3
                 times 4 dw  4
                 times 4 dw  3

cextern pw_3
cextern pw_4
cextern pw_8
pw_28: times 8 dw 28
cextern pw_32
cextern pw_64

SECTION .text

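; Copy an 8-byte-wide block of r3d rows when mx == my == 0, i.e. when no
; interpolation is needed (for the avg_* variants, CHROMAMC_AVG averages the
; copied rows with the existing destination pixels). Four rows are handled per
; iteration, so h is expected to be a multiple of 4.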
%macro mv0_pixels_mc8 0
    lea           r4, [r2*2 ]
.next4rows:
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    movq         mm0, [r1   ]
    movq         mm1, [r1+r2]
    add           r1, r4
    CHROMAMC_AVG mm0, [r0   ]
    CHROMAMC_AVG mm1, [r0+r2]
    movq     [r0   ], mm0
    movq     [r0+r2], mm1
    add           r0, r4
    sub          r3d, 4
    jne .next4rows
%endmacro

%macro chroma_mc8_mmx_func 2-3
%ifidn %2, rv40
%ifdef PIC
%define rnd_1d_rv40 r8
%define rnd_2d_rv40 r8
%define extra_regs 2
%else ; no-PIC
%define rnd_1d_rv40 rnd_rv40_1d_tbl
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%define extra_regs 1
%endif ; PIC
%else
%define extra_regs 0
%endif ; rv40
; put/avg_h264_chroma_mc8_*(uint8_t *dst /*align 8*/, uint8_t *src /*align 1*/,
;                           int stride, int h, int mx, int my)
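; The kernels below compute bilinear chroma interpolation for mx, my in [0,7]:
;   A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y
;   dst[i] = (A*src[i] + B*src[i+1] + C*src[i+stride] + D*src[i+stride+1] + rnd) >> 6
; with rnd = 32 for H.264, 28 for VC-1 and a per-(mx,my) table value for RV40.
; When mx or my is zero this degenerates to the 1-D filter
;   dst[i] = ((8-k)*src[i] + k*src[i+d] + rnd1d) >> 3,  d = 1 or stride.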
cglobal %1_%2_chroma_mc8%3, 6, 7 + extra_regs, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
%ifidn %2, rv40
%if ARCH_X86_64
    mov           r7, r5
    and           r7, 6         ; &~1 for mx/my=[0,7]
    lea           r7, [r7*4+r4]
    sar          r7d, 1
%define rnd_bias r7
%define dest_reg r0
%else ; x86-32
    mov           r0, r5
    and           r0, 6         ; &~1 for mx/my=[0,7]
    lea           r0, [r0*4+r4]
    sar          r0d, 1
%define rnd_bias r0
%define dest_reg r5
%endif
%else ; vc1, h264
%define rnd_bias  0
%define dest_reg r0
%endif
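; For RV40, rnd_bias = ((my & ~1)*4 + mx) >> 1 = (my>>1)*4 + (mx>>1); it selects
; one of the 16 four-word rows of rnd_rv40_1d_tbl/rnd_rv40_2d_tbl, hence the
; rnd_bias*8 byte offset at the loads below. For H.264 and VC-1 the bias is 0.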
 
    test         r5d, r5d
    mov           r6, 1
    je .my_is_zero
    test         r4d, r4d
    mov           r6, r2        ; dxy = x ? 1 : stride
    jne .both_non_zero
.my_is_zero:
    ; mx == 0 XOR my == 0 - 1 dimensional filter only
    or           r4d, r5d       ; x + y

%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_1d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m
%endif
%endif

    movd          m5, r4d
    movq          m4, [pw_8]
    movq          m6, [rnd_1d_%2+rnd_bias*8] ; mm6 = rnd >> 3
    punpcklwd     m5, m5
    punpckldq     m5, m5        ; mm5 = B = x
    pxor          m7, m7
    psubw         m4, m5        ; mm4 = A = 8-x

.next1drow:
    movq          m0, [r1   ]   ; mm0 = src[0..7]
    movq          m2, [r1+r6]   ; mm2 = src[1..8]

    movq          m1, m0
    movq          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, m4        ; [mm0,mm1] = A * src[0..7]
    pmullw        m1, m4
    pmullw        m2, m5        ; [mm2,mm3] = B * src[1..8]
    pmullw        m3, m5

    paddw         m0, m6
    paddw         m1, m6
    paddw         m0, m2
    paddw         m1, m3
    psrlw         m0, 3
    psrlw         m1, 3
    packuswb      m0, m1
    CHROMAMC_AVG  m0, [dest_reg]
    movq  [dest_reg], m0        ; dst[0..7] = (A * src[0..7] + B * src[1..8] + (rnd >> 3)) >> 3

    add     dest_reg, r2
    add           r1, r2
    dec           r3d
    jne .next1drow
    REP_RET
 
.both_non_zero: ; general case, bilinear
    movd          m4, r4d         ; x
    movd          m6, r5d         ; y
%ifidn %2, rv40
%ifdef PIC
    lea           r8, [rnd_rv40_2d_tbl]
%endif
%if ARCH_X86_64 == 0
    mov           r5, r0m
%endif
%endif
    mov           r6, rsp         ; backup stack pointer
    and          rsp, ~(mmsize-1) ; align stack
    sub          rsp, 16          ; AA and DD

    punpcklwd     m4, m4
    punpcklwd     m6, m6
    punpckldq     m4, m4          ; mm4 = x words
    punpckldq     m6, m6          ; mm6 = y words
    movq          m5, m4
    pmullw        m4, m6          ; mm4 = x * y
    psllw         m5, 3
    psllw         m6, 3
    movq          m7, m5
    paddw         m7, m6
    movq     [rsp+8], m4          ; DD = x * y
    psubw         m5, m4          ; mm5 = B = 8x - xy
    psubw         m6, m4          ; mm6 = C = 8y - xy
    paddw         m4, [pw_64]
    psubw         m4, m7          ; mm4 = A = xy - (8x+8y) + 64
    pxor          m7, m7
    movq     [rsp  ], m4

    movq          m0, [r1  ]      ; mm0 = src[0..7]
    movq          m1, [r1+1]      ; mm1 = src[1..8]
.next2drow:
    add           r1, r2

    movq          m2, m0
    movq          m3, m1
    punpckhbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    pmullw        m0, [rsp]
    pmullw        m2, [rsp]
    pmullw        m1, m5
    pmullw        m3, m5
    paddw         m2, m1          ; mm2 = A * src[0..3] + B * src[1..4]
    paddw         m3, m0          ; mm3 = A * src[4..7] + B * src[5..8]

    movq          m0, [r1]
    movq          m1, m0
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    pmullw        m0, m6
    pmullw        m1, m6
    paddw         m2, m0
    paddw         m3, m1          ; [mm2,mm3] += C * src[0..7]

    movq          m1, [r1+1]
    movq          m0, m1
    movq          m4, m1
    punpcklbw     m0, m7
    punpckhbw     m4, m7
    pmullw        m0, [rsp+8]
    pmullw        m4, [rsp+8]
    paddw         m2, m0
    paddw         m3, m4          ; [mm2,mm3] += D * src[1..8]
    movq          m0, [r1]

    paddw         m2, [rnd_2d_%2+rnd_bias*8]
    paddw         m3, [rnd_2d_%2+rnd_bias*8]
    psrlw         m2, 6
    psrlw         m3, 6
    packuswb      m2, m3
    CHROMAMC_AVG  m2, [dest_reg]
    movq  [dest_reg], m2          ; dst[0..7] = ([mm2,mm3] + rnd) >> 6

    add     dest_reg, r2
    dec          r3d
    jne .next2drow
    mov          rsp, r6          ; restore stack pointer
    RET
%endmacro
 
%macro chroma_mc4_mmx_func 2
%define extra_regs 0
%ifidn %2, rv40
%ifdef PIC
%define extra_regs 1
%endif ; PIC
%endif ; rv40
cglobal %1_%2_chroma_mc4, 6, 6 + extra_regs, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    pxor          m7, m7
    movd          m2, r4d         ; x
    movd          m3, r5d         ; y
    movq          m4, [pw_8]
    movq          m5, [pw_8]
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    punpcklwd     m2, m2
    punpcklwd     m3, m3
    psubw         m4, m2
    psubw         m5, m3

%ifidn %2, rv40
%ifdef PIC
    lea           r6, [rnd_rv40_2d_tbl]
%define rnd_2d_rv40 r6
%else
%define rnd_2d_rv40 rnd_rv40_2d_tbl
%endif
    and           r5, 6         ; &~1 for mx/my=[0,7]
    lea           r5, [r5*4+r4]
    sar          r5d, 1
%define rnd_bias r5
%else ; vc1, h264
%define rnd_bias 0
%endif

    movd          m0, [r1  ]
    movd          m6, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m6, m7
    pmullw        m0, m4
    pmullw        m6, m2
    paddw         m6, m0

.next2rows:
    movd          m0, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    pmullw        m0, m4
    pmullw        m1, m2
    paddw         m1, m0
    movq          m0, m1

    pmullw        m6, m5
    pmullw        m1, m3
    paddw         m6, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m6
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m6, [r0]
    movd        [r0], m1
    add           r0, r2

    movd          m6, [r1  ]
    movd          m1, [r1+1]
    add           r1, r2
    punpcklbw     m6, m7
    punpcklbw     m1, m7
    pmullw        m6, m4
    pmullw        m1, m2
    paddw         m1, m6
    movq          m6, m1
    pmullw        m0, m5
    pmullw        m1, m3
    paddw         m0, [rnd_2d_%2+rnd_bias*8]
    paddw         m1, m0
    psrlw         m1, 6
    packuswb      m1, m1
    CHROMAMC_AVG4 m1, m0, [r0]
    movd        [r0], m1
    add           r0, r2
    sub          r3d, 2
    jnz .next2rows
    REP_RET
%endmacro
 
%macro chroma_mc2_mmx_func 2
cglobal %1_%2_chroma_mc2, 6, 7, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif

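    ; Pack the four bilinear weights into two dwords so that pmaddwd can
    ; produce both output pixels of a row at once:
    ;   r4d = x*(8-y) << 16 | (8-x)*(8-y)   ->  mm5 = {A,B,A,B}
    ;   r5d =   x*y   << 16 | (8-x)*y       ->  mm6 = {C,D,C,D}
    ; with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y.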
    mov          r6d, r4d
    shl          r4d, 16
    sub          r4d, r6d
    add          r4d, 8
    imul         r5d, r4d         ; x*y<<16 | y*(8-x)
    shl          r4d, 3
    sub          r4d, r5d         ; x*(8-y)<<16 | (8-x)*(8-y)

    movd          m5, r4d
    movd          m6, r5d
    punpckldq     m5, m5          ; mm5 = {A,B,A,B}
    punpckldq     m6, m6          ; mm6 = {C,D,C,D}
    pxor          m7, m7
    movd          m2, [r1]
    punpcklbw     m2, m7
    pshufw        m2, m2, 0x94    ; mm2 = src[0,1,1,2]

.nextrow:
    add           r1, r2
    movq          m1, m2
    pmaddwd       m1, m5          ; mm1 = A * src[0,1] + B * src[1,2]
    movd          m0, [r1]
    punpcklbw     m0, m7
    pshufw        m0, m0, 0x94    ; mm0 = src[0,1,1,2]
    movq          m2, m0
    pmaddwd       m0, m6
    paddw         m1, [rnd_2d_%2]
    paddw         m1, m0          ; mm1 += C * src[0,1] + D * src[1,2]
    psrlw         m1, 6
    packssdw      m1, m7
    packuswb      m1, m7
    CHROMAMC_AVG4 m1, m3, [r0]
    movd         r5d, m1
    mov         [r0], r5w
    add           r0, r2
    sub          r3d, 1
    jnz .nextrow
    REP_RET
%endmacro

%define rnd_1d_h264 pw_4
%define rnd_2d_h264 pw_32
%define rnd_1d_vc1  pw_3
%define rnd_2d_vc1  pw_28

%macro NOTHING 2-3
%endmacro
%macro DIRECT_AVG 2
    PAVGB         %1, %2
%endmacro
%macro COPY_AVG 3
    movd          %2, %3
    PAVGB         %1, %2
%endmacro

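; Instantiate the kernels. For the put_* versions CHROMAMC_AVG/CHROMAMC_AVG4
; expand to nothing; for the avg_* versions they PAVGB-average the result with
; the pixels already in the destination.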
INIT_MMX mmx
%define CHROMAMC_AVG  NOTHING
%define CHROMAMC_AVG4 NOTHING
chroma_mc8_mmx_func put, h264, _rnd
chroma_mc8_mmx_func put, vc1,  _nornd
chroma_mc8_mmx_func put, rv40
chroma_mc4_mmx_func put, h264
chroma_mc4_mmx_func put, rv40

INIT_MMX mmxext
chroma_mc2_mmx_func put, h264

%define CHROMAMC_AVG  DIRECT_AVG
%define CHROMAMC_AVG4 COPY_AVG
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40
chroma_mc2_mmx_func avg, h264

INIT_MMX 3dnow
chroma_mc8_mmx_func avg, h264, _rnd
chroma_mc8_mmx_func avg, vc1,  _nornd
chroma_mc8_mmx_func avg, rv40
chroma_mc4_mmx_func avg, h264
chroma_mc4_mmx_func avg, rv40

%macro chroma_mc8_ssse3_func 2-3
cglobal %1_%2_chroma_mc8%3, 6, 7, 8
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov          r6d, r5d
    or           r6d, r4d
    jne .at_least_one_non_zero
    ; mx == 0 AND my == 0 - no filter needed
    mv0_pixels_mc8
    REP_RET

.at_least_one_non_zero:
    test         r5d, r5d
    je .my_is_zero
    test         r4d, r4d
    je .mx_is_zero

    ; general case, bilinear
    mov          r6d, r4d
    shl          r4d, 8
    sub           r4, r6
    mov           r6, 8
    add           r4, 8           ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul          r6, r4          ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

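    ; The weights are now packed as byte pairs:
    ;   r6d = (8-y)*x << 8 | (8-y)*(8-x)   ->  m7 = {A,B} replicated
    ;   r4d =    y *x << 8 |    y *(8-x)   ->  m6 = {C,D} replicated
    ; with A = (8-x)*(8-y), B = x*(8-y), C = (8-x)*y, D = x*y. The source bytes
    ; are interleaved as {src[i], src[i+1]} pairs, so each pmaddubsw yields
    ; A*src[i] + B*src[i+1] (resp. C/D for the row below) in one 16-bit lane.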
    movd          m7, r6d
    movd          m6, r4d
    movdqa        m5, [rnd_2d_%2]
    movq          m0, [r1  ]
    movq          m1, [r1+1]
    pshuflw       m7, m7, 0
    pshuflw       m6, m6, 0
    punpcklbw     m0, m1
    movlhps       m7, m7
    movlhps       m6, m6

.next2rows:
    movq          m1, [r1+r2*1   ]
    movq          m2, [r1+r2*1+1]
    movq          m3, [r1+r2*2  ]
    movq          m4, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    punpcklbw     m1, m2
    movdqa        m2, m1
    punpcklbw     m3, m4
    movdqa        m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movdqa        m0, m4
    psrlw         m3, 6
%ifidn %1, avg
    movq          m2, [r0   ]
    movhps        m2, [r0+r2]
%endif
    packuswb      m1, m3
    CHROMAMC_AVG  m1, m2
    movq     [r0   ], m1
    movhps   [r0+r2], m1
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET

.my_is_zero:
    mov          r5d, r4d
    shl          r4d, 8
    add           r4, 8
    sub           r4, r5          ; 255*x+8 = x<<8 | (8-x)
    movd          m7, r4d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2xrows:
    movq          m0, [r1     ]
    movq          m1, [r1   +1]
    movq          m2, [r1+r2  ]
    movq          m3, [r1+r2+1]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    lea           r1, [r1+r2*2]
    jg .next2xrows
    REP_RET

.mx_is_zero:
    mov          r4d, r5d
    shl          r5d, 8
    add           r5, 8
    sub           r5, r4          ; 255*y+8 = y<<8 | (8-y)
    movd          m7, r5d
    movdqa        m6, [rnd_1d_%2]
    pshuflw       m7, m7, 0
    movlhps       m7, m7

.next2yrows:
    movq          m0, [r1     ]
    movq          m1, [r1+r2  ]
    movdqa        m2, m1
    movq          m3, [r1+r2*2]
    lea           r1, [r1+r2*2]
    punpcklbw     m0, m1
    punpcklbw     m2, m3
    pmaddubsw     m0, m7
    pmaddubsw     m2, m7
%ifidn %1, avg
    movq          m4, [r0   ]
    movhps        m4, [r0+r2]
%endif
    paddw         m0, m6
    paddw         m2, m6
    psrlw         m0, 3
    psrlw         m2, 3
    packuswb      m0, m2
    CHROMAMC_AVG  m0, m4
    movq     [r0   ], m0
    movhps   [r0+r2], m0
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2yrows
    REP_RET
%endmacro

%macro chroma_mc4_ssse3_func 2
cglobal %1_%2_chroma_mc4, 6, 7, 0
%if ARCH_X86_64
    movsxd        r2, r2d
%endif
    mov           r6, r4
    shl          r4d, 8
    sub          r4d, r6d
    mov           r6, 8
    add          r4d, 8           ; x*255+8 = x<<8 | (8-x)
    sub          r6d, r5d
    imul         r6d, r4d         ; (8-y)*(x*255+8) = (8-y)*x<<8 | (8-y)*(8-x)
    imul         r4d, r5d         ;    y *(x*255+8) =    y *x<<8 |    y *(8-x)

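    ; Same byte-pair weight layout as in chroma_mc8_ssse3_func above:
    ; m7 = {A,B} pairs, m6 = {C,D} pairs, applied with pmaddubsw to
    ; interleaved {src[i], src[i+1]} bytes.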
    movd          m7, r6d
    movd          m6, r4d
    movq          m5, [pw_32]
    movd          m0, [r1  ]
    pshufw        m7, m7, 0
    punpcklbw     m0, [r1+1]
    pshufw        m6, m6, 0

.next2rows:
    movd          m1, [r1+r2*1  ]
    movd          m3, [r1+r2*2  ]
    punpcklbw     m1, [r1+r2*1+1]
    punpcklbw     m3, [r1+r2*2+1]
    lea           r1, [r1+r2*2]
    movq          m2, m1
    movq          m4, m3
    pmaddubsw     m0, m7
    pmaddubsw     m1, m6
    pmaddubsw     m2, m7
    pmaddubsw     m3, m6
    paddw         m0, m5
    paddw         m2, m5
    paddw         m1, m0
    paddw         m3, m2
    psrlw         m1, 6
    movq          m0, m4
    psrlw         m3, 6
    packuswb      m1, m1
    packuswb      m3, m3
    CHROMAMC_AVG  m1, [r0  ]
    CHROMAMC_AVG  m3, [r0+r2]
    movd     [r0   ], m1
    movd     [r0+r2], m3
    sub          r3d, 2
    lea           r0, [r0+r2*2]
    jg .next2rows
    REP_RET
%endmacro

%define CHROMAMC_AVG NOTHING
INIT_XMM ssse3
chroma_mc8_ssse3_func put, h264, _rnd
chroma_mc8_ssse3_func put, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func put, h264

%define CHROMAMC_AVG DIRECT_AVG
INIT_XMM ssse3
chroma_mc8_ssse3_func avg, h264, _rnd
chroma_mc8_ssse3_func avg, vc1,  _nornd
INIT_MMX ssse3
chroma_mc4_ssse3_func avg, h264