Subversion Repositories Kolibri OS

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
6148 serge 1
;*****************************************************************************
2
;* MMX/SSE2/SSSE3-optimized H.264 QPEL code
3
;*****************************************************************************
4
;* Copyright (c) 2004-2005 Michael Niedermayer, Loren Merritt
5
;* Copyright (C) 2012 Daniel Kang
6
;*
7
;* Authors: Daniel Kang 
8
;*
9
;* This file is part of FFmpeg.
10
;*
11
;* FFmpeg is free software; you can redistribute it and/or
12
;* modify it under the terms of the GNU Lesser General Public
13
;* License as published by the Free Software Foundation; either
14
;* version 2.1 of the License, or (at your option) any later version.
15
;*
16
;* FFmpeg is distributed in the hope that it will be useful,
17
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
18
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
19
;* Lesser General Public License for more details.
20
;*
21
;* You should have received a copy of the GNU Lesser General Public
22
;* License along with FFmpeg; if not, write to the Free Software
23
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
24
;******************************************************************************
25
 
26
%include "libavutil/x86/x86util.asm"
27
 
28
SECTION_RODATA 32
29
 
30
cextern pw_16
31
cextern pw_5
32
cextern pb_0
33
 
34
SECTION .text
35
 
36
 
37
; op_avgh %1=src reg, %2=dst mem (movh-sized: 4 bytes mmx low / 8 bytes xmm low),
;         %3=scratch reg
; "avg" write-out for half-register rows: average the packed bytes in %1 with
; the bytes already at %2 (rounded, via pavgb) and store the result back.
%macro op_avgh 3
    movh   %3, %2
    pavgb  %1, %3
    movh   %2, %1
%endmacro
42
 
43
; op_avg %1=src reg, %2=dst mem, (%3 accepted but unused, so call sites can
; pass the same scratch arg as for op_avgh)
; Full-register "avg" write-out: average %1 with the destination and store.
%macro op_avg 2-3
    pavgb  %1, %2
    mova   %2, %1
%endmacro
47
 
48
; op_puth %1=src reg, %2=dst mem, (%3 accepted but unused)
; "put" write-out for half-register rows: plain store of the low half of %1.
%macro op_puth 2-3
    movh   %2, %1
%endmacro
51
 
52
; op_put %1=src reg, %2=dst mem, (%3 accepted but unused)
; Full-register "put" write-out: plain store of %1.
%macro op_put 2-3
    mova   %2, %1
%endmacro
55
 
56
;-----------------------------------------------------------------------------
; void %1_h264_qpel4_h_lowpass(uint8_t *dst, uint8_t *src,
;                              int dstStride, int srcStride)
; 4-wide horizontal 6-tap lowpass over 4 rows:
;   out = clip8((20*(src[0]+src[1]) - 5*(src[-1]+src[2])
;                + (src[-2]+src[3]) + 16) >> 5)
; %1 selects the write-out op (put/avg).
;-----------------------------------------------------------------------------
%macro QPEL4_H_LOWPASS_OP 1
cglobal %1_h264_qpel4_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    pxor          m7, m7          ; zero, for byte->word unpacking
    mova          m4, [pw_5]
    mova          m5, [pw_16]
    mov          r4d, 4           ; row counter
.loop:
    movh          m1, [r1-1]
    movh          m2, [r1+0]
    movh          m3, [r1+1]
    movh          m0, [r1+2]
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m0, m7
    paddw         m1, m0          ; m1 = src[-1]+src[2]   (b+e)
    paddw         m2, m3          ; m2 = src[0]+src[1]    (c+d)
    movh          m0, [r1-2]
    movh          m3, [r1+3]
    punpcklbw     m0, m7
    punpcklbw     m3, m7
    paddw         m0, m3          ; m0 = src[-2]+src[3]   (a+f)
    psllw         m2, 2
    psubw         m2, m1          ; 4*(c+d) - (b+e)
    pmullw        m2, m4          ; 20*(c+d) - 5*(b+e)
    paddw         m0, m5          ; (a+f) + 16 rounding bias
    paddw         m0, m2
    psraw         m0, 5
    packuswb      m0, m0          ; saturate to unsigned bytes
    op_%1h        m0, [r0], m6
    add           r0, r2
    add           r1, r3
    dec          r4d
    jg         .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_OP put
QPEL4_H_LOWPASS_OP avg
98
 
99
;-----------------------------------------------------------------------------
; void %1_h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src,
;                              int dstStride, int srcStride)
; 8-wide horizontal 6-tap lowpass over 8 rows (MMX).  Same filter as the
; qpel4 version -- out = (20*(c+d) - 5*(b+e) + (a+f) + 16) >> 5 with taps
; a..f = src[-2..3] -- but each row is processed as two 4-pixel halves
; using the low/high byte unpacks of 8-byte loads.
;-----------------------------------------------------------------------------
%macro QPEL8_H_LOWPASS_OP 1
cglobal %1_h264_qpel8_h_lowpass, 4,5 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    mov          r4d, 8           ; row counter
    pxor          m7, m7          ; zero, for unpacking
    mova          m6, [pw_5]
.loop:
    mova          m0, [r1]
    mova          m2, [r1+1]
    mova          m1, m0
    mova          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    paddw         m0, m2          ; c+d, pixels 0-3
    paddw         m1, m3          ; c+d, pixels 4-7
    psllw         m0, 2
    psllw         m1, 2
    mova          m2, [r1-1]
    mova          m4, [r1+2]
    mova          m3, m2
    mova          m5, m4
    punpcklbw     m2, m7
    punpckhbw     m3, m7          ; m3 = words src[3..6], reused below for a+f
    punpcklbw     m4, m7          ; m4 = words src[2..5], reused below for a+f
    punpckhbw     m5, m7
    paddw         m2, m4          ; b+e, pixels 0-3
    paddw         m5, m3          ; b+e, pixels 4-7
    psubw         m0, m2          ; 4*(c+d) - (b+e)
    psubw         m1, m5
    pmullw        m0, m6          ; 20*(c+d) - 5*(b+e)
    pmullw        m1, m6
    movd          m2, [r1-2]      ; bytes src[-2..1]
    movd          m5, [r1+7]      ; bytes src[7..10]
    punpcklbw     m2, m7
    punpcklbw     m5, m7
    paddw         m2, m3          ; a+f, pixels 0-3 (src[-2..1] + src[3..6])
    paddw         m4, m5          ; a+f, pixels 4-7 (src[2..5] + src[7..10])
    mova          m5, [pw_16]
    paddw         m2, m5          ; + rounding bias
    paddw         m4, m5
    paddw         m0, m2
    paddw         m1, m4
    psraw         m0, 5
    psraw         m1, 5
    packuswb      m0, m1          ; saturate and merge both halves
    op_%1         m0, [r0], m4
    add           r0, r2
    add           r1, r3
    dec          r4d
    jg         .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_OP put
QPEL8_H_LOWPASS_OP avg
158
 
159
;-----------------------------------------------------------------------------
; void %1_h264_qpel8_h_lowpass(uint8_t *dst, uint8_t *src,
;                              int dstStride, int srcStride)
; SSSE3 8-wide horizontal 6-tap lowpass: a single unaligned 16-byte load
; covers all six taps of the row; the shifted tap vectors are derived with
; palignr on the unpacked words instead of extra memory loads.
;-----------------------------------------------------------------------------
%macro QPEL8_H_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass, 4,5,8 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    mov          r4d, 8           ; row counter
    pxor          m7, m7          ; zero, for unpacking
    mova          m6, [pw_5]
.loop:
    movu          m1, [r1-2]      ; bytes src[-2..13]
    mova          m0, m1
    punpckhbw     m1, m7          ; words src[6..13]
    punpcklbw     m0, m7          ; words src[-2..5]
    mova          m2, m1
    mova          m3, m1
    mova          m4, m1
    mova          m5, m1
    palignr       m4, m0, 2       ; words src[-1..6]
    palignr       m3, m0, 4       ; words src[0..7]
    palignr       m2, m0, 6       ; words src[1..8]
    palignr       m1, m0, 8       ; words src[2..9]
    palignr       m5, m0, 10      ; words src[3..10]
    paddw         m0, m5          ; a+f
    paddw         m2, m3          ; c+d
    paddw         m1, m4          ; b+e
    psllw         m2, 2
    psubw         m2, m1          ; 4*(c+d) - (b+e)
    paddw         m0, [pw_16]     ; (a+f) + rounding bias
    pmullw        m2, m6          ; 20*(c+d) - 5*(b+e)
    paddw         m2, m0
    psraw         m2, 5
    packuswb      m2, m2          ; saturate to bytes
    op_%1h        m2, [r0], m4
    add           r1, r3
    add           r0, r2
    dec          r4d
    jne        .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_OP_XMM put
QPEL8_H_LOWPASS_OP_XMM avg
201
 
202
 
203
;-----------------------------------------------------------------------------
; void %1_h264_qpel4_h_lowpass_l2(uint8_t *dst, uint8_t *src, uint8_t *src2,
;                                 int dstStride, int src2Stride)
; Same 4-wide horizontal 6-tap lowpass as %1_h264_qpel4_h_lowpass, but the
; filtered row is averaged with the corresponding row of src2 before the
; write-out (half-pel blend for the quarter-pel positions).
; Note: src (r1) advances by r3 together with dst -- src and dst share the
; same stride here; only src2 (r2) uses r4.
;-----------------------------------------------------------------------------
%macro QPEL4_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel4_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    pxor          m7, m7          ; zero, for unpacking
    mova          m4, [pw_5]
    mova          m5, [pw_16]
    mov          r5d, 4           ; row counter
.loop:
    movh          m1, [r1-1]
    movh          m2, [r1+0]
    movh          m3, [r1+1]
    movh          m0, [r1+2]
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m0, m7
    paddw         m1, m0          ; b+e
    paddw         m2, m3          ; c+d
    movh          m0, [r1-2]
    movh          m3, [r1+3]
    punpcklbw     m0, m7
    punpcklbw     m3, m7
    paddw         m0, m3          ; a+f
    psllw         m2, 2
    psubw         m2, m1          ; 4*(c+d) - (b+e)
    pmullw        m2, m4          ; 20*(c+d) - 5*(b+e)
    paddw         m0, m5          ; + rounding bias
    paddw         m0, m2
    movh          m3, [r2]        ; src2 row to blend with
    psraw         m0, 5
    packuswb      m0, m0
    pavgb         m0, m3          ; blend filtered row with src2
    op_%1h        m0, [r0], m6
    add           r0, r3
    add           r1, r3
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_H_LOWPASS_L2_OP put
QPEL4_H_LOWPASS_L2_OP avg
248
 
249
 
250
;-----------------------------------------------------------------------------
; void %1_h264_qpel8_h_lowpass_l2(uint8_t *dst, uint8_t *src, uint8_t *src2,
;                                 int dstStride, int src2Stride)
; 8-wide horizontal 6-tap lowpass blended (pavgb) with src2 before write-out.
; Filter body is identical to %1_h264_qpel8_h_lowpass (two 4-pixel halves);
; src and dst both advance by r3, src2 by r4.
;-----------------------------------------------------------------------------
%macro QPEL8_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    mov          r5d, 8           ; row counter
    pxor          m7, m7          ; zero, for unpacking
    mova          m6, [pw_5]
.loop:
    mova          m0, [r1]
    mova          m2, [r1+1]
    mova          m1, m0
    mova          m3, m2
    punpcklbw     m0, m7
    punpckhbw     m1, m7
    punpcklbw     m2, m7
    punpckhbw     m3, m7
    paddw         m0, m2          ; c+d, pixels 0-3
    paddw         m1, m3          ; c+d, pixels 4-7
    psllw         m0, 2
    psllw         m1, 2
    mova          m2, [r1-1]
    mova          m4, [r1+2]
    mova          m3, m2
    mova          m5, m4
    punpcklbw     m2, m7
    punpckhbw     m3, m7          ; m3 = words src[3..6], reused for a+f
    punpcklbw     m4, m7          ; m4 = words src[2..5], reused for a+f
    punpckhbw     m5, m7
    paddw         m2, m4          ; b+e, pixels 0-3
    paddw         m5, m3          ; b+e, pixels 4-7
    psubw         m0, m2          ; 4*(c+d) - (b+e)
    psubw         m1, m5
    pmullw        m0, m6          ; 20*(c+d) - 5*(b+e)
    pmullw        m1, m6
    movd          m2, [r1-2]
    movd          m5, [r1+7]
    punpcklbw     m2, m7
    punpcklbw     m5, m7
    paddw         m2, m3          ; a+f, pixels 0-3
    paddw         m4, m5          ; a+f, pixels 4-7
    mova          m5, [pw_16]
    paddw         m2, m5          ; + rounding bias
    paddw         m4, m5
    paddw         m0, m2
    paddw         m1, m4
    psraw         m0, 5
    psraw         m1, 5
    mova          m4, [r2]        ; src2 row to blend with
    packuswb      m0, m1
    pavgb         m0, m4          ; blend filtered row with src2
    op_%1         m0, [r0], m4
    add           r0, r3
    add           r1, r3
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8_H_LOWPASS_L2_OP put
QPEL8_H_LOWPASS_L2_OP avg
312
 
313
 
314
;-----------------------------------------------------------------------------
; void %1_h264_qpel8_h_lowpass_l2(uint8_t *dst, uint8_t *src, uint8_t *src2,
;                                 int dstStride, int src2Stride)
; SSSE3 variant of the 8-wide lowpass-plus-blend: one unaligned 16-byte load
; plus palignr provides all six taps (see QPEL8_H_LOWPASS_OP_XMM), then the
; result is averaged with the src2 row.  src and dst both advance by r3.
;-----------------------------------------------------------------------------
%macro QPEL8_H_LOWPASS_L2_OP_XMM 1
cglobal %1_h264_qpel8_h_lowpass_l2, 5,6,8 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    mov          r5d, 8           ; row counter
    pxor          m7, m7          ; zero, for unpacking
    mova          m6, [pw_5]
.loop:
    lddqu         m1, [r1-2]      ; bytes src[-2..13]
    mova          m0, m1
    punpckhbw     m1, m7          ; words src[6..13]
    punpcklbw     m0, m7          ; words src[-2..5]
    mova          m2, m1
    mova          m3, m1
    mova          m4, m1
    mova          m5, m1
    palignr       m4, m0, 2       ; words src[-1..6]
    palignr       m3, m0, 4       ; words src[0..7]
    palignr       m2, m0, 6       ; words src[1..8]
    palignr       m1, m0, 8       ; words src[2..9]
    palignr       m5, m0, 10      ; words src[3..10]
    paddw         m0, m5          ; a+f
    paddw         m2, m3          ; c+d
    paddw         m1, m4          ; b+e
    psllw         m2, 2
    movh          m3, [r2]        ; src2 row to blend with
    psubw         m2, m1          ; 4*(c+d) - (b+e)
    paddw         m0, [pw_16]     ; (a+f) + rounding bias
    pmullw        m2, m6          ; 20*(c+d) - 5*(b+e)
    paddw         m2, m0
    psraw         m2, 5
    packuswb      m2, m2
    pavgb         m2, m3          ; blend filtered row with src2
    op_%1h        m2, [r0], m4
    add           r1, r3
    add           r0, r3
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8_H_LOWPASS_L2_OP_XMM put
QPEL8_H_LOWPASS_L2_OP_XMM avg
359
 
360
 
361
; All functions that call this are required to have function arguments of
; dst, src, dstStride, srcStride
;
; FILT_V: one step of the vertical 6-tap lowpass.  On entry m0..m4 hold the
; five oldest source rows unpacked to words (m0 = oldest, m4 = newest) and
; m7 must be zero; the sixth row is loaded from [r1].  Emits one filtered,
; clipped row to [r0] via op_%1h, advances r0/r1 by one stride each, then
; SWAPs the registers so the 5-row window slides down by one row.
%macro FILT_V 1
    mova      m6, m2
    movh      m5, [r1]            ; load next source row (tap f)
    paddw     m6, m3              ; c+d
    psllw     m6, 2
    psubw     m6, m1              ; - b
    psubw     m6, m4              ; - e  -> 4*(c+d) - (b+e)
    punpcklbw m5, m7
    pmullw    m6, [pw_5]          ; 20*(c+d) - 5*(b+e)
    paddw     m0, [pw_16]         ; a + rounding bias
    add       r1, r3
    paddw     m0, m5              ; a + f + 16
    paddw     m6, m0
    psraw     m6, 5
    packuswb  m6, m6              ; saturate to bytes
    op_%1h    m6, [r0], m0 ; 1
    add       r0, r2
    SWAP       0, 1, 2, 3, 4, 5   ; slide the row window: m0<-m1 ... m4<-m5
%endmacro
382
 
383
;-----------------------------------------------------------------------------
; void %1_h264_qpel4_v_lowpass(uint8_t *dst, uint8_t *src,
;                              int dstStride, int srcStride)
; 4-wide vertical 6-tap lowpass over 4 rows.  Rewinds src by two rows,
; preloads five rows into m0..m4 as words, then emits one output row per
; FILT_V invocation.
;-----------------------------------------------------------------------------
%macro QPEL4_V_LOWPASS_OP 1
cglobal %1_h264_qpel4_v_lowpass, 4,4 ; dst, src, dstStride, srcStride
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    sub           r1, r3
    sub           r1, r3          ; src -= 2*srcStride (start at row -2)
    pxor          m7, m7          ; zero, required by FILT_V's unpacking
    movh          m0, [r1]
    movh          m1, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m2, [r1]
    movh          m3, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m4, [r1]
    add           r1, r3
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    RET
%endmacro

INIT_MMX mmxext
QPEL4_V_LOWPASS_OP put
QPEL4_V_LOWPASS_OP avg
413
 
414
 
415
 
416
;-----------------------------------------------------------------------------
; Vertical 6-tap lowpass over 8 rows, or 16 rows when r4d (h) == 16.
; The sse2 build rewinds src by 2*srcStride itself and uses the plain name;
; the mmxext build is exported as *_lowpass_op and does NOT rewind --
; presumably its caller passes src already adjusted (TODO confirm against
; the C wrappers).  Row filtering is done by FILT_V (m7 must be zero).
;-----------------------------------------------------------------------------
%macro QPEL8OR16_V_LOWPASS_OP 1
%if cpuflag(sse2)
cglobal %1_h264_qpel8or16_v_lowpass, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    sub           r1, r3
    sub           r1, r3          ; src -= 2*srcStride (start at row -2)
%else
cglobal %1_h264_qpel8or16_v_lowpass_op, 5,5,8 ; dst, src, dstStride, srcStride, h
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
%endif
    pxor          m7, m7          ; zero, required by FILT_V
    movh          m0, [r1]
    movh          m1, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m2, [r1]
    movh          m3, [r1+r3]
    lea           r1, [r1+2*r3]
    movh          m4, [r1]
    add           r1, r3
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    cmp          r4d, 16          ; 8 more rows for 16-tall blocks
    jne         .end
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
    FILT_V        %1
.end:
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg

INIT_XMM sse2
QPEL8OR16_V_LOWPASS_OP put
QPEL8OR16_V_LOWPASS_OP avg
471
 
472
 
473
; All functions that use this are required to have args:
; src, tmp, srcSize
;
; FILT_HV %1=byte offset into the tmp buffer.
; One step of the first (vertical) pass of the HV filter.  Same 6-tap window
; as FILT_V (m0..m4 = five oldest rows as words, m7 zero, next row loaded
; from [r0]), but the 16-bit intermediate
;   20*(c+d) - 5*(b+e) + (a+f) + 16
; is stored UNSHIFTED to [r1+%1] for the second (horizontal) pass.
%macro FILT_HV 1 ; offset
    mova           m6, m2
    movh           m5, [r0]       ; load next source row (tap f)
    paddw          m6, m3         ; c+d
    psllw          m6, 2
    paddw          m0, [pw_16]    ; a + rounding bias for the first pass
    psubw          m6, m1         ; - b
    psubw          m6, m4         ; - e  -> 4*(c+d) - (b+e)
    punpcklbw      m5, m7
    pmullw         m6, [pw_5]     ; 20*(c+d) - 5*(b+e)
    paddw          m0, m5         ; a + f + 16
    add            r0, r2
    paddw          m6, m0
    mova      [r1+%1], m6         ; store word intermediate to tmp
    SWAP            0, 1, 2, 3, 4, 5 ; slide the row window down one row
%endmacro
491
 
492
;-----------------------------------------------------------------------------
; Two-pass 4-wide HV (center) filter, split into:
;   %1_h264_qpel4_hv_lowpass_v(uint8_t *src, int16_t *tmp, int srcStride)
;     vertical pass: writes 4 rows of word intermediates at a 24-byte
;     tmp stride via FILT_HV.  No rewind is done here -- src is presumably
;     already pointing 2 rows above the block (TODO confirm in the caller).
;   %1_h264_qpel4_hv_lowpass_h(int16_t *tmp, uint8_t *dst, int dstStride)
;     horizontal pass: 6-tap over the word intermediates using the staged-
;     shift form ((((a+f-B)>>2 - B) + C)>>2 + C)>>6 with B=b+e, C=c+d,
;     which approximates (20*C - 5*B + (a+f)) >> 10 (rounding bias was
;     already added in the first pass).
;-----------------------------------------------------------------------------
%macro QPEL4_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel4_hv_lowpass_v, 3,3 ; src, tmp, srcStride
    movsxdifnidn  r2, r2d
    pxor          m7, m7          ; zero, required by FILT_HV
    movh          m0, [r0]
    movh          m1, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m2, [r0]
    movh          m3, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m4, [r0]
    add           r0, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_HV       0*24            ; tmp rows are 24 bytes apart
    FILT_HV       1*24
    FILT_HV       2*24
    FILT_HV       3*24
    RET

cglobal %1_h264_qpel4_hv_lowpass_h, 3,4 ; tmp, dst, dstStride
    movsxdifnidn  r2, r2d
    mov          r3d, 4           ; row counter
.loop:
    mova          m0, [r0]        ; word taps: a
    paddw         m0, [r0+10]     ; + f (+5 words)       -> a+f
    mova          m1, [r0+2]
    paddw         m1, [r0+8]      ; b+e
    mova          m2, [r0+4]
    paddw         m2, [r0+6]      ; c+d
    psubw         m0, m1
    psraw         m0, 2
    psubw         m0, m1          ; (a+f-B)>>2 - B
    paddsw        m0, m2          ; saturating add: intermediates may overflow
    psraw         m0, 2
    paddw         m0, m2
    psraw         m0, 6           ; total downshift of the two-pass filter
    packuswb      m0, m0
    op_%1h        m0, [r1], m7
    add           r0, 24          ; next tmp row
    add           r1, r2
    dec          r3d
    jnz        .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL4_HV1_LOWPASS_OP put
QPEL4_HV1_LOWPASS_OP avg
544
 
545
;-----------------------------------------------------------------------------
; void %1_h264_qpel8or16_hv1_lowpass_op(uint8_t *src, int16_t *tmp,
;                                       int srcStride, int size)
; First (vertical) pass of the 8/16-wide HV filter: writes 8 rows of word
; intermediates -- 16 rows when r3d (size) == 16 -- at a 48-byte tmp stride
; via FILT_HV.  As with the qpel4 variant, src is presumably pre-adjusted by
; the caller to point 2 rows above the block (TODO confirm).
; Note: only 'put' is instantiated for sse2 below.
;-----------------------------------------------------------------------------
%macro QPEL8OR16_HV1_LOWPASS_OP 1
cglobal %1_h264_qpel8or16_hv1_lowpass_op, 4,4,8 ; src, tmp, srcStride, size
    movsxdifnidn  r2, r2d
    pxor          m7, m7          ; zero, required by FILT_HV
    movh          m0, [r0]
    movh          m1, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m2, [r0]
    movh          m3, [r0+r2]
    lea           r0, [r0+2*r2]
    movh          m4, [r0]
    add           r0, r2
    punpcklbw     m0, m7
    punpcklbw     m1, m7
    punpcklbw     m2, m7
    punpcklbw     m3, m7
    punpcklbw     m4, m7
    FILT_HV     0*48              ; tmp rows are 48 bytes apart
    FILT_HV     1*48
    FILT_HV     2*48
    FILT_HV     3*48
    FILT_HV     4*48
    FILT_HV     5*48
    FILT_HV     6*48
    FILT_HV     7*48
    cmp          r3d, 16          ; 8 more rows for 16-tall blocks
    jne         .end
    FILT_HV     8*48
    FILT_HV     9*48
    FILT_HV    10*48
    FILT_HV    11*48
    FILT_HV    12*48
    FILT_HV    13*48
    FILT_HV    14*48
    FILT_HV    15*48
.end:
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV1_LOWPASS_OP put
QPEL8OR16_HV1_LOWPASS_OP avg

INIT_XMM sse2
QPEL8OR16_HV1_LOWPASS_OP put
590
 
591
 
592
 
593
;-----------------------------------------------------------------------------
; Second (horizontal) pass of the 8/16-wide HV filter over the word tmp
; buffer (48-byte row stride): 8 output pixels per row, computed as two
; 4-word groups (m0 = pixels 0-3, m3 = pixels 4-7) with the same staged-
; shift form as the qpel4 HV horizontal pass.
;-----------------------------------------------------------------------------
%macro QPEL8OR16_HV2_LOWPASS_OP 1
; unused is to match ssse3 and mmxext args
cglobal %1_h264_qpel8or16_hv2_lowpass_op, 5,5 ; dst, tmp, dstStride, unused, h
    movsxdifnidn  r2, r2d
.loop:
    mova          m0, [r1]        ; a, pixels 0-3
    mova          m3, [r1+8]
    mova          m1, [r1+2]
    mova          m4, [r1+10]
    paddw         m0, m4          ; a+f, pixels 0-3
    paddw         m1, m3          ; b+e, pixels 0-3
    paddw         m3, [r1+18]     ; a+f, pixels 4-7
    paddw         m4, [r1+16]     ; b+e, pixels 4-7
    mova          m2, [r1+4]
    mova          m5, [r1+12]
    paddw         m2, [r1+6]      ; c+d, pixels 0-3
    paddw         m5, [r1+14]     ; c+d, pixels 4-7
    psubw         m0, m1
    psubw         m3, m4
    psraw         m0, 2
    psraw         m3, 2
    psubw         m0, m1          ; (a+f-B)>>2 - B
    psubw         m3, m4
    paddsw        m0, m2          ; saturating: intermediates may overflow
    paddsw        m3, m5
    psraw         m0, 2
    psraw         m3, 2
    paddw         m0, m2
    paddw         m3, m5
    psraw         m0, 6           ; total downshift of the two-pass filter
    psraw         m3, 6
    packuswb      m0, m3          ; saturate and merge both halves
    op_%1         m0, [r0], m7
    add           r1, 48          ; next tmp row
    add           r0, r2
    dec          r4d
    jne        .loop
    REP_RET
%endmacro

INIT_MMX mmxext
QPEL8OR16_HV2_LOWPASS_OP put
QPEL8OR16_HV2_LOWPASS_OP avg
636
 
637
;-----------------------------------------------------------------------------
; void %1_h264_qpel8or16_hv2_lowpass(uint8_t *dst, int16_t *tmp,
;                                    int dstStride, int tmpStride, int size)
; SSSE3 second (horizontal) pass of the HV filter over the word tmp buffer
; (48-byte row stride).  Two code paths chosen once up front:
;   .loop8 -- 8 pixels/row: taps built from [r1] / [r1+16] with palignr.
;   .op16  -- 16 pixels/row: low half from [r1] / [r1+16] (result in m3),
;             high half from [r1+16] / [r1+32] (result in m0); m7 is
;             clobbered as a temporary here.
; Both use the staged-shift form ((((a+f-B)>>2 - B) + C)>>2 + C)>>6.
;-----------------------------------------------------------------------------
%macro QPEL8OR16_HV2_LOWPASS_OP_XMM 1
cglobal %1_h264_qpel8or16_hv2_lowpass, 5,5,8 ; dst, tmp, dstStride, tmpStride, size
    movsxdifnidn  r2, r2d
    movsxdifnidn  r3, r3d
    cmp          r4d, 16
    je         .op16
.loop8:
    mova          m1, [r1+16]     ; words tmp[8..15]
    mova          m0, [r1]        ; words tmp[0..7]
    mova          m2, m1
    mova          m3, m1
    mova          m4, m1
    mova          m5, m1
    palignr       m5, m0, 10      ; tmp[5..12]
    palignr       m4, m0, 8       ; tmp[4..11]
    palignr       m3, m0, 6       ; tmp[3..10]
    palignr       m2, m0, 4       ; tmp[2..9]
    palignr       m1, m0, 2       ; tmp[1..8]
    paddw         m0, m5          ; a+f
    paddw         m1, m4          ; b+e
    paddw         m2, m3          ; c+d
    psubw         m0, m1
    psraw         m0, 2
    psubw         m0, m1          ; (a+f-B)>>2 - B
    paddw         m0, m2
    psraw         m0, 2
    paddw         m0, m2
    psraw         m0, 6           ; total downshift of the two-pass filter
    packuswb      m0, m0
    op_%1h        m0, [r0], m7
    add           r1, 48          ; next tmp row
    add           r0, r2
    dec          r4d
    jne       .loop8
    jmp        .done
.op16:
    mova          m4, [r1+32]     ; words tmp[16..23]
    mova          m5, [r1+16]     ; words tmp[8..15]
    mova          m7, [r1]        ; words tmp[0..7] (m7 used as temp here)
    mova          m3, m4
    mova          m2, m4
    mova          m1, m4
    mova          m0, m4
    palignr       m0, m5, 10      ; high half: a+f source
    palignr       m1, m5, 8
    palignr       m2, m5, 6
    palignr       m3, m5, 4
    palignr       m4, m5, 2
    paddw         m0, m5          ; a+f, pixels 8-15
    paddw         m1, m4          ; b+e, pixels 8-15
    paddw         m2, m3          ; c+d, pixels 8-15
    mova          m6, m5
    mova          m4, m5
    mova          m3, m5
    palignr       m4, m7, 8
    palignr       m6, m7, 2
    palignr       m3, m7, 10
    paddw         m4, m6          ; b+e, pixels 0-7
    mova          m6, m5
    palignr       m5, m7, 6
    palignr       m6, m7, 4
    paddw         m3, m7          ; a+f, pixels 0-7
    paddw         m5, m6          ; c+d, pixels 0-7
    psubw         m0, m1
    psubw         m3, m4
    psraw         m0, 2
    psraw         m3, 2
    psubw         m0, m1          ; (a+f-B)>>2 - B, both halves
    psubw         m3, m4
    paddw         m0, m2
    paddw         m3, m5
    psraw         m0, 2
    psraw         m3, 2
    paddw         m0, m2
    paddw         m3, m5
    psraw         m0, 6           ; total downshift of the two-pass filter
    psraw         m3, 6
    packuswb      m3, m0          ; low half in m3, high half in m0
    op_%1         m3, [r0], m7
    add           r1, 48          ; next tmp row
    add           r0, r2
    dec          r4d
    jne        .op16
.done:
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL8OR16_HV2_LOWPASS_OP_XMM put
QPEL8OR16_HV2_LOWPASS_OP_XMM avg
727
 
728
 
729
;-----------------------------------------------------------------------------
; void %1_pixels4_l2_shift5(uint8_t *dst, int16_t *src16, uint8_t *src8,
;                           int dstStride, int src8Stride, int h)
; Downshift the 16-bit src16 rows by 5, saturate to bytes, average with the
; corresponding src8 rows and write out via op_%1h.  Exactly 4 rows are
; processed (fully unrolled); the h argument (r5) is never read.
; src16 rows are 24 bytes apart.
;-----------------------------------------------------------------------------
%macro PIXELS4_L2_SHIFT5 1
cglobal %1_pixels4_l2_shift5,6,6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    mova          m0, [r1]        ; src16 row 0
    mova          m1, [r1+24]     ; src16 row 1
    psraw         m0, 5
    psraw         m1, 5
    packuswb      m0, m0          ; saturate to bytes
    packuswb      m1, m1
    pavgb         m0, [r2]        ; blend with src8 rows 0/1
    pavgb         m1, [r2+r4]
    op_%1h        m0, [r0], m4
    op_%1h        m1, [r0+r3], m5
    lea           r2, [r2+r4*2]
    lea           r0, [r0+r3*2]
    mova          m0, [r1+48]     ; src16 row 2
    mova          m1, [r1+72]     ; src16 row 3
    psraw         m0, 5
    psraw         m1, 5
    packuswb      m0, m0
    packuswb      m1, m1
    pavgb         m0, [r2]        ; blend with src8 rows 2/3
    pavgb         m1, [r2+r4]
    op_%1h        m0, [r0], m4
    op_%1h        m1, [r0+r3], m5
    RET
%endmacro

INIT_MMX mmxext
PIXELS4_L2_SHIFT5 put
PIXELS4_L2_SHIFT5 avg
761
 
762
 
763
;-----------------------------------------------------------------------------
; void %1_pixels8_l2_shift5(uint8_t *dst, int16_t *src16, uint8_t *src8,
;                           int dstStride, int src8Stride, int h)
; 8-wide variant: downshift the 16-bit src16 rows by 5, saturate to bytes,
; average with src8 and write out.  Processes two rows per iteration
; (r5d = h, decremented by 2); src16 rows are 48 bytes apart, each row read
; as two 4-word halves.
;-----------------------------------------------------------------------------
%macro PIXELS8_L2_SHIFT5 1
cglobal %1_pixels8_l2_shift5, 6, 6 ; dst, src16, src8, dstStride, src8Stride, h
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
.loop:
    mova          m0, [r1]        ; row 0, pixels 0-3
    mova          m1, [r1+8]      ; row 0, pixels 4-7
    mova          m2, [r1+48]     ; row 1, pixels 0-3
    mova          m3, [r1+48+8]   ; row 1, pixels 4-7
    psraw         m0, 5
    psraw         m1, 5
    psraw         m2, 5
    psraw         m3, 5
    packuswb      m0, m1          ; saturate, merge halves
    packuswb      m2, m3
    pavgb         m0, [r2]        ; blend with src8 rows
    pavgb         m2, [r2+r4]
    op_%1         m0, [r0], m4
    op_%1         m2, [r0+r3], m5
    lea           r2, [r2+2*r4]
    add           r1, 48*2        ; advance two src16 rows
    lea           r0, [r0+2*r3]
    sub          r5d, 2
    jne        .loop
    REP_RET
%endmacro

INIT_MMX mmxext
PIXELS8_L2_SHIFT5 put
PIXELS8_L2_SHIFT5 avg
793
 
794
 
795
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void %1_h264_qpel16_h_lowpass_l2(uint8_t *dst, uint8_t *src, uint8_t *src2,
;                                  int dstStride, int src2Stride)
; 16-wide horizontal 6-tap lowpass blended with src2, 16 rows.  x86-64 only:
; uses xmm8-xmm15 so the tap vectors for both 8-pixel halves stay in
; registers.  Taps are built with palignr as in the 8-wide ssse3 version:
; the low half (pixels 0-7) from m7/m0 (words src[-2..5] / src[6..13]),
; the high half (pixels 8-15) from m0/m1 (words src[6..13] / src[14..21]).
; Note: src and dst both advance by r3; src2 advances by r4.
;-----------------------------------------------------------------------------
%macro QPEL16_H_LOWPASS_L2_OP 1
cglobal %1_h264_qpel16_h_lowpass_l2, 5, 6, 16 ; dst, src, src2, dstStride, src2Stride
    movsxdifnidn  r3, r3d
    movsxdifnidn  r4, r4d
    mov          r5d, 16          ; row counter
    pxor         m15, m15         ; zero, for unpacking
    mova         m14, [pw_5]
    mova         m13, [pw_16]
.loop:
    lddqu         m1, [r1+6]      ; bytes src[6..21]
    lddqu         m7, [r1-2]      ; bytes src[-2..13]
    mova          m0, m1
    punpckhbw     m1, m15         ; words src[14..21]
    punpcklbw     m0, m15         ; words src[6..13]
    punpcklbw     m7, m15         ; words src[-2..5]
    mova          m2, m1
    mova          m6, m0
    mova          m3, m1
    mova          m8, m0
    mova          m4, m1
    mova          m9, m0
    mova         m12, m0
    mova         m11, m1
    palignr      m11, m0, 10      ; src[11..18]
    palignr      m12, m7, 10      ; src[3..10]
    palignr       m4, m0, 2       ; src[7..14]
    palignr       m9, m7, 2       ; src[-1..6]
    palignr       m3, m0, 4       ; src[8..15]
    palignr       m8, m7, 4       ; src[0..7]
    palignr       m2, m0, 6       ; src[9..16]
    palignr       m6, m7, 6       ; src[1..8]
    paddw        m11, m0          ; a+f, pixels 8-15
    palignr       m1, m0, 8       ; src[10..17]
    palignr       m0, m7, 8       ; src[2..9]
    paddw         m7, m12         ; a+f, pixels 0-7
    paddw         m2, m3          ; c+d, pixels 8-15
    paddw         m6, m8          ; c+d, pixels 0-7
    paddw         m1, m4          ; b+e, pixels 8-15
    paddw         m0, m9          ; b+e, pixels 0-7
    psllw         m2, 2
    psllw         m6, 2
    psubw         m2, m1          ; 4*(c+d) - (b+e)
    psubw         m6, m0
    paddw        m11, m13         ; (a+f) + rounding bias
    paddw         m7, m13
    pmullw        m2, m14         ; 20*(c+d) - 5*(b+e)
    pmullw        m6, m14
    lddqu         m3, [r2]        ; src2 row to blend with
    paddw         m2, m11
    paddw         m6, m7
    psraw         m2, 5
    psraw         m6, 5
    packuswb      m6, m2          ; low half in m6, high half in m2
    pavgb         m6, m3          ; blend filtered row with src2
    op_%1         m6, [r0], m11
    add           r1, r3
    add           r0, r3
    add           r2, r4
    dec          r5d
    jg         .loop
    REP_RET
%endmacro

INIT_XMM ssse3
QPEL16_H_LOWPASS_L2_OP put
QPEL16_H_LOWPASS_L2_OP avg
%endif