;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar
;*          Loren Merritt
;*          Holger Lubitz
;*          Min Chen
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
%ifdef PIC
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif
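; The scan8_mem table above maps a 4x4-block index (the 16 luma blocks
; followed by the chroma blocks) to its slot in the 8-wide non_zero_count
; cache, so the loops below can fetch nnzc[scan8[i]] with two byte loads.
; In PIC builds a spare GPR (picregq) holds the table address; otherwise the
; symbol is referenced directly and no extra register is reserved.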

cextern pw_32
cextern pw_1

SECTION .text

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq         m0, [%2]
    movq         m1, [%2+8]
    movq         m2, [%2+16]
    movq         m3, [%2+24]

    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    mova         m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw        m0, m6
    IDCT4_1D      w, 0, 1, 2, 3, 4, 5
    pxor         m7, m7
    movq    [%2+ 0], m7
    movq    [%2+ 8], m7
    movq    [%2+16], m7
    movq    [%2+24], m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro
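; IDCT4_ADD is the usual row/column split of the H.264 4x4 inverse transform:
; one 1-D pass (IDCT4_1D from x86util.asm), a transpose, a second 1-D pass,
; then STORE_DIFFx2 shifts right by 6 and adds the result to the destination
; pixels with saturation. Adding pw_32 to row 0 between the passes folds the
; +32 rounding term into every output of the second pass, and the coefficient
; block is cleared afterwards as the decoder expects. As a rough sketch, each
; 1-D stage computes
;     e0 = z0 + z2              e1 = z0 - z2
;     e2 = (z1 >> 1) - z3       e3 = z1 + (z3 >> 1)
;     out = { e0+e3, e1+e2, e1-e2, e0-e3 }
; with the exact register allocation living in IDCT4_1D.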

INIT_MMX mmx
; ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8, 3, 3, 0
    IDCT4_ADD    r0, r1, r2
    RET

%macro IDCT8_1D 2
    mova         m0, m1
    psraw        m1, 1
    mova         m4, m5
    psraw        m4, 1
    paddw        m4, m5
    paddw        m1, m0
    paddw        m4, m7
    paddw        m1, m5
    psubw        m4, m0
    paddw        m1, m3

    psubw        m0, m3
    psubw        m5, m3
    psraw        m3, 1
    paddw        m0, m7
    psubw        m5, m7
    psraw        m7, 1
    psubw        m0, m3
    psubw        m5, m7

    mova         m7, m1
    psraw        m1, 2
    mova         m3, m4
    psraw        m3, 2
    paddw        m3, m0
    psraw        m0, 2
    paddw        m1, m5
    psraw        m5, 2
    psubw        m0, m4
    psubw        m7, m5

    mova         m5, m6
    psraw        m6, 1
    mova         m4, m2
    psraw        m4, 1
    paddw        m6, m2
    psubw        m4, m5

    mova         m2, %1
    mova         m5, %2
    SUMSUB_BA    w, 5, 2
    SUMSUB_BA    w, 6, 5
    SUMSUB_BA    w, 4, 2
    SUMSUB_BA    w, 7, 6
    SUMSUB_BA    w, 0, 4
    SUMSUB_BA    w, 3, 2
    SUMSUB_BA    w, 1, 5
    SWAP         7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro
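; IDCT8_1D is one 1-D pass of the 8x8 inverse transform. On entry the odd
; input rows (1,3,5,7) are expected in m1/m3/m5/m7 and rows 2 and 6 in m2/m6;
; rows 0 and 4 are passed as the macro's two operands and read last, which is
; what lets the pass fit into eight registers. The first half builds the odd
; (high-frequency) intermediates, the second half the even part, and the
; closing SUMSUB_BA/SWAP sequence merges them into m0..m7 in natural order.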

%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112]
    mova         m6, [%1+ 96]
    mova         m5, [%1+ 80]
    mova         m3, [%1+ 48]
    mova         m2, [%1+ 32]
    mova         m1, [%1+ 16]
    IDCT8_1D   [%1], [%1+ 64]
%endmacro

; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova       [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova         m7, [%1]
    mova    [%2   ], m0
    mova    [%2+16], m1
    mova    [%2+32], m2
    mova    [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova    [%2+ 8], m4
    mova    [%2+24], m5
    mova    [%2+40], m6
    mova    [%2+56], m7
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3-4
    IDCT8_1D_FULL %2
    mova    [%2   ], m5
    mova    [%2+16], m6
    mova    [%2+32], m7

    pxor         m7, m7
%if %0 == 4
    movq   [%4+  0], m7
    movq   [%4+  8], m7
    movq   [%4+ 16], m7
    movq   [%4+ 24], m7
    movq   [%4+ 32], m7
    movq   [%4+ 40], m7
    movq   [%4+ 48], m7
    movq   [%4+ 56], m7
    movq   [%4+ 64], m7
    movq   [%4+ 72], m7
    movq   [%4+ 80], m7
    movq   [%4+ 88], m7
    movq   [%4+ 96], m7
    movq   [%4+104], m7
    movq   [%4+112], m7
    movq   [%4+120], m7
%endif
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova         m0, [%2   ]
    mova         m1, [%2+16]
    mova         m2, [%2+32]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 0
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    add   word [r1], 32
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea          r3, [r0+4]
    IDCT8_ADD_MMX_END   r0  , rsp,   r2, r1
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2

    ADD         rsp, pad
    RET
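; With only eight 64-bit MMX registers, the 8x8 add is done in two 8x4
; halves: the two _START invocations run the first 1-D pass on the left and
; right four columns and transpose the results into a 128-byte scratch buffer
; on the stack, and the two _END invocations run the second pass from that
; buffer and add the output to dst and dst+4. 'add word [r1], 32' pre-biases
; the DC coefficient so the final >>6 in STORE_DIFFx2 rounds correctly, and
; the optional fourth _END argument names the coefficient block to clear.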

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw        m0, [pw_32]

%if ARCH_X86_64 == 0
    mova    [%2   ], m0
    mova    [%2+16], m4
    IDCT8_1D   [%2], [%2+ 16]
    mova    [%2   ], m6
    mova    [%2+16], m7
%else
    SWAP          0, 8
    SWAP          4, 9
    IDCT8_1D     m8, m9
    SWAP          6, 8
    SWAP          7, 9
%endif

    pxor         m7, m7
    lea          %4, [%3*3]
    STORE_DIFF   m0, m6, m7, [%1     ]
    STORE_DIFF   m1, m6, m7, [%1+%3  ]
    STORE_DIFF   m2, m6, m7, [%1+%3*2]
    STORE_DIFF   m3, m6, m7, [%1+%4  ]
%if ARCH_X86_64 == 0
    mova         m0, [%2   ]
    mova         m1, [%2+16]
%else
    SWAP          0, 8
    SWAP          1, 9
%endif
    mova   [%2+  0], m7
    mova   [%2+ 16], m7
    mova   [%2+ 32], m7
    mova   [%2+ 48], m7
    mova   [%2+ 64], m7
    mova   [%2+ 80], m7
    mova   [%2+ 96], m7
    mova   [%2+112], m7
    lea          %1, [%1+%3*4]
    STORE_DIFF   m4, m6, m7, [%1     ]
    STORE_DIFF   m5, m6, m7, [%1+%3  ]
    STORE_DIFF   m0, m6, m7, [%1+%3*2]
    STORE_DIFF   m1, m6, m7, [%1+%4  ]
%endmacro

INIT_XMM sse2
; ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET

%macro DC_ADD_MMXEXT_INIT 2
    add          %1, 32
    sar          %1, 6
    movd         m0, %1d
    lea          %1, [%2*3]
    pshufw       m0, m0, 0
    pxor         m1, m1
    psubw        m1, m0
    packuswb     m0, m0
    packuswb     m1, m1
%endmacro

%macro DC_ADD_MMXEXT_OP 4
    %1           m2, [%2     ]
    %1           m3, [%2+%3  ]
    %1           m4, [%2+%3*2]
    %1           m5, [%2+%4  ]
    paddusb      m2, m0
    paddusb      m3, m0
    paddusb      m4, m0
    paddusb      m5, m0
    psubusb      m2, m1
    psubusb      m3, m1
    psubusb      m4, m1
    psubusb      m5, m1
    %1    [%2     ], m2
    %1    [%2+%3  ], m3
    %1    [%2+%3*2], m4
    %1    [%2+%4  ], m5
%endmacro
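; Blocks whose only non-zero coefficient is the DC get a much cheaper path:
; _INIT computes dc = (block[0] + 32) >> 6 and prepares a broadcast copy of
; the value (m0) and of its negation (m1) as packed bytes; _OP then handles
; four rows per invocation, adding with paddusb and subtracting with psubusb
; so the result stays clamped to [0,255] without widening to words. The first
; _OP argument is the load/store instruction, so the same code serves
; 4-pixel-wide (movh) and 8-pixel-wide (mova/movq) rows.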

INIT_MMX mmxext
; ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
%if ARCH_X86_64
cglobal h264_idct_dc_add_8, 3, 4, 0
    movsx        r3, word [r1]
    mov  dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP movh, r0, r2, r3
    RET

; ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
    movsx        r3, word [r1]
    mov  dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    lea          r0, [r0+r2*4]
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    RET
%else
; ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP movh, r0, r1, r2
    RET

; ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
    movsx        r2, word [r1]
    mov  dword [r1], 0
    mov          r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    lea          r0, [r0+r1*4]
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    RET
%endif

INIT_MMX mmx
; ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
;                          int16_t *block, int stride,
;                          const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
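; The add16 entry points walk the 16 luma 4x4 blocks: for block i they check
; nnzc[scan8[i]], skip all-zero blocks, and otherwise add block_offset[i] to
; dst and run the IDCT on the i-th 32-byte coefficient block. The mmxext and
; sse2 variants further down keep this loop structure but add a DC-only fast
; path and/or transform two 4x4 blocks per call.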

; ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
;                          int16_t *block, int stride,
;                          const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    ADD         rsp, pad
    RET

INIT_MMX mmxext
; ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset,
;                             int16_t *block, int stride,
;                             const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
; ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    IDCT4_ADD    r6, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmxext
; ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6]
    IDCT4_ADD    r6, r2, r3
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    inc          r5
    add          r2, 32
    cmp          r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset,
;                             int16_t *block, int stride,
;                             const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    %assign pad 128+4-(stack_offset&7)
    SUB         rsp, pad

    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET
.no_dc:
    mov         r6d, dword [r1+r5*4]
    add          r6, r0
    add   word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3, r2
    mov         r6d, dword [r1+r5*4]
    lea          r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock

    ADD         rsp, pad
    RET

INIT_XMM sse2
; ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
;                           int16_t *block, int stride,
;                           const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor          r5, r5
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .skipblock
    cmp          r6, 1
    jnz .no_dc
    movsx        r6, word [r2]
    test         r6, r6
    jz .no_dc
INIT_MMX cpuname
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET
.no_dc:
INIT_XMM cpuname
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
    mov          r1, r1m
%endif
.skipblock:
    add          r5, 4
    add          r2, 128
    cmp          r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
h264_idct_add8_mmx_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    or          r6w, word [r2]
    test         r6, r6
    jz .skipblock
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

; ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
;                         int16_t *block, int stride, const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov          r5, 16
    add          r2, 512
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif
    call         h264_idct_add8_mmx_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call         h264_idct_add8_mmx_plane
    RET

h264_idct_add8_mmxext_plane:
.nextblock:
    movzx        r6, byte [scan8+r5]
    movzx        r6, byte [r4+r6]
    test         r6, r6
    jz .try_dc
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    IDCT4_ADD    r0, r2, r3
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret
.try_dc:
    movsx        r6, word [r2]
    test         r6, r6
    jz .skipblock
    mov   word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64
    mov         r0d, dword [r1+r5*4]
    add          r0, [dst2q]
%else
    mov          r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov          r0, [r0]
    add          r0, dword [r1+r5*4]
%endif
    DC_ADD_MMXEXT_OP movh, r0, r3, r6
.skipblock:
    inc          r5
    add          r2, 32
    test         r5, 3
    jnz .nextblock
    rep ret

INIT_MMX mmxext
; ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset,
;                            int16_t *block, int stride,
;                            const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov          r5, 16
    add          r2, 512
%if ARCH_X86_64
    mov       dst2q, r0
%endif
%ifdef PIC
    lea     picregq, [scan8_mem]
%endif
    call h264_idct_add8_mmxext_plane
    mov          r5, 32
    add          r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add        r0mp, gprsize
%endif
    call h264_idct_add8_mmxext_plane
    RET

; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmxext:
    movd         m0, [r2   ]          ;  0 0 X D
    mov word [r2+ 0], 0
    punpcklwd    m0, [r2+32]          ;  x X d D
    mov word [r2+32], 0
    paddsw       m0, [pw_32]
    psraw        m0, 6
    punpcklwd    m0, m0               ;  d d D D
    pxor         m1, m1               ;  0 0 0 0
    psubw        m1, m0               ; -d-d-D-D
    packuswb     m0, m1               ; -d-d-D-D d d D D
    pshufw       m1, m0, 0xFA         ; -d-d-d-d-D-D-D-D
    punpcklwd    m0, m0               ;  d d d d D D D D
    lea          r6, [r3*3]
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
    ret
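; This helper handles the DC-only case for two horizontally adjacent 4x4
; blocks at once: it picks up the two DC values (block[0] and block[32]),
; rounds them with (+32) >> 6, expands them into replicated byte patterns
; (one DC for the left four pixels, the other for the right four) plus their
; negations, and lets DC_ADD_MMXEXT_OP update an 8x4 pixel area in one go.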

ALIGN 16
INIT_XMM sse2
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
h264_add8x4_idct_sse2:
    movq   m0, [r2+ 0]
    movq   m1, [r2+ 8]
    movq   m2, [r2+16]
    movq   m3, [r2+24]
    movhps m0, [r2+32]
    movhps m1, [r2+40]
    movhps m2, [r2+48]
    movhps m3, [r2+56]
    IDCT4_1D w,0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw m0, [pw_32]
    IDCT4_1D w,0,1,2,3,4,5
    pxor  m7, m7
    mova [r2+ 0], m7
    mova [r2+16], m7
    mova [r2+32], m7
    mova [r2+48], m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea   r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret
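; h264_add8x4_idct_sse2 transforms two neighbouring 4x4 blocks per call: the
; first block sits in the low halves of xmm0-3 and the second in the high
; halves (movq + movhps), so one pair of IDCT4_1D passes plus the paired
; transpose covers an 8x4 pixel region. The SSE2 add16/add16intra/add8 loops
; below are built on this helper to halve the call count.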

%macro add16_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .cycle%1end
    mov        r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add         r0, r5
%else
    add         r0, r0m
%endif
    call        h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
    add         r2, 64
%endif
%endmacro

; ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
;                           int16_t *block, int stride,
;                           const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov         r5, r0
%endif
    ; unrolling of the loop leads to an average performance gain of
    ; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET

%macro add16intra_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .try%1dc
    mov        r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add         r0, r7
%else
    add         r0, r0m
%endif
    call        h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx       r0, word [r2   ]
    or         r0w, word [r2+32]
    jz .cycle%1end
    mov        r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add         r0, r7
%else
    add         r0, r0m
%endif
    call        h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
    add         r2, 64
%endif
%endmacro

; ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov         r7, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET

%macro add8_sse2_cycle 2
    movzx       r0, word [r4+%2]
    test        r0, r0
    jz .try%1dc
%if ARCH_X86_64
    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add         r0, [r7]
%else
    mov         r0, r0m
    mov         r0, [r0]
    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call        h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx       r0, word [r2   ]
    or         r0w, word [r2+32]
    jz .cycle%1end
%if ARCH_X86_64
    mov        r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add         r0, [r7]
%else
    mov         r0, r0m
    mov         r0, [r0]
    add         r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call        h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 == 1
    add         r2, 384+64
%elif %1 < 3
    add         r2, 64
%endif
%endmacro

; ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
;                          int16_t *block, int stride,
;                          const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
    add          r2, 512
%if ARCH_X86_64
    mov          r7, r0
%endif
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
    add          r7, gprsize
%else
    add        r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
    RET

;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)

%macro WALSH4_1D 5
    SUMSUB_BADC w, %4, %3, %2, %1, %5
    SUMSUB_BADC w, %4, %2, %3, %1, %5
    SWAP %1, %4, %3
%endmacro
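; WALSH4_1D is a 4-point Hadamard butterfly (two SUMSUB_BADC stages and a
; register rotation). Applied once per row and once per column in
; IDCT_DC_DEQUANT below, it forms the 4x4 Hadamard transform H.264 uses to
; reconstruct the 16 luma DC coefficients of an Intra_16x16 macroblock.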

%macro DEQUANT_MMX 3
    mova        m7, [pw_1]
    mova        m4, %1
    punpcklwd   %1, m7
    punpckhwd   m4, m7
    mova        m5, %2
    punpcklwd   %2, m7
    punpckhwd   m5, m7
    movd        m7, t3d
    punpckldq   m7, m7
    pmaddwd     %1, m7
    pmaddwd     %2, m7
    pmaddwd     m4, m7
    pmaddwd     m5, m7
    psrad       %1, %3
    psrad       %2, %3
    psrad       m4, %3
    psrad       m5, %3
    packssdw    %1, m4
    packssdw    %2, m5
%endmacro

%macro STORE_WORDS 5-9
%if cpuflag(sse)
    movd  t0d, %1
    psrldq  %1, 4
    movd  t1d, %1
    psrldq  %1, 4
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
    movd  t0d, %1
    psrldq  %1, 4
    movd  t1d, %1
    mov [t2+%6*32], t0w
    mov [t2+%8*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%7*32], t0w
    mov [t2+%9*32], t1w
%else
    movd  t0d, %1
    psrlq  %1, 32
    movd  t1d, %1
    mov [t2+%2*32], t0w
    mov [t2+%4*32], t1w
    shr   t0d, 16
    shr   t1d, 16
    mov [t2+%3*32], t0w
    mov [t2+%5*32], t1w
%endif
%endmacro

%macro DEQUANT_STORE 1
%if cpuflag(sse2)
    movd      xmm4, t3d
    movq      xmm5, [pw_1]
    pshufd    xmm4, xmm4, 0
    movq2dq   xmm0, m0
    movq2dq   xmm1, m1
    movq2dq   xmm2, m2
    movq2dq   xmm3, m3
    punpcklwd xmm0, xmm5
    punpcklwd xmm1, xmm5
    punpcklwd xmm2, xmm5
    punpcklwd xmm3, xmm5
    pmaddwd   xmm0, xmm4
    pmaddwd   xmm1, xmm4
    pmaddwd   xmm2, xmm4
    pmaddwd   xmm3, xmm4
    psrad     xmm0, %1
    psrad     xmm1, %1
    psrad     xmm2, %1
    psrad     xmm3, %1
    packssdw  xmm0, xmm1
    packssdw  xmm2, xmm3
    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%else
    DEQUANT_MMX m0, m1, %1
    STORE_WORDS m0,  0,  1,  4,  5
    STORE_WORDS m1,  2,  3,  6,  7

    DEQUANT_MMX m2, m3, %1
    STORE_WORDS m2,  8,  9, 12, 13
    STORE_WORDS m3, 10, 11, 14, 15
%endif
%endmacro
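; DEQUANT_STORE scales each transformed DC value by qmul and rounds it: the
; coefficients are interleaved with pw_1 so that pmaddwd computes, in effect,
; coef*qmul + 128 in a single step (the caller parks the 128 rounding bias in
; the high 16 bits of t3d, where it becomes the second term of each dot
; product). After the arithmetic right shift, STORE_WORDS scatters the 16
; results into the DC slot of each 4x4 coefficient block, which sit 32 bytes
; apart in the output array.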

%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM %1
    movq        m3, [r1+24]
    movq        m2, [r1+16]
    movq        m1, [r1+ 8]
    movq        m0, [r1+ 0]
    WALSH4_1D    0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D    0,1,2,3,4

; shift, tmp, output, qmul
%if WIN64
    DECLARE_REG_TMP 0,3,1,2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg        r0, t2
%elif ARCH_X86_64
    DECLARE_REG_TMP 3,1,0,2
%else
    DECLARE_REG_TMP 1,3,0,2
%endif

    cmp        t3d, 32767
    jg .big_qmul
    add        t3d, 128 << 16
    DEQUANT_STORE 8
    RET
.big_qmul:
    bsr        t0d, t3d
    add        t3d, 128 << 16
    mov        t1d, 7
    cmp        t0d, t1d
    cmovg      t0d, t1d
    inc        t1d
    shr        t3d, t0b
    sub        t1d, t0d
%if cpuflag(sse2)
    movd      xmm6, t1d
    DEQUANT_STORE xmm6
%else
    movd        m6, t1d
    DEQUANT_STORE m6
%endif
    RET
%endmacro
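; In the common case qmul fits a signed 16-bit pmaddwd operand and the result
; is simply (dc * qmul + 128) >> 8. The .big_qmul branch covers multipliers
; of 32768 and above: it pre-shifts qmul right (by at most 7 bits, capped via
; bsr/cmovg) and reduces the final shift by the same amount, so the multiply
; still fits while the overall scaling is preserved.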

INIT_MMX mmx
IDCT_DC_DEQUANT 0
INIT_MMX sse2
IDCT_DC_DEQUANT 7