;*****************************************************************************
;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Daniel Kang <daniel.d.kang@gmail.com>
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pw_pixel_max: times 8 dw ((1 << 10)-1)
pd_32:        times 4 dd 32

SECTION .text

;-----------------------------------------------------------------------------
; void h264_idct_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro STORE_DIFFx2 6
    psrad       %1, 6
    psrad       %2, 6
    packssdw    %1, %2
    movq        %3, [%5]
    movhps      %3, [%5+%6]
    paddsw      %1, %3
    CLIPW       %1, %4, [pw_pixel_max]
    movq      [%5], %1
    movhps [%5+%6], %1
%endmacro

%macro STORE_DIFF16 5
    psrad       %1, 6
    psrad       %2, 6
    packssdw    %1, %2
    paddsw      %1, [%5]
    CLIPW       %1, %3, %4
    mova      [%5], %1
%endmacro

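; STORE_DIFFx2 finishes two rows of a 4x4 iDCT: the final >>6 rounding shift
; is applied to two registers of 32-bit results, which are packed to signed
; words, added to the 10-bit prediction fetched one quadword (four pixels)
; per row from [%5] and [%5+%6], clipped to [0, pw_pixel_max] and written
; back. STORE_DIFF16 is the aligned variant used by the 8x8 path: one full
; xmm row of eight pixels. Per pixel, both amount to roughly this C sketch
; (av_clip as in FFmpeg; the sketch is not part of the original source):
;
;     dst[x] = av_clip(dst[x] + (block[x] >> 6), 0, (1 << 10) - 1);
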
; %1=dst, %2=in (coefficients), %3=stride
%macro IDCT4_ADD_10 3
    mova  m0, [%2+ 0]
    mova  m1, [%2+16]
    mova  m2, [%2+32]
    mova  m3, [%2+48]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor  m5, m5
    mova [%2+ 0], m5
    mova [%2+16], m5
    mova [%2+32], m5
    mova [%2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, %1, %3
    lea   %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, %1, %3
%endmacro

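; IDCT4_ADD_10 is the usual row/column decomposition: one IDCT4_1D pass, a
; 4x4 transpose, then a second pass. The [pd_32] bias is added to row 0 in
; between; since row 0 feeds every output of the second pass with weight 1,
; all 16 results come out biased by +32, turning the final >>6 inside
; STORE_DIFFx2 into a rounded shift. The coefficient block is cleared with
; m5 (zero) once consumed, as callers of the *_add functions expect.
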
%macro IDCT_ADD_10 0
cglobal h264_idct_add_10, 3,3
    IDCT4_ADD_10 r0, r1, r2
    RET
%endmacro

INIT_XMM sse2
IDCT_ADD_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD_10
%endif

;-----------------------------------------------------------------------------
; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
%macro ADD4x4IDCT 0
add4x4_idct %+ SUFFIX:
    add   r5, r0
    mova  m0, [r2+ 0]
    mova  m1, [r2+16]
    mova  m2, [r2+32]
    mova  m3, [r2+48]
    IDCT4_1D d,0,1,2,3,4,5
    TRANSPOSE4x4D 0,1,2,3,4
    paddd m0, [pd_32]
    IDCT4_1D d,0,1,2,3,4,5
    pxor  m5, m5
    mova  [r2+ 0], m5
    mova  [r2+16], m5
    mova  [r2+32], m5
    mova  [r2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, r5, r3
    lea   r5, [r5+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, r5, r3
    ret
%endmacro

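; add4x4_idct is a bare local subroutine (ret rather than cglobal),
; instantiated once per SUFFIX below. The add16/add16intra entry points are
; assumed to call it with r5d holding the current block_offset[] entry
; (turned into dst+offset by the add r5, r0 above), r2 pointing at the
; coefficient block and r3 holding the stride.
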
INIT_XMM sse2
ALIGN 16
ADD4x4IDCT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
ALIGN 16
ADD4x4IDCT
%endif

%macro ADD16_OP 2
    cmp          byte [r4+%2], 0
    jz .skipblock%1
    mov         r5d, [r1+%1*4]
    call add4x4_idct %+ SUFFIX
.skipblock%1:
%if %1<15
    add          r2, 64
%endif
%endmacro

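; ADD16_OP consults the non-zero-count cache: the second argument (4+1*8,
; 5+1*8, ...) indexes nnzc[] in FFmpeg's scan8-style layout, so a zero byte
; means the 4x4 block decoded no coefficients and its iDCT can be skipped.
; r2 still advances 64 bytes (16 dctcoefs) per block on both paths.
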
%macro IDCT_ADD16_10 0
cglobal h264_idct_add16_10, 5,6
    ADD16_OP 0, 4+1*8
    ADD16_OP 1, 5+1*8
    ADD16_OP 2, 4+2*8
    ADD16_OP 3, 5+2*8
    ADD16_OP 4, 6+1*8
    ADD16_OP 5, 7+1*8
    ADD16_OP 6, 6+2*8
    ADD16_OP 7, 7+2*8
    ADD16_OP 8, 4+3*8
    ADD16_OP 9, 5+3*8
    ADD16_OP 10, 4+4*8
    ADD16_OP 11, 5+4*8
    ADD16_OP 12, 6+3*8
    ADD16_OP 13, 7+3*8
    ADD16_OP 14, 6+4*8
    ADD16_OP 15, 7+4*8
    REP_RET
%endmacro

INIT_XMM sse2
IDCT_ADD16_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16_10
%endif

;-----------------------------------------------------------------------------
; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT_DC_ADD_OP_10 3
    pxor      m5, m5
%if avx_enabled
    paddw     m1, m0, [%1+0   ]
    paddw     m2, m0, [%1+%2  ]
    paddw     m3, m0, [%1+%2*2]
    paddw     m4, m0, [%1+%3  ]
%else
    mova      m1, [%1+0   ]
    mova      m2, [%1+%2  ]
    mova      m3, [%1+%2*2]
    mova      m4, [%1+%3  ]
    paddw     m1, m0
    paddw     m2, m0
    paddw     m3, m0
    paddw     m4, m0
%endif
    CLIPW     m1, m5, m6
    CLIPW     m2, m5, m6
    CLIPW     m3, m5, m6
    CLIPW     m4, m5, m6
    mova [%1+0   ], m1
    mova [%1+%2  ], m2
    mova [%1+%2*2], m3
    mova [%1+%3  ], m4
%endmacro

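; IDCT_DC_ADD_OP_10 adds the DC value broadcast across m0 to four rows of
; pixels at %1, %1+%2, %1+2*%2 and %1+%3 (%3 preloaded with 3*stride),
; clipping each row to [m5, m6] = [0, pw_pixel_max]; the caller must load
; m6. The avx_enabled branch only uses the three-operand form of paddw to
; save the four movas.
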
INIT_MMX mmxext
cglobal h264_idct_dc_add_10,3,3
    movd      m0, [r1]
    mov dword [r1], 0
    paddd     m0, [pd_32]
    psrad     m0, 6
    lea       r1, [r2*3]
    pshufw    m0, m0, 0
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET

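; For reference, the DC-only path boils down to the following per-block
; logic, shown as a hedged C sketch (not part of the original file); with
; MMX registers, each paddw handles one row of four 10-bit pixels:
;
;     int dc = (block[0] + 32) >> 6;
;     block[0] = 0;
;     for (int y = 0; y < 4; y++)
;         for (int x = 0; x < 4; x++)
;             dst[y*stride + x] = av_clip(dst[y*stride + x] + dc, 0, 1023);
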
;-----------------------------------------------------------------------------
; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_DC_ADD 0
cglobal h264_idct8_dc_add_10,3,4,7
    movd      m0, [r1]
    mov dword [r1], 0
    paddd     m0, [pd_32]
    psrad     m0, 6
    lea       r1, [r2*3]
    SPLATW    m0, m0, 0
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    lea       r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET
%endmacro

INIT_XMM sse2
IDCT8_DC_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_DC_ADD
%endif

;-----------------------------------------------------------------------------
; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
%macro AC 1
.ac%1:
    mov  r5d, [r1+(%1+0)*4]
    call add4x4_idct %+ SUFFIX
    mov  r5d, [r1+(%1+1)*4]
    add  r2, 64
    call add4x4_idct %+ SUFFIX
    add  r2, 64
    jmp .skipadd%1
%endmacro

%assign last_block 16
%macro ADD16_OP_INTRA 2
    cmp      word [r4+%2], 0
    jnz .ac%1
    mov      r5d, [r2+ 0]
    or       r5d, [r2+64]
    jz .skipblock%1
    mov      r5d, [r1+(%1+0)*4]
    call idct_dc_add %+ SUFFIX
.skipblock%1:
%if %1<last_block-2
    add       r2, 128
%endif
.skipadd%1:
%endmacro

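; ADD16_OP_INTRA works on pairs of 4x4 blocks: a non-zero nnzc *word* (two
; adjacent scan8 bytes) routes both blocks through the full iDCT in the
; out-of-line .ac%1 tail; failing that, if either block kept a DC
; coefficient ([r2+0] | [r2+64]), only idct_dc_add runs; otherwise the pair
; is skipped outright. r2 advances 128 bytes per pair except after the last
; one, which is why last_block is reassigned before the add8 version below
; reuses this macro.
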
%macro IDCT_ADD16INTRA_10 0
idct_dc_add %+ SUFFIX:
    add       r5, r0
    movq      m0, [r2+ 0]
    movhps    m0, [r2+64]
    mov dword [r2+ 0], 0
    mov dword [r2+64], 0
    paddd     m0, [pd_32]
    psrad     m0, 6
    pshufhw   m0, m0, 0
    pshuflw   m0, m0, 0
    lea       r6, [r3*3]
    mova      m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r5, r3, r6
    ret

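; idct_dc_add processes the DC terms of two horizontally adjacent 4x4
; blocks at once: movq/movhps gather the two DCs into one xmm, both are
; rounded ((dc+32)>>6), and pshuflw/pshufhw broadcast dc0 across the low
; four words and dc1 across the high four, so IDCT_DC_ADD_OP_10 adds each
; DC to its own half of the eight-pixel rows.
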
cglobal h264_idct_add16intra_10,5,7,8
    ADD16_OP_INTRA 0, 4+1*8
    ADD16_OP_INTRA 2, 4+2*8
    ADD16_OP_INTRA 4, 6+1*8
    ADD16_OP_INTRA 6, 6+2*8
    ADD16_OP_INTRA 8, 4+3*8
    ADD16_OP_INTRA 10, 4+4*8
    ADD16_OP_INTRA 12, 6+3*8
    ADD16_OP_INTRA 14, 6+4*8
    REP_RET
    AC 8
    AC 10
    AC 12
    AC 14
    AC 0
    AC 2
    AC 4
    AC 6
%endmacro

INIT_XMM sse2
IDCT_ADD16INTRA_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16INTRA_10
%endif

%assign last_block 36
;-----------------------------------------------------------------------------
; h264_idct_add8(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
%macro IDCT_ADD8 0
cglobal h264_idct_add8_10,5,8,7
%if ARCH_X86_64
    mov      r7, r0
%endif
    add      r2, 1024
    mov      r0, [r0]
    ADD16_OP_INTRA 16, 4+ 6*8
    ADD16_OP_INTRA 18, 4+ 7*8
    add      r2, 1024-128*2
%if ARCH_X86_64
    mov      r0, [r7+gprsize]
%else
    mov      r0, r0m
    mov      r0, [r0+gprsize]
%endif
    ADD16_OP_INTRA 32, 4+11*8
    ADD16_OP_INTRA 34, 4+12*8
    REP_RET
    AC 16
    AC 18
    AC 32
    AC 34

%endmacro ; IDCT_ADD8

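; h264_idct_add8 receives pixel **dst with one pointer per chroma plane:
; the first plane is loaded from [r0], the second re-fetched at
; [r0+gprsize] (via the stack argument r0m on x86-32, where r0 has been
; clobbered). Chroma coefficients start at block 16, hence the initial
; add r2, 1024 (16 blocks * 64 bytes); the 1024-128*2 adjustment then skips
; to block 32, accounting for the 2*128 bytes already consumed.
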
INIT_XMM sse2
IDCT_ADD8
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8
%endif

;-----------------------------------------------------------------------------
; void h264_idct8_add(pixel *dst, dctcoef *block, int stride)
;-----------------------------------------------------------------------------
%macro IDCT8_1D 2
    SWAP      0, 1
    psrad     m4, m5, 1
    psrad     m1, m0, 1
    paddd     m4, m5
    paddd     m1, m0
    paddd     m4, m7
    paddd     m1, m5
    psubd     m4, m0
    paddd     m1, m3

    psubd     m0, m3
    psubd     m5, m3
    paddd     m0, m7
    psubd     m5, m7
    psrad     m3, 1
    psrad     m7, 1
    psubd     m0, m3
    psubd     m5, m7

    SWAP      1, 7
    psrad     m1, m7, 2
    psrad     m3, m4, 2
    paddd     m3, m0
    psrad     m0, 2
    paddd     m1, m5
    psrad     m5, 2
    psubd     m0, m4
    psubd     m7, m5

    SWAP      5, 6
    psrad     m4, m2, 1
    psrad     m6, m5, 1
    psubd     m4, m5
    paddd     m6, m2

    mova      m2, %1
    mova      m5, %2
    SUMSUB_BA d, 5, 2
    SUMSUB_BA d, 6, 5
    SUMSUB_BA d, 4, 2
    SUMSUB_BA d, 7, 6
    SUMSUB_BA d, 0, 4
    SUMSUB_BA d, 3, 2
    SUMSUB_BA d, 1, 5
    SWAP      7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

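; IDCT8_1D performs one 1-D pass of the 8-point H.264 transform using the
; spec's >>1/>>2 shifted-add factorization. On entry m1-m3 and m5-m7 hold
; input rows 1,2,3 and 5,6,7; rows 0 and 4 come from the memory operands %1
; and %2, since only eight xmm registers are guaranteed. The closing
; SUMSUB_BA/SWAP chain leaves the eight outputs in m0-m7 in natural order.
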
%macro IDCT8_1D_FULL 1
    mova         m7, [%1+112*2]
    mova         m6, [%1+ 96*2]
    mova         m5, [%1+ 80*2]
    mova         m3, [%1+ 48*2]
    mova         m2, [%1+ 32*2]
    mova         m1, [%1+ 16*2]
    IDCT8_1D   [%1], [%1+ 64*2]
%endmacro

; %1=int32_t *block, %2=int32_t *dstblock (coefficients are 32-bit at 10-bit depth)
%macro IDCT8_ADD_SSE_START 2
    IDCT8_1D_FULL %1
%if ARCH_X86_64
    TRANSPOSE4x4D  0,1,2,3,8
    mova    [%2    ], m0
    TRANSPOSE4x4D  4,5,6,7,8
    mova    [%2+8*2], m4
%else
    mova         [%1], m7
    TRANSPOSE4x4D   0,1,2,3,7
    mova           m7, [%1]
    mova    [%2     ], m0
    mova    [%2+16*2], m1
    mova    [%2+32*2], m2
    mova    [%2+48*2], m3
    TRANSPOSE4x4D   4,5,6,7,3
    mova    [%2+ 8*2], m4
    mova    [%2+24*2], m5
    mova    [%2+40*2], m6
    mova    [%2+56*2], m7
%endif
%endmacro

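; IDCT8_ADD_SSE_START runs the row pass and transposes the result as two
; 4x4 quadrants. On x86-64 (16 xmm registers, m8 as transpose scratch) only
; the two transposed rows the column pass reads as memory operands are
; parked at [%2] and [%2+8*2]; on x86-32, m7 is spilled into the
; coefficient buffer around the first transpose and all eight transposed
; rows go to the stack block at %2.
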
; %1=uint16_t *dst (10-bit pixels), %2=int32_t *block, %3=int stride
%macro IDCT8_ADD_SSE_END 3
    IDCT8_1D_FULL %2
    mova  [%2     ], m6
    mova  [%2+16*2], m7

    pxor         m7, m7
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m6, m7, %1, %3
    mova         m0, [%2     ]
    mova         m1, [%2+16*2]
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m4, m5, m6, m7, %1, %3
    lea          %1, [%1+%3*2]
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
%endmacro

%macro IDCT8_ADD 0
cglobal h264_idct8_add_10, 3,4,16
%if UNIX64 == 0
    %assign pad 16-gprsize-(stack_offset&15)
    sub  rsp, pad
    call h264_idct8_add1_10 %+ SUFFIX
    add  rsp, pad
    RET
%endif

ALIGN 16
; TODO: does not need to use stack
h264_idct8_add1_10 %+ SUFFIX:
%assign pad 256+16-gprsize
    sub          rsp, pad
    add   dword [r1], 32

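; The +32 added to block[0] above biases all 64 outputs of the two
; transform passes by 32, so every >>6 done while storing becomes a rounded
; shift; the 256-byte stack buffer holds the transposed 8x8 of 32-bit
; intermediates between the passes.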
%if ARCH_X86_64
    IDCT8_ADD_SSE_START r1, rsp
    SWAP 1,  9
    SWAP 2, 10
    SWAP 3, 11
    SWAP 5, 13
    SWAP 6, 14
    SWAP 7, 15
    IDCT8_ADD_SSE_START r1+16, rsp+128
    PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
    IDCT8_1D [rsp], [rsp+128]
    SWAP 0,  8
    SWAP 1,  9
    SWAP 2, 10
    SWAP 3, 11
    SWAP 4, 12
    SWAP 5, 13
    SWAP 6, 14
    SWAP 7, 15
    IDCT8_1D [rsp+16], [rsp+144]
    psrad         m8, 6
    psrad         m0, 6
    packssdw      m8, m0
    paddsw        m8, [r0]
    pxor          m0, m0
    mova    [r1+  0], m0
    mova    [r1+ 16], m0
    mova    [r1+ 32], m0
    mova    [r1+ 48], m0
    mova    [r1+ 64], m0
    mova    [r1+ 80], m0
    mova    [r1+ 96], m0
    mova    [r1+112], m0
    mova    [r1+128], m0
    mova    [r1+144], m0
    mova    [r1+160], m0
    mova    [r1+176], m0
    mova    [r1+192], m0
    mova    [r1+208], m0
    mova    [r1+224], m0
    mova    [r1+240], m0
    CLIPW         m8, m0, [pw_pixel_max]
    mova        [r0], m8
    mova          m8, [pw_pixel_max]
    STORE_DIFF16  m9, m1, m0, m8, r0+r2
    lea           r0, [r0+r2*2]
    STORE_DIFF16 m10, m2, m0, m8, r0
    STORE_DIFF16 m11, m3, m0, m8, r0+r2
    lea           r0, [r0+r2*2]
    STORE_DIFF16 m12, m4, m0, m8, r0
    STORE_DIFF16 m13, m5, m0, m8, r0+r2
    lea           r0, [r0+r2*2]
    STORE_DIFF16 m14, m6, m0, m8, r0
    STORE_DIFF16 m15, m7, m0, m8, r0+r2
%else
    IDCT8_ADD_SSE_START r1,    rsp
    IDCT8_ADD_SSE_START r1+16, rsp+128
    lea           r3, [r0+8]
    IDCT8_ADD_SSE_END r0, rsp,    r2
    IDCT8_ADD_SSE_END r3, rsp+16, r2
    mova    [r1+  0], m7
    mova    [r1+ 16], m7
    mova    [r1+ 32], m7
    mova    [r1+ 48], m7
    mova    [r1+ 64], m7
    mova    [r1+ 80], m7
    mova    [r1+ 96], m7
    mova    [r1+112], m7
    mova    [r1+128], m7
    mova    [r1+144], m7
    mova    [r1+160], m7
    mova    [r1+176], m7
    mova    [r1+192], m7
    mova    [r1+208], m7
    mova    [r1+224], m7
    mova    [r1+240], m7
%endif ; ARCH_X86_64

    add          rsp, pad
    ret
%endmacro

INIT_XMM sse2
IDCT8_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD
%endif

;-----------------------------------------------------------------------------
; h264_idct8_add4(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8])
;-----------------------------------------------------------------------------
;;;;;;; NO FATE SAMPLES TRIGGER THIS
%macro IDCT8_ADD4_OP 2
    cmp       byte [r4+%2], 0
    jz .skipblock%1
    mov      r0d, [r6+%1*4]
    add       r0, r5
    call h264_idct8_add1_10 %+ SUFFIX
.skipblock%1:
%if %1<12
    add       r1, 256
%endif
%endmacro

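; IDCT8_ADD4_OP mirrors ADD16_OP at 8x8 granularity: a zero nnzc byte skips
; the block, r1 advances 256 bytes (64 dctcoefs) per block, and the shared
; h264_idct8_add1_10 body is called with r0 = dst base (r5) + the
; block_offset entry.
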
%macro IDCT8_ADD4 0
cglobal h264_idct8_add4_10, 0,7,16
    %assign pad 16-gprsize-(stack_offset&15)
    SUB      rsp, pad
    mov       r5, r0mp
    mov       r6, r1mp
    mov       r1, r2mp
    mov      r2d, r3m
    movifnidn r4, r4mp
    IDCT8_ADD4_OP  0, 4+1*8
    IDCT8_ADD4_OP  4, 6+1*8
    IDCT8_ADD4_OP  8, 4+3*8
    IDCT8_ADD4_OP 12, 6+3*8
    ADD       rsp, pad
    RET
%endmacro ; IDCT8_ADD4

INIT_XMM sse2
IDCT8_ADD4
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD4
%endif