;*****************************************************************************
;* MMX/SSE2/AVX-optimized H.264 deblocking code
;*****************************************************************************
;* Copyright (C) 2005-2011 x264 project
;*
;* Authors: Loren Merritt
;*          Fiona Glaser
;*          Oskar Arvidsson
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

pb_A1: times 16 db 0xA1
pb_3_1: times 4 db 3, 1

SECTION .text

cextern pb_0
cextern pb_1
cextern pb_3

; expands to [base],...,[base+7*stride]
%define PASS8ROWS(base, base3, stride, stride3) \
    [base], [base+stride], [base+stride*2], [base3], \
    [base3+stride], [base3+stride*2], [base3+stride3], [base3+stride*4]

%define PASS8ROWS(base, base3, stride, stride3, offset) \
    PASS8ROWS(base+offset, base3+offset, stride, stride3)
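; For example, PASS8ROWS(r6, r5, r7, r8) with r5 = r6+3*r7 and r8 = 3*r7 expands to
; the eight operands [r6], [r6+r7], [r6+2*r7], ..., [r6+7*r7], one per row of the
; 8-row block being transposed.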
 
; in: 8 rows of 4 bytes in %4..%11
; out: 4 rows of 8 bytes in m0..m3
%macro TRANSPOSE4x8_LOAD 11
    movh       m0, %4
    movh       m2, %5
    movh       m1, %6
    movh       m3, %7
    punpckl%1  m0, m2
    punpckl%1  m1, m3
    mova       m2, m0
    punpckl%2  m0, m1
    punpckh%2  m2, m1

    movh       m4, %8
    movh       m6, %9
    movh       m5, %10
    movh       m7, %11
    punpckl%1  m4, m6
    punpckl%1  m5, m7
    mova       m6, m4
    punpckl%2  m4, m5
    punpckh%2  m6, m5

    punpckh%3  m1, m0, m4
    punpckh%3  m3, m2, m6
    punpckl%3  m0, m4
    punpckl%3  m2, m6
%endmacro

; in: 4 rows of 8 bytes in m0..m3
; out: 8 rows of 4 bytes in %1..%8
%macro TRANSPOSE8x4B_STORE 8
    punpckhdq  m4, m0, m0
    punpckhdq  m5, m1, m1
    punpckhdq  m6, m2, m2

    punpcklbw  m0, m1
    punpcklbw  m2, m3
    punpcklwd  m1, m0, m2
    punpckhwd  m0, m2
    movh       %1, m1
    punpckhdq  m1, m1
    movh       %2, m1
    movh       %3, m0
    punpckhdq  m0, m0
    movh       %4, m0

    punpckhdq  m3, m3
    punpcklbw  m4, m5
    punpcklbw  m6, m3
    punpcklwd  m5, m4, m6
    punpckhwd  m4, m6
    movh       %5, m5
    punpckhdq  m5, m5
    movh       %6, m5
    movh       %7, m4
    punpckhdq  m4, m4
    movh       %8, m4
%endmacro

%macro TRANSPOSE4x8B_LOAD 8
    TRANSPOSE4x8_LOAD bw, wd, dq, %1, %2, %3, %4, %5, %6, %7, %8
%endmacro

%macro SBUTTERFLY3 4
    punpckh%1  %4, %2, %3
    punpckl%1  %2, %3
%endmacro

; in: 8 rows of 8 (only the middle 6 pels are used) in %1..%8
; out: 6 rows of 8 in [%9+0*16] .. [%9+5*16]
%macro TRANSPOSE6x8_MEM 9
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    movq  [%9+0x10], m3
    SBUTTERFLY3 bw, m6, %8, m7
    SBUTTERFLY wd, 0, 2, 3
    SBUTTERFLY wd, 4, 6, 3
    punpckhdq m0, m4
    movq  [%9+0x00], m0
    SBUTTERFLY3 wd, m1, [%9+0x10], m3
    SBUTTERFLY wd, 5, 7, 0
    SBUTTERFLY dq, 1, 5, 0
    SBUTTERFLY dq, 2, 6, 0
    punpckldq m3, m7
    movq  [%9+0x10], m2
    movq  [%9+0x20], m6
    movq  [%9+0x30], m1
    movq  [%9+0x40], m5
    movq  [%9+0x50], m3
    RESET_MM_PERMUTATION
%endmacro

; in: 8 rows of 8 in %1..%8
; out: 8 rows of 8 in %9..%16
%macro TRANSPOSE8x8_MEM 16
    RESET_MM_PERMUTATION
    movq  m0, %1
    movq  m1, %2
    movq  m2, %3
    movq  m3, %4
    movq  m4, %5
    movq  m5, %6
    movq  m6, %7
    SBUTTERFLY bw, 0, 1, 7
    SBUTTERFLY bw, 2, 3, 7
    SBUTTERFLY bw, 4, 5, 7
    SBUTTERFLY3 bw, m6, %8, m7
    movq  %9,  m5
    SBUTTERFLY wd, 0, 2, 5
    SBUTTERFLY wd, 4, 6, 5
    SBUTTERFLY wd, 1, 3, 5
    movq  %11, m6
    movq  m6,  %9
    SBUTTERFLY wd, 6, 7, 5
    SBUTTERFLY dq, 0, 4, 5
    SBUTTERFLY dq, 1, 6, 5
    movq  %9,  m0
    movq  %10, m4
    movq  %13, m1
    movq  %14, m6
    SBUTTERFLY3 dq, m2, %11, m0
    SBUTTERFLY dq, 3, 7, 4
    movq  %11, m2
    movq  %12, m0
    movq  %15, m3
    movq  %16, m7
    RESET_MM_PERMUTATION
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT 5
%if avx_enabled == 0
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%else
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%endif
    por     %4, %5
    psubusb %4, %3
%endmacro

; out: %4 = |%1-%2|>%3
; clobbers: %5
%macro DIFF_GT2 5
%if ARCH_X86_64
    psubusb %5, %2, %1
    psubusb %4, %1, %2
%else
    mova    %5, %2
    mova    %4, %1
    psubusb %5, %1
    psubusb %4, %2
%endif
    psubusb %5, %3
    psubusb %4, %3
    pcmpeqb %4, %5
%endmacro
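; Note on polarity: because the two saturated differences are compared with pcmpeqb,
; DIFF_GT2 actually leaves %4 = 0xff where |%1-%2| <= %3, i.e. where the test named
; in the comment fails.  The callers rely on this: the 0xff (-1) bytes double as the
; "filter p1/q1" flag, so subtracting the mask from tc yields tc+1 for those pixels.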
 
; in: m0=p1 m1=p0 m2=q0 m3=q1 %1=alpha-1 %2=beta-1
; out: m5=beta-1, m7=mask, %3=alpha-1
; clobbers: m4,m6
%macro LOAD_MASK 2-3
    movd     m4, %1
    movd     m5, %2
    SPLATW   m4, m4
    SPLATW   m5, m5
    packuswb m4, m4  ; 16x alpha-1
    packuswb m5, m5  ; 16x beta-1
%if %0>2
    mova     %3, m4
%endif
    DIFF_GT  m1, m2, m4, m7, m6 ; |p0-q0| > alpha-1
    DIFF_GT  m0, m1, m5, m4, m6 ; |p1-p0| > beta-1
    por      m7, m4
    DIFF_GT  m3, m2, m5, m4, m6 ; |q1-q0| > beta-1
    por      m7, m4
    pxor     m6, m6
    pcmpeqb  m7, m6
%endmacro
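; In scalar terms m7 ends up as the standard per-pixel filtering condition:
;   m7 = 0xff  where  |p0-q0| <= alpha-1  &&  |p1-p0| <= beta-1  &&  |q1-q0| <= beta-1
; and 0x00 elsewhere, evaluated independently for every pixel along the edge.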
 
; in: m0=p1 m1=p0 m2=q0 m3=q1 m7=(tc&mask)
; out: m1=p0' m2=q0'
; clobbers: m0,3-6
%macro DEBLOCK_P0_Q0 0
    pcmpeqb m4, m4
    pxor    m5, m1, m2   ; p0^q0
    pxor    m3, m4
    pand    m5, [pb_1]   ; (p0^q0)&1
    pavgb   m3, m0       ; (p1 - q1 + 256)>>1
    pxor    m4, m1
    pavgb   m3, [pb_3]   ; (((p1 - q1 + 256)>>1)+4)>>1 = 64+2+(p1-q1)>>2
    pavgb   m4, m2       ; (q0 - p0 + 256)>>1
    pavgb   m3, m5
    mova    m6, [pb_A1]
    paddusb m3, m4       ; d+128+33
    psubusb m6, m3
    psubusb m3, [pb_A1]
    pminub  m6, m7
    pminub  m3, m7
    psubusb m1, m6
    psubusb m2, m3
    paddusb m1, m3
    paddusb m2, m6
%endmacro
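; Scalar reference for the normal-strength update this macro implements (H.264 spec form):
;   delta = clip(-tc, tc, ((q0 - p0)*4 + (p1 - q1) + 4) >> 3)
;   p0'   = clip_uint8(p0 + delta),  q0' = clip_uint8(q0 - delta)
; The pavgb chain above builds the biased value d+128+33; subtracting pb_A1
; (0xA1 = 128+33) in both directions splits it into the positive and negative parts
; of delta, each clamped against tc (m7) before being applied with saturating adds.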
 
; in: m1=p0 m2=q0
;     %1=p1 %2=q2 %3=[q2] %4=[q1] %5=tc0 %6=tmp
; out: [q1] = clip( (q2+((p0+q0+1)>>1))>>1, q1-tc0, q1+tc0 )
; clobbers: q2, tmp, tc0
%macro LUMA_Q1 6
    pavgb   %6, m1, m2
    pavgb   %2, %6       ; avg(p2,avg(p0,q0))
    pxor    %6, %3
    pand    %6, [pb_1]   ; (p2^avg(p0,q0))&1
    psubusb %2, %6       ; (p2+((p0+q0+1)>>1))>>1
    psubusb %6, %1, %5
    paddusb %5, %1
    pmaxub  %2, %6
    pminub  %2, %5
    mova    %4, %2
%endmacro
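; Up to the clamp this matches the spec form of the p1/q1 update,
;   q1' = q1 + clip(-tc0, tc0, (q2 + ((p0+q0+1)>>1) - 2*q1) >> 1)
; because ((q2 + ((p0+q0+1)>>1) - 2*q1) >> 1) + q1 == (q2 + ((p0+q0+1)>>1)) >> 1;
; the pmaxub/pminub pair against q1-tc0 and q1+tc0 performs the clip.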
 
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_v_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
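; pix points at the first row of the q side (q0), so p0..p2 live at pix-1*stride ..
; pix-3*stride.  tc0 holds 4 bytes, one per group of 4 pixels along the edge (each
; is replicated 4x below); a tc0 byte of -1 marks a group that must not be filtered,
; which the pcmpeqb/pandn pair folds into the mask.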
%macro DEBLOCK_LUMA 0
cglobal deblock_v_luma_8, 5,5,10
    movd    m8, [r4] ; tc0
    lea     r4, [r1*3]
    dec     r2d        ; alpha-1
    neg     r4
    dec     r3d        ; beta-1
    add     r4, r0     ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2d, r3d

    punpcklbw m8, m8
    punpcklbw m8, m8 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    pcmpeqb m9, m9
    pcmpeqb m9, m8
    pandn   m9, m7
    pand    m8, m9

    movdqa  m3, [r4] ; p2
    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m9
    psubb   m7, m8, m6
    pand    m6, m8
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    movdqa  m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, m9
    pand    m8, m6
    psubb   m7, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m8, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
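; Horizontal-edge strategy: transpose the 16x6 strip around the edge into a temporary
; buffer with a 16-byte stride, run the vertical filter above on that buffer, then
; transpose back only the middle 4 rows (the only ones the filter can modify).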
INIT_MMX cpuname
cglobal deblock_h_luma_8, 5,9,0,0x60+16*WIN64
    movsxd r7,  r1d
    lea    r8,  [r7+r7*2]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x30 ; shadow space + r4
%else
    %define pix_tmp rsp
%endif

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r7, r8), pix_tmp
    lea    r6, [r6+r7*8]
    lea    r5, [r5+r7*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r6, r5, r7, r8), pix_tmp+8

    ; vertical filter
    ; alpha, beta, tc0 are still in r2d, r3d, r4
    ; don't backup r6, r5, r7, r8 because deblock_v_luma_sse2 doesn't use them
    lea    r0, [pix_tmp+0x30]
    mov    r1d, 0x10
%if WIN64
    mov    [rsp+0x20], r4
%endif
    call   deblock_v_luma_8

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    add    r6, 2
    add    r5, 2
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r7, r8)

    shl    r7,  3
    sub    r6,  r7
    sub    r5,  r7
    shr    r7,  3
    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r6, r5, r7, r8)

    RET
%endmacro

INIT_XMM sse2
DEBLOCK_LUMA
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA
%endif

%else

%macro DEBLOCK_LUMA 2
;-----------------------------------------------------------------------------
; void ff_deblock_v8_luma(uint8_t *pix, int stride, int alpha, int beta,
;                         int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_%1_luma_8, 5,5,8,2*%2
    lea     r4, [r1*3]
    dec     r2     ; alpha-1
    neg     r4
    dec     r3     ; beta-1
    add     r4, r0 ; pix-3*stride

    mova    m0, [r4+r1]   ; p1
    mova    m1, [r4+2*r1] ; p0
    mova    m2, [r0]      ; q0
    mova    m3, [r0+r1]   ; q1
    LOAD_MASK r2, r3

    mov     r3, r4mp
    pcmpeqb m3, m3
    movd    m4, [r3] ; tc0
    punpcklbw m4, m4
    punpcklbw m4, m4 ; tc = 4x tc0[3], 4x tc0[2], 4x tc0[1], 4x tc0[0]
    mova   [esp+%2], m4 ; tc
    pcmpgtb m4, m3
    mova    m3, [r4] ; p2
    pand    m4, m7
    mova   [esp], m4 ; mask

    DIFF_GT2 m1, m3, m5, m6, m7 ; |p2-p0| > beta-1
    pand    m6, m4
    pand    m4, [esp+%2] ; tc
    psubb   m7, m4, m6
    pand    m6, m4
    LUMA_Q1 m0, m3, [r4], [r4+r1], m6, m4

    mova    m4, [r0+2*r1] ; q2
    DIFF_GT2 m2, m4, m5, m6, m3 ; |q2-q0| > beta-1
    pand    m6, [esp] ; mask
    mova    m5, [esp+%2] ; tc
    psubb   m7, m6
    pand    m5, m6
    mova    m3, [r0+r1]
    LUMA_Q1 m3, m4, [r0+2*r1], [r0+r1], m5, m6

    DEBLOCK_P0_Q0
    mova    [r4+2*r1], m1
    mova    [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_luma(uint8_t *pix, int stride, int alpha, int beta,
;                        int8_t *tc0)
;-----------------------------------------------------------------------------
INIT_MMX cpuname
cglobal deblock_h_luma_8, 0,5,8,0x60+12
    mov    r0, r0mp
    mov    r3, r1m
    lea    r4, [r3*3]
    sub    r0, 4
    lea    r1, [r0+r4]
%define pix_tmp esp+12

    ; transpose 6x16 -> tmp space
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp
    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    TRANSPOSE6x8_MEM  PASS8ROWS(r0, r1, r3, r4), pix_tmp+8

    ; vertical filter
    lea    r0, [pix_tmp+0x30]
    PUSH   dword r4m
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   dword r0
    call   deblock_%1_luma_8
%ifidn %1, v8
    add    dword [esp   ], 8 ; pix_tmp+0x38
    add    dword [esp+16], 2 ; tc0+2
    call   deblock_%1_luma_8
%endif
    ADD    esp, 20

    ; transpose 16x4 -> original space  (only the middle 4 rows were changed by the filter)
    mov    r0, r0mp
    sub    r0, 2

    movq   m0, [pix_tmp+0x10]
    movq   m1, [pix_tmp+0x20]
    lea    r1, [r0+r4]
    movq   m2, [pix_tmp+0x30]
    movq   m3, [pix_tmp+0x40]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

    lea    r0, [r0+r3*8]
    lea    r1, [r1+r3*8]
    movq   m0, [pix_tmp+0x18]
    movq   m1, [pix_tmp+0x28]
    movq   m2, [pix_tmp+0x38]
    movq   m3, [pix_tmp+0x48]
    TRANSPOSE8x4B_STORE  PASS8ROWS(r0, r1, r3, r4)

    RET
%endmacro ; DEBLOCK_LUMA

INIT_MMX mmxext
DEBLOCK_LUMA v8, 8
INIT_XMM sse2
DEBLOCK_LUMA v, 16
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA v, 16
%endif

%endif ; ARCH



%macro LUMA_INTRA_P012 4 ; p0..p3 in memory
%if ARCH_X86_64
    pavgb t0, p2, p1
    pavgb t1, p0, q0
%else
    mova  t0, p2
    mova  t1, p0
    pavgb t0, p1
    pavgb t1, q0
%endif
    pavgb t0, t1 ; ((p2+p1+1)/2 + (p0+q0+1)/2 + 1)/2
    mova  t5, t1
%if ARCH_X86_64
    paddb t2, p2, p1
    paddb t3, p0, q0
%else
    mova  t2, p2
    mova  t3, p0
    paddb t2, p1
    paddb t3, q0
%endif
    paddb t2, t3
    mova  t3, t2
    mova  t4, t2
    psrlw t2, 1
    pavgb t2, mpb_0
    pxor  t2, t0
    pand  t2, mpb_1
    psubb t0, t2 ; p1' = (p2+p1+p0+q0+2)/4;

%if ARCH_X86_64
    pavgb t1, p2, q1
    psubb t2, p2, q1
%else
    mova  t1, p2
    mova  t2, p2
    pavgb t1, q1
    psubb t2, q1
%endif
    paddb t3, t3
    psubb t3, t2 ; p2+2*p1+2*p0+2*q0+q1
    pand  t2, mpb_1
    psubb t1, t2
    pavgb t1, p1
    pavgb t1, t5 ; (((p2+q1)/2 + p1+1)/2 + (p0+q0+1)/2 + 1)/2
    psrlw t3, 2
    pavgb t3, mpb_0
    pxor  t3, t1
    pand  t3, mpb_1
    psubb t1, t3 ; p0'a = (p2+2*p1+2*p0+2*q0+q1+4)/8

    pxor  t3, p0, q1
    pavgb t2, p0, q1
    pand  t3, mpb_1
    psubb t2, t3
    pavgb t2, p1 ; p0'b = (2*p1+p0+q1+2)/4
 
    pxor  t1, t2
    pxor  t2, p0
    pand  t1, mask1p
    pand  t2, mask0
    pxor  t1, t2
    pxor  t1, p0
    mova  %1, t1 ; store p0

    mova  t1, %4 ; p3
    paddb t2, t1, p2
    pavgb t1, p2
    pavgb t1, t0 ; (p3+p2+1)/2 + (p2+p1+p0+q0+2)/4
    paddb t2, t2
    paddb t2, t4 ; 2*p3+3*p2+p1+p0+q0
    psrlw t2, 2
    pavgb t2, mpb_0
    pxor  t2, t1
    pand  t2, mpb_1
    psubb t1, t2 ; p2' = (2*p3+3*p2+p1+p0+q0+4)/8

    pxor  t0, p1
    pxor  t1, p2
    pand  t0, mask1p
    pand  t1, mask1p
    pxor  t0, p1
    pxor  t1, p2
    mova  %2, t0 ; store p1
    mova  %3, t1 ; store p2
%endmacro
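; Selection summary: where mask1p is set (|p2-p0| < beta and |p0-q0| < alpha/4+2, on
; top of the basic mask0 condition) p0/p1/p2 take the long-tap values computed above;
; where only mask0 is set, p0 falls back to p0'b and p1/p2 stay unchanged; outside
; mask0 the original bytes are written back (the pxor/pand/pxor sequences implement
; the byte-wise select).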
 
%macro LUMA_INTRA_SWAP_PQ 0
    %define q1 m0
    %define q0 m1
    %define p0 m2
    %define p1 m3
    %define p2 q2
    %define mask1p mask1q
%endmacro

%macro DEBLOCK_LUMA_INTRA 1
    %define p1 m0
    %define p0 m1
    %define q0 m2
    %define q1 m3
    %define t0 m4
    %define t1 m5
    %define t2 m6
    %define t3 m7
%if ARCH_X86_64
    %define p2 m8
    %define q2 m9
    %define t4 m10
    %define t5 m11
    %define mask0 m12
    %define mask1p m13
%if WIN64
    %define mask1q [rsp]
%else
    %define mask1q [rsp-24]
%endif
    %define mpb_0 m14
    %define mpb_1 m15
%else
    %define spill(x) [esp+16*x]
    %define p2 [r4+r1]
    %define q2 [r0+2*r1]
    %define t4 spill(0)
    %define t5 spill(1)
    %define mask0 spill(2)
    %define mask1p spill(3)
    %define mask1q spill(4)
    %define mpb_0 [pb_0]
    %define mpb_1 [pb_1]
%endif

;-----------------------------------------------------------------------------
; void ff_deblock_v_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
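; The bS=4 (intra) filter combines the basic LOAD_MASK condition with the stronger
; per-pixel test |p0-q0| < (alpha>>2)+2.  The "alpha/4+1" value built below from
; alpha-1 with two pavgb steps equals (alpha>>2)+1, so testing "<= alpha/4+1" via
; DIFF_GT2 reproduces that condition.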
%if WIN64
cglobal deblock_%1_luma_intra_8, 4,6,16,0x10
%else
cglobal deblock_%1_luma_intra_8, 4,6,16,ARCH_X86_64*0x50-0x50
%endif
    lea     r4, [r1*4]
    lea     r5, [r1*3] ; 3*stride
    dec     r2d        ; alpha-1
    jl .end
    neg     r4
    dec     r3d        ; beta-1
    jl .end
    add     r4, r0     ; pix-4*stride
    mova    p1, [r4+2*r1]
    mova    p0, [r4+r5]
    mova    q0, [r0]
    mova    q1, [r0+r1]
%if ARCH_X86_64
    pxor    mpb_0, mpb_0
    mova    mpb_1, [pb_1]
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    SWAP    7, 12 ; m12=mask0
    pavgb   t5, mpb_0
    pavgb   t5, mpb_1 ; alpha/4+1
    movdqa  p2, [r4+r1]
    movdqa  q2, [r0+2*r1]
    DIFF_GT2 p0, q0, t5, t0, t3 ; t0 = |p0-q0| > alpha/4+1
    DIFF_GT2 p0, p2, m5, t2, t5 ; mask1 = |p2-p0| > beta-1
    DIFF_GT2 q0, q2, m5, t4, t5 ; t4 = |q2-q0| > beta-1
    pand    t0, mask0
    pand    t4, t0
    pand    t2, t0
    mova    mask1q, t4
    mova    mask1p, t2
%else
    LOAD_MASK r2d, r3d, t5 ; m5=beta-1, t5=alpha-1, m7=mask0
    mova    m4, t5
    mova    mask0, m7
    pavgb   m4, [pb_0]
    pavgb   m4, [pb_1] ; alpha/4+1
    DIFF_GT2 p0, q0, m4, m6, m7 ; m6 = |p0-q0| > alpha/4+1
    pand    m6, mask0
    DIFF_GT2 p0, p2, m5, m4, m7 ; m4 = |p2-p0| > beta-1
    pand    m4, m6
    mova    mask1p, m4
    DIFF_GT2 q0, q2, m5, m4, m7 ; m4 = |q2-q0| > beta-1
    pand    m4, m6
    mova    mask1q, m4
%endif
    LUMA_INTRA_P012 [r4+r5], [r4+2*r1], [r4+r1], [r4]
    LUMA_INTRA_SWAP_PQ
    LUMA_INTRA_P012 [r0], [r0+r1], [r0+2*r1], [r0+r5]
.end:
    RET

INIT_MMX cpuname
%if ARCH_X86_64
;-----------------------------------------------------------------------------
; void ff_deblock_h_luma_intra(uint8_t *pix, int stride, int alpha, int beta)
;-----------------------------------------------------------------------------
cglobal deblock_h_luma_intra_8, 4,9,0,0x80
    movsxd r7,  r1d
    lea    r8,  [r7*3]
    lea    r6,  [r0-4]
    lea    r5,  [r0-4+r8]
%if WIN64
    %define pix_tmp rsp+0x20 ; shadow space
%else
    %define pix_tmp rsp
%endif

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r6, [r6+r7*8]
    lea    r5, [r5+r7*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r6, r5, r7, r8), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    mov    r1,  0x10
    call   deblock_v_luma_intra_8

    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    lea    r5, [r6+r8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    shl    r7,  3
    sub    r6,  r7
    sub    r5,  r7
    shr    r7,  3
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r6, r5, r7, r8)
    RET
%else
cglobal deblock_h_luma_intra_8, 2,4,8,0x80
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    %define pix_tmp rsp

    ; transpose 8x16 -> tmp space
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(r0, r2, r1, r3), PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30)

    lea    r0,  [pix_tmp+0x40]
    PUSH   dword r3m
    PUSH   dword r2m
    PUSH   dword 16
    PUSH   r0
    call   deblock_%1_luma_intra_8
%ifidn %1, v8
    add    dword [rsp], 8 ; pix_tmp+8
    call   deblock_%1_luma_intra_8
%endif
    ADD    esp, 16

    mov    r1,  r1m
    mov    r0,  r0mp
    lea    r3,  [r1*3]
    sub    r0,  4
    lea    r2,  [r0+r3]
    ; transpose 16x6 -> original space (but we can't write only 6 pixels, so really 16x8)
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp, pix_tmp+0x30, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    lea    r0,  [r0+r1*8]
    lea    r2,  [r2+r1*8]
    TRANSPOSE8x8_MEM  PASS8ROWS(pix_tmp+8, pix_tmp+0x38, 0x10, 0x30), PASS8ROWS(r0, r2, r1, r3)
    RET
%endif ; ARCH_X86_64
%endmacro ; DEBLOCK_LUMA_INTRA

INIT_XMM sse2
DEBLOCK_LUMA_INTRA v
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
DEBLOCK_LUMA_INTRA v
%endif
%if ARCH_X86_64 == 0
INIT_MMX mmxext
DEBLOCK_LUMA_INTRA v8
%endif

INIT_MMX mmxext

%macro CHROMA_V_START 0
    dec    r2d      ; alpha-1
    dec    r3d      ; beta-1
    mov    t5, r0
    sub    t5, r1
    sub    t5, r1
%endmacro

%macro CHROMA_H_START 0
    dec    r2d
    dec    r3d
    sub    r0, 2
    lea    t6, [r1*3]
    mov    t5, r0
    add    r0, t6
%endmacro

%define t5 r5
%define t6 r6

;-----------------------------------------------------------------------------
; void ff_deblock_v_chroma(uint8_t *pix, int stride, int alpha, int beta,
;                          int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_v_chroma_8, 5,6
    CHROMA_V_START
    movq  m0, [t5]
    movq  m1, [t5+r1]
    movq  m2, [r0]
    movq  m3, [r0+r1]
    call ff_chroma_inter_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;-----------------------------------------------------------------------------
; void ff_deblock_h_chroma(uint8_t *pix, int stride, int alpha, int beta,
;                          int8_t *tc0)
;-----------------------------------------------------------------------------
cglobal deblock_h_chroma_8, 5,7
%if ARCH_X86_64
    ; This could use the red zone on 64 bit unix to avoid the stack pointer
    ; readjustment, but valgrind assumes the red zone is clobbered on
    ; function calls and returns.
    sub   rsp, 16
    %define buf0 [rsp]
    %define buf1 [rsp+8]
%else
    %define buf0 r0m
    %define buf1 r2m
%endif
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    movq  buf0, m0
    movq  buf1, m3
    LOAD_MASK  r2d, r3d
    movd       m6, [r4] ; tc0
    punpcklbw  m6, m6
    pand       m7, m6
    DEBLOCK_P0_Q0
    movq  m0, buf0
    movq  m3, buf1
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
%if ARCH_X86_64
    add   rsp, 16
%endif
    RET

ALIGN 16
ff_chroma_inter_body_mmxext:
    LOAD_MASK  r2d, r3d
    movd       m6, [r4] ; tc0
    punpcklbw  m6, m6
    pand       m7, m6
    DEBLOCK_P0_Q0
    ret
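; For chroma only p0 and q0 are filtered, so the inter path is just LOAD_MASK plus
; DEBLOCK_P0_Q0.  The single punpcklbw replicates each of the 4 tc0 bytes once,
; giving one tc value per pair of pixels across the 8-pixel chroma edge.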
 
 
 
; in: %1=p0 %2=p1 %3=q1
; out: p0 = (p0 + q1 + 2*p1 + 2) >> 2
%macro CHROMA_INTRA_P0 3
    movq    m4, %1
    pxor    m4, %3
    pand    m4, [pb_1] ; m4 = (p0^q1)&1
    pavgb   %1, %3
    psubusb %1, m4
    pavgb   %1, %2             ; dst = avg(p1, avg(p0,q1) - ((p0^q1)&1))
%endmacro

%define t5 r4
%define t6 r5

;------------------------------------------------------------------------------
; void ff_deblock_v_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
cglobal deblock_v_chroma_intra_8, 4,5
    CHROMA_V_START
    movq  m0, [t5]
    movq  m1, [t5+r1]
    movq  m2, [r0]
    movq  m3, [r0+r1]
    call ff_chroma_intra_body_mmxext
    movq  [t5+r1], m1
    movq  [r0], m2
    RET

;------------------------------------------------------------------------------
; void ff_deblock_h_chroma_intra(uint8_t *pix, int stride, int alpha, int beta)
;------------------------------------------------------------------------------
cglobal deblock_h_chroma_intra_8, 4,6
    CHROMA_H_START
    TRANSPOSE4x8_LOAD  bw, wd, dq, PASS8ROWS(t5, r0, r1, t6)
    call ff_chroma_intra_body_mmxext
    TRANSPOSE8x4B_STORE PASS8ROWS(t5, r0, r1, t6)
    RET

ALIGN 16
ff_chroma_intra_body_mmxext:
    LOAD_MASK r2d, r3d
    movq   m5, m1
    movq   m6, m2
    CHROMA_INTRA_P0  m1, m0, m3
    CHROMA_INTRA_P0  m2, m3, m0
    psubb  m1, m5
    psubb  m2, m6
    pand   m1, m7
    pand   m2, m7
    paddb  m1, m5
    paddb  m2, m6
    ret

;-----------------------------------------------------------------------------
; void ff_h264_loop_filter_strength(int16_t bs[2][4][4], uint8_t nnz[40],
;                                   int8_t ref[2][40], int16_t mv[2][40][2],
;                                   int bidir,    int edges,    int step,
;                                   int mask_mv0, int mask_mv1, int field);
;
; bidir    is 0 or 1
; edges    is 1 or 4
; step     is 1 or 2
; mask_mv0 is 0 or 3
; mask_mv1 is 0 or 1
; field    is 0 or 1
;-----------------------------------------------------------------------------
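; Output: one int16 boundary-strength value per 4-pixel edge segment, stored to
; bs[dir].  A segment gets 2 if either adjacent block has nonzero coefficients,
; else 1 if the reference indices differ or any mv component differs by 4 or more
; (quarter-pel units; with field=1 the pb_3_1 constant lowers the threshold for
; the vertical component to 2), else 0.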
%macro loop_filter_strength_iteration 7 ; edges, step, mask_mv,
                                        ; dir, d_idx, mask_dir, bidir
%define edgesd    %1
%define stepd     %2
%define mask_mvd  %3
%define dir       %4
%define d_idx     %5
%define mask_dir  %6
%define bidir     %7
    xor          b_idxd, b_idxd ; for (b_idx = 0; b_idx < edges; b_idx += step)
%%.b_idx_loop:
%if mask_dir == 0
    pxor             m0, m0
%endif
    test         b_idxd, dword mask_mvd
    jnz %%.skip_loop_iter                       ; if (!(b_idx & mask_mv))
%if bidir == 1
    movd             m2, [refq+b_idxq+d_idx+12] ; { ref0[bn] }
    punpckldq        m2, [refq+b_idxq+d_idx+52] ; { ref0[bn], ref1[bn] }
    pshufw           m0, [refq+b_idxq+12], 0x44 ; { ref0[b],  ref0[b]  }
    pshufw           m1, [refq+b_idxq+52], 0x44 ; { ref1[b],  ref1[b]  }
    pshufw           m3, m2, 0x4E               ; { ref1[bn], ref0[bn] }
    psubb            m0, m2                     ; { ref0[b] != ref0[bn],
                                                ;   ref0[b] != ref1[bn] }
    psubb            m1, m3                     ; { ref1[b] != ref1[bn],
                                                ;   ref1[b] != ref0[bn] }

    por              m0, m1
    mova             m1, [mvq+b_idxq*4+(d_idx+12)*4]
    mova             m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    mova             m3, m1
    mova             m4, m2
    psubw            m1, [mvq+b_idxq*4+12*4]
    psubw            m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m3, [mvq+b_idxq*4+52*4]
    psubw            m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb         m1, m2
    packsswb         m3, m4
    paddb            m1, m6
    paddb            m3, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb          m3, m5
    packsswb         m1, m3

    por              m0, m1
    mova             m1, [mvq+b_idxq*4+(d_idx+52)*4]
    mova             m2, [mvq+b_idxq*4+(d_idx+52)*4+mmsize]
    mova             m3, m1
    mova             m4, m2
    psubw            m1, [mvq+b_idxq*4+12*4]
    psubw            m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m3, [mvq+b_idxq*4+52*4]
    psubw            m4, [mvq+b_idxq*4+52*4+mmsize]
    packsswb         m1, m2
    packsswb         m3, m4
    paddb            m1, m6
    paddb            m3, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    psubusb          m3, m5
    packsswb         m1, m3

    pshufw           m1, m1, 0x4E
    por              m0, m1
    pshufw           m1, m0, 0x4E
    pminub           m0, m1
%else ; bidir == 0
    movd             m0, [refq+b_idxq+12]
    psubb            m0, [refq+b_idxq+d_idx+12] ; ref[b] != ref[bn]

    mova             m1, [mvq+b_idxq*4+12*4]
    mova             m2, [mvq+b_idxq*4+12*4+mmsize]
    psubw            m1, [mvq+b_idxq*4+(d_idx+12)*4]
    psubw            m2, [mvq+b_idxq*4+(d_idx+12)*4+mmsize]
    packsswb         m1, m2
    paddb            m1, m6
    psubusb          m1, m5 ; abs(mv[b] - mv[bn]) >= limit
    packsswb         m1, m1
    por              m0, m1
%endif ; bidir == 1/0

%%.skip_loop_iter:
    movd             m1, [nnzq+b_idxq+12]
    por              m1, [nnzq+b_idxq+d_idx+12] ; nnz[b] || nnz[bn]

    pminub           m1, m7
    pminub           m0, m7
    psllw            m1, 1
    pxor             m2, m2
    pmaxub           m1, m0
    punpcklbw        m1, m2
    movq [bsq+b_idxq+32*dir], m1

    add          b_idxd, dword stepd
    cmp          b_idxd, dword edgesd
    jl %%.b_idx_loop
%endmacro
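; The macro is expanded twice per call: first with dir=1, d_idx=-8 (which, under the
; 8-entry-wide row layout implied by the +12/+52 offsets, selects the block above),
; then with dir=0, d_idx=-1 (the block to the left, with fixed edges=32, step=8).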
 
INIT_MMX mmxext
cglobal h264_loop_filter_strength, 9, 9, 0, bs, nnz, ref, mv, bidir, edges, \
                                            step, mask_mv0, mask_mv1, field
%define b_idxq bidirq
%define b_idxd bidird
    cmp    dword fieldm, 0
    mova             m7, [pb_1]
    mova             m5, [pb_3]
    je .nofield
    mova             m5, [pb_3_1]
.nofield:
    mova             m6, m5
    paddb            m5, m5

    shl     dword stepd, 3
    shl    dword edgesd, 3
%if ARCH_X86_32
%define mask_mv0d mask_mv0m
%define mask_mv1d mask_mv1m
%endif
    shl dword mask_mv1d, 3
    shl dword mask_mv0d, 3

    cmp    dword bidird, 0
    jne .bidir
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8,  0, 0
    loop_filter_strength_iteration     32,     8, mask_mv0d, 0, -1, -1, 0

    mova             m0, [bsq+mmsize*0]
    mova             m1, [bsq+mmsize*1]
    mova             m2, [bsq+mmsize*2]
    mova             m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova  [bsq+mmsize*0], m0
    mova  [bsq+mmsize*1], m1
    mova  [bsq+mmsize*2], m2
    mova  [bsq+mmsize*3], m3
    RET

.bidir:
    loop_filter_strength_iteration edgesd, stepd, mask_mv1d, 1, -8,  0, 1
    loop_filter_strength_iteration     32,     8, mask_mv0d, 0, -1, -1, 1

    mova             m0, [bsq+mmsize*0]
    mova             m1, [bsq+mmsize*1]
    mova             m2, [bsq+mmsize*2]
    mova             m3, [bsq+mmsize*3]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    mova  [bsq+mmsize*0], m0
    mova  [bsq+mmsize*1], m1
    mova  [bsq+mmsize*2], m2
    mova  [bsq+mmsize*3], m3
    RET