; (Provenance: KolibriOS Subversion mirror of FFmpeg, rev 4349, author Serge;
;  web-viewer navigation chrome and interleaved line numbers removed.)
;******************************************************************************
;* VC1 deblocking optimizations
;* Copyright (c) 2009 David Conrad
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************
%include "libavutil/x86/x86util.asm"
23
 
24
cextern pw_4
25
cextern pw_5
26
 
27
section .text
28
 
29
; dst_low, dst_high (src), zero
30
; zero-extends one vector from 8 to 16 bits
31
%macro UNPACK_8TO16 4
32
    mova      m%2, m%3
33
    punpckh%1 m%3, m%4
34
    punpckl%1 m%2, m%4
35
%endmacro
36
 
37
%macro STORE_4_WORDS 6
38
%if cpuflag(sse4)
39
    pextrw %1, %5, %6+0
40
    pextrw %2, %5, %6+1
41
    pextrw %3, %5, %6+2
42
    pextrw %4, %5, %6+3
43
%else
44
    movd  %6d, %5
45
%if mmsize==16
46
    psrldq %5, 4
47
%else
48
    psrlq  %5, 32
49
%endif
50
    mov    %1, %6w
51
    shr    %6, 16
52
    mov    %2, %6w
53
    movd  %6d, %5
54
    mov    %3, %6w
55
    shr    %6, 16
56
    mov    %4, %6w
57
%endif
58
%endmacro
59
 
60
; in:  p1 p0 q0 q1, clobbers p0
61
; out: p1 = (2*(p1 - q1) - 5*(p0 - q0) + 4) >> 3
62
%macro VC1_LOOP_FILTER_A0 4
63
    psubw  %1, %4
64
    psubw  %2, %3
65
    paddw  %1, %1
66
    pmullw %2, [pw_5]
67
    psubw  %1, %2
68
    paddw  %1, [pw_4]
69
    psraw  %1, 3
70
%endmacro
71
 
72
; in: p0 q0 a0 a1 a2
73
;     m0 m1 m7 m6 m5
74
; %1: size
75
; out: m0=p0' m1=q0'
76
%macro VC1_FILTER 1
77
    PABSW   m4, m7
78
    PABSW   m3, m6
79
    PABSW   m2, m5
80
    mova    m6, m4
81
    pminsw  m3, m2
82
    pcmpgtw m6, m3  ; if (a2 < a0 || a1 < a0)
83
    psubw   m3, m4
84
    pmullw  m3, [pw_5]   ; 5*(a3 - a0)
85
    PABSW   m2, m3
86
    psraw   m2, 3   ; abs(d/8)
87
    pxor    m7, m3  ; d_sign ^= a0_sign
88
 
89
    pxor    m5, m5
90
    movd    m3, r2d
91
%if %1 > 4
92
    punpcklbw m3, m3
93
%endif
94
    punpcklbw m3, m5
95
    pcmpgtw m3, m4  ; if (a0 < pq)
96
    pand    m6, m3
97
 
98
    mova    m3, m0
99
    psubw   m3, m1
100
    PABSW   m4, m3
101
    psraw   m4, 1
102
    pxor    m3, m7  ; d_sign ^ clip_sign
103
    psraw   m3, 15
104
    pminsw  m2, m4  ; min(d, clip)
105
    pcmpgtw m4, m5
106
    pand    m6, m4  ; filt3 (C return value)
107
 
108
; each set of 4 pixels is not filtered if the 3rd is not
109
%if mmsize==16
110
    pshuflw m4, m6, 0xaa
111
%if %1 > 4
112
    pshufhw m4, m4, 0xaa
113
%endif
114
%else
115
    pshufw  m4, m6, 0xaa
116
%endif
117
    pandn   m3, m4
118
    pand    m2, m6
119
    pand    m3, m2  ; d final
120
 
121
    psraw   m7, 15
122
    pxor    m3, m7
123
    psubw   m3, m7
124
    psubw   m0, m3
125
    paddw   m1, m3
126
    packuswb m0, m0
127
    packuswb m1, m1
128
%endmacro
129
 
130
; 1st param: size of filter
131
; 2nd param: mov suffix equivalent to the filter size
132
%macro VC1_V_LOOP_FILTER 2
133
    pxor      m5, m5
134
    mov%2     m6, [r4]
135
    mov%2     m4, [r4+r1]
136
    mov%2     m7, [r4+2*r1]
137
    mov%2     m0, [r4+r3]
138
    punpcklbw m6, m5
139
    punpcklbw m4, m5
140
    punpcklbw m7, m5
141
    punpcklbw m0, m5
142
 
143
    VC1_LOOP_FILTER_A0 m6, m4, m7, m0
144
    mov%2     m1, [r0]
145
    mov%2     m2, [r0+r1]
146
    punpcklbw m1, m5
147
    punpcklbw m2, m5
148
    mova      m4, m0
149
    VC1_LOOP_FILTER_A0 m7, m4, m1, m2
150
    mov%2     m3, [r0+2*r1]
151
    mov%2     m4, [r0+r3]
152
    punpcklbw m3, m5
153
    punpcklbw m4, m5
154
    mova      m5, m1
155
    VC1_LOOP_FILTER_A0 m5, m2, m3, m4
156
 
157
    VC1_FILTER %1
158
    mov%2 [r4+r3], m0
159
    mov%2 [r0],    m1
160
%endmacro
161
 
162
; 1st param: size of filter
163
;     NOTE: UNPACK_8TO16 this number of 8 bit numbers are in half a register
164
; 2nd (optional) param: temp register to use for storing words
165
%macro VC1_H_LOOP_FILTER 1-2
166
%if %1 == 4
167
    movq      m0, [r0     -4]
168
    movq      m1, [r0+  r1-4]
169
    movq      m2, [r0+2*r1-4]
170
    movq      m3, [r0+  r3-4]
171
    TRANSPOSE4x4B 0, 1, 2, 3, 4
172
%else
173
    movq      m0, [r0     -4]
174
    movq      m4, [r0+  r1-4]
175
    movq      m1, [r0+2*r1-4]
176
    movq      m5, [r0+  r3-4]
177
    movq      m2, [r4     -4]
178
    movq      m6, [r4+  r1-4]
179
    movq      m3, [r4+2*r1-4]
180
    movq      m7, [r4+  r3-4]
181
    punpcklbw m0, m4
182
    punpcklbw m1, m5
183
    punpcklbw m2, m6
184
    punpcklbw m3, m7
185
    TRANSPOSE4x4W 0, 1, 2, 3, 4
186
%endif
187
    pxor      m5, m5
188
 
189
    UNPACK_8TO16 bw, 6, 0, 5
190
    UNPACK_8TO16 bw, 7, 1, 5
191
    VC1_LOOP_FILTER_A0 m6, m0, m7, m1
192
    UNPACK_8TO16 bw, 4, 2, 5
193
    mova    m0, m1                      ; m0 = p0
194
    VC1_LOOP_FILTER_A0 m7, m1, m4, m2
195
    UNPACK_8TO16 bw, 1, 3, 5
196
    mova    m5, m4
197
    VC1_LOOP_FILTER_A0 m5, m2, m1, m3
198
    SWAP 1, 4                           ; m1 = q0
199
 
200
    VC1_FILTER %1
201
    punpcklbw m0, m1
202
%if %0 > 1
203
    STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, %2
204
%if %1 > 4
205
    psrldq m0, 4
206
    STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, %2
207
%endif
208
%else
209
    STORE_4_WORDS [r0-1], [r0+r1-1], [r0+2*r1-1], [r0+r3-1], m0, 0
210
    STORE_4_WORDS [r4-1], [r4+r1-1], [r4+2*r1-1], [r4+r3-1], m0, 4
211
%endif
212
%endmacro
213
 
214
 
215
%macro START_V_FILTER 0
216
    mov  r4, r0
217
    lea  r3, [4*r1]
218
    sub  r4, r3
219
    lea  r3, [r1+2*r1]
220
    imul r2, 0x01010101
221
%endmacro
222
 
223
%macro START_H_FILTER 1
224
    lea  r3, [r1+2*r1]
225
%if %1 > 4
226
    lea  r4, [r0+4*r1]
227
%endif
228
    imul r2, 0x01010101
229
%endmacro
230
 
231
%macro VC1_LF 0
232
cglobal vc1_v_loop_filter_internal
233
    VC1_V_LOOP_FILTER 4, d
234
    ret
235
 
236
cglobal vc1_h_loop_filter_internal
237
    VC1_H_LOOP_FILTER 4, r4
238
    ret
239
 
240
; void ff_vc1_v_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
241
cglobal vc1_v_loop_filter4, 3,5,0
242
    START_V_FILTER
243
    call vc1_v_loop_filter_internal
244
    RET
245
 
246
; void ff_vc1_h_loop_filter4_mmxext(uint8_t *src, int stride, int pq)
247
cglobal vc1_h_loop_filter4, 3,5,0
248
    START_H_FILTER 4
249
    call vc1_h_loop_filter_internal
250
    RET
251
 
252
; void ff_vc1_v_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
253
cglobal vc1_v_loop_filter8, 3,5,0
254
    START_V_FILTER
255
    call vc1_v_loop_filter_internal
256
    add  r4, 4
257
    add  r0, 4
258
    call vc1_v_loop_filter_internal
259
    RET
260
 
261
; void ff_vc1_h_loop_filter8_mmxext(uint8_t *src, int stride, int pq)
262
cglobal vc1_h_loop_filter8, 3,5,0
263
    START_H_FILTER 4
264
    call vc1_h_loop_filter_internal
265
    lea  r0, [r0+4*r1]
266
    call vc1_h_loop_filter_internal
267
    RET
268
%endmacro
269
 
270
INIT_MMX mmxext
271
VC1_LF
272
 
273
INIT_XMM sse2
274
; void ff_vc1_v_loop_filter8_sse2(uint8_t *src, int stride, int pq)
275
cglobal vc1_v_loop_filter8, 3,5,8
276
    START_V_FILTER
277
    VC1_V_LOOP_FILTER 8, q
278
    RET
279
 
280
; void ff_vc1_h_loop_filter8_sse2(uint8_t *src, int stride, int pq)
281
cglobal vc1_h_loop_filter8, 3,6,8
282
    START_H_FILTER 8
283
    VC1_H_LOOP_FILTER 8, r5
284
    RET
285
 
286
INIT_MMX ssse3
287
; void ff_vc1_v_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
288
cglobal vc1_v_loop_filter4, 3,5,0
289
    START_V_FILTER
290
    VC1_V_LOOP_FILTER 4, d
291
    RET
292
 
293
; void ff_vc1_h_loop_filter4_ssse3(uint8_t *src, int stride, int pq)
294
cglobal vc1_h_loop_filter4, 3,5,0
295
    START_H_FILTER 4
296
    VC1_H_LOOP_FILTER 4, r4
297
    RET
298
 
299
INIT_XMM ssse3
300
; void ff_vc1_v_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
301
cglobal vc1_v_loop_filter8, 3,5,8
302
    START_V_FILTER
303
    VC1_V_LOOP_FILTER 8, q
304
    RET
305
 
306
; void ff_vc1_h_loop_filter8_ssse3(uint8_t *src, int stride, int pq)
307
cglobal vc1_h_loop_filter8, 3,6,8
308
    START_H_FILTER 8
309
    VC1_H_LOOP_FILTER 8, r5
310
    RET
311
 
312
INIT_XMM sse4
313
; void ff_vc1_h_loop_filter8_sse4(uint8_t *src, int stride, int pq)
314
cglobal vc1_h_loop_filter8, 3,5,8
315
    START_H_FILTER 8
316
    VC1_H_LOOP_FILTER 8
317
    RET