Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
4349 Serge 1
;*****************************************************************************
2
;* x86-optimized functions for yadif filter
3
;*
4
;* Copyright (C) 2006 Michael Niedermayer 
5
;* Copyright (c) 2013 Daniel Kang 
6
;* Copyright (c) 2011-2013 James Darnley 
7
;*
8
;* This file is part of FFmpeg.
9
;*
10
;* FFmpeg is free software; you can redistribute it and/or modify
11
;* it under the terms of the GNU General Public License as published by
12
;* the Free Software Foundation; either version 2 of the License, or
13
;* (at your option) any later version.
14
;*
15
;* FFmpeg is distributed in the hope that it will be useful,
16
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
17
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18
;* GNU General Public License for more details.
19
;*
20
;* You should have received a copy of the GNU General Public License along
21
;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
22
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23
;******************************************************************************
24
 
25
%include "libavutil/x86/x86util.asm"
26
 
27
SECTION_RODATA

; Constant vectors (16 bytes each) referenced by the macros below.
pw_1:    times 8 dw 1       ; per-word 1: isolates the rounding bit in CHECK
pw_8000: times 8 dw 0x8000  ; per-word bias added back in PACK (pre-SSE4 path)
pd_1:    times 4 dd 1       ; per-dword 1: used by CHECK2 and FILTER
pd_8000: times 4 dd 0x8000  ; per-dword bias subtracted in PACK (pre-SSE4 path)
33
 
34
SECTION .text
35
 
36
; PIXSHIFT1 reg
; Shift the whole register right by one 16-bit pixel (2 bytes), filling with
; zeros.  The instruction is selected by register width rather than by CPU
; flag: psrldq only exists for XMM registers, so 8-byte MMX registers must
; use psrlq.  This is the same test CHECK and FILTER use inline.
%macro PIXSHIFT1 1
%if mmsize == 16
    psrldq %1, 2
%else
    psrlq %1, 16
%endif
%endmacro
43
 
44
; PIXSHIFT2 reg
; Shift the whole register right by two 16-bit pixels (4 bytes), filling with
; zeros.  The instruction is selected by register width rather than by CPU
; flag: psrldq only exists for XMM registers, so 8-byte MMX registers must
; use psrlq.  This is the same test CHECK and FILTER use inline.
%macro PIXSHIFT2 1
%if mmsize == 16
    psrldq %1, 4
%else
    psrlq %1, 32
%endif
%endmacro
51
 
52
; PABS reg, tmp
; Replace every signed dword in reg with its absolute value.
;   reg - value, modified in place
;   tmp - scratch register; only written on the pre-SSSE3 path
%macro PABS 2
%if cpuflag(ssse3)
    pabsd %1, %1
%else
    pxor    %2, %2      ; tmp = 0
    pcmpgtd %2, %1      ; tmp = all-ones in lanes where reg < 0
    pxor    %1, %2      ; one's complement of the negative lanes ...
    psubd   %1, %2      ; ... then +1 (subtracting -1) completes the negation
%endif
%endmacro
62
 
63
; PACK reg
; Narrow the dwords of reg to unsigned 16-bit words (reg is used as both pack
; sources, so the packed result is duplicated in each half).
; Without SSE4's packusdw the unsigned pack is emulated: bias the dwords down
; by 0x8000, do a signed saturating pack, then bias the words back up.
%macro PACK 1
%if cpuflag(sse4)
    packusdw %1, %1
%else
    psubd    %1, [pd_8000]
    packssdw %1, %1
    paddw    %1, [pw_8000]
%endif
%endmacro
72
 
73
; PMINSD dst, src, tmp
; dst = per-dword signed minimum of dst and src.
;   tmp - scratch register; only written on the pre-SSE4 path
%macro PMINSD 3
%if cpuflag(sse4)
    pminsd %1, %2
%else
    mova    %3, %2
    pcmpgtd %3, %1      ; mask = src > dst, i.e. lanes where dst is kept
    pand    %1, %3      ; dst where dst is the smaller value
    pandn   %3, %2      ; src in all remaining lanes
    por     %1, %3
%endif
%endmacro
84
 
85
; PMAXSD dst, src, tmp
; dst = per-dword signed maximum of dst and src.
;   tmp - scratch register; only written on the pre-SSE4 path
%macro PMAXSD 3
%if cpuflag(sse4)
    pmaxsd %1, %2
%else
    mova    %3, %1
    pcmpgtd %3, %2      ; mask = dst > src, i.e. lanes where dst is kept
    pand    %1, %3      ; dst where dst is the larger value
    pandn   %3, %2      ; src in all remaining lanes
    por     %1, %3
%endif
%endmacro
96
 
97
; PMAXUW dst, src
; dst = per-word unsigned maximum of dst and src.
; The pre-SSE4 path uses saturating arithmetic: (dst -sat src) is
; max(dst - src, 0), and adding src back yields max(dst, src).
%macro PMAXUW 2
%if cpuflag(sse4)
    pmaxuw %1, %2
%else
    psubusw %1, %2
    paddusw %1, %2
%endif
%endmacro
105
 
106
; CHECK off1, off2
; Compare the row at curq+t1 displaced by off1 pixels with the row at curq+t0
; displaced by off2 pixels (pixels are 16-bit, hence the *2 byte scaling).
;
; In:       m7 must be zero (used to zero-extend words to dwords)
; Out:      m5 = floor((a + b) / 2) of the two rows, taken one pixel to the
;                right of the load position, as dwords
;           m2 = |a - b| summed over three neighbouring pixels, as dwords
; Clobbers: m3, m4
%macro CHECK 2
    movu      m2, [curq+t1+%1*2]
    movu      m3, [curq+t0+%2*2]
    mova      m4, m2
    mova      m5, m2
    pxor      m4, m3
    pavgw     m5, m3            ; rounds up: (a + b + 1) >> 1
    pand      m4, [pw_1]        ; 1 where a + b is odd
    psubusw   m5, m4            ; undo the round-up -> (a + b) >> 1
%if mmsize == 16
    psrldq    m5, 2
%else
    psrlq     m5, 16
%endif
    punpcklwd m5, m7
    mova      m4, m2
    psubusw   m2, m3            ; max(a - b, 0)
    psubusw   m3, m4            ; max(b - a, 0)
    PMAXUW    m2, m3            ; one of the two is zero -> |a - b|
    mova      m3, m2
    mova      m4, m2
%if mmsize == 16
    psrldq    m3, 2             ; differences one pixel to the right
    psrldq    m4, 4             ; differences two pixels to the right
%else
    psrlq     m3, 16            ; differences one pixel to the right
    psrlq     m4, 32            ; differences two pixels to the right
%endif
    punpcklwd m2, m7
    punpcklwd m3, m7
    punpcklwd m4, m7
    paddd     m2, m3
    paddd     m2, m4            ; sum of the three adjacent differences
%endmacro
140
 
141
; CHECK1
; Accept the candidate produced by CHECK wherever it beats the running
; minimum.
;
; In:       m0 = running minimum, m1 = value associated with that minimum,
;           m2 = candidate sum from CHECK, m5 = candidate average from CHECK
; Out:      m0 = min(m0, m2)
;           m1 = m5 in lanes where the old m0 > m2, unchanged elsewhere
;           m6 = that comparison mask (consumed by the following CHECK2)
; Clobbers: m3, m5
%macro CHECK1 0
    mova    m3, m0
    pcmpgtd m3, m2          ; mask: candidate is strictly smaller
    PMINSD  m0, m2, m6      ; m6 is only scratch here ...
    mova    m6, m3          ; ... and receives the mask afterwards
    pand    m5, m3
    pandn   m3, m1
    por     m3, m5
    mova    m1, m3
%endmacro
151
 
152
; CHECK2
; Same selection as CHECK1, but a lane may only be accepted if the preceding
; CHECK1 accepted its candidate in that lane.
;
; In:       m6 = mask left by CHECK1 (-1 = accepted, 0 = rejected),
;           m0, m1, m2, m5 as for CHECK1
; Out:      m0 = min(m0, penalised m2)
;           m1 = m5 in lanes where the old m0 > penalised m2
; Clobbers: m2, m3, m4, m5, m6
%macro CHECK2 0
    paddd   m6, [pd_1]      ; -1 -> 0, 0 -> 1
    pslld   m6, 30          ; 0 where accepted, 1<<30 where rejected
    paddd   m2, m6          ; rejected lanes get a large penalty added
    mova    m3, m0
    pcmpgtd m3, m2
    PMINSD  m0, m2, m4
    pand    m5, m3
    pandn   m3, m1
    por     m3, m5
    mova    m1, m3
%endmacro
164
 
165
; This version of CHECK2 has 3 fewer instructions on sets older than SSE4 but I
166
; am not sure whether it is any faster.  A rewrite or refactor of the filter
167
; code should make it possible to eliminate the move instruction at the end.  It
168
; exists to satisfy the expectation that the "score" values are in m1.
169
 
170
; %macro CHECK2 0
171
;     mova    m3, m0
172
;     pcmpgtd m0, m2
173
;     pand    m0, m6
174
;     mova    m6, m0
175
;     pand    m5, m6
176
;     pand    m2, m0
177
;     pandn   m6, m1
178
;     pandn   m0, m3
179
;     por     m6, m5
180
;     por     m0, m2
181
;     mova    m1, m6
182
; %endmacro
183
 
184
; LOAD dst, mem
; Load half a register's worth of 16-bit pixels from mem (via x86inc's movh)
; and zero-extend them to dwords.
; In: m7 must be zero; it supplies the upper word of every dword.
%macro LOAD 2
    movh      %1, %2
    punpcklwd %1, m7
%endmacro
188
 
189
; FILTER suffix, rowA, rowB
; Main loop of the line filter.  Each iteration produces mmsize/4 output
; pixels (16 bits each) at dstq.
;
;   suffix     - appended to the local labels so the macro can be expanded
;                more than once inside one function
;   rowA, rowB - pointers to the two rows whose average is the centre value
;                that the result is finally clamped around
;
; Expects dstq/prevq/curq/nextq plus the byte offsets t0 and t1 to be set up
; by the caller, and the stack arguments w and mode to be reachable as
; wm / modem.
;
; Stack scratch (each slot holds one register):
;   [rsp+ 0]  row at curq+t1, as dwords
;   [rsp+16]  (rowA + rowB) >> 1
;   [rsp+32]  row at curq+t0, as dwords
;   [rsp+48]  clamp range ("diff")
%macro FILTER 3
.loop%1:
    pxor         m7, m7                 ; zero for LOAD / CHECK unpacking
    LOAD         m0, [curq+t1]
    LOAD         m1, [curq+t0]
    LOAD         m2, [%2]
    LOAD         m3, [%3]
    mova         m4, m3
    paddd        m3, m2
    psrad        m3, 1                  ; m3 = (rowA + rowB) >> 1
    mova   [rsp+ 0], m0
    mova   [rsp+16], m3
    mova   [rsp+32], m1
    psubd        m2, m4
    PABS         m2, m4                 ; m2 = |rowA - rowB|
    LOAD         m3, [prevq+t1]
    LOAD         m4, [prevq+t0]
    psubd        m3, m0
    psubd        m4, m1
    PABS         m3, m5
    PABS         m4, m5
    paddd        m3, m4                 ; |prev[t1]-cur[t1]| + |prev[t0]-cur[t0]|
    psrld        m2, 1
    psrld        m3, 1
    PMAXSD       m2, m3, m6
    LOAD         m3, [nextq+t1]
    LOAD         m4, [nextq+t0]
    psubd        m3, m0
    psubd        m4, m1
    PABS         m3, m5
    PABS         m4, m5
    paddd        m3, m4                 ; |next[t1]-cur[t1]| + |next[t0]-cur[t0]|
    psrld        m3, 1
    PMAXSD       m2, m3, m6
    mova   [rsp+48], m2                 ; diff = max of the three halved terms

    paddd        m1, m0                 ; cur[t1] + cur[t0]
    paddd        m0, m0
    psubd        m0, m1                 ; cur[t1] - cur[t0]
    psrld        m1, 1                  ; m1 = average of the two cur rows
    PABS         m0, m2                 ; m0 = |cur[t1] - cur[t0]|

    ; Add the absolute row differences one pixel to the left and one pixel
    ; to the right of the current position, then subtract 1.
    movu         m2, [curq+t1-1*2]
    movu         m3, [curq+t0-1*2]
    mova         m4, m2
    psubusw      m2, m3
    psubusw      m3, m4
    PMAXUW       m2, m3                 ; per-word |difference|
    mova         m3, m2
%if mmsize == 16
    psrldq       m3, 4                  ; two pixels on: the right neighbour
%else
    psrlq        m3, 32                 ; two pixels on: the right neighbour
%endif
    punpcklwd    m2, m7
    punpcklwd    m3, m7
    paddd        m0, m2
    paddd        m0, m3
    psubd        m0, [pd_1]

    ; Try the displaced row pairs.  Each CHECK2 candidate is only eligible
    ; in lanes where the CHECK1 before it was accepted.
    CHECK -2, 0
    CHECK1
    CHECK -3, 1
    CHECK2
    CHECK 0, -2
    CHECK1
    CHECK 1, -3
    CHECK2

    mova         m6, [rsp+48]
    cmp  DWORD modem, 2
    jge .end%1                          ; mode >= 2: keep diff as it is

    ; mode < 2: widen diff using the rows two offsets away (t1*2 / t0*2).
    LOAD         m2, [%2+t1*2]
    LOAD         m4, [%3+t1*2]
    LOAD         m3, [%2+t0*2]
    LOAD         m5, [%3+t0*2]
    paddd        m2, m4
    paddd        m3, m5
    psrld        m2, 1                  ; average at offset t1*2
    psrld        m3, 1                  ; average at offset t0*2
    mova         m4, [rsp+ 0]
    mova         m5, [rsp+16]
    mova         m7, [rsp+32]           ; m7 stops being zero from here on
    psubd        m2, m4
    psubd        m3, m7
    mova         m0, m5
    psubd        m5, m4                 ; centre - cur[t1]
    psubd        m0, m7                 ; centre - cur[t0]
    mova         m4, m2
    PMINSD       m2, m3, m7
    PMAXSD       m3, m4, m7
    PMAXSD       m2, m5, m7
    PMINSD       m3, m5, m7
    PMAXSD       m2, m0, m7             ; m2 = max3(m0, m5, min of the two)
    PMINSD       m3, m0, m7             ; m3 = min3(m0, m5, max of the two)
    pxor         m4, m4
    PMAXSD       m6, m3, m7
    psubd        m4, m2                 ; -m2
    PMAXSD       m6, m4, m7             ; diff = max3(diff, m3, -m2)

.end%1:
    ; Clamp the selected value in m1 to [centre - diff, centre + diff].
    mova         m2, [rsp+16]
    mova         m3, m2
    psubd        m2, m6
    paddd        m3, m6
    PMAXSD       m1, m2, m7
    PMINSD       m1, m3, m7
    PACK         m1

    movh     [dstq], m1
    add        dstq, mmsize/2           ; mmsize/4 pixels * 2 bytes
    add       prevq, mmsize/2
    add        curq, mmsize/2
    add       nextq, mmsize/2
    sub    DWORD wm, mmsize/4           ; remaining width in pixels
    jg .loop%1
%endmacro
307
 
308
; YADIF
; Emit yadif_filter_line_16bit for the instruction set selected by the
; preceding INIT_XMM / INIT_MMX.
;
; Arguments, in order: dst, prev, cur, next, w, prefs, mrefs, parity, mode.
; The first four are loaded into registers by cglobal; w, parity and mode are
; read through their stack/argument aliases.  prefs and mrefs are loaded
; into the temporaries t0 and t1 respectively (sign-extended from 32 bits on
; x86-64).  80 bytes of stack are requested as scratch for FILTER.
;
; parity selects which pair of rows FILTER averages:
;   parity != 0 -> prev and cur
;   parity == 0 -> cur and next
%macro YADIF 0
%if ARCH_X86_32
cglobal yadif_filter_line_16bit, 4, 6, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
    mov            r4, r5mp
    mov            r5, r6mp
    DECLARE_REG_TMP 4,5
%else
cglobal yadif_filter_line_16bit, 4, 7, 8, 80, dst, prev, cur, next, w, \
                                              prefs, mrefs, parity, mode
    movsxd         r5, DWORD r5m
    movsxd         r6, DWORD r6m
    DECLARE_REG_TMP 5,6
%endif

    cmp DWORD paritym, 0
    je .parity0
    FILTER 1, prevq, curq
    jmp .ret

.parity0:
    FILTER 0, curq, nextq

.ret:
    RET
%endmacro
337
 
338
; Instantiate the filter once per supported instruction set.  The MMX
; variant is only built for 32-bit x86.
INIT_XMM sse4
YADIF
INIT_XMM ssse3
YADIF
INIT_XMM sse2
YADIF
%if ARCH_X86_32
INIT_MMX mmxext
YADIF
%endif