Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
4349 Serge 1
;******************************************************************************
2
;* x86-optimized horizontal line scaling functions
3
;* Copyright (c) 2011 Ronald S. Bultje 
4
;*
5
;* This file is part of FFmpeg.
6
;*
7
;* FFmpeg is free software; you can redistribute it and/or
8
;* modify it under the terms of the GNU Lesser General Public
9
;* License as published by the Free Software Foundation; either
10
;* version 2.1 of the License, or (at your option) any later version.
11
;*
12
;* FFmpeg is distributed in the hope that it will be useful,
13
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
14
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
15
;* Lesser General Public License for more details.
16
;*
17
;* You should have received a copy of the GNU Lesser General Public
18
;* License along with FFmpeg; if not, write to the Free Software
19
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
20
;******************************************************************************
21
 
22
%include "libavutil/x86/x86util.asm"
23
 
24
SECTION_RODATA

; Constant vectors used by SCALE_FUNC. Each table is 16 bytes and is loaded
; with mova, so it must be aligned to mmsize.
; NOTE(review): alignment is presumably provided by SECTION_RODATA (x86inc) - confirm.
max_19bit_int: times 4 dd 0x7ffff       ; (1<<19)-1 as int32: upper clip bound for the integer min (mmx/sse4)
max_19bit_flt: times 4 dd 524287.0      ; (1<<19)-1 as float: upper clip bound for minps (sse2/ssse3)
minshort:      times 8 dw 0x8000        ; bias subtracted from 16-bit input words before pmaddwd
unicoeff:      times 4 dd 0x20000000    ; 0x8000 * 0x4000, added back after the horizontal add
                                        ; (assumes the filter coefficients sum to 1<<14 - TODO confirm)
30
 
31
SECTION .text
32
 
33
;-----------------------------------------------------------------------------
34
; horizontal line scaling
35
;
36
; void hscale<source_width>to<intermediate_nbits>_<filterSize>_<opt>
37
;                               (SwsContext *c, int{16,32}_t *dst,
38
;                                int dstW, const uint{8,16}_t *src,
39
;                                const int16_t *filter,
40
;                                const int32_t *filterPos, int filterSize);
41
;
42
; Scale one horizontal line. Input is either 8-bits width or 16-bits width
43
; ($source_width can be either 8, 9, 10 or 16, difference is whether we have to
44
; downscale before multiplying). Filter is 14-bits. Output is either 15bits
45
; (in int16_t) or 19bits (in int32_t), as given in $intermediate_nbits. Each
46
; output pixel is generated from $filterSize input pixels, the position of
47
; the first pixel is given in filterPos[nOutputPixel].
48
;-----------------------------------------------------------------------------
49
 
50
; SCALE_FUNC source_width, intermediate_nbits, filtersize, filtersuffix, n_args, n_xmm
;
; Emits one function named hscale<source_width>to<intermediate_nbits>_<filtersuffix>.
;
;   source_width       input bit depth; 8 selects byte loads (srcmul = 1), any
;                      other value selects word loads (srcmul = 2); 16 also
;                      enables the unsigned->signed bias (minshort/unicoeff)
;   intermediate_nbits 15 (packed to int16 output) or 19 (int32 output, clipped
;                      to max_19bit_*)
;   filtersize         4, 8, or X (filter size taken from the fltsize argument)
;   filtersuffix       name suffix; X4 additionally selects the "extra 4 taps"
;                      tail handling in the X code path
;   n_args, n_xmm      forwarded to cglobal as its argument and xmm register
;                      counts
;
; Vector register roles, as set up at the top of the macro body:
;   m2 = clip maximum (only when intermediate_nbits == 19)
;   m3 = zero, used to unpack bytes to words (only when source_width == 8)
;   m6 = minshort, m7 = unicoeff (only when source_width == 16)
; The register of the first argument (SwsContext *c in the prototype comment
; above) is named pos0 and reused as scratch for filterPos values.
%macro SCALE_FUNC 6
%ifnidn %3, X
cglobal hscale%1to%2_%4, %5, 7, %6, pos0, dst, w, src, filter, fltpos, pos1
%else
cglobal hscale%1to%2_%4, %5, 10, %6, pos0, dst, w, srcmem, filter, fltpos, fltsize
%endif
%if ARCH_X86_64
    movsxd        wq, wd
%define mov32 movsxd
%else ; x86-32
%define mov32 mov
%endif ; x86-64
%if %2 == 19
%if mmsize == 8 ; mmx
    mova          m2, [max_19bit_int]
%elif cpuflag(sse4)
    mova          m2, [max_19bit_int]
%else ; ssse3/sse2
    mova          m2, [max_19bit_flt]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %2 == 19
%if %1 == 16
    mova          m6, [minshort]
    mova          m7, [unicoeff]
%elif %1 == 8
    pxor          m3, m3
%endif ; %1 == 8/16

    ; movlh loads 4 source pixels, movbh loads mmsize/2 source pixels;
    ; srcmul is the size of one source pixel in bytes
%if %1 == 8
%define movlh movd
%define movbh movh
%define srcmul 1
%else ; %1 == 9-16
%define movlh movq
%define movbh movu
%define srcmul 2
%endif ; %1 == 8/9-16

%ifnidn %3, X

    ; setup loop: filter, dst and fltpos are advanced to their end and wq is
    ; negated, so that wq counts upwards and the loop ends when it reaches 0
%if %3 == 8
    shl           wq, 1                         ; this allows *16 (i.e. now *8) in lea instructions for the 8-tap filter
%define wshr 1
%else ; %3 == 4
%define wshr 0
%endif ; %3 == 8
    lea      filterq, [filterq+wq*8]
%if %2 == 15
    lea         dstq, [dstq+wq*(2>>wshr)]
%else ; %2 == 19
    lea         dstq, [dstq+wq*(4>>wshr)]
%endif ; %2 == 15/19
    lea      fltposq, [fltposq+wq*(4>>wshr)]
    neg           wq

.loop:
%if %3 == 4 ; filterSize == 4 scaling
    ; load 2x4 or 4x4 source pixels into m0/m1
    mov32      pos0q, dword [fltposq+wq*4+ 0]   ; filterPos[0]
    mov32      pos1q, dword [fltposq+wq*4+ 4]   ; filterPos[1]
    movlh         m0, [srcq+pos0q*srcmul]       ; src[filterPos[0] + {0,1,2,3}]
%if mmsize == 8
    movlh         m1, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
%else ; mmsize == 16
%if %1 > 8
    movhps        m0, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
%else ; %1 == 8
    movd          m4, [srcq+pos1q*srcmul]       ; src[filterPos[1] + {0,1,2,3}]
%endif
    mov32      pos0q, dword [fltposq+wq*4+ 8]   ; filterPos[2]
    mov32      pos1q, dword [fltposq+wq*4+12]   ; filterPos[3]
    movlh         m1, [srcq+pos0q*srcmul]       ; src[filterPos[2] + {0,1,2,3}]
%if %1 > 8
    movhps        m1, [srcq+pos1q*srcmul]       ; src[filterPos[3] + {0,1,2,3}]
%else ; %1 == 8
    movd          m5, [srcq+pos1q*srcmul]       ; src[filterPos[3] + {0,1,2,3}]
    punpckldq     m0, m4
    punpckldq     m1, m5
%endif ; %1 == 8
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                        ; byte -> word
    punpcklbw     m1, m3                        ; byte -> word
%endif ; %1 == 8

    ; multiply with filter coefficients
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq+wq*8+mmsize*0]   ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [filterq+wq*8+mmsize*1]   ; *= filter[{8,9,..,14,15}]

    ; add up horizontally (4 srcpix * 4 coefficients -> 1 dstpix)
%if mmsize == 8 ; mmx
    movq          m4, m0
    punpckldq     m0, m1
    punpckhdq     m4, m1
    paddd         m0, m4
%elif notcpuflag(ssse3) ; sse2
    mova          m4, m0
    shufps        m0, m1, 10001000b
    shufps        m4, m1, 11011101b
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m0, m1                        ; filter[{ 0, 1, 2, 3}]*src[filterPos[0]+{0,1,2,3}],
                                                ; filter[{ 4, 5, 6, 7}]*src[filterPos[1]+{0,1,2,3}],
                                                ; filter[{ 8, 9,10,11}]*src[filterPos[2]+{0,1,2,3}],
                                                ; filter[{12,13,14,15}]*src[filterPos[3]+{0,1,2,3}]
%endif ; mmx/sse2/ssse3/sse4
%else ; %3 == 8, i.e. filterSize == 8 scaling
    ; load 2x8 or 4x8 source pixels into m0, m1, m4 and m5
    mov32      pos0q, dword [fltposq+wq*2+0]    ; filterPos[0]
    mov32      pos1q, dword [fltposq+wq*2+4]    ; filterPos[1]
    movbh         m0, [srcq+ pos0q   *srcmul]   ; src[filterPos[0] + {0,1,2,3,4,5,6,7}]
%if mmsize == 8
    movbh         m1, [srcq+(pos0q+4)*srcmul]   ; src[filterPos[0] + {4,5,6,7}]
    movbh         m4, [srcq+ pos1q   *srcmul]   ; src[filterPos[1] + {0,1,2,3}]
    movbh         m5, [srcq+(pos1q+4)*srcmul]   ; src[filterPos[1] + {4,5,6,7}]
%else ; mmsize == 16
    movbh         m1, [srcq+ pos1q   *srcmul]   ; src[filterPos[1] + {0,1,2,3,4,5,6,7}]
    mov32      pos0q, dword [fltposq+wq*2+8]    ; filterPos[2]
    mov32      pos1q, dword [fltposq+wq*2+12]   ; filterPos[3]
    movbh         m4, [srcq+ pos0q   *srcmul]   ; src[filterPos[2] + {0,1,2,3,4,5,6,7}]
    movbh         m5, [srcq+ pos1q   *srcmul]   ; src[filterPos[3] + {0,1,2,3,4,5,6,7}]
%endif ; mmsize == 8/16
%if %1 == 8
    punpcklbw     m0, m3                        ; byte -> word
    punpcklbw     m1, m3                        ; byte -> word
    punpcklbw     m4, m3                        ; byte -> word
    punpcklbw     m5, m3                        ; byte -> word
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
    psubw         m4, m6
    psubw         m5, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq+wq*8+mmsize*0]   ; *= filter[{0,1,..,6,7}]
    pmaddwd       m1, [filterq+wq*8+mmsize*1]   ; *= filter[{8,9,..,14,15}]
    pmaddwd       m4, [filterq+wq*8+mmsize*2]   ; *= filter[{16,17,..,22,23}]
    pmaddwd       m5, [filterq+wq*8+mmsize*3]   ; *= filter[{24,25,..,30,31}]

    ; add up horizontally (8 srcpix * 8 coefficients -> 1 dstpix)
%if mmsize == 8
    paddd         m0, m1
    paddd         m4, m5
    movq          m1, m0
    punpckldq     m0, m4
    punpckhdq     m1, m4
    paddd         m0, m1
%elif notcpuflag(ssse3) ; sse2
    ; mex is a scratch register: m3 holds zero when %1 == 8 and m6 holds
    ; minshort when %1 == 16, so pick whichever of the two is not live
%if %1 == 8
%define mex m6
%else
%define mex m3
%endif
    ; emulate horizontal add as transpose + vertical add
    mova         mex, m0
    punpckldq     m0, m1
    punpckhdq    mex, m1
    paddd         m0, mex
    mova          m1, m4
    punpckldq     m4, m5
    punpckhdq     m1, m5
    paddd         m4, m1
    mova          m1, m0
    punpcklqdq    m0, m4
    punpckhqdq    m1, m4
    paddd         m0, m1
%else ; ssse3/sse4
    ; FIXME if we rearrange the filter in pairs of 4, we can
    ; load pixels likewise and use 2 x paddd + phaddd instead
    ; of 3 x phaddd here, faster on older cpus
    phaddd        m0, m1
    phaddd        m4, m5
    phaddd        m0, m4                        ; filter[{ 0, 1,..., 6, 7}]*src[filterPos[0]+{0,1,...,6,7}],
                                                ; filter[{ 8, 9,...,14,15}]*src[filterPos[1]+{0,1,...,6,7}],
                                                ; filter[{16,17,...,22,23}]*src[filterPos[2]+{0,1,...,6,7}],
                                                ; filter[{24,25,...,30,31}]*src[filterPos[3]+{0,1,...,6,7}]
%endif ; mmx/sse2/ssse3/sse4
%endif ; %3 == 4/8

%else ; %3 == X, i.e. any filterSize scaling

%ifidn %4, X4
%define dlt 4
%else ; %4 == X || %4 == X8
%define dlt 0
%endif ; %4 ==/!= X4
    ; On x86-32 pos1q aliases dstq and the source end pointer is kept in the
    ; r6m memory slot, so dst is parked in dstmp below and reloaded before the
    ; store at the end of each outer iteration.
%if ARCH_X86_64
%define srcq    r8
%define pos1q   r7
%define srcendq r9
    movsxd  fltsizeq, fltsized                  ; filterSize
    lea      srcendq, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
%else ; x86-32
%define srcq    srcmemq
%define pos1q   dstq
%define srcendq r6m
    lea        pos0q, [srcmemq+(fltsizeq-dlt)*srcmul] ; &src[filterSize&~4]
    mov      srcendq, pos0q
%endif ; x86-32/64
    lea      fltposq, [fltposq+wq*4]
%if %2 == 15
    lea         dstq, [dstq+wq*2]
%else ; %2 == 19
    lea         dstq, [dstq+wq*4]
%endif ; %2 == 15/19
    movifnidn  dstmp, dstq
    neg           wq

.loop:
    mov32      pos0q, dword [fltposq+wq*4+0]    ; filterPos[0]
    mov32      pos1q, dword [fltposq+wq*4+4]    ; filterPos[1]
    ; FIXME maybe do 4px/iteration on x86-64 (x86-32 wouldn't have enough regs)?
    pxor          m4, m4                        ; accumulator for dstpx[0]
    pxor          m5, m5                        ; accumulator for dstpx[1]
    mov         srcq, srcmemmp

.innerloop:
    ; load 2x4 (mmx) or 2x8 (sse) source pixels into m0/m1 -> m4/m5
    movbh         m0, [srcq+ pos0q     *srcmul] ; src[filterPos[0] + {0,1,2,3(,4,5,6,7)}]
    movbh         m1, [srcq+(pos1q+dlt)*srcmul] ; src[filterPos[1] + {0,1,2,3(,4,5,6,7)}]
%if %1 == 8
    punpcklbw     m0, m3
    punpcklbw     m1, m3
%endif ; %1 == 8

    ; multiply
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
    psubw         m1, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq]                 ; filter[{0,1,2,3(,4,5,6,7)}]
    pmaddwd       m1, [filterq+(fltsizeq+dlt)*2]; filter[filtersize+{0,1,2,3(,4,5,6,7)}]
    paddd         m4, m0
    paddd         m5, m1
    add      filterq, mmsize
    add         srcq, srcmul*mmsize/2
    cmp         srcq, srcendq                   ; while (src += 4) < &src[filterSize]
    jl .innerloop

%ifidn %4, X4
    mov32      pos1q, dword [fltposq+wq*4+4]    ; filterPos[1]
    movlh         m0, [srcq+ pos0q     *srcmul] ; split last 4 srcpx of dstpx[0]
    sub        pos1q, fltsizeq                  ; and first 4 srcpx of dstpx[1]
%if %1 > 8
    movhps        m0, [srcq+(pos1q+dlt)*srcmul]
%else ; %1 == 8
    movd          m1, [srcq+(pos1q+dlt)*srcmul]
    punpckldq     m0, m1
%endif ; %1 == 8
%if %1 == 8
    punpcklbw     m0, m3
%endif ; %1 == 8
%if %1 == 16 ; pmaddwd needs signed adds, so this moves unsigned -> signed, we'll
             ; add back 0x8000 * sum(coeffs) after the horizontal add
    psubw         m0, m6
%endif ; %1 == 16
    pmaddwd       m0, [filterq]
%endif ; %4 == X4

    lea      filterq, [filterq+(fltsizeq+dlt)*2]

    ; add up horizontally: reduce the m4/m5 accumulators to one dword per dstpx
%if mmsize == 8 ; mmx
    movq          m0, m4
    punpckldq     m4, m5
    punpckhdq     m0, m5
    paddd         m0, m4
%else ; mmsize == 16
%if notcpuflag(ssse3) ; sse2
    mova          m1, m4
    punpcklqdq    m4, m5
    punpckhqdq    m1, m5
    paddd         m4, m1
%else ; ssse3/sse4
    phaddd        m4, m5
%endif ; sse2/ssse3/sse4
%ifidn %4, X4
    paddd         m4, m0
%endif ; %4 == X4
%if notcpuflag(ssse3) ; sse2
    pshufd        m4, m4, 11011000b
    movhlps       m0, m4
    paddd         m0, m4
%else ; ssse3/sse4
    phaddd        m4, m4
    SWAP           0, 4
%endif ; sse2/ssse3/sse4
%endif ; mmsize == 8/16
%endif ; %3 ==/!= X

%if %1 == 16 ; add 0x8000 * sum(coeffs), i.e. back from signed -> unsigned
    paddd         m0, m7
%endif ; %1 == 16

    ; clip, store
    psrad         m0, 14 + %1 - %2
%ifidn %3, X
    movifnidn   dstq, dstmp
%endif ; %3 == X
%if %2 == 15
    packssdw      m0, m0
%ifnidn %3, X
    movh [dstq+wq*(2>>wshr)], m0
%else ; %3 == X
    movd [dstq+wq*2], m0
%endif ; %3 ==/!= X
%else ; %2 == 19
%if mmsize == 8
    PMINSD_MMX    m0, m2, m4
%elif cpuflag(sse4)
    pminsd        m0, m2
%else ; sse2/ssse3
    cvtdq2ps      m0, m0
    minps         m0, m2
    cvtps2dq      m0, m0
%endif ; mmx/sse2/ssse3/sse4
%ifnidn %3, X
    mova [dstq+wq*(4>>wshr)], m0
%else ; %3 == X
    movq [dstq+wq*4], m0
%endif ; %3 ==/!= X
%endif ; %2 == 15/19
%ifnidn %3, X
    add           wq, (mmsize<<wshr)/4          ; both 8tap and 4tap really only do 4 pixels (or for mmx: 2 pixels)
                                                ; per iteration. see "shl wq,1" above as for why we do this
%else ; %3 == X
    add           wq, 2
%endif ; %3 ==/!= X
    jl .loop
    REP_RET
%endmacro
391
 
392
; SCALE_FUNCS source_width, intermediate_nbits, n_xmm
;
; Instantiates every filter-size variant of SCALE_FUNC for one combination of
; source width and intermediate bit depth:
;   - the fixed 4-tap and 8-tap functions (6 register arguments), and
;   - the generic-filterSize functions (7 register arguments): a single "X"
;     variant when mmsize == 8, or the "X4" and "X8" pair otherwise.
%macro SCALE_FUNCS 3
SCALE_FUNC %1, %2, 4, 4,  6, %3
SCALE_FUNC %1, %2, 8, 8,  6, %3
%if mmsize == 8
SCALE_FUNC %1, %2, X, X,  7, %3
%else
SCALE_FUNC %1, %2, X, X4, 7, %3
SCALE_FUNC %1, %2, X, X8, 7, %3
%endif
%endmacro
403
 
404
; SCALE_FUNCS2 8_xmm_args, 9to10_xmm_args, 16_xmm_args
;
; Instantiates SCALE_FUNCS for every supported source width (8, 9, 10, 12, 14
; and 16 bits) and both intermediate depths (15 and 19 bits). The three
; parameters are the xmm register counts to use for 8-bit input, for 9- to
; 14-bit input, and for 16-bit input respectively.
;
; The 15-bit output functions are skipped when building for sse4: the only
; sse4-specific code in SCALE_FUNC is in the 19-bit clipping path, so the
; 15-bit sse4 functions would not differ from the ssse3 ones.
%macro SCALE_FUNCS2 3
%if notcpuflag(sse4)
SCALE_FUNCS  8, 15, %1
SCALE_FUNCS  9, 15, %2
SCALE_FUNCS 10, 15, %2
SCALE_FUNCS 12, 15, %2
SCALE_FUNCS 14, 15, %2
SCALE_FUNCS 16, 15, %3
%endif ; !sse4
SCALE_FUNCS  8, 19, %1
SCALE_FUNCS  9, 19, %2
SCALE_FUNCS 10, 19, %2
SCALE_FUNCS 12, 19, %2
SCALE_FUNCS 14, 19, %2
SCALE_FUNCS 16, 19, %3
%endmacro
421
 
422
; Instantiate the scaler functions once per instruction set.
; The MMX versions are only built for 32-bit x86.
%if ARCH_X86_32
INIT_MMX mmx
SCALE_FUNCS2 0, 0, 0
%endif
INIT_XMM sse2
SCALE_FUNCS2 6, 7, 8
INIT_XMM ssse3
SCALE_FUNCS2 6, 6, 8
INIT_XMM sse4
SCALE_FUNCS2 6, 6, 8