;*****************************************************************************
;* x86-optimized Float DSP functions
;*
;* Copyright 2006 Loren Merritt
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;******************************************************************************

%include "x86util.asm"

SECTION .text

;-----------------------------------------------------------------------------
; void vector_fmul(float *dst, const float *src0, const float *src1, int len)
;-----------------------------------------------------------------------------
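; A rough scalar-C sketch of what the unrolled SIMD loop below computes (not
; FFmpeg's reference C implementation verbatim). Each pass handles 64 bytes,
; so len is assumed to be a multiple of 16 and the buffers suitably aligned:
;
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i];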
%macro VECTOR_FMUL 0
cglobal vector_fmul, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 64]
ALIGN 16
.loop:
%assign a 0
%rep 32/mmsize
    mova      m0,   [src0q + lenq + (a+0)*mmsize]
    mova      m1,   [src0q + lenq + (a+1)*mmsize]
    mulps     m0, m0, [src1q + lenq + (a+0)*mmsize]
    mulps     m1, m1, [src1q + lenq + (a+1)*mmsize]
    mova      [dstq + lenq + (a+0)*mmsize], m0
    mova      [dstq + lenq + (a+1)*mmsize], m1
%assign a a+2
%endrep

    sub       lenq, 64
    jge       .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmac_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
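; Multiply-accumulate of a vector by a scalar. A scalar-C sketch of the loop
; below (64 bytes per pass, so len is assumed to be a multiple of 16); the
; fma3 build fuses the multiply and add via fmaddps, while the sse/avx builds
; use separate mulps/addps pairs:
;
;     for (int i = 0; i < len; i++)
;         dst[i] += src[i] * mul;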

%macro VECTOR_FMAC_SCALAR 0
%if UNIX64
cglobal vector_fmac_scalar, 3,3,5, dst, src, len
%else
cglobal vector_fmac_scalar, 4,4,5, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSS m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    shufps      xm0, xm0, 0
%if cpuflag(avx)
    vinsertf128  m0, m0, xm0, 1
%endif
%endif
    lea    lenq, [lend*4-64]
.loop:
%if cpuflag(fma3)
    mova     m1,     [dstq+lenq]
    mova     m2,     [dstq+lenq+1*mmsize]
    fmaddps  m1, m0, [srcq+lenq], m1
    fmaddps  m2, m0, [srcq+lenq+1*mmsize], m2
%else ; cpuflag
    mulps    m1, m0, [srcq+lenq]
    mulps    m2, m0, [srcq+lenq+1*mmsize]
%if mmsize < 32
    mulps    m3, m0, [srcq+lenq+2*mmsize]
    mulps    m4, m0, [srcq+lenq+3*mmsize]
%endif ; mmsize
    addps    m1, m1, [dstq+lenq]
    addps    m2, m2, [dstq+lenq+1*mmsize]
%if mmsize < 32
    addps    m3, m3, [dstq+lenq+2*mmsize]
    addps    m4, m4, [dstq+lenq+3*mmsize]
%endif ; mmsize
%endif ; cpuflag
    mova  [dstq+lenq], m1
    mova  [dstq+lenq+1*mmsize], m2
%if mmsize < 32
    mova  [dstq+lenq+2*mmsize], m3
    mova  [dstq+lenq+3*mmsize], m4
%endif ; mmsize
    sub    lenq, 64
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMAC_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMAC_SCALAR
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMAC_SCALAR
%endif

;------------------------------------------------------------------------------
; void ff_vector_fmul_scalar(float *dst, const float *src, float mul, int len)
;------------------------------------------------------------------------------
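; Scalar-C sketch of the loop below (one mmsize block per pass, i.e. 4 floats
; for the SSE build, so len is assumed to be a multiple of 4):
;
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] * mul;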

%macro VECTOR_FMUL_SCALAR 0
%if UNIX64
cglobal vector_fmul_scalar, 3,3,2, dst, src, len
%else
cglobal vector_fmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    movss    m0, mulm
%elif WIN64
    SWAP 0, 2
%endif
    shufps   m0, m0, 0
    lea    lenq, [lend*4-mmsize]
.loop:
    mova     m1, [srcq+lenq]
    mulps    m1, m0
    mova  [dstq+lenq], m1
    sub    lenq, mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_SCALAR

;------------------------------------------------------------------------------
; void ff_vector_dmul_scalar(double *dst, const double *src, double mul,
;                            int len)
;------------------------------------------------------------------------------
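; Double-precision counterpart of the scalar multiply above. Scalar-C sketch
; (2*mmsize bytes per pass, so len is assumed to be a multiple of 4 for SSE2
; and of 8 for AVX):
;
;     for (int i = 0; i < len; i++)
;         dst[i] = src[i] * mul;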

%macro VECTOR_DMUL_SCALAR 0
%if ARCH_X86_32
cglobal vector_dmul_scalar, 3,4,3, dst, src, mul, len, lenaddr
    mov          lenq, lenaddrm
%elif UNIX64
cglobal vector_dmul_scalar, 3,3,3, dst, src, len
%else
cglobal vector_dmul_scalar, 4,4,3, dst, src, mul, len
%endif
%if ARCH_X86_32
    VBROADCASTSD   m0, mulm
%else
%if WIN64
    SWAP 0, 2
%endif
    movlhps       xm0, xm0
%if cpuflag(avx)
    vinsertf128   ym0, ym0, xm0, 1
%endif
%endif
    lea          lenq, [lend*8-2*mmsize]
.loop:
    mulpd          m1, m0, [srcq+lenq       ]
    mulpd          m2, m0, [srcq+lenq+mmsize]
    mova   [dstq+lenq       ], m1
    mova   [dstq+lenq+mmsize], m2
    sub          lenq, 2*mmsize
    jge .loop
    REP_RET
%endmacro

INIT_XMM sse2
VECTOR_DMUL_SCALAR
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_DMUL_SCALAR
%endif

;-----------------------------------------------------------------------------
; vector_fmul_window(float *dst, const float *src0,
;                    const float *src1, const float *win, int len);
;-----------------------------------------------------------------------------
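; Overlapped window multiply as used by MDCT-based audio codecs. Mirroring
; the pointer setup below (dst, src0 and win advanced by len and walked both
; forwards and backwards), a scalar-C sketch of one reading of the loop (not
; FFmpeg's reference C verbatim):
;
;     dst += len; win += len; src0 += len;
;     for (int i = -len, j = len - 1; i < 0; i++, j--) {
;         float s0 = src0[i], s1 = src1[j];
;         float wi = win[i],  wj = win[j];
;         dst[i] = s0 * wj - s1 * wi;
;         dst[j] = s0 * wi + s1 * wj;
;     }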
%macro VECTOR_FMUL_WINDOW 0
cglobal vector_fmul_window, 5, 6, 6, dst, src0, src1, win, len, len1
    shl     lend, 2
    lea    len1q, [lenq - mmsize]
    add    src0q, lenq
    add     dstq, lenq
    add     winq, lenq
    neg     lenq
.loop:
    mova      m0, [winq  + lenq]
    mova      m4, [src0q + lenq]
%if cpuflag(sse)
    mova      m1, [winq  + len1q]
    mova      m5, [src1q + len1q]
    shufps    m1, m1, 0x1b
    shufps    m5, m5, 0x1b
    mova      m2, m0
    mova      m3, m1
    mulps     m2, m4
    mulps     m3, m5
    mulps     m1, m4
    mulps     m0, m5
    addps     m2, m3
    subps     m1, m0
    shufps    m2, m2, 0x1b
%else
    pswapd    m1, [winq  + len1q]
    pswapd    m5, [src1q + len1q]
    mova      m2, m0
    mova      m3, m1
    pfmul     m2, m4
    pfmul     m3, m5
    pfmul     m1, m4
    pfmul     m0, m5
    pfadd     m2, m3
    pfsub     m1, m0
    pswapd    m2, m2
%endif
    mova      [dstq + lenq], m1
    mova      [dstq + len1q], m2
    sub       len1q, mmsize
    add       lenq,  mmsize
    jl .loop
%if mmsize == 8
    femms
%endif
    REP_RET
%endmacro

INIT_MMX 3dnowext
VECTOR_FMUL_WINDOW
INIT_XMM sse
VECTOR_FMUL_WINDOW

;-----------------------------------------------------------------------------
; vector_fmul_add(float *dst, const float *src0, const float *src1,
;                 const float *src2, int len)
;-----------------------------------------------------------------------------
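; Scalar-C sketch of the loop below (2*mmsize bytes per pass, so len is
; assumed to be a multiple of 8 for SSE and of 16 for AVX/FMA3):
;
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[i] + src2[i];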
%macro VECTOR_FMUL_ADD 0
cglobal vector_fmul_add, 5,5,4, dst, src0, src1, src2, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
    mova    m0,   [src0q + lenq]
    mova    m1,   [src0q + lenq + mmsize]
%if cpuflag(fma3)
    mova    m2,     [src2q + lenq]
    mova    m3,     [src2q + lenq + mmsize]
    fmaddps m0, m0, [src1q + lenq], m2
    fmaddps m1, m1, [src1q + lenq + mmsize], m3
%else
    mulps   m0, m0, [src1q + lenq]
    mulps   m1, m1, [src1q + lenq + mmsize]
    addps   m0, m0, [src2q + lenq]
    addps   m1, m1, [src2q + lenq + mmsize]
%endif
    mova    [dstq + lenq], m0
    mova    [dstq + lenq + mmsize], m1

    sub     lenq,   2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_ADD
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_ADD
%endif
%if HAVE_FMA3_EXTERNAL
INIT_YMM fma3
VECTOR_FMUL_ADD
%endif

;-----------------------------------------------------------------------------
; void vector_fmul_reverse(float *dst, const float *src0, const float *src1,
;                          int len)
;-----------------------------------------------------------------------------
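; Multiplies src0 by src1 read back to front. Scalar-C sketch of the loop
; below (2*mmsize bytes per pass, so len is assumed to be a multiple of 8
; for SSE and of 16 for AVX):
;
;     for (int i = 0; i < len; i++)
;         dst[i] = src0[i] * src1[len - 1 - i];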
%macro VECTOR_FMUL_REVERSE 0
cglobal vector_fmul_reverse, 4,4,2, dst, src0, src1, len
    lea       lenq, [lend*4 - 2*mmsize]
ALIGN 16
.loop:
%if cpuflag(avx)
    vmovaps     xmm0, [src1q + 16]
    vinsertf128 m0, m0, [src1q], 1
    vshufps     m0, m0, m0, q0123
    vmovaps     xmm1, [src1q + mmsize + 16]
    vinsertf128 m1, m1, [src1q + mmsize], 1
    vshufps     m1, m1, m1, q0123
%else
    mova    m0, [src1q]
    mova    m1, [src1q + mmsize]
    shufps  m0, m0, q0123
    shufps  m1, m1, q0123
%endif
    mulps   m0, m0, [src0q + lenq + mmsize]
    mulps   m1, m1, [src0q + lenq]
    mova    [dstq + lenq + mmsize], m0
    mova    [dstq + lenq], m1
    add     src1q, 2*mmsize
    sub     lenq,  2*mmsize
    jge     .loop
    REP_RET
%endmacro

INIT_XMM sse
VECTOR_FMUL_REVERSE
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
VECTOR_FMUL_REVERSE
%endif

; float scalarproduct_float_sse(const float *v1, const float *v2, int len)
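; Scalar-C sketch of the dot product below (16 bytes per pass, so len is
; assumed to be a multiple of 4; on x86_32 the result is returned on the
; x87 stack via the store/fld at the end):
;
;     float sum = 0.0f;
;     for (int i = 0; i < len; i++)
;         sum += v1[i] * v2[i];
;     return sum;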
INIT_XMM sse
cglobal scalarproduct_float, 3,3,2, v1, v2, offset
    shl   offsetd, 2
    add       v1q, offsetq
    add       v2q, offsetq
    neg   offsetq
    xorps    xmm0, xmm0
.loop:
    movaps   xmm1, [v1q+offsetq]
    mulps    xmm1, [v2q+offsetq]
    addps    xmm0, xmm1
    add   offsetq, 16
    js .loop
    movhlps  xmm1, xmm0
    addps    xmm0, xmm1
    movss    xmm1, xmm0
    shufps   xmm0, xmm0, 1
    addss    xmm0, xmm1
%if ARCH_X86_64 == 0
    movss     r0m,  xmm0
    fld dword r0m
%endif
    RET

;-----------------------------------------------------------------------------
; void ff_butterflies_float(float *src0, float *src1, int len);
;-----------------------------------------------------------------------------
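; In-place butterfly on the two buffers. Scalar-C sketch of the loop below
; (16 bytes per pass, so len is assumed to be a multiple of 4; len == 0 is
; handled by the early exit):
;
;     for (int i = 0; i < len; i++) {
;         float t  = src0[i] - src1[i];
;         src0[i] += src1[i];
;         src1[i]  = t;
;     }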
INIT_XMM sse
cglobal butterflies_float, 3,3,3, src0, src1, len
%if ARCH_X86_64
    movsxd    lenq, lend
%endif
    test      lenq, lenq
    jz .end
    shl       lenq, 2
    add      src0q, lenq
    add      src1q, lenq
    neg       lenq
.loop:
    mova        m0, [src0q + lenq]
    mova        m1, [src1q + lenq]
    subps       m2, m0, m1
    addps       m0, m0, m1
    mova        [src1q + lenq], m2
    mova        [src0q + lenq], m0
    add       lenq, mmsize
    jl .loop
.end:
    REP_RET