Subversion Repositories Kolibri OS

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
6147 serge 1
;******************************************************************************
2
;* Copyright (c) 2012 Michael Niedermayer
3
;*
4
;* This file is part of FFmpeg.
5
;*
6
;* FFmpeg is free software; you can redistribute it and/or
7
;* modify it under the terms of the GNU Lesser General Public
8
;* License as published by the Free Software Foundation; either
9
;* version 2.1 of the License, or (at your option) any later version.
10
;*
11
;* FFmpeg is distributed in the hope that it will be useful,
12
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
13
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
14
;* Lesser General Public License for more details.
15
;*
16
;* You should have received a copy of the GNU Lesser General Public
17
;* License along with FFmpeg; if not, write to the Free Software
18
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
19
;******************************************************************************
20
 
21
%include "libavutil/x86/x86util.asm"
22
 
23
SECTION_RODATA 32
; Scale factors, each replicated into 8 dwords (32 bytes) so that one label
; can feed a full-width load at any vector size used in this file.
flt2pm31:         times 8 dd 4.6566129e-10      ; 2^-31
flt2p31:          times 8 dd 2147483648.0       ; 2^31
flt2p15:          times 8 dd 32768.0            ; 2^15

; pshufb control: even-indexed 16-bit words go to the low 8 bytes,
; odd-indexed words to the high 8 bytes.
word_unpack_shuf: db  0,  1,  4,  5,  8,  9, 12, 13
                  db  2,  3,  6,  7, 10, 11, 14, 15
29
 
30
SECTION .text
31
 
32
 
33
; Macro arguments: to, from, a/u (aligned/unaligned), log2_outsize, log2_insize, conversion macro, init macro
34
;------------------------------------------------------------------------------
; PACK_2CH to, from, a/u, log2_outsize, log2_insize, conv, init
;
; Generates pack_2ch_<from>_to_<to>_<a|u>: two planar input channels are read,
; interleaved sample by sample and converted with the `conv` macro; `init` is
; expanded once in front of the loop.
;   %3     : `a` uses aligned moves and diverts to the `u` entry point when any
;            pointer is misaligned, so the `u` variant has to be instantiated
;            too; `u` uses unaligned moves throughout.
;   %4, %5 : log2 of the output / input sample size in bytes.
; dst and src point to arrays of channel pointers; len is scaled by the sample
; size, i.e. it counts samples per channel.
;------------------------------------------------------------------------------
%macro PACK_2CH 5-7
cglobal pack_2ch_%2_to_%1_%3, 3, 4, 6, dst, src, len, src2
    mov         src2q, [srcq+gprsize]           ; second input channel
    mov         srcq,  [srcq]                   ; first input channel
    mov         dstq,  [dstq]                   ; interleaved output
%ifidn %3, a
    ; Fall back to the unaligned body unless every pointer is mmsize-aligned.
    test        dstq,  mmsize-1
    jne         pack_2ch_%2_to_%1_u_int %+ SUFFIX
    test        srcq,  mmsize-1
    jne         pack_2ch_%2_to_%1_u_int %+ SUFFIX
    test        src2q, mmsize-1
    jne         pack_2ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_2ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    ; Move every pointer to the end of its buffer; len then runs from -len
    ; up to zero and doubles as the (negative) element index.
    lea         srcq,  [srcq  + (1<<%5)*lenq]
    lea         src2q, [src2q + (1<<%5)*lenq]
    lea         dstq,  [dstq  + (2<<%4)*lenq]
    neg         lenq
    %7 m0,m1,m2,m3,m4,m5
.next:
%if %4 >= %5
    ; Output at least as wide as the input: interleave first, convert after.
    mov%3       m0, [srcq  + (1<<%5)*lenq]
    mova        m1, m0
    mov%3       m2, [src2q + (1<<%5)*lenq]
%if %5 == 1
    punpcklwd   m0, m2
    punpckhwd   m1, m2
%else
    punpckldq   m0, m2
    punpckhdq   m1, m2
%endif
    %6 m0,m1,m2,m3,m4,m5
%else
    ; Output narrower than the input: convert two vectors per channel, then
    ; interleave the results as 16-bit words.
    mov%3       m0, [         srcq  + (1<<%5)*lenq]
    mov%3       m1, [mmsize + srcq  + (1<<%5)*lenq]
    mov%3       m2, [         src2q + (1<<%5)*lenq]
    mov%3       m3, [mmsize + src2q + (1<<%5)*lenq]
    %6 m0,m1,m2,m3,m4,m5
    mova        m2, m0
    punpcklwd   m0, m1
    punpckhwd   m2, m1
    SWAP 1,2
%endif
    mov%3       [           dstq + (2<<%4)*lenq], m0
    mov%3       [  mmsize + dstq + (2<<%4)*lenq], m1
%if %4 > %5
    ; Widening conversions produce four output vectors per iteration.
    mov%3       [2*mmsize + dstq + (2<<%4)*lenq], m2
    mov%3       [3*mmsize + dstq + (2<<%4)*lenq], m3
    add         lenq, 4*mmsize/(2<<%4)
%else
    add         lenq, 2*mmsize/(2<<%4)
%endif
    jl          .next
    REP_RET
%endmacro
90
 
91
;------------------------------------------------------------------------------
; UNPACK_2CH to, from, a/u, log2_outsize, log2_insize, conv, init
;
; Generates unpack_2ch_<from>_to_<to>_<a|u>: one interleaved two-channel input
; is split into two planar outputs, the samples being converted with `conv`.
; `init` is expanded once in front of the loop.
;   %3     : `a` uses aligned moves and diverts to the `u` entry point when any
;            pointer is misaligned (the `u` variant must exist); `u` uses
;            unaligned moves.
;   %4, %5 : log2 of the output / input sample size in bytes.
; dst and src point to arrays of channel pointers; len counts samples per
; channel.
;------------------------------------------------------------------------------
%macro UNPACK_2CH 5-7
cglobal unpack_2ch_%2_to_%1_%3, 3, 4, 7, dst, src, len, dst2
    mov         dst2q, [dstq+gprsize]           ; second output channel
    mov         srcq,  [srcq]                   ; interleaved input
    mov         dstq,  [dstq]                   ; first output channel
%ifidn %3, a
    test        dstq,  mmsize-1
    jne         unpack_2ch_%2_to_%1_u_int %+ SUFFIX
    test        srcq,  mmsize-1
    jne         unpack_2ch_%2_to_%1_u_int %+ SUFFIX
    test        dst2q, mmsize-1
    jne         unpack_2ch_%2_to_%1_u_int %+ SUFFIX
%else
unpack_2ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    ; Pointers are moved to the buffer ends; len counts up from -len to zero.
    lea         srcq,  [srcq  + (2<<%5)*lenq]
    lea         dstq,  [dstq  + (1<<%4)*lenq]
    lea         dst2q, [dst2q + (1<<%4)*lenq]
    neg         lenq
    %7 m0,m1,m2,m3,m4,m5
    ; Loaded for every variant; only the SSSE3 16-bit path below reads m6.
    mova        m6, [word_unpack_shuf]
.next:
    mov%3       m0, [           srcq + (2<<%5)*lenq]
    mov%3       m2, [  mmsize + srcq + (2<<%5)*lenq]
%if %5 == 1
%ifidn SUFFIX, _ssse3
    ; 16-bit input, SSSE3: sort each vector into even/odd words with pshufb,
    ; then merge the matching halves of both vectors.
    pshufb      m0, m6
    mova        m1, m0
    pshufb      m2, m6
    punpcklqdq  m0, m2
    punpckhqdq  m1, m2
%else
    ; 16-bit input without pshufb: three rounds of word unpacking.
    mova        m1, m0
    punpcklwd   m0, m2
    punpckhwd   m1, m2

    mova        m2, m0
    punpcklwd   m0, m1
    punpckhwd   m2, m1

    mova        m1, m0
    punpcklwd   m0, m2
    punpckhwd   m1, m2
%endif
%else
    ; 32-bit input: shufps picks the even dwords for m0, the odd ones for m1.
    mova        m1, m0
    shufps      m0, m2, 10001000b
    shufps      m1, m2, 11011101b
%endif
%if %4 < %5
    ; Narrowing conversions consume a second pair of input vectors.
    mov%3       m2, [2*mmsize + srcq + (2<<%5)*lenq]
    mova        m3, m2
    mov%3       m4, [3*mmsize + srcq + (2<<%5)*lenq]
    shufps      m2, m4, 10001000b
    shufps      m3, m4, 11011101b
    SWAP 1,2
%endif
    %6 m0,m1,m2,m3,m4,m5
    mov%3       [          dstq  + (1<<%4)*lenq], m0
%if %4 > %5
    ; Widening conversions yield two vectors per channel.
    mov%3       [          dst2q + (1<<%4)*lenq], m2
    mov%3       [ mmsize + dstq  + (1<<%4)*lenq], m1
    mov%3       [ mmsize + dst2q + (1<<%4)*lenq], m3
    add         lenq, 2*mmsize/(1<<%4)
%else
    mov%3       [          dst2q + (1<<%4)*lenq], m1
    add         lenq, mmsize/(1<<%4)
%endif
    jl          .next
    REP_RET
%endmacro
162
 
163
;------------------------------------------------------------------------------
; CONV to, from, a/u, log2_outsize, log2_insize, conv, init
;
; Generates <from>_to_<to>_<a|u>: converts a single channel from one sample
; format to another with the `conv` macro; `init` is expanded once in front of
; the loop.
;   %3     : `a` uses aligned moves and diverts to the `u` entry point when a
;            pointer is misaligned (the `u` variant must exist); `u` uses
;            unaligned moves.
;   %4, %5 : log2 of the output / input sample size in bytes.
; dst and src point to arrays of channel pointers, of which only the first
; entry is read; len counts samples.
;------------------------------------------------------------------------------
%macro CONV 5-7
cglobal %2_to_%1_%3, 3, 3, 6, dst, src, len
    mov         srcq, [srcq]
    mov         dstq, [dstq]
%ifidn %3, a
    test        dstq, mmsize-1
    jne         %2_to_%1_u_int %+ SUFFIX
    test        srcq, mmsize-1
    jne         %2_to_%1_u_int %+ SUFFIX
%else
%2_to_%1_u_int %+ SUFFIX:
%endif
    ; Pointers are moved to the buffer ends; len counts up from -len to zero.
    lea         srcq, [srcq + (1<<%5)*lenq]
    lea         dstq, [dstq + (1<<%4)*lenq]
    neg         lenq
    %7 m0,m1,m2,m3,m4,m5
.next:
    mov%3       m0, [           srcq + (1<<%5)*lenq]
    mov%3       m1, [  mmsize + srcq + (1<<%5)*lenq]
%if %4 < %5
    ; Narrowing conversions read four input vectors per iteration.
    mov%3       m2, [2*mmsize + srcq + (1<<%5)*lenq]
    mov%3       m3, [3*mmsize + srcq + (1<<%5)*lenq]
%endif
    %6 m0,m1,m2,m3,m4,m5
    mov%3       [           dstq + (1<<%4)*lenq], m0
    mov%3       [  mmsize + dstq + (1<<%4)*lenq], m1
%if %4 > %5
    ; Widening conversions write four output vectors per iteration.
    mov%3       [2*mmsize + dstq + (1<<%4)*lenq], m2
    mov%3       [3*mmsize + dstq + (1<<%4)*lenq], m3
    add         lenq, 4*mmsize/(1<<%4)
%else
    add         lenq, 2*mmsize/(1<<%4)
%endif
    jl          .next
%if mmsize == 8
    ; MMX build: leave MMX state before returning.
    emms
    RET
%else
    REP_RET
%endif
%endmacro
204
 
205
;------------------------------------------------------------------------------
; PACK_6CH to, from, a/u, log2_outsize, log2_insize, conv, init
;
; Generates pack_6ch_<from>_to_<to>_<a|u>: six planar input channels are
; interleaved into one output buffer.  On the SSE path the samples are passed
; through `conv`; the MMX path only interleaves.  Every instantiation in this
; file uses 4-byte samples on both sides.
;   %3 : `a` uses aligned moves and diverts to the `u` entry point when any
;        pointer is misaligned (the `u` variant must exist); `u` uses
;        unaligned moves.
; dst and src point to arrays of channel pointers; the third argument is the
; number of samples per channel.
;
; Register budget: `init` is handed m7 as the home of its scale constant and
; `conv` reads it back on every iteration, so eight vector registers (m0-m7)
; are requested from cglobal.  Requesting only seven would leave xmm7 outside
; the set the x86inc prologue preserves on ABIs where xmm6-xmm15 are
; callee-saved (Win64), so the int32<->float variants would clobber the
; caller's xmm7.
;------------------------------------------------------------------------------
%macro PACK_6CH 5-7
cglobal pack_6ch_%2_to_%1_%3, 2, 8, 8, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov         lend, r2d
%else
    ; No register left for the counter on x86-32: use the stack argument.
    %define lend dword r2m
%endif
    mov         src1q, [srcq+1*gprsize]
    mov         src2q, [srcq+2*gprsize]
    mov         src3q, [srcq+3*gprsize]
    mov         src4q, [srcq+4*gprsize]
    mov         src5q, [srcq+5*gprsize]
    mov         srcq,  [srcq]
    mov         dstq,  [dstq]
%ifidn %3, a
    ; Fall back to the unaligned body unless every pointer is mmsize-aligned.
    test        dstq,  mmsize-1
    jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        srcq,  mmsize-1
    jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        src1q, mmsize-1
    jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        src2q, mmsize-1
    jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        src3q, mmsize-1
    jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        src4q, mmsize-1
    jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        src5q, mmsize-1
    jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_6ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    ; Keep channels 1-5 as offsets from channel 0 so that advancing srcq
    ; alone moves all six read positions.
    sub         src1q, srcq
    sub         src2q, srcq
    sub         src3q, srcq
    sub         src4q, srcq
    sub         src5q, srcq
    %7 x,x,x,x,m7,x
.loop:
    mov%3       m0, [srcq      ]
    mov%3       m1, [srcq+src1q]
    mov%3       m2, [srcq+src2q]
    mov%3       m3, [srcq+src3q]
    mov%3       m4, [srcq+src4q]
    mov%3       m5, [srcq+src5q]
%if cpuflag(sse)
    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

    ; Regroup the three butterflied pairs into six output vectors; they end
    ; up in m0, m6, m4, m1, m2, m5 (the order of the stores below).
%if cpuflag(avx)
    blendps     m6, m4, m0, 1100b
%else
    movaps      m6, m4
    shufps      m4, m0, q3210
    SWAP 4,6
%endif
    movlhps     m0, m2
    movhlps     m4, m2
%if cpuflag(avx)
    blendps     m2, m5, m1, 1100b
%else
    movaps      m2, m5
    shufps      m5, m1, q3210
    SWAP 2,5
%endif
    movlhps     m1, m3
    movhlps     m5, m3

    ; m7 carries the constant set up by `init`; m3 is free to act as scratch.
    %6 m0,m6,x,x,m7,m3
    %6 m4,m1,x,x,m7,m3
    %6 m2,m5,x,x,m7,m3

    mov %+ %3 %+ ps [dstq   ], m0
    mov %+ %3 %+ ps [dstq+16], m6
    mov %+ %3 %+ ps [dstq+32], m4
    mov %+ %3 %+ ps [dstq+48], m1
    mov %+ %3 %+ ps [dstq+64], m2
    mov %+ %3 %+ ps [dstq+80], m5
%else ; mmx
    ; Two samples per channel per iteration; no conversion on this path.
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6

    movq        [dstq   ], m0
    movq        [dstq+ 8], m2
    movq        [dstq+16], m4
    movq        [dstq+24], m1
    movq        [dstq+32], m3
    movq        [dstq+40], m5
%endif
    add         srcq, mmsize
    add         dstq, mmsize*6
    sub         lend, mmsize/4                  ; 4-byte samples per vector
    jg          .loop
%if mmsize == 8
    ; MMX build: leave MMX state before returning.
    emms
    RET
%else
    REP_RET
%endif
%endmacro
307
 
308
;------------------------------------------------------------------------------
; UNPACK_6CH to, from, a/u, log2_outsize, log2_insize, conv, init
;
; Generates unpack_6ch_<from>_to_<to>_<a|u>: one interleaved six-channel input
; is split into six planar outputs, the samples being passed through `conv`.
; `init` is expanded once in front of the loop and keeps its constant in m7.
;   %3 : `a` uses aligned moves and diverts to the `u` entry point when any
;        pointer is misaligned (the `u` variant must exist); `u` uses
;        unaligned moves.
; The input offsets are hard-coded in 16-byte steps, matching the INIT_XMM
; instantiations in this file.  dst and src point to arrays of channel
; pointers; the third argument is the number of samples per channel.
;------------------------------------------------------------------------------
%macro UNPACK_6CH 5-7
cglobal unpack_6ch_%2_to_%1_%3, 2, 8, 8, dst, src, dst1, dst2, dst3, dst4, dst5, len
%if ARCH_X86_64
    mov         lend, r2d
%else
    ; No register left for the counter on x86-32: use the stack argument.
    %define lend dword r2m
%endif
    mov         dst1q, [dstq+1*gprsize]
    mov         dst2q, [dstq+2*gprsize]
    mov         dst3q, [dstq+3*gprsize]
    mov         dst4q, [dstq+4*gprsize]
    mov         dst5q, [dstq+5*gprsize]
    mov         dstq,  [dstq]
    mov         srcq,  [srcq]
%ifidn %3, a
    test        dstq,  mmsize-1
    jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        srcq,  mmsize-1
    jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        dst1q, mmsize-1
    jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        dst2q, mmsize-1
    jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        dst3q, mmsize-1
    jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        dst4q, mmsize-1
    jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        dst5q, mmsize-1
    jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
%else
unpack_6ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    ; Keep channels 1-5 as offsets from channel 0 so that advancing dstq
    ; alone moves all six write positions.
    sub         dst1q, dstq
    sub         dst2q, dstq
    sub         dst3q, dstq
    sub         dst4q, dstq
    sub         dst5q, dstq
    %7 x,x,x,x,m7,x
.loop:
    mov%3       m0, [srcq   ]
    mov%3       m1, [srcq+16]
    mov%3       m2, [srcq+32]
    mov%3       m3, [srcq+48]
    mov%3       m4, [srcq+64]
    mov%3       m5, [srcq+80]

    ; De-interleave: two butterfly rounds plus two register renames leave
    ; channel k in mk.
    SBUTTERFLYPS 0, 3, 6
    SBUTTERFLYPS 1, 4, 6
    SBUTTERFLYPS 2, 5, 6
    SBUTTERFLYPS 0, 4, 6
    SBUTTERFLYPS 3, 2, 6
    SBUTTERFLYPS 1, 5, 6
    SWAP 1, 4
    SWAP 2, 3

    ; m7 carries the constant set up by `init`; m6 acts as scratch.
    %6 m0,m1,x,x,m7,m6
    %6 m2,m3,x,x,m7,m6
    %6 m4,m5,x,x,m7,m6

    mov %+ %3 %+ ps [dstq      ], m0
    mov %+ %3 %+ ps [dstq+dst1q], m1
    mov %+ %3 %+ ps [dstq+dst2q], m2
    mov %+ %3 %+ ps [dstq+dst3q], m3
    mov %+ %3 %+ ps [dstq+dst4q], m4
    mov %+ %3 %+ ps [dstq+dst5q], m5

    add         srcq, mmsize*6
    add         dstq, mmsize
    sub         lend, mmsize/4                  ; 4-byte samples per vector
    jg          .loop
    REP_RET
%endmacro
380
 
381
; General-purpose registers requested by PACK_8CH: 10 on x86-64; on x86-32
; six, plus one more when HAVE_ALIGNED_STACK is set.
%define PACK_8CH_GPRS (10 * ARCH_X86_64) + ((6 + HAVE_ALIGNED_STACK) * ARCH_X86_32)

;------------------------------------------------------------------------------
; PACK_8CH to, from, a/u, log2_outsize, log2_insize, conv, init
;
; Generates pack_8ch_<from>_to_<to>_<a|u>: eight planar input channels are
; interleaved into one output buffer, the samples being passed through `conv`.
;   %3 : `a` uses aligned moves and diverts to the `u` entry point when any
;        pointer is misaligned (the `u` variant must exist); `u` uses
;        unaligned moves.
; dst and src point to arrays of channel pointers; the third argument is the
; number of samples per channel.
;
; x86-64 keeps every pointer in a register and the `init` constant in m9.
; x86-32 does not have enough registers: r0 is shared between dst, src1, src7
; (and src4 when HAVE_ALIGNED_STACK is 0), each of which is parked in memory
; (dstm or the stack slots at rsp+32/36/40) and reloaded right before use, and
; m9 is turned into a memory operand naming the scale constant directly.
;------------------------------------------------------------------------------
%macro PACK_8CH 5-7
cglobal pack_8ch_%2_to_%1_%3, 2, PACK_8CH_GPRS, 10, ARCH_X86_32*48, dst, src, len, src1, src2, src3, src4, src5, src6, src7
    mov         dstq, [dstq]
%if ARCH_X86_32
    DEFINE_ARGS dst, src, src2, src3, src4, src5, src6
    %define lend dword r2m
    %define src1q r0q
    %define src1m dword [rsp+32]
%if HAVE_ALIGNED_STACK == 0
    ; One register fewer: src4 joins the values that live in memory.
    DEFINE_ARGS dst, src, src2, src3, src5, src6
    %define src4q r0q
    %define src4m dword [rsp+36]
%endif
    %define src7q r0q
    %define src7m dword [rsp+40]
    mov         dstm, dstq                      ; park dst, r0 is reused below
%endif
    mov         src7q, [srcq+7*gprsize]
    mov         src6q, [srcq+6*gprsize]
%if ARCH_X86_32
    mov         src7m, src7q
%endif
    mov         src5q, [srcq+5*gprsize]
    mov         src4q, [srcq+4*gprsize]
    mov         src3q, [srcq+3*gprsize]
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
    mov         src4m, src4q
%endif
    mov         src2q, [srcq+2*gprsize]
    mov         src1q, [srcq+1*gprsize]
    mov         srcq,  [srcq]
%ifidn %3, a
    ; Fall back to the unaligned body unless every pointer is mmsize-aligned.
%if ARCH_X86_32
    test        dstmp, mmsize-1
%else
    test        dstq,  mmsize-1
%endif
    jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test        srcq,  mmsize-1
    jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test        src1q, mmsize-1
    jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test        src2q, mmsize-1
    jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test        src3q, mmsize-1
    jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
    test        src4m, mmsize-1
%else
    test        src4q, mmsize-1
%endif
    jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test        src5q, mmsize-1
    jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test        src6q, mmsize-1
    jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
%if ARCH_X86_32
    test        src7m, mmsize-1
%else
    test        src7q, mmsize-1
%endif
    jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_8ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    ; Keep channels 1-7 as offsets from channel 0 so that advancing srcq
    ; alone moves all eight read positions.
    sub         src1q, srcq
    sub         src2q, srcq
    sub         src3q, srcq
%if ARCH_X86_64 || HAVE_ALIGNED_STACK
    sub         src4q, srcq
%else
    sub         src4m, srcq
%endif
    sub         src5q, srcq
    sub         src6q, srcq
%if ARCH_X86_64
    sub         src7q, srcq
%else
    mov         src1m, src1q                    ; r0 still holds src1 here
    sub         src7m, srcq
%endif

%if ARCH_X86_64
    %7 x,x,x,x,m9,x
%elifidn %1, int32
    %define m9 [flt2p31]
%else
    %define m9 [flt2pm31]
%endif

.loop:
    mov%3       m0, [srcq      ]
    mov%3       m1, [srcq+src1q]
    mov%3       m2, [srcq+src2q]
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
    mov         src4q, src4m
%endif
    mov%3       m3, [srcq+src3q]
    mov%3       m4, [srcq+src4q]
    mov%3       m5, [srcq+src5q]
%if ARCH_X86_32
    mov         src7q, src7m
%endif
    mov%3       m6, [srcq+src6q]
    mov%3       m7, [srcq+src7q]

%if ARCH_X86_64
    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8

    %6 m0,m1,x,x,m9,m8
    %6 m2,m3,x,x,m9,m8
    %6 m4,m5,x,x,m9,m8
    %6 m6,m7,x,x,m9,m8

    mov%3       [dstq], m0
%else
    mov         dstq, dstm                      ; r0 becomes dst again

    ; Transpose with stack scratch space.  m2 serves as the temporary of the
    ; first conversion and is reloaded from [rsp] right after it.
    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, [rsp], [rsp+16], 1

    %6 m0,m1,x,x,m9,m2
    mova        m2, [rsp]
    mov%3       [dstq], m0
    ; m0 has been stored, so it is free to act as scratch from here on.
    %6 m2,m3,x,x,m9,m0
    %6 m4,m5,x,x,m9,m0
    %6 m6,m7,x,x,m9,m0

%endif

    mov%3       [dstq+16],  m1
    mov%3       [dstq+32],  m2
    mov%3       [dstq+48],  m3
    mov%3       [dstq+64],  m4
    mov%3       [dstq+80],  m5
    mov%3       [dstq+96],  m6
    mov%3       [dstq+112], m7

    add         srcq, mmsize
    add         dstq, mmsize*8
%if ARCH_X86_32
    ; Park dst again and bring src1 back into r0 for the next iteration.
    mov         dstm, dstq
    mov         src1q, src1m
%endif
    sub         lend, mmsize/4                  ; 4-byte samples per vector
    jg          .loop
    REP_RET
%endmacro
530
 
531
; INT16_TO_INT32_N: widen the 16-bit samples in m0 and m1 to 32 bits.
; The six macro arguments are accepted but not read; registers are fixed.
; Each sample is unpacked behind a zero word, i.e. it lands in the upper half
; of its dword (value << 16).
;   in : m0, m1
;   out: m0/m1 = low/high half of the old m0, m2/m3 = low/high half of the
;        old m1
; The SWAP renames m0 and m4, so the old m0 is read under the name m4 while a
; fresh m0 is built.
%macro INT16_TO_INT32_N 6
    pxor        m2, m2
    pxor        m3, m3
    punpcklwd   m2, m1
    punpckhwd   m3, m1
    SWAP 4,0
    pxor        m0, m0
    pxor        m1, m1
    punpcklwd   m0, m4
    punpckhwd   m1, m4
%endmacro
542
 
543
; INT32_TO_INT16_N: narrow the 32-bit samples in m0-m3 to 16 bits.
; The six macro arguments are accepted but not read; registers are fixed.
; Every dword is shifted right arithmetically by 16, then pairs of vectors are
; packed with signed saturation.
;   in : m0, m1, m2, m3
;   out: m0 = pack(m0, m1), m1 = pack(m2, m3)   (m1 via the final SWAP)
%macro INT32_TO_INT16_N 6
    psrad       m0, 16
    psrad       m1, 16
    psrad       m2, 16
    psrad       m3, 16
    packssdw    m0, m1
    packssdw    m2, m3
    SWAP 1,2
%endmacro
552
 
553
; INT32_TO_FLOAT_INIT: load the 2^-31 scale factor into the register passed
; as argument 5.
%macro INT32_TO_FLOAT_INIT 6
    mova        %5, [flt2pm31]
%endmacro

; INT32_TO_FLOAT_N: convert the int32 vectors in arguments 1 and 2 to float
; in place and multiply them by argument 5 (the constant loaded by the INIT
; macro).  Arguments 3, 4 and 6 are not read.
%macro INT32_TO_FLOAT_N 6
    cvtdq2ps    %1, %1
    cvtdq2ps    %2, %2
    mulps       %1, %1, %5
    mulps       %2, %2, %5
%endmacro
562
 
563
; FLOAT_TO_INT32_INIT: load the 2^31 scale factor into the register passed as
; argument 5.
%macro FLOAT_TO_INT32_INIT 6
    mova        %5, [flt2p31]
%endmacro

; FLOAT_TO_INT32_N: convert the float vectors in arguments 1 and 2 to int32
; in place.  Argument 5 is the scale constant, argument 6 a scratch register;
; arguments 3 and 4 are not read.
; After scaling, cvtps2dq produces the integers in the scratch register.  The
; cmpps with predicate 5 (not-less-than) then turns each lane of the scaled
; input into all ones where it is >= the scale constant, and adding that mask
; (-1) to the converted value lowers those lanes by one.
%macro FLOAT_TO_INT32_N 6
    mulps       %1, %5
    mulps       %2, %5
    cvtps2dq    %6, %1
    cmpps       %1, %1, %5, 5
    paddd       %1, %6
    cvtps2dq    %6, %2
    cmpps       %2, %2, %5, 5
    paddd       %2, %6
%endmacro
576
 
577
; INT16_TO_FLOAT_INIT: load the 2^-31 scale factor into m5.  The macro
; arguments are not read.  2^-31 is the matching factor because
; INT16_TO_INT32_N leaves each sample in the upper half of its dword.
%macro INT16_TO_FLOAT_INIT 6
    mova        m5, [flt2pm31]
%endmacro

; INT16_TO_FLOAT_N: widen the 16-bit samples in m0/m1 to four int32 vectors
; with INT16_TO_INT32_N, convert m0-m3 to float and scale them by m5.
%macro INT16_TO_FLOAT_N 6
    INT16_TO_INT32_N %1,%2,%3,%4,%5,%6
    cvtdq2ps    m0, m0
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
    cvtdq2ps    m3, m3
    mulps       m0, m0, m5
    mulps       m1, m1, m5
    mulps       m2, m2, m5
    mulps       m3, m3, m5
%endmacro
591
 
592
; FLOAT_TO_INT16_INIT: load the 2^15 scale factor into m5.  The macro
; arguments are not read.
%macro FLOAT_TO_INT16_INIT 6
    mova        m5, [flt2p15]
%endmacro

; FLOAT_TO_INT16_N: scale the float vectors m0-m3 by m5, convert them to
; int32 and pack them with signed saturation into 16-bit samples.
; The macro arguments are not read; registers are fixed.
;   in : m0, m1, m2, m3
;   out: m0 = pack(m0, m1), m1 = pack(m2, m3)
%macro FLOAT_TO_INT16_N 6
    mulps       m0, m5
    mulps       m1, m5
    mulps       m2, m5
    mulps       m3, m5
    cvtps2dq    m0, m0
    cvtps2dq    m1, m1
    packssdw    m0, m1
    cvtps2dq    m1, m2
    cvtps2dq    m3, m3
    packssdw    m1, m3
%endmacro
607
 
608
; NOP_N: empty stand-in for the conv/init macro arguments when no conversion
; or setup is wanted.  Accepts 0 to 6 arguments and expands to nothing.
%macro NOP_N 0-6
%endmacro
610
 
611
; MMX: int16 <-> int32 conversion and 6-channel float interleaving.
; Each `a` variant jumps into its `u` sibling, so `u` is emitted first.
INIT_MMX mmx
CONV       int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
CONV       int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
CONV       int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
CONV       int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

PACK_6CH   float, float, u, 2, 2, NOP_N, NOP_N
PACK_6CH   float, float, a, 2, 2, NOP_N, NOP_N
619
 
620
; SSE: 6-channel float interleaving and de-interleaving without conversion.
; Each `a` variant jumps into its `u` sibling, so `u` is emitted first.
INIT_XMM sse
PACK_6CH   float, float, u, 2, 2, NOP_N, NOP_N
PACK_6CH   float, float, a, 2, 2, NOP_N, NOP_N

UNPACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
UNPACK_6CH float, float, a, 2, 2, NOP_N, NOP_N
626
 
627
; SSE2: the full set of conversions, 2-channel pack/unpack, and the
; converting 6- and 8-channel variants.
; Each `a` variant jumps into its `u` sibling, so `u` is emitted first.
INIT_XMM sse2

; --- single channel, integer <-> integer ---
CONV       int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
CONV       int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
CONV       int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
CONV       int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

; --- two channels, integer <-> integer ---
PACK_2CH   int16, int16, u, 1, 1, NOP_N, NOP_N
PACK_2CH   int16, int16, a, 1, 1, NOP_N, NOP_N
PACK_2CH   int32, int32, u, 2, 2, NOP_N, NOP_N
PACK_2CH   int32, int32, a, 2, 2, NOP_N, NOP_N
PACK_2CH   int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
PACK_2CH   int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
PACK_2CH   int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
PACK_2CH   int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
UNPACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
UNPACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
UNPACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

; --- single channel, integer <-> float ---
CONV       float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV       float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV       int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV       int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV       float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
CONV       float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
CONV       int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
CONV       int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

; --- two channels, integer <-> float ---
PACK_2CH   float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_2CH   float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_2CH   int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_2CH   int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_2CH   float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
PACK_2CH   float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
PACK_2CH   int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
PACK_2CH   int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

UNPACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
UNPACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

; --- six channels, int32 <-> float ---
PACK_6CH   float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH   float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH   int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_6CH   int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

UNPACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

; --- eight channels ---
PACK_8CH   float, float, u, 2, 2, NOP_N, NOP_N
PACK_8CH   float, float, a, 2, 2, NOP_N, NOP_N

PACK_8CH   float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH   float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH   int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_8CH   int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
695
 
696
; SSSE3: 2-channel de-interleaving from 16-bit input, where UNPACK_2CH
; switches to its pshufb path.
; Each `a` variant jumps into its `u` sibling, so `u` is emitted first.
INIT_XMM ssse3
UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
703
 
704
; AVX, built only when HAVE_AVX_EXTERNAL is set: 128-bit 6- and 8-channel
; routines, followed by a 256-bit int32 -> float conversion.
; Each `a` variant jumps into its `u` sibling, so `u` is emitted first.
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
PACK_6CH   float, float, u, 2, 2, NOP_N, NOP_N
PACK_6CH   float, float, a, 2, 2, NOP_N, NOP_N

UNPACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
UNPACK_6CH float, float, a, 2, 2, NOP_N, NOP_N

PACK_6CH   float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH   float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH   int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_6CH   int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

UNPACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

PACK_8CH   float, float, u, 2, 2, NOP_N, NOP_N
PACK_8CH   float, float, a, 2, 2, NOP_N, NOP_N

PACK_8CH   float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH   float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH   int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_8CH   int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

INIT_YMM avx
CONV       float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV       float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
%endif
734
 
735
; AVX2, built only when HAVE_AVX2_EXTERNAL is set: 256-bit float -> int32
; conversion (FLOAT_TO_INT32_N uses the integer add paddd on full vectors).
; The `a` variant jumps into its `u` sibling, so `u` is emitted first.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
CONV       int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV       int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
%endif