Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
6147 | serge | 1 | ;****************************************************************************** |
2 | ;* Copyright (c) 2012 Michael Niedermayer |
||
3 | ;* |
||
4 | ;* This file is part of FFmpeg. |
||
5 | ;* |
||
6 | ;* FFmpeg is free software; you can redistribute it and/or |
||
7 | ;* modify it under the terms of the GNU Lesser General Public |
||
8 | ;* License as published by the Free Software Foundation; either |
||
9 | ;* version 2.1 of the License, or (at your option) any later version. |
||
10 | ;* |
||
11 | ;* FFmpeg is distributed in the hope that it will be useful, |
||
12 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
13 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
14 | ;* Lesser General Public License for more details. |
||
15 | ;* |
||
16 | ;* You should have received a copy of the GNU Lesser General Public |
||
17 | ;* License along with FFmpeg; if not, write to the Free Software |
||
18 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
19 | ;****************************************************************************** |
||
20 | |||
21 | %include "libavutil/x86/x86util.asm" |
||
22 | |||
SECTION_RODATA 32

; Float scale factors; each label holds 8 identical dwords (32 bytes).
flt2pm31:         times 8 dd 4.6566129e-10     ; approximately 2^-31
flt2p31:          times 8 dd 2147483648.0      ; 2^31
flt2p15:          times 8 dd 32768.0           ; 2^15

; Byte-shuffle control: gathers words 0,2,4,6 into the low 8 bytes and
; words 1,3,5,7 into the high 8 bytes. Loaded by UNPACK_2CH for pshufb.
word_unpack_shuf: db  0, 1, 4, 5, 8, 9,12,13, 2, 3, 6, 7,10,11,14,15

SECTION .text
||
31 | |||
32 | |||
;------------------------------------------------------------------------------
; PACK_2CH to, from, a/u, log2_outsize, log2_insize, conv_macro, init_macro
;
; Emits pack_2ch_<from>_to_<to>_<a|u>(dst, src, len): reads two separate
; channel buffers (src[0], src[1]) and writes one interleaved buffer (dst[0]).
;   arg 1,2  format names; only used to build the symbol name
;   arg 3    a or u; picks mova/movu for the sample loads and stores
;   arg 4    log2 of the output sample size in bytes
;   arg 5    log2 of the input sample size in bytes
;   arg 6    conversion macro, run on m0..m5 once per iteration
;   arg 7    init macro, run once before the loop
; The "a" flavour jumps to the <...>_u_int label emitted by the "u" flavour
; when a pointer is not mmsize-aligned, so both must be instantiated under
; the same SUFFIX.
; NOTE(review): the loop consumes a fixed number of samples per iteration;
; presumably the caller guarantees len is a multiple of it - confirm.
;------------------------------------------------------------------------------
%macro PACK_2CH 5-7
cglobal pack_2ch_%2_to_%1_%3, 3, 4, 6, dst, src, len, src2
    mov         src2q, [srcq+gprsize]           ; src[1] (read before srcq is overwritten)
    mov         srcq,  [srcq]                   ; src[0]
    mov         dstq,  [dstq]                   ; dst[0]
%ifidn %3, a
    ; Fall back to the unaligned body if any pointer is misaligned.
    test        dstq,  mmsize-1
    jne         pack_2ch_%2_to_%1_u_int %+ SUFFIX
    test        srcq,  mmsize-1
    jne         pack_2ch_%2_to_%1_u_int %+ SUFFIX
    test        src2q, mmsize-1
    jne         pack_2ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_2ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    ; Point every buffer at its end and count a negative index up to zero.
    lea         srcq,  [srcq  + (1<<%5)*lenq]
    lea         src2q, [src2q + (1<<%5)*lenq]
    lea         dstq,  [dstq  + (2<<%4)*lenq]
    neg         lenq
    %7 m0,m1,m2,m3,m4,m5
.next:
%if %4 >= %5
    ; Output samples are at least as wide as input: interleave, then convert.
    mov%3       m0, [         srcq +(1<<%5)*lenq]
    mova        m1, m0
    mov%3       m2, [         src2q+(1<<%5)*lenq]
%if %5 == 1
    punpcklwd   m0, m2
    punpckhwd   m1, m2
%else
    punpckldq   m0, m2
    punpckhdq   m1, m2
%endif
    %6 m0,m1,m2,m3,m4,m5
%else
    ; Output samples are narrower: convert both channels, then interleave words.
    mov%3       m0, [         srcq +(1<<%5)*lenq]
    mov%3       m1, [mmsize + srcq +(1<<%5)*lenq]
    mov%3       m2, [         src2q+(1<<%5)*lenq]
    mov%3       m3, [mmsize + src2q+(1<<%5)*lenq]
    %6 m0,m1,m2,m3,m4,m5
    mova        m2, m0
    punpcklwd   m0, m1
    punpckhwd   m2, m1
    SWAP 1,2
%endif
    mov%3       [           dstq+(2<<%4)*lenq], m0
    mov%3       [  mmsize + dstq+(2<<%4)*lenq], m1
%if %4 > %5
    ; Widening conversion produced four output registers.
    mov%3       [2*mmsize + dstq+(2<<%4)*lenq], m2
    mov%3       [3*mmsize + dstq+(2<<%4)*lenq], m3
    add         lenq, 4*mmsize/(2<<%4)
%else
    add         lenq, 2*mmsize/(2<<%4)
%endif
    jl          .next
    REP_RET
%endmacro
||
90 | |||
;------------------------------------------------------------------------------
; UNPACK_2CH to, from, a/u, log2_outsize, log2_insize, conv_macro, init_macro
;
; Emits unpack_2ch_<from>_to_<to>_<a|u>(dst, src, len): reads one interleaved
; buffer (src[0]) and writes two separate channel buffers (dst[0], dst[1]).
; Arguments have the same meaning as for PACK_2CH. The "a" flavour jumps to
; the <...>_u_int label of the "u" flavour when a pointer is not
; mmsize-aligned.
; m6 always receives word_unpack_shuf; it is only consumed on the
; 16-bit-input path when SUFFIX is _ssse3.
;------------------------------------------------------------------------------
%macro UNPACK_2CH 5-7
cglobal unpack_2ch_%2_to_%1_%3, 3, 4, 7, dst, src, len, dst2
    mov         dst2q, [dstq+gprsize]           ; dst[1] (read before dstq is overwritten)
    mov         srcq,  [srcq]                   ; src[0]
    mov         dstq,  [dstq]                   ; dst[0]
%ifidn %3, a
    ; Fall back to the unaligned body if any pointer is misaligned.
    test        dstq,  mmsize-1
    jne         unpack_2ch_%2_to_%1_u_int %+ SUFFIX
    test        srcq,  mmsize-1
    jne         unpack_2ch_%2_to_%1_u_int %+ SUFFIX
    test        dst2q, mmsize-1
    jne         unpack_2ch_%2_to_%1_u_int %+ SUFFIX
%else
unpack_2ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    ; Point every buffer at its end and count a negative index up to zero.
    lea         srcq,  [srcq  + (2<<%5)*lenq]
    lea         dstq,  [dstq  + (1<<%4)*lenq]
    lea         dst2q, [dst2q + (1<<%4)*lenq]
    neg         lenq
    %7 m0,m1,m2,m3,m4,m5
    mova        m6, [word_unpack_shuf]
.next:
    mov%3       m0, [           srcq +(2<<%5)*lenq]
    mov%3       m2, [  mmsize + srcq +(2<<%5)*lenq]
%if %5 == 1
%ifidn SUFFIX, _ssse3
    ; pshufb sorts even/odd words into halves; qword unpacks join the halves.
    pshufb      m0, m6
    mova        m1, m0
    pshufb      m2, m6
    punpcklqdq  m0, m2
    punpckhqdq  m1, m2
%else
    ; Three rounds of word interleaving de-interleave the 16-bit samples.
    mova        m1, m0
    punpcklwd   m0, m2
    punpckhwd   m1, m2

    mova        m2, m0
    punpcklwd   m0, m1
    punpckhwd   m2, m1

    mova        m1, m0
    punpcklwd   m0, m2
    punpckhwd   m1, m2
%endif
%else
    ; 32-bit samples: m0 takes the even dwords, m1 the odd dwords.
    mova        m1, m0
    shufps      m0, m2, 10001000b
    shufps      m1, m2, 11011101b
%endif
%if %4 < %5
    ; Narrowing conversion: split a second pair of input registers as well.
    mov%3       m2, [2*mmsize + srcq +(2<<%5)*lenq]
    mova        m3, m2
    mov%3       m4, [3*mmsize + srcq +(2<<%5)*lenq]
    shufps      m2, m4, 10001000b
    shufps      m3, m4, 11011101b
    SWAP 1,2
%endif
    %6 m0,m1,m2,m3,m4,m5
    mov%3       [           dstq+(1<<%4)*lenq], m0
%if %4 > %5
    ; Widening conversion: two registers per output channel.
    mov%3       [          dst2q+(1<<%4)*lenq], m2
    mov%3       [ mmsize +  dstq+(1<<%4)*lenq], m1
    mov%3       [ mmsize + dst2q+(1<<%4)*lenq], m3
    add         lenq, 2*mmsize/(1<<%4)
%else
    mov%3       [          dst2q+(1<<%4)*lenq], m1
    add         lenq, mmsize/(1<<%4)
%endif
    jl          .next
    REP_RET
%endmacro
||
162 | |||
;------------------------------------------------------------------------------
; CONV to, from, a/u, log2_outsize, log2_insize, conv_macro, init_macro
;
; Emits <from>_to_<to>_<a|u>(dst, src, len): converts one buffer (src[0])
; into another (dst[0]) with no channel reordering.
; Arguments have the same meaning as for PACK_2CH. The "a" flavour jumps to
; the <...>_u_int label of the "u" flavour when a pointer is not
; mmsize-aligned. MMX builds (mmsize == 8) execute emms before returning.
;------------------------------------------------------------------------------
%macro CONV 5-7
cglobal %2_to_%1_%3, 3, 3, 6, dst, src, len
    mov         srcq, [srcq]                    ; src[0]
    mov         dstq, [dstq]                    ; dst[0]
%ifidn %3, a
    ; Fall back to the unaligned body if either pointer is misaligned.
    test        dstq, mmsize-1
    jne         %2_to_%1_u_int %+ SUFFIX
    test        srcq, mmsize-1
    jne         %2_to_%1_u_int %+ SUFFIX
%else
%2_to_%1_u_int %+ SUFFIX:
%endif
    ; Point both buffers at their end and count a negative index up to zero.
    lea         srcq, [srcq + (1<<%5)*lenq]
    lea         dstq, [dstq + (1<<%4)*lenq]
    neg         lenq
    %7 m0,m1,m2,m3,m4,m5
.next:
    mov%3       m0, [           srcq +(1<<%5)*lenq]
    mov%3       m1, [  mmsize + srcq +(1<<%5)*lenq]
%if %4 < %5
    ; Narrowing conversion consumes four input registers per iteration.
    mov%3       m2, [2*mmsize + srcq +(1<<%5)*lenq]
    mov%3       m3, [3*mmsize + srcq +(1<<%5)*lenq]
%endif
    %6 m0,m1,m2,m3,m4,m5
    mov%3       [           dstq+(1<<%4)*lenq], m0
    mov%3       [  mmsize + dstq+(1<<%4)*lenq], m1
%if %4 > %5
    ; Widening conversion produces four output registers per iteration.
    mov%3       [2*mmsize + dstq+(1<<%4)*lenq], m2
    mov%3       [3*mmsize + dstq+(1<<%4)*lenq], m3
    add         lenq, 4*mmsize/(1<<%4)
%else
    add         lenq, 2*mmsize/(1<<%4)
%endif
    jl          .next
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro
||
204 | |||
;------------------------------------------------------------------------------
; PACK_6CH to, from, a/u, log2_outsize, log2_insize, conv_macro, init_macro
;
; Emits pack_6ch_<from>_to_<to>_<a|u>(dst, src, len): reads six separate
; channel buffers (src[0..5]) and writes one interleaved buffer (dst[0]).
; Only dst and src are loaded by cglobal; len is taken from r2d on x86-64
; and accessed as the memory argument r2m on x86-32.
; The SSE-and-later path transposes with SBUTTERFLYPS plus blend/shuffle and
; runs the conversion macro; the MMX path only interleaves dword pairs and
; does not invoke the conversion macro.
; The "a" flavour jumps to the <...>_u_int label of the "u" flavour when a
; pointer is not mmsize-aligned.
;------------------------------------------------------------------------------
%macro PACK_6CH 5-7
cglobal pack_6ch_%2_to_%1_%3, 2,8,7, dst, src, src1, src2, src3, src4, src5, len
%if ARCH_X86_64
    mov         lend, r2d
%else
    %define lend dword r2m
%endif
    mov         src1q, [srcq+1*gprsize]
    mov         src2q, [srcq+2*gprsize]
    mov         src3q, [srcq+3*gprsize]
    mov         src4q, [srcq+4*gprsize]
    mov         src5q, [srcq+5*gprsize]
    mov         srcq,  [srcq]
    mov         dstq,  [dstq]
%ifidn %3, a
    ; Fall back to the unaligned body if any pointer is misaligned.
    test        dstq,  mmsize-1
    jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        srcq,  mmsize-1
    jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        src1q, mmsize-1
    jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        src2q, mmsize-1
    jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        src3q, mmsize-1
    jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        src4q, mmsize-1
    jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        src5q, mmsize-1
    jne         pack_6ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_6ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    ; Keep channels 1..5 as offsets from channel 0 so only srcq advances.
    sub         src1q, srcq
    sub         src2q, srcq
    sub         src3q, srcq
    sub         src4q, srcq
    sub         src5q, srcq
    %7 x,x,x,x,m7,x
.loop:
    mov%3       m0, [srcq      ]
    mov%3       m1, [srcq+src1q]
    mov%3       m2, [srcq+src2q]
    mov%3       m3, [srcq+src3q]
    mov%3       m4, [srcq+src4q]
    mov%3       m5, [srcq+src5q]
%if cpuflag(sse)
    SBUTTERFLYPS 0, 1, 6
    SBUTTERFLYPS 2, 3, 6
    SBUTTERFLYPS 4, 5, 6

%if cpuflag(avx)
    blendps     m6, m4, m0, 1100b
%else
    movaps      m6, m4
    shufps      m4, m0, q3210
    SWAP 4,6
%endif
    movlhps     m0, m2
    movhlps     m4, m2
%if cpuflag(avx)
    blendps     m2, m5, m1, 1100b
%else
    movaps      m2, m5
    shufps      m5, m1, q3210
    SWAP 2,5
%endif
    movlhps     m1, m3
    movhlps     m5, m3

    ; Convert in the order the registers are stored below.
    %6 m0,m6,x,x,m7,m3
    %6 m4,m1,x,x,m7,m3
    %6 m2,m5,x,x,m7,m3

    mov %+ %3 %+ ps [dstq   ], m0
    mov %+ %3 %+ ps [dstq+16], m6
    mov %+ %3 %+ ps [dstq+32], m4
    mov %+ %3 %+ ps [dstq+48], m1
    mov %+ %3 %+ ps [dstq+64], m2
    mov %+ %3 %+ ps [dstq+80], m5
%else ; mmx
    SBUTTERFLY dq, 0, 1, 6
    SBUTTERFLY dq, 2, 3, 6
    SBUTTERFLY dq, 4, 5, 6

    movq        [dstq   ], m0
    movq        [dstq+ 8], m2
    movq        [dstq+16], m4
    movq        [dstq+24], m1
    movq        [dstq+32], m3
    movq        [dstq+40], m5
%endif
    add         srcq, mmsize
    add         dstq, mmsize*6
    sub         lend, mmsize/4
    jg          .loop
%if mmsize == 8
    emms
    RET
%else
    REP_RET
%endif
%endmacro
||
307 | |||
;------------------------------------------------------------------------------
; UNPACK_6CH to, from, a/u, log2_outsize, log2_insize, conv_macro, init_macro
;
; Emits unpack_6ch_<from>_to_<to>_<a|u>(dst, src, len): reads one interleaved
; buffer (src[0]) and writes six separate channel buffers (dst[0..5]).
; Only dst and src are loaded by cglobal; len is taken from r2d on x86-64
; and accessed as the memory argument r2m on x86-32.
; Source loads use fixed 16-byte steps (offsets 0..80).
; The "a" flavour jumps to the <...>_u_int label of the "u" flavour when a
; pointer is not mmsize-aligned.
;------------------------------------------------------------------------------
%macro UNPACK_6CH 5-7
cglobal unpack_6ch_%2_to_%1_%3, 2, 8, 8, dst, src, dst1, dst2, dst3, dst4, dst5, len
%if ARCH_X86_64
    mov         lend, r2d
%else
    %define lend dword r2m
%endif
    mov         dst1q, [dstq+1*gprsize]
    mov         dst2q, [dstq+2*gprsize]
    mov         dst3q, [dstq+3*gprsize]
    mov         dst4q, [dstq+4*gprsize]
    mov         dst5q, [dstq+5*gprsize]
    mov         dstq,  [dstq]
    mov         srcq,  [srcq]
%ifidn %3, a
    ; Fall back to the unaligned body if any pointer is misaligned.
    test        dstq,  mmsize-1
    jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        srcq,  mmsize-1
    jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        dst1q, mmsize-1
    jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        dst2q, mmsize-1
    jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        dst3q, mmsize-1
    jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        dst4q, mmsize-1
    jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
    test        dst5q, mmsize-1
    jne         unpack_6ch_%2_to_%1_u_int %+ SUFFIX
%else
unpack_6ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    ; Keep channels 1..5 as offsets from channel 0 so only dstq advances.
    sub         dst1q, dstq
    sub         dst2q, dstq
    sub         dst3q, dstq
    sub         dst4q, dstq
    sub         dst5q, dstq
    %7 x,x,x,x,m7,x
.loop:
    mov%3       m0, [srcq   ]
    mov%3       m1, [srcq+16]
    mov%3       m2, [srcq+32]
    mov%3       m3, [srcq+48]
    mov%3       m4, [srcq+64]
    mov%3       m5, [srcq+80]

    ; Two butterfly stages plus two register swaps leave one channel per
    ; register, m0..m5, in the order they are stored below.
    SBUTTERFLYPS 0, 3, 6
    SBUTTERFLYPS 1, 4, 6
    SBUTTERFLYPS 2, 5, 6
    SBUTTERFLYPS 0, 4, 6
    SBUTTERFLYPS 3, 2, 6
    SBUTTERFLYPS 1, 5, 6
    SWAP 1, 4
    SWAP 2, 3

    %6 m0,m1,x,x,m7,m6
    %6 m2,m3,x,x,m7,m6
    %6 m4,m5,x,x,m7,m6

    mov %+ %3 %+ ps [dstq      ], m0
    mov %+ %3 %+ ps [dstq+dst1q], m1
    mov %+ %3 %+ ps [dstq+dst2q], m2
    mov %+ %3 %+ ps [dstq+dst3q], m3
    mov %+ %3 %+ ps [dstq+dst4q], m4
    mov %+ %3 %+ ps [dstq+dst5q], m5

    add         srcq, mmsize*6
    add         dstq, mmsize
    sub         lend, mmsize/4
    jg          .loop
    REP_RET
%endmacro
||
380 | |||
; GPR count requested by PACK_8CH: 10 on x86-64; on x86-32 it is 6, plus one
; when HAVE_ALIGNED_STACK is set.
%define PACK_8CH_GPRS (10 * ARCH_X86_64) + ((6 + HAVE_ALIGNED_STACK) * ARCH_X86_32)

;------------------------------------------------------------------------------
; PACK_8CH to, from, a/u, log2_outsize, log2_insize, conv_macro, init_macro
;
; Emits pack_8ch_<from>_to_<to>_<a|u>(dst, src, len): reads eight separate
; channel buffers (src[0..7]) and writes one interleaved buffer (dst[0]).
;
; x86-64: every pointer lives in a register and the conversion constant is
; loaded into m9 by the init macro.
; x86-32: 48 bytes of stack are reserved. r0 doubles as scratch for
; src1/src7 (and src4 when HAVE_ALIGNED_STACK == 0); their values are kept in
; [rsp+32], [rsp+40] and [rsp+36], dst is kept in dstm, and [rsp]/[rsp+16]
; are handed to TRANSPOSE8x4D. m9 is defined as a memory operand instead of
; being loaded by the init macro.
; The "a" flavour jumps to the <...>_u_int label of the "u" flavour when a
; pointer is not mmsize-aligned.
; Statement order matters on x86-32 because of the shared scratch register.
;------------------------------------------------------------------------------
%macro PACK_8CH 5-7
cglobal pack_8ch_%2_to_%1_%3, 2,PACK_8CH_GPRS,10, ARCH_X86_32*48, dst, src, len, src1, src2, src3, src4, src5, src6, src7
    mov         dstq, [dstq]
%if ARCH_X86_32
    DEFINE_ARGS dst, src, src2, src3, src4, src5, src6
    %define lend dword r2m
    %define src1q r0q
    %define src1m dword [rsp+32]
%if HAVE_ALIGNED_STACK == 0
    DEFINE_ARGS dst, src, src2, src3, src5, src6
    %define src4q r0q
    %define src4m dword [rsp+36]
%endif
    %define src7q r0q
    %define src7m dword [rsp+40]
    mov         dstm, dstq                      ; r0 is about to be reused
%endif
    mov         src7q, [srcq+7*gprsize]
    mov         src6q, [srcq+6*gprsize]
%if ARCH_X86_32
    mov         src7m, src7q                    ; spill before r0 is reused
%endif
    mov         src5q, [srcq+5*gprsize]
    mov         src4q, [srcq+4*gprsize]
    mov         src3q, [srcq+3*gprsize]
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
    mov         src4m, src4q                    ; spill before r0 is reused
%endif
    mov         src2q, [srcq+2*gprsize]
    mov         src1q, [srcq+1*gprsize]
    mov         srcq,  [srcq]
%ifidn %3, a
    ; Fall back to the unaligned body if any pointer is misaligned.
%if ARCH_X86_32
    test        dstmp, mmsize-1
%else
    test        dstq,  mmsize-1
%endif
    jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test        srcq,  mmsize-1
    jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test        src1q, mmsize-1
    jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test        src2q, mmsize-1
    jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test        src3q, mmsize-1
    jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
    test        src4m, mmsize-1
%else
    test        src4q, mmsize-1
%endif
    jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test        src5q, mmsize-1
    jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
    test        src6q, mmsize-1
    jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
%if ARCH_X86_32
    test        src7m, mmsize-1
%else
    test        src7q, mmsize-1
%endif
    jne         pack_8ch_%2_to_%1_u_int %+ SUFFIX
%else
pack_8ch_%2_to_%1_u_int %+ SUFFIX:
%endif
    ; Keep channels 1..7 as offsets from channel 0 so only srcq advances.
    sub         src1q, srcq
    sub         src2q, srcq
    sub         src3q, srcq
%if ARCH_X86_64 || HAVE_ALIGNED_STACK
    sub         src4q, srcq
%else
    sub         src4m, srcq
%endif
    sub         src5q, srcq
    sub         src6q, srcq
%if ARCH_X86_64
    sub         src7q, srcq
%else
    mov         src1m, src1q
    sub         src7m, srcq
%endif

%if ARCH_X86_64
    %7 x,x,x,x,m9,x
%elifidn %1, int32
    %define m9 [flt2p31]
%else
    %define m9 [flt2pm31]
%endif

.loop:
    mov%3       m0, [srcq      ]
    mov%3       m1, [srcq+src1q]
    mov%3       m2, [srcq+src2q]
%if ARCH_X86_32 && HAVE_ALIGNED_STACK == 0
    mov         src4q, src4m                    ; reload into scratch r0
%endif
    mov%3       m3, [srcq+src3q]
    mov%3       m4, [srcq+src4q]
    mov%3       m5, [srcq+src5q]
%if ARCH_X86_32
    mov         src7q, src7m                    ; reload into scratch r0
%endif
    mov%3       m6, [srcq+src6q]
    mov%3       m7, [srcq+src7q]

%if ARCH_X86_64
    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, 8

    %6 m0,m1,x,x,m9,m8
    %6 m2,m3,x,x,m9,m8
    %6 m4,m5,x,x,m9,m8
    %6 m6,m7,x,x,m9,m8

    mov%3       [dstq], m0
%else
    mov         dstq, dstm

    TRANSPOSE8x4D 0, 1, 2, 3, 4, 5, 6, 7, [rsp], [rsp+16], 1

    ; m2 serves as the temporary for the first conversion, then is reloaded
    ; from [rsp]; m0 is stored first so it can serve as the temporary after.
    %6 m0,m1,x,x,m9,m2
    mova        m2, [rsp]
    mov%3       [dstq], m0
    %6 m2,m3,x,x,m9,m0
    %6 m4,m5,x,x,m9,m0
    %6 m6,m7,x,x,m9,m0

%endif

    mov%3       [dstq+16],  m1
    mov%3       [dstq+32],  m2
    mov%3       [dstq+48],  m3
    mov%3       [dstq+64],  m4
    mov%3       [dstq+80],  m5
    mov%3       [dstq+96],  m6
    mov%3       [dstq+112], m7

    add         srcq, mmsize
    add         dstq, mmsize*8
%if ARCH_X86_32
    mov         dstm, dstq
    mov         src1q, src1m                    ; r0 holds src1 again for the next pass
%endif
    sub         lend, mmsize/4
    jg          .loop
    REP_RET
%endmacro
||
530 | |||
; INT16_TO_INT32_N: widen the 16-bit words in m0 and m1 to dwords.
; Each word is unpacked against a zeroed register so it lands in the upper
; half of its dword (value << 16). Results: m0,m1 from the old m0 and m2,m3
; from the old m1. The six macro arguments are accepted but not referenced;
; m4 is used as a temporary.
%macro INT16_TO_INT32_N 6
    ; Expand the second input register first, while m1 is still intact.
    pxor        m2, m2
    pxor        m3, m3
    punpcklwd   m2, m1
    punpckhwd   m3, m1
    SWAP 4,0                                    ; old m0 is referred to as m4 from here
    pxor        m0, m0
    pxor        m1, m1
    punpcklwd   m0, m4
    punpckhwd   m1, m4
%endmacro
||
542 | |||
; INT32_TO_INT16_N: narrow the dwords in m0..m3 to words.
; Each dword is arithmetically shifted right by 16, then pairs of registers
; are packed with signed saturation. Results end up in m0 and m1. The six
; macro arguments are accepted but not referenced.
%macro INT32_TO_INT16_N 6
    psrad       m0, 16
    psrad       m1, 16
    psrad       m2, 16
    psrad       m3, 16
    packssdw    m0, m1
    packssdw    m2, m3
    SWAP 1,2                                    ; second packed result becomes m1
%endmacro
||
552 | |||
; INT32_TO_FLOAT_INIT: load the flt2pm31 scale vector into argument 5.
%macro INT32_TO_FLOAT_INIT 6
    mova        %5, [flt2pm31]
%endmacro

; INT32_TO_FLOAT_N: convert the dwords in arguments 1 and 2 to floats and
; multiply them by argument 5 (the scale set up by INT32_TO_FLOAT_INIT).
%macro INT32_TO_FLOAT_N 6
    cvtdq2ps    %1, %1
    cvtdq2ps    %2, %2
    mulps       %1, %1, %5
    mulps       %2, %2, %5
%endmacro
||
562 | |||
; FLOAT_TO_INT32_INIT: load the flt2p31 scale vector into argument 5.
%macro FLOAT_TO_INT32_INIT 6
    mova        %5, [flt2p31]
%endmacro

; FLOAT_TO_INT32_N: scale the floats in arguments 1 and 2 by argument 5 and
; convert them to dwords, using argument 6 as a temporary.
; cmpps predicate 5 is "not less than": lanes whose scaled value is not
; below argument 5 become all-ones (-1), and that mask is added to the
; converted integer. Presumably this turns the 0x80000000 result that
; cvtps2dq gives on positive overflow into 0x7FFFFFFF - confirm.
%macro FLOAT_TO_INT32_N 6
    mulps       %1, %5
    mulps       %2, %5
    cvtps2dq    %6, %1
    cmpps       %1, %1, %5, 5
    paddd       %1, %6
    cvtps2dq    %6, %2
    cmpps       %2, %2, %5, 5
    paddd       %2, %6
%endmacro
||
576 | |||
; INT16_TO_FLOAT_INIT: load the flt2pm31 scale vector into m5 (the macro
; arguments are accepted but not referenced).
%macro INT16_TO_FLOAT_INIT 6
    mova        m5, [flt2pm31]
%endmacro

; INT16_TO_FLOAT_N: widen the words in m0,m1 to dwords via INT16_TO_INT32_N
; (which leaves each sample in the upper half of its dword), convert m0..m3
; to floats and multiply by the scale held in m5.
%macro INT16_TO_FLOAT_N 6
    INT16_TO_INT32_N %1,%2,%3,%4,%5,%6
    cvtdq2ps    m0, m0
    cvtdq2ps    m1, m1
    cvtdq2ps    m2, m2
    cvtdq2ps    m3, m3
    mulps       m0, m0, m5
    mulps       m1, m1, m5
    mulps       m2, m2, m5
    mulps       m3, m3, m5
%endmacro
||
591 | |||
; FLOAT_TO_INT16_INIT: load the flt2p15 scale vector into m5 (the macro
; arguments are accepted but not referenced).
%macro FLOAT_TO_INT16_INIT 6
    mova        m5, [flt2p15]
%endmacro

; FLOAT_TO_INT16_N: scale the floats in m0..m3 by m5, convert them to dwords
; and pack pairs of registers to words with signed saturation. Results end
; up in m0 (from m0,m1) and m1 (from m2,m3).
%macro FLOAT_TO_INT16_N 6
    mulps       m0, m5
    mulps       m1, m5
    mulps       m2, m5
    mulps       m3, m5
    cvtps2dq    m0, m0
    cvtps2dq    m1, m1
    packssdw    m0, m1
    cvtps2dq    m1, m2                          ; m1 is free again after the pack
    cvtps2dq    m3, m3
    packssdw    m1, m3
%endmacro
||
607 | |||
; NOP_N: empty placeholder, accepts zero to six arguments and emits nothing.
; Passed as the conversion and/or init macro where no work is needed.
%macro NOP_N 0-6
%endmacro
||
610 | |||
; ---- MMX ----------------------------------------------------------------
; Each "u" flavour precedes its "a" flavour: it defines the _u_int label the
; aligned flavour jumps to.
INIT_MMX mmx
CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N
||
619 | |||
; ---- SSE: 6-channel float pack/unpack without format conversion ----------
INIT_XMM sse
PACK_6CH   float, float, u, 2, 2, NOP_N, NOP_N
PACK_6CH   float, float, a, 2, 2, NOP_N, NOP_N

UNPACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
UNPACK_6CH float, float, a, 2, 2, NOP_N, NOP_N
||
626 | |||
; ---- SSE2 ---------------------------------------------------------------
; Each "u" flavour precedes its "a" flavour: it defines the _u_int label the
; aligned flavour jumps to.
INIT_XMM sse2

; Integer <-> integer, single buffer.
CONV int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
CONV int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
CONV int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

; Integer <-> integer, 2-channel pack.
PACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
PACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
PACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
PACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
PACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
PACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
PACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
PACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

; Integer <-> integer, 2-channel unpack.
UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
UNPACK_2CH int32, int32, u, 2, 2, NOP_N, NOP_N
UNPACK_2CH int32, int32, a, 2, 2, NOP_N, NOP_N
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int16, int32, u, 1, 2, INT32_TO_INT16_N, NOP_N
UNPACK_2CH int16, int32, a, 1, 2, INT32_TO_INT16_N, NOP_N

; Float <-> integer, single buffer.
CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
CONV float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
CONV int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
CONV int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

; Float <-> integer, 2-channel pack.
PACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
PACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
PACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
PACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

; Float <-> integer, 2-channel unpack.
UNPACK_2CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_2CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_2CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_2CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH int16, float, u, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT
UNPACK_2CH int16, float, a, 1, 2, FLOAT_TO_INT16_N, FLOAT_TO_INT16_INIT

; Float <-> int32, 6-channel pack and unpack.
PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

UNPACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

; 8-channel pack.
PACK_8CH float, float, u, 2, 2, NOP_N, NOP_N
PACK_8CH float, float, a, 2, 2, NOP_N, NOP_N

PACK_8CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_8CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
||
695 | |||
; ---- SSSE3: 2-channel unpack from 16-bit input ---------------------------
; With SUFFIX == _ssse3, UNPACK_2CH takes its pshufb path for 16-bit input.
INIT_XMM ssse3
UNPACK_2CH int16, int16, u, 1, 1, NOP_N, NOP_N
UNPACK_2CH int16, int16, a, 1, 1, NOP_N, NOP_N
UNPACK_2CH int32, int16, u, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH int32, int16, a, 2, 1, INT16_TO_INT32_N, NOP_N
UNPACK_2CH float, int16, u, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
UNPACK_2CH float, int16, a, 2, 1, INT16_TO_FLOAT_N, INT16_TO_FLOAT_INIT
||
703 | |||
; ---- AVX (only assembled when HAVE_AVX_EXTERNAL is non-zero) -------------
%if HAVE_AVX_EXTERNAL
; 128-bit registers: 6- and 8-channel pack/unpack.
INIT_XMM avx
PACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
PACK_6CH float, float, a, 2, 2, NOP_N, NOP_N

UNPACK_6CH float, float, u, 2, 2, NOP_N, NOP_N
UNPACK_6CH float, float, a, 2, 2, NOP_N, NOP_N

PACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

UNPACK_6CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
UNPACK_6CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
UNPACK_6CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

PACK_8CH float, float, u, 2, 2, NOP_N, NOP_N
PACK_8CH float, float, a, 2, 2, NOP_N, NOP_N

PACK_8CH float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
PACK_8CH int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
PACK_8CH int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT

; 256-bit registers: int32 -> float, single buffer.
INIT_YMM avx
CONV float, int32, u, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
CONV float, int32, a, 2, 2, INT32_TO_FLOAT_N, INT32_TO_FLOAT_INIT
%endif
||
734 | |||
; ---- AVX2 (only assembled when HAVE_AVX2_EXTERNAL is non-zero) -----------
; 256-bit registers: float -> int32, single buffer. The "u" flavour comes
; first because it defines the _u_int label the "a" flavour jumps to.
%if HAVE_AVX2_EXTERNAL
INIT_YMM avx2
CONV int32, float, u, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
CONV int32, float, a, 2, 2, FLOAT_TO_INT32_N, FLOAT_TO_INT32_INIT
%endif