Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
6148 | serge | 1 | ;****************************************************************************** |
2 | ;* x86 optimized channel mixing |
||
3 | ;* Copyright (c) 2012 Justin Ruggles |
||
4 | ;* |
||
5 | ;* This file is part of FFmpeg. |
||
6 | ;* |
||
7 | ;* FFmpeg is free software; you can redistribute it and/or |
||
8 | ;* modify it under the terms of the GNU Lesser General Public |
||
9 | ;* License as published by the Free Software Foundation; either |
||
10 | ;* version 2.1 of the License, or (at your option) any later version. |
||
11 | ;* |
||
12 | ;* FFmpeg is distributed in the hope that it will be useful, |
||
13 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
14 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
15 | ;* Lesser General Public License for more details. |
||
16 | ;* |
||
17 | ;* You should have received a copy of the GNU Lesser General Public |
||
18 | ;* License along with FFmpeg; if not, write to the Free Software |
||
19 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
20 | ;****************************************************************************** |
||
21 | |||
22 | %include "libavutil/x86/x86util.asm" |
||
23 | %include "util.asm" |
||
24 | |||
25 | SECTION_TEXT |
||
26 | |||
;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_fltp_flt(float **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;
; Mixes two planar float channels into one, in place:
;     src[0][i] = src[0][i] * matrix[0][0] + src[1][i] * matrix[0][1]
; Only the first three arguments are loaded; out_ch/in_ch are not read.
;
; Register roles inside the loop:
;     srcq  = running pointer into src[0] (also the output)
;     src1q = byte distance from src[0] to src[1], so [srcq+src1q] is src[1]
;     m4/m5 = matrix[0][0] / matrix[0][1] broadcast to every lane
;
; Two vectors (2*mmsize bytes) are handled per iteration using aligned
; moves. The body always executes at least once and repeats while the
; remaining sample count is > 0.
; NOTE(review): presumably the caller guarantees suitably aligned buffers and
; a len that is a multiple of the samples consumed per iteration - confirm.
;-----------------------------------------------------------------------------

%macro MIX_2_TO_1_FLTP_FLT 0
cglobal mix_2_to_1_fltp_flt, 3,4,6, src, matrix, len, src1
    ; splat the two mixing coefficients
    mov     matrixq, [matrixq]
    VBROADCASTSS m4, [matrixq]
    VBROADCASTSS m5, [matrixq+4]
    ; src[1] must be fetched before srcq is replaced by src[0]
    mov       src1q, [srcq+gprsize]
    mov        srcq, [srcq]
    sub       src1q, srcq                   ; keep src[1] as an offset from src[0]
    ALIGN 16
.next:
    ; first vector of the pair
    mulps        m0, m4, [srcq]
    mulps        m1, m5, [srcq+src1q]
    addps        m0, m1
    ; second vector of the pair
    mulps        m2, m4, [srcq+mmsize]
    mulps        m3, m5, [srcq+src1q+mmsize]
    addps        m2, m3
    ; all loads are done; write both results back to src[0]
    mova     [srcq], m0
    mova [srcq+mmsize], m2
    add        srcq, 2*mmsize
    sub        lend, (2*mmsize)/4           ; floats consumed this iteration
    jg .next
    REP_RET
%endmacro

; SSE build always, AVX (ymm) build only when the assembler supports it
INIT_XMM sse
MIX_2_TO_1_FLTP_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX_2_TO_1_FLTP_FLT
%endif
62 | |||
;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_s16p_flt(int16_t **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;
; Mixes two planar int16 channels into one using float coefficients, in
; place: each sample of src[0] and src[1] is sign-extended to 32 bits,
; converted to float, scaled by matrix[0][0] / matrix[0][1], summed,
; converted back to int32 and packed to int16 with signed saturation.
; The result overwrites src[0]. out_ch/in_ch are not read.
;
;     srcq  = running pointer into src[0] (also the output)
;     src1q = byte distance from src[0] to src[1]
;     m4/m5 = matrix[0][0] / matrix[0][1] broadcast to every lane
;
; One vector (mmsize/2 samples) is processed per iteration; the body runs at
; least once and repeats while the remaining sample count is > 0.
;-----------------------------------------------------------------------------

%macro MIX_2_TO_1_S16P_FLT 0
cglobal mix_2_to_1_s16p_flt, 3,4,6, src, matrix, len, src1
    ; splat the two mixing coefficients
    mov     matrixq, [matrixq]
    VBROADCASTSS m4, [matrixq]
    VBROADCASTSS m5, [matrixq+4]
    ; src[1] must be fetched before srcq is replaced by src[0]
    mov       src1q, [srcq+gprsize]
    mov        srcq, [srcq]
    sub       src1q, srcq
    ALIGN 16
.next:
    ; channel 0: s16 -> s32 (m0 = low half, m1 = high half) -> float, scale
    mova         m0, [srcq]
    S16_TO_S32_SX 0, 1
    cvtdq2ps     m0, m0
    cvtdq2ps     m1, m1
    mulps        m0, m4
    mulps        m1, m4
    ; channel 1: same treatment into m2/m3
    mova         m2, [srcq+src1q]
    S16_TO_S32_SX 2, 3
    cvtdq2ps     m2, m2
    cvtdq2ps     m3, m3
    mulps        m2, m5
    mulps        m3, m5
    ; sum the two channels, then float -> s32 -> saturated s16
    addps        m0, m2
    addps        m1, m3
    cvtps2dq     m0, m0
    cvtps2dq     m1, m1
    packssdw     m0, m1
    mova     [srcq], m0
    add        srcq, mmsize
    sub        lend, mmsize/2               ; int16 samples consumed
    jg .next
    REP_RET
%endmacro

INIT_XMM sse2
MIX_2_TO_1_S16P_FLT
INIT_XMM sse4
MIX_2_TO_1_S16P_FLT
106 | |||
;-----------------------------------------------------------------------------
; void ff_mix_2_to_1_s16p_q8(int16_t **src, int16_t **matrix, int len,
;                            int out_ch, int in_ch);
;
; Mixes two planar int16 channels into one using 16-bit integer
; coefficients, in place:
;     src[0][i] = sat16((src[0][i] * matrix[0][0] +
;                        src[1][i] * matrix[0][1]) >> 8)
; The shift is arithmetic (psrad) and the final pack saturates (packssdw).
; out_ch/in_ch are not read.
;
;     srcq  = running pointer into src[0] (also the output)
;     src1q = byte distance from src[0] to src[1]
;     m4/m5 = coefficient 0 / 1, one per dword, with a zero upper word
;-----------------------------------------------------------------------------

INIT_XMM sse2
cglobal mix_2_to_1_s16p_q8, 3,4,6, src, matrix, len, src1
    ; Load both 16-bit coefficients (they share one dword) and splat word 0
    ; into m4 and word 1 into m5.
    mov     matrixq, [matrixq]
    movd         m4, [matrixq]
    movd         m5, [matrixq]
    SPLATW       m4, m4, 0
    SPLATW       m5, m5, 1
    ; Interleave with zero so every dword is the word pair (coeff, 0). With
    ; samples duplicated as (s, s), pmaddwd then yields s*coeff + s*0.
    pxor         m0, m0
    punpcklwd    m4, m0
    punpcklwd    m5, m0
    ; src[1] must be fetched before srcq is replaced by src[0]
    mov       src1q, [srcq+gprsize]
    mov        srcq, [srcq]
    sub       src1q, srcq
    ALIGN 16
.next:
    ; channel 0: duplicate each sample into a word pair, multiply
    mova         m0, [srcq]
    punpckhwd    m1, m0, m0
    punpcklwd    m0, m0
    pmaddwd      m0, m4
    pmaddwd      m1, m4
    ; channel 1
    mova         m2, [srcq+src1q]
    punpckhwd    m3, m2, m2
    punpcklwd    m2, m2
    pmaddwd      m2, m5
    pmaddwd      m3, m5
    ; sum, drop the 8 fractional bits, pack back to saturated int16
    paddd        m0, m2
    paddd        m1, m3
    psrad        m0, 8
    psrad        m1, 8
    packssdw     m0, m1
    mova     [srcq], m0
    add        srcq, mmsize
    sub        lend, mmsize/2               ; int16 samples consumed
    jg .next
    REP_RET
147 | |||
;-----------------------------------------------------------------------------
; void ff_mix_1_to_2_fltp_flt(float **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;
; Expands one planar float channel to two, in place:
;     src[1][i] = src[0][i] * matrix[1][0]
;     src[0][i] = src[0][i] * matrix[0][0]
; out_ch/in_ch are not read.
;
;     src0q = running pointer into src[0] (input and first output)
;     src1q = byte distance from src[0] to src[1] (second output)
;     m2/m3 = matrix[0][0] / matrix[1][0] broadcast to every lane
;-----------------------------------------------------------------------------

%macro MIX_1_TO_2_FLTP_FLT 0
cglobal mix_1_to_2_fltp_flt, 3,5,4, src0, matrix0, len, src1, matrix1
    ; one coefficient per output channel; matrix[1] has to be read before
    ; matrix0q is overwritten with matrix[0]
    mov    matrix1q, [matrix0q+gprsize]
    mov    matrix0q, [matrix0q]
    VBROADCASTSS m2, [matrix0q]
    VBROADCASTSS m3, [matrix1q]
    ; likewise src[1] has to be read before src0q becomes src[0]
    mov       src1q, [src0q+gprsize]
    mov       src0q, [src0q]
    sub       src1q, src0q
    ALIGN 16
.next:
    mova         m0, [src0q]
    mulps        m1, m0, m3                 ; second output, input left intact
    mulps        m0, m2                     ; first output, scaled in place
    mova  [src0q      ], m0
    mova  [src0q+src1q], m1
    add       src0q, mmsize
    sub        lend, mmsize/4               ; floats consumed
    jg .next
    REP_RET
%endmacro

; SSE build always, AVX (ymm) build only when the assembler supports it
INIT_XMM sse
MIX_1_TO_2_FLTP_FLT
%if HAVE_AVX_EXTERNAL
INIT_YMM avx
MIX_1_TO_2_FLTP_FLT
%endif
181 | |||
;-----------------------------------------------------------------------------
; void ff_mix_1_to_2_s16p_flt(int16_t **src, float **matrix, int len,
;                             int out_ch, int in_ch);
;
; Expands one planar int16 channel to two using float coefficients, in
; place. Each input sample is sign-extended, converted to float, scaled by
; matrix[0][0] (written back to src[0]) and by matrix[1][0] (written to
; src[1]), then converted to int32 and packed to int16 with saturation.
; out_ch/in_ch are not read.
;
;     src0q = running pointer into src[0] (input and first output)
;     src1q = byte distance from src[0] to src[1] (second output)
;     m4/m5 = matrix[0][0] / matrix[1][0] broadcast to every lane
;-----------------------------------------------------------------------------

%macro MIX_1_TO_2_S16P_FLT 0
cglobal mix_1_to_2_s16p_flt, 3,5,6, src0, matrix0, len, src1, matrix1
    ; matrix[1] has to be read before matrix0q is overwritten with matrix[0]
    mov    matrix1q, [matrix0q+gprsize]
    mov    matrix0q, [matrix0q]
    VBROADCASTSS m4, [matrix0q]
    VBROADCASTSS m5, [matrix1q]
    ; likewise src[1] has to be read before src0q becomes src[0]
    mov       src1q, [src0q+gprsize]
    mov       src0q, [src0q]
    sub       src1q, src0q
    ALIGN 16
.next:
    ; s16 -> s32 (m0 = low half, m2 = high half) -> float
    mova         m0, [src0q]
    S16_TO_S32_SX 0, 2
    cvtdq2ps     m0, m0
    cvtdq2ps     m2, m2
    ; second output first, so the unscaled input is still available
    mulps        m1, m0, m5
    mulps        m3, m2, m5
    ; first output, scaled in place
    mulps        m0, m4
    mulps        m2, m4
    ; float -> s32 -> saturated s16 for both outputs
    cvtps2dq     m0, m0
    cvtps2dq     m1, m1
    cvtps2dq     m2, m2
    cvtps2dq     m3, m3
    packssdw     m0, m2
    packssdw     m1, m3
    mova  [src0q      ], m0
    mova  [src0q+src1q], m1
    add       src0q, mmsize
    sub        lend, mmsize/2               ; int16 samples consumed
    jg .next
    REP_RET
%endmacro

INIT_XMM sse2
MIX_1_TO_2_S16P_FLT
INIT_XMM sse4
MIX_1_TO_2_S16P_FLT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
MIX_1_TO_2_S16P_FLT
%endif
228 | |||
;-----------------------------------------------------------------------------
; void ff_mix_3_8_to_1_2_fltp/s16p_flt(float/int16_t **src, float **matrix,
;                                      int len, int out_ch, int in_ch);
;
; Generator for the 3..8 input -> 1..2 output mixers with float coefficients.
; Each expansion emits one function named mix_<in>_to_<out>_<fmt>_flt that
; mixes in place: output channel 0 overwrites src[0] and, for stereo output,
; channel 1 overwrites src[1].
;
; Coefficients are broadcast once before the loop. As many as fit are kept
; in vector registers; the remainder are spilled to stack slots of mmsize
; bytes each. On x86-32 with 7 or 8 input channels there are not enough
; general purpose registers either, so the pointers for channels 5..7 are
; kept on the stack and reloaded through src5q on every use.
;-----------------------------------------------------------------------------

%macro MIX_3_8_TO_1_2_FLT 3 ; %1 = in channels, %2 = out channels, %3 = s16p or fltp
; readable aliases for the macro arguments
%assign  in_channels %1
%assign out_channels %2
%assign stereo out_channels - 1             ; 0 = mono output, 1 = stereo output
%ifidn %3, s16p
    %assign is_s16 1
%else
    %assign is_s16 0
%endif

; number of vector registers the loop body itself uses (data + temporaries)
%assign matrix_elements in_channels * out_channels
%if is_s16 && stereo
    %assign needed_mmregs 7
%elif is_s16
    %assign needed_mmregs 5
%elif stereo
    %assign needed_mmregs 4
%else
    %assign needed_mmregs 3
%endif

; split the coefficients between the remaining registers and the stack
%assign matrix_elements_mm num_mmregs - needed_mmregs
%if matrix_elements < matrix_elements_mm
    %assign matrix_elements_mm matrix_elements
%endif
; matrix_elements_mm never exceeds matrix_elements after the clamp above, so
; this difference is zero exactly when everything fits in registers
%assign matrix_elements_stack matrix_elements - matrix_elements_mm
%assign matrix_stack_size matrix_elements_stack * mmsize

; the stack size is handed to cglobal negated (see x86inc.asm for what a
; negative size requests); x86-32 with 7-8 channels needs 16 more bytes for
; the spilled source pointers
%if ARCH_X86_32 && in_channels >= 7
    %assign needed_stack_size -1 * (matrix_stack_size + 16)
%else
    %assign needed_stack_size -1 * matrix_stack_size
%endif

cglobal mix_%1_to_%2_%3_flt, 3,in_channels+2,needed_mmregs+matrix_elements_mm, needed_stack_size, src0, src1, len, src2, src3, src4, src5, src6, src7

; stack slots for the source pointers that have no register on x86-32;
; they sit directly above the spilled coefficients
%if matrix_elements_stack > 0 && ARCH_X86_32 && in_channels >= 7
    %define src5m [rsp+matrix_stack_size+0]
    %define src6m [rsp+matrix_stack_size+4]
    %define src7m [rsp+matrix_stack_size+8]
%endif

; matrix row pointers; these reuse r1/r3, which become src1/src2 once the
; coefficients have been loaded
%define matrix0q r1q
%define matrix1q r3q
%if stereo
    mov      matrix1q, [matrix0q+gprsize]
%endif
    mov      matrix0q, [matrix0q]

; Name every coefficient: mx_<out>_<in> is either a vector register or a
; stack slot, and mx_stack_<out>_<in> records which of the two it is.
; Registers are handed out starting at m<needed_mmregs>, row 0 first.
%assign %%i 0
%assign %%j needed_mmregs
%rep in_channels
    %if %%i >= matrix_elements_mm
        CAT_XDEFINE mx_stack_0_, %%i, 1
        CAT_XDEFINE mx_0_, %%i, [rsp+(%%i-matrix_elements_mm)*mmsize]
    %else
        CAT_XDEFINE mx_stack_0_, %%i, 0
        CAT_XDEFINE mx_0_, %%i, m %+ %%j
        %assign %%j %%j+1
    %endif
    %assign %%i %%i+1
%endrep
%if stereo
%assign %%i 0
%rep in_channels
    %if in_channels + %%i >= matrix_elements_mm
        CAT_XDEFINE mx_stack_1_, %%i, 1
        CAT_XDEFINE mx_1_, %%i, [rsp+(in_channels+%%i-matrix_elements_mm)*mmsize]
    %else
        CAT_XDEFINE mx_stack_1_, %%i, 0
        CAT_XDEFINE mx_1_, %%i, m %+ %%j
        %assign %%j %%j+1
    %endif
    %assign %%i %%i+1
%endrep
%endif

; Broadcast every coefficient to its home. Stack-resident ones are staged
; through m0, which is free until the loop starts.
%assign %%i 0
%rep in_channels
    %if mx_stack_0_ %+ %%i
    VBROADCASTSS m0, [matrix0q+4*%%i]
    mova  mx_0_ %+ %%i, m0
    %else
    VBROADCASTSS mx_0_ %+ %%i, [matrix0q+4*%%i]
    %endif
    %if stereo
    %if mx_stack_1_ %+ %%i
    VBROADCASTSS m0, [matrix1q+4*%%i]
    mova  mx_1_ %+ %%i, m0
    %else
    VBROADCASTSS mx_1_ %+ %%i, [matrix1q+4*%%i]
    %endif
    %endif
    %assign %%i %%i+1
%endrep

; Turn len into a byte count (2 bytes per s16 sample, 4 per float), point
; every channel pointer at the END of its buffer and index with a negative
; offset that counts up to zero.
%if ARCH_X86_64
    movsxd       lenq, r2d
%endif
    shl          lenq, 2-is_s16
%assign %%i 1
%rep (in_channels - 1)
    %if ARCH_X86_32 && in_channels >= 7 && %%i >= 5
    ; no register left for this channel: build the pointer in src5q and
    ; park it in its stack slot
    mov         src5q, [src0q+%%i*gprsize]
    add         src5q, lenq
    mov         src %+ %%i %+ m, src5q
    %else
    mov         src %+ %%i %+ q, [src0q+%%i*gprsize]
    add         src %+ %%i %+ q, lenq
    %endif
    %assign %%i %%i+1
%endrep
    mov         src0q, [src0q]              ; last, since src0q indexed the array
    add         src0q, lenq
    neg          lenq
.next:
; true for the channels whose pointer lives on the stack (x86-32, 7-8 inputs)
%define copy_src_from_stack ARCH_X86_32 && in_channels >= 7 && %%i >= 5
%if is_s16
    ; ---- s16p input ------------------------------------------------------
    ; channel 0 seeds the accumulators: m0/m1 = output 0 (low/high half),
    ; m2/m3 = output 1 when stereo
    mova         m0, [src0q+lenq]
    S16_TO_S32_SX 0, 1
    cvtdq2ps     m0, m0
    cvtdq2ps     m1, m1
    %if stereo
    mulps        m2, m0, mx_1_0
    mulps        m3, m1, mx_1_0
    %endif
    mulps        m0, m0, mx_0_0
    mulps        m1, m1, mx_0_0
%assign %%i 1
%rep (in_channels - 1)
    %if copy_src_from_stack
    %define src_ptr src5q
    mov     src_ptr, src %+ %%i %+ m
    %else
    %define src_ptr src %+ %%i %+ q
    %endif
    %if stereo
    ; m4/m5 hold the converted input, m6 is the multiply-add temporary
    mova         m4, [src_ptr+lenq]
    S16_TO_S32_SX 4, 5
    cvtdq2ps     m4, m4
    cvtdq2ps     m5, m5
    FMULADD_PS   m2, m4, mx_1_ %+ %%i, m2, m6
    FMULADD_PS   m3, m5, mx_1_ %+ %%i, m3, m6
    ; the input is dead after this, so it doubles as the temporary
    FMULADD_PS   m0, m4, mx_0_ %+ %%i, m0, m4
    FMULADD_PS   m1, m5, mx_0_ %+ %%i, m1, m5
    %else
    ; m2/m3 hold the converted input, m4 is the multiply-add temporary
    mova         m2, [src_ptr+lenq]
    S16_TO_S32_SX 2, 3
    cvtdq2ps     m2, m2
    cvtdq2ps     m3, m3
    FMULADD_PS   m0, m2, mx_0_ %+ %%i, m0, m4
    FMULADD_PS   m1, m3, mx_0_ %+ %%i, m1, m4
    %endif
    %assign %%i %%i+1
%endrep
    ; float -> s32 -> saturated s16, second output first
    %if stereo
    cvtps2dq     m2, m2
    cvtps2dq     m3, m3
    packssdw     m2, m3
    mova [src1q+lenq], m2
    %endif
    cvtps2dq     m0, m0
    cvtps2dq     m1, m1
    packssdw     m0, m1
    mova [src0q+lenq], m0
%else
    ; ---- fltp input ------------------------------------------------------
    ; channel 0 seeds the accumulators: m0 = output 0, m1 = output 1.
    ; For mono with the coefficient in a register the sample is used straight
    ; from memory, which saves a separate load.
    %if stereo || mx_stack_0_0
    mova         m0, [src0q+lenq]
    %if stereo
    mulps        m1, m0, mx_1_0
    %endif
    mulps        m0, m0, mx_0_0
    %else
    mulps        m0, mx_0_0, [src0q+lenq]
    %endif
%assign %%i 1
%rep (in_channels - 1)
    %if copy_src_from_stack
    %define src_ptr src5q
    mov     src_ptr, src %+ %%i %+ m
    %else
    %define src_ptr src %+ %%i %+ q
    %endif
    ; same trick as for channel 0: only load the sample into a register when
    ; it is needed twice (stereo) or the coefficient is the memory operand
    %if stereo || mx_stack_0_ %+ %%i
    mova         m2, [src_ptr+lenq]
    %if stereo
    FMULADD_PS   m1, m2, mx_1_ %+ %%i, m1, m3
    %endif
    FMULADD_PS   m0, m2, mx_0_ %+ %%i, m0, m2
    %else
    FMULADD_PS   m0, mx_0_ %+ %%i, [src_ptr+lenq], m0, m1
    %endif
    %assign %%i %%i+1
%endrep
    mova [src0q+lenq], m0
    %if stereo
    mova [src1q+lenq], m1
    %endif
%endif

    ; advance one vector; the offset is negative until the buffers are done
    add        lenq, mmsize
    jl .next
; clear the upper ymm halves before returning from a ymm build
%if mmsize == 32
    vzeroupper
%endif
    RET
%endmacro
469 | |||
;-----------------------------------------------------------------------------
; Instantiates MIX_3_8_TO_1_2_FLT for every input channel count from 3 to 8,
; for mono and stereo output, for both sample formats, and for each
; instruction set:
;     fltp : sse, plus avx and fma4 when the assembler supports them
;     s16p : sse2, sse4, plus avx and fma4 (always xmm-sized)
; The fltp AVX/FMA4 builds use ymm registers except on x86-32 with 6 or more
; input channels, where they fall back to xmm because of stack alignment
; issues.
;-----------------------------------------------------------------------------
%macro MIX_3_8_TO_1_2_FLT_FUNCS 0
%assign %%i 3                               ; current input channel count
%rep 6                                      ; 3, 4, 5, 6, 7, 8
    ; baseline builds
    INIT_XMM sse
    MIX_3_8_TO_1_2_FLT %%i, 1, fltp
    MIX_3_8_TO_1_2_FLT %%i, 2, fltp
    INIT_XMM sse2
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    INIT_XMM sse4
    MIX_3_8_TO_1_2_FLT %%i, 1, s16p
    MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    ; do not use ymm AVX or FMA4 in x86-32 for 6 or more channels due to
    ; stack alignment issues
    %if HAVE_AVX_EXTERNAL
        %if ARCH_X86_64 || %%i < 6
            INIT_YMM avx
        %else
            INIT_XMM avx
        %endif
        MIX_3_8_TO_1_2_FLT %%i, 1, fltp
        MIX_3_8_TO_1_2_FLT %%i, 2, fltp
        INIT_XMM avx
        MIX_3_8_TO_1_2_FLT %%i, 1, s16p
        MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    %endif
    %if HAVE_FMA4_EXTERNAL
        %if ARCH_X86_64 || %%i < 6
            INIT_YMM fma4
        %else
            INIT_XMM fma4
        %endif
        MIX_3_8_TO_1_2_FLT %%i, 1, fltp
        MIX_3_8_TO_1_2_FLT %%i, 2, fltp
        INIT_XMM fma4
        MIX_3_8_TO_1_2_FLT %%i, 1, s16p
        MIX_3_8_TO_1_2_FLT %%i, 2, s16p
    %endif
    %assign %%i %%i+1
%endrep
%endmacro

; emit all of the functions; the macro takes no arguments
MIX_3_8_TO_1_2_FLT_FUNCS