Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4349 | Serge | 1 | ;***************************************************************************** |
2 | ;* MMX/SSE2/AVX-optimized 10-bit H.264 iDCT code |
||
3 | ;***************************************************************************** |
||
4 | ;* Copyright (C) 2005-2011 x264 project |
||
5 | ;* |
||
6 | ;* Authors: Daniel Kang |
||
7 | ;* |
||
8 | ;* This file is part of FFmpeg. |
||
9 | ;* |
||
10 | ;* FFmpeg is free software; you can redistribute it and/or |
||
11 | ;* modify it under the terms of the GNU Lesser General Public |
||
12 | ;* License as published by the Free Software Foundation; either |
||
13 | ;* version 2.1 of the License, or (at your option) any later version. |
||
14 | ;* |
||
15 | ;* FFmpeg is distributed in the hope that it will be useful, |
||
16 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
17 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
18 | ;* Lesser General Public License for more details. |
||
19 | ;* |
||
20 | ;* You should have received a copy of the GNU Lesser General Public |
||
21 | ;* License along with FFmpeg; if not, write to the Free Software |
||
22 | ;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
23 | ;****************************************************************************** |
||
24 | |||
%include "libavutil/x86/x86util.asm"

SECTION_RODATA

; maximum legal 10-bit pixel value: (1 << 10) - 1 = 1023, in 8 words
pw_pixel_max: times 8 dw ((1 << 10)-1)
; rounding bias added before the final >>6 shift of the iDCT
pd_32:        times 4 dd 32

SECTION .text
||
33 | |||
34 | ;----------------------------------------------------------------------------- |
||
35 | ; void h264_idct_add(pixel *dst, dctcoef *block, int stride) |
||
36 | ;----------------------------------------------------------------------------- |
||
;-----------------------------------------------------------------------------
; STORE_DIFFx2 resid0, resid1, tmp, zero, dst, stride
; Finishes two rows of 4 dword residuals (>>6), packs them to 8 words,
; adds the pixel rows at [dst] and [dst+stride], clips the sums to
; [zero, pw_pixel_max] and stores them back to the same two rows.
;-----------------------------------------------------------------------------
%macro STORE_DIFFx2 6
    psrad       %1, 6                  ; residual >>= 6 (bias was added earlier)
    psrad       %2, 6
    packssdw    %1, %2                 ; 8 dwords -> 8 words: row0 | row1
    movq        %3, [%5]               ; row0: 4 x 10-bit pixels
    movhps      %3, [%5+%6]            ; row1 into the high half
    paddsw      %1, %3                 ; pixel + residual, saturating
    CLIPW       %1, %4, [pw_pixel_max] ; clamp to the legal 10-bit range
    movq        [%5], %1
    movhps      [%5+%6], %1
%endmacro
||
48 | |||
;-----------------------------------------------------------------------------
; STORE_DIFF16 resid0, resid1, min, max, dst
; Like STORE_DIFFx2 but for one aligned row of 8 pixels; both clip
; bounds are supplied in registers (min = zero, max = pw_pixel_max).
;-----------------------------------------------------------------------------
%macro STORE_DIFF16 5
    psrad       %1, 6
    psrad       %2, 6
    packssdw    %1, %2                 ; 8 dwords -> 8 words
    paddsw      %1, [%5]               ; add one full pixel row
    CLIPW       %1, %3, %4
    mova        [%5], %1
%endmacro
||
57 | |||
58 | ;dst, in, stride |
||
; IDCT4_ADD_10 dst, in, stride
; Full 4x4 iDCT on 32-bit coefficients: column pass, transpose, +32
; rounding bias, row pass; clears the coefficient block and adds the
; rounded result to the destination pixels (via STORE_DIFFx2).
%macro IDCT4_ADD_10 3
    mova        m0, [%2+ 0]
    mova        m1, [%2+16]
    mova        m2, [%2+32]
    mova        m3, [%2+48]
    IDCT4_1D    d,0,1,2,3,4,5          ; vertical (column) pass
    TRANSPOSE4x4D 0,1,2,3,4
    paddd       m0, [pd_32]            ; bias so the final >>6 rounds
    IDCT4_1D    d,0,1,2,3,4,5          ; horizontal (row) pass
    pxor        m5, m5
    mova        [%2+ 0], m5            ; zero the block for the next decode
    mova        [%2+16], m5
    mova        [%2+32], m5
    mova        [%2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, %1, %3
    lea         %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, %1, %3
%endmacro
||
77 | |||
; void h264_idct_add_10(pixel *dst, dctcoef *block, int stride)
%macro IDCT_ADD_10 0
cglobal h264_idct_add_10, 3,3
    IDCT4_ADD_10 r0, r1, r2
    RET
%endmacro

INIT_XMM sse2
IDCT_ADD_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD_10
%endif
||
90 | |||
91 | ;----------------------------------------------------------------------------- |
||
92 | ; h264_idct_add16(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) |
||
93 | ;----------------------------------------------------------------------------- |
||
94 | ;;;;;;; NO FATE SAMPLES TRIGGER THIS |
||
; Non-cglobal helper shared by the add16/add16intra/add8 entry points.
; In:  r0 = dst base, r2 = coefficient block, r3 = stride,
;      r5 = byte offset of this 4x4 block within dst (added to r0 here).
; Same iDCT body as IDCT4_ADD_10; clobbers m0-m5 and r5.
%macro ADD4x4IDCT 0
add4x4_idct %+ SUFFIX:
    add         r5, r0                 ; r5 = dst + block_offset
    mova        m0, [r2+ 0]
    mova        m1, [r2+16]
    mova        m2, [r2+32]
    mova        m3, [r2+48]
    IDCT4_1D    d,0,1,2,3,4,5          ; column pass
    TRANSPOSE4x4D 0,1,2,3,4
    paddd       m0, [pd_32]            ; rounding bias for the final >>6
    IDCT4_1D    d,0,1,2,3,4,5          ; row pass
    pxor        m5, m5
    mova        [r2+ 0], m5            ; clear the coefficient block
    mova        [r2+16], m5
    mova        [r2+32], m5
    mova        [r2+48], m5
    STORE_DIFFx2 m0, m1, m4, m5, r5, r3
    lea         r5, [r5+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, r5, r3
    ret
%endmacro

INIT_XMM sse2
ALIGN 16
ADD4x4IDCT
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
ALIGN 16
ADD4x4IDCT
%endif
||
125 | |||
; ADD16_OP idx, nnz_offset
; If the non-zero-count byte at r4+nnz_offset is zero, skip 4x4 block
; idx; otherwise load its dst offset from block_offset[idx] (r1) and
; run the shared add4x4_idct helper.  r2 then advances 64 bytes to the
; next coefficient block, except after the last (idx 15).
%macro ADD16_OP 2
    cmp         byte [r4+%2], 0
    jz .skipblock%1
    mov         r5d, [r1+%1*4]
    call        add4x4_idct %+ SUFFIX
.skipblock%1:
%if %1<15
    add         r2, 64
%endif
%endmacro
||
136 | |||
; void h264_idct_add16_10(pixel *dst, const int *block_offset,
;                         dctcoef *block, int stride,
;                         const uint8_t nnzc[6*8])
; Runs the 4x4 iDCT+add over all 16 luma blocks of a macroblock; the
; second argument of each ADD16_OP is that block's index into nnzc[].
%macro IDCT_ADD16_10 0
cglobal h264_idct_add16_10, 5,6
    ADD16_OP  0, 4+1*8
    ADD16_OP  1, 5+1*8
    ADD16_OP  2, 4+2*8
    ADD16_OP  3, 5+2*8
    ADD16_OP  4, 6+1*8
    ADD16_OP  5, 7+1*8
    ADD16_OP  6, 6+2*8
    ADD16_OP  7, 7+2*8
    ADD16_OP  8, 4+3*8
    ADD16_OP  9, 5+3*8
    ADD16_OP 10, 4+4*8
    ADD16_OP 11, 5+4*8
    ADD16_OP 12, 6+3*8
    ADD16_OP 13, 7+3*8
    ADD16_OP 14, 6+4*8
    ADD16_OP 15, 7+4*8
    REP_RET
%endmacro

INIT_XMM sse2
IDCT_ADD16_10
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD16_10
%endif
||
164 | |||
165 | ;----------------------------------------------------------------------------- |
||
166 | ; void h264_idct_dc_add(pixel *dst, dctcoef *block, int stride) |
||
167 | ;----------------------------------------------------------------------------- |
||
; IDCT_DC_ADD_OP_10 dst, stride, stride*3
; Adds the DC value broadcast across m0 to 4 rows of pixels and clips
; each result to [0, m6]; m5 is zeroed here as the lower clip bound.
; m6 must already hold pw_pixel_max.  Clobbers m1-m5.
%macro IDCT_DC_ADD_OP_10 3
    pxor        m5, m5
%if avx_enabled
    ; 3-operand AVX forms: add straight from memory, no separate copies
    paddw       m1, m0, [%1+0   ]
    paddw       m2, m0, [%1+%2  ]
    paddw       m3, m0, [%1+%2*2]
    paddw       m4, m0, [%1+%3  ]
%else
    mova        m1, [%1+0   ]
    mova        m2, [%1+%2  ]
    mova        m3, [%1+%2*2]
    mova        m4, [%1+%3  ]
    paddw       m1, m0
    paddw       m2, m0
    paddw       m3, m0
    paddw       m4, m0
%endif
    CLIPW       m1, m5, m6
    CLIPW       m2, m5, m6
    CLIPW       m3, m5, m6
    CLIPW       m4, m5, m6
    mova        [%1+0   ], m1
    mova        [%1+%2  ], m2
    mova        [%1+%2*2], m3
    mova        [%1+%3  ], m4
%endmacro
||
194 | |||
INIT_MMX mmxext
; void h264_idct_dc_add_10(pixel *dst, dctcoef *block, int stride)
; DC-only 4x4 path: round the single DC coefficient, broadcast it and
; add it to the 4x4 pixel region.
cglobal h264_idct_dc_add_10,3,3
    movd        m0, [r1]
    mov         dword [r1], 0          ; clear the DC coefficient
    paddd       m0, [pd_32]
    psrad       m0, 6                  ; dc = (block[0] + 32) >> 6
    lea         r1, [r2*3]             ; r1 = 3*stride (block ptr is dead now)
    pshufw      m0, m0, 0              ; broadcast dc to all 4 words
    mova        m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1
    RET
||
206 | |||
207 | ;----------------------------------------------------------------------------- |
||
208 | ; void h264_idct8_dc_add(pixel *dst, dctcoef *block, int stride) |
||
209 | ;----------------------------------------------------------------------------- |
||
; void h264_idct8_dc_add_10(pixel *dst, dctcoef *block, int stride)
; DC-only 8x8 path: broadcast the rounded DC and add it to 8 rows of
; 8 pixels, 4 rows per IDCT_DC_ADD_OP_10 invocation.
%macro IDCT8_DC_ADD 0
cglobal h264_idct8_dc_add_10,3,4,7
    movd        m0, [r1]
    mov         dword[r1], 0           ; clear the DC coefficient
    paddd       m0, [pd_32]
    psrad       m0, 6                  ; dc = (block[0] + 32) >> 6
    lea         r1, [r2*3]             ; r1 = 3*stride
    SPLATW      m0, m0, 0              ; broadcast dc across the register
    mova        m6, [pw_pixel_max]
    IDCT_DC_ADD_OP_10 r0, r2, r1       ; rows 0-3
    lea         r0, [r0+r2*4]
    IDCT_DC_ADD_OP_10 r0, r2, r1       ; rows 4-7
    RET
%endmacro

INIT_XMM sse2
IDCT8_DC_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_DC_ADD
%endif
||
231 | |||
232 | ;----------------------------------------------------------------------------- |
||
233 | ; h264_idct_add16intra(pixel *dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) |
||
234 | ;----------------------------------------------------------------------------- |
||
; AC idx
; Out-of-line "AC path" used by ADD16_OP_INTRA: run the full 4x4 iDCT
; on blocks idx and idx+1, advancing r2 past both coefficient blocks,
; then rejoin the inline code at .skipadd<idx>.
%macro AC 1
.ac%1:
    mov         r5d, [r1+(%1+0)*4]
    call        add4x4_idct %+ SUFFIX
    mov         r5d, [r1+(%1+1)*4]
    add         r2, 64
    call        add4x4_idct %+ SUFFIX
    add         r2, 64
    jmp .skipadd%1
%endmacro
||
245 | |||
%assign last_block 16
; ADD16_OP_INTRA idx, nnz_offset
; Handles 4x4 blocks idx and idx+1 as a pair:
;   - non-zero nnz word      -> jump to the full-iDCT path .ac<idx>
;   - else non-zero DC coeff -> cheap DC-only add via idct_dc_add
;   - else                   -> skip entirely
; r2 advances by 128 (two 64-byte coefficient blocks) for every pair
; except the one ending at last_block; the AC path does its own
; advance and rejoins at .skipadd<idx>.
%macro ADD16_OP_INTRA 2
    cmp         word [r4+%2], 0
    jnz .ac%1
    mov         r5d, [r2+ 0]
    or          r5d, [r2+64]
    jz .skipblock%1
    mov         r5d, [r1+(%1+0)*4]
    call        idct_dc_add %+ SUFFIX
.skipblock%1:
%if %1 < last_block-2
    ; bound must be last_block-2 so the first pair (idx 0 / idx 16)
    ; still advances r2 and only the final pair leaves it untouched
    add         r2, 128
%endif
.skipadd%1:
%endmacro
||
261 | |||
262 | %macro IDCT_ADD16INTRA_10 0 |
||
263 | idct_dc_add %+ SUFFIX: |
||
264 | add r5, r0 |
||
265 | movq m0, [r2+ 0] |
||
266 | movhps m0, [r2+64] |
||
267 | mov dword [r2+ 0], 0 |
||
268 | mov dword [r2+64], 0 |
||
269 | paddd m0, [pd_32] |
||
270 | psrad m0, 6 |
||
271 | pshufhw m0, m0, 0 |
||
272 | pshuflw m0, m0, 0 |
||
273 | lea r6, [r3*3] |
||
274 | mova m6, [pw_pixel_max] |
||
275 | IDCT_DC_ADD_OP_10 r5, r3, r6 |
||
276 | ret |
||
277 | |||
278 | cglobal h264_idct_add16intra_10,5,7,8 |
||
279 | ADD16_OP_INTRA 0, 4+1*8 |
||
280 | ADD16_OP_INTRA 2, 4+2*8 |
||
281 | ADD16_OP_INTRA 4, 6+1*8 |
||
282 | ADD16_OP_INTRA 6, 6+2*8 |
||
283 | ADD16_OP_INTRA 8, 4+3*8 |
||
284 | ADD16_OP_INTRA 10, 4+4*8 |
||
285 | ADD16_OP_INTRA 12, 6+3*8 |
||
286 | ADD16_OP_INTRA 14, 6+4*8 |
||
287 | REP_RET |
||
288 | AC 8 |
||
289 | AC 10 |
||
290 | AC 12 |
||
291 | AC 14 |
||
292 | AC 0 |
||
293 | AC 2 |
||
294 | AC 4 |
||
295 | AC 6 |
||
296 | %endmacro |
||
297 | |||
298 | INIT_XMM sse2 |
||
299 | IDCT_ADD16INTRA_10 |
||
300 | %if HAVE_AVX_EXTERNAL |
||
301 | INIT_XMM avx |
||
302 | IDCT_ADD16INTRA_10 |
||
303 | %endif |
||
304 | |||
%assign last_block 36
; void h264_idct_add8_10(pixel **dst, const int *block_offset,
;                        dctcoef *block, int stride,
;                        const uint8_t nnzc[6*8])
; Chroma variant: dst is an array of two plane pointers; processes two
; block pairs per plane, reusing ADD16_OP_INTRA / AC / idct_dc_add.
%macro IDCT_ADD8 0
cglobal h264_idct_add8_10,5,8,7
%if ARCH_X86_64
    mov         r7, r0                 ; keep the dst array pointer
%endif
    add         r2, 1024               ; advance to this function's blocks
    mov         r0, [r0]               ; r0 = dst[0] (first plane)
    ADD16_OP_INTRA 16, 4+ 6*8
    ADD16_OP_INTRA 18, 4+ 7*8
    add         r2, 1024-128*2         ; skip to the second plane's coeffs
%if ARCH_X86_64
    mov         r0, [r7+gprsize]       ; r0 = dst[1]
%else
    ; x86-32: reload the dst array pointer from the stack argument
    mov         r0, r0m
    mov         r0, [r0+gprsize]
%endif
    ADD16_OP_INTRA 32, 4+11*8
    ADD16_OP_INTRA 34, 4+12*8
    REP_RET
    ; out-of-line AC paths for the four block pairs above
    AC 16
    AC 18
    AC 32
    AC 34

%endmacro ; IDCT_ADD8

INIT_XMM sse2
IDCT_ADD8
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT_ADD8
%endif
||
341 | |||
342 | ;----------------------------------------------------------------------------- |
||
343 | ; void h264_idct8_add(pixel *dst, dctcoef *block, int stride) |
||
344 | ;----------------------------------------------------------------------------- |
||
; IDCT8_1D mem0, mem4
; One 8-point iDCT pass over dword rows.  Rows 1,2,3,5,6,7 must be in
; m1,m2,m3,m5,m6,m7; rows 0 and 4 are taken from the two memory
; operands late, once registers have been freed.  Results end up in
; m0-m7 in natural row order via the final SWAP.
; NOTE(review): the shift-by-1/by-2 add/sub network below is the
; standard H.264 8-point butterfly; statement order is load-bearing.
%macro IDCT8_1D 2
    SWAP        0, 1
    psrad       m4, m5, 1
    psrad       m1, m0, 1
    paddd       m4, m5
    paddd       m1, m0
    paddd       m4, m7
    paddd       m1, m5
    psubd       m4, m0
    paddd       m1, m3

    psubd       m0, m3
    psubd       m5, m3
    paddd       m0, m7
    psubd       m5, m7
    psrad       m3, 1
    psrad       m7, 1
    psubd       m0, m3
    psubd       m5, m7

    SWAP        1, 7
    psrad       m1, m7, 2
    psrad       m3, m4, 2
    paddd       m3, m0
    psrad       m0, 2
    paddd       m1, m5
    psrad       m5, 2
    psubd       m0, m4
    psubd       m7, m5

    SWAP        5, 6
    psrad       m4, m2, 1
    psrad       m6, m5, 1
    psubd       m4, m5
    paddd       m6, m2

    ; rows 0 and 4 are only loaded now, when m2/m5 are free
    mova        m2, %1
    mova        m5, %2
    SUMSUB_BA   d, 5, 2
    SUMSUB_BA   d, 6, 5
    SUMSUB_BA   d, 4, 2
    SUMSUB_BA   d, 7, 6
    SUMSUB_BA   d, 0, 4
    SUMSUB_BA   d, 3, 2
    SUMSUB_BA   d, 1, 5
    SWAP        7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro
||
392 | |||
; IDCT8_1D_FULL src
; Loads rows 1,2,3,5,6,7 of an 8-row dword block into m1-m3/m5-m7
; (m4 stays free as scratch for IDCT8_1D) and runs IDCT8_1D with rows
; 0 and 4 passed as memory operands.
%macro IDCT8_1D_FULL 1
    mova        m7, [%1+112*2]
    mova        m6, [%1+ 96*2]
    mova        m5, [%1+ 80*2]
    mova        m3, [%1+ 48*2]
    mova        m2, [%1+ 32*2]
    mova        m1, [%1+ 16*2]
    IDCT8_1D    [%1], [%1+ 64*2]
%endmacro
||
402 | |||
403 | ; %1=int16_t *block, %2=int16_t *dstblock |
||
404 | %macro IDCT8_ADD_SSE_START 2 |
||
405 | IDCT8_1D_FULL %1 |
||
406 | %if ARCH_X86_64 |
||
407 | TRANSPOSE4x4D 0,1,2,3,8 |
||
408 | mova [%2 ], m0 |
||
409 | TRANSPOSE4x4D 4,5,6,7,8 |
||
410 | mova [%2+8*2], m4 |
||
411 | %else |
||
412 | mova [%1], m7 |
||
413 | TRANSPOSE4x4D 0,1,2,3,7 |
||
414 | mova m7, [%1] |
||
415 | mova [%2 ], m0 |
||
416 | mova [%2+16*2], m1 |
||
417 | mova [%2+32*2], m2 |
||
418 | mova [%2+48*2], m3 |
||
419 | TRANSPOSE4x4D 4,5,6,7,3 |
||
420 | mova [%2+ 8*2], m4 |
||
421 | mova [%2+24*2], m5 |
||
422 | mova [%2+40*2], m6 |
||
423 | mova [%2+56*2], m7 |
||
424 | %endif |
||
425 | %endmacro |
||
426 | |||
427 | ; %1=uint8_t *dst, %2=int16_t *block, %3=int stride |
||
; IDCT8_ADD_SSE_END dst, block, stride
; Second (row) 1-D pass plus store.  m6/m7 are spilled to the block
; buffer so they can serve as scratch/zero for STORE_DIFFx2; the eight
; result rows are added to the destination two rows at a time.
%macro IDCT8_ADD_SSE_END 3
    IDCT8_1D_FULL %2
    mova        [%2     ], m6          ; spill rows 6/7 to free registers
    mova        [%2+16*2], m7

    pxor        m7, m7                 ; zero for CLIPW inside STORE_DIFFx2
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
    lea         %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m6, m7, %1, %3
    mova        m0, [%2     ]          ; reload spilled rows 6/7
    mova        m1, [%2+16*2]
    lea         %1, [%1+%3*2]
    STORE_DIFFx2 m4, m5, m6, m7, %1, %3
    lea         %1, [%1+%3*2]
    STORE_DIFFx2 m0, m1, m6, m7, %1, %3
%endmacro
||
444 | |||
; void h264_idct8_add_10(pixel *dst, dctcoef *block, int stride)
; Full 8x8 iDCT + add.  On x86-64 the whole transform is held in
; xmm0-xmm15 with a 256-byte scratch area for the transposed halves;
; on x86-32 both passes round-trip through the stack buffer.
%macro IDCT8_ADD 0
cglobal h264_idct8_add_10, 3,4,16
%if UNIX64 == 0
    ; non-UNIX64 ABIs: realign the stack before the internal call below
    %assign pad 16-gprsize-(stack_offset&15)
    sub         rsp, pad
    call        h264_idct8_add1_10 %+ SUFFIX
    add         rsp, pad
    RET
%endif

ALIGN 16
; TODO: does not need to use stack
; internal entry point, also called directly by h264_idct8_add4
h264_idct8_add1_10 %+ SUFFIX:
%assign pad 256+16-gprsize
    sub         rsp, pad
    add         dword [r1], 32         ; fold the +32 rounding bias into DC

%if ARCH_X86_64
    IDCT8_ADD_SSE_START r1, rsp        ; column pass, left 4 columns
    SWAP        1, 9                   ; park results in xmm8-15
    SWAP        2, 10
    SWAP        3, 11
    SWAP        5, 13
    SWAP        6, 14
    SWAP        7, 15
    IDCT8_ADD_SSE_START r1+16, rsp+128 ; column pass, right 4 columns
    ; interleave both halves into the register layout of the row pass
    PERMUTE 1,9, 2,10, 3,11, 5,1, 6,2, 7,3, 9,13, 10,14, 11,15, 13,5, 14,6, 15,7
    IDCT8_1D    [rsp], [rsp+128]       ; row pass, rows 0-3
    SWAP        0, 8
    SWAP        1, 9
    SWAP        2, 10
    SWAP        3, 11
    SWAP        4, 12
    SWAP        5, 13
    SWAP        6, 14
    SWAP        7, 15
    IDCT8_1D    [rsp+16], [rsp+144]    ; row pass, rows 4-7
    ; store row 0 by hand (m8 pairs with m0 across the two halves)
    psrad       m8, 6
    psrad       m0, 6
    packssdw    m8, m0
    paddsw      m8, [r0]
    pxor        m0, m0                 ; zero: clip bound + block clearing
    mova        [r1+  0], m0
    mova        [r1+ 16], m0
    mova        [r1+ 32], m0
    mova        [r1+ 48], m0
    mova        [r1+ 64], m0
    mova        [r1+ 80], m0
    mova        [r1+ 96], m0
    mova        [r1+112], m0
    mova        [r1+128], m0
    mova        [r1+144], m0
    mova        [r1+160], m0
    mova        [r1+176], m0
    mova        [r1+192], m0
    mova        [r1+208], m0
    mova        [r1+224], m0
    mova        [r1+240], m0
    CLIPW       m8, m0, [pw_pixel_max]
    mova        [r0], m8
    mova        m8, [pw_pixel_max]     ; reuse m8 as the clip max from here on
    STORE_DIFF16 m9,  m1, m0, m8, r0+r2
    lea         r0, [r0+r2*2]
    STORE_DIFF16 m10, m2, m0, m8, r0
    STORE_DIFF16 m11, m3, m0, m8, r0+r2
    lea         r0, [r0+r2*2]
    STORE_DIFF16 m12, m4, m0, m8, r0
    STORE_DIFF16 m13, m5, m0, m8, r0+r2
    lea         r0, [r0+r2*2]
    STORE_DIFF16 m14, m6, m0, m8, r0
    STORE_DIFF16 m15, m7, m0, m8, r0+r2
%else
    IDCT8_ADD_SSE_START r1,    rsp     ; column pass, both halves via stack
    IDCT8_ADD_SSE_START r1+16, rsp+128
    lea         r3, [r0+8]             ; right 4-column half of dst
    IDCT8_ADD_SSE_END r0, rsp,    r2   ; row pass + store, left half
    IDCT8_ADD_SSE_END r3, rsp+16, r2   ; row pass + store, right half
    ; m7 is zero after IDCT8_ADD_SSE_END; clear the coefficient block
    mova        [r1+  0], m7
    mova        [r1+ 16], m7
    mova        [r1+ 32], m7
    mova        [r1+ 48], m7
    mova        [r1+ 64], m7
    mova        [r1+ 80], m7
    mova        [r1+ 96], m7
    mova        [r1+112], m7
    mova        [r1+128], m7
    mova        [r1+144], m7
    mova        [r1+160], m7
    mova        [r1+176], m7
    mova        [r1+192], m7
    mova        [r1+208], m7
    mova        [r1+224], m7
    mova        [r1+240], m7
%endif ; ARCH_X86_64

    add         rsp, pad
    ret
%endmacro

INIT_XMM sse2
IDCT8_ADD
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD
%endif
||
550 | |||
551 | ;----------------------------------------------------------------------------- |
||
552 | ; h264_idct8_add4(pixel **dst, const int *block_offset, dctcoef *block, int stride, const uint8_t nnzc[6*8]) |
||
553 | ;----------------------------------------------------------------------------- |
||
554 | ;;;;;;; NO FATE SAMPLES TRIGGER THIS |
||
; IDCT8_ADD4_OP idx, nnz_offset
; Process one 8x8 block for h264_idct8_add4: skip when the nnz byte is
; zero, otherwise compute r0 = dst + block_offset[idx] and call the
; shared 8x8 routine.  r1 (coeffs) advances 256 bytes per block except
; after the last (idx 12).
%macro IDCT8_ADD4_OP 2
    cmp         byte [r4+%2], 0
    jz .skipblock%1
    mov         r0d, [r6+%1*4]
    add         r0, r5
    call        h264_idct8_add1_10 %+ SUFFIX
.skipblock%1:
%if %1<12
    add         r1, 256
%endif
%endmacro
||
566 | |||
; void h264_idct8_add4_10(pixel *dst, const int *block_offset,
;                         dctcoef *block, int stride,
;                         const uint8_t nnzc[6*8])
; Loads all arguments from the stack (0 register args declared) into
; the register layout expected by h264_idct8_add1_10, then runs the
; four 8x8 luma blocks.
%macro IDCT8_ADD4 0
cglobal h264_idct8_add4_10, 0,7,16
    %assign pad 16-gprsize-(stack_offset&15)
    SUB         rsp, pad               ; keep stack aligned across the calls
    mov         r5, r0mp               ; dst base
    mov         r6, r1mp               ; block_offset
    mov         r1, r2mp               ; coefficient blocks
    mov         r2d, r3m               ; stride
    movifnidn   r4, r4mp               ; nnzc
    IDCT8_ADD4_OP  0, 4+1*8
    IDCT8_ADD4_OP  4, 6+1*8
    IDCT8_ADD4_OP  8, 4+3*8
    IDCT8_ADD4_OP 12, 6+3*8
    ADD         rsp, pad
    RET
%endmacro ; IDCT8_ADD4

INIT_XMM sse2
IDCT8_ADD4
%if HAVE_AVX_EXTERNAL
INIT_XMM avx
IDCT8_ADD4
%endif