;*****************************************************************************
;* MMX/SSE2-optimized H.264 iDCT
;*****************************************************************************
;* Copyright (C) 2004-2005 Michael Niedermayer, Loren Merritt
;* Copyright (C) 2003-2008 x264 project
;*
;* Authors: Laurent Aimar
;*          Loren Merritt
;*          Holger Lubitz
;*          Min Chen
;*
;* This file is part of FFmpeg.
;*
;* FFmpeg is free software; you can redistribute it and/or
;* modify it under the terms of the GNU Lesser General Public
;* License as published by the Free Software Foundation; either
;* version 2.1 of the License, or (at your option) any later version.
;*
;* FFmpeg is distributed in the hope that it will be useful,
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
;* Lesser General Public License for more details.
;*
;* You should have received a copy of the GNU Lesser General Public
;* License along with FFmpeg; if not, write to the Free Software
;* Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
;*****************************************************************************

%include "libavutil/x86/x86util.asm"

SECTION_RODATA

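; scan8_mem maps a 4x4 block index (the loop counter used by the add16/add8
; functions below) to that block's byte offset inside the decoder's
; 8-entries-per-row non_zero_count cache, so "byte [scan8+r5]" followed by a
; lookup in the nnzc array tells whether the block has any coefficients.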
scan8_mem: db  4+ 1*8, 5+ 1*8, 4+ 2*8, 5+ 2*8
           db  6+ 1*8, 7+ 1*8, 6+ 2*8, 7+ 2*8
           db  4+ 3*8, 5+ 3*8, 4+ 4*8, 5+ 4*8
           db  6+ 3*8, 7+ 3*8, 6+ 4*8, 7+ 4*8
           db  4+ 6*8, 5+ 6*8, 4+ 7*8, 5+ 7*8
           db  6+ 6*8, 7+ 6*8, 6+ 7*8, 7+ 7*8
           db  4+ 8*8, 5+ 8*8, 4+ 9*8, 5+ 9*8
           db  6+ 8*8, 7+ 8*8, 6+ 9*8, 7+ 9*8
           db  4+11*8, 5+11*8, 4+12*8, 5+12*8
           db  6+11*8, 7+11*8, 6+12*8, 7+12*8
           db  4+13*8, 5+13*8, 4+14*8, 5+14*8
           db  6+13*8, 7+13*8, 6+14*8, 7+14*8
%ifdef PIC
%define npicregs 1
%define scan8 picregq
%else
%define npicregs 0
%define scan8 scan8_mem
%endif

cextern pw_32
cextern pw_1

SECTION .text

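; Note on the 4x4 path: IDCT4_1D (from x86util.asm) applies the H.264 4x4
; inverse transform butterfly to each row, roughly
;     z0 = b0 + b2          z2 = (b1 >> 1) - b3
;     z1 = b0 - b2          z3 = b1 + (b3 >> 1)
;     out = { z0 + z3, z1 + z2, z1 - z2, z0 - z3 }
; Adding 32 to row 0 between the row and column passes makes the final >> 6
; inside STORE_DIFFx2 round to nearest before the residual is added to dst.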
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT4_ADD 3
    ; Load dct coeffs
    movq      m0, [%2]
    movq      m1, [%2+8]
    movq      m2, [%2+16]
    movq      m3, [%2+24]

    IDCT4_1D  w, 0, 1, 2, 3, 4, 5
    mova      m6, [pw_32]
    TRANSPOSE4x4W 0, 1, 2, 3, 4
    paddw     m0, m6
    IDCT4_1D  w, 0, 1, 2, 3, 4, 5
    pxor      m7, m7
    movq      [%2+ 0], m7
    movq      [%2+ 8], m7
    movq      [%2+16], m7
    movq      [%2+24], m7

    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, %1, %3
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; ff_h264_idct_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_add_8, 3, 3, 0
    IDCT4_ADD r0, r1, r2
    RET

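; IDCT8_1D is one pass of the 8-point H.264 inverse transform. Rows 1,2,3 and
; 5,6,7 are expected in m1-m3/m5-m7 (loaded by IDCT8_1D_FULL below); rows 0
; and 4 come from the memory operands %1/%2. The first blocks build the
; odd/even intermediate terms with the spec's >>1 and >>2 shifts, the
; SUMSUB_BA ladder combines the halves, and the final SWAP restores the
; results to 0..7 order.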
%macro IDCT8_1D 2
    mova      m0, m1
    psraw     m1, 1
    mova      m4, m5
    psraw     m4, 1
    paddw     m4, m5
    paddw     m1, m0
    paddw     m4, m7
    paddw     m1, m5
    psubw     m4, m0
    paddw     m1, m3

    psubw     m0, m3
    psubw     m5, m3
    psraw     m3, 1
    paddw     m0, m7
    psubw     m5, m7
    psraw     m7, 1
    psubw     m0, m3
    psubw     m5, m7

    mova      m7, m1
    psraw     m1, 2
    mova      m3, m4
    psraw     m3, 2
    paddw     m3, m0
    psraw     m0, 2
    paddw     m1, m5
    psraw     m5, 2
    psubw     m0, m4
    psubw     m7, m5

    mova      m5, m6
    psraw     m6, 1
    mova      m4, m2
    psraw     m4, 1
    paddw     m6, m2
    psubw     m4, m5

    mova      m2, %1
    mova      m5, %2
    SUMSUB_BA w, 5, 2
    SUMSUB_BA w, 6, 5
    SUMSUB_BA w, 4, 2
    SUMSUB_BA w, 7, 6
    SUMSUB_BA w, 0, 4
    SUMSUB_BA w, 3, 2
    SUMSUB_BA w, 1, 5
    SWAP      7, 6, 4, 5, 2, 3, 1, 0 ; 70315246 -> 01234567
%endmacro

%macro IDCT8_1D_FULL 1
    mova      m7, [%1+112]
    mova      m6, [%1+ 96]
    mova      m5, [%1+ 80]
    mova      m3, [%1+ 48]
    mova      m2, [%1+ 32]
    mova      m1, [%1+ 16]
    IDCT8_1D  [%1], [%1+ 64]
%endmacro
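
; MMX registers hold only four 16-bit coefficients, so the 8x8 transform is
; done in halves: IDCT8_ADD_MMX_START runs the row pass on one 8x4 half of
; the block and transposes it into a 16-byte-pitch scratch buffer, then
; IDCT8_ADD_MMX_END runs the column pass on a 4-column slice of that buffer
; and adds the reconstructed residual to dst.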

; %1=int16_t *block, %2=int16_t *dstblock
%macro IDCT8_ADD_MMX_START 2
    IDCT8_1D_FULL %1
    mova      [%1], m7
    TRANSPOSE4x4W 0, 1, 2, 3, 7
    mova      m7, [%1]
    mova      [%2   ], m0
    mova      [%2+16], m1
    mova      [%2+32], m2
    mova      [%2+48], m3
    TRANSPOSE4x4W 4, 5, 6, 7, 3
    mova      [%2+ 8], m4
    mova      [%2+24], m5
    mova      [%2+40], m6
    mova      [%2+56], m7
%endmacro

; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_MMX_END 3-4
    IDCT8_1D_FULL %2
    mova      [%2   ], m5
    mova      [%2+16], m6
    mova      [%2+32], m7

    pxor      m7, m7
%if %0 == 4
    movq      [%4+  0], m7
    movq      [%4+  8], m7
    movq      [%4+ 16], m7
    movq      [%4+ 24], m7
    movq      [%4+ 32], m7
    movq      [%4+ 40], m7
    movq      [%4+ 48], m7
    movq      [%4+ 56], m7
    movq      [%4+ 64], m7
    movq      [%4+ 72], m7
    movq      [%4+ 80], m7
    movq      [%4+ 88], m7
    movq      [%4+ 96], m7
    movq      [%4+104], m7
    movq      [%4+112], m7
    movq      [%4+120], m7
%endif
    STORE_DIFFx2 m0, m1, m5, m6, m7, 6, %1, %3
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m2, m3, m5, m6, m7, 6, %1, %3
    mova      m0, [%2   ]
    mova      m1, [%2+16]
    mova      m2, [%2+32]
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m4, m0, m5, m6, m7, 6, %1, %3
    lea       %1, [%1+%3*2]
    STORE_DIFFx2 m1, m2, m5, m6, m7, 6, %1, %3
%endmacro

INIT_MMX mmx
; ff_h264_idct8_add_8_mmx(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 0
    %assign pad 128+4-(stack_offset&7)
    SUB       rsp, pad

    add       word [r1], 32
    IDCT8_ADD_MMX_START r1  , rsp
    IDCT8_ADD_MMX_START r1+8, rsp+64
    lea       r3, [r0+4]
    IDCT8_ADD_MMX_END   r0  , rsp,   r2, r1
    IDCT8_ADD_MMX_END   r3  , rsp+8, r2

    ADD       rsp, pad
    RET

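; SSE2 8x8 path: a full 8-coefficient row fits in one xmm register, so no
; stack scratch buffer is needed. On x86-64 the transpose and second pass use
; xmm8/xmm9 as spares; on x86-32 (only 8 xmm registers) two rows are spilled
; temporarily into the block buffer itself.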
; %1=uint8_t *dst, %2=int16_t *block, %3=int stride
%macro IDCT8_ADD_SSE 4
    IDCT8_1D_FULL %2
%if ARCH_X86_64
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, 8
%else
    TRANSPOSE8x8W 0, 1, 2, 3, 4, 5, 6, 7, [%2], [%2+16]
%endif
    paddw     m0, [pw_32]

%if ARCH_X86_64 == 0
    mova      [%2   ], m0
    mova      [%2+16], m4
    IDCT8_1D  [%2], [%2+ 16]
    mova      [%2   ], m6
    mova      [%2+16], m7
%else
    SWAP      0, 8
    SWAP      4, 9
    IDCT8_1D  m8, m9
    SWAP      6, 8
    SWAP      7, 9
%endif

    pxor      m7, m7
    lea       %4, [%3*3]
    STORE_DIFF m0, m6, m7, [%1     ]
    STORE_DIFF m1, m6, m7, [%1+%3  ]
    STORE_DIFF m2, m6, m7, [%1+%3*2]
    STORE_DIFF m3, m6, m7, [%1+%4  ]
%if ARCH_X86_64 == 0
    mova      m0, [%2   ]
    mova      m1, [%2+16]
%else
    SWAP      0, 8
    SWAP      1, 9
%endif
    mova      [%2+  0], m7
    mova      [%2+ 16], m7
    mova      [%2+ 32], m7
    mova      [%2+ 48], m7
    mova      [%2+ 64], m7
    mova      [%2+ 80], m7
    mova      [%2+ 96], m7
    mova      [%2+112], m7
    lea       %1, [%1+%3*4]
    STORE_DIFF m4, m6, m7, [%1     ]
    STORE_DIFF m5, m6, m7, [%1+%3  ]
    STORE_DIFF m0, m6, m7, [%1+%3*2]
    STORE_DIFF m1, m6, m7, [%1+%4  ]
%endmacro

INIT_XMM sse2
; ff_h264_idct8_add_8_sse2(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_add_8, 3, 4, 10
    IDCT8_ADD_SSE r0, r1, r2, r3
    RET

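; DC-only shortcut: when a block carries just a DC coefficient, every output
; pixel becomes clip(dst + ((dc + 32) >> 6)). DC_ADD_MMXEXT_INIT splats the
; positive part of that delta into m0 and its negation into m1 (packed to
; bytes); DC_ADD_MMXEXT_OP then applies paddusb/psubusb per row, so the
; signed add comes with unsigned-saturation clipping for free.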
%macro DC_ADD_MMXEXT_INIT 2
    add       %1, 32
    sar       %1, 6
    movd      m0, %1d
    lea       %1, [%2*3]
    pshufw    m0, m0, 0
    pxor      m1, m1
    psubw     m1, m0
    packuswb  m0, m0
    packuswb  m1, m1
%endmacro

%macro DC_ADD_MMXEXT_OP 4
    %1        m2, [%2     ]
    %1        m3, [%2+%3  ]
    %1        m4, [%2+%3*2]
    %1        m5, [%2+%4  ]
    paddusb   m2, m0
    paddusb   m3, m0
    paddusb   m4, m0
    paddusb   m5, m0
    psubusb   m2, m1
    psubusb   m3, m1
    psubusb   m4, m1
    psubusb   m5, m1
    %1        [%2     ], m2
    %1        [%2+%3  ], m3
    %1        [%2+%3*2], m4
    %1        [%2+%4  ], m5
%endmacro

INIT_MMX mmxext
; ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
%if ARCH_X86_64
cglobal h264_idct_dc_add_8, 3, 4, 0
    movsx     r3, word [r1]
    mov       dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP movh, r0, r2, r3
    RET

; ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 3, 4, 0
    movsx     r3, word [r1]
    mov       dword [r1], 0
    DC_ADD_MMXEXT_INIT r3, r2
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    lea       r0, [r0+r2*4]
    DC_ADD_MMXEXT_OP mova, r0, r2, r3
    RET
%else
; ff_h264_idct_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct_dc_add_8, 2, 3, 0
    movsx     r2, word [r1]
    mov       dword [r1], 0
    mov       r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP movh, r0, r1, r2
    RET

; ff_h264_idct8_dc_add_8_mmxext(uint8_t *dst, int16_t *block, int stride)
cglobal h264_idct8_dc_add_8, 2, 3, 0
    movsx     r2, word [r1]
    mov       dword [r1], 0
    mov       r1, r2m
    DC_ADD_MMXEXT_INIT r2, r1
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    lea       r0, [r0+r1*4]
    DC_ADD_MMXEXT_OP mova, r0, r1, r2
    RET
%endif

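; The add16/add16intra/add8 entry points below walk a macroblock's 4x4 blocks:
; for each block index, scan8 plus the nnzc array decides whether the block is
; skipped, run through the full IDCT4_ADD, or (in the mmxext variants, when
; only the DC coefficient is present) handled by the cheap DC-only path above.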
INIT_MMX mmx
; ff_h264_idct_add16_8_mmx(uint8_t *dst, const int *block_offset,
;                          int16_t *block, int stride,
;                          const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor       r5, r5
%ifdef PIC
    lea       picregq, [scan8_mem]
%endif
.nextblock:
    movzx     r6, byte [scan8+r5]
    movzx     r6, byte [r4+r6]
    test      r6, r6
    jz .skipblock
    mov       r6d, dword [r1+r5*4]
    lea       r6, [r0+r6]
    IDCT4_ADD r6, r2, r3
.skipblock:
    inc       r5
    add       r2, 32
    cmp       r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct8_add4_8_mmx(uint8_t *dst, const int *block_offset,
;                          int16_t *block, int stride,
;                          const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    %assign pad 128+4-(stack_offset&7)
    SUB       rsp, pad

    xor       r5, r5
%ifdef PIC
    lea       picregq, [scan8_mem]
%endif
.nextblock:
    movzx     r6, byte [scan8+r5]
    movzx     r6, byte [r4+r6]
    test      r6, r6
    jz .skipblock
    mov       r6d, dword [r1+r5*4]
    add       r6, r0
    add       word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3, r2
    mov       r6d, dword [r1+r5*4]
    lea       r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add       r5, 4
    add       r2, 128
    cmp       r5, 16
    jl .nextblock
    ADD       rsp, pad
    RET

INIT_MMX mmxext
; ff_h264_idct_add16_8_mmxext(uint8_t *dst, const int *block_offset,
;                             int16_t *block, int stride,
;                             const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor       r5, r5
%ifdef PIC
    lea       picregq, [scan8_mem]
%endif
.nextblock:
    movzx     r6, byte [scan8+r5]
    movzx     r6, byte [r4+r6]
    test      r6, r6
    jz .skipblock
    cmp       r6, 1
    jnz .no_dc
    movsx     r6, word [r2]
    test      r6, r6
    jz .no_dc
    mov       word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov       r1, r1m
%endif
    inc       r5
    add       r2, 32
    cmp       r5, 16
    jl .nextblock
    REP_RET
.no_dc:
    mov       r6d, dword [r1+r5*4]
    add       r6, r0
    IDCT4_ADD r6, r2, r3
.skipblock:
    inc       r5
    add       r2, 32
    cmp       r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmx
; ff_h264_idct_add16intra_8_mmx(uint8_t *dst, const int *block_offset,
;                               int16_t *block, int stride,
;                               const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + npicregs, 0, dst, block_offset, block, stride, nnzc, cntr, coeff, picreg
    xor       r5, r5
%ifdef PIC
    lea       picregq, [scan8_mem]
%endif
.nextblock:
    movzx     r6, byte [scan8+r5]
    movzx     r6, byte [r4+r6]
    or        r6w, word [r2]
    test      r6, r6
    jz .skipblock
    mov       r6d, dword [r1+r5*4]
    add       r6, r0
    IDCT4_ADD r6, r2, r3
.skipblock:
    inc       r5
    add       r2, 32
    cmp       r5, 16
    jl .nextblock
    REP_RET

INIT_MMX mmxext
; ff_h264_idct_add16intra_8_mmxext(uint8_t *dst, const int *block_offset,
;                                  int16_t *block, int stride,
;                                  const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor       r5, r5
%ifdef PIC
    lea       picregq, [scan8_mem]
%endif
.nextblock:
    movzx     r6, byte [scan8+r5]
    movzx     r6, byte [r4+r6]
    test      r6, r6
    jz .try_dc
    mov       r6d, dword [r1+r5*4]
    lea       r6, [r0+r6]
    IDCT4_ADD r6, r2, r3
    inc       r5
    add       r2, 32
    cmp       r5, 16
    jl .nextblock
    REP_RET
.try_dc:
    movsx     r6, word [r2]
    test      r6, r6
    jz .skipblock
    mov       word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP movh, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov       r1, r1m
%endif
.skipblock:
    inc       r5
    add       r2, 32
    cmp       r5, 16
    jl .nextblock
    REP_RET

; ff_h264_idct8_add4_8_mmxext(uint8_t *dst, const int *block_offset,
;                             int16_t *block, int stride,
;                             const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    %assign pad 128+4-(stack_offset&7)
    SUB       rsp, pad

    xor       r5, r5
%ifdef PIC
    lea       picregq, [scan8_mem]
%endif
.nextblock:
    movzx     r6, byte [scan8+r5]
    movzx     r6, byte [r4+r6]
    test      r6, r6
    jz .skipblock
    cmp       r6, 1
    jnz .no_dc
    movsx     r6, word [r2]
    test      r6, r6
    jz .no_dc
    mov       word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    lea       dst2q, [r0+dst2q]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov       r1, r1m
%endif
    add       r5, 4
    add       r2, 128
    cmp       r5, 16
    jl .nextblock

    ADD       rsp, pad
    RET
.no_dc:
    mov       r6d, dword [r1+r5*4]
    add       r6, r0
    add       word [r2], 32
    IDCT8_ADD_MMX_START r2  , rsp
    IDCT8_ADD_MMX_START r2+8, rsp+64
    IDCT8_ADD_MMX_END   r6  , rsp,   r3, r2
    mov       r6d, dword [r1+r5*4]
    lea       r6, [r0+r6+4]
    IDCT8_ADD_MMX_END   r6  , rsp+8, r3
.skipblock:
    add       r5, 4
    add       r2, 128
    cmp       r5, 16
    jl .nextblock

    ADD       rsp, pad
    RET

INIT_XMM sse2
; ff_h264_idct8_add4_8_sse2(uint8_t *dst, const int *block_offset,
;                           int16_t *block, int stride,
;                           const uint8_t nnzc[6 * 8])
cglobal h264_idct8_add4_8, 5, 8 + npicregs, 10, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    xor       r5, r5
%ifdef PIC
    lea       picregq, [scan8_mem]
%endif
.nextblock:
    movzx     r6, byte [scan8+r5]
    movzx     r6, byte [r4+r6]
    test      r6, r6
    jz .skipblock
    cmp       r6, 1
    jnz .no_dc
    movsx     r6, word [r2]
    test      r6, r6
    jz .no_dc
INIT_MMX cpuname
    mov       word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64 == 0
%define dst2q r1
%define dst2d r1d
%endif
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
    lea       dst2q, [dst2q+r3*4]
    DC_ADD_MMXEXT_OP mova, dst2q, r3, r6
%if ARCH_X86_64 == 0
    mov       r1, r1m
%endif
    add       r5, 4
    add       r2, 128
    cmp       r5, 16
    jl .nextblock
    REP_RET
.no_dc:
INIT_XMM cpuname
    mov       dst2d, dword [r1+r5*4]
    add       dst2q, r0
    IDCT8_ADD_SSE dst2q, r2, r3, r6
%if ARCH_X86_64 == 0
    mov       r1, r1m
%endif
.skipblock:
    add       r5, 4
    add       r2, 128
    cmp       r5, 16
    jl .nextblock
    REP_RET

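; Chroma path: dst here is really a uint8_t **dest array with one pointer per
; chroma plane. Block indices 16-19 and 32-35 are used for the two planes,
; which is why r5 starts at 16, r2 is advanced to block 16 (16*32 = 512
; bytes), and the second plane resumes at r5 = 32.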
INIT_MMX mmx
h264_idct_add8_mmx_plane:
.nextblock:
    movzx     r6, byte [scan8+r5]
    movzx     r6, byte [r4+r6]
    or        r6w, word [r2]
    test      r6, r6
    jz .skipblock
%if ARCH_X86_64
    mov       r0d, dword [r1+r5*4]
    add       r0, [dst2q]
%else
    mov       r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov       r0, [r0]
    add       r0, dword [r1+r5*4]
%endif
    IDCT4_ADD r0, r2, r3
.skipblock:
    inc       r5
    add       r2, 32
    test      r5, 3
    jnz .nextblock
    rep ret

; ff_h264_idct_add8_8_mmx(uint8_t **dest, const int *block_offset,
;                         int16_t *block, int stride, const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov       r5, 16
    add       r2, 512
%ifdef PIC
    lea       picregq, [scan8_mem]
%endif
%if ARCH_X86_64
    mov       dst2q, r0
%endif
    call h264_idct_add8_mmx_plane
    mov       r5, 32
    add       r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add       r0mp, gprsize
%endif
    call h264_idct_add8_mmx_plane
    RET

h264_idct_add8_mmxext_plane:
.nextblock:
    movzx     r6, byte [scan8+r5]
    movzx     r6, byte [r4+r6]
    test      r6, r6
    jz .try_dc
%if ARCH_X86_64
    mov       r0d, dword [r1+r5*4]
    add       r0, [dst2q]
%else
    mov       r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov       r0, [r0]
    add       r0, dword [r1+r5*4]
%endif
    IDCT4_ADD r0, r2, r3
    inc       r5
    add       r2, 32
    test      r5, 3
    jnz .nextblock
    rep ret
.try_dc:
    movsx     r6, word [r2]
    test      r6, r6
    jz .skipblock
    mov       word [r2], 0
    DC_ADD_MMXEXT_INIT r6, r3
%if ARCH_X86_64
    mov       r0d, dword [r1+r5*4]
    add       r0, [dst2q]
%else
    mov       r0, r1m ; XXX r1m here is actually r0m of the calling func
    mov       r0, [r0]
    add       r0, dword [r1+r5*4]
%endif
    DC_ADD_MMXEXT_OP movh, r0, r3, r6
.skipblock:
    inc       r5
    add       r2, 32
    test      r5, 3
    jnz .nextblock
    rep ret

INIT_MMX mmxext
; ff_h264_idct_add8_8_mmxext(uint8_t **dest, const int *block_offset,
;                            int16_t *block, int stride,
;                            const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 8 + npicregs, 0, dst1, block_offset, block, stride, nnzc, cntr, coeff, dst2, picreg
    mov       r5, 16
    add       r2, 512
%if ARCH_X86_64
    mov       dst2q, r0
%endif
%ifdef PIC
    lea       picregq, [scan8_mem]
%endif
    call h264_idct_add8_mmxext_plane
    mov       r5, 32
    add       r2, 384
%if ARCH_X86_64
    add       dst2q, gprsize
%else
    add       r0mp, gprsize
%endif
    call h264_idct_add8_mmxext_plane
    RET

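; Handles the DC-only case for two horizontally adjacent 4x4 blocks in one
; call: both DC values are fetched and cleared, biased by 32 and shifted by 6,
; then spread so that the left and right halves of each 8-pixel row get their
; own delta, using the same saturated-add trick as DC_ADD_MMXEXT_OP.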
; r0 = uint8_t *dst, r2 = int16_t *block, r3 = int stride, r6=clobbered
h264_idct_dc_add8_mmxext:
    movd      m0, [r2   ]       ; 0 0 X D
    mov       word [r2+ 0], 0
    punpcklwd m0, [r2+32]       ; x X d D
    mov       word [r2+32], 0
    paddsw    m0, [pw_32]
    psraw     m0, 6
    punpcklwd m0, m0            ; d d D D
    pxor      m1, m1            ; 0 0 0 0
    psubw     m1, m0            ; -d-d-D-D
    packuswb  m0, m1            ; -d-d-D-D d d D D
    pshufw    m1, m0, 0xFA      ; -d-d-d-d-D-D-D-D
    punpcklwd m0, m0            ; d d d d D D D D
    lea       r6, [r3*3]
    DC_ADD_MMXEXT_OP movq, r0, r3, r6
    ret

ALIGN 16
INIT_XMM sse2
; r0 = uint8_t *dst (clobbered), r2 = int16_t *block, r3 = int stride
h264_add8x4_idct_sse2:
    movq      m0, [r2+ 0]
    movq      m1, [r2+ 8]
    movq      m2, [r2+16]
    movq      m3, [r2+24]
    movhps    m0, [r2+32]
    movhps    m1, [r2+40]
    movhps    m2, [r2+48]
    movhps    m3, [r2+56]
    IDCT4_1D  w,0,1,2,3,4,5
    TRANSPOSE2x4x4W 0,1,2,3,4
    paddw     m0, [pw_32]
    IDCT4_1D  w,0,1,2,3,4,5
    pxor      m7, m7
    mova      [r2+ 0], m7
    mova      [r2+16], m7
    mova      [r2+32], m7
    mova      [r2+48], m7
    STORE_DIFFx2 m0, m1, m4, m5, m7, 6, r0, r3
    lea       r0, [r0+r3*2]
    STORE_DIFFx2 m2, m3, m4, m5, m7, 6, r0, r3
    ret

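; The SSE2 luma loop is unrolled: each cycle tests one 16-bit word of the nnzc
; cache (two neighbouring scan8 positions at once, hence the 0xc/0x14/...
; offsets below) and, if either block has coefficients, calls
; h264_add8x4_idct_sse2, which performs two 4x4 inverse transforms packed side
; by side in the xmm registers and adds the result to an 8x4 area of dst.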
%macro add16_sse2_cycle 2
    movzx     r0, word [r4+%2]
    test      r0, r0
    jz .cycle%1end
    mov       r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add       r0, r5
%else
    add       r0, r0m
%endif
    call h264_add8x4_idct_sse2
.cycle%1end:
%if %1 < 7
    add       r2, 64
%endif
%endmacro

; ff_h264_idct_add16_8_sse2(uint8_t *dst, const int *block_offset,
;                           int16_t *block, int stride,
;                           const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16_8, 5, 5 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov       r5, r0
%endif
; unrolling of the loop leads to an average performance gain of
; 20-25%
    add16_sse2_cycle 0, 0xc
    add16_sse2_cycle 1, 0x14
    add16_sse2_cycle 2, 0xe
    add16_sse2_cycle 3, 0x16
    add16_sse2_cycle 4, 0x1c
    add16_sse2_cycle 5, 0x24
    add16_sse2_cycle 6, 0x1e
    add16_sse2_cycle 7, 0x26
    RET

%macro add16intra_sse2_cycle 2
    movzx     r0, word [r4+%2]
    test      r0, r0
    jz .try%1dc
    mov       r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add       r0, r7
%else
    add       r0, r0m
%endif
    call h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx     r0, word [r2   ]
    or        r0w, word [r2+32]
    jz .cycle%1end
    mov       r0d, dword [r1+%1*8]
%if ARCH_X86_64
    add       r0, r7
%else
    add       r0, r0m
%endif
    call h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 < 7
    add       r2, 64
%endif
%endmacro

; ff_h264_idct_add16intra_8_sse2(uint8_t *dst, const int *block_offset,
;                                int16_t *block, int stride,
;                                const uint8_t nnzc[6 * 8])
cglobal h264_idct_add16intra_8, 5, 7 + ARCH_X86_64, 8
%if ARCH_X86_64
    mov       r7, r0
%endif
    add16intra_sse2_cycle 0, 0xc
    add16intra_sse2_cycle 1, 0x14
    add16intra_sse2_cycle 2, 0xe
    add16intra_sse2_cycle 3, 0x16
    add16intra_sse2_cycle 4, 0x1c
    add16intra_sse2_cycle 5, 0x24
    add16intra_sse2_cycle 6, 0x1e
    add16intra_sse2_cycle 7, 0x26
    RET

%macro add8_sse2_cycle 2
    movzx     r0, word [r4+%2]
    test      r0, r0
    jz .try%1dc
%if ARCH_X86_64
    mov       r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add       r0, [r7]
%else
    mov       r0, r0m
    mov       r0, [r0]
    add       r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call h264_add8x4_idct_sse2
    jmp .cycle%1end
.try%1dc:
    movsx     r0, word [r2   ]
    or        r0w, word [r2+32]
    jz .cycle%1end
%if ARCH_X86_64
    mov       r0d, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
    add       r0, [r7]
%else
    mov       r0, r0m
    mov       r0, [r0]
    add       r0, dword [r1+(%1&1)*8+64*(1+(%1>>1))]
%endif
    call h264_idct_dc_add8_mmxext
.cycle%1end:
%if %1 == 1
    add       r2, 384+64
%elif %1 < 3
    add       r2, 64
%endif
%endmacro

; ff_h264_idct_add8_8_sse2(uint8_t **dest, const int *block_offset,
;                          int16_t *block, int stride,
;                          const uint8_t nnzc[6 * 8])
cglobal h264_idct_add8_8, 5, 7 + ARCH_X86_64, 8
    add       r2, 512
%if ARCH_X86_64
    mov       r7, r0
%endif
    add8_sse2_cycle 0, 0x34
    add8_sse2_cycle 1, 0x3c
%if ARCH_X86_64
    add       r7, gprsize
%else
    add       r0mp, gprsize
%endif
    add8_sse2_cycle 2, 0x5c
    add8_sse2_cycle 3, 0x64
    RET

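; Luma DC block: WALSH4_1D applies the 4x4 (inverse) Hadamard transform to the
; rows and, after a transpose, to the columns. DEQUANT_STORE then scales each
; coefficient by qmul with rounding (the 128 << 16 bias and a shift of 8, or a
; reduced shift in the .big_qmul case where qmul is too large for pmaddwd),
; and STORE_WORDS scatters the 16 results into the DC slot of each 4x4 block
; of the output, 32 bytes (one block) apart.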
;void ff_h264_luma_dc_dequant_idct_mmx(int16_t *output, int16_t *input, int qmul)

%macro WALSH4_1D 5
    SUMSUB_BADC w, %4, %3, %2, %1, %5
    SUMSUB_BADC w, %4, %2, %3, %1, %5
    SWAP %1, %4, %3
%endmacro

%macro DEQUANT_MMX 3
    mova      m7, [pw_1]
    mova      m4, %1
    punpcklwd %1, m7
    punpckhwd m4, m7
    mova      m5, %2
    punpcklwd %2, m7
    punpckhwd m5, m7
    movd      m7, t3d
    punpckldq m7, m7
    pmaddwd   %1, m7
    pmaddwd   %2, m7
    pmaddwd   m4, m7
    pmaddwd   m5, m7
    psrad     %1, %3
    psrad     %2, %3
    psrad     m4, %3
    psrad     m5, %3
    packssdw  %1, m4
    packssdw  %2, m5
%endmacro

%macro STORE_WORDS 5-9
%if cpuflag(sse)
    movd      t0d, %1
    psrldq    %1, 4
    movd      t1d, %1
    psrldq    %1, 4
    mov       [t2+%2*32], t0w
    mov       [t2+%4*32], t1w
    shr       t0d, 16
    shr       t1d, 16
    mov       [t2+%3*32], t0w
    mov       [t2+%5*32], t1w
    movd      t0d, %1
    psrldq    %1, 4
    movd      t1d, %1
    mov       [t2+%6*32], t0w
    mov       [t2+%8*32], t1w
    shr       t0d, 16
    shr       t1d, 16
    mov       [t2+%7*32], t0w
    mov       [t2+%9*32], t1w
%else
    movd      t0d, %1
    psrlq     %1, 32
    movd      t1d, %1
    mov       [t2+%2*32], t0w
    mov       [t2+%4*32], t1w
    shr       t0d, 16
    shr       t1d, 16
    mov       [t2+%3*32], t0w
    mov       [t2+%5*32], t1w
%endif
%endmacro

%macro DEQUANT_STORE 1
%if cpuflag(sse2)
    movd      xmm4, t3d
    movq      xmm5, [pw_1]
    pshufd    xmm4, xmm4, 0
    movq2dq   xmm0, m0
    movq2dq   xmm1, m1
    movq2dq   xmm2, m2
    movq2dq   xmm3, m3
    punpcklwd xmm0, xmm5
    punpcklwd xmm1, xmm5
    punpcklwd xmm2, xmm5
    punpcklwd xmm3, xmm5
    pmaddwd   xmm0, xmm4
    pmaddwd   xmm1, xmm4
    pmaddwd   xmm2, xmm4
    pmaddwd   xmm3, xmm4
    psrad     xmm0, %1
    psrad     xmm1, %1
    psrad     xmm2, %1
    psrad     xmm3, %1
    packssdw  xmm0, xmm1
    packssdw  xmm2, xmm3
    STORE_WORDS xmm0,  0,  1,  4,  5,  2,  3,  6,  7
    STORE_WORDS xmm2,  8,  9, 12, 13, 10, 11, 14, 15
%else
    DEQUANT_MMX m0, m1, %1
    STORE_WORDS m0,  0,  1,  4,  5
    STORE_WORDS m1,  2,  3,  6,  7

    DEQUANT_MMX m2, m3, %1
    STORE_WORDS m2,  8,  9, 12, 13
    STORE_WORDS m3, 10, 11, 14, 15
%endif
%endmacro

%macro IDCT_DC_DEQUANT 1
cglobal h264_luma_dc_dequant_idct, 3, 4, %1
    ; manually spill XMM registers for Win64 because
    ; the code here is initialized with INIT_MMX
    WIN64_SPILL_XMM %1
    movq      m3, [r1+24]
    movq      m2, [r1+16]
    movq      m1, [r1+ 8]
    movq      m0, [r1+ 0]
    WALSH4_1D 0,1,2,3,4
    TRANSPOSE4x4W 0,1,2,3,4
    WALSH4_1D 0,1,2,3,4

    ; shift, tmp, output, qmul
%if WIN64
    DECLARE_REG_TMP 0,3,1,2
    ; we can't avoid this, because r0 is the shift register (ecx) on win64
    xchg      r0, t2
%elif ARCH_X86_64
    DECLARE_REG_TMP 3,1,0,2
%else
    DECLARE_REG_TMP 1,3,0,2
%endif

    cmp       t3d, 32767
    jg .big_qmul
    add       t3d, 128 << 16
    DEQUANT_STORE 8
    RET
.big_qmul:
    bsr       t0d, t3d
    add       t3d, 128 << 16
    mov       t1d, 7
    cmp       t0d, t1d
    cmovg     t0d, t1d
    inc       t1d
    shr       t3d, t0b
    sub       t1d, t0d
%if cpuflag(sse2)
    movd      xmm6, t1d
    DEQUANT_STORE xmm6
%else
    movd      m6, t1d
    DEQUANT_STORE m6
%endif
    RET
%endmacro

INIT_MMX mmx
IDCT_DC_DEQUANT 0
INIT_MMX sse2
IDCT_DC_DEQUANT 7