Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
6147 | serge | 1 | ;***************************************************************************** |
2 | ;* x86-optimized functions for fspp filter |
||
3 | ;* |
||
4 | ;* Copyright (c) 2003 Michael Niedermayer |
||
5 | ;* Copyright (C) 2005 Nikolaj Poroshin |
||
6 | ;* |
||
7 | ;* This file is part of FFmpeg. |
||
8 | ;* |
||
9 | ;* FFmpeg is free software; you can redistribute it and/or modify |
||
10 | ;* it under the terms of the GNU General Public License as published by |
||
11 | ;* the Free Software Foundation; either version 2 of the License, or |
||
12 | ;* (at your option) any later version. |
||
13 | ;* |
||
14 | ;* FFmpeg is distributed in the hope that it will be useful, |
||
15 | ;* but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
16 | ;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
||
17 | ;* GNU General Public License for more details. |
||
18 | ;* |
||
19 | ;* You should have received a copy of the GNU General Public License along |
||
20 | ;* with FFmpeg; if not, write to the Free Software Foundation, Inc., |
||
21 | ;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. |
||
22 | ;****************************************************************************** |
||
23 | |||
24 | %include "libavutil/x86/x86util.asm" |
||
25 | |||
SECTION_RODATA

; 64-byte dither table. store_slice and store_slice2 read it 8 bytes at a
; time, one group of 8 per output row.
pb_dither: db  0, 48, 12, 60,  3, 51, 15, 63, 32, 16, 44, 28, 35, 19, 47, 31, \
               8, 56,  4, 52, 11, 59,  7, 55, 40, 24, 36, 20, 43, 27, 39, 23, \
               2, 50, 14, 62,  1, 49, 13, 61, 34, 18, 46, 30, 33, 17, 45, 29, \
              10, 58,  6, 54,  9, 57,  5, 53, 42, 26, 38, 22, 41, 25, 37, 21

; 16-bit fixed-point multipliers, each replicated into the four word lanes of
; an MMX register; all of them are used as pmulhw operands below.
; FIX64(x, n) in the notes is x scaled by 2^n.
pw_187E:   times 4 dw 0x187E ; FIX64(0.382683433, 14)
pw_22A3:   times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
pw_2D41:   times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
pw_539F:   times 4 dw 0x539F ; FIX64(1.306562965, 14)
pw_5A82:   times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
pw_3B21:   times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
pw_AC62:   times 4 dw 0xAC62 ; FIX64(-2.613125930, 13)
pw_3642:   times 4 dw 0x3642 ; FIX64(0.847759065, 14)
pw_2441:   times 4 dw 0x2441 ; FIX64(0.566454497, 14)
pw_0CBB:   times 4 dw 0x0CBB ; FIX64(0.198912367, 14)

; Additive biases applied before the arithmetic right shifts (>>3 and >>2).
pw_4:      times 4 dw 4
pw_2:      times 4 dw 2

SECTION .text

; Number of int16 coefficients per row of a block; row k starts at byte
; offset DCTSIZE*k*2.
%define DCTSIZE 8

INIT_MMX mmx
||
50 | |||
;------------------------------------------------------------------------------
; void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
;                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
;                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
;
; Converts rows of int16 samples to dithered, unsigned-saturated bytes:
;     dst[x] = sat_u8((src[x] + (dither[x & 7] >> log2_scale)) >> (6 - log2_scale))
; Each source word that is read is then cleared, and so is the word located
; 16*src_stride bytes before it.
;
; - width is rounded up to a multiple of 8; whole groups of 8 samples are
;   processed, so up to 7 samples past 'width' are read and written per row.
; - src advances 2*src_stride bytes per row (src_stride counts int16
;   elements); dst advances dst_stride bytes per row.
; - One 8-byte group of pb_dither is consumed per row, 'height' groups in all.
;   NOTE(review): pb_dither holds only 8 such groups; presumably callers pass
;   height <= 8 -- confirm against the C caller.
; - On x86-32 dst_stride/src_stride stay in their stack argument slots
;   (r2m/r3m) and are overwritten in place.
;
; Clobbers: m0-m5, m7, tmp, tmp2, flags.
;------------------------------------------------------------------------------
%if ARCH_X86_64
cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
%else
cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
%define dst_strideq r2m
%define src_strideq r3m
    mov       widthq, r4m
    mov       dither_heightq, r5m
    mov       ditherq, r6m                  ; ditherq temporarily holds log2_scale
%endif
    add       widthq, 7
    mov       tmpq, src_strideq
    and       widthq, ~7                    ; width = (width + 7) & ~7
    sub       dst_strideq, widthq           ; dst_stride -= width (row tail skip)
    movd      m5, ditherd                   ; m5 = log2_scale (dither shift)
    xor       ditherq, -1                   ; ~log2_scale
    mov       tmp2q, tmpq
    add       ditherq, 7                    ; ~log2_scale + 7 = 6 - log2_scale
    neg       tmpq
    sub       tmp2q, widthq
    movd      m2, ditherd                   ; m2 = 6 - log2_scale (output shift)
    add       tmp2q, tmp2q                  ; 2 * (src_stride - width) bytes
    lea       ditherq, [pb_dither]          ; from here on ditherq is the table cursor
    mov       src_strideq, tmp2q            ; src row tail skip, in bytes
    shl       tmpq, 4                       ; tmpq = -16 * src_stride
    lea       dither_heightq, [ditherq+dither_heightq*8] ; end of the dither rows to use
    pxor      m7, m7                        ; m7 = 0 (unpack helper and clear value)

.loop_height:
    movq      m3, [ditherq]                 ; 8 dither bytes for this row
    movq      m4, m3
    punpcklbw m3, m7                        ; dither[0..3] as words
    punpckhbw m4, m7                        ; dither[4..7] as words
    mov       tmp2q, widthq                 ; samples left in this row
    psraw     m3, m5
    psraw     m4, m5

.loop_width:
    movq      [srcq+tmpq], m7               ; clear the words 16*src_stride bytes back
    movq      m0, [srcq]
    movq      m1, [srcq+8]
    movq      [srcq+tmpq+8], m7
    paddw     m0, m3
    paddw     m1, m4
    movq      [srcq], m7                    ; clear the words just consumed
    psraw     m0, m2
    psraw     m1, m2
    movq      [srcq+8], m7
    packuswb  m0, m1                        ; 8 words -> 8 unsigned-saturated bytes
    add       srcq, 16
    movq      [dstq], m0
    add       dstq, 8
    sub       tmp2q, 8
    jg .loop_width

    add       srcq, src_strideq
    add       ditherq, 8
    add       dstq, dst_strideq
    cmp       ditherq, dither_heightq
    jb .loop_height                         ; unsigned: both operands are addresses
    RET
||
115 | |||
;------------------------------------------------------------------------------
; void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
;                          ptrdiff_t dst_stride, ptrdiff_t src_stride,
;                          ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
;
; Like store_slice, but each output sample sums two source words: the one at
; src and the one 32*src_stride bytes after it:
;     dst[x] = sat_u8((src[x] + src2[x] + (dither[x & 7] >> log2_scale))
;                     >> (6 - log2_scale))
; Only the second word (src2) is cleared afterwards; src[x] itself is left
; untouched.
;
; - width is rounded up to a multiple of 8; whole groups of 8 samples are
;   processed per inner iteration.
; - src advances 2*src_stride bytes per row (src_stride counts int16
;   elements); dst advances dst_stride bytes per row.
; - One 8-byte group of pb_dither is consumed per row, 'height' groups in all.
;   NOTE(review): pb_dither holds only 8 such groups; presumably callers pass
;   height <= 8 -- confirm against the C caller.
; - On x86-32 all arguments are fetched from the stack here, and
;   dst_stride/src_stride stay in their stack slots (r2m/r3m), overwritten in
;   place.
;
; Clobbers: m0-m7, tmp, tmp2, flags.
;------------------------------------------------------------------------------
%if ARCH_X86_64
cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
%else
cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
%define dst_strideq r2m
%define src_strideq r3m
    mov       dstq, dstm
    mov       srcq, srcm
    mov       widthq, r4m
    mov       dither_heightq, r5m
    mov       ditherq, r6m                  ; ditherq temporarily holds log2_scale
%endif
    add       widthq, 7
    mov       tmpq, src_strideq
    and       widthq, ~7                    ; width = (width + 7) & ~7
    sub       dst_strideq, widthq           ; dst_stride -= width (row tail skip)
    movd      m5, ditherd                   ; m5 = log2_scale (dither shift)
    xor       ditherq, -1                   ; ~log2_scale
    mov       tmp2q, tmpq
    add       ditherq, 7                    ; ~log2_scale + 7 = 6 - log2_scale
    sub       tmp2q, widthq
    movd      m2, ditherd                   ; m2 = 6 - log2_scale (output shift)
    add       tmp2q, tmp2q                  ; 2 * (src_stride - width) bytes
    lea       ditherq, [pb_dither]          ; from here on ditherq is the table cursor
    mov       src_strideq, tmp2q            ; src row tail skip, in bytes
    shl       tmpq, 5                       ; tmpq = +32 * src_stride
    lea       dither_heightq, [ditherq+dither_heightq*8] ; end of the dither rows to use
    pxor      m7, m7                        ; m7 = 0 (unpack helper and clear value)

.loop_height:
    movq      m3, [ditherq]                 ; 8 dither bytes for this row
    movq      m4, m3
    punpcklbw m3, m7                        ; dither[0..3] as words
    punpckhbw m4, m7                        ; dither[4..7] as words
    mov       tmp2q, widthq                 ; samples left in this row
    psraw     m3, m5
    psraw     m4, m5

.loop_width:
    movq      m0, [srcq]
    movq      m1, [srcq+8]
    paddw     m0, m3
    paddw     m0, [srcq+tmpq]               ; add the words 32*src_stride bytes ahead
    paddw     m1, m4
    movq      m6, [srcq+tmpq+8]
    movq      [srcq+tmpq], m7               ; ...and clear them once consumed
    psraw     m0, m2
    paddw     m1, m6
    movq      [srcq+tmpq+8], m7
    psraw     m1, m2
    packuswb  m0, m1                        ; 8 words -> 8 unsigned-saturated bytes
    movq      [dstq], m0
    add       srcq, 16
    add       dstq, 8
    sub       tmp2q, 8
    jg .loop_width

    add       srcq, src_strideq
    add       ditherq, 8
    add       dstq, dst_strideq
    cmp       ditherq, dither_heightq
    jb .loop_height                         ; unsigned: both operands are addresses
    RET
||
182 | |||
;------------------------------------------------------------------------------
; void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
;
; thr_adr[i] = (int16_t)(thr_adr_noq[i] * q) for 64 consecutive int16 values
; (16 quadwords, 128 bytes). pmullw keeps the low 16 bits of each product.
; Loads, multiplies and stores are interleaved; the quadword at a given offset
; is always loaded before the store to that same offset.
;
; Clobbers: m0-m7.
;------------------------------------------------------------------------------
cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
    movd      m7, qd
    movq      m0, [thrnq+8*0]
    punpcklwd m7, m7
    movq      m1, [thrnq+8*1]
    punpckldq m7, m7                        ; m7 = low word of q in all four lanes
    pmullw    m0, m7
    movq      m2, [thrnq+8*2]
    pmullw    m1, m7
    movq      m3, [thrnq+8*3]
    pmullw    m2, m7
    movq      [thrq+8*0], m0
    movq      m4, [thrnq+8*4]
    pmullw    m3, m7
    movq      [thrq+8*1], m1
    movq      m5, [thrnq+8*5]
    pmullw    m4, m7
    movq      [thrq+8*2], m2
    movq      m6, [thrnq+8*6]
    pmullw    m5, m7
    movq      [thrq+8*3], m3
    movq      m0, [thrnq+8*7]
    pmullw    m6, m7
    movq      [thrq+8*4], m4
    movq      m1, [thrnq+8*8]
    pmullw    m0, m7
    movq      [thrq+8*5], m5
    movq      m2, [thrnq+8*9]
    pmullw    m1, m7
    movq      [thrq+8*6], m6
    movq      m3, [thrnq+8*10]
    pmullw    m2, m7
    movq      [thrq+8*7], m0
    movq      m4, [thrnq+8*11]
    pmullw    m3, m7
    movq      [thrq+8*8], m1
    movq      m5, [thrnq+8*12]
    pmullw    m4, m7
    movq      [thrq+8*9], m2
    movq      m6, [thrnq+8*13]
    pmullw    m5, m7
    movq      [thrq+8*10], m3
    movq      m0, [thrnq+8*14]
    pmullw    m6, m7
    movq      [thrq+8*11], m4
    movq      m1, [thrnq+8*15]
    pmullw    m0, m7
    movq      [thrq+8*12], m5
    pmullw    m1, m7
    movq      [thrq+8*13], m6
    movq      [thrq+8*14], m0
    movq      [thrq+8*15], m1
    RET
||
237 | |||
;------------------------------------------------------------------------------
; COLUMN_FDCT full_idct_label [, thr_offset = 0 [, extra_advance = 0]]
;
;   %1  label to jump to when the shortcut inverse path cannot be used
;   %2  byte offset added to every threshold load from thrq
;   %3  extra bytes added (on top of 8) to srcq and outq on the shortcut path
;
; Works on 4 adjacent columns at once: one quadword of int16 from each of the
; 8 rows at srcq + DCTSIZE*k*2. Butterflies and pmulhw multiplies produce 8
; values per column; each is passed through the sequence
;     psubw t / paddusw t / paddw t / psubusw t
; against the threshold row thrq + k*16 + %2, which zeroes a value v when
; -t <= v < t and leaves it unchanged otherwise.
;
; Four intermediate quadwords are parked in [rsp] .. [rsp+8*3].
;
; If any lane of the values thresholded against rows 3, 5 or 7 is non-zero,
; control jumps to %1 with m0, m1, m5, m6 and the four stack slots live and
; with srcq/outq NOT yet advanced (COLUMN_IDCT consumes that state).
; Otherwise the shortcut below finishes the inverse transform using only the
; row-1 value (m0): rows 0-5 of outq are accumulated, rows 6-7 are
; overwritten, and srcq/outq advance by 8+%3 bytes.
;
; Expects named registers thr, src, out, tmp and 32 bytes of stack scratch.
; Clobbers: m0-m7, tmpd, flags.
;------------------------------------------------------------------------------
%macro COLUMN_FDCT 1-3 0, 0
    ; ---- sums of mirrored rows and the thresholds for rows 0, 4, 2, 6 ----
    movq      m1, [srcq+DCTSIZE*0*2]
    movq      m7, [srcq+DCTSIZE*3*2]
    movq      m0, m1
    paddw     m1, [srcq+DCTSIZE*7*2]
    movq      m3, m7
    paddw     m7, [srcq+DCTSIZE*4*2]
    movq      m5, m1
    movq      m6, [srcq+DCTSIZE*1*2]
    psubw     m1, m7
    movq      m2, [srcq+DCTSIZE*2*2]
    movq      m4, m6
    paddw     m6, [srcq+DCTSIZE*6*2]
    paddw     m5, m7
    paddw     m2, [srcq+DCTSIZE*5*2]
    movq      m7, m6
    paddw     m6, m2
    psubw     m7, m2
    movq      m2, m5
    paddw     m5, m6
    psubw     m2, m6
    paddw     m7, m1
    movq      m6, [thrq+4*16+%2]
    psllw     m7, 2
    psubw     m5, [thrq+%2]
    psubw     m2, m6
    paddusw   m5, [thrq+%2]
    paddusw   m2, m6
    pmulhw    m7, [pw_2D41]
    paddw     m5, [thrq+%2]
    paddw     m2, m6
    psubusw   m5, [thrq+%2]
    psubusw   m2, m6
    paddw     m5, [pw_2]                    ; bias ahead of the >>2 below
    movq      m6, m2
    paddw     m2, m5
    psubw     m5, m6
    movq      m6, m1
    paddw     m1, m7
    psubw     m1, [thrq+2*16+%2]
    psubw     m6, m7
    movq      m7, [thrq+6*16+%2]
    psraw     m5, 2
    paddusw   m1, [thrq+2*16+%2]
    psubw     m6, m7
    paddw     m1, [thrq+2*16+%2]
    paddusw   m6, m7
    psubusw   m1, [thrq+2*16+%2]
    paddw     m6, m7
    psubw     m3, [srcq+DCTSIZE*4*2]
    psubusw   m6, m7
    movq      m7, m1
    psraw     m2, 2
    psubw     m4, [srcq+DCTSIZE*6*2]
    psubw     m1, m6
    psubw     m0, [srcq+DCTSIZE*7*2]
    paddw     m6, m7
    psraw     m6, 2
    movq      m7, m2
    pmulhw    m1, [pw_5A82]
    paddw     m2, m6
    movq      [rsp], m2                     ; scratch slot 0
    psubw     m7, m6
    movq      m2, [srcq+DCTSIZE*2*2]
    psubw     m1, m6
    psubw     m2, [srcq+DCTSIZE*5*2]
    movq      m6, m5
    movq      [rsp+8*3], m7                 ; scratch slot 3
    ; ---- differences of mirrored rows and the thresholds for rows 3, 5, 1, 7 ----
    paddw     m3, m2
    paddw     m2, m4
    paddw     m4, m0
    movq      m7, m3
    psubw     m3, m4
    psllw     m3, 2
    psllw     m7, 2
    pmulhw    m3, [pw_187E]
    psllw     m4, 2
    pmulhw    m7, [pw_22A3]
    psllw     m2, 2
    pmulhw    m4, [pw_539F]
    paddw     m5, m1
    pmulhw    m2, [pw_2D41]
    psubw     m6, m1
    paddw     m7, m3
    movq      [rsp+8], m5                   ; scratch slot 1
    paddw     m4, m3
    movq      m3, [thrq+3*16+%2]
    movq      m1, m0
    movq      [rsp+8*2], m6                 ; scratch slot 2
    psubw     m1, m2
    paddw     m0, m2
    movq      m5, m1
    movq      m2, [thrq+5*16+%2]
    psubw     m1, m7
    paddw     m5, m7
    psubw     m1, m3
    movq      m7, [thrq+16+%2]
    psubw     m5, m2
    movq      m6, m0
    paddw     m0, m4
    paddusw   m1, m3
    psubw     m6, m4
    movq      m4, [thrq+7*16+%2]
    psubw     m0, m7
    psubw     m6, m4
    paddusw   m5, m2
    paddusw   m6, m4
    paddw     m1, m3
    paddw     m5, m2
    paddw     m6, m4
    psubusw   m1, m3
    psubusw   m5, m2
    psubusw   m6, m4
    ; ---- anything left in the row 3/5/7 values? then take the full path ----
    movq      m4, m1
    por       m4, m5
    paddusw   m0, m7
    por       m4, m6
    paddw     m0, m7
    packssdw  m4, m4                        ; squeeze all 64 bits into the low dword
    psubusw   m0, m7
    movd      tmpd, m4
    or        tmpd, tmpd
    jnz       %1
    ; ---- shortcut inverse: only m0 plus the four stack slots contribute ----
    movq      m4, [rsp]
    movq      m1, m0
    pmulhw    m0, [pw_3642]
    movq      m2, m1
    movq      m5, [outq+DCTSIZE*0*2]
    movq      m3, m2
    pmulhw    m1, [pw_2441]
    paddw     m5, m4
    movq      m6, [rsp+8]
    psraw     m3, 2
    pmulhw    m2, [pw_0CBB]
    psubw     m4, m3
    movq      m7, [outq+DCTSIZE*1*2]
    paddw     m5, m3
    movq      [outq+DCTSIZE*7*2], m4        ; row 7 is overwritten, not accumulated
    paddw     m7, m6
    movq      m3, [rsp+8*2]
    psubw     m6, m0
    movq      m4, [outq+DCTSIZE*2*2]
    paddw     m7, m0
    movq      [outq], m5
    paddw     m4, m3
    movq      [outq+DCTSIZE*6*2], m6        ; row 6 is overwritten, not accumulated
    psubw     m3, m1
    movq      m5, [outq+DCTSIZE*5*2]
    paddw     m4, m1
    movq      m6, [outq+DCTSIZE*3*2]
    paddw     m5, m3
    movq      m0, [rsp+8*3]
    add       srcq, 8+%3
    movq      [outq+DCTSIZE*1*2], m7
    paddw     m6, m0
    movq      [outq+DCTSIZE*2*2], m4
    psubw     m0, m2
    movq      m7, [outq+DCTSIZE*4*2]
    paddw     m6, m2
    movq      [outq+DCTSIZE*5*2], m5
    paddw     m7, m0
    movq      [outq+DCTSIZE*3*2], m6
    movq      [outq+DCTSIZE*4*2], m7
    add       outq, 8+%3
%endmacro
||
403 | |||
;------------------------------------------------------------------------------
; COLUMN_IDCT [extra_advance = 0]
;
;   %1  extra bytes added (on top of 8) to srcq and outq when done
;
; Full inverse path for 4 adjacent columns. It reads m0, m1, m5 and m6 plus
; the four quadwords at [rsp] .. [rsp+8*3] as inputs; in this file it is only
; reached through the jnz inside COLUMN_FDCT, which leaves exactly that state.
;
; Rows 0-5 of outq (at outq + DCTSIZE*k*2) are accumulated into; rows 6 and 7
; are overwritten. Finally srcq and outq advance by 8+%1 bytes.
;
; Clobbers: m0-m7, flags.
;------------------------------------------------------------------------------
%macro COLUMN_IDCT 0-1 0
    movq      m3, m5
    psubw     m5, m1
    psllw     m5, 1
    paddw     m3, m1
    movq      m2, m0
    psubw     m0, m6
    movq      m1, m5
    psllw     m0, 1
    pmulhw    m1, [pw_AC62]
    paddw     m5, m0
    pmulhw    m5, [pw_3B21]
    paddw     m2, m6
    pmulhw    m0, [pw_22A3]
    movq      m7, m2
    movq      m4, [rsp]                     ; scratch slot 0
    psubw     m2, m3
    psllw     m2, 1
    paddw     m7, m3
    pmulhw    m2, [pw_2D41]
    movq      m6, m4
    psraw     m7, 2
    paddw     m4, [outq]
    psubw     m6, m7
    movq      m3, [rsp+8]                   ; scratch slot 1
    paddw     m4, m7
    movq      [outq+DCTSIZE*7*2], m6        ; row 7 is overwritten, not accumulated
    paddw     m1, m5
    movq      [outq], m4
    psubw     m1, m7
    movq      m7, [rsp+8*2]                 ; scratch slot 2
    psubw     m0, m5
    movq      m6, [rsp+8*3]                 ; scratch slot 3
    movq      m5, m3
    paddw     m3, [outq+DCTSIZE*1*2]
    psubw     m5, m1
    psubw     m2, m1
    paddw     m3, m1
    movq      [outq+DCTSIZE*6*2], m5        ; row 6 is overwritten, not accumulated
    movq      m4, m7
    paddw     m7, [outq+DCTSIZE*2*2]
    psubw     m4, m2
    paddw     m4, [outq+DCTSIZE*5*2]
    paddw     m7, m2
    movq      [outq+DCTSIZE*1*2], m3
    paddw     m0, m2
    movq      [outq+DCTSIZE*2*2], m7
    movq      m1, m6
    paddw     m6, [outq+DCTSIZE*4*2]
    psubw     m1, m0
    paddw     m1, [outq+DCTSIZE*3*2]
    paddw     m6, m0
    movq      [outq+DCTSIZE*5*2], m4
    add       srcq, 8+%1
    movq      [outq+DCTSIZE*4*2], m6
    movq      [outq+DCTSIZE*3*2], m1
    add       outq, 8+%1
%endmacro
||
462 | |||
;------------------------------------------------------------------------------
; void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
;
; Runs the thresholded column transform over 'data', accumulating into
; 'output'. Every pass of the outer loop handles two groups of 4 columns:
;   1st group: thresholds at thr+0, then src/out advance by 8 bytes
;   2nd group: thresholds at thr+8, then src/out advance by 8+16 bytes
; and subtracts 2 from cnt; the loop repeats while cnt stays > 0 (signed).
;
; For each group COLUMN_FDCT either completes through its built-in shortcut
; or branches to the matching COLUMN_IDCT for the full inverse transform.
;
; 32 bytes of stack ([rsp] .. [rsp+31]) are reserved as scratch for the macros.
;------------------------------------------------------------------------------
cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
.fdct1:
    COLUMN_FDCT .idct1                      ; first 4 columns
    jmp .fdct2                              ; shortcut taken: skip the full inverse

.idct1:
    COLUMN_IDCT                             ; falls through into the second group

.fdct2:
    COLUMN_FDCT .idct2, 8, 16               ; next 4 columns
    sub       cntd, 2
    jg .fdct1
    RET

.idct2:
    COLUMN_IDCT 16
    sub       cntd, 2
    jg .fdct1
    RET
||
483 | |||
;------------------------------------------------------------------------------
; void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr,
;                      ptrdiff_t output_stride, int cnt);
;
; Row-direction inverse transform, accumulated into output_adr.
;
; Per iteration (cnt iterations in total; the loop body runs before cnt is
; tested, so cnt is expected to be >= 1):
;   - reads four workspace rows of 8 int16 each (rows at DCTSIZE*k*2,
;     k = 0..3; low and high quadword of each) and transposes them with
;     punpck* so each register holds one coefficient index for 4 lines;
;   - computes 8 result quadwords, adds the rounding bias pw_4, shifts right
;     by 3 and adds each to one of 8 consecutive output rows (4 int16 wide);
;   - advances workspace by DCTSIZE*2*4 = 64 bytes and output by 8 bytes.
;
; output_stride is doubled on entry, i.e. it is given in int16 elements.
; 16 bytes of stack ([rsp], [rsp+8]) hold two intermediate quadwords.
;
; Clobbers: m0-m7, stride3, flags.
;------------------------------------------------------------------------------
cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3
    add       strideq, strideq              ; elements -> bytes
    lea       stride3q, [strideq+strideq*2]
.loop:
    ; ---- low quadwords of the four rows: transpose + first butterfly ----
    movq      m0, [srcq+DCTSIZE*0*2]
    movq      m1, [srcq+DCTSIZE*1*2]
    movq      m4, m0
    movq      m2, [srcq+DCTSIZE*2*2]
    punpcklwd m0, m1
    movq      m3, [srcq+DCTSIZE*3*2]
    punpckhwd m4, m1
    movq      m7, m2
    punpcklwd m2, m3
    movq      m6, m0
    punpckldq m0, m2
    punpckhdq m6, m2
    movq      m5, m0
    punpckhwd m7, m3
    psubw     m0, m6
    pmulhw    m0, [pw_5A82]
    movq      m2, m4
    punpckldq m4, m7
    paddw     m5, m6
    punpckhdq m2, m7
    movq      m1, m4
    psllw     m0, 2
    paddw     m4, m2
    movq      m3, [srcq+DCTSIZE*0*2+8]
    psubw     m1, m2
    movq      m2, [srcq+DCTSIZE*1*2+8]
    psubw     m0, m5
    movq      m6, m4
    paddw     m4, m5
    psubw     m6, m5
    movq      m7, m1
    movq      m5, [srcq+DCTSIZE*2*2+8]
    paddw     m1, m0
    movq      [rsp], m4                     ; scratch slot 0
    movq      m4, m3
    movq      [rsp+8], m6                   ; scratch slot 1
    ; ---- high quadwords of the four rows: transpose + second butterfly ----
    punpcklwd m3, m2
    movq      m6, [srcq+DCTSIZE*3*2+8]
    punpckhwd m4, m2
    movq      m2, m5
    punpcklwd m5, m6
    psubw     m7, m0
    punpckhwd m2, m6
    movq      m0, m3
    punpckldq m3, m5
    punpckhdq m0, m5
    movq      m5, m4
    movq      m6, m3
    punpckldq m4, m2
    psubw     m3, m0
    punpckhdq m5, m2
    paddw     m6, m0
    movq      m2, m4
    movq      m0, m3
    psubw     m4, m5
    pmulhw    m0, [pw_AC62]
    paddw     m3, m4
    pmulhw    m3, [pw_3B21]
    paddw     m2, m5
    pmulhw    m4, [pw_22A3]
    movq      m5, m2
    psubw     m2, m6
    paddw     m5, m6
    pmulhw    m2, [pw_2D41]
    paddw     m0, m3
    psllw     m0, 3
    psubw     m4, m3
    movq      m6, [rsp]
    movq      m3, m1
    psllw     m4, 3
    psubw     m0, m5
    psllw     m2, 3
    paddw     m1, m0
    psubw     m2, m0
    psubw     m3, m0
    paddw     m4, m2
    movq      m0, m7
    paddw     m7, m2
    psubw     m0, m2
    ; ---- round ((x + 4) >> 3) and accumulate into the 8 output rows ----
    movq      m2, [pw_4]
    psubw     m6, m5
    paddw     m5, [rsp]
    paddw     m1, m2
    paddw     m5, m2
    psraw     m1, 3
    paddw     m7, m2
    psraw     m5, 3
    paddw     m5, [dstq]
    psraw     m7, 3
    paddw     m1, [dstq+strideq*1]
    paddw     m0, m2
    paddw     m7, [dstq+strideq*2]
    paddw     m3, m2
    movq      [dstq], m5                    ; output row 0
    paddw     m6, m2
    movq      [dstq+strideq*1], m1          ; output row 1
    psraw     m0, 3
    movq      [dstq+strideq*2], m7          ; output row 2
    add       dstq, stride3q                ; dstq now addresses output row 3
    movq      m5, [rsp+8]
    psraw     m3, 3
    paddw     m0, [dstq+strideq*2]
    psubw     m5, m4
    paddw     m3, [dstq+stride3q*1]
    psraw     m6, 3
    paddw     m4, [rsp+8]
    paddw     m5, m2
    paddw     m6, [dstq+strideq*4]
    paddw     m4, m2
    movq      [dstq+strideq*2], m0          ; output row 5
    psraw     m5, 3
    paddw     m5, [dstq]
    psraw     m4, 3
    paddw     m4, [dstq+strideq*1]
    add       srcq, DCTSIZE*2*4
    movq      [dstq+stride3q*1], m3         ; output row 6
    movq      [dstq+strideq*4], m6          ; output row 7
    movq      [dstq], m5                    ; output row 3
    movq      [dstq+strideq*1], m4          ; output row 4
    sub       dstq, stride3q                ; back to output row 0
    add       dstq, 8                       ; next 4 output columns
    dec       cntd                          ; named form of the 4th argument (r3)
    jnz .loop
    RET
||
613 | |||
;------------------------------------------------------------------------------
; void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels,
;                      ptrdiff_t line_size, int cnt);
;
; Forward transform of 8-bit pixels into int16 coefficients.
;
; Per iteration (cnt iterations in total; the loop body runs before cnt is
; tested, so cnt is expected to be >= 1):
;   - loads 4 bytes from each of 8 consecutive pixel rows (line_size bytes
;     apart) and widens them to 16-bit words;
;   - combines the 8 rows with butterflies and pmulhw multiplies, then
;     transposes the results with punpck*;
;   - stores 4 rows of 8 int16 (rows at DCTSIZE*k*2, k = 0..3; low and high
;     quadword of each) to 'data';
;   - advances 'data' by DCTSIZE*4*2 = 64 bytes and 'pixels' by 4 bytes.
;
; Note that the first argument (data, the output) is named 'src' here.
; 16 bytes of stack ([rsp], [rsp+8]) hold two intermediate quadwords.
;
; Clobbers: m0-m7, stride3, flags.
;------------------------------------------------------------------------------
cglobal row_fdct, 4, 5, 0, 16, src, pix, stride, cnt, stride3
    lea       stride3q, [strideq+strideq*2]
.loop:
    ; ---- load 8 pixel rows (4 bytes each), widen, form sums/differences ----
    movd      m0, [pixq]                    ; pixel row 0
    pxor      m7, m7                        ; zero for unpacking (m7 is reused below)
    movd      m1, [pixq+strideq*1]          ; pixel row 1
    punpcklbw m0, m7
    movd      m2, [pixq+strideq*2]          ; pixel row 2
    punpcklbw m1, m7
    punpcklbw m2, m7
    add       pixq,stride3q                 ; pixq now addresses pixel row 3
    movq      m5, m0
    movd      m3, [pixq+strideq*4]          ; pixel row 7
    movq      m6, m1
    movd      m4, [pixq+stride3q*1]         ; pixel row 6
    punpcklbw m3, m7
    psubw     m5, m3
    punpcklbw m4, m7
    paddw     m0, m3
    psubw     m6, m4
    movd      m3, [pixq+strideq*2]          ; pixel row 5
    paddw     m1, m4
    movq      [rsp], m5                     ; row0 - row7
    punpcklbw m3, m7
    movq      [rsp+8], m6                   ; row1 - row6
    movq      m4, m2
    movd      m5, [pixq]                    ; pixel row 3
    paddw     m2, m3
    movd      m6, [pixq+strideq*1]          ; pixel row 4
    punpcklbw m5, m7
    psubw     m4, m3
    punpcklbw m6, m7
    movq      m3, m5
    paddw     m5, m6
    psubw     m3, m6
    ; ---- part built from the row sums; transposed into the low quadwords ----
    movq      m6, m0
    movq      m7, m1
    psubw     m0, m5
    psubw     m1, m2
    paddw     m7, m2
    paddw     m1, m0
    movq      m2, m7
    psllw     m1, 2
    paddw     m6, m5
    pmulhw    m1, [pw_2D41]
    paddw     m7, m6
    psubw     m6, m2
    movq      m5, m0
    movq      m2, m7
    punpcklwd m7, m6
    paddw     m0, m1
    punpckhwd m2, m6
    psubw     m5, m1
    movq      m6, m0
    movq      m1, [rsp+8]
    punpcklwd m0, m5
    punpckhwd m6, m5
    movq      m5, m0
    punpckldq m0, m7
    paddw     m3, m4
    punpckhdq m5, m7
    movq      m7, m6
    movq      [srcq+DCTSIZE*0*2], m0
    punpckldq m6, m2
    movq      [srcq+DCTSIZE*1*2], m5
    punpckhdq m7, m2
    movq      [srcq+DCTSIZE*2*2], m6
    paddw     m4, m1
    movq      [srcq+DCTSIZE*3*2], m7
    ; ---- part built from the row differences; goes to the high quadwords ----
    psllw     m3, 2
    movq      m2, [rsp]
    psllw     m4, 2
    pmulhw    m4, [pw_2D41]
    paddw     m1, m2
    psllw     m1, 2
    movq      m0, m3
    pmulhw    m0, [pw_22A3]
    psubw     m3, m1
    pmulhw    m3, [pw_187E]
    movq      m5, m2
    pmulhw    m1, [pw_539F]
    psubw     m2, m4
    paddw     m5, m4
    movq      m6, m2
    paddw     m0, m3
    movq      m7, m5
    paddw     m2, m0
    psubw     m6, m0
    movq      m4, m2
    paddw     m1, m3
    punpcklwd m2, m6
    paddw     m5, m1
    punpckhwd m4, m6
    psubw     m7, m1
    movq      m6, m5
    punpcklwd m5, m7
    punpckhwd m6, m7
    movq      m7, m2
    punpckldq m2, m5
    sub       pixq, stride3q                ; back to pixel row 0
    punpckhdq m7, m5
    movq      m5, m4
    movq      [srcq+DCTSIZE*0*2+8], m2
    punpckldq m4, m6
    movq      [srcq+DCTSIZE*1*2+8], m7
    punpckhdq m5, m6
    movq      [srcq+DCTSIZE*2*2+8], m4
    add       pixq, 4                       ; next 4 pixel columns
    movq      [srcq+DCTSIZE*3*2+8], m5
    add       srcq, DCTSIZE*4*2
    dec       cntd
    jnz .loop
    RET
727 | RET |