Subversion Repositories Kolibri OS

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
6147 serge 1
;*****************************************************************************
2
;* x86-optimized functions for fspp filter
3
;*
4
;* Copyright (c) 2003 Michael Niedermayer 
5
;* Copyright (C) 2005 Nikolaj Poroshin 
6
;*
7
;* This file is part of FFmpeg.
8
;*
9
;* FFmpeg is free software; you can redistribute it and/or modify
10
;* it under the terms of the GNU General Public License as published by
11
;* the Free Software Foundation; either version 2 of the License, or
12
;* (at your option) any later version.
13
;*
14
;* FFmpeg is distributed in the hope that it will be useful,
15
;* but WITHOUT ANY WARRANTY; without even the implied warranty of
16
;* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17
;* GNU General Public License for more details.
18
;*
19
;* You should have received a copy of the GNU General Public License along
20
;* with FFmpeg; if not, write to the Free Software Foundation, Inc.,
21
;* 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22
;******************************************************************************
23
 
24
%include "libavutil/x86/x86util.asm"
25
 
26
SECTION_RODATA

; 64-byte dither table (values 0..63). store_slice/store_slice2 consume it
; eight bytes at a time: one 8-byte row per output line, starting at the
; first byte and advancing by 8 after every line.
pb_dither: db 0,  48,  12,  60,   3,  51,  15,  63, 32,  16,  44,  28,  35,  19,  47,  31, \
              8,  56,   4,  52,  11,  59,   7,  55, 40,  24,  36,  20,  43,  27,  39,  23, \
              2,  50,  14,  62,   1,  49,  13,  61, 34,  18,  46,  30,  33,  17,  45,  29, \
             10,  58,   6,  54,   9,  57,   5,  53, 42,  26,  38,  22,  41,  25,  37,  21

; Fixed-point multipliers, each replicated into four 16-bit lanes (8 bytes)
; so it can be used directly as a memory operand of pmulhw.
; FIX64(x, n) in the comments denotes x scaled by 2^n and rounded.
pw_187E: times 4 dw 0x187E ; FIX64(0.382683433, 14)
pw_22A3: times 4 dw 0x22A3 ; FIX64(1.082392200, 13)
pw_2D41: times 4 dw 0x2D41 ; FIX64(1.414213562, 13)
pw_539F: times 4 dw 0x539F ; FIX64(1.306562965, 14)
pw_5A82: times 4 dw 0x5A82 ; FIX64(1.414213562, 14)
pw_3B21: times 4 dw 0x3B21 ; FIX64(1.847759065, 13)
pw_AC62: times 4 dw 0xAC62 ; FIX64(-2.613125930, 13); negative when read as a signed word
pw_3642: times 4 dw 0x3642 ; FIX64(0.847759065, 14)
pw_2441: times 4 dw 0x2441 ; FIX64(0.566454497, 14)
pw_0CBB: times 4 dw 0x0CBB ; FIX64(0.198912367, 14)

; Small integer constants: rounding term added in row_idct before its >>3
; (pw_4) and bias added in COLUMN_FDCT before its >>2 (pw_2).
pw_4:    times 4 dw 4
pw_2:    times 4 dw 2
44
 
45
SECTION .text

; Number of int16 coefficients per block row; row k of a block therefore
; starts at byte offset DCTSIZE*k*2.
%define DCTSIZE 8

; x86inc: assemble everything below with the MMX register set (m0..m7).
INIT_MMX mmx
50
 
51
;void ff_store_slice_mmx(uint8_t *dst, int16_t *src,
;                        ptrdiff_t dst_stride, ptrdiff_t src_stride,
;                        ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
;
; Converts int16 samples to dithered uint8 pixels and clears the source.
; width is rounded up to a multiple of 8; for every row y and sample x:
;   dst[x] = sat_u8((src[x] + (pb_dither[8*y + (x & 7)] >> log2_scale))
;                   >> (6 - log2_scale))
; After being read, src[x] is overwritten with 0, and so are the words at
; byte offset -16*src_stride from it.
;
; Both loops are do/while shaped: the body runs at least once.
; On x86-32 only dst/src are loaded by cglobal; dst_stride and src_stride stay
; in their stack slots (r2m/r3m) and are rewritten there in place.
; The register that first holds log2_scale ("dither") is reused as the dither
; row pointer; "dither_height" first holds height, then the end pointer.
%if ARCH_X86_64
cglobal store_slice, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
%else
cglobal store_slice, 2, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
%define dst_strideq r2m
%define src_strideq r3m
    mov       widthq, r4m
    mov       dither_heightq, r5m
    mov       ditherq, r6m ; log2_scale
%endif
    add       widthq, 7
    mov       tmpq, src_strideq
    and       widthq, ~7                 ; width = (width + 7) & ~7
    sub       dst_strideq, widthq        ; dst_stride -= width (row-end skip, bytes)
    movd      m5, ditherd                ; m5 = log2_scale (dither pre-shift)
    xor       ditherq, -1                ; ~log2_scale
    mov       tmp2q, tmpq
    add       ditherq, 7                 ; ~log2_scale + 7 = 6 - log2_scale
    neg       tmpq
    sub       tmp2q, widthq
    movd      m2, ditherd                ; m2 = 6 - log2_scale (final shift)
    add       tmp2q, tmp2q
    lea       ditherq, [pb_dither]       ; from here on: dither row pointer
    mov       src_strideq, tmp2q         ; src_stride = 2*(src_stride - width) (row-end skip, bytes)
    shl       tmpq, 4                    ; tmp = -16*src_stride (byte offset of the extra area to clear)
    lea       dither_heightq, [ditherq+dither_heightq*8] ; end pointer = pb_dither + 8*height
    pxor      m7, m7                     ; m7 = 0

.loop_height:
    ; m3/m4 = the 8 dither bytes of this row as words, >> log2_scale
    movq      m3, [ditherq]
    movq      m4, m3
    punpcklbw m3, m7
    punpckhbw m4, m7
    mov       tmp2q, widthq              ; samples left in this row
    psraw     m3, m5
    psraw     m4, m5

.loop_width:
    movq      [srcq+tmpq], m7            ; clear 4 words at src - 16*src_stride bytes
    movq      m0, [srcq]
    movq      m1, [srcq+8]
    movq      [srcq+tmpq+8], m7          ; ... and the next 4
    paddw     m0, m3
    paddw     m1, m4
    movq      [srcq], m7                 ; clear the samples just loaded
    psraw     m0, m2
    psraw     m1, m2
    movq      [srcq+8], m7
    packuswb  m0, m1                     ; 8 words -> 8 unsigned-saturated bytes
    add       srcq, 16
    movq      [dstq], m0
    add       dstq, 8
    sub       tmp2q, 8
    jg .loop_width

    add       srcq, src_strideq          ; skip to the start of the next source row
    add       ditherq, 8                 ; next dither row
    add       dstq, dst_strideq
    cmp       ditherq, dither_heightq
    jl .loop_height                      ; NOTE(review): signed compare of two addresses
    RET
115
 
116
;void ff_store_slice2_mmx(uint8_t *dst, int16_t *src,
;                         ptrdiff_t dst_stride, ptrdiff_t src_stride,
;                         ptrdiff_t width, ptrdiff_t height, ptrdiff_t log2_scale)
;
; Variant of store_slice that sums two source areas. width is rounded up to a
; multiple of 8; for every row y and sample x, with src2 denoting the word at
; byte offset +32*src_stride from src[x]:
;   dst[x] = sat_u8((src[x] + src2 + (pb_dither[8*y + (x & 7)] >> log2_scale))
;                   >> (6 - log2_scale))
; src2 is overwritten with 0 after being read; src[x] itself is left intact.
;
; Both loops are do/while shaped: the body runs at least once.
; On x86-32 cglobal loads no arguments; dst/src/width/height/log2_scale are
; fetched by hand and dst_stride/src_stride stay in their stack slots
; (r2m/r3m), which are rewritten in place.
%if ARCH_X86_64
cglobal store_slice2, 7, 9, 0, dst, src, dst_stride, src_stride, width, dither_height, dither, tmp, tmp2
%else
cglobal store_slice2, 0, 7, 0, dst, src, width, dither_height, dither, tmp, tmp2
%define dst_strideq r2m
%define src_strideq r3m
    mov       dstq, dstm
    mov       srcq, srcm
    mov       widthq, r4m
    mov       dither_heightq, r5m
    mov       ditherq, r6m ; log2_scale
%endif
    add       widthq, 7
    mov       tmpq, src_strideq
    and       widthq, ~7                 ; width = (width + 7) & ~7
    sub       dst_strideq, widthq        ; dst_stride -= width (row-end skip, bytes)
    movd      m5, ditherd                ; m5 = log2_scale (dither pre-shift)
    xor       ditherq, -1                ; ~log2_scale
    mov       tmp2q, tmpq
    add       ditherq, 7                 ; ~log2_scale + 7 = 6 - log2_scale
    sub       tmp2q, widthq
    movd      m2, ditherd                ; m2 = 6 - log2_scale (final shift)
    add       tmp2q, tmp2q
    lea       ditherq, [pb_dither]       ; from here on: dither row pointer
    mov       src_strideq, tmp2q         ; src_stride = 2*(src_stride - width) (row-end skip, bytes)
    shl       tmpq, 5                    ; tmp = +32*src_stride (byte offset of the second area)
    lea       dither_heightq, [ditherq+dither_heightq*8] ; end pointer = pb_dither + 8*height
    pxor      m7, m7                     ; m7 = 0

.loop_height:
    ; m3/m4 = the 8 dither bytes of this row as words, >> log2_scale
    movq      m3, [ditherq]
    movq      m4, m3
    punpcklbw m3, m7
    punpckhbw m4, m7
    mov       tmp2q, widthq              ; samples left in this row
    psraw     m3, m5
    psraw     m4, m5

.loop_width:
    movq      m0, [srcq]
    movq      m1, [srcq+8]
    paddw     m0, m3
    paddw     m0, [srcq+tmpq]            ; add the second area (low 4 words)
    paddw     m1, m4
    movq      m6, [srcq+tmpq+8]          ; second area (high 4 words)
    movq      [srcq+tmpq], m7            ; clear the second area once consumed
    psraw     m0, m2
    paddw     m1, m6
    movq      [srcq+tmpq+8], m7
    psraw     m1, m2
    packuswb  m0, m1                     ; 8 words -> 8 unsigned-saturated bytes
    movq      [dstq], m0
    add       srcq, 16
    add       dstq, 8
    sub       tmp2q, 8
    jg .loop_width

    add       srcq, src_strideq          ; skip to the start of the next source row
    add       ditherq, 8                 ; next dither row
    add       dstq, dst_strideq
    cmp       ditherq, dither_heightq
    jl .loop_height                      ; NOTE(review): signed compare of two addresses
    RET
182
 
183
;void ff_mul_thrmat_mmx(int16_t *thr_adr_noq, int16_t *thr_adr, int q);
;
; thr_adr[i] = low 16 bits of (thr_adr_noq[i] * (int16_t)q), for i = 0..63.
; The 64 words are handled as 16 quadwords (byte offsets 8*0 .. 8*15), fully
; unrolled with loads, multiplies and stores interleaved; quadword k is always
; loaded before quadword k is stored.
cglobal mul_thrmat, 3, 3, 0, thrn, thr, q
    movd      m7, qd
    movq      m0, [thrnq]
    punpcklwd m7, m7
    movq      m1, [thrnq+8*1]
    punpckldq m7, m7                     ; m7 = low word of q in all four lanes
    pmullw    m0, m7
    movq      m2, [thrnq+8*2]
    pmullw    m1, m7
    movq      m3, [thrnq+8*3]
    pmullw    m2, m7
    movq      [thrq], m0
    movq      m4, [thrnq+8*4]
    pmullw    m3, m7
    movq      [thrq+8*1], m1
    movq      m5, [thrnq+8*5]
    pmullw    m4, m7
    movq      [thrq+8*2], m2
    movq      m6, [thrnq+8*6]
    pmullw    m5, m7
    movq      [thrq+8*3], m3
    movq      m0, [thrnq+8*7]
    pmullw    m6, m7
    movq      [thrq+8*4], m4
    movq      m1, [thrnq+8*8]
    pmullw    m0, m7
    movq      [thrq+8*5], m5
    movq      m2, [thrnq+8*9]
    pmullw    m1, m7
    movq      [thrq+8*6], m6
    movq      m3, [thrnq+8*10]
    pmullw    m2, m7
    movq      [thrq+8*7], m0
    movq      m4, [thrnq+8*11]
    pmullw    m3, m7
    movq      [thrq+8*8], m1
    movq      m5, [thrnq+8*12]
    pmullw    m4, m7
    movq      [thrq+8*9], m2
    movq      m6, [thrnq+8*13]
    pmullw    m5, m7
    movq      [thrq+8*10], m3
    movq      m0, [thrnq+8*14]
    pmullw    m6, m7
    movq      [thrq+8*11], m4
    movq      m1, [thrnq+8*15]
    pmullw    m0, m7
    movq      [thrq+8*12], m5
    pmullw    m1, m7
    movq      [thrq+8*13], m6
    movq      [thrq+8*14], m0
    movq      [thrq+8*15], m1
    RET
237
 
238
; COLUMN_FDCT label [, thr_offset = 0 [, extra_advance = 0]]
;
; Processes four adjacent columns (one quadword per row) of the 8-row int16
; block at srcq: forward butterflies, thresholding against the table at thrq,
; and - on the fall-through path - the inverse step accumulated into outq.
;
;   %1  label jumped to when any of the three values thresholded against thr
;       rows 3, 5 and 7 is non-zero. At that point nothing has been written to
;       outq and srcq/outq have not been advanced; the target is expected to
;       finish the work (see COLUMN_IDCT) using the live state listed below.
;   %2  byte offset added to every thrq access (threshold row k is read from
;       [thrq + k*16 + %2]).
;   %3  extra bytes added to the 8-byte advance of srcq and outq.
;
; Thresholding uses the branch-free sequence
;   psubw x,t / paddusw x,t / paddw x,t / psubusw x,t
; which, for a non-negative threshold t, turns x into 0 when -t <= x < t and
; leaves it unchanged otherwise.
;
; State left for the %1 target:
;   [rsp], [rsp+8], [rsp+16], [rsp+24]  four quadwords of even-part results
;   m0, m1, m5, m6                      values thresholded against thr rows
;                                       1, 3, 5 and 7 respectively
; Fall-through path: adds its result into out rows 0..5, overwrites out rows
; 6 and 7, then advances srcq and outq by 8+%3.
; Clobbers m0-m7, tmpd and flags. Requires 32 bytes of scratch at [rsp].
%macro COLUMN_FDCT 1-3 0, 0
    movq      m1, [srcq+DCTSIZE*0*2]
    movq      m7, [srcq+DCTSIZE*3*2]
    movq      m0, m1
    paddw     m1, [srcq+DCTSIZE*7*2]
    movq      m3, m7
    paddw     m7, [srcq+DCTSIZE*4*2]
    movq      m5, m1
    movq      m6, [srcq+DCTSIZE*1*2]
    psubw     m1, m7
    movq      m2, [srcq+DCTSIZE*2*2]
    movq      m4, m6
    paddw     m6, [srcq+DCTSIZE*6*2]
    paddw     m5, m7
    paddw     m2, [srcq+DCTSIZE*5*2]
    movq      m7, m6
    paddw     m6, m2
    psubw     m7, m2
    movq      m2, m5
    paddw     m5, m6
    psubw     m2, m6
    paddw     m7, m1
    ; threshold m5 against thr row 0 and m2 against thr row 4
    movq      m6, [thrq+4*16+%2]
    psllw     m7, 2
    psubw     m5, [thrq+%2]
    psubw     m2, m6
    paddusw   m5, [thrq+%2]
    paddusw   m2, m6
    pmulhw    m7, [pw_2D41]
    paddw     m5, [thrq+%2]
    paddw     m2, m6
    psubusw   m5, [thrq+%2]
    psubusw   m2, m6
    paddw     m5, [pw_2]
    movq      m6, m2
    paddw     m2, m5
    psubw     m5, m6
    movq      m6, m1
    paddw     m1, m7
    ; threshold m1 against thr row 2 and m6 against thr row 6
    psubw     m1, [thrq+2*16+%2]
    psubw     m6, m7
    movq      m7, [thrq+6*16+%2]
    psraw     m5, 2
    paddusw   m1, [thrq+2*16+%2]
    psubw     m6, m7
    paddw     m1, [thrq+2*16+%2]
    paddusw   m6, m7
    psubusw   m1, [thrq+2*16+%2]
    paddw     m6, m7
    psubw     m3, [srcq+DCTSIZE*4*2]
    psubusw   m6, m7
    movq      m7, m1
    psraw     m2, 2
    psubw     m4, [srcq+DCTSIZE*6*2]
    psubw     m1, m6
    psubw     m0, [srcq+DCTSIZE*7*2]
    paddw     m6, m7
    psraw     m6, 2
    movq      m7, m2
    pmulhw    m1, [pw_5A82]
    paddw     m2, m6
    movq      [rsp], m2                  ; scratch slot 0
    psubw     m7, m6
    movq      m2, [srcq+DCTSIZE*2*2]
    psubw     m1, m6
    psubw     m2, [srcq+DCTSIZE*5*2]
    movq      m6, m5
    movq      [rsp+8*3], m7              ; scratch slot 3
    paddw     m3, m2
    paddw     m2, m4
    paddw     m4, m0
    movq      m7, m3
    psubw     m3, m4
    psllw     m3, 2
    psllw     m7, 2
    pmulhw    m3, [pw_187E]
    psllw     m4, 2
    pmulhw    m7, [pw_22A3]
    psllw     m2, 2
    pmulhw    m4, [pw_539F]
    paddw     m5, m1
    pmulhw    m2, [pw_2D41]
    psubw     m6, m1
    paddw     m7, m3
    movq      [rsp+8], m5                ; scratch slot 1
    paddw     m4, m3
    ; threshold m1/m5/m0/m6 against thr rows 3/5/1/7
    movq      m3, [thrq+3*16+%2]
    movq      m1, m0
    movq      [rsp+8*2], m6              ; scratch slot 2
    psubw     m1, m2
    paddw     m0, m2
    movq      m5, m1
    movq      m2, [thrq+5*16+%2]
    psubw     m1, m7
    paddw     m5, m7
    psubw     m1, m3
    movq      m7, [thrq+16+%2]
    psubw     m5, m2
    movq      m6, m0
    paddw     m0, m4
    paddusw   m1, m3
    psubw     m6, m4
    movq      m4, [thrq+7*16+%2]
    psubw     m0, m7
    psubw     m6, m4
    paddusw   m5, m2
    paddusw   m6, m4
    paddw     m1, m3
    paddw     m5, m2
    paddw     m6, m4
    psubusw   m1, m3
    psubusw   m5, m2
    psubusw   m6, m4
    ; m4 = m1 | m5 | m6, folded to 32 bits: non-zero iff any lane survived
    movq      m4, m1
    por       m4, m5
    paddusw   m0, m7
    por       m4, m6
    paddw     m0, m7
    packssdw  m4, m4
    psubusw   m0, m7
    movd      tmpd, m4
    test      tmpd, tmpd
    jnz %1
    ; Shortcut: only m0 (thr row 1) and the four scratch slots contribute.
    movq      m4, [rsp]
    movq      m1, m0
    pmulhw    m0, [pw_3642]
    movq      m2, m1
    movq      m5, [outq+DCTSIZE*0*2]
    movq      m3, m2
    pmulhw    m1, [pw_2441]
    paddw     m5, m4
    movq      m6, [rsp+8]
    psraw     m3, 2
    pmulhw    m2, [pw_0CBB]
    psubw     m4, m3
    movq      m7, [outq+DCTSIZE*1*2]
    paddw     m5, m3
    movq      [outq+DCTSIZE*7*2], m4     ; row 7 is overwritten, not accumulated
    paddw     m7, m6
    movq      m3, [rsp+8*2]
    psubw     m6, m0
    movq      m4, [outq+DCTSIZE*2*2]
    paddw     m7, m0
    movq      [outq], m5
    paddw     m4, m3
    movq      [outq+DCTSIZE*6*2], m6     ; row 6 is overwritten, not accumulated
    psubw     m3, m1
    movq      m5, [outq+DCTSIZE*5*2]
    paddw     m4, m1
    movq      m6, [outq+DCTSIZE*3*2]
    paddw     m5, m3
    movq      m0, [rsp+8*3]
    add       srcq, 8+%3
    movq      [outq+DCTSIZE*1*2], m7
    paddw     m6, m0
    movq      [outq+DCTSIZE*2*2], m4
    psubw     m0, m2
    movq      m7, [outq+DCTSIZE*4*2]
    paddw     m6, m2
    movq      [outq+DCTSIZE*5*2], m5
    paddw     m7, m0
    movq      [outq+DCTSIZE*3*2], m6
    movq      [outq+DCTSIZE*4*2], m7
    add       outq, 8+%3
%endmacro
403
 
404
; COLUMN_IDCT [extra_advance = 0]
;
; Full inverse step for four columns; used as the jump target of COLUMN_FDCT
; when its shortcut path cannot be taken.
;
; Live-in state (all read before being overwritten here):
;   m0, m1, m5, m6                      odd-part inputs
;   [rsp], [rsp+8], [rsp+16], [rsp+24]  four quadwords of even-part results
; Effect: adds the result into out rows 0..5, overwrites out rows 6 and 7,
; then advances srcq and outq by 8+%1 bytes.
;   %1  extra bytes added to the 8-byte advance of srcq and outq.
; Clobbers m0-m7.
%macro COLUMN_IDCT 0-1 0
    movq      m3, m5
    psubw     m5, m1
    psllw     m5, 1
    paddw     m3, m1
    movq      m2, m0
    psubw     m0, m6
    movq      m1, m5
    psllw     m0, 1
    pmulhw    m1, [pw_AC62]
    paddw     m5, m0
    pmulhw    m5, [pw_3B21]
    paddw     m2, m6
    pmulhw    m0, [pw_22A3]
    movq      m7, m2
    movq      m4, [rsp]                  ; scratch slot 0
    psubw     m2, m3
    psllw     m2, 1
    paddw     m7, m3
    pmulhw    m2, [pw_2D41]
    movq      m6, m4
    psraw     m7, 2
    paddw     m4, [outq]
    psubw     m6, m7
    movq      m3, [rsp+8]                ; scratch slot 1
    paddw     m4, m7
    movq      [outq+DCTSIZE*7*2], m6     ; row 7 is overwritten, not accumulated
    paddw     m1, m5
    movq      [outq], m4
    psubw     m1, m7
    movq      m7, [rsp+8*2]              ; scratch slot 2
    psubw     m0, m5
    movq      m6, [rsp+8*3]              ; scratch slot 3
    movq      m5, m3
    paddw     m3, [outq+DCTSIZE*1*2]
    psubw     m5, m1
    psubw     m2, m1
    paddw     m3, m1
    movq      [outq+DCTSIZE*6*2], m5     ; row 6 is overwritten, not accumulated
    movq      m4, m7
    paddw     m7, [outq+DCTSIZE*2*2]
    psubw     m4, m2
    paddw     m4, [outq+DCTSIZE*5*2]
    paddw     m7, m2
    movq      [outq+DCTSIZE*1*2], m3
    paddw     m0, m2
    movq      [outq+DCTSIZE*2*2], m7
    movq      m1, m6
    paddw     m6, [outq+DCTSIZE*4*2]
    psubw     m1, m0
    paddw     m1, [outq+DCTSIZE*3*2]
    paddw     m6, m0
    movq      [outq+DCTSIZE*5*2], m4
    add       srcq, 8+%1
    movq      [outq+DCTSIZE*4*2], m6
    movq      [outq+DCTSIZE*3*2], m1
    add       outq, 8+%1
%endmacro
462
 
463
;void ff_column_fidct_mmx(int16_t *thr_adr, int16_t *data, int16_t *output, int cnt);
;
; Column pass: forward transform + thresholding + inverse transform, with the
; result accumulated into output (see COLUMN_FDCT / COLUMN_IDCT for details).
;
; Each loop iteration handles two groups of four columns:
;   1st group: thr offset 0, src/out advance  8 bytes
;   2nd group: thr offset 8, src/out advance 24 bytes (8 + 16)
; i.e. 32 bytes per iteration, and cnt is decremented by 2. The loop is
; do/while shaped (runs at least once) and continues while cnt > 0.
; For each group COLUMN_FDCT either completes on its shortcut path or jumps to
; the matching COLUMN_IDCT. 32 bytes of stack are reserved as macro scratch.
cglobal column_fidct, 4, 5, 0, 32, thr, src, out, cnt, tmp
.fdct1:
    COLUMN_FDCT .idct1
    jmp .fdct2                           ; shortcut path done; skip the full IDCT

.idct1:
    COLUMN_IDCT

.fdct2:
    COLUMN_FDCT .idct2, 8, 16
    sub    cntd, 2
    jg .fdct1
    RET

.idct2:
    COLUMN_IDCT 16
    sub    cntd, 2
    jg .fdct1
    RET
483
 
484
;void ff_row_idct_mmx(int16_t *workspace, int16_t *output_adr, ptrdiff_t output_stride, int cnt);
;
; Row pass of the inverse transform. Per iteration it reads 64 bytes from the
; workspace (four rows of eight int16: [srcq+DCTSIZE*k*2] and the same +8 for
; k = 0..3), transposes them in 4x4 word tiles, and adds the rounded result
; ((x + 4) >> 3) into four int16 columns of eight output rows.
;
; output_stride is doubled on entry and then used as a byte stride
; (presumably it is passed in int16 units - confirm against the caller).
; Per iteration: srcq += 64, dstq += 8; runs cnt times (do/while, cnt must be
; non-zero). 16 bytes of stack are used as scratch.
cglobal row_idct, 4, 5, 0, 16, src, dst, stride, cnt, stride3
    add       strideq, strideq           ; stride *= 2
    lea       stride3q, [strideq+strideq*2] ; stride3 = 3 * (doubled) stride
.loop:
    ; first half: low quadword of workspace rows 0..3
    movq      m0, [srcq+DCTSIZE*0*2]
    movq      m1, [srcq+DCTSIZE*1*2]
    movq      m4, m0
    movq      m2, [srcq+DCTSIZE*2*2]
    punpcklwd m0, m1
    movq      m3, [srcq+DCTSIZE*3*2]
    punpckhwd m4, m1
    movq      m7, m2
    punpcklwd m2, m3
    movq      m6, m0
    punpckldq m0, m2
    punpckhdq m6, m2
    movq      m5, m0
    punpckhwd m7, m3
    psubw     m0, m6
    pmulhw    m0, [pw_5A82]
    movq      m2, m4
    punpckldq m4, m7
    paddw     m5, m6
    punpckhdq m2, m7
    movq      m1, m4
    psllw     m0, 2
    paddw     m4, m2
    ; second half: high quadword of workspace rows 0..3
    movq      m3, [srcq+DCTSIZE*0*2+8]
    psubw     m1, m2
    movq      m2, [srcq+DCTSIZE*1*2+8]
    psubw     m0, m5
    movq      m6, m4
    paddw     m4, m5
    psubw     m6, m5
    movq      m7, m1
    movq      m5, [srcq+DCTSIZE*2*2+8]
    paddw     m1, m0
    movq      [rsp], m4                  ; scratch slot 0
    movq      m4, m3
    movq      [rsp+8], m6                ; scratch slot 1
    punpcklwd m3, m2
    movq      m6, [srcq+DCTSIZE*3*2+8]
    punpckhwd m4, m2
    movq      m2, m5
    punpcklwd m5, m6
    psubw     m7, m0
    punpckhwd m2, m6
    movq      m0, m3
    punpckldq m3, m5
    punpckhdq m0, m5
    movq      m5, m4
    movq      m6, m3
    punpckldq m4, m2
    psubw     m3, m0
    punpckhdq m5, m2
    paddw     m6, m0
    movq      m2, m4
    movq      m0, m3
    psubw     m4, m5
    pmulhw    m0, [pw_AC62]
    paddw     m3, m4
    pmulhw    m3, [pw_3B21]
    paddw     m2, m5
    pmulhw    m4, [pw_22A3]
    movq      m5, m2
    psubw     m2, m6
    paddw     m5, m6
    pmulhw    m2, [pw_2D41]
    paddw     m0, m3
    psllw     m0, 3
    psubw     m4, m3
    movq      m6, [rsp]
    movq      m3, m1
    psllw     m4, 3
    psubw     m0, m5
    psllw     m2, 3
    paddw     m1, m0
    psubw     m2, m0
    psubw     m3, m0
    paddw     m4, m2
    movq      m0, m7
    paddw     m7, m2
    psubw     m0, m2
    movq      m2, [pw_4]                 ; rounding term for the >>3 below
    psubw     m6, m5
    paddw     m5, [rsp]
    paddw     m1, m2
    paddw     m5, m2
    psraw     m1, 3
    paddw     m7, m2
    psraw     m5, 3
    paddw     m5, [dstq]                 ; accumulate into output rows 0..2
    psraw     m7, 3
    paddw     m1, [dstq+strideq*1]
    paddw     m0, m2
    paddw     m7, [dstq+strideq*2]
    paddw     m3, m2
    movq      [dstq], m5
    paddw     m6, m2
    movq      [dstq+strideq*1], m1
    psraw     m0, 3
    movq      [dstq+strideq*2], m7
    add       dstq, stride3q             ; dstq -> output row 3; rows 3..7 follow
    movq      m5, [rsp+8]
    psraw     m3, 3
    paddw     m0, [dstq+strideq*2]       ; row 5
    psubw     m5, m4
    paddw     m3, [dstq+stride3q*1]      ; row 6
    psraw     m6, 3
    paddw     m4, [rsp+8]
    paddw     m5, m2
    paddw     m6, [dstq+strideq*4]       ; row 7
    paddw     m4, m2
    movq      [dstq+strideq*2], m0
    psraw     m5, 3
    paddw     m5, [dstq]                 ; row 3
    psraw     m4, 3
    paddw     m4, [dstq+strideq*1]       ; row 4
    add       srcq, DCTSIZE*2*4
    movq      [dstq+stride3q*1], m3
    movq      [dstq+strideq*4], m6
    movq      [dstq], m5
    movq      [dstq+strideq*1], m4
    sub       dstq, stride3q             ; back to row 0 ...
    add       dstq, 8                    ; ... and on to the next four columns
    dec       cntd
    jnz .loop
    RET
613
 
614
;void ff_row_fdct_mmx(int16_t *data, const uint8_t *pixels, ptrdiff_t line_size, int cnt);
;
; Row pass of the forward transform. Per iteration it loads four bytes from
; each of eight consecutive pixel rows (line_size bytes apart), widens them to
; words, runs the 8-point forward butterflies down those rows, transposes the
; results in 4x4 word tiles and writes 64 bytes to data (four rows of eight
; int16: [srcq+DCTSIZE*k*2] and the same +8 for k = 0..3).
;
; Note that the register named "src" holds the OUTPUT pointer (data); the
; input pixels are read through "pix".
; Per iteration: pixq += 4, srcq += 64; runs cnt times (do/while, cnt must be
; non-zero). 16 bytes of stack are used as scratch.
cglobal row_fdct, 4, 5, 0, 16, src, pix, stride, cnt, stride3
    lea       stride3q, [strideq+strideq*2] ; stride3 = 3 * line_size
.loop:
    movd      m0, [pixq]                 ; pixel row 0
    pxor      m7, m7                     ; zero, for byte -> word unpacking
    movd      m1, [pixq+strideq*1]       ; row 1
    punpcklbw m0, m7
    movd      m2, [pixq+strideq*2]       ; row 2
    punpcklbw m1, m7
    punpcklbw m2, m7
    add       pixq, stride3q             ; pixq -> row 3; rows 3..7 follow
    movq      m5, m0
    movd      m3, [pixq+strideq*4]       ; row 7
    movq      m6, m1
    movd      m4, [pixq+stride3q*1]      ; row 6
    punpcklbw m3, m7
    psubw     m5, m3
    punpcklbw m4, m7
    paddw     m0, m3
    psubw     m6, m4
    movd      m3, [pixq+strideq*2]       ; row 5
    paddw     m1, m4
    movq      [rsp], m5                  ; scratch slot 0: row0 - row7
    punpcklbw m3, m7
    movq      [rsp+8], m6                ; scratch slot 1: row1 - row6
    movq      m4, m2
    movd      m5, [pixq]                 ; row 3
    paddw     m2, m3
    movd      m6, [pixq+strideq*1]       ; row 4
    punpcklbw m5, m7
    psubw     m4, m3
    punpcklbw m6, m7
    movq      m3, m5
    paddw     m5, m6
    psubw     m3, m6
    movq      m6, m0
    movq      m7, m1                     ; m7 is no longer zero from here on
    psubw     m0, m5
    psubw     m1, m2
    paddw     m7, m2
    paddw     m1, m0
    movq      m2, m7
    psllw     m1, 2
    paddw     m6, m5
    pmulhw    m1, [pw_2D41]
    paddw     m7, m6
    psubw     m6, m2
    movq      m5, m0
    movq      m2, m7
    punpcklwd m7, m6
    paddw     m0, m1
    punpckhwd m2, m6
    psubw     m5, m1
    movq      m6, m0
    movq      m1, [rsp+8]
    punpcklwd m0, m5
    punpckhwd m6, m5
    movq      m5, m0
    punpckldq m0, m7
    paddw     m3, m4
    punpckhdq m5, m7
    movq      m7, m6
    ; first half of the output: low quadword of data rows 0..3
    movq      [srcq+DCTSIZE*0*2], m0
    punpckldq m6, m2
    movq      [srcq+DCTSIZE*1*2], m5
    punpckhdq m7, m2
    movq      [srcq+DCTSIZE*2*2], m6
    paddw     m4, m1
    movq      [srcq+DCTSIZE*3*2], m7
    psllw     m3, 2
    movq      m2, [rsp]
    psllw     m4, 2
    pmulhw    m4, [pw_2D41]
    paddw     m1, m2
    psllw     m1, 2
    movq      m0, m3
    pmulhw    m0, [pw_22A3]
    psubw     m3, m1
    pmulhw    m3, [pw_187E]
    movq      m5, m2
    pmulhw    m1, [pw_539F]
    psubw     m2, m4
    paddw     m5, m4
    movq      m6, m2
    paddw     m0, m3
    movq      m7, m5
    paddw     m2, m0
    psubw     m6, m0
    movq      m4, m2
    paddw     m1, m3
    punpcklwd m2, m6
    paddw     m5, m1
    punpckhwd m4, m6
    psubw     m7, m1
    movq      m6, m5
    punpcklwd m5, m7
    punpckhwd m6, m7
    movq      m7, m2
    punpckldq m2, m5
    sub       pixq, stride3q             ; back to pixel row 0 ...
    punpckhdq m7, m5
    movq      m5, m4
    ; second half of the output: high quadword of data rows 0..3
    movq      [srcq+DCTSIZE*0*2+8], m2
    punpckldq m4, m6
    movq      [srcq+DCTSIZE*1*2+8], m7
    punpckhdq m5, m6
    movq      [srcq+DCTSIZE*2*2+8], m4
    add       pixq, 4                    ; ... and on to the next four pixels
    movq      [srcq+DCTSIZE*3*2+8], m5
    add       srcq, DCTSIZE*4*2
    dec       cntd
    jnz .loop
    RET