Subversion Repositories Kolibri OS

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
6148 serge 1
/*
2
 * Simple IDCT MMX
3
 *
4
 * Copyright (c) 2001, 2002 Michael Niedermayer 
5
 *
6
 * This file is part of FFmpeg.
7
 *
8
 * FFmpeg is free software; you can redistribute it and/or
9
 * modify it under the terms of the GNU Lesser General Public
10
 * License as published by the Free Software Foundation; either
11
 * version 2.1 of the License, or (at your option) any later version.
12
 *
13
 * FFmpeg is distributed in the hope that it will be useful,
14
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
15
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
16
 * Lesser General Public License for more details.
17
 *
18
 * You should have received a copy of the GNU Lesser General Public
19
 * License along with FFmpeg; if not, write to the Free Software
20
 * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
21
 */
22
#include "libavcodec/simple_idct.h"
23
#include "libavutil/mem.h"
24
#include "dsputil_x86.h"
25
 
26
#if HAVE_INLINE_ASM
27
 
28
/* Unrounded values of cos(i*M_PI/16)*sqrt(2)*(1<<14) for i = 0..7:
29
23170.475006
30
22725.260826
31
21406.727617
32
19265.545870
33
16384.000000
34
12872.826198
35
8866.956905
36
4520.335430
37
*/
38
/* 14-bit fixed-point cosine coefficients: Ci = cos(i*M_PI/16)*sqrt(2)*(1<<14),
 * rounded to nearest except C4 (see below). */
#define C0 23170 //cos(0*M_PI/16)*sqrt(2)*(1<<14) + 0.5
39
#define C1 22725 //cos(1*M_PI/16)*sqrt(2)*(1<<14) + 0.5
40
#define C2 21407 //cos(2*M_PI/16)*sqrt(2)*(1<<14) + 0.5
41
#define C3 19266 //cos(3*M_PI/16)*sqrt(2)*(1<<14) + 0.5
42
/* C4 is exactly 16384.0 unrounded; it is deliberately taken one lower
 * (note the "- 0.5" instead of "+ 0.5"). */
#define C4 16383 //cos(4*M_PI/16)*sqrt(2)*(1<<14) - 0.5
43
#define C5 12873 //cos(5*M_PI/16)*sqrt(2)*(1<<14) + 0.5
44
#define C6 8867  //cos(6*M_PI/16)*sqrt(2)*(1<<14) + 0.5
45
#define C7 4520  //cos(7*M_PI/16)*sqrt(2)*(1<<14) + 0.5
46
 
47
/* Row-pass shift amount; coeffs[] stores the matching bias 1<<(ROW_SHIFT-1).
 * The row macro invocations visible in idct() pass the literal 11. */
#define ROW_SHIFT 11
48
/* Column-pass shift amount; only referenced from comments in coeffs[].
 * The COL_IDCT invocations visible in idct() pass the literal 20. */
#define COL_SHIFT 20 // 6
49
 
50
/* Word mask that keeps the upper 16-bit word of each 32-bit half and clears
 * the lower one. DC_COND_IDCT ANDs it with the src0 quadword (R4 R0 r4 r0),
 * dropping the r0/R0 words, then ORs in src4/src1/src5; if the result is
 * zero it branches to its shortcut path (label 1). */
DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL;
51
/* Bias used on the DC_COND_IDCT shortcut path: added with paddd after src0
 * has been shifted left by 16 and before the arithmetic right shift by 13.
 * Only the low 32-bit half is non-zero; 0x40000 >> 13 == 32. */
DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL;
52
 
53
/*
 * Constant table for the MMX IDCT. Each row below is four int16_t values,
 * i.e. 8 bytes / one MMX quadword, so row k starts at byte offset 8*k.
 * The asm in idct() loads these rows as N(%2); the byte offsets noted on
 * each row match the offsets and lane comments used there.
 * NOTE(review): %2 is assumed to be bound to this table - confirm against
 * the operand list of the asm statement.
 * Lane comments in the asm list words from high to low, so a row written
 * here as "a, b, a, b" appears there as "b a b a".
 */
DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= {
54
        1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0,       /* offset 0: row rounder, used as "paddd (%2)" */
55
//        1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0,
56
//        0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16),
57
        1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0,       /* offset 8: row rounder plus extra bias, used as "paddd 8(%2)" */
58
        // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT, i.e. 32<<11 == 1<<16:
        // a 1 in the upper word of the low dword
59
//        0, 0, 0, 0,
60
//        0, 0, 0, 0,
61
 
62
 C4,  C4,  C4,  C4,     /* offset 16 */
63
 C4, -C4,  C4, -C4,     /* offset 24 */
64
 
65
 C2,  C6,  C2,  C6,     /* offset 32 */
66
 C6, -C2,  C6, -C2,     /* offset 40 */
67
 
68
 C1,  C3,  C1,  C3,     /* offset 48 */
69
 C5,  C7,  C5,  C7,     /* offset 56 */
70
 
71
 C3, -C7,  C3, -C7,     /* offset 64 */
72
-C1, -C5, -C1, -C5,     /* offset 72 */
73
 
74
 C5, -C1,  C5, -C1,     /* offset 80 */
75
 C7,  C3,  C7,  C3,     /* offset 88 */
76
 
77
 C7, -C5,  C7, -C5,     /* offset 96 */
78
 C3, -C1,  C3, -C1      /* offset 104 */
79
};
80
 
81
static inline void idct(int16_t *block)
82
{
83
        LOCAL_ALIGNED_8(int64_t, align_tmp, [16]);
84
        int16_t * const temp= (int16_t*)align_tmp;
85
 
86
        __asm__ volatile(
87
#if 0 //Alternative, simpler variant
88
 
89
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
90
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
91
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
92
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
93
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
94
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
95
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
96
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
97
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
98
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
99
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
100
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
101
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
102
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
103
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
104
        #rounder ", %%mm4               \n\t"\
105
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
106
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
107
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
108
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
109
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
110
        #rounder ", %%mm0               \n\t"\
111
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
112
        "paddd %%mm0, %%mm0             \n\t" \
113
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
114
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
115
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
116
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
117
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
118
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
119
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
120
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
121
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
122
        "psrad $" #shift ", %%mm7       \n\t"\
123
        "psrad $" #shift ", %%mm4       \n\t"\
124
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
125
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
126
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
127
        "psrad $" #shift ", %%mm1       \n\t"\
128
        "psrad $" #shift ", %%mm2       \n\t"\
129
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
130
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
131
        "movq %%mm7, " #dst "           \n\t"\
132
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
133
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
134
        "movq %%mm2, 24+" #dst "        \n\t"\
135
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
136
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
137
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
138
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
139
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
140
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
141
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
142
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
143
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
144
        "psrad $" #shift ", %%mm2       \n\t"\
145
        "psrad $" #shift ", %%mm0       \n\t"\
146
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
147
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
148
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
149
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
150
        "psrad $" #shift ", %%mm6       \n\t"\
151
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
152
        "movq %%mm2, 8+" #dst "         \n\t"\
153
        "psrad $" #shift ", %%mm4       \n\t"\
154
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
155
        "movq %%mm4, 16+" #dst "        \n\t"\
156
 
157
#define COL_IDCT(src0, src4, src1, src5, dst, shift) \
158
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
159
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
160
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
161
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
162
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
163
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
164
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
165
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
166
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
167
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
168
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
169
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
170
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
171
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
172
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
173
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
174
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
175
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
176
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
177
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
178
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
179
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
180
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
181
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
182
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
183
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
184
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
185
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
186
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
187
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
188
        "psrad $" #shift ", %%mm7       \n\t"\
189
        "psrad $" #shift ", %%mm4       \n\t"\
190
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
191
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
192
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
193
        "psrad $" #shift ", %%mm0       \n\t"\
194
        "psrad $" #shift ", %%mm2       \n\t"\
195
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
196
        "movd %%mm7, " #dst "           \n\t"\
197
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
198
        "movd %%mm0, 16+" #dst "        \n\t"\
199
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
200
        "movd %%mm2, 96+" #dst "        \n\t"\
201
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
202
        "movd %%mm4, 112+" #dst "       \n\t"\
203
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
204
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
205
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
206
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
207
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
208
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
209
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
210
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
211
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
212
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
213
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
214
        "psrad $" #shift ", %%mm2       \n\t"\
215
        "psrad $" #shift ", %%mm5       \n\t"\
216
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
217
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
218
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
219
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
220
        "psrad $" #shift ", %%mm6       \n\t"\
221
        "psrad $" #shift ", %%mm4       \n\t"\
222
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
223
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
224
        "movd %%mm2, 32+" #dst "        \n\t"\
225
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
226
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
227
        "movd %%mm6, 48+" #dst "        \n\t"\
228
        "movd %%mm4, 64+" #dst "        \n\t"\
229
        "movd %%mm5, 80+" #dst "        \n\t"\
230
 
231
 
232
#define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
233
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
234
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
235
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
236
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
237
        "movq "MANGLE(wm1010)", %%mm4   \n\t"\
238
        "pand %%mm0, %%mm4              \n\t"\
239
        "por %%mm1, %%mm4               \n\t"\
240
        "por %%mm2, %%mm4               \n\t"\
241
        "por %%mm3, %%mm4               \n\t"\
242
        "packssdw %%mm4,%%mm4           \n\t"\
243
        "movd %%mm4, %%eax              \n\t"\
244
        "orl %%eax, %%eax               \n\t"\
245
        "jz 1f                          \n\t"\
246
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
247
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
248
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
249
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
250
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
251
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
252
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
253
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
254
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
255
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
256
        #rounder ", %%mm4               \n\t"\
257
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
258
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
259
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
260
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
261
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
262
        #rounder ", %%mm0               \n\t"\
263
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
264
        "paddd %%mm0, %%mm0             \n\t" \
265
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
266
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
267
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
268
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
269
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
270
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
271
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
272
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
273
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
274
        "psrad $" #shift ", %%mm7       \n\t"\
275
        "psrad $" #shift ", %%mm4       \n\t"\
276
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
277
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
278
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
279
        "psrad $" #shift ", %%mm1       \n\t"\
280
        "psrad $" #shift ", %%mm2       \n\t"\
281
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
282
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
283
        "movq %%mm7, " #dst "           \n\t"\
284
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
285
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
286
        "movq %%mm2, 24+" #dst "        \n\t"\
287
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
288
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
289
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
290
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
291
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
292
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
293
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
294
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
295
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
296
        "psrad $" #shift ", %%mm2       \n\t"\
297
        "psrad $" #shift ", %%mm0       \n\t"\
298
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
299
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
300
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
301
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
302
        "psrad $" #shift ", %%mm6       \n\t"\
303
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
304
        "movq %%mm2, 8+" #dst "         \n\t"\
305
        "psrad $" #shift ", %%mm4       \n\t"\
306
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
307
        "movq %%mm4, 16+" #dst "        \n\t"\
308
        "jmp 2f                         \n\t"\
309
        "1:                             \n\t"\
310
        "pslld $16, %%mm0               \n\t"\
311
        "#paddd "MANGLE(d40000)", %%mm0 \n\t"\
312
        "psrad $13, %%mm0               \n\t"\
313
        "packssdw %%mm0, %%mm0          \n\t"\
314
        "movq %%mm0, " #dst "           \n\t"\
315
        "movq %%mm0, 8+" #dst "         \n\t"\
316
        "movq %%mm0, 16+" #dst "        \n\t"\
317
        "movq %%mm0, 24+" #dst "        \n\t"\
318
        "2:                             \n\t"
319
 
320
 
321
//IDCT(      src0,   src4,   src1,   src5,    dst,    rounder, shift)
322
ROW_IDCT(    (%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
323
/*ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11)
324
ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11)
325
ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/
326
 
327
DC_COND_ROW_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11)
328
DC_COND_ROW_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11)
329
DC_COND_ROW_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11)
330
 
331
 
332
//IDCT(      src0,   src4,   src1,    src5,    dst, shift)
333
COL_IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
334
COL_IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
335
COL_IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
336
COL_IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
337
 
338
#else
339
 
340
#define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
341
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
342
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
343
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
344
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
345
        "movq "MANGLE(wm1010)", %%mm4   \n\t"\
346
        "pand %%mm0, %%mm4              \n\t"\
347
        "por %%mm1, %%mm4               \n\t"\
348
        "por %%mm2, %%mm4               \n\t"\
349
        "por %%mm3, %%mm4               \n\t"\
350
        "packssdw %%mm4,%%mm4           \n\t"\
351
        "movd %%mm4, %%eax              \n\t"\
352
        "orl %%eax, %%eax               \n\t"\
353
        "jz 1f                          \n\t"\
354
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
355
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
356
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
357
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
358
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
359
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
360
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
361
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
362
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
363
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
364
        #rounder ", %%mm4               \n\t"\
365
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
366
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
367
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
368
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
369
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
370
        #rounder ", %%mm0               \n\t"\
371
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
372
        "paddd %%mm0, %%mm0             \n\t" \
373
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
374
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
375
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
376
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
377
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
378
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
379
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
380
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
381
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
382
        "psrad $" #shift ", %%mm7       \n\t"\
383
        "psrad $" #shift ", %%mm4       \n\t"\
384
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
385
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
386
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
387
        "psrad $" #shift ", %%mm1       \n\t"\
388
        "psrad $" #shift ", %%mm2       \n\t"\
389
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
390
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
391
        "movq %%mm7, " #dst "           \n\t"\
392
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
393
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
394
        "movq %%mm2, 24+" #dst "        \n\t"\
395
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
396
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
397
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
398
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
399
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
400
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
401
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
402
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
403
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
404
        "psrad $" #shift ", %%mm2       \n\t"\
405
        "psrad $" #shift ", %%mm0       \n\t"\
406
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
407
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
408
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
409
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
410
        "psrad $" #shift ", %%mm6       \n\t"\
411
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
412
        "movq %%mm2, 8+" #dst "         \n\t"\
413
        "psrad $" #shift ", %%mm4       \n\t"\
414
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
415
        "movq %%mm4, 16+" #dst "        \n\t"\
416
        "jmp 2f                         \n\t"\
417
        "1:                             \n\t"\
418
        "pslld $16, %%mm0               \n\t"\
419
        "paddd "MANGLE(d40000)", %%mm0  \n\t"\
420
        "psrad $13, %%mm0               \n\t"\
421
        "packssdw %%mm0, %%mm0          \n\t"\
422
        "movq %%mm0, " #dst "           \n\t"\
423
        "movq %%mm0, 8+" #dst "         \n\t"\
424
        "movq %%mm0, 16+" #dst "        \n\t"\
425
        "movq %%mm0, 24+" #dst "        \n\t"\
426
        "2:                             \n\t"
427
 
428
#define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \
429
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
430
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
431
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
432
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
433
        "movq %%mm0, %%mm4              \n\t"\
434
        "por %%mm1, %%mm4               \n\t"\
435
        "por %%mm2, %%mm4               \n\t"\
436
        "por %%mm3, %%mm4               \n\t"\
437
        "packssdw %%mm4,%%mm4           \n\t"\
438
        "movd %%mm4, %%eax              \n\t"\
439
        "orl %%eax, %%eax               \n\t"\
440
        "jz " #bt "                     \n\t"\
441
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
442
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
443
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
444
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
445
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
446
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
447
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
448
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
449
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
450
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
451
        #rounder ", %%mm4               \n\t"\
452
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
453
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
454
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
455
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
456
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
457
        #rounder ", %%mm0               \n\t"\
458
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
459
        "paddd %%mm0, %%mm0             \n\t" \
460
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
461
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
462
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
463
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
464
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
465
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
466
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
467
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
468
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
469
        "psrad $" #shift ", %%mm7       \n\t"\
470
        "psrad $" #shift ", %%mm4       \n\t"\
471
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
472
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
473
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
474
        "psrad $" #shift ", %%mm1       \n\t"\
475
        "psrad $" #shift ", %%mm2       \n\t"\
476
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
477
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
478
        "movq %%mm7, " #dst "           \n\t"\
479
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
480
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
481
        "movq %%mm2, 24+" #dst "        \n\t"\
482
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
483
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
484
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
485
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
486
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
487
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
488
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
489
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
490
        "psubd %%mm4, %%mm0             \n\t" /* a2-B2          a2-b2 */\
491
        "psrad $" #shift ", %%mm2       \n\t"\
492
        "psrad $" #shift ", %%mm0       \n\t"\
493
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
494
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
495
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
496
        "psubd %%mm3, %%mm4             \n\t" /* a3-B3          a3-b3 */\
497
        "psrad $" #shift ", %%mm6       \n\t"\
498
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
499
        "movq %%mm2, 8+" #dst "         \n\t"\
500
        "psrad $" #shift ", %%mm4       \n\t"\
501
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
502
        "movq %%mm4, 16+" #dst "        \n\t"\
503
 
504
/*
 * ROW_IDCT: unconditional row transform of one source group (no zero/DC
 * shortcut test inside this macro).
 *   src0, src4, src1, src5 - memory operands of the four 8-byte input words;
 *                            their coefficient layout is annotated per load.
 *   dst                    - memory operand; four 8-byte results are stored
 *                            at dst, 8+dst, 16+dst and 24+dst.
 *   rounder                - instruction text (e.g. paddd (%2)); it is
 *                            stringized and applied to mm4 and to mm0, i.e.
 *                            to both C4 sums before the even part is formed.
 *   shift                  - psrad count applied to every sum/difference
 *                            before packssdw.
 * A0..A3 are the even-part sums (src0/src4), B0..B3 the odd-part sums
 * (src1/src5); upper/lower case in the annotations denote the two 32-bit
 * lanes. Constants are read relative to %2; mm0-mm7 are all overwritten.
 * No invocation of this macro appears in this part of the file.
 */
#define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \
505
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
506
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
507
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
508
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
509
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
510
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
511
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
512
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
513
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
514
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
515
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
516
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
517
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
518
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
519
        #rounder ", %%mm4               \n\t"\
520
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
521
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
522
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
523
        "movq 56(%2), %%mm5             \n\t" /* C7     C5      C7      C5 */\
524
        "pmaddwd %%mm3, %%mm5           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
525
        #rounder ", %%mm0               \n\t"\
526
        "paddd %%mm0, %%mm1             \n\t" /* A1             a1 */\
527
        "paddd %%mm0, %%mm0             \n\t" \
528
        "psubd %%mm1, %%mm0             \n\t" /* A2             a2 */\
529
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
530
        "paddd %%mm5, %%mm7             \n\t" /* B0             b0 */\
531
        "movq 72(%2), %%mm5             \n\t" /* -C5    -C1     -C5     -C1 */\
532
        "pmaddwd %%mm3, %%mm5           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
533
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
534
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
535
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
536
        "paddd %%mm2, %%mm5             \n\t" /* B1             b1 */\
537
        "psrad $" #shift ", %%mm7       \n\t"\
538
        "psrad $" #shift ", %%mm4       \n\t"\
539
        "movq %%mm1, %%mm2              \n\t" /* A1             a1 */\
540
        "paddd %%mm5, %%mm1             \n\t" /* A1+B1          a1+b1 */\
541
        "psubd %%mm5, %%mm2             \n\t" /* A1-B1          a1-b1 */\
542
        "psrad $" #shift ", %%mm1       \n\t"\
543
        "psrad $" #shift ", %%mm2       \n\t"\
544
        "packssdw %%mm1, %%mm7          \n\t" /* A1+B1  a1+b1   A0+B0   a0+b0 */\
545
        "packssdw %%mm4, %%mm2          \n\t" /* A0-B0  a0-b0   A1-B1   a1-b1 */\
546
        "movq %%mm7, " #dst "           \n\t"\
547
        "movq " #src1 ", %%mm1          \n\t" /* R3     R1      r3      r1 */\
548
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
549
        "movq %%mm2, 24+" #dst "        \n\t"\
550
        "pmaddwd %%mm1, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
551
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
552
        "pmaddwd 96(%2), %%mm1          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
553
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
554
        "movq %%mm0, %%mm2              \n\t" /* A2             a2 */\
555
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
556
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
557
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
558
        "psubd %%mm4, %%mm0             \n\t" /* A2-B2          a2-b2 */\
559
        "psrad $" #shift ", %%mm2       \n\t"\
560
        "psrad $" #shift ", %%mm0       \n\t"\
561
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
562
        "paddd %%mm1, %%mm3             \n\t" /* B3             b3 */\
563
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
564
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
565
        "psrad $" #shift ", %%mm6       \n\t"\
566
        "packssdw %%mm6, %%mm2          \n\t" /* A3+B3  a3+b3   A2+B2   a2+b2 */\
567
        "movq %%mm2, 8+" #dst "         \n\t"\
568
        "psrad $" #shift ", %%mm4       \n\t"\
569
        "packssdw %%mm0, %%mm4          \n\t" /* A2-B2  a2-b2   A3-B3   a3-b3 */\
570
        "movq %%mm4, 16+" #dst "        \n\t"\
571
 
572
/*
 * Row pass. Each invocation is handed four consecutive 8-byte source words
 * at an offset from %0 and a destination at the same offset from %1, with a
 * shift of 11 (the value of ROW_SHIFT). The first group uses the rounder at
 * 8(%2); the other three use the one at (%2).
 * NOTE(review): DC_COND_IDCT and Z_COND_IDCT are defined outside this
 * excerpt. The trailing label argument (4f / 2f / 1f) is presumably the
 * branch taken when that source group is entirely zero -- confirm against
 * the macro definitions.
 */
//IDCT(         src0,   src4,   src1,   src5,    dst,   rounder, shift)
573
DC_COND_IDCT(  0(%0),  8(%0), 16(%0), 24(%0),  0(%1),paddd 8(%2), 11)
574
Z_COND_IDCT(  32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f)
575
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f)
576
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f)
577
 
578
/*
 * IDCT, general column variant: all four inputs are loaded and used.
 *   src0, src4, src1, src5 - memory operands of the 8-byte input words
 *                            (layout annotated on each load).
 *   dst                    - memory operand; eight 4-byte results (movd) are
 *                            stored at dst + 16*k, k = 0..7, in the order
 *                            A0+B0, A1+B1, A2+B2, A3+B3, A3-B3, A2-B2,
 *                            A1-B1, A0-B0.
 *   shift                  - psrad count applied before packssdw.
 * A0..A3 are the even-part sums (src0/src4), B0..B3 the odd-part sums
 * (src1/src5). Constants are read relative to %2; mm0-mm7 are overwritten.
 * Unlike ROW_IDCT there is no rounder argument.
 */
#undef IDCT
579
#define IDCT(src0, src4, src1, src5, dst, shift) \
580
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
581
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
582
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
583
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
584
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
585
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
586
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
587
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
588
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
589
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
590
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
591
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
592
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
593
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
594
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
595
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
596
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
597
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
598
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
599
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
600
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
601
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
602
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
603
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
604
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
605
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
606
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
607
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
608
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
609
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
610
        "psrad $" #shift ", %%mm7       \n\t"\
611
        "psrad $" #shift ", %%mm4       \n\t"\
612
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
613
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
614
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
615
        "psrad $" #shift ", %%mm0       \n\t"\
616
        "psrad $" #shift ", %%mm2       \n\t"\
617
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
618
        "movd %%mm7, " #dst "           \n\t"\
619
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
620
        "movd %%mm0, 16+" #dst "        \n\t"\
621
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
622
        "movd %%mm2, 96+" #dst "        \n\t"\
623
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
624
        "movd %%mm4, 112+" #dst "       \n\t"\
625
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
626
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
627
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
628
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
629
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
630
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
631
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
632
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
633
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
634
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
635
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
636
        "psrad $" #shift ", %%mm2       \n\t"\
637
        "psrad $" #shift ", %%mm5       \n\t"\
638
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
639
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
640
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
641
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
642
        "psrad $" #shift ", %%mm6       \n\t"\
643
        "psrad $" #shift ", %%mm4       \n\t"\
644
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
645
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
646
        "movd %%mm2, 32+" #dst "        \n\t"\
647
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
648
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
649
        "movd %%mm6, 48+" #dst "        \n\t"\
650
        "movd %%mm4, 64+" #dst "        \n\t"\
651
        "movd %%mm5, 80+" #dst "        \n\t"
652
 
653
 
654
/*
 * Column pass for the path on which none of the row-pass skip branches was
 * taken: four invocations, each reading its 8-byte inputs from %1 (base
 * offsets 0/8/16/24) and storing to %0 at base offsets 0/4/8/12. The shift
 * of 20 equals COL_SHIFT. Control then jumps to label 9, which lies beyond
 * this excerpt.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
655
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
656
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
657
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
658
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
659
        "jmp 9f                         \n\t"
660
 
661
/*
 * Label 4 is the label passed to the Z_COND_IDCT for the 32(%0) group.
 * From here the row pass continues with the 64(%0) and 96(%0) groups, using
 * skip labels 6 and 5.
 * NOTE(review): the leading '#' in the .p2align string looks like an
 * assembler comment marker, i.e. the alignment directive appears to be
 * disabled -- confirm for the target assembler.
 */
        "# .p2align 4                   \n\t"\
662
        "4:                             \n\t"
663
Z_COND_IDCT(  64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f)
664
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f)
665
 
666
/*
 * IDCT, column variant that never loads src1 (the parameter is accepted but
 * unused), so the R1/R3 products are absent and B0..B3 are built from src5
 * alone:
 *   B0 = C7R7+C5R5, B1 = -C5R7-C1R5, B2 = C3R7+C7R5, B3 = -C1R7+C3R5.
 * The even part (src0/src4) and the output layout match the general
 * variant: eight 4-byte results at dst + 16*k, k = 0..7, shifted right by
 * `shift` before packing. Constants are read relative to %2.
 * NOTE(review): presumably valid because the row group that would fill src1
 * was skipped as zero -- confirm against Z_COND_IDCT.
 */
#undef IDCT
667
#define IDCT(src0, src4, src1, src5, dst, shift) \
668
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
669
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
670
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
671
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
672
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
673
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
674
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
675
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
676
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
677
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
678
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
679
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
680
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
681
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
682
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
683
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
684
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
685
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
686
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
687
        "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
688
        "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
689
        "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
690
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
691
        "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
692
        "psrad $" #shift ", %%mm1       \n\t"\
693
        "psrad $" #shift ", %%mm4       \n\t"\
694
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
695
        "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
696
        "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
697
        "psrad $" #shift ", %%mm0       \n\t"\
698
        "psrad $" #shift ", %%mm2       \n\t"\
699
        "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
700
        "movd %%mm1, " #dst "           \n\t"\
701
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
702
        "movd %%mm0, 16+" #dst "        \n\t"\
703
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
704
        "movd %%mm2, 96+" #dst "        \n\t"\
705
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
706
        "movd %%mm4, 112+" #dst "       \n\t"\
707
        "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
708
        "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
709
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
710
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
711
        "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
712
        "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
713
        "psrad $" #shift ", %%mm2       \n\t"\
714
        "psrad $" #shift ", %%mm5       \n\t"\
715
        "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
716
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
717
        "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
718
        "psrad $" #shift ", %%mm6       \n\t"\
719
        "psrad $" #shift ", %%mm1       \n\t"\
720
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
721
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
722
        "movd %%mm2, 32+" #dst "        \n\t"\
723
        "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
724
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
725
        "movd %%mm6, 48+" #dst "        \n\t"\
726
        "movd %%mm1, 64+" #dst "        \n\t"\
727
        "movd %%mm5, 80+" #dst "        \n\t"
728
 
729
/*
 * Column pass using the IDCT variant defined immediately above (the one
 * that does not load src1): four invocations reading from %1 and storing to
 * %0 at base offsets 0/4/8/12, shift 20 (COL_SHIFT), followed by a jump to
 * label 9, which lies beyond this excerpt.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
730
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
731
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
732
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
733
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
734
        "jmp 9f                         \n\t"
735
 
736
/*
 * Label 6 is the label passed to the Z_COND_IDCT for the 64(%0) group on
 * the label-4 path. Only the 96(%0) group is still processed here; its skip
 * label is 7, which is defined beyond this excerpt.
 */
        "# .p2align 4                   \n\t"\
737
        "6:                             \n\t"
738
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f)
739
 
740
/*
 * IDCT, column variant that loads only src0 and src5 (src4 and src1 are
 * accepted but unused).
 *   - With no R2/R6 terms the even part degenerates: A0 and A3 are both
 *     C4R4+C4R0 (mm4 copied to mm6), A1 and A2 are both -C4R4+C4R0
 *     (mm0 copied to mm5).
 *   - B0..B3 are built from src5 alone.
 * Output layout is unchanged: eight 4-byte results at dst + 16*k, k = 0..7,
 * shifted right by `shift` before packing. Constants are read relative
 * to %2.
 */
#undef IDCT
741
#define IDCT(src0, src4, src1, src5, dst, shift) \
742
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
743
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
744
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
745
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
746
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
747
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
748
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
749
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
750
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
751
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
752
        "movq 72(%2), %%mm7             \n\t" /* -C5    -C1     -C5     -C1 */\
753
        "pmaddwd %%mm3, %%mm7           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
754
        "paddd %%mm4, %%mm1             \n\t" /* A0+B0          a0+b0 */\
755
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
756
        "psubd %%mm1, %%mm4             \n\t" /* A0-B0          a0-b0 */\
757
        "psrad $" #shift ", %%mm1       \n\t"\
758
        "psrad $" #shift ", %%mm4       \n\t"\
759
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
760
        "paddd %%mm7, %%mm0             \n\t" /* A1+B1          a1+b1 */\
761
        "psubd %%mm7, %%mm2             \n\t" /* A1-B1          a1-b1 */\
762
        "psrad $" #shift ", %%mm0       \n\t"\
763
        "psrad $" #shift ", %%mm2       \n\t"\
764
        "packssdw %%mm1, %%mm1          \n\t" /* A0+B0  a0+b0 */\
765
        "movd %%mm1, " #dst "           \n\t"\
766
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
767
        "movd %%mm0, 16+" #dst "        \n\t"\
768
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
769
        "movd %%mm2, 96+" #dst "        \n\t"\
770
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
771
        "movd %%mm4, 112+" #dst "       \n\t"\
772
        "movq 88(%2), %%mm1             \n\t" /* C3     C7      C3      C7 */\
773
        "pmaddwd %%mm3, %%mm1           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
774
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
775
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
776
        "paddd %%mm1, %%mm2             \n\t" /* A2+B2          a2+b2 */\
777
        "psubd %%mm1, %%mm5             \n\t" /* A2-B2          a2-b2 */\
778
        "psrad $" #shift ", %%mm2       \n\t"\
779
        "psrad $" #shift ", %%mm5       \n\t"\
780
        "movq %%mm6, %%mm1              \n\t" /* A3             a3 */\
781
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
782
        "psubd %%mm3, %%mm1             \n\t" /* A3-B3          a3-b3 */\
783
        "psrad $" #shift ", %%mm6       \n\t"\
784
        "psrad $" #shift ", %%mm1       \n\t"\
785
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
786
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
787
        "movd %%mm2, 32+" #dst "        \n\t"\
788
        "packssdw %%mm1, %%mm1          \n\t" /* A3-B3  a3-b3 */\
789
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
790
        "movd %%mm6, 48+" #dst "        \n\t"\
791
        "movd %%mm1, 64+" #dst "        \n\t"\
792
        "movd %%mm5, 80+" #dst "        \n\t"
793
 
794
 
795
/*
 * Column pass using the IDCT variant defined immediately above (only src0
 * and src5 are loaded): four invocations reading from %1 and storing to %0
 * at base offsets 0/4/8/12, shift 20 (COL_SHIFT), followed by a jump to
 * label 9, which lies beyond this excerpt.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
796
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
797
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
798
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
799
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
800
        "jmp 9f                         \n\t"
801
 
802
/*
 * Label 2 is the label passed to the Z_COND_IDCT for the 64(%0) group in
 * the initial row-pass sequence. Only the 96(%0) group is still processed
 * here, with skip label 3.
 */
        "# .p2align 4                   \n\t"\
803
        "2:                             \n\t"
804
Z_COND_IDCT(  96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f)
805
 
806
/*
 * IDCT, column variant that never loads src4 (the parameter is accepted but
 * unused).
 *   - With no R2/R6 terms, A0 and A3 are both C4R4+C4R0 (mm4 copied to mm6)
 *     and A1 and A2 are both -C4R4+C4R0 (mm0 copied to mm5).
 *   - B0..B3 use both src1 and src5, exactly as in the general variant.
 * Output layout is unchanged: eight 4-byte results at dst + 16*k, k = 0..7,
 * shifted right by `shift` before packing. Constants are read relative
 * to %2.
 */
#undef IDCT
807
#define IDCT(src0, src4, src1, src5, dst, shift) \
808
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
809
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
810
        "movq " #src5 ", %%mm3          \n\t" /* R7     R5      r7      r5 */\
811
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
812
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
813
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
814
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
815
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
816
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
817
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
818
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
819
        "movq 56(%2), %%mm1             \n\t" /* C7     C5      C7      C5 */\
820
        "pmaddwd %%mm3, %%mm1           \n\t" /* C7R7+C5R5      C7r7+C5r5 */\
821
        "pmaddwd 64(%2), %%mm2          \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
822
        "paddd %%mm1, %%mm7             \n\t" /* B0             b0 */\
823
        "movq 72(%2), %%mm1             \n\t" /* -C5    -C1     -C5     -C1 */\
824
        "pmaddwd %%mm3, %%mm1           \n\t" /* -C5R7-C1R5     -C5r7-C1r5 */\
825
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
826
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
827
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
828
        "paddd %%mm2, %%mm1             \n\t" /* B1             b1 */\
829
        "psrad $" #shift ", %%mm7       \n\t"\
830
        "psrad $" #shift ", %%mm4       \n\t"\
831
        "movq %%mm0, %%mm2              \n\t" /* A1             a1 */\
832
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
833
        "psubd %%mm1, %%mm2             \n\t" /* A1-B1          a1-b1 */\
834
        "psrad $" #shift ", %%mm0       \n\t"\
835
        "psrad $" #shift ", %%mm2       \n\t"\
836
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
837
        "movd %%mm7, " #dst "           \n\t"\
838
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
839
        "movd %%mm0, 16+" #dst "        \n\t"\
840
        "packssdw %%mm2, %%mm2          \n\t" /* A1-B1  a1-b1 */\
841
        "movd %%mm2, 96+" #dst "        \n\t"\
842
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
843
        "movd %%mm4, 112+" #dst "       \n\t"\
844
        "movq " #src1 ", %%mm0          \n\t" /* R3     R1      r3      r1 */\
845
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
846
        "pmaddwd %%mm0, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
847
        "movq 88(%2), %%mm7             \n\t" /* C3     C7      C3      C7 */\
848
        "pmaddwd 96(%2), %%mm0          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
849
        "pmaddwd %%mm3, %%mm7           \n\t" /* C3R7+C7R5      C3r7+C7r5 */\
850
        "movq %%mm5, %%mm2              \n\t" /* A2             a2 */\
851
        "pmaddwd 104(%2), %%mm3         \n\t" /* -C1R7+C3R5     -C1r7+C3r5 */\
852
        "paddd %%mm7, %%mm4             \n\t" /* B2             b2 */\
853
        "paddd %%mm4, %%mm2             \n\t" /* A2+B2          a2+b2 */\
854
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
855
        "psrad $" #shift ", %%mm2       \n\t"\
856
        "psrad $" #shift ", %%mm5       \n\t"\
857
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
858
        "paddd %%mm0, %%mm3             \n\t" /* B3             b3 */\
859
        "paddd %%mm3, %%mm6             \n\t" /* A3+B3          a3+b3 */\
860
        "psubd %%mm3, %%mm4             \n\t" /* A3-B3          a3-b3 */\
861
        "psrad $" #shift ", %%mm6       \n\t"\
862
        "psrad $" #shift ", %%mm4       \n\t"\
863
        "packssdw %%mm2, %%mm2          \n\t" /* A2+B2  a2+b2 */\
864
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
865
        "movd %%mm2, 32+" #dst "        \n\t"\
866
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
867
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
868
        "movd %%mm6, 48+" #dst "        \n\t"\
869
        "movd %%mm4, 64+" #dst "        \n\t"\
870
        "movd %%mm5, 80+" #dst "        \n\t"
871
 
872
/*
 * Column pass using the IDCT variant defined immediately above (src4 is not
 * loaded): four invocations reading from %1 and storing to %0 at base
 * offsets 0/4/8/12, shift 20 (COL_SHIFT), followed by a jump to label 9,
 * which lies beyond this excerpt.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
873
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
874
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
875
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
876
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
877
        "jmp 9f                         \n\t"
878
 
879
/*
 * Label 3 is the label passed to the Z_COND_IDCT for the 96(%0) group on
 * the label-2 path. No further row-pass work follows; the next IDCT
 * definition and its invocations form the column pass for this path.
 */
        "# .p2align 4                   \n\t"\
880
        "3:                             \n\t"
881
/*
 * IDCT, column variant that loads only src0 and src1 (src4 and src5 are
 * accepted but unused).
 *   - With no R2/R6 terms, A0 and A3 are both C4R4+C4R0 (mm4 copied to mm6)
 *     and A1 and A2 are both -C4R4+C4R0 (mm0 copied to mm5).
 *   - B0..B3 are built from src1 alone:
 *     B0 = C3R3+C1R1, B1 = -C7R3+C3R1, B2 = -C1R3+C5R1, B3 = -C5R3+C7R1.
 * Output layout is unchanged: eight 4-byte results at dst + 16*k, k = 0..7,
 * shifted right by `shift` before packing. Constants are read relative
 * to %2.
 */
#undef IDCT
882
#define IDCT(src0, src4, src1, src5, dst, shift) \
883
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
884
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
885
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
886
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
887
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
888
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
889
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
890
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
891
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
892
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
893
        "movq 64(%2), %%mm3             \n\t" /* -C7    C3      -C7     C3 */\
894
        "pmaddwd %%mm2, %%mm3           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
895
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
896
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
897
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
898
        "psrad $" #shift ", %%mm7       \n\t"\
899
        "psrad $" #shift ", %%mm4       \n\t"\
900
        "movq %%mm0, %%mm1              \n\t" /* A1             a1 */\
901
        "paddd %%mm3, %%mm0             \n\t" /* A1+B1          a1+b1 */\
902
        "psubd %%mm3, %%mm1             \n\t" /* A1-B1          a1-b1 */\
903
        "psrad $" #shift ", %%mm0       \n\t"\
904
        "psrad $" #shift ", %%mm1       \n\t"\
905
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
906
        "movd %%mm7, " #dst "           \n\t"\
907
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
908
        "movd %%mm0, 16+" #dst "        \n\t"\
909
        "packssdw %%mm1, %%mm1          \n\t" /* A1-B1  a1-b1 */\
910
        "movd %%mm1, 96+" #dst "        \n\t"\
911
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
912
        "movd %%mm4, 112+" #dst "       \n\t"\
913
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
914
        "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
915
        "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
916
        "movq %%mm5, %%mm1              \n\t" /* A2             a2 */\
917
        "paddd %%mm4, %%mm1             \n\t" /* A2+B2          a2+b2 */\
918
        "psubd %%mm4, %%mm5             \n\t" /* A2-B2          a2-b2 */\
919
        "psrad $" #shift ", %%mm1       \n\t"\
920
        "psrad $" #shift ", %%mm5       \n\t"\
921
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
922
        "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
923
        "psubd %%mm2, %%mm4             \n\t" /* A3-B3          a3-b3 */\
924
        "psrad $" #shift ", %%mm6       \n\t"\
925
        "psrad $" #shift ", %%mm4       \n\t"\
926
        "packssdw %%mm1, %%mm1          \n\t" /* A2+B2  a2+b2 */\
927
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
928
        "movd %%mm1, 32+" #dst "        \n\t"\
929
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
930
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
931
        "movd %%mm6, 48+" #dst "        \n\t"\
932
        "movd %%mm4, 64+" #dst "        \n\t"\
933
        "movd %%mm5, 80+" #dst "        \n\t"
934
 
935
 
936
/*
 * Column pass using the IDCT variant defined immediately above (only src0
 * and src1 are loaded): four invocations reading from %1 and storing to %0
 * at base offsets 0/4/8/12, shift 20 (COL_SHIFT), followed by a jump to
 * label 9, which lies beyond this excerpt.
 */
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
937
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
938
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
939
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
940
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
941
        "jmp 9f                         \n\t"
942
 
943
/*
 * Label 5 is the label passed to the Z_COND_IDCT for the 96(%0) group on
 * the label-4 path. No further row-pass work follows; the next IDCT
 * definition and its invocations form the column pass for this path.
 */
        "# .p2align 4                   \n\t"\
944
        "5:                             \n\t"
945
/*
 * IDCT, column variant that loads only src0 and src4 (src1 and src5 are
 * accepted but unused), so there are no odd-part (B) terms at all.
 *   - Each invocation handles two adjacent 8-byte column groups: the first
 *     from src0/src4 (results in mm4, mm0, mm5, mm6) and the second from
 *     8+src0 / 8+src4 (results in mm7, mm3, mm2, mm1).
 *   - The two groups are packed together and stored as 8-byte rows (movq).
 *     With B absent the output is mirrored: A0 goes to dst and 112+dst,
 *     A1 to 16+dst and 96+dst, A2 to 32+dst and 80+dst, A3 to 48+dst and
 *     64+dst.
 *   - shift is the psrad count applied before packing.
 * Constants are read relative to %2; mm0-mm7 are overwritten.
 */
#undef IDCT
946
#define IDCT(src0, src4, src1, src5, dst, shift) \
947
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
948
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
949
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
950
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
951
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
952
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
953
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
954
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
955
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
956
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
957
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
958
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
959
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
960
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
961
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
962
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
963
        "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0   (second column group) */\
964
        "movq 8+" #src4 ", %%mm3        \n\t" /* R6     R2      r6      r2   (second column group) */\
965
        "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
966
        "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
967
        "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
968
        "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
969
        "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
970
        "pmaddwd %%mm3, %%mm7           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
971
        "pmaddwd 40(%2), %%mm3          \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
972
        "paddd %%mm1, %%mm7             \n\t" /* A0             a0 */\
973
        "paddd %%mm1, %%mm1             \n\t" /* 2C0            2c0 */\
974
        "psubd %%mm7, %%mm1             \n\t" /* A3             a3 */\
975
        "paddd %%mm2, %%mm3             \n\t" /* A1             a1 */\
976
        "paddd %%mm2, %%mm2             \n\t" /* 2C1            2c1 */\
977
        "psubd %%mm3, %%mm2             \n\t" /* A2             a2 */\
978
        "psrad $" #shift ", %%mm4       \n\t"\
979
        "psrad $" #shift ", %%mm7       \n\t"\
980
        "psrad $" #shift ", %%mm3       \n\t"\
981
        "packssdw %%mm7, %%mm4          \n\t" /* A0     a0 */\
982
        "movq %%mm4, " #dst "           \n\t"\
983
        "psrad $" #shift ", %%mm0       \n\t"\
984
        "packssdw %%mm3, %%mm0          \n\t" /* A1     a1 */\
985
        "movq %%mm0, 16+" #dst "        \n\t"\
986
        "movq %%mm0, 96+" #dst "        \n\t"\
987
        "movq %%mm4, 112+" #dst "       \n\t"\
988
        "psrad $" #shift ", %%mm5       \n\t"\
989
        "psrad $" #shift ", %%mm6       \n\t"\
990
        "psrad $" #shift ", %%mm2       \n\t"\
991
        "packssdw %%mm2, %%mm5          \n\t" /* A2     a2 */\
992
        "movq %%mm5, 32+" #dst "        \n\t"\
993
        "psrad $" #shift ", %%mm1       \n\t"\
994
        "packssdw %%mm1, %%mm6          \n\t" /* A3     a3 */\
995
        "movq %%mm6, 48+" #dst "        \n\t"\
996
        "movq %%mm6, 64+" #dst "        \n\t"\
997
        "movq %%mm5, 80+" #dst "        \n\t"
998
 
999
 
1000
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1001
IDCT(    0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1002
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1003
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1004
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1005
        "jmp 9f                         \n\t"
1006
 
1007
 
1008
        "# .p2align 4                   \n\t"\
1009
        "1:                             \n\t"
1010
#undef IDCT
1011
#define IDCT(src0, src4, src1, src5, dst, shift) \
1012
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1013
        "movq " #src4 ", %%mm1          \n\t" /* R6     R2      r6      r2 */\
1014
        "movq " #src1 ", %%mm2          \n\t" /* R3     R1      r3      r1 */\
1015
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1016
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1017
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1018
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1019
        "movq 32(%2), %%mm5             \n\t" /* C6     C2      C6      C2 */\
1020
        "pmaddwd %%mm1, %%mm5           \n\t" /* C6R6+C2R2      C6r6+C2r2 */\
1021
        "movq 40(%2), %%mm6             \n\t" /* -C2    C6      -C2     C6 */\
1022
        "pmaddwd %%mm6, %%mm1           \n\t" /* -C2R6+C6R2     -C2r6+C6r2 */\
1023
        "movq %%mm4, %%mm6              \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1024
        "movq 48(%2), %%mm7             \n\t" /* C3     C1      C3      C1 */\
1025
        "pmaddwd %%mm2, %%mm7           \n\t" /* C3R3+C1R1      C3r3+C1r1 */\
1026
        "paddd %%mm5, %%mm4             \n\t" /* A0             a0 */\
1027
        "psubd %%mm5, %%mm6             \n\t" /* A3             a3 */\
1028
        "movq %%mm0, %%mm5              \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1029
        "paddd %%mm1, %%mm0             \n\t" /* A1             a1 */\
1030
        "psubd %%mm1, %%mm5             \n\t" /* A2             a2 */\
1031
        "movq 64(%2), %%mm1             \n\t"\
1032
        "pmaddwd %%mm2, %%mm1           \n\t" /* -C7R3+C3R1     -C7r3+C3r1 */\
1033
        "paddd %%mm4, %%mm7             \n\t" /* A0+B0          a0+b0 */\
1034
        "paddd %%mm4, %%mm4             \n\t" /* 2A0            2a0 */\
1035
        "psubd %%mm7, %%mm4             \n\t" /* A0-B0          a0-b0 */\
1036
        "psrad $" #shift ", %%mm7       \n\t"\
1037
        "psrad $" #shift ", %%mm4       \n\t"\
1038
        "movq %%mm0, %%mm3              \n\t" /* A1             a1 */\
1039
        "paddd %%mm1, %%mm0             \n\t" /* A1+B1          a1+b1 */\
1040
        "psubd %%mm1, %%mm3             \n\t" /* A1-B1          a1-b1 */\
1041
        "psrad $" #shift ", %%mm0       \n\t"\
1042
        "psrad $" #shift ", %%mm3       \n\t"\
1043
        "packssdw %%mm7, %%mm7          \n\t" /* A0+B0  a0+b0 */\
1044
        "movd %%mm7, " #dst "           \n\t"\
1045
        "packssdw %%mm0, %%mm0          \n\t" /* A1+B1  a1+b1 */\
1046
        "movd %%mm0, 16+" #dst "        \n\t"\
1047
        "packssdw %%mm3, %%mm3          \n\t" /* A1-B1  a1-b1 */\
1048
        "movd %%mm3, 96+" #dst "        \n\t"\
1049
        "packssdw %%mm4, %%mm4          \n\t" /* A0-B0  a0-b0 */\
1050
        "movd %%mm4, 112+" #dst "       \n\t"\
1051
        "movq 80(%2), %%mm4             \n\t" /* -C1    C5      -C1     C5 */\
1052
        "pmaddwd %%mm2, %%mm4           \n\t" /* -C1R3+C5R1     -C1r3+C5r1 */\
1053
        "pmaddwd 96(%2), %%mm2          \n\t" /* -C5R3+C7R1     -C5r3+C7r1 */\
1054
        "movq %%mm5, %%mm3              \n\t" /* A2             a2 */\
1055
        "paddd %%mm4, %%mm3             \n\t" /* A2+B2          a2+b2 */\
1056
        "psubd %%mm4, %%mm5             \n\t" /* a2-B2          a2-b2 */\
1057
        "psrad $" #shift ", %%mm3       \n\t"\
1058
        "psrad $" #shift ", %%mm5       \n\t"\
1059
        "movq %%mm6, %%mm4              \n\t" /* A3             a3 */\
1060
        "paddd %%mm2, %%mm6             \n\t" /* A3+B3          a3+b3 */\
1061
        "psubd %%mm2, %%mm4             \n\t" /* a3-B3          a3-b3 */\
1062
        "psrad $" #shift ", %%mm6       \n\t"\
1063
        "packssdw %%mm3, %%mm3          \n\t" /* A2+B2  a2+b2 */\
1064
        "movd %%mm3, 32+" #dst "        \n\t"\
1065
        "psrad $" #shift ", %%mm4       \n\t"\
1066
        "packssdw %%mm6, %%mm6          \n\t" /* A3+B3  a3+b3 */\
1067
        "movd %%mm6, 48+" #dst "        \n\t"\
1068
        "packssdw %%mm4, %%mm4          \n\t" /* A3-B3  a3-b3 */\
1069
        "packssdw %%mm5, %%mm5          \n\t" /* A2-B2  a2-b2 */\
1070
        "movd %%mm4, 64+" #dst "        \n\t"\
1071
        "movd %%mm5, 80+" #dst "        \n\t"
1072
 
1073
 
1074
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1075
IDCT(    (%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1076
IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1077
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1078
IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1079
        "jmp 9f                         \n\t"
1080
 
1081
 
1082
        "# .p2align 4                   \n\t"
1083
        "7:                             \n\t"
1084
#undef IDCT
1085
#define IDCT(src0, src4, src1, src5, dst, shift) \
1086
        "movq " #src0 ", %%mm0          \n\t" /* R4     R0      r4      r0 */\
1087
        "movq 16(%2), %%mm4             \n\t" /* C4     C4      C4      C4 */\
1088
        "pmaddwd %%mm0, %%mm4           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1089
        "movq 24(%2), %%mm5             \n\t" /* -C4    C4      -C4     C4 */\
1090
        "pmaddwd %%mm5, %%mm0           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1091
        "psrad $" #shift ", %%mm4       \n\t"\
1092
        "psrad $" #shift ", %%mm0       \n\t"\
1093
        "movq 8+" #src0 ", %%mm2        \n\t" /* R4     R0      r4      r0 */\
1094
        "movq 16(%2), %%mm1             \n\t" /* C4     C4      C4      C4 */\
1095
        "pmaddwd %%mm2, %%mm1           \n\t" /* C4R4+C4R0      C4r4+C4r0 */\
1096
        "movq 24(%2), %%mm7             \n\t" /* -C4    C4      -C4     C4 */\
1097
        "pmaddwd %%mm7, %%mm2           \n\t" /* -C4R4+C4R0     -C4r4+C4r0 */\
1098
        "movq 32(%2), %%mm7             \n\t" /* C6     C2      C6      C2 */\
1099
        "psrad $" #shift ", %%mm1       \n\t"\
1100
        "packssdw %%mm1, %%mm4          \n\t" /* A0     a0 */\
1101
        "movq %%mm4, " #dst "           \n\t"\
1102
        "psrad $" #shift ", %%mm2       \n\t"\
1103
        "packssdw %%mm2, %%mm0          \n\t" /* A1     a1 */\
1104
        "movq %%mm0, 16+" #dst "        \n\t"\
1105
        "movq %%mm0, 96+" #dst "        \n\t"\
1106
        "movq %%mm4, 112+" #dst "       \n\t"\
1107
        "movq %%mm0, 32+" #dst "        \n\t"\
1108
        "movq %%mm4, 48+" #dst "        \n\t"\
1109
        "movq %%mm4, 64+" #dst "        \n\t"\
1110
        "movq %%mm0, 80+" #dst "        \n\t"
1111
 
1112
//IDCT(  src0,   src4,   src1,    src5,    dst, shift)
1113
IDCT(   0(%1), 64(%1), 32(%1),  96(%1),  0(%0), 20)
1114
//IDCT(   8(%1), 72(%1), 40(%1), 104(%1),  4(%0), 20)
1115
IDCT(  16(%1), 80(%1), 48(%1), 112(%1),  8(%0), 20)
1116
//IDCT(  24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20)
1117
 
1118
 
1119
#endif
1120
 
1121
/*
1122
Input
1123
 00 40 04 44 20 60 24 64
1124
 10 30 14 34 50 70 54 74
1125
 01 41 03 43 21 61 23 63
1126
 11 31 13 33 51 71 53 73
1127
 02 42 06 46 22 62 26 66
1128
 12 32 16 36 52 72 56 76
1129
 05 45 07 47 25 65 27 67
1130
 15 35 17 37 55 75 57 77
1131
 
1132
Temp
1133
 00 04 10 14 20 24 30 34
1134
 40 44 50 54 60 64 70 74
1135
 01 03 11 13 21 23 31 33
1136
 41 43 51 53 61 63 71 73
1137
 02 06 12 16 22 26 32 36
1138
 42 46 52 56 62 66 72 76
1139
 05 07 15 17 25 27 35 37
1140
 45 47 55 57 65 67 75 77
1141
*/
1142
 
1143
"9: \n\t"
1144
                :: "r" (block), "r" (temp), "r" (coeffs)
1145
                : "%eax"
1146
        );
1147
}
1148
 
1149
/**
 * Public entry point for the MMX simple IDCT without any pixel output.
 *
 * Forwards to the file-local idct() routine. That routine's final pass
 * stores its results through the block pointer, so the transformed values
 * overwrite the input coefficients in place.
 *
 * @param block coefficient buffer, transformed in place
 *              (presumably 8x8 = 64 int16_t entries -- confirm against callers)
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
1153
 
1154
//FIXME merge add/put into the idct
1155
 
1156
/**
 * Inverse-transform a coefficient block, then hand it to the "put" pixel
 * writer.
 *
 * @param dest      destination pixel buffer passed to
 *                  ff_put_pixels_clamped_mmx()
 * @param line_size forwarded unchanged to ff_put_pixels_clamped_mmx()
 *                  (presumably the byte stride between rows of dest -- confirm)
 * @param block     coefficient buffer; overwritten in place by idct()
 */
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block)
{
    /* The transform must finish first: the pixel writer reads its output. */
    idct(block);

    ff_put_pixels_clamped_mmx(block, dest, line_size);
}
1161
/**
 * Inverse-transform a coefficient block, then hand it to the "add" pixel
 * writer.
 *
 * @param dest      pixel buffer passed to ff_add_pixels_clamped_mmx()
 *                  (presumably the transformed block is added onto its
 *                  existing contents -- confirm against that helper)
 * @param line_size forwarded unchanged to ff_add_pixels_clamped_mmx()
 *                  (presumably the byte stride between rows of dest -- confirm)
 * @param block     coefficient buffer; overwritten in place by idct()
 */
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block)
{
    /* The transform must finish first: the pixel writer reads its output. */
    idct(block);

    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
1166
 
1167
#endif /* HAVE_INLINE_ASM */