Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
6148 | serge | 1 | /* |
2 | * Simple IDCT MMX |
||
3 | * |
||
4 | * Copyright (c) 2001, 2002 Michael Niedermayer |
||
5 | * |
||
6 | * This file is part of FFmpeg. |
||
7 | * |
||
8 | * FFmpeg is free software; you can redistribute it and/or |
||
9 | * modify it under the terms of the GNU Lesser General Public |
||
10 | * License as published by the Free Software Foundation; either |
||
11 | * version 2.1 of the License, or (at your option) any later version. |
||
12 | * |
||
13 | * FFmpeg is distributed in the hope that it will be useful, |
||
14 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
15 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
16 | * Lesser General Public License for more details. |
||
17 | * |
||
18 | * You should have received a copy of the GNU Lesser General Public |
||
19 | * License along with FFmpeg; if not, write to the Free Software |
||
20 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
21 | */ |
||
22 | #include "libavcodec/simple_idct.h" |
||
23 | #include "libavutil/mem.h" |
||
24 | #include "dsputil_x86.h" |
||
25 | |||
26 | #if HAVE_INLINE_ASM |
||
27 | |||
28 | /* |
||
29 | 23170.475006 |
||
30 | 22725.260826 |
||
31 | 21406.727617 |
||
32 | 19265.545870 |
||
33 | 16384.000000 |
||
34 | 12872.826198 |
||
35 | 8866.956905 |
||
36 | 4520.335430 |
||
37 | */ |
||
/*
 * Fixed-point cosine factors: Ci = cos(i*M_PI/16) * sqrt(2) * (1<<14),
 * rounded to the nearest integer (see the per-line formulas).
 * C4 is the exception: its formula uses "- 0.5", giving 16383 rather
 * than 16384.
 */
38 | #define C0 23170 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
||
39 | #define C1 22725 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
||
40 | #define C2 21407 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
||
41 | #define C3 19266 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
||
42 | #define C4 16383 //cos(i*M_PI/16)*sqrt(2)*(1<<14) - 0.5 |
||
43 | #define C5 12873 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
||
44 | #define C6 8867 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
||
45 | #define C7 4520 //cos(i*M_PI/16)*sqrt(2)*(1<<14) + 0.5 |
||
46 | |||
/*
 * Right-shift amounts for the row pass and the column pass.
 * NOTE: the asm macro invocations in idct() pass the literals 11 and 20
 * instead of these names, so changing a value here only affects the
 * rounding entries of the coeffs table; keep both in sync.
 */
47 | #define ROW_SHIFT 11 |
||
48 | #define COL_SHIFT 20 // 6 |
||
49 | |||
/*
 * Word mask that keeps the upper 16-bit word of each 32-bit lane.
 * DC_COND_IDCT ANDs it with the first source quadword before OR-ing in the
 * other three and testing for zero, so the low word of each lane of that
 * first quadword is ignored by the "everything else is zero" check.
 */
50 | DECLARE_ASM_CONST(8, uint64_t, wm1010)= 0xFFFF0000FFFF0000ULL; |
||
/*
 * 1<<18 in the low 32-bit lane only. Added with paddd in the shortcut branch
 * of DC_COND_IDCT, after "pslld $16" and before "psrad $13".
 */
51 | DECLARE_ASM_CONST(8, uint64_t, d40000)= 0x0000000000040000ULL; |
||
52 | |||
/*
 * Constant table for the MMX IDCT: rounding terms followed by cosine factor
 * pairs. Each row below is four int16_t = 8 bytes, so row k starts at byte
 * offset 8*k:
 *
 *     0  row rounder                 56  C5  C7  C5  C7
 *     8  row rounder + extra 1       64  C3 -C7  C3 -C7
 *    16  C4  C4  C4  C4              72 -C1 -C5 -C1 -C5
 *    24  C4 -C4  C4 -C4              80  C5 -C1  C5 -C1
 *    32  C2  C6  C2  C6              88  C7  C3  C7  C3
 *    40  C6 -C2  C6 -C2              96  C7 -C5  C7 -C5
 *    48  C1  C3  C1  C3             104  C3 -C1  C3 -C1
 *
 * The asm macros in this file read a table through operand %2 at exactly
 * these offsets, and their per-instruction comments list the words from
 * high to low. The operand binding itself is outside the visible part of
 * the file -- presumably %2 is this array; confirm.
 * Do not reorder, insert or remove rows without updating every offset.
 */
53 | DECLARE_ALIGNED(8, static const int16_t, coeffs)[]= { |
||
54 | 1<<(ROW_SHIFT-1), 0, 1<<(ROW_SHIFT-1), 0, |
||
55 | // 1<<(COL_SHIFT-1), 0, 1<<(COL_SHIFT-1), 0, |
||
56 | // 0, 1<<(COL_SHIFT-1-16), 0, 1<<(COL_SHIFT-1-16), |
||
57 | 1<<(ROW_SHIFT-1), 1, 1<<(ROW_SHIFT-1), 0, |
||
58 | // the 1 = ((1<<(COL_SHIFT-1))/C4)<<ROW_SHIFT :) |
/*
 * Read as a little-endian 32-bit lane, the pair (1<<(ROW_SHIFT-1), 1) is
 * (1<<16) + (1<<(ROW_SHIFT-1)). The formula above evaluates to about
 * 32<<11 = 1<<16, which is the extra "1" in the upper word.
 */
||
59 | // 0, 0, 0, 0, |
||
60 | // 0, 0, 0, 0, |
||
61 | |||
62 | C4, C4, C4, C4, |
||
63 | C4, -C4, C4, -C4, |
||
64 | |||
65 | C2, C6, C2, C6, |
||
66 | C6, -C2, C6, -C2, |
||
67 | |||
68 | C1, C3, C1, C3, |
||
69 | C5, C7, C5, C7, |
||
70 | |||
71 | C3, -C7, C3, -C7, |
||
72 | -C1, -C5, -C1, -C5, |
||
73 | |||
74 | C5, -C1, C5, -C1, |
||
75 | C7, C3, C7, C3, |
||
76 | |||
77 | C7, -C5, C7, -C5, |
||
78 | C3, -C1, C3, -C1 |
||
79 | }; |
||
80 | |||
81 | static inline void idct(int16_t *block) |
||
82 | { |
||
83 | LOCAL_ALIGNED_8(int64_t, align_tmp, [16]); |
||
84 | int16_t * const temp= (int16_t*)align_tmp; |
||
85 | |||
86 | __asm__ volatile( |
||
87 | #if 0 //Alternative, simpler variant |
||
88 | |||
89 | #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ |
||
90 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
||
91 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
||
92 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
||
93 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
||
94 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
||
95 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
96 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
||
97 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
98 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
||
99 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
||
100 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
||
101 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
||
102 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
||
103 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
||
104 | #rounder ", %%mm4 \n\t"\ |
||
105 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
106 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
||
107 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
||
108 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ |
||
109 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
||
110 | #rounder ", %%mm0 \n\t"\ |
||
111 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
||
112 | "paddd %%mm0, %%mm0 \n\t" \ |
||
113 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ |
||
114 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
||
115 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ |
||
116 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ |
||
117 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
||
118 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
||
119 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
||
120 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
121 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ |
||
122 | "psrad $" #shift ", %%mm7 \n\t"\ |
||
123 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
124 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
||
125 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ |
||
126 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
||
127 | "psrad $" #shift ", %%mm1 \n\t"\ |
||
128 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
129 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ |
||
130 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ |
||
131 | "movq %%mm7, " #dst " \n\t"\ |
||
132 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ |
||
133 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
||
134 | "movq %%mm2, 24+" #dst " \n\t"\ |
||
135 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
||
136 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
||
137 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
||
138 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
||
139 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
||
140 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
||
141 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
||
142 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
||
143 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ |
||
144 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
145 | "psrad $" #shift ", %%mm0 \n\t"\ |
||
146 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
||
147 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ |
||
148 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
149 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
||
150 | "psrad $" #shift ", %%mm6 \n\t"\ |
||
151 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ |
||
152 | "movq %%mm2, 8+" #dst " \n\t"\ |
||
153 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
154 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ |
||
155 | "movq %%mm4, 16+" #dst " \n\t"\ |
||
156 | |||
157 | #define COL_IDCT(src0, src4, src1, src5, dst, shift) \ |
||
158 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
||
159 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
||
160 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
||
161 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
||
162 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
||
163 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
164 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
||
165 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
166 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
||
167 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
||
168 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
||
169 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
||
170 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
171 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
||
172 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
||
173 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
||
174 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
||
175 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
176 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ |
||
177 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ |
||
178 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ |
||
179 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
||
180 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
||
181 | "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ |
||
182 | "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ |
||
183 | "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
||
184 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
||
185 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
||
186 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
187 | "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ |
||
188 | "psrad $" #shift ", %%mm7 \n\t"\ |
||
189 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
190 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
||
191 | "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
||
192 | "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
||
193 | "psrad $" #shift ", %%mm0 \n\t"\ |
||
194 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
195 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
||
196 | "movd %%mm7, " #dst " \n\t"\ |
||
197 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
||
198 | "movd %%mm0, 16+" #dst " \n\t"\ |
||
199 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
||
200 | "movd %%mm2, 96+" #dst " \n\t"\ |
||
201 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
202 | "movd %%mm4, 112+" #dst " \n\t"\ |
||
203 | "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ |
||
204 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
||
205 | "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
||
206 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
||
207 | "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
||
208 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
||
209 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
||
210 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
||
211 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
||
212 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
||
213 | "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
||
214 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
215 | "psrad $" #shift ", %%mm5 \n\t"\ |
||
216 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
||
217 | "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ |
||
218 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
219 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
||
220 | "psrad $" #shift ", %%mm6 \n\t"\ |
||
221 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
222 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
||
223 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
224 | "movd %%mm2, 32+" #dst " \n\t"\ |
||
225 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ |
||
226 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
||
227 | "movd %%mm6, 48+" #dst " \n\t"\ |
||
228 | "movd %%mm4, 64+" #dst " \n\t"\ |
||
229 | "movd %%mm5, 80+" #dst " \n\t"\ |
||
230 | |||
231 | |||
232 | #define DC_COND_ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ |
||
233 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
||
234 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
||
235 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
||
236 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
||
237 | "movq "MANGLE(wm1010)", %%mm4 \n\t"\ |
||
238 | "pand %%mm0, %%mm4 \n\t"\ |
||
239 | "por %%mm1, %%mm4 \n\t"\ |
||
240 | "por %%mm2, %%mm4 \n\t"\ |
||
241 | "por %%mm3, %%mm4 \n\t"\ |
||
242 | "packssdw %%mm4,%%mm4 \n\t"\ |
||
243 | "movd %%mm4, %%eax \n\t"\ |
||
244 | "orl %%eax, %%eax \n\t"\ |
||
245 | "jz 1f \n\t"\ |
||
246 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
||
247 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
248 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
||
249 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
250 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
||
251 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
||
252 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
||
253 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
||
254 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
||
255 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
||
256 | #rounder ", %%mm4 \n\t"\ |
||
257 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
258 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
||
259 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
||
260 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ |
||
261 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
||
262 | #rounder ", %%mm0 \n\t"\ |
||
263 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
||
264 | "paddd %%mm0, %%mm0 \n\t" \ |
||
265 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ |
||
266 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
||
267 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ |
||
268 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ |
||
269 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
||
270 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
||
271 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
||
272 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
273 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ |
||
274 | "psrad $" #shift ", %%mm7 \n\t"\ |
||
275 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
276 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
||
277 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ |
||
278 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
||
279 | "psrad $" #shift ", %%mm1 \n\t"\ |
||
280 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
281 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ |
||
282 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ |
||
283 | "movq %%mm7, " #dst " \n\t"\ |
||
284 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ |
||
285 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
||
286 | "movq %%mm2, 24+" #dst " \n\t"\ |
||
287 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
||
288 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
||
289 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
||
290 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
||
291 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
||
292 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
||
293 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
||
294 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
||
295 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ |
||
296 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
297 | "psrad $" #shift ", %%mm0 \n\t"\ |
||
298 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
||
299 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ |
||
300 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
301 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
||
302 | "psrad $" #shift ", %%mm6 \n\t"\ |
||
303 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ |
||
304 | "movq %%mm2, 8+" #dst " \n\t"\ |
||
305 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
306 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ |
||
307 | "movq %%mm4, 16+" #dst " \n\t"\ |
||
308 | "jmp 2f \n\t"\ |
||
309 | "1: \n\t"\ |
||
310 | "pslld $16, %%mm0 \n\t"\ |
||
311 | "#paddd "MANGLE(d40000)", %%mm0 \n\t"\ |
||
312 | "psrad $13, %%mm0 \n\t"\ |
||
313 | "packssdw %%mm0, %%mm0 \n\t"\ |
||
314 | "movq %%mm0, " #dst " \n\t"\ |
||
315 | "movq %%mm0, 8+" #dst " \n\t"\ |
||
316 | "movq %%mm0, 16+" #dst " \n\t"\ |
||
317 | "movq %%mm0, 24+" #dst " \n\t"\ |
||
318 | "2: \n\t" |
||
319 | |||
320 | |||
321 | //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
||
322 | ROW_IDCT( (%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) |
||
323 | /*ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1), paddd (%2), 11) |
||
324 | ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1), paddd (%2), 11) |
||
325 | ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1), paddd (%2), 11)*/ |
||
326 | |||
327 | DC_COND_ROW_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11) |
||
328 | DC_COND_ROW_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11) |
||
329 | DC_COND_ROW_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11) |
||
330 | |||
331 | |||
332 | //IDCT( src0, src4, src1, src5, dst, shift) |
||
333 | COL_IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
||
334 | COL_IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
||
335 | COL_IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
||
336 | COL_IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
||
337 | |||
338 | #else |
||
339 | |||
340 | #define DC_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ |
||
341 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
||
342 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
||
343 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
||
344 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
||
345 | "movq "MANGLE(wm1010)", %%mm4 \n\t"\ |
||
346 | "pand %%mm0, %%mm4 \n\t"\ |
||
347 | "por %%mm1, %%mm4 \n\t"\ |
||
348 | "por %%mm2, %%mm4 \n\t"\ |
||
349 | "por %%mm3, %%mm4 \n\t"\ |
||
350 | "packssdw %%mm4,%%mm4 \n\t"\ |
||
351 | "movd %%mm4, %%eax \n\t"\ |
||
352 | "orl %%eax, %%eax \n\t"\ |
||
353 | "jz 1f \n\t"\ |
||
354 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
||
355 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
356 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
||
357 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
358 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
||
359 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
||
360 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
||
361 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
||
362 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
||
363 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
||
364 | #rounder ", %%mm4 \n\t"\ |
||
365 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
366 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
||
367 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
||
368 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ |
||
369 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
||
370 | #rounder ", %%mm0 \n\t"\ |
||
371 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
||
372 | "paddd %%mm0, %%mm0 \n\t" \ |
||
373 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ |
||
374 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
||
375 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ |
||
376 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ |
||
377 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
||
378 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
||
379 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
||
380 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
381 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ |
||
382 | "psrad $" #shift ", %%mm7 \n\t"\ |
||
383 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
384 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
||
385 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ |
||
386 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
||
387 | "psrad $" #shift ", %%mm1 \n\t"\ |
||
388 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
389 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ |
||
390 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ |
||
391 | "movq %%mm7, " #dst " \n\t"\ |
||
392 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ |
||
393 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
||
394 | "movq %%mm2, 24+" #dst " \n\t"\ |
||
395 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
||
396 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
||
397 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
||
398 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
||
399 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
||
400 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
||
401 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
||
402 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
||
403 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ |
||
404 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
405 | "psrad $" #shift ", %%mm0 \n\t"\ |
||
406 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
||
407 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ |
||
408 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
409 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
||
410 | "psrad $" #shift ", %%mm6 \n\t"\ |
||
411 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ |
||
412 | "movq %%mm2, 8+" #dst " \n\t"\ |
||
413 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
414 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ |
||
415 | "movq %%mm4, 16+" #dst " \n\t"\ |
||
416 | "jmp 2f \n\t"\ |
||
417 | "1: \n\t"\ |
||
418 | "pslld $16, %%mm0 \n\t"\ |
||
419 | "paddd "MANGLE(d40000)", %%mm0 \n\t"\ |
||
420 | "psrad $13, %%mm0 \n\t"\ |
||
421 | "packssdw %%mm0, %%mm0 \n\t"\ |
||
422 | "movq %%mm0, " #dst " \n\t"\ |
||
423 | "movq %%mm0, 8+" #dst " \n\t"\ |
||
424 | "movq %%mm0, 16+" #dst " \n\t"\ |
||
425 | "movq %%mm0, 24+" #dst " \n\t"\ |
||
426 | "2: \n\t" |
||
427 | |||
428 | #define Z_COND_IDCT(src0, src4, src1, src5, dst, rounder, shift, bt) \ |
||
429 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
||
430 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
||
431 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
||
432 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
||
433 | "movq %%mm0, %%mm4 \n\t"\ |
||
434 | "por %%mm1, %%mm4 \n\t"\ |
||
435 | "por %%mm2, %%mm4 \n\t"\ |
||
436 | "por %%mm3, %%mm4 \n\t"\ |
||
437 | "packssdw %%mm4,%%mm4 \n\t"\ |
||
438 | "movd %%mm4, %%eax \n\t"\ |
||
439 | "orl %%eax, %%eax \n\t"\ |
||
440 | "jz " #bt " \n\t"\ |
||
441 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
||
442 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
443 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
||
444 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
445 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
||
446 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
||
447 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
||
448 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
||
449 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
||
450 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
||
451 | #rounder ", %%mm4 \n\t"\ |
||
452 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
453 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
||
454 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
||
455 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ |
||
456 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
||
457 | #rounder ", %%mm0 \n\t"\ |
||
458 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
||
459 | "paddd %%mm0, %%mm0 \n\t" \ |
||
460 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ |
||
461 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
||
462 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ |
||
463 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ |
||
464 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
||
465 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
||
466 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
||
467 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
468 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ |
||
469 | "psrad $" #shift ", %%mm7 \n\t"\ |
||
470 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
471 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
||
472 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ |
||
473 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
||
474 | "psrad $" #shift ", %%mm1 \n\t"\ |
||
475 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
476 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ |
||
477 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ |
||
478 | "movq %%mm7, " #dst " \n\t"\ |
||
479 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ |
||
480 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
||
481 | "movq %%mm2, 24+" #dst " \n\t"\ |
||
482 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
||
483 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
||
484 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
||
485 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
||
486 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
||
487 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
||
488 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
||
489 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
||
490 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ |
||
491 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
492 | "psrad $" #shift ", %%mm0 \n\t"\ |
||
493 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
||
494 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ |
||
495 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
496 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
||
497 | "psrad $" #shift ", %%mm6 \n\t"\ |
||
498 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ |
||
499 | "movq %%mm2, 8+" #dst " \n\t"\ |
||
500 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
501 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ |
||
502 | "movq %%mm4, 16+" #dst " \n\t"\ |
||
503 | |||
504 | #define ROW_IDCT(src0, src4, src1, src5, dst, rounder, shift) \ |
||
505 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
||
506 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
||
507 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
||
508 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
||
509 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
||
510 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
511 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
||
512 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
513 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
||
514 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
||
515 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
||
516 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
||
517 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
||
518 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
||
519 | #rounder ", %%mm4 \n\t"\ |
||
520 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
521 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
||
522 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
||
523 | "movq 56(%2), %%mm5 \n\t" /* C7 C5 C7 C5 */\ |
||
524 | "pmaddwd %%mm3, %%mm5 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
||
525 | #rounder ", %%mm0 \n\t"\ |
||
526 | "paddd %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
||
527 | "paddd %%mm0, %%mm0 \n\t" \ |
||
528 | "psubd %%mm1, %%mm0 \n\t" /* A2 a2 */\ |
||
529 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
||
530 | "paddd %%mm5, %%mm7 \n\t" /* B0 b0 */\ |
||
531 | "movq 72(%2), %%mm5 \n\t" /* -C5 -C1 -C5 -C1 */\ |
||
532 | "pmaddwd %%mm3, %%mm5 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
||
533 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
||
534 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
||
535 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
536 | "paddd %%mm2, %%mm5 \n\t" /* B1 b1 */\ |
||
537 | "psrad $" #shift ", %%mm7 \n\t"\ |
||
538 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
539 | "movq %%mm1, %%mm2 \n\t" /* A1 a1 */\ |
||
540 | "paddd %%mm5, %%mm1 \n\t" /* A1+B1 a1+b1 */\ |
||
541 | "psubd %%mm5, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
||
542 | "psrad $" #shift ", %%mm1 \n\t"\ |
||
543 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
544 | "packssdw %%mm1, %%mm7 \n\t" /* A1+B1 a1+b1 A0+B0 a0+b0 */\ |
||
545 | "packssdw %%mm4, %%mm2 \n\t" /* A0-B0 a0-b0 A1-B1 a1-b1 */\ |
||
546 | "movq %%mm7, " #dst " \n\t"\ |
||
547 | "movq " #src1 ", %%mm1 \n\t" /* R3 R1 r3 r1 */\ |
||
548 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
||
549 | "movq %%mm2, 24+" #dst " \n\t"\ |
||
550 | "pmaddwd %%mm1, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
||
551 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
||
552 | "pmaddwd 96(%2), %%mm1 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
||
553 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
||
554 | "movq %%mm0, %%mm2 \n\t" /* A2 a2 */\ |
||
555 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
||
556 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
||
557 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
||
558 | "psubd %%mm4, %%mm0 \n\t" /* a2-B2 a2-b2 */\ |
||
559 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
560 | "psrad $" #shift ", %%mm0 \n\t"\ |
||
561 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
||
562 | "paddd %%mm1, %%mm3 \n\t" /* B3 b3 */\ |
||
563 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
564 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
||
565 | "psrad $" #shift ", %%mm6 \n\t"\ |
||
566 | "packssdw %%mm6, %%mm2 \n\t" /* A3+B3 a3+b3 A2+B2 a2+b2 */\ |
||
567 | "movq %%mm2, 8+" #dst " \n\t"\ |
||
568 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
569 | "packssdw %%mm0, %%mm4 \n\t" /* A2-B2 a2-b2 A3-B3 a3-b3 */\ |
||
570 | "movq %%mm4, 16+" #dst " \n\t"\ |
||
571 | |||
572 | //IDCT( src0, src4, src1, src5, dst, rounder, shift) |
||
573 | DC_COND_IDCT( 0(%0), 8(%0), 16(%0), 24(%0), 0(%1),paddd 8(%2), 11) |
||
574 | Z_COND_IDCT( 32(%0), 40(%0), 48(%0), 56(%0), 32(%1),paddd (%2), 11, 4f) |
||
575 | Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 2f) |
||
576 | Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 1f) |
||
577 | |||
578 | #undef IDCT |
||
579 | #define IDCT(src0, src4, src1, src5, dst, shift) \ |
||
580 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
||
581 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
||
582 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
||
583 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
||
584 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
||
585 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
586 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
||
587 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
588 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
||
589 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
||
590 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
||
591 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
||
592 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
593 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
||
594 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
||
595 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
||
596 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
||
597 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
598 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ |
||
599 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ |
||
600 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ |
||
601 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
||
602 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
||
603 | "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ |
||
604 | "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ |
||
605 | "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
||
606 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
||
607 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
||
608 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
609 | "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ |
||
610 | "psrad $" #shift ", %%mm7 \n\t"\ |
||
611 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
612 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
||
613 | "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
||
614 | "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
||
615 | "psrad $" #shift ", %%mm0 \n\t"\ |
||
616 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
617 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
||
618 | "movd %%mm7, " #dst " \n\t"\ |
||
619 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
||
620 | "movd %%mm0, 16+" #dst " \n\t"\ |
||
621 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
||
622 | "movd %%mm2, 96+" #dst " \n\t"\ |
||
623 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
624 | "movd %%mm4, 112+" #dst " \n\t"\ |
||
625 | "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ |
||
626 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
||
627 | "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
||
628 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
||
629 | "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
||
630 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
||
631 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
||
632 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
||
633 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
||
634 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
||
635 | "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
||
636 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
637 | "psrad $" #shift ", %%mm5 \n\t"\ |
||
638 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
||
639 | "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ |
||
640 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
641 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
||
642 | "psrad $" #shift ", %%mm6 \n\t"\ |
||
643 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
644 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
||
645 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
646 | "movd %%mm2, 32+" #dst " \n\t"\ |
||
647 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ |
||
648 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
||
649 | "movd %%mm6, 48+" #dst " \n\t"\ |
||
650 | "movd %%mm4, 64+" #dst " \n\t"\ |
||
651 | "movd %%mm5, 80+" #dst " \n\t" |
||
652 | |||
653 | |||
654 | //IDCT( src0, src4, src1, src5, dst, shift) |
||
655 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
||
656 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
||
657 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
||
658 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
||
659 | "jmp 9f \n\t" |
||
660 | |||
661 | "# .p2align 4 \n\t"\ |
||
662 | "4: \n\t" |
||
663 | Z_COND_IDCT( 64(%0), 72(%0), 80(%0), 88(%0), 64(%1),paddd (%2), 11, 6f) |
||
664 | Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 5f) |
||
665 | |||
666 | #undef IDCT |
||
667 | #define IDCT(src0, src4, src1, src5, dst, shift) \ |
||
668 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
||
669 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
||
670 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
||
671 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
||
672 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
673 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
||
674 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
675 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
||
676 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
||
677 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
||
678 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
||
679 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
680 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
||
681 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
||
682 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
683 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ |
||
684 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ |
||
685 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ |
||
686 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
||
687 | "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ |
||
688 | "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
||
689 | "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
||
690 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
||
691 | "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
692 | "psrad $" #shift ", %%mm1 \n\t"\ |
||
693 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
694 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
||
695 | "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
||
696 | "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
||
697 | "psrad $" #shift ", %%mm0 \n\t"\ |
||
698 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
699 | "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
||
700 | "movd %%mm1, " #dst " \n\t"\ |
||
701 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
||
702 | "movd %%mm0, 16+" #dst " \n\t"\ |
||
703 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
||
704 | "movd %%mm2, 96+" #dst " \n\t"\ |
||
705 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
706 | "movd %%mm4, 112+" #dst " \n\t"\ |
||
707 | "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ |
||
708 | "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
||
709 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
||
710 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
||
711 | "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
||
712 | "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
||
713 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
714 | "psrad $" #shift ", %%mm5 \n\t"\ |
||
715 | "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ |
||
716 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
717 | "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ |
||
718 | "psrad $" #shift ", %%mm6 \n\t"\ |
||
719 | "psrad $" #shift ", %%mm1 \n\t"\ |
||
720 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
||
721 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
722 | "movd %%mm2, 32+" #dst " \n\t"\ |
||
723 | "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ |
||
724 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
||
725 | "movd %%mm6, 48+" #dst " \n\t"\ |
||
726 | "movd %%mm1, 64+" #dst " \n\t"\ |
||
727 | "movd %%mm5, 80+" #dst " \n\t" |
||
728 | |||
729 | //IDCT( src0, src4, src1, src5, dst, shift) |
||
730 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
||
731 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
||
732 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
||
733 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
||
734 | "jmp 9f \n\t" |
||
735 | |||
736 | "# .p2align 4 \n\t"\ |
||
737 | "6: \n\t" |
||
738 | Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 7f) |
||
739 | |||
740 | #undef IDCT |
||
741 | #define IDCT(src0, src4, src1, src5, dst, shift) \ |
||
742 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
||
743 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
||
744 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
||
745 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
746 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
||
747 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
748 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
749 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
750 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ |
||
751 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
||
752 | "movq 72(%2), %%mm7 \n\t" /* -C5 -C1 -C5 -C1 */\ |
||
753 | "pmaddwd %%mm3, %%mm7 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
||
754 | "paddd %%mm4, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
||
755 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
||
756 | "psubd %%mm1, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
757 | "psrad $" #shift ", %%mm1 \n\t"\ |
||
758 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
759 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
||
760 | "paddd %%mm7, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
||
761 | "psubd %%mm7, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
||
762 | "psrad $" #shift ", %%mm0 \n\t"\ |
||
763 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
764 | "packssdw %%mm1, %%mm1 \n\t" /* A0+B0 a0+b0 */\ |
||
765 | "movd %%mm1, " #dst " \n\t"\ |
||
766 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
||
767 | "movd %%mm0, 16+" #dst " \n\t"\ |
||
768 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
||
769 | "movd %%mm2, 96+" #dst " \n\t"\ |
||
770 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
771 | "movd %%mm4, 112+" #dst " \n\t"\ |
||
772 | "movq 88(%2), %%mm1 \n\t" /* C3 C7 C3 C7 */\ |
||
773 | "pmaddwd %%mm3, %%mm1 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
||
774 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
||
775 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
||
776 | "paddd %%mm1, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
||
777 | "psubd %%mm1, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
||
778 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
779 | "psrad $" #shift ", %%mm5 \n\t"\ |
||
780 | "movq %%mm6, %%mm1 \n\t" /* A3 a3 */\ |
||
781 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
782 | "psubd %%mm3, %%mm1 \n\t" /* a3-B3 a3-b3 */\ |
||
783 | "psrad $" #shift ", %%mm6 \n\t"\ |
||
784 | "psrad $" #shift ", %%mm1 \n\t"\ |
||
785 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
||
786 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
787 | "movd %%mm2, 32+" #dst " \n\t"\ |
||
788 | "packssdw %%mm1, %%mm1 \n\t" /* A3-B3 a3-b3 */\ |
||
789 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
||
790 | "movd %%mm6, 48+" #dst " \n\t"\ |
||
791 | "movd %%mm1, 64+" #dst " \n\t"\ |
||
792 | "movd %%mm5, 80+" #dst " \n\t" |
||
793 | |||
794 | |||
795 | //IDCT( src0, src4, src1, src5, dst, shift) |
||
796 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
||
797 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
||
798 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
||
799 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
||
800 | "jmp 9f \n\t" |
||
801 | |||
802 | "# .p2align 4 \n\t"\ |
||
803 | "2: \n\t" |
||
804 | Z_COND_IDCT( 96(%0),104(%0),112(%0),120(%0), 96(%1),paddd (%2), 11, 3f) |
||
805 | |||
806 | #undef IDCT |
||
807 | #define IDCT(src0, src4, src1, src5, dst, shift) \ |
||
808 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
||
809 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
||
810 | "movq " #src5 ", %%mm3 \n\t" /* R7 R5 r7 r5 */\ |
||
811 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
||
812 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
813 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
||
814 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
815 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
816 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
||
817 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
||
818 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
819 | "movq 56(%2), %%mm1 \n\t" /* C7 C5 C7 C5 */\ |
||
820 | "pmaddwd %%mm3, %%mm1 \n\t" /* C7R7+C5R5 C7r7+C5r5 */\ |
||
821 | "pmaddwd 64(%2), %%mm2 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
||
822 | "paddd %%mm1, %%mm7 \n\t" /* B0 b0 */\ |
||
823 | "movq 72(%2), %%mm1 \n\t" /* -C5 -C1 -C5 -C1 */\ |
||
824 | "pmaddwd %%mm3, %%mm1 \n\t" /* -C5R7-C1R5 -C5r7-C1r5 */\ |
||
825 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
||
826 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
||
827 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
828 | "paddd %%mm2, %%mm1 \n\t" /* B1 b1 */\ |
||
829 | "psrad $" #shift ", %%mm7 \n\t"\ |
||
830 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
831 | "movq %%mm0, %%mm2 \n\t" /* A1 a1 */\ |
||
832 | "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
||
833 | "psubd %%mm1, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
||
834 | "psrad $" #shift ", %%mm0 \n\t"\ |
||
835 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
836 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
||
837 | "movd %%mm7, " #dst " \n\t"\ |
||
838 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
||
839 | "movd %%mm0, 16+" #dst " \n\t"\ |
||
840 | "packssdw %%mm2, %%mm2 \n\t" /* A1-B1 a1-b1 */\ |
||
841 | "movd %%mm2, 96+" #dst " \n\t"\ |
||
842 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
843 | "movd %%mm4, 112+" #dst " \n\t"\ |
||
844 | "movq " #src1 ", %%mm0 \n\t" /* R3 R1 r3 r1 */\ |
||
845 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
||
846 | "pmaddwd %%mm0, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
||
847 | "movq 88(%2), %%mm7 \n\t" /* C3 C7 C3 C7 */\ |
||
848 | "pmaddwd 96(%2), %%mm0 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
||
849 | "pmaddwd %%mm3, %%mm7 \n\t" /* C3R7+C7R5 C3r7+C7r5 */\ |
||
850 | "movq %%mm5, %%mm2 \n\t" /* A2 a2 */\ |
||
851 | "pmaddwd 104(%2), %%mm3 \n\t" /* -C1R7+C3R5 -C1r7+C3r5 */\ |
||
852 | "paddd %%mm7, %%mm4 \n\t" /* B2 b2 */\ |
||
853 | "paddd %%mm4, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
||
854 | "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
||
855 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
856 | "psrad $" #shift ", %%mm5 \n\t"\ |
||
857 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
||
858 | "paddd %%mm0, %%mm3 \n\t" /* B3 b3 */\ |
||
859 | "paddd %%mm3, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
860 | "psubd %%mm3, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
||
861 | "psrad $" #shift ", %%mm6 \n\t"\ |
||
862 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
863 | "packssdw %%mm2, %%mm2 \n\t" /* A2+B2 a2+b2 */\ |
||
864 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
865 | "movd %%mm2, 32+" #dst " \n\t"\ |
||
866 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ |
||
867 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
||
868 | "movd %%mm6, 48+" #dst " \n\t"\ |
||
869 | "movd %%mm4, 64+" #dst " \n\t"\ |
||
870 | "movd %%mm5, 80+" #dst " \n\t" |
||
871 | |||
872 | //IDCT( src0, src4, src1, src5, dst, shift) |
||
873 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
||
874 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
||
875 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
||
876 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
||
877 | "jmp 9f \n\t" |
||
878 | |||
879 | "# .p2align 4 \n\t"\ |
||
880 | "3: \n\t" |
||
881 | #undef IDCT |
||
882 | #define IDCT(src0, src4, src1, src5, dst, shift) \ |
||
883 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
||
884 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
||
885 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
||
886 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
887 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
||
888 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
889 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
890 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
||
891 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
||
892 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
893 | "movq 64(%2), %%mm3 \n\t"\ |
||
894 | "pmaddwd %%mm2, %%mm3 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
||
895 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
||
896 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
||
897 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
898 | "psrad $" #shift ", %%mm7 \n\t"\ |
||
899 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
900 | "movq %%mm0, %%mm1 \n\t" /* A1 a1 */\ |
||
901 | "paddd %%mm3, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
||
902 | "psubd %%mm3, %%mm1 \n\t" /* A1-B1 a1-b1 */\ |
||
903 | "psrad $" #shift ", %%mm0 \n\t"\ |
||
904 | "psrad $" #shift ", %%mm1 \n\t"\ |
||
905 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
||
906 | "movd %%mm7, " #dst " \n\t"\ |
||
907 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
||
908 | "movd %%mm0, 16+" #dst " \n\t"\ |
||
909 | "packssdw %%mm1, %%mm1 \n\t" /* A1-B1 a1-b1 */\ |
||
910 | "movd %%mm1, 96+" #dst " \n\t"\ |
||
911 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
912 | "movd %%mm4, 112+" #dst " \n\t"\ |
||
913 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
||
914 | "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
||
915 | "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
||
916 | "movq %%mm5, %%mm1 \n\t" /* A2 a2 */\ |
||
917 | "paddd %%mm4, %%mm1 \n\t" /* A2+B2 a2+b2 */\ |
||
918 | "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
||
919 | "psrad $" #shift ", %%mm1 \n\t"\ |
||
920 | "psrad $" #shift ", %%mm5 \n\t"\ |
||
921 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
||
922 | "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
923 | "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
||
924 | "psrad $" #shift ", %%mm6 \n\t"\ |
||
925 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
926 | "packssdw %%mm1, %%mm1 \n\t" /* A2+B2 a2+b2 */\ |
||
927 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
928 | "movd %%mm1, 32+" #dst " \n\t"\ |
||
929 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ |
||
930 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
||
931 | "movd %%mm6, 48+" #dst " \n\t"\ |
||
932 | "movd %%mm4, 64+" #dst " \n\t"\ |
||
933 | "movd %%mm5, 80+" #dst " \n\t" |
||
934 | |||
935 | |||
936 | //IDCT( src0, src4, src1, src5, dst, shift) |
||
937 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
||
938 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
||
939 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
||
940 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
||
941 | "jmp 9f \n\t" |
||
942 | |||
943 | "# .p2align 4 \n\t"\ |
||
944 | "5: \n\t" |
||
945 | #undef IDCT |
||
946 | #define IDCT(src0, src4, src1, src5, dst, shift) \ |
||
947 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
||
948 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
||
949 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
||
950 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
951 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
||
952 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
953 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
||
954 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
||
955 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
||
956 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
||
957 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
958 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
||
959 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
||
960 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
961 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ |
||
962 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ |
||
963 | "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ |
||
964 | "movq 8+" #src4 ", %%mm3 \n\t" /* R6 R2 r6 r2 */\ |
||
965 | "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ |
||
966 | "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
967 | "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ |
||
968 | "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
969 | "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ |
||
970 | "pmaddwd %%mm3, %%mm7 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
||
971 | "pmaddwd 40(%2), %%mm3 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
||
972 | "paddd %%mm1, %%mm7 \n\t" /* A0 a0 */\ |
||
973 | "paddd %%mm1, %%mm1 \n\t" /* 2C0 2c0 */\ |
||
974 | "psubd %%mm7, %%mm1 \n\t" /* A3 a3 */\ |
||
975 | "paddd %%mm2, %%mm3 \n\t" /* A1 a1 */\ |
||
976 | "paddd %%mm2, %%mm2 \n\t" /* 2C1 2c1 */\ |
||
977 | "psubd %%mm3, %%mm2 \n\t" /* A2 a2 */\ |
||
978 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
979 | "psrad $" #shift ", %%mm7 \n\t"\ |
||
980 | "psrad $" #shift ", %%mm3 \n\t"\ |
||
981 | "packssdw %%mm7, %%mm4 \n\t" /* A0 a0 */\ |
||
982 | "movq %%mm4, " #dst " \n\t"\ |
||
983 | "psrad $" #shift ", %%mm0 \n\t"\ |
||
984 | "packssdw %%mm3, %%mm0 \n\t" /* A1 a1 */\ |
||
985 | "movq %%mm0, 16+" #dst " \n\t"\ |
||
986 | "movq %%mm0, 96+" #dst " \n\t"\ |
||
987 | "movq %%mm4, 112+" #dst " \n\t"\ |
||
988 | "psrad $" #shift ", %%mm5 \n\t"\ |
||
989 | "psrad $" #shift ", %%mm6 \n\t"\ |
||
990 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
991 | "packssdw %%mm2, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
||
992 | "movq %%mm5, 32+" #dst " \n\t"\ |
||
993 | "psrad $" #shift ", %%mm1 \n\t"\ |
||
994 | "packssdw %%mm1, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
995 | "movq %%mm6, 48+" #dst " \n\t"\ |
||
996 | "movq %%mm6, 64+" #dst " \n\t"\ |
||
997 | "movq %%mm5, 80+" #dst " \n\t" |
||
998 | |||
999 | |||
1000 | //IDCT( src0, src4, src1, src5, dst, shift) |
||
1001 | IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
||
1002 | //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
||
1003 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
||
1004 | //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
||
1005 | "jmp 9f \n\t" |
||
1006 | |||
1007 | |||
1008 | "# .p2align 4 \n\t"\ |
||
1009 | "1: \n\t" |
||
1010 | #undef IDCT |
||
1011 | #define IDCT(src0, src4, src1, src5, dst, shift) \ |
||
1012 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
||
1013 | "movq " #src4 ", %%mm1 \n\t" /* R6 R2 r6 r2 */\ |
||
1014 | "movq " #src1 ", %%mm2 \n\t" /* R3 R1 r3 r1 */\ |
||
1015 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
||
1016 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
1017 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
||
1018 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
1019 | "movq 32(%2), %%mm5 \n\t" /* C6 C2 C6 C2 */\ |
||
1020 | "pmaddwd %%mm1, %%mm5 \n\t" /* C6R6+C2R2 C6r6+C2r2 */\ |
||
1021 | "movq 40(%2), %%mm6 \n\t" /* -C2 C6 -C2 C6 */\ |
||
1022 | "pmaddwd %%mm6, %%mm1 \n\t" /* -C2R6+C6R2 -C2r6+C6r2 */\ |
||
1023 | "movq %%mm4, %%mm6 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
1024 | "movq 48(%2), %%mm7 \n\t" /* C3 C1 C3 C1 */\ |
||
1025 | "pmaddwd %%mm2, %%mm7 \n\t" /* C3R3+C1R1 C3r3+C1r1 */\ |
||
1026 | "paddd %%mm5, %%mm4 \n\t" /* A0 a0 */\ |
||
1027 | "psubd %%mm5, %%mm6 \n\t" /* A3 a3 */\ |
||
1028 | "movq %%mm0, %%mm5 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
1029 | "paddd %%mm1, %%mm0 \n\t" /* A1 a1 */\ |
||
1030 | "psubd %%mm1, %%mm5 \n\t" /* A2 a2 */\ |
||
1031 | "movq 64(%2), %%mm1 \n\t"\ |
||
1032 | "pmaddwd %%mm2, %%mm1 \n\t" /* -C7R3+C3R1 -C7r3+C3r1 */\ |
||
1033 | "paddd %%mm4, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
||
1034 | "paddd %%mm4, %%mm4 \n\t" /* 2A0 2a0 */\ |
||
1035 | "psubd %%mm7, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
1036 | "psrad $" #shift ", %%mm7 \n\t"\ |
||
1037 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
1038 | "movq %%mm0, %%mm3 \n\t" /* A1 a1 */\ |
||
1039 | "paddd %%mm1, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
||
1040 | "psubd %%mm1, %%mm3 \n\t" /* A1-B1 a1-b1 */\ |
||
1041 | "psrad $" #shift ", %%mm0 \n\t"\ |
||
1042 | "psrad $" #shift ", %%mm3 \n\t"\ |
||
1043 | "packssdw %%mm7, %%mm7 \n\t" /* A0+B0 a0+b0 */\ |
||
1044 | "movd %%mm7, " #dst " \n\t"\ |
||
1045 | "packssdw %%mm0, %%mm0 \n\t" /* A1+B1 a1+b1 */\ |
||
1046 | "movd %%mm0, 16+" #dst " \n\t"\ |
||
1047 | "packssdw %%mm3, %%mm3 \n\t" /* A1-B1 a1-b1 */\ |
||
1048 | "movd %%mm3, 96+" #dst " \n\t"\ |
||
1049 | "packssdw %%mm4, %%mm4 \n\t" /* A0-B0 a0-b0 */\ |
||
1050 | "movd %%mm4, 112+" #dst " \n\t"\ |
||
1051 | "movq 80(%2), %%mm4 \n\t" /* -C1 C5 -C1 C5 */\ |
||
1052 | "pmaddwd %%mm2, %%mm4 \n\t" /* -C1R3+C5R1 -C1r3+C5r1 */\ |
||
1053 | "pmaddwd 96(%2), %%mm2 \n\t" /* -C5R3+C7R1 -C5r3+C7r1 */\ |
||
1054 | "movq %%mm5, %%mm3 \n\t" /* A2 a2 */\ |
||
1055 | "paddd %%mm4, %%mm3 \n\t" /* A2+B2 a2+b2 */\ |
||
1056 | "psubd %%mm4, %%mm5 \n\t" /* a2-B2 a2-b2 */\ |
||
1057 | "psrad $" #shift ", %%mm3 \n\t"\ |
||
1058 | "psrad $" #shift ", %%mm5 \n\t"\ |
||
1059 | "movq %%mm6, %%mm4 \n\t" /* A3 a3 */\ |
||
1060 | "paddd %%mm2, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
1061 | "psubd %%mm2, %%mm4 \n\t" /* a3-B3 a3-b3 */\ |
||
1062 | "psrad $" #shift ", %%mm6 \n\t"\ |
||
1063 | "packssdw %%mm3, %%mm3 \n\t" /* A2+B2 a2+b2 */\ |
||
1064 | "movd %%mm3, 32+" #dst " \n\t"\ |
||
1065 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
1066 | "packssdw %%mm6, %%mm6 \n\t" /* A3+B3 a3+b3 */\ |
||
1067 | "movd %%mm6, 48+" #dst " \n\t"\ |
||
1068 | "packssdw %%mm4, %%mm4 \n\t" /* A3-B3 a3-b3 */\ |
||
1069 | "packssdw %%mm5, %%mm5 \n\t" /* A2-B2 a2-b2 */\ |
||
1070 | "movd %%mm4, 64+" #dst " \n\t"\ |
||
1071 | "movd %%mm5, 80+" #dst " \n\t" |
||
1072 | |||
1073 | |||
1074 | //IDCT( src0, src4, src1, src5, dst, shift) |
||
1075 | IDCT( (%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
||
1076 | IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
||
1077 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
||
1078 | IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
||
1079 | "jmp 9f \n\t" |
||
1080 | |||
1081 | |||
1082 | "# .p2align 4 \n\t" |
||
1083 | "7: \n\t" |
||
1084 | #undef IDCT |
||
1085 | #define IDCT(src0, src4, src1, src5, dst, shift) \ |
||
1086 | "movq " #src0 ", %%mm0 \n\t" /* R4 R0 r4 r0 */\ |
||
1087 | "movq 16(%2), %%mm4 \n\t" /* C4 C4 C4 C4 */\ |
||
1088 | "pmaddwd %%mm0, %%mm4 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
1089 | "movq 24(%2), %%mm5 \n\t" /* -C4 C4 -C4 C4 */\ |
||
1090 | "pmaddwd %%mm5, %%mm0 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
1091 | "psrad $" #shift ", %%mm4 \n\t"\ |
||
1092 | "psrad $" #shift ", %%mm0 \n\t"\ |
||
1093 | "movq 8+" #src0 ", %%mm2 \n\t" /* R4 R0 r4 r0 */\ |
||
1094 | "movq 16(%2), %%mm1 \n\t" /* C4 C4 C4 C4 */\ |
||
1095 | "pmaddwd %%mm2, %%mm1 \n\t" /* C4R4+C4R0 C4r4+C4r0 */\ |
||
1096 | "movq 24(%2), %%mm7 \n\t" /* -C4 C4 -C4 C4 */\ |
||
1097 | "pmaddwd %%mm7, %%mm2 \n\t" /* -C4R4+C4R0 -C4r4+C4r0 */\ |
||
1098 | "movq 32(%2), %%mm7 \n\t" /* C6 C2 C6 C2 */\ |
||
1099 | "psrad $" #shift ", %%mm1 \n\t"\ |
||
1100 | "packssdw %%mm1, %%mm4 \n\t" /* A0 a0 */\ |
||
1101 | "movq %%mm4, " #dst " \n\t"\ |
||
1102 | "psrad $" #shift ", %%mm2 \n\t"\ |
||
1103 | "packssdw %%mm2, %%mm0 \n\t" /* A1 a1 */\ |
||
1104 | "movq %%mm0, 16+" #dst " \n\t"\ |
||
1105 | "movq %%mm0, 96+" #dst " \n\t"\ |
||
1106 | "movq %%mm4, 112+" #dst " \n\t"\ |
||
1107 | "movq %%mm0, 32+" #dst " \n\t"\ |
||
1108 | "movq %%mm4, 48+" #dst " \n\t"\ |
||
1109 | "movq %%mm4, 64+" #dst " \n\t"\ |
||
1110 | "movq %%mm0, 80+" #dst " \n\t" |
||
1111 | |||
1112 | //IDCT( src0, src4, src1, src5, dst, shift) |
||
1113 | IDCT( 0(%1), 64(%1), 32(%1), 96(%1), 0(%0), 20) |
||
1114 | //IDCT( 8(%1), 72(%1), 40(%1), 104(%1), 4(%0), 20) |
||
1115 | IDCT( 16(%1), 80(%1), 48(%1), 112(%1), 8(%0), 20) |
||
1116 | //IDCT( 24(%1), 88(%1), 56(%1), 120(%1), 12(%0), 20) |
||
1117 | |||
1118 | |||
1119 | #endif |
||
1120 | |||
1121 | /* |
||
1122 | Input |
||
1123 | 00 40 04 44 20 60 24 64 |
||
1124 | 10 30 14 34 50 70 54 74 |
||
1125 | 01 41 03 43 21 61 23 63 |
||
1126 | 11 31 13 33 51 71 53 73 |
||
1127 | 02 42 06 46 22 62 26 66 |
||
1128 | 12 32 16 36 52 72 56 76 |
||
1129 | 05 45 07 47 25 65 27 67 |
||
1130 | 15 35 17 37 55 75 57 77 |
||
1131 | |||
1132 | Temp |
||
1133 | 00 04 10 14 20 24 30 34 |
||
1134 | 40 44 50 54 60 64 70 74 |
||
1135 | 01 03 11 13 21 23 31 33 |
||
1136 | 41 43 51 53 61 63 71 73 |
||
1137 | 02 06 12 16 22 26 32 36 |
||
1138 | 42 46 52 56 62 66 72 76 |
||
1139 | 05 07 15 17 25 27 35 37 |
||
1140 | 45 47 55 57 65 67 75 77 |
||
1141 | */ |
||
1142 | |||
1143 | "9: \n\t" |
||
1144 | :: "r" (block), "r" (temp), "r" (coeffs) |
||
1145 | : "%eax" |
||
1146 | ); |
||
1147 | } |
||
1148 | |||
/**
 * Public entry point for the MMX "simple" inverse DCT.
 *
 * Thin exported wrapper around the file-local idct() routine; the
 * coefficients in @p block are transformed in place.
 *
 * @param block  coefficient buffer, overwritten with the IDCT result
 */
void ff_simple_idct_mmx(int16_t *block)
{
    idct(block);
}
||
1153 | |||
1154 | //FIXME merge add/put into the idct |
||
1155 | |||
/**
 * Inverse-transform a coefficient block and store it to a picture.
 *
 * @param dest       destination pixel buffer
 * @param line_size  forwarded unchanged to ff_put_pixels_clamped_mmx()
 *                   (presumably the byte stride between rows of @p dest)
 * @param block      coefficient buffer; transformed in place first, then
 *                   used as the source for the store
 */
void ff_simple_idct_put_mmx(uint8_t *dest, int line_size, int16_t *block)
{
    /* Step 1: in-place IDCT of the coefficients. */
    idct(block);

    /* Step 2: hand the transformed samples to the clamped "put" helper. */
    ff_put_pixels_clamped_mmx(block, dest, line_size);
}
||
/**
 * Inverse-transform a coefficient block and add it onto a picture.
 *
 * @param dest       pixel buffer that the result is accumulated into
 * @param line_size  forwarded unchanged to ff_add_pixels_clamped_mmx()
 *                   (presumably the byte stride between rows of @p dest)
 * @param block      coefficient buffer; transformed in place first, then
 *                   used as the source for the accumulation
 */
void ff_simple_idct_add_mmx(uint8_t *dest, int line_size, int16_t *block)
{
    /* Step 1: in-place IDCT of the coefficients. */
    idct(block);

    /* Step 2: hand the transformed samples to the clamped "add" helper. */
    ff_add_pixels_clamped_mmx(block, dest, line_size);
}
||
1166 | |||
1167 | #endif /* HAVE_INLINE_ASM */ |