Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
4349 | Serge | 1 | /* |
2 | * Chinese AVS video (AVS1-P2, JiZhun profile) decoder. |
||
3 | * Copyright (c) 2006 Stefan Gehrer |
||
4 | * |
||
5 | * MMX-optimized DSP functions, based on H.264 optimizations by |
||
6 | * Michael Niedermayer and Loren Merritt |
||
7 | * |
||
8 | * This file is part of FFmpeg. |
||
9 | * |
||
10 | * FFmpeg is free software; you can redistribute it and/or |
||
11 | * modify it under the terms of the GNU Lesser General Public |
||
12 | * License as published by the Free Software Foundation; either |
||
13 | * version 2.1 of the License, or (at your option) any later version. |
||
14 | * |
||
15 | * FFmpeg is distributed in the hope that it will be useful, |
||
16 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
17 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
18 | * Lesser General Public License for more details. |
||
19 | * |
||
20 | * You should have received a copy of the GNU Lesser General Public |
||
21 | * License along with FFmpeg; if not, write to the Free Software |
||
22 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
23 | */ |
||
24 | |||
25 | #include "libavutil/attributes.h" |
||
26 | #include "libavutil/common.h" |
||
27 | #include "libavutil/cpu.h" |
||
28 | #include "libavutil/x86/asm.h" |
||
29 | #include "libavutil/x86/cpu.h" |
||
30 | #include "libavcodec/cavsdsp.h" |
||
31 | #include "constants.h" |
||
32 | #include "dsputil_x86.h" |
||
33 | #include "config.h" |
||
34 | |||
35 | #if HAVE_MMX_INLINE |
||
36 | |||
/* in/out: mma = mma + mmb, mmb = mmb - mma
 * Butterfly on two packed-word MMX registers, done without a spare
 * register: b is doubled first so that subtracting the new a (= a+b)
 * yields b - a_old. */
#define SUMSUB_BA( a, b ) \
    "paddw "#b", "#a" \n\t"\
    "paddw "#b", "#b" \n\t"\
    "psubw "#a", "#b" \n\t"
||
42 | |||
43 | /***************************************************************************** |
||
44 | * |
||
45 | * inverse transform |
||
46 | * |
||
47 | ****************************************************************************/ |
||
48 | |||
/**
 * One 1-D pass of the AVS 8x8 inverse transform over four columns of
 * coefficients (each MMX register holds 4 int16 lanes of one row of
 * the 8x16-bit block at *block).
 *
 * The odd part (rows 1,3,5,7) is built first into b4..b7, then the
 * even part (rows 0,2,4,6) with @bias folded in, and the final
 * butterflies leave the eight result rows in registers:
 *   mm7=dst0 mm5=dst1 mm3=dst2 mm1=dst3 mm0=dst4 mm2=dst5 mm4=dst6 mm6=dst7
 * The caller is responsible for shifting and storing them.
 *
 * @param block pointer to 8 rows of 8 int16 coefficients (row stride 16 bytes)
 * @param bias  packed-word rounding constant added to the even part
 */
static inline void cavs_idct8_1d(int16_t *block, uint64_t bias)
{
    __asm__ volatile(
        /* odd part: inputs src1, src3, src5, src7 */
        "movq 112(%0), %%mm4   \n\t" /* mm4 = src7 */
        "movq  16(%0), %%mm5   \n\t" /* mm5 = src1 */
        "movq  80(%0), %%mm2   \n\t" /* mm2 = src5 */
        "movq  48(%0), %%mm7   \n\t" /* mm7 = src3 */
        "movq  %%mm4, %%mm0    \n\t"
        "movq  %%mm5, %%mm3    \n\t"
        "movq  %%mm2, %%mm6    \n\t"
        "movq  %%mm7, %%mm1    \n\t"

        "paddw %%mm4, %%mm4    \n\t" /* mm4 = 2*src7 */
        "paddw %%mm3, %%mm3    \n\t" /* mm3 = 2*src1 */
        "paddw %%mm6, %%mm6    \n\t" /* mm6 = 2*src5 */
        "paddw %%mm1, %%mm1    \n\t" /* mm1 = 2*src3 */
        "paddw %%mm4, %%mm0    \n\t" /* mm0 = 3*src7 */
        "paddw %%mm3, %%mm5    \n\t" /* mm5 = 3*src1 */
        "paddw %%mm6, %%mm2    \n\t" /* mm2 = 3*src5 */
        "paddw %%mm1, %%mm7    \n\t" /* mm7 = 3*src3 */
        "psubw %%mm4, %%mm5    \n\t" /* mm5 = 3*src1 - 2*src7 = a0 */
        "paddw %%mm6, %%mm7    \n\t" /* mm7 = 3*src3 + 2*src5 = a1 */
        "psubw %%mm2, %%mm1    \n\t" /* mm1 = 2*src3 - 3*src5 = a2 */
        "paddw %%mm0, %%mm3    \n\t" /* mm3 = 2*src1 + 3*src7 = a3 */

        /* combine a0..a3 into the odd-part outputs b4..b7 */
        "movq  %%mm5, %%mm4    \n\t"
        "movq  %%mm7, %%mm6    \n\t"
        "movq  %%mm3, %%mm0    \n\t"
        "movq  %%mm1, %%mm2    \n\t"
        SUMSUB_BA( %%mm7, %%mm5 )    /* mm7 = a0 + a1  mm5 = a0 - a1 */
        "paddw %%mm3, %%mm7    \n\t" /* mm7 = a0 + a1 + a3 */
        "paddw %%mm1, %%mm5    \n\t" /* mm5 = a0 - a1 + a2 */
        "paddw %%mm7, %%mm7    \n\t"
        "paddw %%mm5, %%mm5    \n\t"
        "paddw %%mm6, %%mm7    \n\t" /* mm7 = b4 */
        "paddw %%mm4, %%mm5    \n\t" /* mm5 = b5 */

        SUMSUB_BA( %%mm1, %%mm3 )    /* mm1 = a3 + a2  mm3 = a3 - a2 */
        "psubw %%mm1, %%mm4    \n\t" /* mm4 = a0 - a2 - a3 */
        "movq  %%mm4, %%mm1    \n\t" /* mm1 = a0 - a2 - a3 */
        "psubw %%mm6, %%mm3    \n\t" /* mm3 = a3 - a2 - a1 */
        "paddw %%mm1, %%mm1    \n\t"
        "paddw %%mm3, %%mm3    \n\t"
        "psubw %%mm2, %%mm1    \n\t" /* mm1 = b7 */
        "paddw %%mm0, %%mm3    \n\t" /* mm3 = b6 */

        /* even part: inputs src2, src6 */
        "movq  32(%0), %%mm2   \n\t" /* mm2 = src2 */
        "movq  96(%0), %%mm6   \n\t" /* mm6 = src6 */
        "movq  %%mm2, %%mm4    \n\t"
        "movq  %%mm6, %%mm0    \n\t"
        "psllw $2,    %%mm4    \n\t" /* mm4 = 4*src2 */
        "psllw $2,    %%mm6    \n\t" /* mm6 = 4*src6 */
        "paddw %%mm4, %%mm2    \n\t" /* mm2 = 5*src2 */
        "paddw %%mm6, %%mm0    \n\t" /* mm0 = 5*src6 */
        "paddw %%mm2, %%mm2    \n\t"
        "paddw %%mm0, %%mm0    \n\t"
        "psubw %%mm0, %%mm4    \n\t" /* mm4 = 4*src2 - 10*src6 = a7 */
        "paddw %%mm2, %%mm6    \n\t" /* mm6 = 4*src6 + 10*src2 = a6 */

        /* even part: inputs src0, src4, plus the rounding bias */
        "movq    (%0), %%mm2   \n\t" /* mm2 = src0 */
        "movq  64(%0), %%mm0   \n\t" /* mm0 = src4 */
        SUMSUB_BA( %%mm0, %%mm2 )    /* mm0 = src0+src4  mm2 = src0-src4 */
        "psllw $3, %%mm0       \n\t"
        "psllw $3, %%mm2       \n\t"
        "paddw %1, %%mm0       \n\t" /* add rounding bias */
        "paddw %1, %%mm2       \n\t" /* add rounding bias */

        /* final butterflies: combine even (a4..a7) and odd (b4..b7) parts */
        SUMSUB_BA( %%mm6, %%mm0 )    /* mm6 = a4 + a6  mm0 = a4 - a6 */
        SUMSUB_BA( %%mm4, %%mm2 )    /* mm4 = a5 + a7  mm2 = a5 - a7 */
        SUMSUB_BA( %%mm7, %%mm6 )    /* mm7 = dst0  mm6 = dst7 */
        SUMSUB_BA( %%mm5, %%mm4 )    /* mm5 = dst1  mm4 = dst6 */
        SUMSUB_BA( %%mm3, %%mm2 )    /* mm3 = dst2  mm2 = dst5 */
        SUMSUB_BA( %%mm1, %%mm0 )    /* mm1 = dst3  mm0 = dst4 */
        :: "r"(block), "m"(bias)
    );
}
||
125 | |||
/* Interleave low/high halves of a and b at granularity n (wd = words,
 * dq = dwords); t receives the high-half interleave, a the low-half.
 * m selects the mov size suffix (q for MMX). */
#define SBUTTERFLY(a,b,t,n,m)\
    "mov" #m " " #a ", " #t "   \n\t" /* abcd */\
    "punpckl" #n " " #b ", " #a "   \n\t" /* aebf */\
    "punpckh" #n " " #b ", " #t "   \n\t" /* cgdh */\

/* 4x4 transpose of word lanes across registers a,b,c,d using t as
 * scratch. NOTE(review): outputs end up permuted across the input
 * registers (see per-line comments), callers store them accordingly. */
#define TRANSPOSE4(a,b,c,d,t)\
    SBUTTERFLY(a,b,t,wd,q) /* a=aebf t=cgdh */\
    SBUTTERFLY(c,d,b,wd,q) /* c=imjn b=kolp */\
    SBUTTERFLY(a,c,d,dq,q) /* a=aeim d=bfjn */\
    SBUTTERFLY(t,b,c,dq,q) /* t=cgko c=dhlp */
||
136 | |||
/**
 * Full 8x8 AVS inverse transform plus reconstruction:
 * two horizontal 1-D passes (left/right 4-column halves, >>3 with
 * bias ff_pw_4) transposed into the temporary b2, then two vertical
 * 1-D passes (>>7 with bias ff_pw_64) written back to b2, and finally
 * the residual in b2 is added to dst with clamping.
 *
 * @param dst    destination pixel block (uint8_t, row pitch = stride)
 * @param block  8x8 int16 coefficient block (input)
 * @param stride destination line stride in bytes
 */
static void cavs_idct8_add_mmx(uint8_t *dst, int16_t *block, int stride)
{
    int i;
    DECLARE_ALIGNED(8, int16_t, b2)[64];

    /* first (row) pass: operate on 4-column halves, transpose into b2 */
    for(i=0; i<2; i++){
        DECLARE_ALIGNED(8, uint64_t, tmp); /* spill slot for mm7 during the transpose */

        cavs_idct8_1d(block+4*i, ff_pw_4.a);

        __asm__ volatile(
            /* descale the 1-D result by >>3 */
            "psraw     $3, %%mm7  \n\t"
            "psraw     $3, %%mm6  \n\t"
            "psraw     $3, %%mm5  \n\t"
            "psraw     $3, %%mm4  \n\t"
            "psraw     $3, %%mm3  \n\t"
            "psraw     $3, %%mm2  \n\t"
            "psraw     $3, %%mm1  \n\t"
            "psraw     $3, %%mm0  \n\t"
            /* transpose the even rows, spilling mm7 (dst0) to tmp */
            "movq   %%mm7,    %0  \n\t"
            TRANSPOSE4( %%mm0, %%mm2, %%mm4, %%mm6, %%mm7 )
            "movq   %%mm0,  8(%1) \n\t"
            "movq   %%mm6, 24(%1) \n\t"
            "movq   %%mm7, 40(%1) \n\t"
            "movq   %%mm4, 56(%1) \n\t"
            /* restore dst0 and transpose the odd rows */
            "movq      %0, %%mm7  \n\t"
            TRANSPOSE4( %%mm7, %%mm5, %%mm3, %%mm1, %%mm0 )
            "movq   %%mm7,   (%1) \n\t"
            "movq   %%mm1, 16(%1) \n\t"
            "movq   %%mm0, 32(%1) \n\t"
            "movq   %%mm3, 48(%1) \n\t"
            : "=m"(tmp)
            : "r"(b2+32*i)
            : "memory"
        );
    }

    /* second (column) pass on the transposed data, in place in b2 */
    for(i=0; i<2; i++){
        cavs_idct8_1d(b2+4*i, ff_pw_64.a);

        __asm__ volatile(
            /* descale by >>7 and store rows in dst0..dst7 order */
            "psraw     $7, %%mm7  \n\t"
            "psraw     $7, %%mm6  \n\t"
            "psraw     $7, %%mm5  \n\t"
            "psraw     $7, %%mm4  \n\t"
            "psraw     $7, %%mm3  \n\t"
            "psraw     $7, %%mm2  \n\t"
            "psraw     $7, %%mm1  \n\t"
            "psraw     $7, %%mm0  \n\t"
            "movq   %%mm7,    (%0)  \n\t"
            "movq   %%mm5,  16(%0)  \n\t"
            "movq   %%mm3,  32(%0)  \n\t"
            "movq   %%mm1,  48(%0)  \n\t"
            "movq   %%mm0,  64(%0)  \n\t"
            "movq   %%mm2,  80(%0)  \n\t"
            "movq   %%mm4,  96(%0)  \n\t"
            "movq   %%mm6, 112(%0)  \n\t"
            :: "r"(b2+4*i)
            : "memory"
        );
    }

    /* add the clamped residual to the destination pixels */
    ff_add_pixels_clamped_mmx(b2, dst, stride);
}
||
201 | |||
202 | #endif /* HAVE_MMX_INLINE */ |
||
203 | |||
204 | #if (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) |
||
205 | |||
206 | /***************************************************************************** |
||
207 | * |
||
208 | * motion compensation |
||
209 | * |
||
210 | ****************************************************************************/ |
||
211 | |||
212 | /* vertical filter [-1 -2 96 42 -7 0] */ |
||
/* vertical filter [-1 -2 96 42 -7 0]
 * One output row of the quarter-pel vertical filter. A..F are the six
 * source rows in MMX registers (F is loaded fresh from *src here);
 * %4 = rounding add, %5 = MUL1 (96), MUL2 = 42 (via MANGLE).
 * The -7 tap is synthesized as -(8*E) + E>>3-less trick: E is shifted,
 * subtracted, then its >>3 remainder re-added (see psllw/psraw pair).
 * OP stores the packed result to *dst and advances dst by dstStride. */
#define QPEL_CAVSV1(A,B,C,D,E,F,OP,MUL2) \
    "movd (%0), "#F"            \n\t"\
    "movq "#C", %%mm6           \n\t"\
    "pmullw %5, %%mm6           \n\t" /* C * 96 */\
    "movq "#D", %%mm7           \n\t"\
    "pmullw "MANGLE(MUL2)", %%mm7\n\t" /* D * 42 */\
    "psllw $3, "#E"             \n\t"\
    "psubw "#E", %%mm6          \n\t"\
    "psraw $3, "#E"             \n\t"\
    "paddw %%mm7, %%mm6         \n\t"\
    "paddw "#E", %%mm6          \n\t" /* net effect: -7*E */\
    "paddw "#B", "#B"           \n\t"\
    "pxor %%mm7, %%mm7          \n\t"\
    "add %2, %0                 \n\t"\
    "punpcklbw %%mm7, "#F"      \n\t" /* widen new row to words */\
    "psubw "#B", %%mm6          \n\t" /* -2*B */\
    "psraw $1, "#B"             \n\t" /* restore B */\
    "psubw "#A", %%mm6          \n\t" /* -1*A */\
    "paddw %4, %%mm6            \n\t" /* rounding */\
    "psraw $7, %%mm6            \n\t"\
    "packuswb %%mm6, %%mm6      \n\t"\
    OP(%%mm6, (%1), A, d)            \
    "add %3, %1                 \n\t"
||
236 | |||
237 | /* vertical filter [ 0 -1 5 5 -1 0] */ |
||
/* vertical filter [ 0 -1 5 5 -1 0]
 * Half-pel vertical filter row: (C+D)*5 - B - E, round, >>3.
 * %4 = rounding add (4), %5 = 5; MUL2 is unused by this variant.
 * Assumes mm7 is already zero on entry (set by QPEL_CAVSVNUM). */
#define QPEL_CAVSV2(A,B,C,D,E,F,OP,MUL2) \
    "movd (%0), "#F"            \n\t"\
    "movq "#C", %%mm6           \n\t"\
    "paddw "#D", %%mm6          \n\t"\
    "pmullw %5, %%mm6           \n\t" /* (C+D)*5 */\
    "add %2, %0                 \n\t"\
    "punpcklbw %%mm7, "#F"      \n\t" /* widen new row to words */\
    "psubw "#B", %%mm6          \n\t"\
    "psubw "#E", %%mm6          \n\t"\
    "paddw %4, %%mm6            \n\t" /* rounding */\
    "psraw $3, %%mm6            \n\t"\
    "packuswb %%mm6, %%mm6      \n\t"\
    OP(%%mm6, (%1), A, d)            \
    "add %3, %1                 \n\t"
||
252 | |||
253 | /* vertical filter [ 0 -7 42 96 -2 -1] */ |
||
/* vertical filter [ 0 -7 42 96 -2 -1]
 * Mirror image of QPEL_CAVSV1 (three-quarter-pel position):
 * C*42 + D*96 - 7*B - 2*E - F, round, >>7.
 * %4 = rounding add, %5 = MUL1 (96), MUL2 = 42 (via MANGLE).
 * The -7*B tap uses the same psllw/psraw synthesis as QPEL_CAVSV1. */
#define QPEL_CAVSV3(A,B,C,D,E,F,OP,MUL2) \
    "movd (%0), "#F"            \n\t"\
    "movq "#C", %%mm6           \n\t"\
    "pmullw "MANGLE(MUL2)", %%mm6\n\t" /* C * 42 */\
    "movq "#D", %%mm7           \n\t"\
    "pmullw %5, %%mm7           \n\t" /* D * 96 */\
    "psllw $3, "#B"             \n\t"\
    "psubw "#B", %%mm6          \n\t"\
    "psraw $3, "#B"             \n\t"\
    "paddw %%mm7, %%mm6         \n\t"\
    "paddw "#B", %%mm6          \n\t" /* net effect: -7*B */\
    "paddw "#E", "#E"           \n\t"\
    "pxor %%mm7, %%mm7          \n\t"\
    "add %2, %0                 \n\t"\
    "punpcklbw %%mm7, "#F"      \n\t" /* widen new row to words */\
    "psubw "#E", %%mm6          \n\t" /* -2*E */\
    "psraw $1, "#E"             \n\t" /* restore E */\
    "psubw "#F", %%mm6          \n\t" /* -1*F */\
    "paddw %4, %%mm6            \n\t" /* rounding */\
    "psraw $7, %%mm6            \n\t"\
    "packuswb %%mm6, %%mm6      \n\t"\
    OP(%%mm6, (%1), A, d)            \
    "add %3, %1                 \n\t"
||
277 | |||
278 | |||
/* Vertical filter driver shared by the v1/v2/v3 variants.
 * Expands into the body of a qpel8or16_v* function: processes a block
 * 4 pixels wide in two passes (w=2), priming 5 source rows, then
 * emitting 8 output rows per asm block (a second asm block extends to
 * 16 rows when h==16). VOP is one of the QPEL_CAVSV* row macros, OP the
 * store/average op, ADD the rounding constant, MUL1/MUL2 filter taps.
 * Register roles inside the asm: %0=src (eax), %1=dst (ecx),
 * %2=srcStride, %3=dstStride, %4=ADD, %5=MUL1.
 * Relies on `src`, `dst`, `srcStride`, `dstStride`, `h` being in scope
 * at the expansion site. */
#define QPEL_CAVSVNUM(VOP,OP,ADD,MUL1,MUL2)\
    int w= 2;\
    src -= 2*srcStride;\
    \
    while(w--){\
      __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        /* prime the pipeline: load and widen rows -2..+2 */\
        "movd (%0), %%mm0           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm1           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm2           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm3           \n\t"\
        "add %2, %0                 \n\t"\
        "movd (%0), %%mm4           \n\t"\
        "add %2, %0                 \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpcklbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpcklbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm4     \n\t"\
        /* 8 output rows; register roles rotate each step */\
        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
        VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
        VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
        VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
        VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
        VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
        VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
        \
        : "+a"(src), "+c"(dst)\
        : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
        : "memory"\
     );\
     if(h==16){\
        /* 8 more rows, continuing the register rotation from above */\
        __asm__ volatile(\
            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
            VOP(%%mm4, %%mm5, %%mm0, %%mm1, %%mm2, %%mm3, OP, MUL2)\
            VOP(%%mm5, %%mm0, %%mm1, %%mm2, %%mm3, %%mm4, OP, MUL2)\
            VOP(%%mm0, %%mm1, %%mm2, %%mm3, %%mm4, %%mm5, OP, MUL2)\
            VOP(%%mm1, %%mm2, %%mm3, %%mm4, %%mm5, %%mm0, OP, MUL2)\
            VOP(%%mm2, %%mm3, %%mm4, %%mm5, %%mm0, %%mm1, OP, MUL2)\
            VOP(%%mm3, %%mm4, %%mm5, %%mm0, %%mm1, %%mm2, OP, MUL2)\
            \
           : "+a"(src), "+c"(dst)\
           : "S"((x86_reg)srcStride), "r"((x86_reg)dstStride), "m"(ADD), "m"(MUL1)\
           : "memory"\
        );\
     }\
     /* step 4 pixels right, rewind rows for the next column pass */\
     src += 4-(h+5)*srcStride;\
     dst += 4-h*dstStride;\
    }
||
333 | |||
/* Instantiates the full set of CAVS qpel filter functions for one
 * store op (OPNAME/OP = put_/avg_, MMX = ISA suffix):
 *  - the 8-wide horizontal [-1 5 5 -1] filter,
 *  - the three 8-or-16-row vertical filters (v1/v2/v3),
 *  - 8/16-wide public wrappers over each. */
#define QPEL_CAVS(OPNAME, OP, MMX)\
/* 8-wide horizontal half-pel filter: ((l+r)*5 - ll - rr + 4) >> 3,
 * one row per loop iteration. %5 = ff_pw_5, %6 = ff_pw_4. */\
static void OPNAME ## cavs_qpel8_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    int h=8;\
    __asm__ volatile(\
        "pxor %%mm7, %%mm7          \n\t"\
        "movq %5, %%mm6             \n\t"\
        "1:                         \n\t"\
        /* center taps: (src[0] + src[1]) * 5, low and high halves */\
        "movq    (%0), %%mm0        \n\t"\
        "movq   1(%0), %%mm2        \n\t"\
        "movq %%mm0, %%mm1          \n\t"\
        "movq %%mm2, %%mm3          \n\t"\
        "punpcklbw %%mm7, %%mm0     \n\t"\
        "punpckhbw %%mm7, %%mm1     \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "paddw %%mm2, %%mm0         \n\t"\
        "paddw %%mm3, %%mm1         \n\t"\
        "pmullw %%mm6, %%mm0        \n\t"\
        "pmullw %%mm6, %%mm1        \n\t"\
        /* outer taps: -(src[-1] + src[2]) */\
        "movq  -1(%0), %%mm2        \n\t"\
        "movq   2(%0), %%mm4        \n\t"\
        "movq %%mm2, %%mm3          \n\t"\
        "movq %%mm4, %%mm5          \n\t"\
        "punpcklbw %%mm7, %%mm2     \n\t"\
        "punpckhbw %%mm7, %%mm3     \n\t"\
        "punpcklbw %%mm7, %%mm4     \n\t"\
        "punpckhbw %%mm7, %%mm5     \n\t"\
        "paddw %%mm4, %%mm2         \n\t"\
        "paddw %%mm3, %%mm5         \n\t"\
        "psubw %%mm2, %%mm0         \n\t"\
        "psubw %%mm5, %%mm1         \n\t"\
        /* round, shift, pack, store/average */\
        "movq %6, %%mm5             \n\t"\
        "paddw %%mm5, %%mm0         \n\t"\
        "paddw %%mm5, %%mm1         \n\t"\
        "psraw $3, %%mm0            \n\t"\
        "psraw $3, %%mm1            \n\t"\
        "packuswb %%mm1, %%mm0      \n\t"\
        OP(%%mm0, (%1),%%mm5, q)         \
        "add %3, %0                 \n\t"\
        "add %4, %1                 \n\t"\
        "decl %2                    \n\t"\
        " jnz 1b                    \n\t"\
        : "+a"(src), "+c"(dst), "+m"(h)\
        : "d"((x86_reg)srcStride), "S"((x86_reg)dstStride), "m"(ff_pw_5), "m"(ff_pw_4)\
        : "memory"\
    );\
}\
\
/* vertical quarter-pel (d position), taps [-1 -2 96 42 -7 0] */\
static inline void OPNAME ## cavs_qpel8or16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  QPEL_CAVSVNUM(QPEL_CAVSV1,OP,ff_pw_64,ff_pw_96,ff_pw_42)      \
}\
\
/* vertical half-pel (h position), taps [0 -1 5 5 -1 0] */\
static inline void OPNAME ## cavs_qpel8or16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  QPEL_CAVSVNUM(QPEL_CAVSV2,OP,ff_pw_4,ff_pw_5,ff_pw_5)         \
}\
\
/* vertical three-quarter-pel (n position), taps [0 -7 42 96 -2 -1] */\
static inline void OPNAME ## cavs_qpel8or16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride, int h){\
  QPEL_CAVSVNUM(QPEL_CAVSV3,OP,ff_pw_64,ff_pw_96,ff_pw_42)      \
}\
\
static void OPNAME ## cavs_qpel8_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
/* 16-wide = two side-by-side 8-wide passes */\
static void OPNAME ## cavs_qpel16_v1_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v1_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v2_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v2_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
static void OPNAME ## cavs_qpel8_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst  , src  , dstStride, srcStride, 8);\
}\
static void OPNAME ## cavs_qpel16_v3_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst  , src  , dstStride, srcStride, 16);\
    OPNAME ## cavs_qpel8or16_v3_ ## MMX(dst+8, src+8, dstStride, srcStride, 16);\
}\
\
/* 16x16 horizontal = four 8x8 horizontal passes (2 wide x 2 high) */\
static void OPNAME ## cavs_qpel16_h_ ## MMX(uint8_t *dst, uint8_t *src, int dstStride, int srcStride){\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
    src += 8*srcStride;\
    dst += 8*dstStride;\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst  , src  , dstStride, srcStride);\
    OPNAME ## cavs_qpel8_h_ ## MMX(dst+8, src+8, dstStride, srcStride);\
}\
||
426 | |||
/* Instantiates the public motion-compensation entry points for one
 * OPNAME (put_/avg_), block SIZE (8/16) and ISA suffix MMX.
 * mc20 = horizontal half-pel; mc01/mc02/mc03 = vertical quarter-,
 * half- and three-quarter-pel, matching the pixels_tab indexing used
 * by DSPFUNC below. */
#define CAVS_MC(OPNAME, SIZE, MMX) \
static void OPNAME ## cavs_qpel ## SIZE ## _mc20_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## cavs_qpel ## SIZE ## _h_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc01_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## cavs_qpel ## SIZE ## _v1_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc02_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## cavs_qpel ## SIZE ## _v2_ ## MMX(dst, src, stride, stride);\
}\
\
static void OPNAME ## cavs_qpel ## SIZE ## _mc03_ ## MMX(uint8_t *dst, uint8_t *src, ptrdiff_t stride)\
{\
    OPNAME ## cavs_qpel ## SIZE ## _v3_ ## MMX(dst, src, stride, stride);\
}\
||
447 | |||
/* Store ops plugged into the filter macros: a = result register,
 * b = destination memory operand, temp = scratch register,
 * size = mov suffix (d/q). PUT writes; the AVG variants average with
 * the existing destination first (pavgusb on 3DNow!, pavgb on MMXEXT). */
#define PUT_OP(a,b,temp, size) "mov" #size " " #a ", " #b "    \n\t"
#define AVG_3DNOW_OP(a,b,temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgusb " #temp ", " #a "        \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
#define AVG_MMXEXT_OP(a, b, temp, size) \
"mov" #size " " #b ", " #temp "   \n\t"\
"pavgb " #temp ", " #a "          \n\t"\
"mov" #size " " #a ", " #b "      \n\t"
||
457 | |||
458 | #endif /* (HAVE_MMXEXT_INLINE || HAVE_AMD3DNOW_INLINE) */ |
||
459 | |||
460 | #if HAVE_MMX_INLINE |
||
/* mc00 = full-pel position: plain copy/average, no filtering.
 * Thin wrappers adapting the dsputil pixel helpers to the
 * cavs_qpel_pixels_tab signature. */
static void put_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src,
                                    ptrdiff_t stride)
{
    ff_put_pixels8_mmx(dst, src, stride, 8);
}

static void avg_cavs_qpel8_mc00_mmx(uint8_t *dst, uint8_t *src,
                                    ptrdiff_t stride)
{
    ff_avg_pixels8_mmx(dst, src, stride, 8);
}

static void put_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride)
{
    ff_put_pixels16_mmx(dst, src, stride, 16);
}

static void avg_cavs_qpel16_mc00_mmx(uint8_t *dst, uint8_t *src,
                                     ptrdiff_t stride)
{
    ff_avg_pixels16_mmx(dst, src, stride, 16);
}
||
484 | |||
/* Install the plain-MMX entry points: full-pel copy/average (index 0
 * of each pixels_tab) and the 8x8 idct+add, with the transpose IDCT
 * coefficient permutation the MMX idct expects. */
static av_cold void cavsdsp_init_mmx(CAVSDSPContext *c,
                                     AVCodecContext *avctx)
{
    c->put_cavs_qpel_pixels_tab[0][0] = put_cavs_qpel16_mc00_mmx;
    c->put_cavs_qpel_pixels_tab[1][0] = put_cavs_qpel8_mc00_mmx;
    c->avg_cavs_qpel_pixels_tab[0][0] = avg_cavs_qpel16_mc00_mmx;
    c->avg_cavs_qpel_pixels_tab[1][0] = avg_cavs_qpel8_mc00_mmx;

    c->cavs_idct8_add = cavs_idct8_add_mmx;
    c->idct_perm = FF_TRANSPOSE_IDCT_PERM;
}
||
496 | #endif /* HAVE_MMX_INLINE */ |
||
497 | |||
/* Wire one put/avg table (IDX 0 = 16x16, 1 = 8x8) to the mc20/mc01/
 * mc02/mc03 functions generated by CAVS_MC for extension EXT.
 * Table slots 2/4/8/12 correspond to those subpel positions. */
#define DSPFUNC(PFX, IDX, NUM, EXT)                                     \
    c->PFX ## _cavs_qpel_pixels_tab[IDX][ 2] = PFX ## _cavs_qpel ## NUM ## _mc20_ ## EXT; \
    c->PFX ## _cavs_qpel_pixels_tab[IDX][ 4] = PFX ## _cavs_qpel ## NUM ## _mc01_ ## EXT; \
    c->PFX ## _cavs_qpel_pixels_tab[IDX][ 8] = PFX ## _cavs_qpel ## NUM ## _mc02_ ## EXT; \
    c->PFX ## _cavs_qpel_pixels_tab[IDX][12] = PFX ## _cavs_qpel ## NUM ## _mc03_ ## EXT; \
503 | |||
#if HAVE_MMXEXT_INLINE
/* Generate the MMXEXT filter set (put uses plain stores, avg uses
 * pavgb) and the corresponding mc20/mc01/mc02/mc03 entry points. */
QPEL_CAVS(put_,        PUT_OP, mmxext)
QPEL_CAVS(avg_, AVG_MMXEXT_OP, mmxext)

CAVS_MC(put_,  8, mmxext)
CAVS_MC(put_, 16, mmxext)
CAVS_MC(avg_,  8, mmxext)
CAVS_MC(avg_, 16, mmxext)

/* Install the MMXEXT subpel filters into both size tables. */
static av_cold void cavsdsp_init_mmxext(CAVSDSPContext *c,
                                        AVCodecContext *avctx)
{
    DSPFUNC(put, 0, 16, mmxext);
    DSPFUNC(put, 1,  8, mmxext);
    DSPFUNC(avg, 0, 16, mmxext);
    DSPFUNC(avg, 1,  8, mmxext);
}
#endif /* HAVE_MMXEXT_INLINE */
||
522 | |||
#if HAVE_AMD3DNOW_INLINE
/* Generate the AMD 3DNow! filter set (avg uses pavgusb) and the
 * corresponding mc20/mc01/mc02/mc03 entry points. */
QPEL_CAVS(put_,       PUT_OP, 3dnow)
QPEL_CAVS(avg_, AVG_3DNOW_OP, 3dnow)

CAVS_MC(put_,  8, 3dnow)
CAVS_MC(put_, 16, 3dnow)
CAVS_MC(avg_,  8, 3dnow)
CAVS_MC(avg_, 16, 3dnow)

/* Install the 3DNow! subpel filters into both size tables. */
static av_cold void cavsdsp_init_3dnow(CAVSDSPContext *c,
                                       AVCodecContext *avctx)
{
    DSPFUNC(put, 0, 16, 3dnow);
    DSPFUNC(put, 1,  8, 3dnow);
    DSPFUNC(avg, 0, 16, 3dnow);
    DSPFUNC(avg, 1,  8, 3dnow);
}
#endif /* HAVE_AMD3DNOW_INLINE */
||
541 | |||
/* Public init: probe the CPU once and layer in MMX, then MMXEXT, then
 * 3DNow! function pointers as available (later inits override earlier
 * table slots only where they define faster versions). */
av_cold void ff_cavsdsp_init_x86(CAVSDSPContext *c, AVCodecContext *avctx)
{
#if HAVE_MMX_INLINE
    int cpu_flags = av_get_cpu_flags();

    if (INLINE_MMX(cpu_flags))
        cavsdsp_init_mmx(c, avctx);
#endif /* HAVE_MMX_INLINE */
#if HAVE_MMXEXT_INLINE
    if (INLINE_MMXEXT(cpu_flags))
        cavsdsp_init_mmxext(c, avctx);
#endif /* HAVE_MMXEXT_INLINE */
#if HAVE_AMD3DNOW_INLINE
    if (INLINE_AMD3DNOW(cpu_flags))
        cavsdsp_init_3dnow(c, avctx);
#endif /* HAVE_AMD3DNOW_INLINE */
}
||
558 | }2;>2;> |