Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
6148 | serge | 1 | /* |
2 | * Optimized for ia32 CPUs by Nick Kurshev |
||
3 | * h263, mpeg1, mpeg2 dequantizer & draw_edges by Michael Niedermayer |
||
4 | * |
||
5 | * This file is part of FFmpeg. |
||
6 | * |
||
7 | * FFmpeg is free software; you can redistribute it and/or |
||
8 | * modify it under the terms of the GNU Lesser General Public |
||
9 | * License as published by the Free Software Foundation; either |
||
10 | * version 2.1 of the License, or (at your option) any later version. |
||
11 | * |
||
12 | * FFmpeg is distributed in the hope that it will be useful, |
||
13 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
14 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
15 | * Lesser General Public License for more details. |
||
16 | * |
||
17 | * You should have received a copy of the GNU Lesser General Public |
||
18 | * License along with FFmpeg; if not, write to the Free Software |
||
19 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
20 | */ |
||
21 | |||
22 | #include "libavutil/attributes.h" |
||
23 | #include "libavutil/cpu.h" |
||
24 | #include "libavutil/x86/asm.h" |
||
25 | #include "libavutil/x86/cpu.h" |
||
26 | #include "libavcodec/avcodec.h" |
||
27 | #include "libavcodec/mpegvideo.h" |
||
28 | #include "dsputil_x86.h" |
||
29 | |||
30 | #if HAVE_MMX_INLINE |
||
31 | |||
/**
 * Dequantize an H.263 intra block using MMX inline assembly.
 *
 * For every nonzero coefficient the asm loop computes
 *     sign(c) * (|c| * qmul + qadd)
 * while zero coefficients stay zero.  The DC coefficient (block[0]) is
 * computed separately in C — scaled by the luma/chroma DC scale unless
 * AIC (advanced intra coding) is active — and written back after the loop.
 *
 * @param s      codec context (reads h263_aic, ac_pred, scale tables)
 * @param block  coefficient block, modified in place
 * @param n      block index (0-3 luma, 4+ chroma)
 * @param qscale quantizer scale
 */
static void dct_unquantize_h263_intra_mmx(MpegEncContext *s,
                                          int16_t *block, int n, int qscale)
{
    x86_reg level, qmul, qadd, nCoeffs;

    qmul = qscale << 1;

    av_assert2(s->block_last_index[n] >= 0 || s->h263_aic);

    if (!s->h263_aic) {
        /* blocks 0-3 are luma, the rest chroma: pick the matching DC scale */
        if (n < 4)
            level = block[0] * s->y_dc_scale;
        else
            level = block[0] * s->c_dc_scale;
        qadd = (qscale - 1) | 1; /* force qadd odd */
    } else {
        qadd  = 0;
        level = block[0];
    }
    if (s->ac_pred)
        nCoeffs = 63; /* with AC prediction any of the 64 coeffs may be set */
    else
        nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];

    /* Loop runs from -2*nCoeffs up to 0 in 16-byte steps, i.e. over
     * block[0..nCoeffs] (8 coefficients per iteration). */
    __asm__ volatile(
        "movd %1, %%mm6                 \n\t" //qmul
        "packssdw %%mm6, %%mm6          \n\t"
        "packssdw %%mm6, %%mm6          \n\t"
        "movd %2, %%mm5                 \n\t" //qadd
        "pxor %%mm7, %%mm7              \n\t"
        "packssdw %%mm5, %%mm5          \n\t"
        "packssdw %%mm5, %%mm5          \n\t"
        "psubw %%mm5, %%mm7             \n\t" /* mm7 = -qadd in every word */
        "pxor %%mm4, %%mm4              \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%0, %3), %%mm0           \n\t"
        "movq 8(%0, %3), %%mm1          \n\t"

        "pmullw %%mm6, %%mm0            \n\t"
        "pmullw %%mm6, %%mm1            \n\t"

        "movq (%0, %3), %%mm2           \n\t"
        "movq 8(%0, %3), %%mm3          \n\t"

        "pcmpgtw %%mm4, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm4, %%mm3           \n\t" // block[i] < 0 ? -1 : 0

        "pxor %%mm2, %%mm0              \n\t"
        "pxor %%mm3, %%mm1              \n\t"

        "paddw %%mm7, %%mm0             \n\t"
        "paddw %%mm7, %%mm1             \n\t"

        "pxor %%mm0, %%mm2              \n\t"
        "pxor %%mm1, %%mm3              \n\t"

        "pcmpeqw %%mm7, %%mm0           \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw %%mm7, %%mm1           \n\t" // block[i] == 0 ? -1 : 0

        "pandn %%mm2, %%mm0             \n\t"
        "pandn %%mm3, %%mm1             \n\t"

        "movq %%mm0, (%0, %3)           \n\t"
        "movq %%mm1, 8(%0, %3)          \n\t"

        "add $16, %3                    \n\t"
        "jng 1b                         \n\t"
        ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
        : "memory"
    );
    block[0] = level; /* restore the separately-computed DC coefficient */
}
||
105 | |||
106 | |||
/**
 * Dequantize an H.263 inter block using MMX inline assembly.
 *
 * Same kernel as the intra version — every nonzero coefficient becomes
 *     sign(c) * (|c| * qmul + qadd)
 * and zeros stay zero — but with no special DC handling: block[0] is
 * dequantized like any other coefficient.
 *
 * @param s      codec context (reads block_last_index, inter_scantable)
 * @param block  coefficient block, modified in place
 * @param n      block index
 * @param qscale quantizer scale
 */
static void dct_unquantize_h263_inter_mmx(MpegEncContext *s,
                                          int16_t *block, int n, int qscale)
{
    x86_reg qmul, qadd, nCoeffs;

    qmul = qscale << 1;
    qadd = (qscale - 1) | 1; /* force qadd odd */

    av_assert2(s->block_last_index[n] >= 0 || s->h263_aic);

    nCoeffs = s->inter_scantable.raster_end[s->block_last_index[n]];

    /* Loop runs from -2*nCoeffs up to 0 in 16-byte steps, i.e. over
     * block[0..nCoeffs] (8 coefficients per iteration). */
    __asm__ volatile(
        "movd %1, %%mm6                 \n\t" //qmul
        "packssdw %%mm6, %%mm6          \n\t"
        "packssdw %%mm6, %%mm6          \n\t"
        "movd %2, %%mm5                 \n\t" //qadd
        "pxor %%mm7, %%mm7              \n\t"
        "packssdw %%mm5, %%mm5          \n\t"
        "packssdw %%mm5, %%mm5          \n\t"
        "psubw %%mm5, %%mm7             \n\t" /* mm7 = -qadd in every word */
        "pxor %%mm4, %%mm4              \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%0, %3), %%mm0           \n\t"
        "movq 8(%0, %3), %%mm1          \n\t"

        "pmullw %%mm6, %%mm0            \n\t"
        "pmullw %%mm6, %%mm1            \n\t"

        "movq (%0, %3), %%mm2           \n\t"
        "movq 8(%0, %3), %%mm3          \n\t"

        "pcmpgtw %%mm4, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm4, %%mm3           \n\t" // block[i] < 0 ? -1 : 0

        "pxor %%mm2, %%mm0              \n\t"
        "pxor %%mm3, %%mm1              \n\t"

        "paddw %%mm7, %%mm0             \n\t"
        "paddw %%mm7, %%mm1             \n\t"

        "pxor %%mm0, %%mm2              \n\t"
        "pxor %%mm1, %%mm3              \n\t"

        "pcmpeqw %%mm7, %%mm0           \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw %%mm7, %%mm1           \n\t" // block[i] == 0 ? -1 : 0

        "pandn %%mm2, %%mm0             \n\t"
        "pandn %%mm3, %%mm1             \n\t"

        "movq %%mm0, (%0, %3)           \n\t"
        "movq %%mm1, 8(%0, %3)          \n\t"

        "add $16, %3                    \n\t"
        "jng 1b                         \n\t"
        ::"r" (block+nCoeffs), "rm"(qmul), "rm" (qadd), "r" (2*(-nCoeffs))
        : "memory"
    );
}
||
167 | |||
/**
 * Dequantize an MPEG-1 intra block using MMX inline assembly.
 *
 * Each nonzero AC coefficient becomes
 *     sign(c) * odd((|c| * qscale * quant_matrix[i]) >> 3)
 * where the psubw/por pair below forces the magnitude's least significant
 * bit set (MPEG-1 "round toward odd" behaviour).  Zero coefficients stay
 * zero.  The DC coefficient is computed separately in C (scaled by the
 * luma/chroma DC scale) and restored after the loop.
 *
 * @param s      codec context (reads intra_scantable, intra_matrix, DC scales)
 * @param block  coefficient block, modified in place
 * @param n      block index (0-3 luma, 4+ chroma)
 * @param qscale quantizer scale
 */
static void dct_unquantize_mpeg1_intra_mmx(MpegEncContext *s,
                                           int16_t *block, int n, int qscale)
{
    x86_reg nCoeffs;
    const uint16_t *quant_matrix;
    int block0;

    av_assert2(s->block_last_index[n] >= 0);

    nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;

    if (n < 4)
        block0 = block[0] * s->y_dc_scale;
    else
        block0 = block[0] * s->c_dc_scale;
    /* XXX: only mpeg1 */
    quant_matrix = s->intra_matrix;
    __asm__ volatile(
        "pcmpeqw %%mm7, %%mm7           \n\t"
        "psrlw $15, %%mm7               \n\t" /* mm7 = 1 in every word */
        "movd %2, %%mm6                 \n\t"
        "packssdw %%mm6, %%mm6          \n\t"
        "packssdw %%mm6, %%mm6          \n\t" /* mm6 = qscale in every word */
        "mov %3, %%"REG_a"              \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%0, %%"REG_a"), %%mm0    \n\t"
        "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
        "movq (%1, %%"REG_a"), %%mm4    \n\t"
        "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
        "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
        "pxor %%mm2, %%mm2              \n\t"
        "pxor %%mm3, %%mm3              \n\t"
        "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
        "pxor %%mm2, %%mm0              \n\t"
        "pxor %%mm3, %%mm1              \n\t"
        "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
        "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
        "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*q
        "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*q
        "pxor %%mm4, %%mm4              \n\t"
        "pxor %%mm5, %%mm5              \n\t" // FIXME slow
        "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
        "psraw $3, %%mm0                \n\t"
        "psraw $3, %%mm1                \n\t"
        "psubw %%mm7, %%mm0             \n\t" /* subtract 1 ... */
        "psubw %%mm7, %%mm1             \n\t"
        "por %%mm7, %%mm0               \n\t" /* ... then set bit 0: odd result */
        "por %%mm7, %%mm1               \n\t"
        "pxor %%mm2, %%mm0              \n\t" /* re-apply the sign */
        "pxor %%mm3, %%mm1              \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "pandn %%mm0, %%mm4             \n\t" /* keep zeros zero */
        "pandn %%mm1, %%mm5             \n\t"
        "movq %%mm4, (%0, %%"REG_a")    \n\t"
        "movq %%mm5, 8(%0, %%"REG_a")   \n\t"

        "add $16, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
        : "%"REG_a, "memory"
    );
    block[0] = block0; /* restore the separately-computed DC coefficient */
}
||
236 | |||
/**
 * Dequantize an MPEG-1 inter block using MMX inline assembly.
 *
 * Each nonzero coefficient becomes
 *     sign(c) * odd(((2*|c| + 1) * qscale * quant_matrix[i]) >> 4)
 * with the psubw/por pair forcing the magnitude odd (MPEG-1
 * "round toward odd").  Zero coefficients stay zero; there is no special
 * DC handling for inter blocks.
 *
 * @param s      codec context (reads intra_scantable for raster_end,
 *               inter_matrix)
 * @param block  coefficient block, modified in place
 * @param n      block index
 * @param qscale quantizer scale
 */
static void dct_unquantize_mpeg1_inter_mmx(MpegEncContext *s,
                                           int16_t *block, int n, int qscale)
{
    x86_reg nCoeffs;
    const uint16_t *quant_matrix;

    av_assert2(s->block_last_index[n] >= 0);

    nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]] + 1;

    quant_matrix = s->inter_matrix;
    __asm__ volatile(
        "pcmpeqw %%mm7, %%mm7           \n\t"
        "psrlw $15, %%mm7               \n\t" /* mm7 = 1 in every word */
        "movd %2, %%mm6                 \n\t"
        "packssdw %%mm6, %%mm6          \n\t"
        "packssdw %%mm6, %%mm6          \n\t" /* mm6 = qscale in every word */
        "mov %3, %%"REG_a"              \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%0, %%"REG_a"), %%mm0    \n\t"
        "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
        "movq (%1, %%"REG_a"), %%mm4    \n\t"
        "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
        "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
        "pxor %%mm2, %%mm2              \n\t"
        "pxor %%mm3, %%mm3              \n\t"
        "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
        "pxor %%mm2, %%mm0              \n\t"
        "pxor %%mm3, %%mm1              \n\t"
        "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
        "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
        "paddw %%mm0, %%mm0             \n\t" // abs(block[i])*2
        "paddw %%mm1, %%mm1             \n\t" // abs(block[i])*2
        "paddw %%mm7, %%mm0             \n\t" // abs(block[i])*2 + 1
        "paddw %%mm7, %%mm1             \n\t" // abs(block[i])*2 + 1
        "pmullw %%mm4, %%mm0            \n\t" // (abs(block[i])*2 + 1)*q
        "pmullw %%mm5, %%mm1            \n\t" // (abs(block[i])*2 + 1)*q
        "pxor %%mm4, %%mm4              \n\t"
        "pxor %%mm5, %%mm5              \n\t" // FIXME slow
        "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
        "psraw $4, %%mm0                \n\t"
        "psraw $4, %%mm1                \n\t"
        "psubw %%mm7, %%mm0             \n\t" /* subtract 1 ... */
        "psubw %%mm7, %%mm1             \n\t"
        "por %%mm7, %%mm0               \n\t" /* ... then set bit 0: odd result */
        "por %%mm7, %%mm1               \n\t"
        "pxor %%mm2, %%mm0              \n\t" /* re-apply the sign */
        "pxor %%mm3, %%mm1              \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "pandn %%mm0, %%mm4             \n\t" /* keep zeros zero */
        "pandn %%mm1, %%mm5             \n\t"
        "movq %%mm4, (%0, %%"REG_a")    \n\t"
        "movq %%mm5, 8(%0, %%"REG_a")   \n\t"

        "add $16, %%"REG_a"             \n\t"
        "js 1b                          \n\t"
        ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
        : "%"REG_a, "memory"
    );
}
||
302 | |||
/**
 * Dequantize an MPEG-2 intra block using MMX inline assembly.
 *
 * Each nonzero AC coefficient becomes
 *     sign(c) * ((|c| * qscale * quant_matrix[i]) >> 3)
 * (no odd-forcing, unlike the MPEG-1 kernel).  Zero coefficients stay
 * zero.  The DC coefficient is computed separately in C and restored
 * after the loop.  Mismatch control is intentionally omitted here — see
 * the note at the end.
 *
 * @param s      codec context (reads alternate_scan, intra_scantable,
 *               intra_matrix, DC scales)
 * @param block  coefficient block, modified in place
 * @param n      block index (0-3 luma, 4+ chroma)
 * @param qscale quantizer scale
 */
static void dct_unquantize_mpeg2_intra_mmx(MpegEncContext *s,
                                           int16_t *block, int n, int qscale)
{
    x86_reg nCoeffs;
    const uint16_t *quant_matrix;
    int block0;

    av_assert2(s->block_last_index[n] >= 0);

    if (s->alternate_scan) nCoeffs = 63; //FIXME
    else nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];

    if (n < 4)
        block0 = block[0] * s->y_dc_scale;
    else
        block0 = block[0] * s->c_dc_scale;
    quant_matrix = s->intra_matrix;
    __asm__ volatile(
        "pcmpeqw %%mm7, %%mm7           \n\t"
        "psrlw $15, %%mm7               \n\t" /* mm7 = 1 in every word */
        "movd %2, %%mm6                 \n\t"
        "packssdw %%mm6, %%mm6          \n\t"
        "packssdw %%mm6, %%mm6          \n\t" /* mm6 = qscale in every word */
        "mov %3, %%"REG_a"              \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%0, %%"REG_a"), %%mm0    \n\t"
        "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
        "movq (%1, %%"REG_a"), %%mm4    \n\t"
        "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
        "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
        "pxor %%mm2, %%mm2              \n\t"
        "pxor %%mm3, %%mm3              \n\t"
        "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
        "pxor %%mm2, %%mm0              \n\t"
        "pxor %%mm3, %%mm1              \n\t"
        "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
        "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
        "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*q
        "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*q
        "pxor %%mm4, %%mm4              \n\t"
        "pxor %%mm5, %%mm5              \n\t" // FIXME slow
        "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
        "psraw $3, %%mm0                \n\t"
        "psraw $3, %%mm1                \n\t"
        "pxor %%mm2, %%mm0              \n\t" /* re-apply the sign */
        "pxor %%mm3, %%mm1              \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "pandn %%mm0, %%mm4             \n\t" /* keep zeros zero */
        "pandn %%mm1, %%mm5             \n\t"
        "movq %%mm4, (%0, %%"REG_a")    \n\t"
        "movq %%mm5, 8(%0, %%"REG_a")   \n\t"

        "add $16, %%"REG_a"             \n\t"
        "jng 1b                         \n\t"
        ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "g" (-2*nCoeffs)
        : "%"REG_a, "memory"
    );
    block[0] = block0; /* restore the separately-computed DC coefficient */
    //Note, we do not do mismatch control for intra as errors cannot accumulate
}
||
368 | |||
/**
 * Dequantize an MPEG-2 inter block using MMX inline assembly.
 *
 * Each nonzero coefficient becomes
 *     sign(c) * (((2*|c| + 1) * qscale * quant_matrix[i]) >> 4)
 * with zeros kept zero.  While writing results, mm7 accumulates the XOR
 * of all output words; the tail after the loop folds that down to a
 * single parity bit and XORs it into the LSB of the last coefficient
 * (the movd at byte offset 124 covers words 62/63) — MPEG-2 mismatch
 * control.
 *
 * @param s      codec context (reads alternate_scan, intra_scantable for
 *               raster_end, inter_matrix)
 * @param block  coefficient block, modified in place
 * @param n      block index
 * @param qscale quantizer scale
 */
static void dct_unquantize_mpeg2_inter_mmx(MpegEncContext *s,
                                           int16_t *block, int n, int qscale)
{
    x86_reg nCoeffs;
    const uint16_t *quant_matrix;

    av_assert2(s->block_last_index[n] >= 0);

    if (s->alternate_scan) nCoeffs = 63; //FIXME
    else nCoeffs = s->intra_scantable.raster_end[s->block_last_index[n]];

    quant_matrix = s->inter_matrix;
    __asm__ volatile(
        "pcmpeqw %%mm7, %%mm7           \n\t"
        "psrlq $48, %%mm7               \n\t" /* mm7 = 0x000000000000FFFF */
        "movd %2, %%mm6                 \n\t"
        "packssdw %%mm6, %%mm6          \n\t"
        "packssdw %%mm6, %%mm6          \n\t" /* mm6 = qscale in every word */
        "mov %3, %%"REG_a"              \n\t"
        ".p2align 4                     \n\t"
        "1:                             \n\t"
        "movq (%0, %%"REG_a"), %%mm0    \n\t"
        "movq 8(%0, %%"REG_a"), %%mm1   \n\t"
        "movq (%1, %%"REG_a"), %%mm4    \n\t"
        "movq 8(%1, %%"REG_a"), %%mm5   \n\t"
        "pmullw %%mm6, %%mm4            \n\t" // q=qscale*quant_matrix[i]
        "pmullw %%mm6, %%mm5            \n\t" // q=qscale*quant_matrix[i]
        "pxor %%mm2, %%mm2              \n\t"
        "pxor %%mm3, %%mm3              \n\t"
        "pcmpgtw %%mm0, %%mm2           \n\t" // block[i] < 0 ? -1 : 0
        "pcmpgtw %%mm1, %%mm3           \n\t" // block[i] < 0 ? -1 : 0
        "pxor %%mm2, %%mm0              \n\t"
        "pxor %%mm3, %%mm1              \n\t"
        "psubw %%mm2, %%mm0             \n\t" // abs(block[i])
        "psubw %%mm3, %%mm1             \n\t" // abs(block[i])
        "paddw %%mm0, %%mm0             \n\t" // abs(block[i])*2
        "paddw %%mm1, %%mm1             \n\t" // abs(block[i])*2
        "pmullw %%mm4, %%mm0            \n\t" // abs(block[i])*2*q
        "pmullw %%mm5, %%mm1            \n\t" // abs(block[i])*2*q
        "paddw %%mm4, %%mm0             \n\t" // (abs(block[i])*2 + 1)*q
        "paddw %%mm5, %%mm1             \n\t" // (abs(block[i])*2 + 1)*q
        "pxor %%mm4, %%mm4              \n\t"
        "pxor %%mm5, %%mm5              \n\t" // FIXME slow
        "pcmpeqw (%0, %%"REG_a"), %%mm4 \n\t" // block[i] == 0 ? -1 : 0
        "pcmpeqw 8(%0, %%"REG_a"), %%mm5\n\t" // block[i] == 0 ? -1 : 0
        "psrlw $4, %%mm0                \n\t"
        "psrlw $4, %%mm1                \n\t"
        "pxor %%mm2, %%mm0              \n\t" /* re-apply the sign */
        "pxor %%mm3, %%mm1              \n\t"
        "psubw %%mm2, %%mm0             \n\t"
        "psubw %%mm3, %%mm1             \n\t"
        "pandn %%mm0, %%mm4             \n\t" /* keep zeros zero */
        "pandn %%mm1, %%mm5             \n\t"
        "pxor %%mm4, %%mm7              \n\t" /* fold outputs into parity acc */
        "pxor %%mm5, %%mm7              \n\t"
        "movq %%mm4, (%0, %%"REG_a")    \n\t"
        "movq %%mm5, 8(%0, %%"REG_a")   \n\t"

        "add $16, %%"REG_a"             \n\t"
        "jng 1b                         \n\t"
        /* mismatch control: reduce mm7 to one parity bit and XOR it into
         * the LSB of the last coefficient (words 62/63 at byte 124) */
        "movd 124(%0, %3), %%mm0        \n\t"
        "movq %%mm7, %%mm6              \n\t"
        "psrlq $32, %%mm7               \n\t"
        "pxor %%mm6, %%mm7              \n\t"
        "movq %%mm7, %%mm6              \n\t"
        "psrlq $16, %%mm7               \n\t"
        "pxor %%mm6, %%mm7              \n\t"
        "pslld $31, %%mm7               \n\t"
        "psrlq $15, %%mm7               \n\t"
        "pxor %%mm7, %%mm0              \n\t"
        "movd %%mm0, 124(%0, %3)        \n\t"

        ::"r" (block+nCoeffs), "r"(quant_matrix+nCoeffs), "rm" (qscale), "r" (-2*nCoeffs)
        : "%"REG_a, "memory"
    );
}
||
445 | |||
/**
 * MMX DCT-coefficient denoiser (encoder side).
 *
 * For each of the 64 coefficients: reduce |block[i]| by offset[i] with
 * unsigned saturation (so small magnitudes collapse to 0), restore the
 * sign, and store back; meanwhile the pre-threshold magnitude |block[i]|
 * is added to sum[i] (dct_error_sum), which feeds the adaptive offsets.
 * Separate statistics are kept for intra and inter blocks.
 *
 * @param s     codec context (reads mb_intra, updates dct_error_sum/dct_count)
 * @param block coefficient block, modified in place
 */
static void denoise_dct_mmx(MpegEncContext *s, int16_t *block){
    const int intra = s->mb_intra;
    int *sum = s->dct_error_sum[intra];
    uint16_t *offset = s->dct_offset[intra];

    s->dct_count[intra]++;

    /* NOTE(review): this asm writes through block and sum but declares no
     * "memory" clobber (unlike the dequantizers above); presumably it relies
     * on all three pointers being "+r" in/outs — confirm this is intended. */
    __asm__ volatile(
        "pxor %%mm7, %%mm7                      \n\t"
        "1:                                     \n\t"
        "pxor %%mm0, %%mm0                      \n\t"
        "pxor %%mm1, %%mm1                      \n\t"
        "movq (%0), %%mm2                       \n\t"
        "movq 8(%0), %%mm3                      \n\t"
        "pcmpgtw %%mm2, %%mm0                   \n\t" /* sign masks */
        "pcmpgtw %%mm3, %%mm1                   \n\t"
        "pxor %%mm0, %%mm2                      \n\t"
        "pxor %%mm1, %%mm3                      \n\t"
        "psubw %%mm0, %%mm2                     \n\t" /* abs(block[i]) */
        "psubw %%mm1, %%mm3                     \n\t"
        "movq %%mm2, %%mm4                      \n\t" /* keep abs for the sums */
        "movq %%mm3, %%mm5                      \n\t"
        "psubusw (%2), %%mm2                    \n\t" /* max(abs - offset, 0) */
        "psubusw 8(%2), %%mm3                   \n\t"
        "pxor %%mm0, %%mm2                      \n\t" /* re-apply the sign */
        "pxor %%mm1, %%mm3                      \n\t"
        "psubw %%mm0, %%mm2                     \n\t"
        "psubw %%mm1, %%mm3                     \n\t"
        "movq %%mm2, (%0)                       \n\t"
        "movq %%mm3, 8(%0)                      \n\t"
        "movq %%mm4, %%mm2                      \n\t"
        "movq %%mm5, %%mm3                      \n\t"
        "punpcklwd %%mm7, %%mm4                 \n\t" /* widen abs to 32 bit */
        "punpckhwd %%mm7, %%mm2                 \n\t"
        "punpcklwd %%mm7, %%mm5                 \n\t"
        "punpckhwd %%mm7, %%mm3                 \n\t"
        "paddd (%1), %%mm4                      \n\t" /* sum[i] += abs */
        "paddd 8(%1), %%mm2                     \n\t"
        "paddd 16(%1), %%mm5                    \n\t"
        "paddd 24(%1), %%mm3                    \n\t"
        "movq %%mm4, (%1)                       \n\t"
        "movq %%mm2, 8(%1)                      \n\t"
        "movq %%mm5, 16(%1)                     \n\t"
        "movq %%mm3, 24(%1)                     \n\t"
        "add $16, %0                            \n\t" /* 8 int16 coeffs */
        "add $32, %1                            \n\t" /* 8 int32 sums */
        "add $16, %2                            \n\t" /* 8 uint16 offsets */
        "cmp %3, %0                             \n\t"
        " jb 1b                                 \n\t"
        : "+r" (block), "+r" (sum), "+r" (offset)
        : "r"(block+64)
    );
}
||
499 | |||
/**
 * SSE2 DCT-coefficient denoiser — same algorithm as denoise_dct_mmx but
 * processing 16 coefficients per iteration in XMM registers.
 *
 * For each coefficient: reduce |block[i]| by offset[i] with unsigned
 * saturation, restore the sign, store back, and add the pre-threshold
 * magnitude to sum[i] (dct_error_sum).  Assumes block/sum/offset are
 * 16-byte aligned (movdqa) — TODO confirm against the allocators.
 *
 * @param s     codec context (reads mb_intra, updates dct_error_sum/dct_count)
 * @param block coefficient block, modified in place
 */
static void denoise_dct_sse2(MpegEncContext *s, int16_t *block){
    const int intra = s->mb_intra;
    int *sum = s->dct_error_sum[intra];
    uint16_t *offset = s->dct_offset[intra];

    s->dct_count[intra]++;

    __asm__ volatile(
        "pxor %%xmm7, %%xmm7                    \n\t"
        "1:                                     \n\t"
        "pxor %%xmm0, %%xmm0                    \n\t"
        "pxor %%xmm1, %%xmm1                    \n\t"
        "movdqa (%0), %%xmm2                    \n\t"
        "movdqa 16(%0), %%xmm3                  \n\t"
        "pcmpgtw %%xmm2, %%xmm0                 \n\t" /* sign masks */
        "pcmpgtw %%xmm3, %%xmm1                 \n\t"
        "pxor %%xmm0, %%xmm2                    \n\t"
        "pxor %%xmm1, %%xmm3                    \n\t"
        "psubw %%xmm0, %%xmm2                   \n\t" /* abs(block[i]) */
        "psubw %%xmm1, %%xmm3                   \n\t"
        "movdqa %%xmm2, %%xmm4                  \n\t" /* keep abs for the sums */
        "movdqa %%xmm3, %%xmm5                  \n\t"
        "psubusw (%2), %%xmm2                   \n\t" /* max(abs - offset, 0) */
        "psubusw 16(%2), %%xmm3                 \n\t"
        "pxor %%xmm0, %%xmm2                    \n\t" /* re-apply the sign */
        "pxor %%xmm1, %%xmm3                    \n\t"
        "psubw %%xmm0, %%xmm2                   \n\t"
        "psubw %%xmm1, %%xmm3                   \n\t"
        "movdqa %%xmm2, (%0)                    \n\t"
        "movdqa %%xmm3, 16(%0)                  \n\t"
        "movdqa %%xmm4, %%xmm6                  \n\t"
        "movdqa %%xmm5, %%xmm0                  \n\t"
        "punpcklwd %%xmm7, %%xmm4               \n\t" /* widen abs to 32 bit */
        "punpckhwd %%xmm7, %%xmm6               \n\t"
        "punpcklwd %%xmm7, %%xmm5               \n\t"
        "punpckhwd %%xmm7, %%xmm0               \n\t"
        "paddd (%1), %%xmm4                     \n\t" /* sum[i] += abs */
        "paddd 16(%1), %%xmm6                   \n\t"
        "paddd 32(%1), %%xmm5                   \n\t"
        "paddd 48(%1), %%xmm0                   \n\t"
        "movdqa %%xmm4, (%1)                    \n\t"
        "movdqa %%xmm6, 16(%1)                  \n\t"
        "movdqa %%xmm5, 32(%1)                  \n\t"
        "movdqa %%xmm0, 48(%1)                  \n\t"
        "add $32, %0                            \n\t" /* 16 int16 coeffs */
        "add $64, %1                            \n\t" /* 16 int32 sums */
        "add $32, %2                            \n\t" /* 16 uint16 offsets */
        "cmp %3, %0                             \n\t"
        " jb 1b                                 \n\t"
        : "+r" (block), "+r" (sum), "+r" (offset)
        : "r"(block+64)
          XMM_CLOBBERS_ONLY("%xmm0", "%xmm1", "%xmm2", "%xmm3",
                            "%xmm4", "%xmm5", "%xmm6", "%xmm7")
    );
}
||
555 | |||
556 | #endif /* HAVE_MMX_INLINE */ |
||
557 | |||
558 | av_cold void ff_MPV_common_init_x86(MpegEncContext *s) |
||
559 | { |
||
560 | #if HAVE_MMX_INLINE |
||
561 | int cpu_flags = av_get_cpu_flags(); |
||
562 | |||
563 | if (INLINE_MMX(cpu_flags)) { |
||
564 | s->dct_unquantize_h263_intra = dct_unquantize_h263_intra_mmx; |
||
565 | s->dct_unquantize_h263_inter = dct_unquantize_h263_inter_mmx; |
||
566 | s->dct_unquantize_mpeg1_intra = dct_unquantize_mpeg1_intra_mmx; |
||
567 | s->dct_unquantize_mpeg1_inter = dct_unquantize_mpeg1_inter_mmx; |
||
568 | if(!(s->flags & CODEC_FLAG_BITEXACT)) |
||
569 | s->dct_unquantize_mpeg2_intra = dct_unquantize_mpeg2_intra_mmx; |
||
570 | s->dct_unquantize_mpeg2_inter = dct_unquantize_mpeg2_inter_mmx; |
||
571 | s->denoise_dct = denoise_dct_mmx; |
||
572 | } |
||
573 | if (INLINE_SSE2(cpu_flags)) { |
||
574 | s->denoise_dct = denoise_dct_sse2; |
||
575 | } |
||
576 | #endif /* HAVE_MMX_INLINE */ |
||
577 | }>>>>>>>>>>>>><>>>>><> |