Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
6148 | serge | 1 | /* |
2 | * Copyright (C) 2001-2011 Michael Niedermayer |
||
3 | * |
||
4 | * This file is part of FFmpeg. |
||
5 | * |
||
6 | * FFmpeg is free software; you can redistribute it and/or |
||
7 | * modify it under the terms of the GNU Lesser General Public |
||
8 | * License as published by the Free Software Foundation; either |
||
9 | * version 2.1 of the License, or (at your option) any later version. |
||
10 | * |
||
11 | * FFmpeg is distributed in the hope that it will be useful, |
||
12 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
||
13 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
||
14 | * Lesser General Public License for more details. |
||
15 | * |
||
16 | * You should have received a copy of the GNU Lesser General Public |
||
17 | * License along with FFmpeg; if not, write to the Free Software |
||
18 | * Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA |
||
19 | */ |
||

/*
 * Per-template instruction selection.  This template file is #included
 * more than once into the same translation unit (plain-MMX and MMXEXT
 * flavours, selected by COMPILE_TEMPLATE_MMXEXT), so the macros are
 * #undef'd first to avoid redefinition.
 */
#undef REAL_MOVNTQ
#undef MOVNTQ
#undef MOVNTQ2
#undef PREFETCH

#if COMPILE_TEMPLATE_MMXEXT
#define PREFETCH "prefetchnta"   /* non-temporal prefetch (MMXEXT only) */
#else
#define PREFETCH " # nop"        /* plain MMX: emit an asm comment instead */
#endif

#if COMPILE_TEMPLATE_MMXEXT
/* movntq: non-temporal (cache-bypassing) store for write-only output */
#define REAL_MOVNTQ(a,b) "movntq " #a ", " #b " \n\t"
#define MOVNTQ2 "movntq "
#else
/* plain MMX has no movntq; fall back to an ordinary movq store */
#define REAL_MOVNTQ(a,b) "movq " #a ", " #b " \n\t"
#define MOVNTQ2 "movq "
#endif
/* extra indirection so macro arguments are expanded before stringizing */
#define MOVNTQ(a,b) REAL_MOVNTQ(a,b)

#if !COMPILE_TEMPLATE_MMXEXT
/*
 * Load the 8 dither bytes at srcDither into MMX registers, zero-extended
 * to 16-bit words: mm3 = low four bytes, mm4 = high four bytes
 * (mm0 is zeroed and used as the unpack source).
 *
 * If 'rot' is nonzero, the 8-byte pattern is first rotated by 3 bytes
 * (psrlq $24 / psllq $40 / por) to select the alternate dither phase.
 *
 * NOTE(review): results are deliberately left in mm3/mm4 with no output
 * constraints; callers rely on MMX register state surviving into their
 * own __asm__ blocks.  Guarded by !COMPILE_TEMPLATE_MMXEXT, presumably
 * because the earlier (MMX) inclusion of this template already defines
 * it for the later MMXEXT pass -- verify against the including file.
 */
static av_always_inline void
dither_8to16(const uint8_t *srcDither, int rot)
{
    if (rot) {
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
                         "movq (%0), %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "psrlq $24, %%mm3\n\t"
                         "psllq $40, %%mm4\n\t"
                         "por %%mm4, %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
                         :: "r"(srcDither)
                         );
    } else {
        __asm__ volatile("pxor %%mm0, %%mm0\n\t"
                         "movq (%0), %%mm3\n\t"
                         "movq %%mm3, %%mm4\n\t"
                         "punpcklbw %%mm0, %%mm3\n\t"
                         "punpckhbw %%mm0, %%mm4\n\t"
                         :: "r"(srcDither)
                         );
    }
}
#endif

/*
 * Vertical multi-tap scaling to one planar 8-bit output line:
 * for each pixel, accumulate filter[j] * src[j][i] over the
 * coefficient/pointer list at 'filter' (a NULL source pointer
 * terminates the list), add dither, shift down and clip to 8 bits.
 *
 * dither_8to16() leaves the dither words in mm3/mm4; the first asm
 * block rescales them by the tap count (filterSize-1 splatted, <<3,
 * added, >>4), the second runs the multiply-accumulate loop and stores
 * 8 output pixels per iteration via MOVNTQ2.
 * Operands of the second block: %0 = filter list, %1 = dest-offset,
 * %2 = dstW+offset (end index), %3 = offset (start index, in ecx).
 *
 * NOTE(review): MMX state (mm3/mm4, and mm6/mm7 as dither backups) is
 * carried across separate __asm__ statements -- correct only while the
 * compiler emits no intervening MMX/FP code; kept byte-identical.
 */
static void RENAME(yuv2yuvX)(const int16_t *filter, int filterSize,
                             const int16_t **src, uint8_t *dest, int dstW,
                             const uint8_t *dither, int offset)
{
    dither_8to16(dither, offset);
    filterSize--;
    __asm__ volatile(
        "movd %0, %%mm1\n\t"
        "punpcklwd %%mm1, %%mm1\n\t"
        "punpckldq %%mm1, %%mm1\n\t"
        "psllw $3, %%mm1\n\t"
        "paddw %%mm1, %%mm3\n\t"
        "paddw %%mm1, %%mm4\n\t"
        "psraw $4, %%mm3\n\t"
        "psraw $4, %%mm4\n\t"
        ::"m"(filterSize)
        );

    __asm__ volatile(\
        "movq %%mm3, %%mm6\n\t"
        "movq %%mm4, %%mm7\n\t"
        "movl %3, %%ecx\n\t"
        "mov %0, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        ".p2align 4 \n\t" /* FIXME Unroll? */\
        "1: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_c", 2), %%mm2 \n\t" /* srcData */\
        "movq 8(%%"REG_S", %%"REG_c", 2), %%mm5 \n\t" /* srcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        " jnz 1b \n\t"\
        "psraw $3, %%mm3 \n\t"\
        "psraw $3, %%mm4 \n\t"\
        "packuswb %%mm4, %%mm3 \n\t"
        MOVNTQ2 " %%mm3, (%1, %%"REG_c")\n\t"
        "add $8, %%"REG_c" \n\t"\
        "cmp %2, %%"REG_c" \n\t"\
        "movq %%mm6, %%mm3\n\t"
        "movq %%mm7, %%mm4\n\t"
        "mov %0, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "jb 1b \n\t"\
        :: "g" (filter),
           "r" (dest-offset), "g" ((x86_reg)(dstW+offset)), "m" (offset)
        : "%"REG_d, "%"REG_S, "%"REG_c
        );
}

/*
 * Opens the packed-output asm statement and accumulates the chroma sums.
 * Outer loop label "1:" iterates over output pixels (REG_a = index);
 * inner loop "2:" walks the coefficient/pointer list at
 * CHR_MMX_FILTER_OFFSET (a NULL source pointer terminates it).
 * Operands are supplied later by YSCALEYUV2PACKEDX_END: %0 = context
 * base (&c->redDither), %6 = byte offset from U to V chroma data.
 * Result: mm3 = U sum, mm4 = V sum, both seeded with the VROUNDER bias.
 */
#define YSCALEYUV2PACKEDX_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm3 \n\t"\
        "movq %%mm3, %%mm4 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq 8(%%"REG_d"), %%mm0 \n\t" /* filterCoeff */\
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm5 \n\t" /* VsrcData */\
        "add $16, %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pmulhw %%mm0, %%mm2 \n\t"\
        "pmulhw %%mm0, %%mm5 \n\t"\
        "paddw %%mm2, %%mm3 \n\t"\
        "paddw %%mm5, %%mm4 \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        " jnz 2b \n\t"\

/*
 * Accumulates luma (or alpha) sums for 8 pixels inside the asm statement
 * opened by YSCALEYUV2PACKEDX_UV.  'offset' selects the coefficient list
 * (LUM_ or ALP_MMX_FILTER_OFFSET); dst1/dst2 receive the two 4-pixel
 * sums, seeded with the VROUNDER bias.  Loop "2:" ends when the next
 * source pointer in the list is NULL.
 */
#define YSCALEYUV2PACKEDX_YA(offset,coeff,src1,src2,dst1,dst2) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "movq "VROUNDER_OFFSET"(%0), "#dst1" \n\t"\
    "movq "#dst1", "#dst2" \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq 8(%%"REG_d"), "#coeff" \n\t" /* filterCoeff */\
    "movq (%%"REG_S", %%"REG_a", 2), "#src1" \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), "#src2" \n\t" /* Y2srcData */\
    "add $16, %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pmulhw "#coeff", "#src1" \n\t"\
    "pmulhw "#coeff", "#src2" \n\t"\
    "paddw "#src1", "#dst1" \n\t"\
    "paddw "#src2", "#dst2" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    " jnz 2b \n\t"\

/* Full per-pixel accumulation: chroma into mm3/mm4, luma into mm1/mm7. */
#define YSCALEYUV2PACKEDX \
    YSCALEYUV2PACKEDX_UV \
    YSCALEYUV2PACKEDX_YA(LUM_MMX_FILTER_OFFSET,%%mm0,%%mm2,%%mm5,%%mm1,%%mm7) \

/*
 * Closes the asm statement opened by YSCALEYUV2PACKEDX(_ACCURATE).
 * %0 = context base (&c->redDither); %1-%3 are dummies so that the
 * "%4, %5" names used inside the WRITE* macros line up with
 * %4 = dest, %5 = dstW_reg, %6 = uv_off.
 */
#define YSCALEYUV2PACKEDX_END \
    :: "r" (&c->redDither), \
       "m" (dummy), "m" (dummy), "m" (dummy),\
       "r" (dest), "m" (dstW_reg), "m"(uv_off) \
    : "%"REG_a, "%"REG_d, "%"REG_S \
    );

/*
 * Accurate-rounding chroma accumulation: processes two filter taps per
 * iteration (second tap via APCK_PTR2/APCK_COEF), interleaving U/V words
 * and using pmaddwd so the sums are kept in 32 bits, then >>16, packed
 * back to words, biased with VROUNDER and spilled to the context's
 * U_TEMP/V_TEMP slots (the registers are needed for the luma pass).
 * Same operand layout as YSCALEYUV2PACKEDX_UV (%0 context, %6 uv_off).
 */
#define YSCALEYUV2PACKEDX_ACCURATE_UV \
    __asm__ volatile(\
        "xor %%"REG_a", %%"REG_a" \n\t"\
        ".p2align 4 \n\t"\
        "nop \n\t"\
        "1: \n\t"\
        "lea "CHR_MMX_FILTER_OFFSET"(%0), %%"REG_d" \n\t"\
        "mov (%%"REG_d"), %%"REG_S" \n\t"\
        "pxor %%mm4, %%mm4 \n\t"\
        "pxor %%mm5, %%mm5 \n\t"\
        "pxor %%mm6, %%mm6 \n\t"\
        "pxor %%mm7, %%mm7 \n\t"\
        ".p2align 4 \n\t"\
        "2: \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm0 \n\t" /* UsrcData */\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm2 \n\t" /* VsrcData */\
        "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
        "movq (%%"REG_S", %%"REG_a"), %%mm1 \n\t" /* UsrcData */\
        "movq %%mm0, %%mm3 \n\t"\
        "punpcklwd %%mm1, %%mm0 \n\t"\
        "punpckhwd %%mm1, %%mm3 \n\t"\
        "movq "STR(APCK_COEF)"(%%"REG_d"),%%mm1 \n\t" /* filterCoeff */\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm3 \n\t"\
        "paddd %%mm0, %%mm4 \n\t"\
        "paddd %%mm3, %%mm5 \n\t"\
        "add %6, %%"REG_S" \n\t" \
        "movq (%%"REG_S", %%"REG_a"), %%mm3 \n\t" /* VsrcData */\
        "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
        "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
        "test %%"REG_S", %%"REG_S" \n\t"\
        "movq %%mm2, %%mm0 \n\t"\
        "punpcklwd %%mm3, %%mm2 \n\t"\
        "punpckhwd %%mm3, %%mm0 \n\t"\
        "pmaddwd %%mm1, %%mm2 \n\t"\
        "pmaddwd %%mm1, %%mm0 \n\t"\
        "paddd %%mm2, %%mm6 \n\t"\
        "paddd %%mm0, %%mm7 \n\t"\
        " jnz 2b \n\t"\
        "psrad $16, %%mm4 \n\t"\
        "psrad $16, %%mm5 \n\t"\
        "psrad $16, %%mm6 \n\t"\
        "psrad $16, %%mm7 \n\t"\
        "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
        "packssdw %%mm5, %%mm4 \n\t"\
        "packssdw %%mm7, %%mm6 \n\t"\
        "paddw %%mm0, %%mm4 \n\t"\
        "paddw %%mm0, %%mm6 \n\t"\
        "movq %%mm4, "U_TEMP"(%0) \n\t"\
        "movq %%mm6, "V_TEMP"(%0) \n\t"\

/*
 * Accurate-rounding luma/alpha accumulation, mirroring
 * YSCALEYUV2PACKEDX_ACCURATE_UV: two taps per iteration, pmaddwd into
 * 32-bit sums, >>16, packed and VROUNDER-biased into mm1 (low four
 * pixels) and mm7 (high four).  Finally reloads the chroma sums spilled
 * by the _UV half: mm3 = U_TEMP, mm4 = V_TEMP.
 * 'offset' selects LUM_ or ALP_MMX_FILTER_OFFSET.
 */
#define YSCALEYUV2PACKEDX_ACCURATE_YA(offset) \
    "lea "offset"(%0), %%"REG_d" \n\t"\
    "mov (%%"REG_d"), %%"REG_S" \n\t"\
    "pxor %%mm1, %%mm1 \n\t"\
    "pxor %%mm5, %%mm5 \n\t"\
    "pxor %%mm7, %%mm7 \n\t"\
    "pxor %%mm6, %%mm6 \n\t"\
    ".p2align 4 \n\t"\
    "2: \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm0 \n\t" /* Y1srcData */\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm2 \n\t" /* Y2srcData */\
    "mov "STR(APCK_PTR2)"(%%"REG_d"), %%"REG_S" \n\t"\
    "movq (%%"REG_S", %%"REG_a", 2), %%mm4 \n\t" /* Y1srcData */\
    "movq %%mm0, %%mm3 \n\t"\
    "punpcklwd %%mm4, %%mm0 \n\t"\
    "punpckhwd %%mm4, %%mm3 \n\t"\
    "movq "STR(APCK_COEF)"(%%"REG_d"), %%mm4 \n\t" /* filterCoeff */\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm3 \n\t"\
    "paddd %%mm0, %%mm1 \n\t"\
    "paddd %%mm3, %%mm5 \n\t"\
    "movq 8(%%"REG_S", %%"REG_a", 2), %%mm3 \n\t" /* Y2srcData */\
    "mov "STR(APCK_SIZE)"(%%"REG_d"), %%"REG_S" \n\t"\
    "add $"STR(APCK_SIZE)", %%"REG_d" \n\t"\
    "test %%"REG_S", %%"REG_S" \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "punpcklwd %%mm3, %%mm2 \n\t"\
    "punpckhwd %%mm3, %%mm0 \n\t"\
    "pmaddwd %%mm4, %%mm2 \n\t"\
    "pmaddwd %%mm4, %%mm0 \n\t"\
    "paddd %%mm2, %%mm7 \n\t"\
    "paddd %%mm0, %%mm6 \n\t"\
    " jnz 2b \n\t"\
    "psrad $16, %%mm1 \n\t"\
    "psrad $16, %%mm5 \n\t"\
    "psrad $16, %%mm7 \n\t"\
    "psrad $16, %%mm6 \n\t"\
    "movq "VROUNDER_OFFSET"(%0), %%mm0 \n\t"\
    "packssdw %%mm5, %%mm1 \n\t"\
    "packssdw %%mm6, %%mm7 \n\t"\
    "paddw %%mm0, %%mm1 \n\t"\
    "paddw %%mm0, %%mm7 \n\t"\
    "movq "U_TEMP"(%0), %%mm3 \n\t"\
    "movq "V_TEMP"(%0), %%mm4 \n\t"\

/* Accurate-rounding accumulation: chroma (spilled to U/V_TEMP and
 * reloaded into mm3/mm4) plus luma into mm1/mm7. */
#define YSCALEYUV2PACKEDX_ACCURATE \
    YSCALEYUV2PACKEDX_ACCURATE_UV \
    YSCALEYUV2PACKEDX_ACCURATE_YA(LUM_MMX_FILTER_OFFSET)

/*
 * YUV -> RGB conversion of 8 pixels.  Input: mm1/mm7 = two 4-pixel luma
 * groups, mm3/mm4 = U/V sums; coefficients and offsets are read from the
 * context via %0 (U_OFFSET, Y_COEFF, ...).  Output: packed 8-bit
 * components, B in mm2, G in mm4, R in mm5 (see original register-state
 * comments inline).
 */
#define YSCALEYUV2RGBX \
    "psubw "U_OFFSET"(%0), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"(%0), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"(%0), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"(%0), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "pmulhw "UB_COEFF"(%0), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"(%0), %%mm5 \n\t"\
    "psubw "Y_OFFSET"(%0), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"(%0), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"(%0), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"(%0), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

/*
 * Interleave the packed b/g/r/a byte registers into four 2-pixel
 * 32-bit-per-pixel quads and store 32 bytes at dst + index*4, then
 * advance 'index' by 8 pixels and loop back to label "1" while
 * index < dstw.  q0/q2/q3/t are scratch registers.
 */
#define REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) \
    "movq "#b", "#q2" \n\t" /* B */\
    "movq "#r", "#t" \n\t" /* R */\
    "punpcklbw "#g", "#b" \n\t" /* GBGBGBGB 0 */\
    "punpcklbw "#a", "#r" \n\t" /* ARARARAR 0 */\
    "punpckhbw "#g", "#q2" \n\t" /* GBGBGBGB 2 */\
    "punpckhbw "#a", "#t" \n\t" /* ARARARAR 2 */\
    "movq "#b", "#q0" \n\t" /* GBGBGBGB 0 */\
    "movq "#q2", "#q3" \n\t" /* GBGBGBGB 2 */\
    "punpcklwd "#r", "#q0" \n\t" /* ARGBARGB 0 */\
    "punpckhwd "#r", "#b" \n\t" /* ARGBARGB 1 */\
    "punpcklwd "#t", "#q2" \n\t" /* ARGBARGB 2 */\
    "punpckhwd "#t", "#q3" \n\t" /* ARGBARGB 3 */\
    \
    MOVNTQ(   q0,   (dst, index, 4))\
    MOVNTQ(    b,  8(dst, index, 4))\
    MOVNTQ(   q2, 16(dst, index, 4))\
    MOVNTQ(   q3, 24(dst, index, 4))\
    \
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
/* indirection so register-name arguments are macro-expanded first */
#define WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t) REAL_WRITEBGR32(dst, dstw, index, b, g, r, a, q0, q2, q3, t)

/*
 * Multi-tap vertical scale + convert to 32-bit RGB, accurate-rounding
 * variant.  When an alpha plane is present, B/G/R are spilled to the
 * context temp slots while the alpha sums are computed with
 * YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_...), then written per-pixel;
 * otherwise alpha is forced to all-ones via pcmpeqd.
 */
static void RENAME(yuv2rgb32_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;          /* filler operands so WRITE* can use %4/%5/%6 */
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "movq %%mm2, "U_TEMP"(%0) \n\t"
        "movq %%mm4, "V_TEMP"(%0) \n\t"
        "movq %%mm5, "Y_TEMP"(%0) \n\t"
        YSCALEYUV2PACKEDX_ACCURATE_YA(ALP_MMX_FILTER_OFFSET)
        "movq "Y_TEMP"(%0), %%mm5 \n\t"
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm3, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm2, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX_ACCURATE
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

/*
 * Multi-tap vertical scale + convert to 32-bit RGB (fast pmulhw
 * accumulation).  Alpha is computed with YSCALEYUV2PACKEDX_YA when an
 * alpha plane is present, otherwise forced to all-ones via pcmpeqd.
 */
static void RENAME(yuv2rgb32_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;          /* filler operands so WRITE* can use %4/%5/%6 */
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        YSCALEYUV2PACKEDX_YA(ALP_MMX_FILTER_OFFSET, %%mm0, %%mm3, %%mm6, %%mm1, %%mm7)
        "psraw $3, %%mm1 \n\t"
        "psraw $3, %%mm7 \n\t"
        "packuswb %%mm7, %%mm1 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    } else {
        YSCALEYUV2PACKEDX
        YSCALEYUV2RGBX
        "pcmpeqd %%mm7, %%mm7 \n\t"
        WRITEBGR32(%4, %5, %%REGa, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
        YSCALEYUV2PACKEDX_END
    }
}

/*
 * Pack B (mm2), G (mm4), R (mm5) down to 5-6-5 and store 8 pixels
 * (16 bytes) at dst + index*2; advances 'index' by 8 and loops to
 * label "1" while index < dstw.  bF8/bFC are byte-mask constants
 * (presumably 0xF8/0xFC patterns -- defined elsewhere).
 * Expects mm7 == 0 as the zero-unpack source.
 */
#define REAL_WRITERGB16(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bFC)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    \
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    \
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
    \
    "psllq $3, %%mm3 \n\t"\
    "psllq $3, %%mm4 \n\t"\
    \
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
    \
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
    \
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
/* indirection so arguments are macro-expanded before substitution */
#define WRITERGB16(dst, dstw, index) REAL_WRITERGB16(dst, dstw, index)

/*
 * Multi-tap vertical scale + convert to RGB565, accurate-rounding
 * variant.  With DITHER1XBPP, per-channel dither bytes from the context
 * are added (saturating) before the 5-6-5 pack.
 */
static void RENAME(yuv2rgb565_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;          /* filler operands so WRITE* can use %4/%5/%6 */
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

/*
 * Multi-tap vertical scale + convert to RGB565 (fast pmulhw
 * accumulation).  Optional per-channel dithering as in the _ar variant.
 */
static void RENAME(yuv2rgb565_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;          /* filler operands so WRITE* can use %4/%5/%6 */
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB16(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

/*
 * Pack B (mm2), G (mm4), R (mm5) down to 5-5-5 and store 8 pixels
 * (16 bytes) at dst + index*2; advances 'index' by 8 and loops to
 * label "1" while index < dstw.  Differs from WRITERGB16 only in the
 * G mask (bF8, 5 bits), the extra R >>1 and the <<2 merge shifts.
 * Expects mm7 == 0 as the zero-unpack source.
 */
#define REAL_WRITERGB15(dst, dstw, index) \
    "pand "MANGLE(bF8)", %%mm2 \n\t" /* B */\
    "pand "MANGLE(bF8)", %%mm4 \n\t" /* G */\
    "pand "MANGLE(bF8)", %%mm5 \n\t" /* R */\
    "psrlq $3, %%mm2 \n\t"\
    "psrlq $1, %%mm5 \n\t"\
    \
    "movq %%mm2, %%mm1 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    \
    "punpcklbw %%mm7, %%mm3 \n\t"\
    "punpcklbw %%mm5, %%mm2 \n\t"\
    "punpckhbw %%mm7, %%mm4 \n\t"\
    "punpckhbw %%mm5, %%mm1 \n\t"\
    \
    "psllq $2, %%mm3 \n\t"\
    "psllq $2, %%mm4 \n\t"\
    \
    "por %%mm3, %%mm2 \n\t"\
    "por %%mm4, %%mm1 \n\t"\
    \
    MOVNTQ(%%mm2,  (dst, index, 2))\
    MOVNTQ(%%mm1, 8(dst, index, 2))\
    \
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
/* indirection so arguments are macro-expanded before substitution */
#define WRITERGB15(dst, dstw, index) REAL_WRITERGB15(dst, dstw, index)

/*
 * Multi-tap vertical scale + convert to RGB555, accurate-rounding
 * variant.  With DITHER1XBPP, per-channel dither bytes from the context
 * are added (saturating) before the 5-5-5 pack.
 */
static void RENAME(yuv2rgb555_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                    const int16_t **lumSrc, int lumFilterSize,
                                    const int16_t *chrFilter, const int16_t **chrUSrc,
                                    const int16_t **chrVSrc,
                                    int chrFilterSize, const int16_t **alpSrc,
                                    uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;          /* filler operands so WRITE* can use %4/%5/%6 */
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2\n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4\n\t"
    "paddusb "RED_DITHER"(%0), %%mm5\n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

/*
 * Multi-tap vertical scale + convert to RGB555 (fast pmulhw
 * accumulation).  Optional per-channel dithering as in the _ar variant.
 */
static void RENAME(yuv2rgb555_X)(SwsContext *c, const int16_t *lumFilter,
                                 const int16_t **lumSrc, int lumFilterSize,
                                 const int16_t *chrFilter, const int16_t **chrUSrc,
                                 const int16_t **chrVSrc,
                                 int chrFilterSize, const int16_t **alpSrc,
                                 uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;          /* filler operands so WRITE* can use %4/%5/%6 */
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
    "paddusb "BLUE_DITHER"(%0), %%mm2 \n\t"
    "paddusb "GREEN_DITHER"(%0), %%mm4 \n\t"
    "paddusb "RED_DITHER"(%0), %%mm5 \n\t"
#endif
    WRITERGB15(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

/*
 * Plain-MMX 24bpp writer: interleave B (mm2), G (mm4), R (mm5) into
 * 0RGB dwords, then shift/merge the four 2-pixel quads into three
 * contiguous 8-byte stores (24 bytes = 8 pixels) at dst.  Advances
 * dst by 24 bytes and 'index' by 8 pixels; loops to label "1" while
 * index < dstw.  Expects mm7 == 0; clobbers all MMX registers.
 */
#define WRITEBGR24MMX(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq %%mm2, %%mm1 \n\t" /* B */\
    "movq %%mm5, %%mm6 \n\t" /* R */\
    "punpcklbw %%mm4, %%mm2 \n\t" /* GBGBGBGB 0 */\
    "punpcklbw %%mm7, %%mm5 \n\t" /* 0R0R0R0R 0 */\
    "punpckhbw %%mm4, %%mm1 \n\t" /* GBGBGBGB 2 */\
    "punpckhbw %%mm7, %%mm6 \n\t" /* 0R0R0R0R 2 */\
    "movq %%mm2, %%mm0 \n\t" /* GBGBGBGB 0 */\
    "movq %%mm1, %%mm3 \n\t" /* GBGBGBGB 2 */\
    "punpcklwd %%mm5, %%mm0 \n\t" /* 0RGB0RGB 0 */\
    "punpckhwd %%mm5, %%mm2 \n\t" /* 0RGB0RGB 1 */\
    "punpcklwd %%mm6, %%mm1 \n\t" /* 0RGB0RGB 2 */\
    "punpckhwd %%mm6, %%mm3 \n\t" /* 0RGB0RGB 3 */\
    \
    "movq %%mm0, %%mm4 \n\t" /* 0RGB0RGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGB0RGB 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGB0RGB 2 */\
    "movq %%mm3, %%mm7 \n\t" /* 0RGB0RGB 3 */\
    \
    "psllq $40, %%mm0 \n\t" /* RGB00000 0 */\
    "psllq $40, %%mm2 \n\t" /* RGB00000 1 */\
    "psllq $40, %%mm1 \n\t" /* RGB00000 2 */\
    "psllq $40, %%mm3 \n\t" /* RGB00000 3 */\
    \
    "punpckhdq %%mm4, %%mm0 \n\t" /* 0RGBRGB0 0 */\
    "punpckhdq %%mm6, %%mm2 \n\t" /* 0RGBRGB0 1 */\
    "punpckhdq %%mm5, %%mm1 \n\t" /* 0RGBRGB0 2 */\
    "punpckhdq %%mm7, %%mm3 \n\t" /* 0RGBRGB0 3 */\
    \
    "psrlq $8, %%mm0 \n\t" /* 00RGBRGB 0 */\
    "movq %%mm2, %%mm6 \n\t" /* 0RGBRGB0 1 */\
    "psllq $40, %%mm2 \n\t" /* GB000000 1 */\
    "por %%mm2, %%mm0 \n\t" /* GBRGBRGB 0 */\
    MOVNTQ(%%mm0, (dst))\
    \
    "psrlq $24, %%mm6 \n\t" /* 0000RGBR 1 */\
    "movq %%mm1, %%mm5 \n\t" /* 0RGBRGB0 2 */\
    "psllq $24, %%mm1 \n\t" /* BRGB0000 2 */\
    "por %%mm1, %%mm6 \n\t" /* BRGBRGBR 1 */\
    MOVNTQ(%%mm6, 8(dst))\
    \
    "psrlq $40, %%mm5 \n\t" /* 000000RG 2 */\
    "psllq $8, %%mm3 \n\t" /* RGBRGB00 3 */\
    "por %%mm3, %%mm5 \n\t" /* RGBRGBRG 2 */\
    MOVNTQ(%%mm5, 16(dst))\
    \
    "add $24, "#dst" \n\t"\
    \
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

/*
 * MMXEXT 24bpp writer: uses pshufw plus the ff_M24A/B/C byte masks
 * (defined elsewhere) to shuffle B (mm2), G (mm4), R (mm5) directly
 * into three 8-byte groups (24 bytes = 8 pixels).  Same loop contract
 * as WRITEBGR24MMX: advances dst by 24, 'index' by 8, jumps to "1"
 * while index < dstw.  Expects mm7 == 0 on entry (mm7 is then reused
 * for the ff_M24C mask); clobbers mm0/mm1/mm3/mm4/mm6/mm7.
 */
#define WRITEBGR24MMXEXT(dst, dstw, index) \
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */\
    "movq "MANGLE(ff_M24A)", %%mm0 \n\t"\
    "movq "MANGLE(ff_M24C)", %%mm7 \n\t"\
    "pshufw $0x50, %%mm2, %%mm1 \n\t" /* B3 B2 B3 B2 B1 B0 B1 B0 */\
    "pshufw $0x50, %%mm4, %%mm3 \n\t" /* G3 G2 G3 G2 G1 G0 G1 G0 */\
    "pshufw $0x00, %%mm5, %%mm6 \n\t" /* R1 R0 R1 R0 R1 R0 R1 R0 */\
    \
    "pand %%mm0, %%mm1 \n\t" /* B2 B1 B0 */\
    "pand %%mm0, %%mm3 \n\t" /* G2 G1 G0 */\
    "pand %%mm7, %%mm6 \n\t" /* R1 R0 */\
    \
    "psllq $8, %%mm3 \n\t" /* G2 G1 G0 */\
    "por %%mm1, %%mm6 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, (dst))\
    \
    "psrlq $8, %%mm4 \n\t" /* 00 G7 G6 G5 G4 G3 G2 G1 */\
    "pshufw $0xA5, %%mm2, %%mm1 \n\t" /* B5 B4 B5 B4 B3 B2 B3 B2 */\
    "pshufw $0x55, %%mm4, %%mm3 \n\t" /* G4 G3 G4 G3 G4 G3 G4 G3 */\
    "pshufw $0xA5, %%mm5, %%mm6 \n\t" /* R5 R4 R5 R4 R3 R2 R3 R2 */\
    \
    "pand "MANGLE(ff_M24B)", %%mm1 \n\t" /* B5 B4 B3 */\
    "pand %%mm7, %%mm3 \n\t" /* G4 G3 */\
    "pand %%mm0, %%mm6 \n\t" /* R4 R3 R2 */\
    \
    "por %%mm1, %%mm3 \n\t" /* B5 G4 B4 G3 B3 */\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 8(dst))\
    \
    "pshufw $0xFF, %%mm2, %%mm1 \n\t" /* B7 B6 B7 B6 B7 B6 B6 B7 */\
    "pshufw $0xFA, %%mm4, %%mm3 \n\t" /* 00 G7 00 G7 G6 G5 G6 G5 */\
    "pshufw $0xFA, %%mm5, %%mm6 \n\t" /* R7 R6 R7 R6 R5 R4 R5 R4 */\
    \
    "pand %%mm7, %%mm1 \n\t" /* B7 B6 */\
    "pand %%mm0, %%mm3 \n\t" /* G7 G6 G5 */\
    "pand "MANGLE(ff_M24B)", %%mm6 \n\t" /* R7 R6 R5 */\
    \
    "por %%mm1, %%mm3 \n\t"\
    "por %%mm3, %%mm6 \n\t"\
    MOVNTQ(%%mm6, 16(dst))\
    \
    "add $24, "#dst" \n\t"\
    \
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"

/* Select the 24bpp writer matching this template's instruction set. */
#if COMPILE_TEMPLATE_MMXEXT
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMXEXT(dst, dstw, index)
#else
#undef WRITEBGR24
#define WRITEBGR24(dst, dstw, index)  WRITEBGR24MMX(dst, dstw, index)
#endif

/*
 * Multi-tap vertical scale + convert to 24-bit BGR, accurate-rounding
 * variant.  WRITEBGR24 advances its own destination pointer, so
 * REG_c = dest + 3*index is computed here (lea index + index*2) and the
 * operand list is spelled out directly instead of using
 * YSCALEYUV2PACKEDX_END (REG_c must be added to the clobbers).
 * No alpha path: 24bpp output has no alpha channel.
 */
static void RENAME(yuv2bgr24_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                   const int16_t **lumSrc, int lumFilterSize,
                                   const int16_t *chrFilter, const int16_t **chrUSrc,
                                   const int16_t **chrVSrc,
                                   int chrFilterSize, const int16_t **alpSrc,
                                   uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;          /* filler operands so %4/%5/%6 line up */
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c"\n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}

/*
 * Multi-tap vertical scale + convert to 24-bit BGR (fast pmulhw
 * accumulation).  Same REG_c = dest + 3*index setup and explicit
 * operand list as the _ar variant.
 */
static void RENAME(yuv2bgr24_X)(SwsContext *c, const int16_t *lumFilter,
                                const int16_t **lumSrc, int lumFilterSize,
                                const int16_t *chrFilter, const int16_t **chrUSrc,
                                const int16_t **chrVSrc,
                                int chrFilterSize, const int16_t **alpSrc,
                                uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;          /* filler operands so %4/%5/%6 line up */
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    YSCALEYUV2RGBX
    "pxor %%mm7, %%mm7 \n\t"
    "lea (%%"REG_a", %%"REG_a", 2), %%"REG_c" \n\t" //FIXME optimize
    "add %4, %%"REG_c" \n\t"
    WRITEBGR24(%%REGc, %5, %%REGa)
    :: "r" (&c->redDither),
       "m" (dummy), "m" (dummy), "m" (dummy),
       "r" (dest), "m" (dstW_reg), "m"(uv_off)
    : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S
    );
}

/*
 * Pack U (mm3), V (mm4) and Y (mm1 low / mm7 high) into interleaved
 * YUYV and store 8 pixels (16 bytes) at dst + index*2.  Advances
 * 'index' by 8 and loops to label "1" while index < dstw.
 */
#define REAL_WRITEYUY2(dst, dstw, index) \
    "packuswb %%mm3, %%mm3 \n\t"\
    "packuswb %%mm4, %%mm4 \n\t"\
    "packuswb %%mm7, %%mm1 \n\t"\
    "punpcklbw %%mm4, %%mm3 \n\t"\
    "movq %%mm1, %%mm7 \n\t"\
    "punpcklbw %%mm3, %%mm1 \n\t"\
    "punpckhbw %%mm3, %%mm7 \n\t"\
    \
    MOVNTQ(%%mm1, (dst, index, 2))\
    MOVNTQ(%%mm7, 8(dst, index, 2))\
    \
    "add $8, "#index" \n\t"\
    "cmp "#dstw", "#index" \n\t"\
    " jb 1b \n\t"
/* indirection so arguments are macro-expanded before substitution */
#define WRITEYUY2(dst, dstw, index) REAL_WRITEYUY2(dst, dstw, index)

/*
 * Multi-tap vertical scale to packed YUYV 4:2:2, accurate-rounding
 * variant.  No colorspace conversion: the accumulated Y/U/V sums are
 * just shifted down to 8-bit range and interleaved by WRITEYUY2.
 */
static void RENAME(yuv2yuyv422_X_ar)(SwsContext *c, const int16_t *lumFilter,
                                     const int16_t **lumSrc, int lumFilterSize,
                                     const int16_t *chrFilter, const int16_t **chrUSrc,
                                     const int16_t **chrVSrc,
                                     int chrFilterSize, const int16_t **alpSrc,
                                     uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;          /* filler operands so WRITE* can use %4/%5/%6 */
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX_ACCURATE
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

/*
 * Multi-tap vertical scale to packed YUYV 4:2:2 (fast pmulhw
 * accumulation); see the _ar variant for details.
 */
static void RENAME(yuv2yuyv422_X)(SwsContext *c, const int16_t *lumFilter,
                                  const int16_t **lumSrc, int lumFilterSize,
                                  const int16_t *chrFilter, const int16_t **chrUSrc,
                                  const int16_t **chrVSrc,
                                  int chrFilterSize, const int16_t **alpSrc,
                                  uint8_t *dest, int dstW, int dstY)
{
    x86_reg dummy=0;          /* filler operands so WRITE* can use %4/%5/%6 */
    x86_reg dstW_reg = dstW;
    x86_reg uv_off = c->uv_offx2;

    YSCALEYUV2PACKEDX
    /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
    "psraw $3, %%mm3 \n\t"
    "psraw $3, %%mm4 \n\t"
    "psraw $3, %%mm1 \n\t"
    "psraw $3, %%mm7 \n\t"
    WRITEYUY2(%4, %5, %%REGa)
    YSCALEYUV2PACKEDX_END
}

/*
 * Two-line (bilinear) vertical chroma interpolation for the 2-tap
 * path: blends uvbuf0 (%2) and uvbuf1 (%3) with the uvalpha coefficient
 * at CHR_MMX_FILTER_OFFSET+8, using UV_OFF_BYTE to reach the V samples
 * within each buffer.  Then subtracts the U/V offsets and starts the
 * coefficient multiplies, leaving mm2=(U-128)8, mm3=ug, mm4=vg,
 * mm5=(V-128)8 for the following _YA/_COEFF macros.  Opens loop "1:".
 */
#define REAL_YSCALEYUV2RGB_UV(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\

797 | #define REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) \ |
||
798 | "movq ("#b1", "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\ |
||
799 | "movq ("#b2", "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\ |
||
800 | "movq 8("#b1", "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\ |
||
801 | "movq 8("#b2", "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\ |
||
802 | "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\ |
||
803 | "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\ |
||
804 | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ |
||
805 | "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\ |
||
806 | "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
||
807 | "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\ |
||
808 | "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ |
||
809 | "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\ |
||
810 | |||
811 | #define REAL_YSCALEYUV2RGB_COEFF(c) \ |
||
812 | "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\ |
||
813 | "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\ |
||
814 | "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\ |
||
815 | "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\ |
||
816 | "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\ |
||
817 | "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\ |
||
818 | /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\ |
||
819 | "paddw %%mm3, %%mm4 \n\t"\ |
||
820 | "movq %%mm2, %%mm0 \n\t"\ |
||
821 | "movq %%mm5, %%mm6 \n\t"\ |
||
822 | "movq %%mm4, %%mm3 \n\t"\ |
||
823 | "punpcklwd %%mm2, %%mm2 \n\t"\ |
||
824 | "punpcklwd %%mm5, %%mm5 \n\t"\ |
||
825 | "punpcklwd %%mm4, %%mm4 \n\t"\ |
||
826 | "paddw %%mm1, %%mm2 \n\t"\ |
||
827 | "paddw %%mm1, %%mm5 \n\t"\ |
||
828 | "paddw %%mm1, %%mm4 \n\t"\ |
||
829 | "punpckhwd %%mm0, %%mm0 \n\t"\ |
||
830 | "punpckhwd %%mm6, %%mm6 \n\t"\ |
||
831 | "punpckhwd %%mm3, %%mm3 \n\t"\ |
||
832 | "paddw %%mm7, %%mm0 \n\t"\ |
||
833 | "paddw %%mm7, %%mm6 \n\t"\ |
||
834 | "paddw %%mm7, %%mm3 \n\t"\ |
||
835 | /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\ |
||
836 | "packuswb %%mm0, %%mm2 \n\t"\ |
||
837 | "packuswb %%mm6, %%mm5 \n\t"\ |
||
838 | "packuswb %%mm3, %%mm4 \n\t"\ |
||
839 | |||
840 | #define YSCALEYUV2RGB_YA(index, c, b1, b2) REAL_YSCALEYUV2RGB_YA(index, c, b1, b2) |
||
841 | |||
842 | #define YSCALEYUV2RGB(index, c) \ |
||
843 | REAL_YSCALEYUV2RGB_UV(index, c) \ |
||
844 | REAL_YSCALEYUV2RGB_YA(index, c, %0, %1) \ |
||
845 | REAL_YSCALEYUV2RGB_COEFF(c) |
||
846 | |||
847 | /** |
||
848 | * vertical bilinear scale YV12 to RGB |
||
849 | */ |
||
/*
 * Two-row (bilinear) vertical scale to 32-bit BGRX/BGRA output.
 * With an alpha plane present the alpha rows are interpolated too; on
 * x86-64 the extra rows fit in spare operands, on 32-bit they are routed
 * through c->u_temp/c->v_temp because all GPRs are already committed.
 * Without alpha, pcmpeqd fills mm7 with 0xFF for an opaque alpha channel.
 * NOTE(review): 8280 is DSTW_OFFSET spelled out — the preprocessor cannot
 * expand the macro inside these asm argument lists.
 */
static void RENAME(yuv2rgb32_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
        const int16_t *abuf0 = abuf[0], *abuf1 = abuf[1];
#if ARCH_X86_64
        __asm__ volatile(
            YSCALEYUV2RGB(%%r8, %5)
            YSCALEYUV2RGB_YA(%%r8, %5, %6, %7)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            WRITEBGR32(%4, 8280(%5), %%r8, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "r" (dest),
               "a" (&c->redDither),
               "r" (abuf0), "r" (abuf1)
            : "%r8"
        );
#else
        /* 32-bit: no free registers for the alpha rows, stash them in the
         * context and reload inside the asm via U_TEMP/V_TEMP. */
        c->u_temp=(intptr_t)abuf0;
        c->v_temp=(intptr_t)abuf1;
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "push %0 \n\t"
            "push %1 \n\t"
            "mov "U_TEMP"(%5), %0 \n\t"
            "mov "V_TEMP"(%5), %1 \n\t"
            YSCALEYUV2RGB_YA(%%REGBP, %5, %0, %1)
            "psraw $3, %%mm1 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "psraw $3, %%mm7 \n\t" /* abuf0[eax] - abuf1[eax] >>7*/
            "packuswb %%mm7, %%mm1 \n\t"
            "pop %1 \n\t"
            "pop %0 \n\t"
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm1, %%mm0, %%mm7, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
#endif
    } else {
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB(%%REGBP, %5)
            "pcmpeqd %%mm7, %%mm7 \n\t" /* opaque alpha: all bits set */
            WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
||
913 | |||
/*
 * Two-row (bilinear) vertical scale to packed 24-bit BGR output.
 * mm7 is zeroed because WRITEBGR24 uses it as the zero register.
 */
static void RENAME(yuv2bgr24_2)(SwsContext *c, const int16_t *buf[2],
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf[2], uint8_t *dest,
                                int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
||
936 | |||
/*
 * Two-row (bilinear) vertical scale to RGB555 output, with optional
 * ordered dithering (DITHER1XBPP) applied per channel before packing.
 */
static void RENAME(yuv2rgb555_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB15(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
||
965 | |||
/*
 * Two-row (bilinear) vertical scale to RGB565 output; identical to the
 * RGB555 path except for the final WRITERGB16 packing.
 */
static void RENAME(yuv2rgb565_2)(SwsContext *c, const int16_t *buf[2],
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf[2], uint8_t *dest,
                                 int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2RGB(%%REGBP, %5)
        "pxor %%mm7, %%mm7 \n\t"
        /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
        "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
        "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
        "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
        WRITERGB16(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
||
994 | |||
/*
 * Two-row (bilinear) vertical interpolation for the packed-YUV (YUY2)
 * writer: no RGB conversion, just Y/U/V interpolation. The filter
 * coefficients are pre-shifted right by 3 in place (the destructive movq
 * back into the context) because the YUV path keeps 3 fewer fraction bits
 * than the RGB path.
 */
#define REAL_YSCALEYUV2PACKED(index, c) \
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "movq "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm1 \n\t"\
    "psraw $3, %%mm0 \n\t"\
    "psraw $3, %%mm1 \n\t"\
    "movq %%mm0, "CHR_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "movq %%mm1, "LUM_MMX_FILTER_OFFSET"+8("#c") \n\t"\
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psubw %%mm3, %%mm2 \n\t" /* uvbuf0[eax] - uvbuf1[eax]*/\
    "psubw %%mm4, %%mm5 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048]*/\
    "movq "CHR_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t"\
    "pmulhw %%mm0, %%mm2 \n\t" /* (uvbuf0[eax] - uvbuf1[eax])uvalpha1>>16*/\
    "pmulhw %%mm0, %%mm5 \n\t" /* (uvbuf0[eax+2048] - uvbuf1[eax+2048])uvalpha1>>16*/\
    "psraw $7, %%mm3 \n\t" /* uvbuf1[eax] >>7*/\
    "psraw $7, %%mm4 \n\t" /* uvbuf1[eax+2048] >>7*/\
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax]uvalpha1 - uvbuf1[eax](1-uvalpha1)*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048]uvalpha1 - uvbuf1[eax+2048](1-uvalpha1)*/\
    "movq (%0, "#index", 2), %%mm0 \n\t" /*buf0[eax]*/\
    "movq (%1, "#index", 2), %%mm1 \n\t" /*buf1[eax]*/\
    "movq 8(%0, "#index", 2), %%mm6 \n\t" /*buf0[eax]*/\
    "movq 8(%1, "#index", 2), %%mm7 \n\t" /*buf1[eax]*/\
    "psubw %%mm1, %%mm0 \n\t" /* buf0[eax] - buf1[eax]*/\
    "psubw %%mm7, %%mm6 \n\t" /* buf0[eax] - buf1[eax]*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm0 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "pmulhw "LUM_MMX_FILTER_OFFSET"+8("#c"), %%mm6 \n\t" /* (buf0[eax] - buf1[eax])yalpha1>>16*/\
    "psraw $7, %%mm1 \n\t" /* buf1[eax] >>7*/\
    "psraw $7, %%mm7 \n\t" /* buf1[eax+4] >>7*/\
    "paddw %%mm0, %%mm1 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\
    "paddw %%mm6, %%mm7 \n\t" /* buf0[eax]yalpha1 + buf1[eax](1-yalpha1) >>16*/\

#define YSCALEYUV2PACKED(index, c) REAL_YSCALEYUV2PACKED(index, c)
||
1034 | |||
/*
 * Two-row (bilinear) vertical scale to packed YUY2 output.
 */
static void RENAME(yuv2yuyv422_2)(SwsContext *c, const int16_t *buf[2],
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
                                  const int16_t *abuf[2], uint8_t *dest,
                                  int dstW, int yalpha, int uvalpha, int y)
{
    const int16_t *buf0 = buf[0], *buf1 = buf[1],
                  *ubuf0 = ubuf[0], *ubuf1 = ubuf[1];

    //Note 8280 == DSTW_OFFSET but the preprocessor can't handle that there :(
    __asm__ volatile(
        "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
        "mov %4, %%"REG_b" \n\t"
        "push %%"REG_BP" \n\t"
        YSCALEYUV2PACKED(%%REGBP, %5)
        WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
        "pop %%"REG_BP" \n\t"
        "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
        :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
           "a" (&c->redDither)
    );
}
||
1056 | |||
/*
 * Single-row YUV->RGB conversion (no vertical interpolation): reads one
 * chroma row (%2) and one luma row (%0) directly, then applies the same
 * coefficient/pack sequence as REAL_YSCALEYUV2RGB_COEFF. Used by the *_1
 * output functions when uvalpha < 2048 (chroma taken from one row only).
 */
#define REAL_YSCALEYUV2RGB1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psraw $4, %%mm3 \n\t" /* uvbuf0[eax] - uvbuf1[eax] >>4*/\
    "psraw $4, %%mm4 \n\t" /* uvbuf0[eax+2048] - uvbuf1[eax+2048] >>4*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1(index, c) REAL_YSCALEYUV2RGB1(index, c)
||
1107 | |||
/*
 * Single-luma-row YUV->RGB conversion that DOES interpolate chrominance:
 * the two chroma rows (%2/%3) are averaged (add + psrlw $5) before the
 * standard coefficient/pack sequence. Used by the *_1 output functions
 * when uvalpha >= 2048.
 */
// do vertical chrominance interpolation
#define REAL_YSCALEYUV2RGB1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $5, %%mm3 \n\t" /*FIXME might overflow*/\
    "psrlw $5, %%mm4 \n\t" /*FIXME might overflow*/\
    "psubw "U_OFFSET"("#c"), %%mm3 \n\t" /* (U-128)8*/\
    "psubw "V_OFFSET"("#c"), %%mm4 \n\t" /* (V-128)8*/\
    "movq %%mm3, %%mm2 \n\t" /* (U-128)8*/\
    "movq %%mm4, %%mm5 \n\t" /* (V-128)8*/\
    "pmulhw "UG_COEFF"("#c"), %%mm3 \n\t"\
    "pmulhw "VG_COEFF"("#c"), %%mm4 \n\t"\
    /* mm2=(U-128)8, mm3=ug, mm4=vg mm5=(V-128)8 */\
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $4, %%mm1 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "psraw $4, %%mm7 \n\t" /* buf0[eax] - buf1[eax] >>4*/\
    "pmulhw "UB_COEFF"("#c"), %%mm2 \n\t"\
    "pmulhw "VR_COEFF"("#c"), %%mm5 \n\t"\
    "psubw "Y_OFFSET"("#c"), %%mm1 \n\t" /* 8(Y-16)*/\
    "psubw "Y_OFFSET"("#c"), %%mm7 \n\t" /* 8(Y-16)*/\
    "pmulhw "Y_COEFF"("#c"), %%mm1 \n\t"\
    "pmulhw "Y_COEFF"("#c"), %%mm7 \n\t"\
    /* mm1= Y1, mm2=ub, mm3=ug, mm4=vg mm5=vr, mm7=Y2 */\
    "paddw %%mm3, %%mm4 \n\t"\
    "movq %%mm2, %%mm0 \n\t"\
    "movq %%mm5, %%mm6 \n\t"\
    "movq %%mm4, %%mm3 \n\t"\
    "punpcklwd %%mm2, %%mm2 \n\t"\
    "punpcklwd %%mm5, %%mm5 \n\t"\
    "punpcklwd %%mm4, %%mm4 \n\t"\
    "paddw %%mm1, %%mm2 \n\t"\
    "paddw %%mm1, %%mm5 \n\t"\
    "paddw %%mm1, %%mm4 \n\t"\
    "punpckhwd %%mm0, %%mm0 \n\t"\
    "punpckhwd %%mm6, %%mm6 \n\t"\
    "punpckhwd %%mm3, %%mm3 \n\t"\
    "paddw %%mm7, %%mm0 \n\t"\
    "paddw %%mm7, %%mm6 \n\t"\
    "paddw %%mm7, %%mm3 \n\t"\
    /* mm0=B1, mm2=B2, mm3=G2, mm4=G1, mm5=R1, mm6=R2 */\
    "packuswb %%mm0, %%mm2 \n\t"\
    "packuswb %%mm6, %%mm5 \n\t"\
    "packuswb %%mm3, %%mm4 \n\t"\

#define YSCALEYUV2RGB1b(index, c) REAL_YSCALEYUV2RGB1b(index, c)
||
1163 | |||
/*
 * Loads 8 alpha samples from abuf0 (%1), scales them to 8 bits and packs
 * them into mm7 for the 32-bit writers.
 */
#define REAL_YSCALEYUV2RGB1_ALPHA(index) \
    "movq (%1, "#index", 2), %%mm7 \n\t" /* abuf0[index ] */\
    "movq 8(%1, "#index", 2), %%mm1 \n\t" /* abuf0[index+4] */\
    "psraw $7, %%mm7 \n\t" /* abuf0[index ] >>7 */\
    "psraw $7, %%mm1 \n\t" /* abuf0[index+4] >>7 */\
    "packuswb %%mm1, %%mm7 \n\t"
#define YSCALEYUV2RGB1_ALPHA(index) REAL_YSCALEYUV2RGB1_ALPHA(index)
||
1171 | |||
1172 | /** |
||
1173 | * YV12 to RGB without scaling or interpolating |
||
1174 | */ |
||
/*
 * Unscaled (single-row) YV12 -> 32-bit BGRX/BGRA.
 * uvalpha < 2048 selects the fast path that reuses the same chroma row
 * twice (shifts chrominance by half a pixel, see comment below); otherwise
 * the two chroma rows are averaged via YSCALEYUV2RGB1b. With an alpha
 * plane, abuf0 rides in the "d" operand (the luma path only uses %0).
 */
static void RENAME(yuv2rgb32_1)(SwsContext *c, const int16_t *buf0,
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t" /* opaque alpha */
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    } else {
        const int16_t *ubuf1 = ubuf[1];
        if (CONFIG_SWSCALE_ALPHA && c->alpPixBuf) {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                YSCALEYUV2RGB1_ALPHA(%%REGBP)
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (abuf0), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        } else {
            __asm__ volatile(
                "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
                "mov %4, %%"REG_b" \n\t"
                "push %%"REG_BP" \n\t"
                YSCALEYUV2RGB1b(%%REGBP, %5)
                "pcmpeqd %%mm7, %%mm7 \n\t" /* opaque alpha */
                WRITEBGR32(%%REGb, 8280(%5), %%REGBP, %%mm2, %%mm4, %%mm5, %%mm7, %%mm0, %%mm1, %%mm3, %%mm6)
                "pop %%"REG_BP" \n\t"
                "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
                :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
                   "a" (&c->redDither)
            );
        }
    }
}
||
1243 | |||
/*
 * Unscaled (single-row) YV12 -> packed 24-bit BGR. Same chroma-path split
 * as yuv2rgb32_1; mm7 is zeroed for WRITEBGR24.
 */
static void RENAME(yuv2bgr24_1)(SwsContext *c, const int16_t *buf0,
                                const int16_t *ubuf[2], const int16_t *vbuf[2],
                                const int16_t *abuf0, uint8_t *dest,
                                int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            WRITEBGR24(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
||
1282 | |||
/*
 * Unscaled (single-row) YV12 -> RGB555, with optional per-channel
 * dithering (DITHER1XBPP). Same chroma-path split as yuv2rgb32_1.
 */
static void RENAME(yuv2rgb555_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB15(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
||
1333 | |||
/*
 * Unscaled (single-row) YV12 -> RGB565; identical to the RGB555 variant
 * except for the final WRITERGB16 packing.
 */
static void RENAME(yuv2rgb565_1)(SwsContext *c, const int16_t *buf0,
                                 const int16_t *ubuf[2], const int16_t *vbuf[2],
                                 const int16_t *abuf0, uint8_t *dest,
                                 int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2RGB1b(%%REGBP, %5)
            "pxor %%mm7, %%mm7 \n\t"
            /* mm2=B, %%mm4=G, %%mm5=R, %%mm7=0 */
#ifdef DITHER1XBPP
            "paddusb "BLUE_DITHER"(%5), %%mm2 \n\t"
            "paddusb "GREEN_DITHER"(%5), %%mm4 \n\t"
            "paddusb "RED_DITHER"(%5), %%mm5 \n\t"
#endif
            WRITERGB16(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
||
1384 | |||
/*
 * Single-row load for the packed-YUV (YUY2) writer: reads one chroma row
 * and one luma row and shifts everything down to 8 bits; no interpolation.
 */
#define REAL_YSCALEYUV2PACKED1(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm3 \n\t" /* uvbuf0[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm4 \n\t" /* uvbuf0[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "psraw $7, %%mm3 \n\t" \
    "psraw $7, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t" \

#define YSCALEYUV2PACKED1(index, c) REAL_YSCALEYUV2PACKED1(index, c)
||
1401 | |||
/*
 * Single-luma-row load for the packed-YUV writer WITH vertical chroma
 * interpolation: the two chroma rows are averaged (add + psrlw $8).
 */
#define REAL_YSCALEYUV2PACKED1b(index, c) \
    "xor "#index", "#index" \n\t"\
    ".p2align 4 \n\t"\
    "1: \n\t"\
    "movq (%2, "#index"), %%mm2 \n\t" /* uvbuf0[eax]*/\
    "movq (%3, "#index"), %%mm3 \n\t" /* uvbuf1[eax]*/\
    "add "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "movq (%2, "#index"), %%mm5 \n\t" /* uvbuf0[eax+2048]*/\
    "movq (%3, "#index"), %%mm4 \n\t" /* uvbuf1[eax+2048]*/\
    "sub "UV_OFF_BYTE"("#c"), "#index" \n\t" \
    "paddw %%mm2, %%mm3 \n\t" /* uvbuf0[eax] + uvbuf1[eax]*/\
    "paddw %%mm5, %%mm4 \n\t" /* uvbuf0[eax+2048] + uvbuf1[eax+2048]*/\
    "psrlw $8, %%mm3 \n\t" \
    "psrlw $8, %%mm4 \n\t" \
    "movq (%0, "#index", 2), %%mm1 \n\t" /*buf0[eax]*/\
    "movq 8(%0, "#index", 2), %%mm7 \n\t" /*buf0[eax]*/\
    "psraw $7, %%mm1 \n\t" \
    "psraw $7, %%mm7 \n\t"
#define YSCALEYUV2PACKED1b(index, c) REAL_YSCALEYUV2PACKED1b(index, c)
||
1421 | |||
/*
 * Unscaled (single-row) YV12 -> packed YUY2. Same chroma-path split as the
 * other *_1 functions: uvalpha < 2048 reuses one chroma row (half-pixel
 * chroma shift, faster); otherwise the two rows are averaged.
 */
static void RENAME(yuv2yuyv422_1)(SwsContext *c, const int16_t *buf0,
                                  const int16_t *ubuf[2], const int16_t *vbuf[2],
                                  const int16_t *abuf0, uint8_t *dest,
                                  int dstW, int uvalpha, int y)
{
    const int16_t *ubuf0 = ubuf[0];
    const int16_t *buf1= buf0; //FIXME needed for RGB1/BGR1

    if (uvalpha < 2048) { // note this is not correct (shifts chrominance by 0.5 pixels) but it is a bit faster
        const int16_t *ubuf1 = ubuf[0];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    } else {
        const int16_t *ubuf1 = ubuf[1];
        __asm__ volatile(
            "mov %%"REG_b", "ESP_OFFSET"(%5) \n\t"
            "mov %4, %%"REG_b" \n\t"
            "push %%"REG_BP" \n\t"
            YSCALEYUV2PACKED1b(%%REGBP, %5)
            WRITEYUY2(%%REGb, 8280(%5), %%REGBP)
            "pop %%"REG_BP" \n\t"
            "mov "ESP_OFFSET"(%5), %%"REG_b" \n\t"
            :: "c" (buf0), "d" (buf1), "S" (ubuf0), "D" (ubuf1), "m" (dest),
               "a" (&c->redDither)
        );
    }
}
||
1458 | |||
1459 | #if COMPILE_TEMPLATE_MMXEXT |
||
1460 | static void RENAME(hyscale_fast)(SwsContext *c, int16_t *dst, |
||
1461 | int dstWidth, const uint8_t *src, |
||
1462 | int srcW, int xInc) |
||
1463 | { |
||
1464 | int32_t *filterPos = c->hLumFilterPos; |
||
1465 | int16_t *filter = c->hLumFilter; |
||
1466 | void *mmxextFilterCode = c->lumMmxextFilterCode; |
||
1467 | int i; |
||
1468 | #if defined(PIC) |
||
1469 | uint64_t ebxsave; |
||
1470 | #endif |
||
1471 | #if ARCH_X86_64 |
||
1472 | uint64_t retsave; |
||
1473 | #endif |
||
1474 | |||
1475 | __asm__ volatile( |
||
1476 | #if defined(PIC) |
||
1477 | "mov %%"REG_b", %5 \n\t" |
||
1478 | #if ARCH_X86_64 |
||
1479 | "mov -8(%%rsp), %%"REG_a" \n\t" |
||
1480 | "mov %%"REG_a", %6 \n\t" |
||
1481 | #endif |
||
1482 | #else |
||
1483 | #if ARCH_X86_64 |
||
1484 | "mov -8(%%rsp), %%"REG_a" \n\t" |
||
1485 | "mov %%"REG_a", %5 \n\t" |
||
1486 | #endif |
||
1487 | #endif |
||
1488 | "pxor %%mm7, %%mm7 \n\t" |
||
1489 | "mov %0, %%"REG_c" \n\t" |
||
1490 | "mov %1, %%"REG_D" \n\t" |
||
1491 | "mov %2, %%"REG_d" \n\t" |
||
1492 | "mov %3, %%"REG_b" \n\t" |
||
1493 | "xor %%"REG_a", %%"REG_a" \n\t" // i |
||
1494 | PREFETCH" (%%"REG_c") \n\t" |
||
1495 | PREFETCH" 32(%%"REG_c") \n\t" |
||
1496 | PREFETCH" 64(%%"REG_c") \n\t" |
||
1497 | |||
1498 | #if ARCH_X86_64 |
||
1499 | #define CALL_MMXEXT_FILTER_CODE \ |
||
1500 | "movl (%%"REG_b"), %%esi \n\t"\ |
||
1501 | "call *%4 \n\t"\ |
||
1502 | "movl (%%"REG_b", %%"REG_a"), %%esi \n\t"\ |
||
1503 | "add %%"REG_S", %%"REG_c" \n\t"\ |
||
1504 | "add %%"REG_a", %%"REG_D" \n\t"\ |
||
1505 | "xor %%"REG_a", %%"REG_a" \n\t"\ |
||
1506 | |||
1507 | #else |
||
1508 | #define CALL_MMXEXT_FILTER_CODE \ |
||
1509 | "movl (%%"REG_b"), %%esi \n\t"\ |
||
1510 | "call *%4 \n\t"\ |
||
1511 | "addl (%%"REG_b", %%"REG_a"), %%"REG_c" \n\t"\ |
||
1512 | "add %%"REG_a", %%"REG_D" \n\t"\ |
||
1513 | "xor %%"REG_a", %%"REG_a" \n\t"\ |
||
1514 | |||
1515 | #endif /* ARCH_X86_64 */ |
||
1516 | |||
1517 | CALL_MMXEXT_FILTER_CODE |
||
1518 | CALL_MMXEXT_FILTER_CODE |
||
1519 | CALL_MMXEXT_FILTER_CODE |
||
1520 | CALL_MMXEXT_FILTER_CODE |
||
1521 | CALL_MMXEXT_FILTER_CODE |
||
1522 | CALL_MMXEXT_FILTER_CODE |
||
1523 | CALL_MMXEXT_FILTER_CODE |
||
1524 | CALL_MMXEXT_FILTER_CODE |
||
1525 | |||
1526 | #if defined(PIC) |
||
1527 | "mov %5, %%"REG_b" \n\t" |
||
1528 | #if ARCH_X86_64 |
||
1529 | "mov %6, %%"REG_a" \n\t" |
||
1530 | "mov %%"REG_a", -8(%%rsp) \n\t" |
||
1531 | #endif |
||
1532 | #else |
||
1533 | #if ARCH_X86_64 |
||
1534 | "mov %5, %%"REG_a" \n\t" |
||
1535 | "mov %%"REG_a", -8(%%rsp) \n\t" |
||
1536 | #endif |
||
1537 | #endif |
||
1538 | :: "m" (src), "m" (dst), "m" (filter), "m" (filterPos), |
||
1539 | "m" (mmxextFilterCode) |
||
1540 | #if defined(PIC) |
||
1541 | ,"m" (ebxsave) |
||
1542 | #endif |
||
1543 | #if ARCH_X86_64 |
||
1544 | ,"m"(retsave) |
||
1545 | #endif |
||
1546 | : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D |
||
1547 | #if !defined(PIC) |
||
1548 | ,"%"REG_b |
||
1549 | #endif |
||
1550 | ); |
||
1551 | |||
1552 | for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) |
||
1553 | dst[i] = src[srcW-1]*128; |
||
1554 | } |
||
1555 | |||
/**
 * Fast bilinear horizontal chroma scaling using the run-time generated
 * MMXEXT filter code in c->chrMmxextFilterCode.
 *
 * Processes both chroma planes in one asm statement: the first four
 * CALL_MMXEXT_FILTER_CODE chunks scale src1 into dst1, then the src/dst
 * registers are reloaded (operands %5/%6) and four more chunks scale
 * src2 into dst2. CALL_MMXEXT_FILTER_CODE is the macro defined inside
 * RENAME(hyscale_fast)'s asm block earlier in this file.
 *
 * @param c        scaler context providing filter tables and generated code
 * @param dst1     output for the first chroma plane, 15-bit samples
 * @param dst2     output for the second chroma plane, 15-bit samples
 * @param dstWidth number of output samples per plane
 * @param src1     input 8-bit line of the first chroma plane
 * @param src2     input 8-bit line of the second chroma plane
 * @param srcW     input width in pixels
 * @param xInc     16.16 fixed-point horizontal increment
 */
static void RENAME(hcscale_fast)(SwsContext *c, int16_t *dst1, int16_t *dst2,
                                 int dstWidth, const uint8_t *src1,
                                 const uint8_t *src2, int srcW, int xInc)
{
    int32_t *filterPos = c->hChrFilterPos;
    int16_t *filter = c->hChrFilter;
    void *mmxextFilterCode = c->chrMmxextFilterCode;
    int i;
#if defined(PIC)
    /* aligned 64-bit spill slot for %ebx (the PIC register) */
    DECLARE_ALIGNED(8, uint64_t, ebxsave);
#endif
#if ARCH_X86_64
    /* spill slot for the value at -8(%rsp), clobbered by the generated code */
    DECLARE_ALIGNED(8, uint64_t, retsave);
#endif

    __asm__ volatile(
#if defined(PIC)
        /* preserve %ebx; under PIC it holds the GOT pointer and must survive */
        "mov %%"REG_b", %7 \n\t"
#if ARCH_X86_64
        "mov -8(%%rsp), %%"REG_a" \n\t"
        "mov %%"REG_a", %8 \n\t"
#endif
#else
#if ARCH_X86_64
        "mov -8(%%rsp), %%"REG_a" \n\t"
        "mov %%"REG_a", %7 \n\t"
#endif
#endif
        "pxor %%mm7, %%mm7 \n\t"
        /* load src1/dst1/filter/filterPos into the registers the generated
         * code expects, and zero the chunk counter */
        "mov %0, %%"REG_c" \n\t"
        "mov %1, %%"REG_D" \n\t"
        "mov %2, %%"REG_d" \n\t"
        "mov %3, %%"REG_b" \n\t"
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"

        /* first chroma plane: four chunks of generated filter code */
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        /* switch to the second plane: reload src/dst pointers */
        "xor %%"REG_a", %%"REG_a" \n\t" // i
        "mov %5, %%"REG_c" \n\t" // src
        "mov %6, %%"REG_D" \n\t" // buf2
        PREFETCH" (%%"REG_c") \n\t"
        PREFETCH" 32(%%"REG_c") \n\t"
        PREFETCH" 64(%%"REG_c") \n\t"

        /* second chroma plane: four more chunks */
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE
        CALL_MMXEXT_FILTER_CODE

#if defined(PIC)
        "mov %7, %%"REG_b" \n\t"
#if ARCH_X86_64
        "mov %8, %%"REG_a" \n\t"
        "mov %%"REG_a", -8(%%rsp) \n\t"
#endif
#else
#if ARCH_X86_64
        "mov %7, %%"REG_a" \n\t"
        "mov %%"REG_a", -8(%%rsp) \n\t"
#endif
#endif
        :: "m" (src1), "m" (dst1), "m" (filter), "m" (filterPos),
           "m" (mmxextFilterCode), "m" (src2), "m"(dst2)
#if defined(PIC)
          ,"m" (ebxsave)
#endif
#if ARCH_X86_64
          ,"m"(retsave)
#endif
        : "%"REG_a, "%"REG_c, "%"REG_d, "%"REG_S, "%"REG_D
#if !defined(PIC)
         ,"%"REG_b
#endif
    );

    /* Edge replication for both planes: replicate the last input pixel,
     * scaled by 128 to the 15-bit intermediate format. */
    for (i=dstWidth-1; (i*xInc)>>16 >=srcW-1; i--) {
        dst1[i] = src1[srcW-1]*128;
        dst2[i] = src2[srcW-1]*128;
    }
}
||
1641 | #endif /* COMPILE_TEMPLATE_MMXEXT */ |
||
1642 | |||
1643 | static av_cold void RENAME(sws_init_swscale)(SwsContext *c) |
||
1644 | { |
||
1645 | enum AVPixelFormat dstFormat = c->dstFormat; |
||
1646 | |||
1647 | c->use_mmx_vfilter= 0; |
||
1648 | if (!is16BPS(dstFormat) && !is9_OR_10BPS(dstFormat) && dstFormat != AV_PIX_FMT_NV12 |
||
1649 | && dstFormat != AV_PIX_FMT_NV21 && !(c->flags & SWS_BITEXACT)) { |
||
1650 | if (c->flags & SWS_ACCURATE_RND) { |
||
1651 | if (!(c->flags & SWS_FULL_CHR_H_INT)) { |
||
1652 | switch (c->dstFormat) { |
||
1653 | case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X_ar); break; |
||
1654 | case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X_ar); break; |
||
1655 | case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X_ar); break; |
||
1656 | case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X_ar); break; |
||
1657 | case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X_ar); break; |
||
1658 | default: break; |
||
1659 | } |
||
1660 | } |
||
1661 | } else { |
||
1662 | c->use_mmx_vfilter= 1; |
||
1663 | c->yuv2planeX = RENAME(yuv2yuvX ); |
||
1664 | if (!(c->flags & SWS_FULL_CHR_H_INT)) { |
||
1665 | switch (c->dstFormat) { |
||
1666 | case AV_PIX_FMT_RGB32: c->yuv2packedX = RENAME(yuv2rgb32_X); break; |
||
1667 | case AV_PIX_FMT_BGR24: c->yuv2packedX = RENAME(yuv2bgr24_X); break; |
||
1668 | case AV_PIX_FMT_RGB555: c->yuv2packedX = RENAME(yuv2rgb555_X); break; |
||
1669 | case AV_PIX_FMT_RGB565: c->yuv2packedX = RENAME(yuv2rgb565_X); break; |
||
1670 | case AV_PIX_FMT_YUYV422: c->yuv2packedX = RENAME(yuv2yuyv422_X); break; |
||
1671 | default: break; |
||
1672 | } |
||
1673 | } |
||
1674 | } |
||
1675 | if (!(c->flags & SWS_FULL_CHR_H_INT)) { |
||
1676 | switch (c->dstFormat) { |
||
1677 | case AV_PIX_FMT_RGB32: |
||
1678 | c->yuv2packed1 = RENAME(yuv2rgb32_1); |
||
1679 | c->yuv2packed2 = RENAME(yuv2rgb32_2); |
||
1680 | break; |
||
1681 | case AV_PIX_FMT_BGR24: |
||
1682 | c->yuv2packed1 = RENAME(yuv2bgr24_1); |
||
1683 | c->yuv2packed2 = RENAME(yuv2bgr24_2); |
||
1684 | break; |
||
1685 | case AV_PIX_FMT_RGB555: |
||
1686 | c->yuv2packed1 = RENAME(yuv2rgb555_1); |
||
1687 | c->yuv2packed2 = RENAME(yuv2rgb555_2); |
||
1688 | break; |
||
1689 | case AV_PIX_FMT_RGB565: |
||
1690 | c->yuv2packed1 = RENAME(yuv2rgb565_1); |
||
1691 | c->yuv2packed2 = RENAME(yuv2rgb565_2); |
||
1692 | break; |
||
1693 | case AV_PIX_FMT_YUYV422: |
||
1694 | c->yuv2packed1 = RENAME(yuv2yuyv422_1); |
||
1695 | c->yuv2packed2 = RENAME(yuv2yuyv422_2); |
||
1696 | break; |
||
1697 | default: |
||
1698 | break; |
||
1699 | } |
||
1700 | } |
||
1701 | } |
||
1702 | |||
1703 | if (c->srcBpc == 8 && c->dstBpc <= 14) { |
||
1704 | // Use the new MMX scaler if the MMXEXT one can't be used (it is faster than the x86 ASM one). |
||
1705 | #if COMPILE_TEMPLATE_MMXEXT |
||
1706 | if (c->flags & SWS_FAST_BILINEAR && c->canMMXEXTBeUsed) { |
||
1707 | c->hyscale_fast = RENAME(hyscale_fast); |
||
1708 | c->hcscale_fast = RENAME(hcscale_fast); |
||
1709 | } else { |
||
1710 | #endif /* COMPILE_TEMPLATE_MMXEXT */ |
||
1711 | c->hyscale_fast = NULL; |
||
1712 | c->hcscale_fast = NULL; |
||
1713 | #if COMPILE_TEMPLATE_MMXEXT |
||
1714 | } |
||
1715 | #endif /* COMPILE_TEMPLATE_MMXEXT */ |
||
1716 | } |
||
1717 | }=>>>>>> |