Rev 1891 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
/*
 * Copyright © 2004, 2005 Red Hat, Inc.
 * Copyright © 2004 Nicholas Miell
 * Copyright © 2005 Trolltech AS
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Søren Sandmann (sandmann@redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
 *
 * Based on work by Owen Taylor
 */
||
31 | |||
32 | #ifdef HAVE_CONFIG_H |
||
33 | #include |
||
34 | #endif |
||
35 | |||
3931 | Serge | 36 | #if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI |
1891 | serge | 37 | |
3931 | Serge | 38 | #ifdef USE_LOONGSON_MMI |
39 | #include |
||
40 | #else |
||
1891 | serge | 41 | #include |
3931 | Serge | 42 | #endif |
1891 | serge | 43 | #include "pixman-private.h" |
44 | #include "pixman-combine32.h" |
||
3931 | Serge | 45 | #include "pixman-inlines.h" |
1891 | serge | 46 | |
47 | #ifdef VERBOSE |
||
48 | #define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__) |
||
49 | #else |
||
50 | #define CHECKPOINT() |
||
51 | #endif |
||
52 | |||
#if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
/* Empty the multimedia state.  ARM's mmintrin.h (gcc < 4.8) does not
 * provide this, so supply a no-op stand-in: iwMMXt registers do not
 * alias the FPU state, so there is nothing to clear. */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{
}
#endif

#ifdef USE_X86_MMX
# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
#  include <xmmintrin.h>
# else
/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
 * instructions to be generated that we don't want. Just duplicate the
 * functions we want to use.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
    int ret;

    asm ("pmovmskb %1, %0\n\t"
        : "=r" (ret)
        : "y" (__A)
    );

    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
    asm ("pmulhuw %1, %0\n\t"
        : "+y" (__A)
        : "y" (__B)
    );
    return __A;
}

# ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int8_t const __N)
{
    __m64 ret;

    asm ("pshufw %2, %1, %0\n\t"
        : "=y" (ret)
        : "y" (__A), "K" (__N)
    );

    return ret;
}
# else
/* Without optimization the "K" (8-bit immediate) constraint cannot be
 * proven from an inline function argument, so use a statement-expression
 * macro instead. */
# define _mm_shuffle_pi16(A, N)                                 \
    ({                                                          \
        __m64 ret;                                              \
                                                                \
        asm ("pshufw %2, %1, %0\n\t"                            \
            : "=y" (ret)                                        \
            : "y" (A), "K" ((const int8_t)N)                    \
        );                                                      \
                                                                \
        ret;                                                    \
    })
# endif
# endif
#endif

#ifndef _MSC_VER
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
#endif

/* Notes about writing mmx code
 *
 * give memory operands as the second operand. If you give it as the
 * first, gcc will first load it into a register, then use that
 * register
 *
 * ie. use
 *
 *         _mm_mullo_pi16 (x, mmx_constant);
 *
 * not
 *
 *         _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies. i.e. when you need a value, try
 * to calculate it from a value that was calculated as early as
 * possible.
 */

/* --------------- MMX primitives ------------------------------------- */

3931 | Serge | 147 | /* If __m64 is defined as a struct or union, then define M64_MEMBER to be |
148 | * the name of the member used to access the data. |
||
149 | * If __m64 requires using mm_cvt* intrinsics functions to convert between |
||
150 | * uint64_t and __m64 values, then define USE_CVT_INTRINSICS. |
||
151 | * If __m64 and uint64_t values can just be cast to each other directly, |
||
152 | * then define USE_M64_CASTS. |
||
153 | * If __m64 is a double datatype, then define USE_M64_DOUBLE. |
||
154 | */ |
||
155 | #ifdef _MSC_VER |
||
156 | # define M64_MEMBER m64_u64 |
||
157 | #elif defined(__ICC) |
||
158 | # define USE_CVT_INTRINSICS |
||
159 | #elif defined(USE_LOONGSON_MMI) |
||
160 | # define USE_M64_DOUBLE |
||
161 | #elif defined(__GNUC__) |
||
162 | # define USE_M64_CASTS |
||
163 | #elif defined(__SUNPRO_C) |
||
164 | # if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__) |
||
165 | /* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__) |
||
166 | * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__ |
||
167 | * is defined. If it is used, then the mm_cvt* intrinsics must be used. |
||
168 | */ |
||
169 | # define USE_CVT_INTRINSICS |
||
170 | # else |
||
171 | /* For Studio 12.2 or older, or when __attribute__(__vector_size__) is |
||
172 | * disabled, __m64 is defined as a struct containing "unsigned long long l_". |
||
173 | */ |
||
174 | # define M64_MEMBER l_ |
||
175 | # endif |
||
176 | #endif |
||
177 | |||
178 | #if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE) |
||
1891 | serge | 179 | typedef uint64_t mmxdatafield; |
180 | #else |
||
181 | typedef __m64 mmxdatafield; |
||
182 | #endif |
||
183 | |||
184 | typedef struct |
||
185 | { |
||
186 | mmxdatafield mmx_4x00ff; |
||
187 | mmxdatafield mmx_4x0080; |
||
188 | mmxdatafield mmx_565_rgb; |
||
189 | mmxdatafield mmx_565_unpack_multiplier; |
||
3931 | Serge | 190 | mmxdatafield mmx_565_pack_multiplier; |
1891 | serge | 191 | mmxdatafield mmx_565_r; |
192 | mmxdatafield mmx_565_g; |
||
193 | mmxdatafield mmx_565_b; |
||
3931 | Serge | 194 | mmxdatafield mmx_packed_565_rb; |
195 | mmxdatafield mmx_packed_565_g; |
||
196 | mmxdatafield mmx_expand_565_g; |
||
197 | mmxdatafield mmx_expand_565_b; |
||
198 | mmxdatafield mmx_expand_565_r; |
||
199 | #ifndef USE_LOONGSON_MMI |
||
1891 | serge | 200 | mmxdatafield mmx_mask_0; |
201 | mmxdatafield mmx_mask_1; |
||
202 | mmxdatafield mmx_mask_2; |
||
203 | mmxdatafield mmx_mask_3; |
||
3931 | Serge | 204 | #endif |
1891 | serge | 205 | mmxdatafield mmx_full_alpha; |
3931 | Serge | 206 | mmxdatafield mmx_4x0101; |
207 | mmxdatafield mmx_ff000000; |
||
1891 | serge | 208 | } mmx_data_t; |
209 | |||
210 | #if defined(_MSC_VER) |
||
211 | # define MMXDATA_INIT(field, val) { val ## UI64 } |
||
212 | #elif defined(M64_MEMBER) /* __m64 is a struct, not an integral type */ |
||
213 | # define MMXDATA_INIT(field, val) field = { val ## ULL } |
||
3931 | Serge | 214 | #else /* mmxdatafield is an integral type */ |
1891 | serge | 215 | # define MMXDATA_INIT(field, val) field = val ## ULL |
216 | #endif |
||
217 | |||
218 | static const mmx_data_t c = |
||
219 | { |
||
220 | MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff), |
||
221 | MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080), |
||
222 | MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f), |
||
223 | MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840), |
||
3931 | Serge | 224 | MMXDATA_INIT (.mmx_565_pack_multiplier, 0x2000000420000004), |
1891 | serge | 225 | MMXDATA_INIT (.mmx_565_r, 0x000000f800000000), |
226 | MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000), |
||
227 | MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8), |
||
3931 | Serge | 228 | MMXDATA_INIT (.mmx_packed_565_rb, 0x00f800f800f800f8), |
229 | MMXDATA_INIT (.mmx_packed_565_g, 0x0000fc000000fc00), |
||
230 | MMXDATA_INIT (.mmx_expand_565_g, 0x07e007e007e007e0), |
||
231 | MMXDATA_INIT (.mmx_expand_565_b, 0x001f001f001f001f), |
||
232 | MMXDATA_INIT (.mmx_expand_565_r, 0xf800f800f800f800), |
||
233 | #ifndef USE_LOONGSON_MMI |
||
1891 | serge | 234 | MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000), |
235 | MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff), |
||
236 | MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff), |
||
237 | MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff), |
||
3931 | Serge | 238 | #endif |
1891 | serge | 239 | MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000), |
3931 | Serge | 240 | MMXDATA_INIT (.mmx_4x0101, 0x0101010101010101), |
241 | MMXDATA_INIT (.mmx_ff000000, 0xff000000ff000000), |
||
1891 | serge | 242 | }; |
243 | |||
/* MC(x): fetch constant table entry c.mmx_<x> as an __m64, using whichever
 * conversion mechanism the compiler's __m64 definition requires. */
#ifdef USE_CVT_INTRINSICS
#    define MC(x) to_m64 (c.mmx_ ## x)
#elif defined(USE_M64_CASTS)
#    define MC(x) ((__m64)c.mmx_ ## x)
#elif defined(USE_M64_DOUBLE)
#    define MC(x) (*(__m64 *)&c.mmx_ ## x)
#else
#    define MC(x) c.mmx_ ## x
#endif

254 | static force_inline __m64 |
||
255 | to_m64 (uint64_t x) |
||
256 | { |
||
3931 | Serge | 257 | #ifdef USE_CVT_INTRINSICS |
1891 | serge | 258 | return _mm_cvtsi64_m64 (x); |
259 | #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ |
||
260 | __m64 res; |
||
261 | |||
262 | res.M64_MEMBER = x; |
||
263 | return res; |
||
3931 | Serge | 264 | #elif defined USE_M64_DOUBLE |
265 | return *(__m64 *)&x; |
||
266 | #else /* USE_M64_CASTS */ |
||
1891 | serge | 267 | return (__m64)x; |
268 | #endif |
||
269 | } |
||
270 | |||
271 | static force_inline uint64_t |
||
272 | to_uint64 (__m64 x) |
||
273 | { |
||
3931 | Serge | 274 | #ifdef USE_CVT_INTRINSICS |
1891 | serge | 275 | return _mm_cvtm64_si64 (x); |
276 | #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ |
||
277 | uint64_t res = x.M64_MEMBER; |
||
278 | return res; |
||
3931 | Serge | 279 | #elif defined USE_M64_DOUBLE |
280 | return *(uint64_t *)&x; |
||
281 | #else /* USE_M64_CASTS */ |
||
1891 | serge | 282 | return (uint64_t)x; |
283 | #endif |
||
284 | } |
||
285 | |||
286 | static force_inline __m64 |
||
287 | shift (__m64 v, |
||
288 | int s) |
||
289 | { |
||
290 | if (s > 0) |
||
291 | return _mm_slli_si64 (v, s); |
||
292 | else if (s < 0) |
||
293 | return _mm_srli_si64 (v, -s); |
||
294 | else |
||
295 | return v; |
||
296 | } |
||
297 | |||
298 | static force_inline __m64 |
||
299 | negate (__m64 mask) |
||
300 | { |
||
301 | return _mm_xor_si64 (mask, MC (4x00ff)); |
||
302 | } |
||
303 | |||
304 | static force_inline __m64 |
||
305 | pix_multiply (__m64 a, __m64 b) |
||
306 | { |
||
307 | __m64 res; |
||
308 | |||
309 | res = _mm_mullo_pi16 (a, b); |
||
310 | res = _mm_adds_pu16 (res, MC (4x0080)); |
||
3931 | Serge | 311 | res = _mm_mulhi_pu16 (res, MC (4x0101)); |
1891 | serge | 312 | |
313 | return res; |
||
314 | } |
||
315 | |||
316 | static force_inline __m64 |
||
317 | pix_add (__m64 a, __m64 b) |
||
318 | { |
||
319 | return _mm_adds_pu8 (a, b); |
||
320 | } |
||
321 | |||
322 | static force_inline __m64 |
||
323 | expand_alpha (__m64 pixel) |
||
324 | { |
||
3931 | Serge | 325 | return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3)); |
1891 | serge | 326 | } |
327 | |||
328 | static force_inline __m64 |
||
329 | expand_alpha_rev (__m64 pixel) |
||
330 | { |
||
3931 | Serge | 331 | return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0)); |
1891 | serge | 332 | } |
333 | |||
334 | static force_inline __m64 |
||
335 | invert_colors (__m64 pixel) |
||
336 | { |
||
3931 | Serge | 337 | return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2)); |
1891 | serge | 338 | } |
339 | |||
340 | static force_inline __m64 |
||
341 | over (__m64 src, |
||
342 | __m64 srca, |
||
343 | __m64 dest) |
||
344 | { |
||
345 | return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca))); |
||
346 | } |
||
347 | |||
348 | static force_inline __m64 |
||
349 | over_rev_non_pre (__m64 src, __m64 dest) |
||
350 | { |
||
351 | __m64 srca = expand_alpha (src); |
||
352 | __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha)); |
||
353 | |||
354 | return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest); |
||
355 | } |
||
356 | |||
357 | static force_inline __m64 |
||
358 | in (__m64 src, __m64 mask) |
||
359 | { |
||
360 | return pix_multiply (src, mask); |
||
361 | } |
||
362 | |||
363 | #ifndef _MSC_VER |
||
364 | static force_inline __m64 |
||
365 | in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest) |
||
366 | { |
||
367 | return over (in (src, mask), pix_multiply (srca, mask), dest); |
||
368 | } |
||
369 | |||
370 | #else |
||
371 | |||
372 | #define in_over(src, srca, mask, dest) \ |
||
373 | over (in (src, mask), pix_multiply (srca, mask), dest) |
||
374 | |||
375 | #endif |
||
376 | |||
3931 | Serge | 377 | /* Elemental unaligned loads */ |
378 | |||
379 | static force_inline __m64 ldq_u(__m64 *p) |
||
380 | { |
||
381 | #ifdef USE_X86_MMX |
||
382 | /* x86's alignment restrictions are very relaxed. */ |
||
383 | return *(__m64 *)p; |
||
384 | #elif defined USE_ARM_IWMMXT |
||
385 | int align = (uintptr_t)p & 7; |
||
386 | __m64 *aligned_p; |
||
387 | if (align == 0) |
||
388 | return *p; |
||
389 | aligned_p = (__m64 *)((uintptr_t)p & ~7); |
||
390 | return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align); |
||
391 | #else |
||
392 | struct __una_u64 { __m64 x __attribute__((packed)); }; |
||
393 | const struct __una_u64 *ptr = (const struct __una_u64 *) p; |
||
394 | return (__m64) ptr->x; |
||
395 | #endif |
||
396 | } |
||
397 | |||
398 | static force_inline uint32_t ldl_u(const uint32_t *p) |
||
399 | { |
||
400 | #ifdef USE_X86_MMX |
||
401 | /* x86's alignment restrictions are very relaxed. */ |
||
402 | return *p; |
||
403 | #else |
||
404 | struct __una_u32 { uint32_t x __attribute__((packed)); }; |
||
405 | const struct __una_u32 *ptr = (const struct __una_u32 *) p; |
||
406 | return ptr->x; |
||
407 | #endif |
||
408 | } |
||
409 | |||
1891 | serge | 410 | static force_inline __m64 |
3931 | Serge | 411 | load (const uint32_t *v) |
1891 | serge | 412 | { |
3931 | Serge | 413 | #ifdef USE_LOONGSON_MMI |
414 | __m64 ret; |
||
415 | asm ("lwc1 %0, %1\n\t" |
||
416 | : "=f" (ret) |
||
417 | : "m" (*v) |
||
418 | ); |
||
419 | return ret; |
||
420 | #else |
||
421 | return _mm_cvtsi32_si64 (*v); |
||
422 | #endif |
||
1891 | serge | 423 | } |
424 | |||
425 | static force_inline __m64 |
||
3931 | Serge | 426 | load8888 (const uint32_t *v) |
427 | { |
||
428 | #ifdef USE_LOONGSON_MMI |
||
429 | return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ()); |
||
430 | #else |
||
431 | return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ()); |
||
432 | #endif |
||
433 | } |
||
434 | |||
435 | static force_inline __m64 |
||
436 | load8888u (const uint32_t *v) |
||
437 | { |
||
438 | uint32_t l = ldl_u (v); |
||
439 | return load8888 (&l); |
||
440 | } |
||
441 | |||
442 | static force_inline __m64 |
||
1891 | serge | 443 | pack8888 (__m64 lo, __m64 hi) |
444 | { |
||
445 | return _mm_packs_pu16 (lo, hi); |
||
446 | } |
||
447 | |||
3931 | Serge | 448 | static force_inline void |
449 | store (uint32_t *dest, __m64 v) |
||
1891 | serge | 450 | { |
3931 | Serge | 451 | #ifdef USE_LOONGSON_MMI |
452 | asm ("swc1 %1, %0\n\t" |
||
453 | : "=m" (*dest) |
||
454 | : "f" (v) |
||
455 | : "memory" |
||
456 | ); |
||
457 | #else |
||
458 | *dest = _mm_cvtsi64_si32 (v); |
||
459 | #endif |
||
1891 | serge | 460 | } |
461 | |||
3931 | Serge | 462 | static force_inline void |
463 | store8888 (uint32_t *dest, __m64 v) |
||
464 | { |
||
465 | v = pack8888 (v, _mm_setzero_si64 ()); |
||
466 | store (dest, v); |
||
467 | } |
||
468 | |||
469 | static force_inline pixman_bool_t |
||
470 | is_equal (__m64 a, __m64 b) |
||
471 | { |
||
472 | #ifdef USE_LOONGSON_MMI |
||
473 | /* __m64 is double, we can compare directly. */ |
||
474 | return a == b; |
||
475 | #else |
||
476 | return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff; |
||
477 | #endif |
||
478 | } |
||
479 | |||
480 | static force_inline pixman_bool_t |
||
481 | is_opaque (__m64 v) |
||
482 | { |
||
483 | #ifdef USE_LOONGSON_MMI |
||
484 | return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha)); |
||
485 | #else |
||
486 | __m64 ffs = _mm_cmpeq_pi8 (v, v); |
||
487 | return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40); |
||
488 | #endif |
||
489 | } |
||
490 | |||
491 | static force_inline pixman_bool_t |
||
492 | is_zero (__m64 v) |
||
493 | { |
||
494 | return is_equal (v, _mm_setzero_si64 ()); |
||
495 | } |
||
496 | |||
1891 | serge | 497 | /* Expand 16 bits positioned at @pos (0-3) of a mmx register into |
498 | * |
||
499 | * 00RR00GG00BB |
||
500 | * |
||
501 | * --- Expanding 565 in the low word --- |
||
502 | * |
||
503 | * m = (m << (32 - 3)) | (m << (16 - 5)) | m; |
||
504 | * m = m & (01f0003f001f); |
||
505 | * m = m * (008404100840); |
||
506 | * m = m >> 8; |
||
507 | * |
||
508 | * Note the trick here - the top word is shifted by another nibble to |
||
509 | * avoid it bumping into the middle word |
||
510 | */ |
||
511 | static force_inline __m64 |
||
512 | expand565 (__m64 pixel, int pos) |
||
513 | { |
||
514 | __m64 p = pixel; |
||
515 | __m64 t1, t2; |
||
516 | |||
517 | /* move pixel to low 16 bit and zero the rest */ |
||
3931 | Serge | 518 | #ifdef USE_LOONGSON_MMI |
519 | p = loongson_extract_pi16 (p, pos); |
||
520 | #else |
||
1891 | serge | 521 | p = shift (shift (p, (3 - pos) * 16), -48); |
3931 | Serge | 522 | #endif |
1891 | serge | 523 | |
524 | t1 = shift (p, 36 - 11); |
||
525 | t2 = shift (p, 16 - 5); |
||
526 | |||
527 | p = _mm_or_si64 (t1, p); |
||
528 | p = _mm_or_si64 (t2, p); |
||
529 | p = _mm_and_si64 (p, MC (565_rgb)); |
||
530 | |||
531 | pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier)); |
||
532 | return _mm_srli_pi16 (pixel, 8); |
||
533 | } |
||
534 | |||
3931 | Serge | 535 | /* Expand 4 16 bit pixels in an mmx register into two mmx registers of |
536 | * |
||
537 | * AARRGGBBRRGGBB |
||
538 | */ |
||
539 | static force_inline void |
||
540 | expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha) |
||
541 | { |
||
542 | __m64 t0, t1, alpha = _mm_setzero_si64 (); |
||
543 | __m64 r = _mm_and_si64 (vin, MC (expand_565_r)); |
||
544 | __m64 g = _mm_and_si64 (vin, MC (expand_565_g)); |
||
545 | __m64 b = _mm_and_si64 (vin, MC (expand_565_b)); |
||
546 | if (full_alpha) |
||
547 | alpha = _mm_cmpeq_pi32 (alpha, alpha); |
||
548 | |||
549 | /* Replicate high bits into empty low bits. */ |
||
550 | r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13)); |
||
551 | g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9)); |
||
552 | b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2)); |
||
553 | |||
554 | r = _mm_packs_pu16 (r, _mm_setzero_si64 ()); /* 00 00 00 00 R3 R2 R1 R0 */ |
||
555 | g = _mm_packs_pu16 (g, _mm_setzero_si64 ()); /* 00 00 00 00 G3 G2 G1 G0 */ |
||
556 | b = _mm_packs_pu16 (b, _mm_setzero_si64 ()); /* 00 00 00 00 B3 B2 B1 B0 */ |
||
557 | |||
558 | t1 = _mm_unpacklo_pi8 (r, alpha); /* A3 R3 A2 R2 A1 R1 A0 R0 */ |
||
559 | t0 = _mm_unpacklo_pi8 (b, g); /* G3 B3 G2 B2 G1 B1 G0 B0 */ |
||
560 | |||
561 | *vout0 = _mm_unpacklo_pi16 (t0, t1); /* A1 R1 G1 B1 A0 R0 G0 B0 */ |
||
562 | *vout1 = _mm_unpackhi_pi16 (t0, t1); /* A3 R3 G3 B3 A2 R2 G2 B2 */ |
||
563 | } |
||
564 | |||
1891 | serge | 565 | static force_inline __m64 |
566 | expand8888 (__m64 in, int pos) |
||
567 | { |
||
568 | if (pos == 0) |
||
569 | return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ()); |
||
570 | else |
||
571 | return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ()); |
||
572 | } |
||
573 | |||
574 | static force_inline __m64 |
||
575 | expandx888 (__m64 in, int pos) |
||
576 | { |
||
577 | return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha)); |
||
578 | } |
||
579 | |||
3931 | Serge | 580 | static force_inline void |
581 | expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha) |
||
582 | { |
||
583 | __m64 v0, v1; |
||
584 | expand_4xpacked565 (vin, &v0, &v1, full_alpha); |
||
585 | *vout0 = expand8888 (v0, 0); |
||
586 | *vout1 = expand8888 (v0, 1); |
||
587 | *vout2 = expand8888 (v1, 0); |
||
588 | *vout3 = expand8888 (v1, 1); |
||
589 | } |
||
590 | |||
1891 | serge | 591 | static force_inline __m64 |
592 | pack_565 (__m64 pixel, __m64 target, int pos) |
||
593 | { |
||
594 | __m64 p = pixel; |
||
595 | __m64 t = target; |
||
596 | __m64 r, g, b; |
||
597 | |||
598 | r = _mm_and_si64 (p, MC (565_r)); |
||
599 | g = _mm_and_si64 (p, MC (565_g)); |
||
600 | b = _mm_and_si64 (p, MC (565_b)); |
||
601 | |||
3931 | Serge | 602 | #ifdef USE_LOONGSON_MMI |
603 | r = shift (r, -(32 - 8)); |
||
604 | g = shift (g, -(16 - 3)); |
||
605 | b = shift (b, -(0 + 3)); |
||
606 | |||
607 | p = _mm_or_si64 (r, g); |
||
608 | p = _mm_or_si64 (p, b); |
||
609 | return loongson_insert_pi16 (t, p, pos); |
||
610 | #else |
||
1891 | serge | 611 | r = shift (r, -(32 - 8) + pos * 16); |
612 | g = shift (g, -(16 - 3) + pos * 16); |
||
613 | b = shift (b, -(0 + 3) + pos * 16); |
||
614 | |||
615 | if (pos == 0) |
||
616 | t = _mm_and_si64 (t, MC (mask_0)); |
||
617 | else if (pos == 1) |
||
618 | t = _mm_and_si64 (t, MC (mask_1)); |
||
619 | else if (pos == 2) |
||
620 | t = _mm_and_si64 (t, MC (mask_2)); |
||
621 | else if (pos == 3) |
||
622 | t = _mm_and_si64 (t, MC (mask_3)); |
||
623 | |||
624 | p = _mm_or_si64 (r, t); |
||
625 | p = _mm_or_si64 (g, p); |
||
626 | |||
627 | return _mm_or_si64 (b, p); |
||
3931 | Serge | 628 | #endif |
1891 | serge | 629 | } |
630 | |||
3931 | Serge | 631 | static force_inline __m64 |
632 | pack_4xpacked565 (__m64 a, __m64 b) |
||
633 | { |
||
634 | __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb)); |
||
635 | __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb)); |
||
636 | |||
637 | __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier)); |
||
638 | __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier)); |
||
639 | |||
640 | __m64 g0 = _mm_and_si64 (a, MC (packed_565_g)); |
||
641 | __m64 g1 = _mm_and_si64 (b, MC (packed_565_g)); |
||
642 | |||
643 | t0 = _mm_or_si64 (t0, g0); |
||
644 | t1 = _mm_or_si64 (t1, g1); |
||
645 | |||
646 | t0 = shift(t0, -5); |
||
647 | #ifdef USE_ARM_IWMMXT |
||
648 | t1 = shift(t1, -5); |
||
649 | return _mm_packs_pu32 (t0, t1); |
||
650 | #else |
||
651 | t1 = shift(t1, -5 + 16); |
||
652 | return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0)); |
||
653 | #endif |
||
654 | } |
||
655 | |||
1891 | serge | 656 | #ifndef _MSC_VER |
657 | |||
658 | static force_inline __m64 |
||
3931 | Serge | 659 | pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3) |
660 | { |
||
661 | return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3)); |
||
662 | } |
||
663 | |||
664 | static force_inline __m64 |
||
1891 | serge | 665 | pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b) |
666 | { |
||
667 | x = pix_multiply (x, a); |
||
668 | y = pix_multiply (y, b); |
||
669 | |||
670 | return pix_add (x, y); |
||
671 | } |
||
672 | |||
673 | #else |
||
674 | |||
3931 | Serge | 675 | /* MSVC only handles a "pass by register" of up to three SSE intrinsics */ |
676 | |||
677 | #define pack_4x565(v0, v1, v2, v3) \ |
||
678 | pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3)) |
||
679 | |||
1891 | serge | 680 | #define pix_add_mul(x, a, y, b) \ |
681 | ( x = pix_multiply (x, a), \ |
||
3931 | Serge | 682 | y = pix_multiply (y, b), \ |
1891 | serge | 683 | pix_add (x, y) ) |
684 | |||
685 | #endif |
||
686 | |||
687 | /* --------------- MMX code patch for fbcompose.c --------------------- */ |
||
688 | |||
3931 | Serge | 689 | static force_inline __m64 |
1891 | serge | 690 | combine (const uint32_t *src, const uint32_t *mask) |
691 | { |
||
3931 | Serge | 692 | __m64 vsrc = load8888 (src); |
1891 | serge | 693 | |
694 | if (mask) |
||
695 | { |
||
3931 | Serge | 696 | __m64 m = load8888 (mask); |
1891 | serge | 697 | |
698 | m = expand_alpha (m); |
||
3931 | Serge | 699 | vsrc = pix_multiply (vsrc, m); |
700 | } |
||
1891 | serge | 701 | |
3931 | Serge | 702 | return vsrc; |
703 | } |
||
704 | |||
705 | static force_inline __m64 |
||
706 | core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst) |
||
707 | { |
||
708 | vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ()); |
||
709 | |||
710 | if (is_opaque (vsrc)) |
||
711 | { |
||
712 | return vsrc; |
||
1891 | serge | 713 | } |
3931 | Serge | 714 | else if (!is_zero (vsrc)) |
715 | { |
||
716 | return over (vsrc, expand_alpha (vsrc), |
||
717 | _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ())); |
||
718 | } |
||
1891 | serge | 719 | |
3931 | Serge | 720 | return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()); |
1891 | serge | 721 | } |
722 | |||
723 | static void |
||
724 | mmx_combine_over_u (pixman_implementation_t *imp, |
||
725 | pixman_op_t op, |
||
726 | uint32_t * dest, |
||
727 | const uint32_t * src, |
||
728 | const uint32_t * mask, |
||
729 | int width) |
||
730 | { |
||
731 | const uint32_t *end = dest + width; |
||
732 | |||
733 | while (dest < end) |
||
734 | { |
||
3931 | Serge | 735 | __m64 vsrc = combine (src, mask); |
1891 | serge | 736 | |
3931 | Serge | 737 | if (is_opaque (vsrc)) |
1891 | serge | 738 | { |
3931 | Serge | 739 | store8888 (dest, vsrc); |
1891 | serge | 740 | } |
3931 | Serge | 741 | else if (!is_zero (vsrc)) |
1891 | serge | 742 | { |
3931 | Serge | 743 | __m64 sa = expand_alpha (vsrc); |
744 | store8888 (dest, over (vsrc, sa, load8888 (dest))); |
||
1891 | serge | 745 | } |
746 | |||
747 | ++dest; |
||
748 | ++src; |
||
749 | if (mask) |
||
750 | ++mask; |
||
751 | } |
||
752 | _mm_empty (); |
||
753 | } |
||
754 | |||
755 | static void |
||
756 | mmx_combine_over_reverse_u (pixman_implementation_t *imp, |
||
757 | pixman_op_t op, |
||
758 | uint32_t * dest, |
||
759 | const uint32_t * src, |
||
760 | const uint32_t * mask, |
||
761 | int width) |
||
762 | { |
||
763 | const uint32_t *end = dest + width; |
||
764 | |||
765 | while (dest < end) |
||
766 | { |
||
767 | __m64 d, da; |
||
3931 | Serge | 768 | __m64 s = combine (src, mask); |
1891 | serge | 769 | |
3931 | Serge | 770 | d = load8888 (dest); |
1891 | serge | 771 | da = expand_alpha (d); |
3931 | Serge | 772 | store8888 (dest, over (d, da, s)); |
1891 | serge | 773 | |
774 | ++dest; |
||
775 | ++src; |
||
776 | if (mask) |
||
777 | mask++; |
||
778 | } |
||
779 | _mm_empty (); |
||
780 | } |
||
781 | |||
782 | static void |
||
783 | mmx_combine_in_u (pixman_implementation_t *imp, |
||
784 | pixman_op_t op, |
||
785 | uint32_t * dest, |
||
786 | const uint32_t * src, |
||
787 | const uint32_t * mask, |
||
788 | int width) |
||
789 | { |
||
790 | const uint32_t *end = dest + width; |
||
791 | |||
792 | while (dest < end) |
||
793 | { |
||
3931 | Serge | 794 | __m64 a; |
795 | __m64 x = combine (src, mask); |
||
1891 | serge | 796 | |
3931 | Serge | 797 | a = load8888 (dest); |
1891 | serge | 798 | a = expand_alpha (a); |
799 | x = pix_multiply (x, a); |
||
800 | |||
3931 | Serge | 801 | store8888 (dest, x); |
1891 | serge | 802 | |
803 | ++dest; |
||
804 | ++src; |
||
805 | if (mask) |
||
806 | mask++; |
||
807 | } |
||
808 | _mm_empty (); |
||
809 | } |
||
810 | |||
811 | static void |
||
812 | mmx_combine_in_reverse_u (pixman_implementation_t *imp, |
||
813 | pixman_op_t op, |
||
814 | uint32_t * dest, |
||
815 | const uint32_t * src, |
||
816 | const uint32_t * mask, |
||
817 | int width) |
||
818 | { |
||
819 | const uint32_t *end = dest + width; |
||
820 | |||
821 | while (dest < end) |
||
822 | { |
||
3931 | Serge | 823 | __m64 a = combine (src, mask); |
824 | __m64 x; |
||
1891 | serge | 825 | |
3931 | Serge | 826 | x = load8888 (dest); |
1891 | serge | 827 | a = expand_alpha (a); |
828 | x = pix_multiply (x, a); |
||
3931 | Serge | 829 | store8888 (dest, x); |
1891 | serge | 830 | |
831 | ++dest; |
||
832 | ++src; |
||
833 | if (mask) |
||
834 | mask++; |
||
835 | } |
||
836 | _mm_empty (); |
||
837 | } |
||
838 | |||
839 | static void |
||
840 | mmx_combine_out_u (pixman_implementation_t *imp, |
||
841 | pixman_op_t op, |
||
842 | uint32_t * dest, |
||
843 | const uint32_t * src, |
||
844 | const uint32_t * mask, |
||
845 | int width) |
||
846 | { |
||
847 | const uint32_t *end = dest + width; |
||
848 | |||
849 | while (dest < end) |
||
850 | { |
||
3931 | Serge | 851 | __m64 a; |
852 | __m64 x = combine (src, mask); |
||
1891 | serge | 853 | |
3931 | Serge | 854 | a = load8888 (dest); |
1891 | serge | 855 | a = expand_alpha (a); |
856 | a = negate (a); |
||
857 | x = pix_multiply (x, a); |
||
3931 | Serge | 858 | store8888 (dest, x); |
1891 | serge | 859 | |
860 | ++dest; |
||
861 | ++src; |
||
862 | if (mask) |
||
863 | mask++; |
||
864 | } |
||
865 | _mm_empty (); |
||
866 | } |
||
867 | |||
868 | static void |
||
869 | mmx_combine_out_reverse_u (pixman_implementation_t *imp, |
||
870 | pixman_op_t op, |
||
871 | uint32_t * dest, |
||
872 | const uint32_t * src, |
||
873 | const uint32_t * mask, |
||
874 | int width) |
||
875 | { |
||
876 | const uint32_t *end = dest + width; |
||
877 | |||
878 | while (dest < end) |
||
879 | { |
||
3931 | Serge | 880 | __m64 a = combine (src, mask); |
881 | __m64 x; |
||
1891 | serge | 882 | |
3931 | Serge | 883 | x = load8888 (dest); |
1891 | serge | 884 | a = expand_alpha (a); |
885 | a = negate (a); |
||
886 | x = pix_multiply (x, a); |
||
887 | |||
3931 | Serge | 888 | store8888 (dest, x); |
1891 | serge | 889 | |
890 | ++dest; |
||
891 | ++src; |
||
892 | if (mask) |
||
893 | mask++; |
||
894 | } |
||
895 | _mm_empty (); |
||
896 | } |
||
897 | |||
898 | static void |
||
899 | mmx_combine_atop_u (pixman_implementation_t *imp, |
||
900 | pixman_op_t op, |
||
901 | uint32_t * dest, |
||
902 | const uint32_t * src, |
||
903 | const uint32_t * mask, |
||
904 | int width) |
||
905 | { |
||
906 | const uint32_t *end = dest + width; |
||
907 | |||
908 | while (dest < end) |
||
909 | { |
||
3931 | Serge | 910 | __m64 da, d, sia; |
911 | __m64 s = combine (src, mask); |
||
1891 | serge | 912 | |
3931 | Serge | 913 | d = load8888 (dest); |
1891 | serge | 914 | sia = expand_alpha (s); |
915 | sia = negate (sia); |
||
916 | da = expand_alpha (d); |
||
917 | s = pix_add_mul (s, da, d, sia); |
||
3931 | Serge | 918 | store8888 (dest, s); |
1891 | serge | 919 | |
920 | ++dest; |
||
921 | ++src; |
||
922 | if (mask) |
||
923 | mask++; |
||
924 | } |
||
925 | _mm_empty (); |
||
926 | } |
||
927 | |||
928 | static void |
||
929 | mmx_combine_atop_reverse_u (pixman_implementation_t *imp, |
||
930 | pixman_op_t op, |
||
931 | uint32_t * dest, |
||
932 | const uint32_t * src, |
||
933 | const uint32_t * mask, |
||
934 | int width) |
||
935 | { |
||
936 | const uint32_t *end; |
||
937 | |||
938 | end = dest + width; |
||
939 | |||
940 | while (dest < end) |
||
941 | { |
||
3931 | Serge | 942 | __m64 dia, d, sa; |
943 | __m64 s = combine (src, mask); |
||
1891 | serge | 944 | |
3931 | Serge | 945 | d = load8888 (dest); |
1891 | serge | 946 | sa = expand_alpha (s); |
947 | dia = expand_alpha (d); |
||
948 | dia = negate (dia); |
||
949 | s = pix_add_mul (s, dia, d, sa); |
||
3931 | Serge | 950 | store8888 (dest, s); |
1891 | serge | 951 | |
952 | ++dest; |
||
953 | ++src; |
||
954 | if (mask) |
||
955 | mask++; |
||
956 | } |
||
957 | _mm_empty (); |
||
958 | } |
||
959 | |||
960 | static void |
||
961 | mmx_combine_xor_u (pixman_implementation_t *imp, |
||
962 | pixman_op_t op, |
||
963 | uint32_t * dest, |
||
964 | const uint32_t * src, |
||
965 | const uint32_t * mask, |
||
966 | int width) |
||
967 | { |
||
968 | const uint32_t *end = dest + width; |
||
969 | |||
970 | while (dest < end) |
||
971 | { |
||
3931 | Serge | 972 | __m64 dia, d, sia; |
973 | __m64 s = combine (src, mask); |
||
1891 | serge | 974 | |
3931 | Serge | 975 | d = load8888 (dest); |
1891 | serge | 976 | sia = expand_alpha (s); |
977 | dia = expand_alpha (d); |
||
978 | sia = negate (sia); |
||
979 | dia = negate (dia); |
||
980 | s = pix_add_mul (s, dia, d, sia); |
||
3931 | Serge | 981 | store8888 (dest, s); |
1891 | serge | 982 | |
983 | ++dest; |
||
984 | ++src; |
||
985 | if (mask) |
||
986 | mask++; |
||
987 | } |
||
988 | _mm_empty (); |
||
989 | } |
||
990 | |||
991 | static void |
||
992 | mmx_combine_add_u (pixman_implementation_t *imp, |
||
993 | pixman_op_t op, |
||
994 | uint32_t * dest, |
||
995 | const uint32_t * src, |
||
996 | const uint32_t * mask, |
||
997 | int width) |
||
998 | { |
||
999 | const uint32_t *end = dest + width; |
||
1000 | |||
1001 | while (dest < end) |
||
1002 | { |
||
3931 | Serge | 1003 | __m64 d; |
1004 | __m64 s = combine (src, mask); |
||
1891 | serge | 1005 | |
3931 | Serge | 1006 | d = load8888 (dest); |
1891 | serge | 1007 | s = pix_add (s, d); |
3931 | Serge | 1008 | store8888 (dest, s); |
1891 | serge | 1009 | |
1010 | ++dest; |
||
1011 | ++src; |
||
1012 | if (mask) |
||
1013 | mask++; |
||
1014 | } |
||
1015 | _mm_empty (); |
||
1016 | } |
||
1017 | |||
1018 | static void |
||
1019 | mmx_combine_saturate_u (pixman_implementation_t *imp, |
||
1020 | pixman_op_t op, |
||
1021 | uint32_t * dest, |
||
1022 | const uint32_t * src, |
||
1023 | const uint32_t * mask, |
||
1024 | int width) |
||
1025 | { |
||
1026 | const uint32_t *end = dest + width; |
||
1027 | |||
1028 | while (dest < end) |
||
1029 | { |
||
3931 | Serge | 1030 | uint32_t s, sa, da; |
1891 | serge | 1031 | uint32_t d = *dest; |
3931 | Serge | 1032 | __m64 ms = combine (src, mask); |
1033 | __m64 md = load8888 (dest); |
||
1891 | serge | 1034 | |
3931 | Serge | 1035 | store8888(&s, ms); |
1036 | da = ~d >> 24; |
||
1037 | sa = s >> 24; |
||
1038 | |||
1891 | serge | 1039 | if (sa > da) |
1040 | { |
||
3931 | Serge | 1041 | uint32_t quot = DIV_UN8 (da, sa) << 24; |
1042 | __m64 msa = load8888 ("); |
||
1891 | serge | 1043 | msa = expand_alpha (msa); |
1044 | ms = pix_multiply (ms, msa); |
||
1045 | } |
||
1046 | |||
1047 | md = pix_add (md, ms); |
||
3931 | Serge | 1048 | store8888 (dest, md); |
1891 | serge | 1049 | |
1050 | ++src; |
||
1051 | ++dest; |
||
1052 | if (mask) |
||
1053 | mask++; |
||
1054 | } |
||
1055 | _mm_empty (); |
||
1056 | } |
||
1057 | |||
1058 | static void |
||
1059 | mmx_combine_src_ca (pixman_implementation_t *imp, |
||
1060 | pixman_op_t op, |
||
1061 | uint32_t * dest, |
||
1062 | const uint32_t * src, |
||
1063 | const uint32_t * mask, |
||
1064 | int width) |
||
1065 | { |
||
1066 | const uint32_t *end = src + width; |
||
1067 | |||
1068 | while (src < end) |
||
1069 | { |
||
3931 | Serge | 1070 | __m64 a = load8888 (mask); |
1071 | __m64 s = load8888 (src); |
||
1891 | serge | 1072 | |
1073 | s = pix_multiply (s, a); |
||
3931 | Serge | 1074 | store8888 (dest, s); |
1891 | serge | 1075 | |
1076 | ++src; |
||
1077 | ++mask; |
||
1078 | ++dest; |
||
1079 | } |
||
1080 | _mm_empty (); |
||
1081 | } |
||
1082 | |||
1083 | static void |
||
1084 | mmx_combine_over_ca (pixman_implementation_t *imp, |
||
1085 | pixman_op_t op, |
||
1086 | uint32_t * dest, |
||
1087 | const uint32_t * src, |
||
1088 | const uint32_t * mask, |
||
1089 | int width) |
||
1090 | { |
||
1091 | const uint32_t *end = src + width; |
||
1092 | |||
1093 | while (src < end) |
||
1094 | { |
||
3931 | Serge | 1095 | __m64 a = load8888 (mask); |
1096 | __m64 s = load8888 (src); |
||
1097 | __m64 d = load8888 (dest); |
||
1891 | serge | 1098 | __m64 sa = expand_alpha (s); |
1099 | |||
3931 | Serge | 1100 | store8888 (dest, in_over (s, sa, a, d)); |
1891 | serge | 1101 | |
1102 | ++src; |
||
1103 | ++dest; |
||
1104 | ++mask; |
||
1105 | } |
||
1106 | _mm_empty (); |
||
1107 | } |
||
1108 | |||
1109 | static void |
||
1110 | mmx_combine_over_reverse_ca (pixman_implementation_t *imp, |
||
1111 | pixman_op_t op, |
||
1112 | uint32_t * dest, |
||
1113 | const uint32_t * src, |
||
1114 | const uint32_t * mask, |
||
1115 | int width) |
||
1116 | { |
||
1117 | const uint32_t *end = src + width; |
||
1118 | |||
1119 | while (src < end) |
||
1120 | { |
||
3931 | Serge | 1121 | __m64 a = load8888 (mask); |
1122 | __m64 s = load8888 (src); |
||
1123 | __m64 d = load8888 (dest); |
||
1891 | serge | 1124 | __m64 da = expand_alpha (d); |
1125 | |||
3931 | Serge | 1126 | store8888 (dest, over (d, da, in (s, a))); |
1891 | serge | 1127 | |
1128 | ++src; |
||
1129 | ++dest; |
||
1130 | ++mask; |
||
1131 | } |
||
1132 | _mm_empty (); |
||
1133 | } |
||
1134 | |||
1135 | static void |
||
1136 | mmx_combine_in_ca (pixman_implementation_t *imp, |
||
1137 | pixman_op_t op, |
||
1138 | uint32_t * dest, |
||
1139 | const uint32_t * src, |
||
1140 | const uint32_t * mask, |
||
1141 | int width) |
||
1142 | { |
||
1143 | const uint32_t *end = src + width; |
||
1144 | |||
1145 | while (src < end) |
||
1146 | { |
||
3931 | Serge | 1147 | __m64 a = load8888 (mask); |
1148 | __m64 s = load8888 (src); |
||
1149 | __m64 d = load8888 (dest); |
||
1891 | serge | 1150 | __m64 da = expand_alpha (d); |
1151 | |||
1152 | s = pix_multiply (s, a); |
||
1153 | s = pix_multiply (s, da); |
||
3931 | Serge | 1154 | store8888 (dest, s); |
1891 | serge | 1155 | |
1156 | ++src; |
||
1157 | ++dest; |
||
1158 | ++mask; |
||
1159 | } |
||
1160 | _mm_empty (); |
||
1161 | } |
||
1162 | |||
1163 | static void |
||
1164 | mmx_combine_in_reverse_ca (pixman_implementation_t *imp, |
||
1165 | pixman_op_t op, |
||
1166 | uint32_t * dest, |
||
1167 | const uint32_t * src, |
||
1168 | const uint32_t * mask, |
||
1169 | int width) |
||
1170 | { |
||
1171 | const uint32_t *end = src + width; |
||
1172 | |||
1173 | while (src < end) |
||
1174 | { |
||
3931 | Serge | 1175 | __m64 a = load8888 (mask); |
1176 | __m64 s = load8888 (src); |
||
1177 | __m64 d = load8888 (dest); |
||
1891 | serge | 1178 | __m64 sa = expand_alpha (s); |
1179 | |||
1180 | a = pix_multiply (a, sa); |
||
1181 | d = pix_multiply (d, a); |
||
3931 | Serge | 1182 | store8888 (dest, d); |
1891 | serge | 1183 | |
1184 | ++src; |
||
1185 | ++dest; |
||
1186 | ++mask; |
||
1187 | } |
||
1188 | _mm_empty (); |
||
1189 | } |
||
1190 | |||
1191 | static void |
||
1192 | mmx_combine_out_ca (pixman_implementation_t *imp, |
||
1193 | pixman_op_t op, |
||
1194 | uint32_t * dest, |
||
1195 | const uint32_t * src, |
||
1196 | const uint32_t * mask, |
||
1197 | int width) |
||
1198 | { |
||
1199 | const uint32_t *end = src + width; |
||
1200 | |||
1201 | while (src < end) |
||
1202 | { |
||
3931 | Serge | 1203 | __m64 a = load8888 (mask); |
1204 | __m64 s = load8888 (src); |
||
1205 | __m64 d = load8888 (dest); |
||
1891 | serge | 1206 | __m64 da = expand_alpha (d); |
1207 | |||
1208 | da = negate (da); |
||
1209 | s = pix_multiply (s, a); |
||
1210 | s = pix_multiply (s, da); |
||
3931 | Serge | 1211 | store8888 (dest, s); |
1891 | serge | 1212 | |
1213 | ++src; |
||
1214 | ++dest; |
||
1215 | ++mask; |
||
1216 | } |
||
1217 | _mm_empty (); |
||
1218 | } |
||
1219 | |||
1220 | static void |
||
1221 | mmx_combine_out_reverse_ca (pixman_implementation_t *imp, |
||
1222 | pixman_op_t op, |
||
1223 | uint32_t * dest, |
||
1224 | const uint32_t * src, |
||
1225 | const uint32_t * mask, |
||
1226 | int width) |
||
1227 | { |
||
1228 | const uint32_t *end = src + width; |
||
1229 | |||
1230 | while (src < end) |
||
1231 | { |
||
3931 | Serge | 1232 | __m64 a = load8888 (mask); |
1233 | __m64 s = load8888 (src); |
||
1234 | __m64 d = load8888 (dest); |
||
1891 | serge | 1235 | __m64 sa = expand_alpha (s); |
1236 | |||
1237 | a = pix_multiply (a, sa); |
||
1238 | a = negate (a); |
||
1239 | d = pix_multiply (d, a); |
||
3931 | Serge | 1240 | store8888 (dest, d); |
1891 | serge | 1241 | |
1242 | ++src; |
||
1243 | ++dest; |
||
1244 | ++mask; |
||
1245 | } |
||
1246 | _mm_empty (); |
||
1247 | } |
||
1248 | |||
1249 | static void |
||
1250 | mmx_combine_atop_ca (pixman_implementation_t *imp, |
||
1251 | pixman_op_t op, |
||
1252 | uint32_t * dest, |
||
1253 | const uint32_t * src, |
||
1254 | const uint32_t * mask, |
||
1255 | int width) |
||
1256 | { |
||
1257 | const uint32_t *end = src + width; |
||
1258 | |||
1259 | while (src < end) |
||
1260 | { |
||
3931 | Serge | 1261 | __m64 a = load8888 (mask); |
1262 | __m64 s = load8888 (src); |
||
1263 | __m64 d = load8888 (dest); |
||
1891 | serge | 1264 | __m64 da = expand_alpha (d); |
1265 | __m64 sa = expand_alpha (s); |
||
1266 | |||
1267 | s = pix_multiply (s, a); |
||
1268 | a = pix_multiply (a, sa); |
||
1269 | a = negate (a); |
||
1270 | d = pix_add_mul (d, a, s, da); |
||
3931 | Serge | 1271 | store8888 (dest, d); |
1891 | serge | 1272 | |
1273 | ++src; |
||
1274 | ++dest; |
||
1275 | ++mask; |
||
1276 | } |
||
1277 | _mm_empty (); |
||
1278 | } |
||
1279 | |||
1280 | static void |
||
1281 | mmx_combine_atop_reverse_ca (pixman_implementation_t *imp, |
||
1282 | pixman_op_t op, |
||
1283 | uint32_t * dest, |
||
1284 | const uint32_t * src, |
||
1285 | const uint32_t * mask, |
||
1286 | int width) |
||
1287 | { |
||
1288 | const uint32_t *end = src + width; |
||
1289 | |||
1290 | while (src < end) |
||
1291 | { |
||
3931 | Serge | 1292 | __m64 a = load8888 (mask); |
1293 | __m64 s = load8888 (src); |
||
1294 | __m64 d = load8888 (dest); |
||
1891 | serge | 1295 | __m64 da = expand_alpha (d); |
1296 | __m64 sa = expand_alpha (s); |
||
1297 | |||
1298 | s = pix_multiply (s, a); |
||
1299 | a = pix_multiply (a, sa); |
||
1300 | da = negate (da); |
||
1301 | d = pix_add_mul (d, a, s, da); |
||
3931 | Serge | 1302 | store8888 (dest, d); |
1891 | serge | 1303 | |
1304 | ++src; |
||
1305 | ++dest; |
||
1306 | ++mask; |
||
1307 | } |
||
1308 | _mm_empty (); |
||
1309 | } |
||
1310 | |||
1311 | static void |
||
1312 | mmx_combine_xor_ca (pixman_implementation_t *imp, |
||
1313 | pixman_op_t op, |
||
1314 | uint32_t * dest, |
||
1315 | const uint32_t * src, |
||
1316 | const uint32_t * mask, |
||
1317 | int width) |
||
1318 | { |
||
1319 | const uint32_t *end = src + width; |
||
1320 | |||
1321 | while (src < end) |
||
1322 | { |
||
3931 | Serge | 1323 | __m64 a = load8888 (mask); |
1324 | __m64 s = load8888 (src); |
||
1325 | __m64 d = load8888 (dest); |
||
1891 | serge | 1326 | __m64 da = expand_alpha (d); |
1327 | __m64 sa = expand_alpha (s); |
||
1328 | |||
1329 | s = pix_multiply (s, a); |
||
1330 | a = pix_multiply (a, sa); |
||
1331 | da = negate (da); |
||
1332 | a = negate (a); |
||
1333 | d = pix_add_mul (d, a, s, da); |
||
3931 | Serge | 1334 | store8888 (dest, d); |
1891 | serge | 1335 | |
1336 | ++src; |
||
1337 | ++dest; |
||
1338 | ++mask; |
||
1339 | } |
||
1340 | _mm_empty (); |
||
1341 | } |
||
1342 | |||
1343 | static void |
||
1344 | mmx_combine_add_ca (pixman_implementation_t *imp, |
||
1345 | pixman_op_t op, |
||
1346 | uint32_t * dest, |
||
1347 | const uint32_t * src, |
||
1348 | const uint32_t * mask, |
||
1349 | int width) |
||
1350 | { |
||
1351 | const uint32_t *end = src + width; |
||
1352 | |||
1353 | while (src < end) |
||
1354 | { |
||
3931 | Serge | 1355 | __m64 a = load8888 (mask); |
1356 | __m64 s = load8888 (src); |
||
1357 | __m64 d = load8888 (dest); |
||
1891 | serge | 1358 | |
1359 | s = pix_multiply (s, a); |
||
1360 | d = pix_add (s, d); |
||
3931 | Serge | 1361 | store8888 (dest, d); |
1891 | serge | 1362 | |
1363 | ++src; |
||
1364 | ++dest; |
||
1365 | ++mask; |
||
1366 | } |
||
1367 | _mm_empty (); |
||
1368 | } |
||
1369 | |||
1370 | /* ------------- MMX code paths called from fbpict.c -------------------- */ |
||
1371 | |||
/* Solid color OVER an a8r8g8b8 destination.
 * The solid source is expanded once; each row is processed with a
 * scalar prologue up to 8-byte alignment, then two pixels per __m64
 * store, then one scalar trailing pixel if the width was odd. */
static void
mmx_composite_over_n_8888 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    /* Transparent solid source: OVER leaves the destination untouched. */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    /* Hoist the source expansion out of both loops: it is loop-invariant. */
    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	CHECKPOINT ();

	/* Scalar pixels until dst is 8-byte aligned. */
	while (w && (uintptr_t)dst & 7)
	{
	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));

	    w--;
	    dst++;
	}

	/* Two pixels per iteration through one aligned 64-bit load/store. */
	while (w >= 2)
	{
	    __m64 vdest;
	    __m64 dest0, dest1;

	    vdest = *(__m64 *)dst;

	    dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
	    dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));

	    *(__m64 *)dst = pack8888 (dest0, dest1);

	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	/* At most one trailing pixel remains. */
	if (w)
	{
	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
	}
    }

    _mm_empty ();
}
||
1437 | |||
/* Solid color OVER an r5g6b5 destination.
 * Each 16-bit pixel is expanded to 8888 for the blend, then packed
 * back to 565. Rows use a scalar prologue to 8-byte alignment, a
 * four-pixels-per-__m64 main loop, and a scalar tail. */
static void
mmx_composite_over_n_0565 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    /* Transparent solid source: nothing to do for OVER. */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    /* Loop-invariant source expansion. */
    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	CHECKPOINT ();

	/* Scalar 565 pixels until dst is 8-byte aligned. */
	while (w && (uintptr_t)dst & 7)
	{
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	}

	/* Four 565 pixels per aligned 64-bit load/store. */
	while (w >= 4)
	{
	    __m64 vdest = *(__m64 *)dst;
	    __m64 v0, v1, v2, v3;

	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

	    v0 = over (vsrc, vsrca, v0);
	    v1 = over (vsrc, vsrca, v1);
	    v2 = over (vsrc, vsrca, v2);
	    v3 = over (vsrc, vsrca, v3);

	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);

	    dst += 4;
	    w -= 4;
	}

	CHECKPOINT ();

	/* Scalar tail (up to three pixels). */
	while (w)
	{
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	}
    }

    _mm_empty ();
}
||
1516 | |||
/* Solid color OVER an a8r8g8b8 destination through an a8r8g8b8
 * component-alpha mask.  Pixels whose mask word is zero are skipped
 * entirely; the two-pixel main loop also skips a pair when both mask
 * words are zero. */
static void
mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t *dst_line;
    uint32_t *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    /* Transparent solid source: OVER is a no-op. */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    /* Loop-invariant source expansion. */
    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	int twidth = width;
	uint32_t *p = (uint32_t *)mask_line;
	uint32_t *q = (uint32_t *)dst_line;

	/* Scalar pixels until the destination is 8-byte aligned. */
	while (twidth && (uintptr_t)q & 7)
	{
	    uint32_t m = *(uint32_t *)p;

	    if (m)
	    {
		__m64 vdest = load8888 (q);
		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
		store8888 (q, vdest);
	    }

	    twidth--;
	    p++;
	    q++;
	}

	/* Two pixels per aligned 64-bit destination access; skip the
	 * pair when both mask words are zero. */
	while (twidth >= 2)
	{
	    uint32_t m0, m1;
	    m0 = *p;
	    m1 = *(p + 1);

	    if (m0 | m1)
	    {
		__m64 dest0, dest1;
		__m64 vdest = *(__m64 *)q;

		dest0 = in_over (vsrc, vsrca, load8888 (&m0),
		                 expand8888 (vdest, 0));
		dest1 = in_over (vsrc, vsrca, load8888 (&m1),
		                 expand8888 (vdest, 1));

		*(__m64 *)q = pack8888 (dest0, dest1);
	    }

	    p += 2;
	    q += 2;
	    twidth -= 2;
	}

	/* At most one trailing pixel. */
	if (twidth)
	{
	    uint32_t m = *(uint32_t *)p;

	    if (m)
	    {
		__m64 vdest = load8888 (q);
		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
		store8888 (q, vdest);
	    }

	    twidth--;
	    p++;
	    q++;
	}

	dst_line += dst_stride;
	mask_line += mask_stride;
    }

    _mm_empty ();
}
||
1609 | |||
/* a8r8g8b8 source OVER an a8r8g8b8 destination, modulated by a solid
 * mask.  Only the mask's alpha is used (expand_alpha below); rows run
 * scalar to 8-byte destination alignment, then two pixels at a time
 * with an unaligned source load (ldq_u), then one trailing pixel. */
static void
mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    /* The solid mask contributes only its alpha channel. */
    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
    vmask = expand_alpha (load8888 (&mask));

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Scalar prologue to 8-byte destination alignment. */
	while (w && (uintptr_t)dst & 7)
	{
	    __m64 s = load8888 (src);
	    __m64 d = load8888 (dst);

	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));

	    w--;
	    dst++;
	    src++;
	}

	/* Two pixels per iteration; the source may be misaligned, so it
	 * is fetched with ldq_u while the destination uses an aligned
	 * 64-bit access. */
	while (w >= 2)
	{
	    __m64 vs = ldq_u ((__m64 *)src);
	    __m64 vd = *(__m64 *)dst;
	    __m64 vsrc0 = expand8888 (vs, 0);
	    __m64 vsrc1 = expand8888 (vs, 1);

	    *(__m64 *)dst = pack8888 (
	        in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
	        in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));

	    w -= 2;
	    dst += 2;
	    src += 2;
	}

	/* At most one trailing pixel. */
	if (w)
	{
	    __m64 s = load8888 (src);
	    __m64 d = load8888 (dst);

	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
	}
    }

    _mm_empty ();
}
||
1677 | |||
/* x8r8g8b8 source OVER an a8r8g8b8 destination, modulated by a solid
 * mask.  The x888 source is treated as fully opaque (alpha forced to
 * 0xff, srca = 4x00ff), so per-pixel source alpha expansion is
 * avoided.  The main loop is unrolled to 16 pixels (8 __m64 pairs);
 * scalar loops handle alignment and the tail. */
static void
mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;
    __m64 srca;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);

    /* Only the mask's alpha is used; srca is the constant 0xff alpha
     * of the (opaque) x888 source in all four channels. */
    vmask = expand_alpha (load8888 (&mask));
    srca = MC (4x00ff);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Scalar prologue to 8-byte destination alignment; force the
	 * unused x888 alpha byte to 0xff before blending. */
	while (w && (uintptr_t)dst & 7)
	{
	    uint32_t ssrc = *src | 0xff000000;
	    __m64 s = load8888 (&ssrc);
	    __m64 d = load8888 (dst);

	    store8888 (dst, in_over (s, srca, vmask, d));

	    w--;
	    dst++;
	    src++;
	}

	/* 16 pixels per iteration: 8 aligned destination qwords and 8
	 * possibly-unaligned source qwords (ldq_u).  expandx888 ignores
	 * the source's undefined alpha byte. */
	while (w >= 16)
	{
	    __m64 vd0 = *(__m64 *)(dst + 0);
	    __m64 vd1 = *(__m64 *)(dst + 2);
	    __m64 vd2 = *(__m64 *)(dst + 4);
	    __m64 vd3 = *(__m64 *)(dst + 6);
	    __m64 vd4 = *(__m64 *)(dst + 8);
	    __m64 vd5 = *(__m64 *)(dst + 10);
	    __m64 vd6 = *(__m64 *)(dst + 12);
	    __m64 vd7 = *(__m64 *)(dst + 14);

	    __m64 vs0 = ldq_u ((__m64 *)(src + 0));
	    __m64 vs1 = ldq_u ((__m64 *)(src + 2));
	    __m64 vs2 = ldq_u ((__m64 *)(src + 4));
	    __m64 vs3 = ldq_u ((__m64 *)(src + 6));
	    __m64 vs4 = ldq_u ((__m64 *)(src + 8));
	    __m64 vs5 = ldq_u ((__m64 *)(src + 10));
	    __m64 vs6 = ldq_u ((__m64 *)(src + 12));
	    __m64 vs7 = ldq_u ((__m64 *)(src + 14));

	    vd0 = pack8888 (
	        in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
	        in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));

	    vd1 = pack8888 (
	        in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
	        in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));

	    vd2 = pack8888 (
	        in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
	        in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));

	    vd3 = pack8888 (
	        in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
	        in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));

	    vd4 = pack8888 (
	        in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
	        in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));

	    vd5 = pack8888 (
	        in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
	        in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));

	    vd6 = pack8888 (
	        in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
	        in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));

	    vd7 = pack8888 (
	        in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
	        in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));

	    *(__m64 *)(dst + 0) = vd0;
	    *(__m64 *)(dst + 2) = vd1;
	    *(__m64 *)(dst + 4) = vd2;
	    *(__m64 *)(dst + 6) = vd3;
	    *(__m64 *)(dst + 8) = vd4;
	    *(__m64 *)(dst + 10) = vd5;
	    *(__m64 *)(dst + 12) = vd6;
	    *(__m64 *)(dst + 14) = vd7;

	    w -= 16;
	    dst += 16;
	    src += 16;
	}

	/* Scalar tail (up to 15 pixels). */
	while (w)
	{
	    uint32_t ssrc = *src | 0xff000000;
	    __m64 s = load8888 (&ssrc);
	    __m64 d = load8888 (dst);

	    store8888 (dst, in_over (s, srca, vmask, d));

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}
||
1803 | |||
1804 | static void |
||
1805 | mmx_composite_over_8888_8888 (pixman_implementation_t *imp, |
||
3931 | Serge | 1806 | pixman_composite_info_t *info) |
1891 | serge | 1807 | { |
3931 | Serge | 1808 | PIXMAN_COMPOSITE_ARGS (info); |
1891 | serge | 1809 | uint32_t *dst_line, *dst; |
1810 | uint32_t *src_line, *src; |
||
1811 | uint32_t s; |
||
1812 | int dst_stride, src_stride; |
||
1813 | uint8_t a; |
||
1814 | int32_t w; |
||
1815 | |||
1816 | CHECKPOINT (); |
||
1817 | |||
3931 | Serge | 1818 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
1891 | serge | 1819 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
1820 | |||
1821 | while (height--) |
||
1822 | { |
||
1823 | dst = dst_line; |
||
1824 | dst_line += dst_stride; |
||
1825 | src = src_line; |
||
1826 | src_line += src_stride; |
||
1827 | w = width; |
||
1828 | |||
1829 | while (w--) |
||
1830 | { |
||
1831 | s = *src++; |
||
1832 | a = s >> 24; |
||
1833 | |||
1834 | if (a == 0xff) |
||
1835 | { |
||
1836 | *dst = s; |
||
1837 | } |
||
1838 | else if (s) |
||
1839 | { |
||
1840 | __m64 ms, sa; |
||
3931 | Serge | 1841 | ms = load8888 (&s); |
1891 | serge | 1842 | sa = expand_alpha (ms); |
3931 | Serge | 1843 | store8888 (dst, over (ms, sa, load8888 (dst))); |
1891 | serge | 1844 | } |
1845 | |||
1846 | dst++; |
||
1847 | } |
||
1848 | } |
||
1849 | _mm_empty (); |
||
1850 | } |
||
1851 | |||
/* a8r8g8b8 source OVER an r5g6b5 destination, no mask.
 * Each 565 pixel is expanded to 8888 for the blend and packed back.
 * Rows run scalar to 8-byte destination alignment, then four pixels
 * per __m64, then a scalar tail. */
static void
mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	CHECKPOINT ();

	/* Scalar 565 pixels until dst is 8-byte aligned. */
	while (w && (uintptr_t)dst & 7)
	{
	    __m64 vsrc = load8888 (src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (
		over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}

	CHECKPOINT ();

	/* Four pixels per aligned 64-bit destination access. */
	while (w >= 4)
	{
	    __m64 vdest = *(__m64 *)dst;
	    __m64 v0, v1, v2, v3;
	    __m64 vsrc0, vsrc1, vsrc2, vsrc3;

	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

	    vsrc0 = load8888 ((src + 0));
	    vsrc1 = load8888 ((src + 1));
	    vsrc2 = load8888 ((src + 2));
	    vsrc3 = load8888 ((src + 3));

	    v0 = over (vsrc0, expand_alpha (vsrc0), v0);
	    v1 = over (vsrc1, expand_alpha (vsrc1), v1);
	    v2 = over (vsrc2, expand_alpha (vsrc2), v2);
	    v3 = over (vsrc3, expand_alpha (vsrc3), v3);

	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	CHECKPOINT ();

	/* Scalar tail (up to three pixels). */
	while (w)
	{
	    __m64 vsrc = load8888 (src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}
||
1945 | |||
/* Composite a solid source through an 8-bit alpha mask onto an a8r8g8b8
 * destination with the OVER operator (naming follows pixman's
 * op_src_mask_dest convention: n = solid, 8 = a8 mask, 8888 = 32bpp dest).
 *
 * Processes each scanline in three phases: a head loop until dst is
 * 8-byte aligned, a 2-pixel-at-a-time MMX loop, and a 1-pixel tail.
 */
static void
mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca;
    uint64_t srcsrc;              /* solid color replicated into both 32-bit halves */

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)                 /* fully transparent source: OVER is a no-op */
	return;

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	CHECKPOINT ();

	/* Head: single pixels until dst is 8-byte aligned for the
	 * 64-bit stores below. */
	while (w && (uintptr_t)dst & 7)
	{
	    uint64_t m = *mask;

	    if (m)                /* zero mask leaves the destination untouched */
	    {
		__m64 vdest = in_over (vsrc, vsrca,
				       expand_alpha_rev (to_m64 (m)),
				       load8888 (dst));

		store8888 (dst, vdest);
	    }

	    w--;
	    mask++;
	    dst++;
	}

	CHECKPOINT ();

	/* Main loop: two pixels per iteration with a single 64-bit store. */
	while (w >= 2)
	{
	    uint64_t m0, m1;

	    m0 = *mask;
	    m1 = *(mask + 1);

	    if (srca == 0xff && (m0 & m1) == 0xff)
	    {
		/* Opaque source and fully-set masks: plain copy. */
		*(uint64_t *)dst = srcsrc;
	    }
	    else if (m0 | m1)
	    {
		__m64 vdest;
		__m64 dest0, dest1;

		vdest = *(__m64 *)dst;

		dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
				 expand8888 (vdest, 0));
		dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
				 expand8888 (vdest, 1));

		*(__m64 *)dst = pack8888 (dest0, dest1);
	    }

	    mask += 2;
	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	/* Tail: at most one leftover pixel. */
	if (w)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		__m64 vdest = load8888 (dst);

		vdest = in_over (
		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
		store8888 (dst, vdest);
	    }
	}
    }

    /* Leave MMX state so subsequent FPU code works (EMMS). */
    _mm_empty ();
}
||
2055 | |||
/* Fill a rectangle of an 8/16/32 bpp image with a solid value using
 * 64-byte MMX stores.
 *
 * Returns FALSE (so the caller can fall back) for unsupported depths.
 * The filler value is replicated to fill a full 32-bit word, then a
 * 64-bit word; each scanline is written with an alignment ladder
 * (1-, 2-, 4-byte heads), a 64-byte unrolled middle, and matching tails.
 */
static pixman_bool_t
mmx_fill (pixman_implementation_t *imp,
          uint32_t *               bits,
          int                      stride,
          int                      bpp,
          int                      x,
          int                      y,
          int                      width,
          int                      height,
          uint32_t                 filler)
{
    uint64_t fill;
    __m64 vfill;
    uint32_t byte_width;
    uint8_t *byte_line;

#if defined __GNUC__ && defined USE_X86_MMX
    /* Seven extra MMX registers holding copies of the fill value so the
     * inner loop can issue eight independent movq stores. */
    __m64 v1, v2, v3, v4, v5, v6, v7;
#endif

    if (bpp != 16 && bpp != 32 && bpp != 8)
	return FALSE;

    /* Convert the uint32_t-unit stride to bytes and replicate the filler
     * across a full 32-bit word for the narrower formats. */
    if (bpp == 8)
    {
	stride = stride * (int) sizeof (uint32_t) / 1;
	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
	byte_width = width;
	stride *= 1;
	filler = (filler & 0xff) * 0x01010101;
    }
    else if (bpp == 16)
    {
	stride = stride * (int) sizeof (uint32_t) / 2;
	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
	byte_width = 2 * width;
	stride *= 2;
	filler = (filler & 0xffff) * 0x00010001;
    }
    else
    {
	stride = stride * (int) sizeof (uint32_t) / 4;
	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
	byte_width = 4 * width;
	stride *= 4;
    }

    fill = ((uint64_t)filler << 32) | filler;
    vfill = to_m64 (fill);

#if defined __GNUC__ && defined USE_X86_MMX
    /* Broadcast vfill into v1..v7 via movq so each store in the unrolled
     * loop reads a distinct register. */
    __asm__ (
        "movq		%7,	%0\n"
        "movq		%7,	%1\n"
        "movq		%7,	%2\n"
        "movq		%7,	%3\n"
        "movq		%7,	%4\n"
        "movq		%7,	%5\n"
        "movq		%7,	%6\n"
	: "=&y" (v1), "=&y" (v2), "=&y" (v3),
	  "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
	: "y" (vfill));
#endif

    while (height--)
    {
	int w;
	uint8_t *d = byte_line;

	byte_line += stride;
	w = byte_width;

	/* Alignment ladder: emit 1-, 2- and 4-byte stores until d is
	 * 8-byte aligned (or w runs out). */
	if (w >= 1 && ((uintptr_t)d & 1))
	{
	    *(uint8_t *)d = (filler & 0xff);
	    w--;
	    d++;
	}

	if (w >= 2 && ((uintptr_t)d & 3))
	{
	    *(uint16_t *)d = filler;
	    w -= 2;
	    d += 2;
	}

	while (w >= 4 && ((uintptr_t)d & 7))
	{
	    *(uint32_t *)d = filler;

	    w -= 4;
	    d += 4;
	}

	/* Main loop: 64 bytes per iteration. */
	while (w >= 64)
	{
#if defined __GNUC__ && defined USE_X86_MMX
	    __asm__ (
	        "movq	%1,	  (%0)\n"
	        "movq	%2,	 8(%0)\n"
	        "movq	%3,	16(%0)\n"
	        "movq	%4,	24(%0)\n"
	        "movq	%5,	32(%0)\n"
	        "movq	%6,	40(%0)\n"
	        "movq	%7,	48(%0)\n"
	        "movq	%8,	56(%0)\n"
		:
		: "r" (d),
		  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
		  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
		: "memory");
#else
	    *(__m64*) (d +  0) = vfill;
	    *(__m64*) (d +  8) = vfill;
	    *(__m64*) (d + 16) = vfill;
	    *(__m64*) (d + 24) = vfill;
	    *(__m64*) (d + 32) = vfill;
	    *(__m64*) (d + 40) = vfill;
	    *(__m64*) (d + 48) = vfill;
	    *(__m64*) (d + 56) = vfill;
#endif
	    w -= 64;
	    d += 64;
	}

	/* Tails mirror the alignment ladder: 4-, 2-, then 1-byte stores. */
	while (w >= 4)
	{
	    *(uint32_t *)d = filler;

	    w -= 4;
	    d += 4;
	}
	if (w >= 2)
	{
	    *(uint16_t *)d = filler;
	    w -= 2;
	    d += 2;
	}
	if (w >= 1)
	{
	    *(uint8_t *)d = (filler & 0xff);
	    w--;
	    d++;
	}

    }

    _mm_empty ();
    return TRUE;
}
||
2206 | |||
/* SRC operator: convert an x8r8g8b8 source to an r5g6b5 destination,
 * ignoring the unused x channel.  Four pixels per iteration in the main
 * loop; ldq_u is used for the (possibly unaligned) source reads while the
 * head loop guarantees the destination is 8-byte aligned for the store.
 */
static void
mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst;
    uint32_t    *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Head: single pixels until dst is 8-byte aligned. */
	while (w && (uintptr_t)dst & 7)
	{
	    s = *src++;
	    *dst = convert_8888_to_0565 (s);
	    dst++;
	    w--;
	}

	/* Main loop: pack four 8888 pixels into four 565 pixels (one
	 * 64-bit store). */
	while (w >= 4)
	{
	    __m64 vdest;
	    __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
	    __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));

	    vdest = pack_4xpacked565 (vsrc0, vsrc1);

	    *(__m64 *)dst = vdest;

	    w -= 4;
	    src += 4;
	    dst += 4;
	}

	/* Tail: remaining pixels one at a time. */
	while (w)
	{
	    s = *src++;
	    *dst = convert_8888_to_0565 (s);
	    dst++;
	    w--;
	}
    }

    _mm_empty ();
}
||
2262 | |||
2263 | static void |
||
1891 | serge | 2264 | mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, |
3931 | Serge | 2265 | pixman_composite_info_t *info) |
1891 | serge | 2266 | { |
3931 | Serge | 2267 | PIXMAN_COMPOSITE_ARGS (info); |
1891 | serge | 2268 | uint32_t src, srca; |
2269 | uint32_t *dst_line, *dst; |
||
2270 | uint8_t *mask_line, *mask; |
||
2271 | int dst_stride, mask_stride; |
||
2272 | int32_t w; |
||
3931 | Serge | 2273 | __m64 vsrc; |
1891 | serge | 2274 | uint64_t srcsrc; |
2275 | |||
2276 | CHECKPOINT (); |
||
2277 | |||
3931 | Serge | 2278 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
1891 | serge | 2279 | |
2280 | srca = src >> 24; |
||
2281 | if (src == 0) |
||
2282 | { |
||
3931 | Serge | 2283 | mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride, |
2284 | PIXMAN_FORMAT_BPP (dest_image->bits.format), |
||
2285 | dest_x, dest_y, width, height, 0); |
||
1891 | serge | 2286 | return; |
2287 | } |
||
2288 | |||
2289 | srcsrc = (uint64_t)src << 32 | src; |
||
2290 | |||
3931 | Serge | 2291 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
1891 | serge | 2292 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
2293 | |||
3931 | Serge | 2294 | vsrc = load8888 (&src); |
1891 | serge | 2295 | |
2296 | while (height--) |
||
2297 | { |
||
2298 | dst = dst_line; |
||
2299 | dst_line += dst_stride; |
||
2300 | mask = mask_line; |
||
2301 | mask_line += mask_stride; |
||
2302 | w = width; |
||
2303 | |||
2304 | CHECKPOINT (); |
||
2305 | |||
3931 | Serge | 2306 | while (w && (uintptr_t)dst & 7) |
1891 | serge | 2307 | { |
2308 | uint64_t m = *mask; |
||
2309 | |||
2310 | if (m) |
||
2311 | { |
||
2312 | __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); |
||
2313 | |||
3931 | Serge | 2314 | store8888 (dst, vdest); |
1891 | serge | 2315 | } |
2316 | else |
||
2317 | { |
||
2318 | *dst = 0; |
||
2319 | } |
||
2320 | |||
2321 | w--; |
||
2322 | mask++; |
||
2323 | dst++; |
||
2324 | } |
||
2325 | |||
2326 | CHECKPOINT (); |
||
2327 | |||
2328 | while (w >= 2) |
||
2329 | { |
||
2330 | uint64_t m0, m1; |
||
2331 | m0 = *mask; |
||
2332 | m1 = *(mask + 1); |
||
2333 | |||
2334 | if (srca == 0xff && (m0 & m1) == 0xff) |
||
2335 | { |
||
2336 | *(uint64_t *)dst = srcsrc; |
||
2337 | } |
||
2338 | else if (m0 | m1) |
||
2339 | { |
||
2340 | __m64 dest0, dest1; |
||
2341 | |||
2342 | dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0))); |
||
2343 | dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1))); |
||
2344 | |||
2345 | *(__m64 *)dst = pack8888 (dest0, dest1); |
||
2346 | } |
||
2347 | else |
||
2348 | { |
||
2349 | *(uint64_t *)dst = 0; |
||
2350 | } |
||
2351 | |||
2352 | mask += 2; |
||
2353 | dst += 2; |
||
2354 | w -= 2; |
||
2355 | } |
||
2356 | |||
2357 | CHECKPOINT (); |
||
2358 | |||
3931 | Serge | 2359 | if (w) |
1891 | serge | 2360 | { |
2361 | uint64_t m = *mask; |
||
2362 | |||
2363 | if (m) |
||
2364 | { |
||
3931 | Serge | 2365 | __m64 vdest = load8888 (dst); |
1891 | serge | 2366 | |
2367 | vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); |
||
3931 | Serge | 2368 | store8888 (dst, vdest); |
1891 | serge | 2369 | } |
2370 | else |
||
2371 | { |
||
2372 | *dst = 0; |
||
2373 | } |
||
2374 | } |
||
2375 | } |
||
2376 | |||
2377 | _mm_empty (); |
||
2378 | } |
||
2379 | |||
2380 | static void |
||
2381 | mmx_composite_over_n_8_0565 (pixman_implementation_t *imp, |
||
3931 | Serge | 2382 | pixman_composite_info_t *info) |
1891 | serge | 2383 | { |
3931 | Serge | 2384 | PIXMAN_COMPOSITE_ARGS (info); |
1891 | serge | 2385 | uint32_t src, srca; |
2386 | uint16_t *dst_line, *dst; |
||
2387 | uint8_t *mask_line, *mask; |
||
2388 | int dst_stride, mask_stride; |
||
2389 | int32_t w; |
||
2390 | __m64 vsrc, vsrca, tmp; |
||
3931 | Serge | 2391 | __m64 srcsrcsrcsrc; |
1891 | serge | 2392 | |
2393 | CHECKPOINT (); |
||
2394 | |||
3931 | Serge | 2395 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
1891 | serge | 2396 | |
2397 | srca = src >> 24; |
||
2398 | if (src == 0) |
||
2399 | return; |
||
2400 | |||
3931 | Serge | 2401 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
1891 | serge | 2402 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
2403 | |||
3931 | Serge | 2404 | vsrc = load8888 (&src); |
1891 | serge | 2405 | vsrca = expand_alpha (vsrc); |
2406 | |||
2407 | tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0); |
||
3931 | Serge | 2408 | srcsrcsrcsrc = expand_alpha_rev (tmp); |
1891 | serge | 2409 | |
2410 | while (height--) |
||
2411 | { |
||
2412 | dst = dst_line; |
||
2413 | dst_line += dst_stride; |
||
2414 | mask = mask_line; |
||
2415 | mask_line += mask_stride; |
||
2416 | w = width; |
||
2417 | |||
2418 | CHECKPOINT (); |
||
2419 | |||
3931 | Serge | 2420 | while (w && (uintptr_t)dst & 7) |
1891 | serge | 2421 | { |
2422 | uint64_t m = *mask; |
||
2423 | |||
2424 | if (m) |
||
2425 | { |
||
2426 | uint64_t d = *dst; |
||
2427 | __m64 vd = to_m64 (d); |
||
2428 | __m64 vdest = in_over ( |
||
2429 | vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0)); |
||
2430 | |||
2431 | vd = pack_565 (vdest, _mm_setzero_si64 (), 0); |
||
2432 | *dst = to_uint64 (vd); |
||
2433 | } |
||
2434 | |||
2435 | w--; |
||
2436 | mask++; |
||
2437 | dst++; |
||
2438 | } |
||
2439 | |||
2440 | CHECKPOINT (); |
||
2441 | |||
2442 | while (w >= 4) |
||
2443 | { |
||
2444 | uint64_t m0, m1, m2, m3; |
||
2445 | m0 = *mask; |
||
2446 | m1 = *(mask + 1); |
||
2447 | m2 = *(mask + 2); |
||
2448 | m3 = *(mask + 3); |
||
2449 | |||
2450 | if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff) |
||
2451 | { |
||
3931 | Serge | 2452 | *(__m64 *)dst = srcsrcsrcsrc; |
1891 | serge | 2453 | } |
2454 | else if (m0 | m1 | m2 | m3) |
||
2455 | { |
||
3931 | Serge | 2456 | __m64 vdest = *(__m64 *)dst; |
2457 | __m64 v0, v1, v2, v3; |
||
1891 | serge | 2458 | __m64 vm0, vm1, vm2, vm3; |
2459 | |||
3931 | Serge | 2460 | expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0); |
1891 | serge | 2461 | |
2462 | vm0 = to_m64 (m0); |
||
3931 | Serge | 2463 | v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0); |
2464 | |||
1891 | serge | 2465 | vm1 = to_m64 (m1); |
3931 | Serge | 2466 | v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1); |
2467 | |||
1891 | serge | 2468 | vm2 = to_m64 (m2); |
3931 | Serge | 2469 | v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2); |
2470 | |||
1891 | serge | 2471 | vm3 = to_m64 (m3); |
3931 | Serge | 2472 | v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3); |
1891 | serge | 2473 | |
3931 | Serge | 2474 | *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);; |
1891 | serge | 2475 | } |
2476 | |||
2477 | w -= 4; |
||
2478 | mask += 4; |
||
2479 | dst += 4; |
||
2480 | } |
||
2481 | |||
2482 | CHECKPOINT (); |
||
2483 | |||
2484 | while (w) |
||
2485 | { |
||
2486 | uint64_t m = *mask; |
||
2487 | |||
2488 | if (m) |
||
2489 | { |
||
2490 | uint64_t d = *dst; |
||
2491 | __m64 vd = to_m64 (d); |
||
2492 | __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)), |
||
2493 | expand565 (vd, 0)); |
||
2494 | vd = pack_565 (vdest, _mm_setzero_si64 (), 0); |
||
2495 | *dst = to_uint64 (vd); |
||
2496 | } |
||
2497 | |||
2498 | w--; |
||
2499 | mask++; |
||
2500 | dst++; |
||
2501 | } |
||
2502 | } |
||
2503 | |||
2504 | _mm_empty (); |
||
2505 | } |
||
2506 | |||
/* OVER for a "pixbuf" source (non-premultiplied a8b8g8r8; the helpers
 * invert_colors/over_rev_non_pre handle channel swap and
 * premultiplication) onto an r5g6b5 destination.  Four pixels per
 * iteration in the main loop, with a fast path when all four source
 * pixels are opaque. */
static void
mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	CHECKPOINT ();

	/* Head: single pixels until dst is 8-byte aligned. */
	while (w && (uintptr_t)dst & 7)
	{
	    __m64 vsrc = load8888 (src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}

	CHECKPOINT ();

	/* Main loop: four pixels per iteration. */
	while (w >= 4)
	{
	    uint32_t s0, s1, s2, s3;
	    unsigned char a0, a1, a2, a3;

	    s0 = *src;
	    s1 = *(src + 1);
	    s2 = *(src + 2);
	    s3 = *(src + 3);

	    a0 = (s0 >> 24);
	    a1 = (s1 >> 24);
	    a2 = (s2 >> 24);
	    a3 = (s3 >> 24);

	    if ((a0 & a1 & a2 & a3) == 0xFF)
	    {
		/* All four sources opaque: no blend with dest needed,
		 * just channel-swap and pack. */
		__m64 v0 = invert_colors (load8888 (&s0));
		__m64 v1 = invert_colors (load8888 (&s1));
		__m64 v2 = invert_colors (load8888 (&s2));
		__m64 v3 = invert_colors (load8888 (&s3));

		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
	    }
	    else if (s0 | s1 | s2 | s3)
	    {
		__m64 vdest = *(__m64 *)dst;
		__m64 v0, v1, v2, v3;

		__m64 vsrc0 = load8888 (&s0);
		__m64 vsrc1 = load8888 (&s1);
		__m64 vsrc2 = load8888 (&s2);
		__m64 vsrc3 = load8888 (&s3);

		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

		v0 = over_rev_non_pre (vsrc0, v0);
		v1 = over_rev_non_pre (vsrc1, v1);
		v2 = over_rev_non_pre (vsrc2, v2);
		v3 = over_rev_non_pre (vsrc3, v3);

		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
	    }
	    /* else: all four sources fully transparent, dest unchanged */

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	CHECKPOINT ();

	/* Tail: remaining pixels one at a time. */
	while (w)
	{
	    __m64 vsrc = load8888 (src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}
||
2623 | |||
/* OVER for a non-premultiplied "pixbuf" source onto an a8r8g8b8
 * destination.  Two pixels per iteration in the main loop, with a fast
 * path when both source pixels are opaque (no dest read needed). */
static void
mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Head: single pixels until dst is 8-byte aligned. */
	while (w && (uintptr_t)dst & 7)
	{
	    __m64 s = load8888 (src);
	    __m64 d = load8888 (dst);

	    store8888 (dst, over_rev_non_pre (s, d));

	    w--;
	    dst++;
	    src++;
	}

	/* Main loop: two pixels per 64-bit store. */
	while (w >= 2)
	{
	    uint32_t s0, s1;
	    unsigned char a0, a1;
	    __m64 d0, d1;

	    s0 = *src;
	    s1 = *(src + 1);

	    a0 = (s0 >> 24);
	    a1 = (s1 >> 24);

	    if ((a0 & a1) == 0xFF)
	    {
		/* Both sources opaque: skip reading the destination. */
		d0 = invert_colors (load8888 (&s0));
		d1 = invert_colors (load8888 (&s1));

		*(__m64 *)dst = pack8888 (d0, d1);
	    }
	    else if (s0 | s1)
	    {
		__m64 vdest = *(__m64 *)dst;

		d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
		d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));

		*(__m64 *)dst = pack8888 (d0, d1);
	    }
	    /* else: both sources fully transparent, dest unchanged */

	    w -= 2;
	    dst += 2;
	    src += 2;
	}

	/* Tail: at most one leftover pixel. */
	if (w)
	{
	    __m64 s = load8888 (src);
	    __m64 d = load8888 (dst);

	    store8888 (dst, over_rev_non_pre (s, d));
	}
    }

    _mm_empty ();
}
||
2709 | |||
/* OVER with a solid source and a per-component (component-alpha) 8888
 * mask onto an r5g6b5 destination.  Each 32-bit mask pixel weights the
 * source channels individually; four destination pixels are handled per
 * main-loop iteration. */
static void
mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t    *dst_line;
    uint32_t    *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)                 /* transparent source: OVER is a no-op */
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	int twidth = width;
	uint32_t *p = (uint32_t *)mask_line;
	uint16_t *q = (uint16_t *)dst_line;

	/* Head: single pixels until the destination is 8-byte aligned. */
	while (twidth && ((uintptr_t)q & 7))
	{
	    uint32_t m = *(uint32_t *)p;

	    if (m)
	    {
		uint64_t d = *q;
		__m64 vdest = expand565 (to_m64 (d), 0);
		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
		*q = to_uint64 (vdest);
	    }

	    twidth--;
	    p++;
	    q++;
	}

	/* Main loop: four 565 pixels per iteration. */
	while (twidth >= 4)
	{
	    uint32_t m0, m1, m2, m3;

	    m0 = *p;
	    m1 = *(p + 1);
	    m2 = *(p + 2);
	    m3 = *(p + 3);

	    if ((m0 | m1 | m2 | m3))
	    {
		__m64 vdest = *(__m64 *)q;
		__m64 v0, v1, v2, v3;

		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

		v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
		v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
		v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
		v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);

		*(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
	    }
	    /* else: all four mask pixels zero, dest unchanged */
	    twidth -= 4;
	    p += 4;
	    q += 4;
	}

	/* Tail: remaining pixels one at a time. */
	while (twidth)
	{
	    uint32_t m;

	    m = *(uint32_t *)p;
	    if (m)
	    {
		uint64_t d = *q;
		__m64 vdest = expand565 (to_m64 (d), 0);
		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
		*q = to_uint64 (vdest);
	    }

	    twidth--;
	    p++;
	    q++;
	}

	mask_line += mask_stride;
	dst_line += dst_stride;
    }

    _mm_empty ();
}
||
2809 | |||
/* IN operator: solid source, a8 mask, a8 destination.
 * Each destination byte becomes dst * (src_alpha * mask) / 255².  The
 * main loop treats four consecutive a8 bytes as one packed 32-bit value
 * and multiplies all four at once with the MMX `in` helper. */
static void
mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
                        pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint8_t sa;
    __m64 vsrc, vsrca;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    sa = src >> 24;

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Head: scalar bytes until dst is 8-byte aligned. */
	while (w && (uintptr_t)dst & 7)
	{
	    uint16_t tmp;
	    uint8_t a;
	    uint32_t m, d;

	    a = *mask++;
	    d = *dst;

	    m = MUL_UN8 (sa, a, tmp);     /* combine source alpha with mask */
	    d = MUL_UN8 (m, d, tmp);      /* then multiply into dest */

	    *dst++ = d;
	    w--;
	}

	/* Main loop: four a8 bytes at a time, processed as one packed
	 * 32-bit load/store (mask may be unaligned, hence load8888u). */
	while (w >= 4)
	{
	    __m64 vmask;
	    __m64 vdest;

	    vmask = load8888u ((uint32_t *)mask);
	    vdest = load8888 ((uint32_t *)dst);

	    store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));

	    dst += 4;
	    mask += 4;
	    w -= 4;
	}

	/* Tail: remaining bytes, scalar. */
	while (w--)
	{
	    uint16_t tmp;
	    uint8_t a;
	    uint32_t m, d;

	    a = *mask++;
	    d = *dst;

	    m = MUL_UN8 (sa, a, tmp);
	    d = MUL_UN8 (m, d, tmp);

	    *dst++ = d;
	}
    }

    _mm_empty ();
}
||
2890 | |||
/* IN operator: a8 source, no mask, a8 destination — dst = src * dst / 255
 * per byte.  The main loop multiplies four bytes at once by treating them
 * as a packed 32-bit value. */
static void
mmx_composite_in_8_8 (pixman_implementation_t *imp,
                      pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    uint8_t     *src_line, *src;
    int src_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Head: scalar bytes until dst is 4-byte aligned. */
	while (w && (uintptr_t)dst & 3)
	{
	    uint8_t s, d;
	    uint16_t tmp;

	    s = *src;
	    d = *dst;

	    *dst = MUL_UN8 (s, d, tmp);

	    src++;
	    dst++;
	    w--;
	}

	/* Main loop: four bytes per iteration (src may be unaligned,
	 * hence load8888u). */
	while (w >= 4)
	{
	    uint32_t *s = (uint32_t *)src;
	    uint32_t *d = (uint32_t *)dst;

	    store8888 (d, in (load8888u (s), load8888 (d)));

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	/* Tail: remaining bytes, scalar. */
	while (w--)
	{
	    uint8_t s, d;
	    uint16_t tmp;

	    s = *src;
	    d = *dst;

	    *dst = MUL_UN8 (s, d, tmp);

	    src++;
	    dst++;
	}
    }

    _mm_empty ();
}
||
2956 | |||
/* ADD operator: solid source through an a8 mask onto an a8 destination —
 * per byte, dst = saturate(dst + src_alpha * mask / 255).  The main loop
 * handles four bytes at once using MMX saturated byte addition. */
static void
mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint8_t sa;
    __m64 vsrc, vsrca;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    sa = src >> 24;

    if (src == 0)                 /* adding zero: no-op */
	return;

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	/* Head: scalar bytes until dst is 4-byte aligned. */
	while (w && (uintptr_t)dst & 3)
	{
	    uint16_t tmp;
	    uint16_t a;
	    uint32_t m, d;
	    uint32_t r;

	    a = *mask++;
	    d = *dst;

	    m = MUL_UN8 (sa, a, tmp);     /* source alpha scaled by mask */
	    r = ADD_UN8 (m, d, tmp);      /* saturated add into dest */

	    *dst++ = r;
	    w--;
	}

	/* Main loop: four bytes at a time; _mm_adds_pu8 gives the
	 * per-byte saturation (mask may be unaligned, hence load8888u). */
	while (w >= 4)
	{
	    __m64 vmask;
	    __m64 vdest;

	    vmask = load8888u ((uint32_t *)mask);
	    vdest = load8888 ((uint32_t *)dst);

	    store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));

	    dst += 4;
	    mask += 4;
	    w -= 4;
	}

	/* Tail: remaining bytes, scalar. */
	while (w--)
	{
	    uint16_t tmp;
	    uint16_t a;
	    uint32_t m, d;
	    uint32_t r;

	    a = *mask++;
	    d = *dst;

	    m = MUL_UN8 (sa, a, tmp);
	    r = ADD_UN8 (m, d, tmp);

	    *dst++ = r;
	}
    }

    _mm_empty ();
}
||
3042 | |||
/*
 * ADD operator, a8 source onto a8 destination: dest = sat8 (dest + src).
 */
static void
mmx_composite_add_8_8 (pixman_implementation_t *imp,
                       pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint8_t s, d;
    uint16_t t;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Scalar prologue until dst is 8-byte aligned.
	 * t holds the 9-bit sum; (0 - (t >> 8)) is 0xff.. when t
	 * overflowed 255, so the OR saturates the result to 0xff. */
	while (w && (uintptr_t)dst & 7)
	{
	    s = *src;
	    d = *dst;
	    t = d + s;
	    s = t | (0 - (t >> 8));
	    *dst = s;

	    dst++;
	    src++;
	    w--;
	}

	/* MMX main loop: 8 bytes at a time with saturating byte add;
	 * src may be unaligned, hence ldq_u. */
	while (w >= 8)
	{
	    *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
	    dst += 8;
	    src += 8;
	    w -= 8;
	}

	/* Scalar epilogue for the trailing bytes. */
	while (w)
	{
	    s = *src;
	    d = *dst;
	    t = d + s;
	    s = t | (0 - (t >> 8));
	    *dst = s;

	    dst++;
	    src++;
	    w--;
	}
    }

    _mm_empty ();
}
||
3105 | |||
/*
 * ADD operator, r5g6b5 source onto r5g6b5 destination.  Pixels are
 * widened to 8888, added with per-channel saturation, and re-packed.
 */
static void
mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst;
    uint32_t	d;
    uint16_t    *src_line, *src;
    uint32_t	s;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Scalar prologue until dst is 8-byte aligned.  Zero source or
	 * destination pixels let us skip the conversion work. */
	while (w && (uintptr_t)dst & 7)
	{
	    s = *src++;
	    if (s)
	    {
		d = *dst;
		s = convert_0565_to_8888 (s);
		if (d)
		{
		    d = convert_0565_to_8888 (d);
		    UN8x4_ADD_UN8x4 (s, d);
		}
		*dst = convert_8888_to_0565 (s);
	    }
	    dst++;
	    w--;
	}

	/* MMX main loop: four 565 pixels per iteration; unpack both
	 * operands to two 8888 vectors, saturating add, repack. */
	while (w >= 4)
	{
	    __m64 vdest = *(__m64 *)dst;
	    __m64 vsrc = ldq_u ((__m64 *)src);
	    __m64 vd0, vd1;
	    __m64 vs0, vs1;

	    expand_4xpacked565 (vdest, &vd0, &vd1, 0);
	    expand_4xpacked565 (vsrc, &vs0, &vs1, 0);

	    vd0 = _mm_adds_pu8 (vd0, vs0);
	    vd1 = _mm_adds_pu8 (vd1, vs1);

	    *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);

	    dst += 4;
	    src += 4;
	    w -= 4;
	}

	/* Scalar epilogue for the remaining 0..3 pixels. */
	while (w--)
	{
	    s = *src++;
	    if (s)
	    {
		d = *dst;
		s = convert_0565_to_8888 (s);
		if (d)
		{
		    d = convert_0565_to_8888 (d);
		    UN8x4_ADD_UN8x4 (s, d);
		}
		*dst = convert_8888_to_0565 (s);
	    }
	    dst++;
	}
    }

    _mm_empty ();
}
||
3189 | |||
/*
 * ADD operator, a8r8g8b8 source onto a8r8g8b8 destination:
 * per-channel saturating byte addition.
 */
static void
mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Single-pixel prologue until dst is 8-byte aligned. */
	while (w && (uintptr_t)dst & 7)
	{
	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
	                              load ((const uint32_t *)dst)));
	    dst++;
	    src++;
	    w--;
	}

	/* MMX main loop: two 32-bit pixels (one quadword) at a time. */
	while (w >= 2)
	{
	    *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
	    dst += 2;
	    src += 2;
	    w -= 2;
	}

	/* At most one trailing pixel remains after the 2-wide loop. */
	if (w)
	{
	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
	                              load ((const uint32_t *)dst)));

	}
    }

    _mm_empty ();
}
||
3240 | |||
/*
 * Straight rectangular copy (blit) between two bits buffers using MMX.
 *
 * Strides are given in uint32_t units and converted to byte strides
 * below.  Returns FALSE when the source and destination depths differ
 * or when the depth is not 16 or 32 bpp, so the caller can fall back
 * to a generic path; returns TRUE when the copy was performed.
 */
static pixman_bool_t
mmx_blt (pixman_implementation_t *imp,
         uint32_t *               src_bits,
         uint32_t *               dst_bits,
         int                      src_stride,
         int                      dst_stride,
         int                      src_bpp,
         int                      dst_bpp,
         int                      src_x,
         int                      src_y,
         int                      dest_x,
         int                      dest_y,
         int                      width,
         int                      height)
{
    uint8_t *   src_bytes;
    uint8_t *   dst_bytes;
    int byte_width;

    if (src_bpp != dst_bpp)
	return FALSE;

    if (src_bpp == 16)
    {
	/* Convert uint32-unit strides to uint16-unit, compute start
	 * pixel addresses, then widen everything to byte units. */
	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
	byte_width = 2 * width;
	src_stride *= 2;
	dst_stride *= 2;
    }
    else if (src_bpp == 32)
    {
	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
	byte_width = 4 * width;
	src_stride *= 4;
	dst_stride *= 4;
    }
    else
    {
	return FALSE;
    }

    while (height--)
    {
	int w;
	uint8_t *s = src_bytes;
	uint8_t *d = dst_bytes;
	src_bytes += src_stride;
	dst_bytes += dst_stride;
	w = byte_width;

	/* Alignment cascade: copy 1, then 2, then 4 bytes at a time
	 * until d is 8-byte aligned (only d's alignment is enforced;
	 * unaligned source reads go through ldl_u/ldq_u). */
	if (w >= 1 && ((uintptr_t)d & 1))
	{
	    *(uint8_t *)d = *(uint8_t *)s;
	    w -= 1;
	    s += 1;
	    d += 1;
	}

	if (w >= 2 && ((uintptr_t)d & 3))
	{
	    *(uint16_t *)d = *(uint16_t *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}

	while (w >= 4 && ((uintptr_t)d & 7))
	{
	    *(uint32_t *)d = ldl_u ((uint32_t *)s);

	    w -= 4;
	    s += 4;
	    d += 4;
	}

	/* Main loop: 64 bytes per iteration through all eight MMX
	 * registers (inline asm on GCC/Sun x86, intrinsics elsewhere). */
	while (w >= 64)
	{
#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
	    __asm__ (
	        "movq	  (%1),	  %%mm0\n"
	        "movq	 8(%1),	  %%mm1\n"
	        "movq	16(%1),	  %%mm2\n"
	        "movq	24(%1),	  %%mm3\n"
	        "movq	32(%1),	  %%mm4\n"
	        "movq	40(%1),	  %%mm5\n"
	        "movq	48(%1),	  %%mm6\n"
	        "movq	56(%1),	  %%mm7\n"

	        "movq	%%mm0,	  (%0)\n"
	        "movq	%%mm1,	 8(%0)\n"
	        "movq	%%mm2,	16(%0)\n"
	        "movq	%%mm3,	24(%0)\n"
	        "movq	%%mm4,	32(%0)\n"
	        "movq	%%mm5,	40(%0)\n"
	        "movq	%%mm6,	48(%0)\n"
	        "movq	%%mm7,	56(%0)\n"
		:
		: "r" (d), "r" (s)
		: "memory",
		  "%mm0", "%mm1", "%mm2", "%mm3",
		  "%mm4", "%mm5", "%mm6", "%mm7");
#else
	    __m64 v0 = ldq_u ((__m64 *)(s + 0));
	    __m64 v1 = ldq_u ((__m64 *)(s + 8));
	    __m64 v2 = ldq_u ((__m64 *)(s + 16));
	    __m64 v3 = ldq_u ((__m64 *)(s + 24));
	    __m64 v4 = ldq_u ((__m64 *)(s + 32));
	    __m64 v5 = ldq_u ((__m64 *)(s + 40));
	    __m64 v6 = ldq_u ((__m64 *)(s + 48));
	    __m64 v7 = ldq_u ((__m64 *)(s + 56));
	    *(__m64 *)(d + 0)  = v0;
	    *(__m64 *)(d + 8)  = v1;
	    *(__m64 *)(d + 16) = v2;
	    *(__m64 *)(d + 24) = v3;
	    *(__m64 *)(d + 32) = v4;
	    *(__m64 *)(d + 40) = v5;
	    *(__m64 *)(d + 48) = v6;
	    *(__m64 *)(d + 56) = v7;
#endif

	    w -= 64;
	    s += 64;
	    d += 64;
	}
	/* Tail: leftover dwords, then an optional final word.  At this
	 * point w < 64 and the start was dword-aligned, so no single
	 * trailing byte can remain (byte_width is a multiple of 2). */
	while (w >= 4)
	{
	    *(uint32_t *)d = ldl_u ((uint32_t *)s);

	    w -= 4;
	    s += 4;
	    d += 4;
	}
	if (w >= 2)
	{
	    *(uint16_t *)d = *(uint16_t *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}
    }

    _mm_empty ();

    return TRUE;
}
||
3392 | |||
3393 | static void |
||
3394 | mmx_composite_copy_area (pixman_implementation_t *imp, |
||
3931 | Serge | 3395 | pixman_composite_info_t *info) |
1891 | serge | 3396 | { |
3931 | Serge | 3397 | PIXMAN_COMPOSITE_ARGS (info); |
3398 | |||
3399 | mmx_blt (imp, src_image->bits.bits, |
||
3400 | dest_image->bits.bits, |
||
3401 | src_image->bits.rowstride, |
||
3402 | dest_image->bits.rowstride, |
||
3403 | PIXMAN_FORMAT_BPP (src_image->bits.format), |
||
3404 | PIXMAN_FORMAT_BPP (dest_image->bits.format), |
||
3405 | src_x, src_y, dest_x, dest_y, width, height); |
||
1891 | serge | 3406 | } |
3407 | |||
/*
 * OVER operator, x8r8g8b8 source with an a8 mask onto an 8888
 * destination.  The source's undefined alpha byte is forced to 0xff
 * before compositing.
 */
static void
mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t  *src, *src_line;
    uint32_t  *dst, *dst_line;
    uint8_t   *mask, *mask_line;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	src = src_line;
	src_line += src_stride;
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;

	w = width;

	while (w--)
	{
	    uint64_t m = *mask;

	    /* Zero mask leaves the destination untouched. */
	    if (m)
	    {
		/* x888 alpha is undefined; make the pixel opaque. */
		uint32_t ssrc = *src | 0xff000000;
		__m64 s = load8888 (&ssrc);

		if (m == 0xff)
		{
		    /* Fully opaque mask + opaque source: plain copy. */
		    store8888 (dst, s);
		}
		else
		{
		    __m64 sa = expand_alpha (s);
		    __m64 vm = expand_alpha_rev (to_m64 (m));
		    __m64 vdest = in_over (s, sa, vm, load8888 (dst));

		    store8888 (dst, vdest);
		}
	    }

	    mask++;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}
||
3465 | |||
3931 | Serge | 3466 | static void |
3467 | mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp, |
||
3468 | pixman_composite_info_t *info) |
||
3469 | { |
||
3470 | PIXMAN_COMPOSITE_ARGS (info); |
||
3471 | uint32_t src; |
||
3472 | uint32_t *dst_line, *dst; |
||
3473 | int32_t w; |
||
3474 | int dst_stride; |
||
3475 | __m64 vsrc; |
||
3476 | |||
3477 | CHECKPOINT (); |
||
3478 | |||
3479 | src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format); |
||
3480 | |||
3481 | if (src == 0) |
||
3482 | return; |
||
3483 | |||
3484 | PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
||
3485 | |||
3486 | vsrc = load8888 (&src); |
||
3487 | |||
3488 | while (height--) |
||
3489 | { |
||
3490 | dst = dst_line; |
||
3491 | dst_line += dst_stride; |
||
3492 | w = width; |
||
3493 | |||
3494 | CHECKPOINT (); |
||
3495 | |||
3496 | while (w && (uintptr_t)dst & 7) |
||
3497 | { |
||
3498 | __m64 vdest = load8888 (dst); |
||
3499 | |||
3500 | store8888 (dst, over (vdest, expand_alpha (vdest), vsrc)); |
||
3501 | |||
3502 | w--; |
||
3503 | dst++; |
||
3504 | } |
||
3505 | |||
3506 | while (w >= 2) |
||
3507 | { |
||
3508 | __m64 vdest = *(__m64 *)dst; |
||
3509 | __m64 dest0 = expand8888 (vdest, 0); |
||
3510 | __m64 dest1 = expand8888 (vdest, 1); |
||
3511 | |||
3512 | |||
3513 | dest0 = over (dest0, expand_alpha (dest0), vsrc); |
||
3514 | dest1 = over (dest1, expand_alpha (dest1), vsrc); |
||
3515 | |||
3516 | *(__m64 *)dst = pack8888 (dest0, dest1); |
||
3517 | |||
3518 | dst += 2; |
||
3519 | w -= 2; |
||
3520 | } |
||
3521 | |||
3522 | CHECKPOINT (); |
||
3523 | |||
3524 | if (w) |
||
3525 | { |
||
3526 | __m64 vdest = load8888 (dst); |
||
3527 | |||
3528 | store8888 (dst, over (vdest, expand_alpha (vdest), vsrc)); |
||
3529 | } |
||
3530 | } |
||
3531 | |||
3532 | _mm_empty (); |
||
3533 | } |
||
3534 | |||
/* Bilinear fixed-point scale: 1 << BILINEAR_INTERPOLATION_BITS. */
#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
/* Mask selecting the fractional bits of a bilinear weight. */
#define BMSK (BSHIFT - 1)

/* Per-scanline constants for the bilinear interpolator: vertical
 * weights (wt/wb), horizontal step (unit_x) and the running x
 * coordinate, all replicated across the four 16-bit MMX lanes. */
#define BILINEAR_DECLARE_VARIABLES					\
    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);			\
    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);			\
    const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT); \
    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);			\
    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);		\
    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);	\
    const __m64 mm_zero = _mm_setzero_si64 ();				\
    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)

/* Compute one bilinearly interpolated a8r8g8b8 pixel into `pix` from
 * src_top/src_bottom at fixed-point position vx, then advance vx.
 * The <8-bit-precision branch uses pmaddwd with combined weights; the
 * 8-bit branch needs full 32-bit intermediates via mulhi/mullo. */
#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)				\
do {									\
    /* fetch 2x2 pixel block into 2 mmx registers */			\
    __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);	\
    __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);	\
    /* vertical interpolation */					\
    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);	\
    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);	\
    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);	\
    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);	\
    __m64 hi = _mm_add_pi16 (t_hi, b_hi);				\
    __m64 lo = _mm_add_pi16 (t_lo, b_lo);				\
    vx += unit_x;							\
    if (BILINEAR_INTERPOLATION_BITS < 8)				\
    {									\
	/* calculate horizontal weights */				\
	__m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,	\
			  _mm_srli_pi16 (mm_x,				\
					 16 - BILINEAR_INTERPOLATION_BITS))); \
	/* horizontal interpolation */					\
	__m64 p = _mm_unpacklo_pi16 (lo, hi);				\
	__m64 q = _mm_unpackhi_pi16 (lo, hi);				\
	lo = _mm_madd_pi16 (p, mm_wh);					\
	hi = _mm_madd_pi16 (q, mm_wh);					\
    }									\
    else								\
    {									\
	/* calculate horizontal weights */				\
	__m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x,	\
					16 - BILINEAR_INTERPOLATION_BITS)); \
	__m64 mm_wh_hi = _mm_srli_pi16 (mm_x,				\
					16 - BILINEAR_INTERPOLATION_BITS); \
	/* horizontal interpolation */					\
	__m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo);			\
	__m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi);			\
	__m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo);			\
	__m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi);			\
	lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo),	\
			   _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi));	\
	hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo),	\
			   _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi));	\
    }									\
    mm_x = _mm_add_pi16 (mm_x, mm_ux);					\
    /* shift and pack the result */					\
    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);		\
    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);		\
    lo = _mm_packs_pi32 (lo, hi);					\
    lo = _mm_packs_pu16 (lo, lo);					\
    pix = lo;								\
} while (0)

/* Advance the interpolation position without producing a pixel
 * (used when the mask is zero). */
#define BILINEAR_SKIP_ONE_PIXEL()					\
do {									\
    vx += unit_x;							\
    mm_x = _mm_add_pi16 (mm_x, mm_ux);					\
} while(0)
||
3604 | |||
/*
 * Bilinear-scaled SRC scanline, 8888 -> 8888, no mask: every output
 * pixel is the interpolated source pixel, stored unconditionally.
 */
static force_inline void
scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
                                            const uint32_t * mask,
                                            const uint32_t * src_top,
                                            const uint32_t * src_bottom,
                                            int32_t          w,
                                            int              wt,
                                            int              wb,
                                            pixman_fixed_t   vx,
                                            pixman_fixed_t   unit_x,
                                            pixman_fixed_t   max_vx,
                                            pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix;

    while (w--)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
	store (dst, pix);
	dst++;
    }

    _mm_empty ();
}

/* Instantiate the bilinear main loops for each repeat mode. */
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)
3647 | |||
/*
 * Bilinear-scaled OVER scanline, 8888 -> 8888, no mask.  Fully
 * transparent interpolated pixels are skipped (dest unchanged).
 */
static force_inline void
scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
                                             const uint32_t * mask,
                                             const uint32_t * src_top,
                                             const uint32_t * src_bottom,
                                             int32_t          w,
                                             int              wt,
                                             int              wb,
                                             pixman_fixed_t   vx,
                                             pixman_fixed_t   unit_x,
                                             pixman_fixed_t   max_vx,
                                             pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix1, pix2;

    while (w)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

	/* src == 0 makes OVER a no-op for this pixel. */
	if (!is_zero (pix1))
	{
	    pix2 = load (dst);
	    store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
	}

	w--;
	dst++;
    }

    _mm_empty ();
}

/* Instantiate the bilinear main loops for each repeat mode. */
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)
3697 | |||
/*
 * Bilinear-scaled OVER scanline, 8888 source with an a8 mask onto an
 * 8888 destination.  Zero mask skips interpolation entirely; opaque
 * mask with an opaque interpolated pixel is a straight store.
 */
static force_inline void
scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
                                               const uint8_t  * mask,
                                               const uint32_t * src_top,
                                               const uint32_t * src_bottom,
                                               int32_t          w,
                                               int              wt,
                                               int              wb,
                                               pixman_fixed_t   vx,
                                               pixman_fixed_t   unit_x,
                                               pixman_fixed_t   max_vx,
                                               pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix1, pix2;
    uint32_t m;

    while (w)
    {
	m = (uint32_t) *mask++;

	if (m)
	{
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

	    if (m == 0xff && is_opaque (pix1))
	    {
		store (dst, pix1);
	    }
	    else
	    {
		__m64 ms, md, ma, msa;

		pix2 = load (dst);
		ma = expand_alpha_rev (to_m64 (m));
		ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
		md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());

		msa = expand_alpha (ms);

		store8888 (dst, (in_over (ms, msa, ma, md)));
	    }
	}
	else
	{
	    /* Keep vx/mm_x in step even when nothing is drawn. */
	    BILINEAR_SKIP_ONE_PIXEL ();
	}

	w--;
	dst++;
    }

    _mm_empty ();
}

/* Instantiate the bilinear main loops for each repeat mode. */
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       COVER, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       PAD, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NONE, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
3769 | |||
/*
 * Scanline fetcher: x8r8g8b8 -> a8r8g8b8.  Copies one row into
 * iter->buffer, forcing the alpha byte to 0xff, and advances the
 * iterator to the next row.
 */
static uint32_t *
mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;

    iter->bits += iter->stride;

    /* Scalar prologue until dst is 8-byte aligned. */
    while (w && ((uintptr_t)dst) & 7)
    {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }

    /* MMX main loop: eight pixels per iteration, OR in opaque alpha. */
    while (w >= 8)
    {
	__m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
	__m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
	__m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
	__m64 vsrc4 = ldq_u ((__m64 *)(src + 6));

	*(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
	*(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
	*(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
	*(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));

	dst += 8;
	src += 8;
	w -= 8;
    }

    /* Scalar epilogue. */
    while (w)
    {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }

    _mm_empty ();
    return iter->buffer;
}
||
3811 | |||
/*
 * Scanline fetcher: r5g6b5 -> a8r8g8b8.  Expands one row of 565
 * pixels into iter->buffer and advances the iterator.
 */
static uint32_t *
mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;

    iter->bits += iter->stride;

    /* Scalar prologue until dst is 16-byte aligned (two quadwords
     * are written per vector iteration). */
    while (w && ((uintptr_t)dst) & 0x0f)
    {
	uint16_t s = *src++;

	*dst++ = convert_0565_to_8888 (s);
	w--;
    }

    /* MMX main loop: expand four 565 pixels to four 8888 pixels
     * (final argument 1 fills alpha — see expand_4xpacked565). */
    while (w >= 4)
    {
	__m64 vsrc = ldq_u ((__m64 *)src);
	__m64 mm0, mm1;

	expand_4xpacked565 (vsrc, &mm0, &mm1, 1);

	*(__m64 *)(dst + 0) = mm0;
	*(__m64 *)(dst + 2) = mm1;

	dst += 4;
	src += 4;
	w -= 4;
    }

    /* Scalar epilogue. */
    while (w)
    {
	uint16_t s = *src++;

	*dst++ = convert_0565_to_8888 (s);
	w--;
    }

    _mm_empty ();
    return iter->buffer;
}
||
3855 | |||
/*
 * Scanline fetcher: a8 -> a8r8g8b8.  Each alpha byte becomes a pixel
 * with that alpha and zero color channels (value << 24).
 */
static uint32_t *
mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;

    iter->bits += iter->stride;

    /* Scalar prologue until dst is 16-byte aligned. */
    while (w && (((uintptr_t)dst) & 15))
    {
	*dst++ = *(src++) << 24;
	w--;
    }

    /* MMX main loop: widen 8 alpha bytes to 8 pixels by unpacking
     * against zero twice, which lands each byte in the top byte of
     * its 32-bit lane. */
    while (w >= 8)
    {
	__m64 mm0 = ldq_u ((__m64 *)src);

	__m64 mm1 = _mm_unpacklo_pi8 (_mm_setzero_si64(), mm0);
	__m64 mm2 = _mm_unpackhi_pi8 (_mm_setzero_si64(), mm0);
	__m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
	__m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
	__m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
	__m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);

	*(__m64 *)(dst + 0) = mm3;
	*(__m64 *)(dst + 2) = mm4;
	*(__m64 *)(dst + 4) = mm5;
	*(__m64 *)(dst + 6) = mm6;

	dst += 8;
	src += 8;
	w -= 8;
    }

    /* Scalar epilogue. */
    while (w)
    {
	*dst++ = *(src++) << 24;
	w--;
    }

    _mm_empty ();
    return iter->buffer;
}
||
3901 | |||
/* Maps a pixel format to its MMX scanline fetcher. */
typedef struct
{
    pixman_format_code_t	format;
    pixman_iter_get_scanline_t	get_scanline;
} fetcher_info_t;

/* Formats with an accelerated fetcher; PIXMAN_null terminates the list. */
static const fetcher_info_t fetchers[] =
{
    { PIXMAN_x8r8g8b8,		mmx_fetch_x8r8g8b8 },
    { PIXMAN_r5g6b5,		mmx_fetch_r5g6b5 },
    { PIXMAN_a8,		mmx_fetch_a8 },
    { PIXMAN_null }
};
||
3915 | |||
/*
 * Try to set up an MMX source iterator.  Succeeds (returns TRUE and
 * installs get_scanline) only for narrow iterators over untransformed
 * bits images whose format has an entry in the fetchers table;
 * otherwise returns FALSE so a generic iterator is used instead.
 */
static pixman_bool_t
mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
{
    pixman_image_t *image = iter->image;

#define FLAGS						\
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |	\
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

    if ((iter->iter_flags & ITER_NARROW)		&&
	(iter->image_flags & FLAGS) == FLAGS)
    {
	const fetcher_info_t *f;

	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
	{
	    if (image->common.extended_format_code == f->format)
	    {
		uint8_t *b = (uint8_t *)image->bits.bits;
		/* rowstride is in uint32_t units; convert to bytes. */
		int s = image->bits.rowstride * 4;

		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
		iter->stride = s;

		iter->get_scanline = f->get_scanline;
		return TRUE;
	    }
	}
    }

    return FALSE;
}
||
3948 | |||
/* Fast-path dispatch table for the MMX backend.
 *
 * Each PIXMAN_STD_FAST_PATH (op, src, mask, dest, func) entry maps a
 * composite-operation / source-format / mask-format / destination-format
 * combination to a specialized MMX composite routine; the _CA variant
 * registers a component-alpha path, and the SIMPLE_BILINEAR_* macros
 * register bilinear-scaled variants.  The table is terminated by the
 * PIXMAN_OP_NONE sentinel entry.
 *
 * NOTE(review): entry order is assumed to be match priority (first match
 * wins) — do not reorder entries; confirm against the fast-path lookup in
 * pixman-implementation.c / pixman-utils.c.
 */
static const pixman_fast_path_t mmx_fast_paths[] =
{
    /* OVER with an 8-bit alpha mask over a solid source */
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),

    /* OVER, component-alpha (per-channel) masks */
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),

    /* OVER from (reversed) pixbuf sources */
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),

    /* OVER with a solid (constant) mask */
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),

    /* OVER of x888 sources through an a8 mask */
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),

    /* Unmasked OVER */
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
    /* x888 sources have no alpha, so OVER degenerates to a copy */
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),

    /* OVER_REVERSE */
    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),

    /* ADD */
    PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8             ),
    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),

    /* SRC: format conversions and plain copies */
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),

    /* IN */
    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),

    /* Bilinear-scaled SRC */
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, mmx_8888_8888 ),

    /* Bilinear-scaled OVER */
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8888 ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8888 ),

    /* Bilinear-scaled OVER through an a8 mask */
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888 ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888 ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888 ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888 ),

    /* Sentinel: terminates the table */
    { PIXMAN_OP_NONE },
};
||
1891 | serge | 4044 | |
4045 | pixman_implementation_t * |
||
3931 | Serge | 4046 | _pixman_implementation_create_mmx (pixman_implementation_t *fallback) |
1891 | serge | 4047 | { |
3931 | Serge | 4048 | pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths); |
1891 | serge | 4049 | |
4050 | imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u; |
||
4051 | imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u; |
||
4052 | imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u; |
||
4053 | imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u; |
||
4054 | imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u; |
||
4055 | imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u; |
||
4056 | imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u; |
||
4057 | imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u; |
||
4058 | imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u; |
||
4059 | imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u; |
||
4060 | imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u; |
||
4061 | |||
4062 | imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca; |
||
4063 | imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca; |
||
4064 | imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca; |
||
4065 | imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca; |
||
4066 | imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca; |
||
4067 | imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca; |
||
4068 | imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca; |
||
4069 | imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca; |
||
4070 | imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca; |
||
4071 | imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca; |
||
4072 | imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca; |
||
4073 | |||
4074 | imp->blt = mmx_blt; |
||
4075 | imp->fill = mmx_fill; |
||
4076 | |||
3931 | Serge | 4077 | imp->src_iter_init = mmx_src_iter_init; |
4078 | |||
1891 | serge | 4079 | return imp; |
4080 | } |
||
4081 | |||
#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */