Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
1891 | serge | 1 | /* |
2 | * Copyright © 2004, 2005 Red Hat, Inc. |
||
3 | * Copyright © 2004 Nicholas Miell |
||
4 | * Copyright © 2005 Trolltech AS |
||
5 | * |
||
6 | * Permission to use, copy, modify, distribute, and sell this software and its |
||
7 | * documentation for any purpose is hereby granted without fee, provided that |
||
8 | * the above copyright notice appear in all copies and that both that |
||
9 | * copyright notice and this permission notice appear in supporting |
||
10 | * documentation, and that the name of Red Hat not be used in advertising or |
||
11 | * publicity pertaining to distribution of the software without specific, |
||
12 | * written prior permission. Red Hat makes no representations about the |
||
13 | * suitability of this software for any purpose. It is provided "as is" |
||
14 | * without express or implied warranty. |
||
15 | * |
||
16 | * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS |
||
17 | * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND |
||
18 | * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY |
||
19 | * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES |
||
20 | * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN |
||
21 | * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING |
||
22 | * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS |
||
23 | * SOFTWARE. |
||
24 | * |
||
25 | * Author: Søren Sandmann (sandmann@redhat.com) |
||
26 | * Minor Improvements: Nicholas Miell (nmiell@gmail.com) |
||
27 | * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com) |
||
28 | * |
||
29 | * Based on work by Owen Taylor |
||
30 | */ |
||
31 | |||
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#ifdef USE_MMX

#include <mmintrin.h>
#include "pixman-private.h"
#include "pixman-combine32.h"
||
41 | |||
/* Flip this to "#define VERBOSE" to get a trace line at every CHECKPOINT. */
#define no_vERBOSE

#ifdef VERBOSE
#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
#else
#define CHECKPOINT()
#endif
||
49 | |||
50 | /* Notes about writing mmx code |
||
51 | * |
||
52 | * give memory operands as the second operand. If you give it as the |
||
53 | * first, gcc will first load it into a register, then use that |
||
54 | * register |
||
55 | * |
||
56 | * ie. use |
||
57 | * |
||
58 | * _mm_mullo_pi16 (x, mmx_constant); |
||
59 | * |
||
60 | * not |
||
61 | * |
||
62 | * _mm_mullo_pi16 (mmx_constant, x); |
||
63 | * |
||
64 | * Also try to minimize dependencies. i.e. when you need a value, try |
||
65 | * to calculate it from a value that was calculated as early as |
||
66 | * possible. |
||
67 | */ |
||
68 | |||
69 | /* --------------- MMX primitives ------------------------------------- */ |
||
70 | |||
/* Storage type for the 64-bit MMX constants below.  With GCC we keep them
 * as plain integers and cast at the point of use; elsewhere they are
 * stored directly as __m64, which may itself be a struct or union. */
#ifdef __GNUC__
typedef uint64_t mmxdatafield;
#else
typedef __m64 mmxdatafield;
/* If __m64 is defined as a struct or union, M64_MEMBER names the member
 * used to reach the raw 64-bit payload. */
# ifdef _MSC_VER
#  define M64_MEMBER m64_u64
# elif defined(__SUNPRO_C)
#  define M64_MEMBER l_
# endif
#endif
||
83 | |||
84 | typedef struct |
||
85 | { |
||
86 | mmxdatafield mmx_4x00ff; |
||
87 | mmxdatafield mmx_4x0080; |
||
88 | mmxdatafield mmx_565_rgb; |
||
89 | mmxdatafield mmx_565_unpack_multiplier; |
||
90 | mmxdatafield mmx_565_r; |
||
91 | mmxdatafield mmx_565_g; |
||
92 | mmxdatafield mmx_565_b; |
||
93 | mmxdatafield mmx_mask_0; |
||
94 | mmxdatafield mmx_mask_1; |
||
95 | mmxdatafield mmx_mask_2; |
||
96 | mmxdatafield mmx_mask_3; |
||
97 | mmxdatafield mmx_full_alpha; |
||
98 | mmxdatafield mmx_ffff0000ffff0000; |
||
99 | mmxdatafield mmx_0000ffff00000000; |
||
100 | mmxdatafield mmx_000000000000ffff; |
||
101 | } mmx_data_t; |
||
102 | |||
/* Initializer for one mmx_data_t field, papering over the three possible
 * representations of mmxdatafield (see above). */
#if defined(_MSC_VER)
# define MMXDATA_INIT(field, val) { val ## UI64 }
#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
# define MMXDATA_INIT(field, val) field = { val ## ULL }
#else                           /* __m64 is an integral type */
# define MMXDATA_INIT(field, val) field = val ## ULL
#endif
||
110 | |||
111 | static const mmx_data_t c = |
||
112 | { |
||
113 | MMXDATA_INIT (.mmx_4x00ff, 0x00ff00ff00ff00ff), |
||
114 | MMXDATA_INIT (.mmx_4x0080, 0x0080008000800080), |
||
115 | MMXDATA_INIT (.mmx_565_rgb, 0x000001f0003f001f), |
||
116 | MMXDATA_INIT (.mmx_565_unpack_multiplier, 0x0000008404100840), |
||
117 | MMXDATA_INIT (.mmx_565_r, 0x000000f800000000), |
||
118 | MMXDATA_INIT (.mmx_565_g, 0x0000000000fc0000), |
||
119 | MMXDATA_INIT (.mmx_565_b, 0x00000000000000f8), |
||
120 | MMXDATA_INIT (.mmx_mask_0, 0xffffffffffff0000), |
||
121 | MMXDATA_INIT (.mmx_mask_1, 0xffffffff0000ffff), |
||
122 | MMXDATA_INIT (.mmx_mask_2, 0xffff0000ffffffff), |
||
123 | MMXDATA_INIT (.mmx_mask_3, 0x0000ffffffffffff), |
||
124 | MMXDATA_INIT (.mmx_full_alpha, 0x00ff000000000000), |
||
125 | MMXDATA_INIT (.mmx_ffff0000ffff0000, 0xffff0000ffff0000), |
||
126 | MMXDATA_INIT (.mmx_0000ffff00000000, 0x0000ffff00000000), |
||
127 | MMXDATA_INIT (.mmx_000000000000ffff, 0x000000000000ffff), |
||
128 | }; |
||
129 | |||
/* MC(x) fetches constant c.mmx_##x as an __m64, converting from the
 * uint64_t representation where necessary. */
#ifdef __GNUC__
# ifdef __ICC
#  define MC(x) to_m64 (c.mmx_ ## x)
# else
#  define MC(x) ((__m64)c.mmx_ ## x)
# endif
#else
# define MC(x) c.mmx_ ## x
#endif
||
139 | |||
140 | static force_inline __m64 |
||
141 | to_m64 (uint64_t x) |
||
142 | { |
||
143 | #ifdef __ICC |
||
144 | return _mm_cvtsi64_m64 (x); |
||
145 | #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ |
||
146 | __m64 res; |
||
147 | |||
148 | res.M64_MEMBER = x; |
||
149 | return res; |
||
150 | #else /* __m64 is an integral type */ |
||
151 | return (__m64)x; |
||
152 | #endif |
||
153 | } |
||
154 | |||
155 | static force_inline uint64_t |
||
156 | to_uint64 (__m64 x) |
||
157 | { |
||
158 | #ifdef __ICC |
||
159 | return _mm_cvtm64_si64 (x); |
||
160 | #elif defined M64_MEMBER /* __m64 is a struct, not an integral type */ |
||
161 | uint64_t res = x.M64_MEMBER; |
||
162 | return res; |
||
163 | #else /* __m64 is an integral type */ |
||
164 | return (uint64_t)x; |
||
165 | #endif |
||
166 | } |
||
167 | |||
168 | static force_inline __m64 |
||
169 | shift (__m64 v, |
||
170 | int s) |
||
171 | { |
||
172 | if (s > 0) |
||
173 | return _mm_slli_si64 (v, s); |
||
174 | else if (s < 0) |
||
175 | return _mm_srli_si64 (v, -s); |
||
176 | else |
||
177 | return v; |
||
178 | } |
||
179 | |||
180 | static force_inline __m64 |
||
181 | negate (__m64 mask) |
||
182 | { |
||
183 | return _mm_xor_si64 (mask, MC (4x00ff)); |
||
184 | } |
||
185 | |||
186 | static force_inline __m64 |
||
187 | pix_multiply (__m64 a, __m64 b) |
||
188 | { |
||
189 | __m64 res; |
||
190 | |||
191 | res = _mm_mullo_pi16 (a, b); |
||
192 | res = _mm_adds_pu16 (res, MC (4x0080)); |
||
193 | res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8)); |
||
194 | res = _mm_srli_pi16 (res, 8); |
||
195 | |||
196 | return res; |
||
197 | } |
||
198 | |||
199 | static force_inline __m64 |
||
200 | pix_add (__m64 a, __m64 b) |
||
201 | { |
||
202 | return _mm_adds_pu8 (a, b); |
||
203 | } |
||
204 | |||
205 | static force_inline __m64 |
||
206 | expand_alpha (__m64 pixel) |
||
207 | { |
||
208 | __m64 t1, t2; |
||
209 | |||
210 | t1 = shift (pixel, -48); |
||
211 | t2 = shift (t1, 16); |
||
212 | t1 = _mm_or_si64 (t1, t2); |
||
213 | t2 = shift (t1, 32); |
||
214 | t1 = _mm_or_si64 (t1, t2); |
||
215 | |||
216 | return t1; |
||
217 | } |
||
218 | |||
219 | static force_inline __m64 |
||
220 | expand_alpha_rev (__m64 pixel) |
||
221 | { |
||
222 | __m64 t1, t2; |
||
223 | |||
224 | /* move alpha to low 16 bits and zero the rest */ |
||
225 | t1 = shift (pixel, 48); |
||
226 | t1 = shift (t1, -48); |
||
227 | |||
228 | t2 = shift (t1, 16); |
||
229 | t1 = _mm_or_si64 (t1, t2); |
||
230 | t2 = shift (t1, 32); |
||
231 | t1 = _mm_or_si64 (t1, t2); |
||
232 | |||
233 | return t1; |
||
234 | } |
||
235 | |||
236 | static force_inline __m64 |
||
237 | invert_colors (__m64 pixel) |
||
238 | { |
||
239 | __m64 x, y, z; |
||
240 | |||
241 | x = y = z = pixel; |
||
242 | |||
243 | x = _mm_and_si64 (x, MC (ffff0000ffff0000)); |
||
244 | y = _mm_and_si64 (y, MC (000000000000ffff)); |
||
245 | z = _mm_and_si64 (z, MC (0000ffff00000000)); |
||
246 | |||
247 | y = shift (y, 32); |
||
248 | z = shift (z, -32); |
||
249 | |||
250 | x = _mm_or_si64 (x, y); |
||
251 | x = _mm_or_si64 (x, z); |
||
252 | |||
253 | return x; |
||
254 | } |
||
255 | |||
256 | static force_inline __m64 |
||
257 | over (__m64 src, |
||
258 | __m64 srca, |
||
259 | __m64 dest) |
||
260 | { |
||
261 | return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca))); |
||
262 | } |
||
263 | |||
264 | static force_inline __m64 |
||
265 | over_rev_non_pre (__m64 src, __m64 dest) |
||
266 | { |
||
267 | __m64 srca = expand_alpha (src); |
||
268 | __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha)); |
||
269 | |||
270 | return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest); |
||
271 | } |
||
272 | |||
273 | static force_inline __m64 |
||
274 | in (__m64 src, __m64 mask) |
||
275 | { |
||
276 | return pix_multiply (src, mask); |
||
277 | } |
||
278 | |||
279 | static force_inline __m64 |
||
280 | in_over_full_src_alpha (__m64 src, __m64 mask, __m64 dest) |
||
281 | { |
||
282 | src = _mm_or_si64 (src, MC (full_alpha)); |
||
283 | |||
284 | return over (in (src, mask), mask, dest); |
||
285 | } |
||
286 | |||
287 | #ifndef _MSC_VER |
||
288 | static force_inline __m64 |
||
289 | in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest) |
||
290 | { |
||
291 | return over (in (src, mask), pix_multiply (srca, mask), dest); |
||
292 | } |
||
293 | |||
294 | #else |
||
295 | |||
296 | #define in_over(src, srca, mask, dest) \ |
||
297 | over (in (src, mask), pix_multiply (srca, mask), dest) |
||
298 | |||
299 | #endif |
||
300 | |||
301 | static force_inline __m64 |
||
302 | load8888 (uint32_t v) |
||
303 | { |
||
304 | return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64 ()); |
||
305 | } |
||
306 | |||
307 | static force_inline __m64 |
||
308 | pack8888 (__m64 lo, __m64 hi) |
||
309 | { |
||
310 | return _mm_packs_pu16 (lo, hi); |
||
311 | } |
||
312 | |||
313 | static force_inline uint32_t |
||
314 | store8888 (__m64 v) |
||
315 | { |
||
316 | return _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ())); |
||
317 | } |
||
318 | |||
319 | /* Expand 16 bits positioned at @pos (0-3) of a mmx register into |
||
320 | * |
||
321 | * 00RR00GG00BB |
||
322 | * |
||
323 | * --- Expanding 565 in the low word --- |
||
324 | * |
||
325 | * m = (m << (32 - 3)) | (m << (16 - 5)) | m; |
||
326 | * m = m & (01f0003f001f); |
||
327 | * m = m * (008404100840); |
||
328 | * m = m >> 8; |
||
329 | * |
||
330 | * Note the trick here - the top word is shifted by another nibble to |
||
331 | * avoid it bumping into the middle word |
||
332 | */ |
||
333 | static force_inline __m64 |
||
334 | expand565 (__m64 pixel, int pos) |
||
335 | { |
||
336 | __m64 p = pixel; |
||
337 | __m64 t1, t2; |
||
338 | |||
339 | /* move pixel to low 16 bit and zero the rest */ |
||
340 | p = shift (shift (p, (3 - pos) * 16), -48); |
||
341 | |||
342 | t1 = shift (p, 36 - 11); |
||
343 | t2 = shift (p, 16 - 5); |
||
344 | |||
345 | p = _mm_or_si64 (t1, p); |
||
346 | p = _mm_or_si64 (t2, p); |
||
347 | p = _mm_and_si64 (p, MC (565_rgb)); |
||
348 | |||
349 | pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier)); |
||
350 | return _mm_srli_pi16 (pixel, 8); |
||
351 | } |
||
352 | |||
353 | static force_inline __m64 |
||
354 | expand8888 (__m64 in, int pos) |
||
355 | { |
||
356 | if (pos == 0) |
||
357 | return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ()); |
||
358 | else |
||
359 | return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ()); |
||
360 | } |
||
361 | |||
362 | static force_inline __m64 |
||
363 | expandx888 (__m64 in, int pos) |
||
364 | { |
||
365 | return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha)); |
||
366 | } |
||
367 | |||
368 | static force_inline __m64 |
||
369 | pack_565 (__m64 pixel, __m64 target, int pos) |
||
370 | { |
||
371 | __m64 p = pixel; |
||
372 | __m64 t = target; |
||
373 | __m64 r, g, b; |
||
374 | |||
375 | r = _mm_and_si64 (p, MC (565_r)); |
||
376 | g = _mm_and_si64 (p, MC (565_g)); |
||
377 | b = _mm_and_si64 (p, MC (565_b)); |
||
378 | |||
379 | r = shift (r, -(32 - 8) + pos * 16); |
||
380 | g = shift (g, -(16 - 3) + pos * 16); |
||
381 | b = shift (b, -(0 + 3) + pos * 16); |
||
382 | |||
383 | if (pos == 0) |
||
384 | t = _mm_and_si64 (t, MC (mask_0)); |
||
385 | else if (pos == 1) |
||
386 | t = _mm_and_si64 (t, MC (mask_1)); |
||
387 | else if (pos == 2) |
||
388 | t = _mm_and_si64 (t, MC (mask_2)); |
||
389 | else if (pos == 3) |
||
390 | t = _mm_and_si64 (t, MC (mask_3)); |
||
391 | |||
392 | p = _mm_or_si64 (r, t); |
||
393 | p = _mm_or_si64 (g, p); |
||
394 | |||
395 | return _mm_or_si64 (b, p); |
||
396 | } |
||
397 | |||
398 | #ifndef _MSC_VER |
||
399 | |||
400 | static force_inline __m64 |
||
401 | pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b) |
||
402 | { |
||
403 | x = pix_multiply (x, a); |
||
404 | y = pix_multiply (y, b); |
||
405 | |||
406 | return pix_add (x, y); |
||
407 | } |
||
408 | |||
409 | #else |
||
410 | |||
411 | #define pix_add_mul(x, a, y, b) \ |
||
412 | ( x = pix_multiply (x, a), \ |
||
413 | y = pix_multiply (y, a), \ |
||
414 | pix_add (x, y) ) |
||
415 | |||
416 | #endif |
||
417 | |||
418 | /* --------------- MMX code patch for fbcompose.c --------------------- */ |
||
419 | |||
420 | static force_inline uint32_t |
||
421 | combine (const uint32_t *src, const uint32_t *mask) |
||
422 | { |
||
423 | uint32_t ssrc = *src; |
||
424 | |||
425 | if (mask) |
||
426 | { |
||
427 | __m64 m = load8888 (*mask); |
||
428 | __m64 s = load8888 (ssrc); |
||
429 | |||
430 | m = expand_alpha (m); |
||
431 | s = pix_multiply (s, m); |
||
432 | |||
433 | ssrc = store8888 (s); |
||
434 | } |
||
435 | |||
436 | return ssrc; |
||
437 | } |
||
438 | |||
439 | static void |
||
440 | mmx_combine_over_u (pixman_implementation_t *imp, |
||
441 | pixman_op_t op, |
||
442 | uint32_t * dest, |
||
443 | const uint32_t * src, |
||
444 | const uint32_t * mask, |
||
445 | int width) |
||
446 | { |
||
447 | const uint32_t *end = dest + width; |
||
448 | |||
449 | while (dest < end) |
||
450 | { |
||
451 | uint32_t ssrc = combine (src, mask); |
||
452 | uint32_t a = ssrc >> 24; |
||
453 | |||
454 | if (a == 0xff) |
||
455 | { |
||
456 | *dest = ssrc; |
||
457 | } |
||
458 | else if (ssrc) |
||
459 | { |
||
460 | __m64 s, sa; |
||
461 | s = load8888 (ssrc); |
||
462 | sa = expand_alpha (s); |
||
463 | *dest = store8888 (over (s, sa, load8888 (*dest))); |
||
464 | } |
||
465 | |||
466 | ++dest; |
||
467 | ++src; |
||
468 | if (mask) |
||
469 | ++mask; |
||
470 | } |
||
471 | _mm_empty (); |
||
472 | } |
||
473 | |||
474 | static void |
||
475 | mmx_combine_over_reverse_u (pixman_implementation_t *imp, |
||
476 | pixman_op_t op, |
||
477 | uint32_t * dest, |
||
478 | const uint32_t * src, |
||
479 | const uint32_t * mask, |
||
480 | int width) |
||
481 | { |
||
482 | const uint32_t *end = dest + width; |
||
483 | |||
484 | while (dest < end) |
||
485 | { |
||
486 | __m64 d, da; |
||
487 | uint32_t s = combine (src, mask); |
||
488 | |||
489 | d = load8888 (*dest); |
||
490 | da = expand_alpha (d); |
||
491 | *dest = store8888 (over (d, da, load8888 (s))); |
||
492 | |||
493 | ++dest; |
||
494 | ++src; |
||
495 | if (mask) |
||
496 | mask++; |
||
497 | } |
||
498 | _mm_empty (); |
||
499 | } |
||
500 | |||
501 | static void |
||
502 | mmx_combine_in_u (pixman_implementation_t *imp, |
||
503 | pixman_op_t op, |
||
504 | uint32_t * dest, |
||
505 | const uint32_t * src, |
||
506 | const uint32_t * mask, |
||
507 | int width) |
||
508 | { |
||
509 | const uint32_t *end = dest + width; |
||
510 | |||
511 | while (dest < end) |
||
512 | { |
||
513 | __m64 x, a; |
||
514 | |||
515 | x = load8888 (combine (src, mask)); |
||
516 | a = load8888 (*dest); |
||
517 | a = expand_alpha (a); |
||
518 | x = pix_multiply (x, a); |
||
519 | |||
520 | *dest = store8888 (x); |
||
521 | |||
522 | ++dest; |
||
523 | ++src; |
||
524 | if (mask) |
||
525 | mask++; |
||
526 | } |
||
527 | _mm_empty (); |
||
528 | } |
||
529 | |||
530 | static void |
||
531 | mmx_combine_in_reverse_u (pixman_implementation_t *imp, |
||
532 | pixman_op_t op, |
||
533 | uint32_t * dest, |
||
534 | const uint32_t * src, |
||
535 | const uint32_t * mask, |
||
536 | int width) |
||
537 | { |
||
538 | const uint32_t *end = dest + width; |
||
539 | |||
540 | while (dest < end) |
||
541 | { |
||
542 | __m64 x, a; |
||
543 | |||
544 | x = load8888 (*dest); |
||
545 | a = load8888 (combine (src, mask)); |
||
546 | a = expand_alpha (a); |
||
547 | x = pix_multiply (x, a); |
||
548 | *dest = store8888 (x); |
||
549 | |||
550 | ++dest; |
||
551 | ++src; |
||
552 | if (mask) |
||
553 | mask++; |
||
554 | } |
||
555 | _mm_empty (); |
||
556 | } |
||
557 | |||
558 | static void |
||
559 | mmx_combine_out_u (pixman_implementation_t *imp, |
||
560 | pixman_op_t op, |
||
561 | uint32_t * dest, |
||
562 | const uint32_t * src, |
||
563 | const uint32_t * mask, |
||
564 | int width) |
||
565 | { |
||
566 | const uint32_t *end = dest + width; |
||
567 | |||
568 | while (dest < end) |
||
569 | { |
||
570 | __m64 x, a; |
||
571 | |||
572 | x = load8888 (combine (src, mask)); |
||
573 | a = load8888 (*dest); |
||
574 | a = expand_alpha (a); |
||
575 | a = negate (a); |
||
576 | x = pix_multiply (x, a); |
||
577 | *dest = store8888 (x); |
||
578 | |||
579 | ++dest; |
||
580 | ++src; |
||
581 | if (mask) |
||
582 | mask++; |
||
583 | } |
||
584 | _mm_empty (); |
||
585 | } |
||
586 | |||
587 | static void |
||
588 | mmx_combine_out_reverse_u (pixman_implementation_t *imp, |
||
589 | pixman_op_t op, |
||
590 | uint32_t * dest, |
||
591 | const uint32_t * src, |
||
592 | const uint32_t * mask, |
||
593 | int width) |
||
594 | { |
||
595 | const uint32_t *end = dest + width; |
||
596 | |||
597 | while (dest < end) |
||
598 | { |
||
599 | __m64 x, a; |
||
600 | |||
601 | x = load8888 (*dest); |
||
602 | a = load8888 (combine (src, mask)); |
||
603 | a = expand_alpha (a); |
||
604 | a = negate (a); |
||
605 | x = pix_multiply (x, a); |
||
606 | |||
607 | *dest = store8888 (x); |
||
608 | |||
609 | ++dest; |
||
610 | ++src; |
||
611 | if (mask) |
||
612 | mask++; |
||
613 | } |
||
614 | _mm_empty (); |
||
615 | } |
||
616 | |||
617 | static void |
||
618 | mmx_combine_atop_u (pixman_implementation_t *imp, |
||
619 | pixman_op_t op, |
||
620 | uint32_t * dest, |
||
621 | const uint32_t * src, |
||
622 | const uint32_t * mask, |
||
623 | int width) |
||
624 | { |
||
625 | const uint32_t *end = dest + width; |
||
626 | |||
627 | while (dest < end) |
||
628 | { |
||
629 | __m64 s, da, d, sia; |
||
630 | |||
631 | s = load8888 (combine (src, mask)); |
||
632 | d = load8888 (*dest); |
||
633 | sia = expand_alpha (s); |
||
634 | sia = negate (sia); |
||
635 | da = expand_alpha (d); |
||
636 | s = pix_add_mul (s, da, d, sia); |
||
637 | *dest = store8888 (s); |
||
638 | |||
639 | ++dest; |
||
640 | ++src; |
||
641 | if (mask) |
||
642 | mask++; |
||
643 | } |
||
644 | _mm_empty (); |
||
645 | } |
||
646 | |||
647 | static void |
||
648 | mmx_combine_atop_reverse_u (pixman_implementation_t *imp, |
||
649 | pixman_op_t op, |
||
650 | uint32_t * dest, |
||
651 | const uint32_t * src, |
||
652 | const uint32_t * mask, |
||
653 | int width) |
||
654 | { |
||
655 | const uint32_t *end; |
||
656 | |||
657 | end = dest + width; |
||
658 | |||
659 | while (dest < end) |
||
660 | { |
||
661 | __m64 s, dia, d, sa; |
||
662 | |||
663 | s = load8888 (combine (src, mask)); |
||
664 | d = load8888 (*dest); |
||
665 | sa = expand_alpha (s); |
||
666 | dia = expand_alpha (d); |
||
667 | dia = negate (dia); |
||
668 | s = pix_add_mul (s, dia, d, sa); |
||
669 | *dest = store8888 (s); |
||
670 | |||
671 | ++dest; |
||
672 | ++src; |
||
673 | if (mask) |
||
674 | mask++; |
||
675 | } |
||
676 | _mm_empty (); |
||
677 | } |
||
678 | |||
679 | static void |
||
680 | mmx_combine_xor_u (pixman_implementation_t *imp, |
||
681 | pixman_op_t op, |
||
682 | uint32_t * dest, |
||
683 | const uint32_t * src, |
||
684 | const uint32_t * mask, |
||
685 | int width) |
||
686 | { |
||
687 | const uint32_t *end = dest + width; |
||
688 | |||
689 | while (dest < end) |
||
690 | { |
||
691 | __m64 s, dia, d, sia; |
||
692 | |||
693 | s = load8888 (combine (src, mask)); |
||
694 | d = load8888 (*dest); |
||
695 | sia = expand_alpha (s); |
||
696 | dia = expand_alpha (d); |
||
697 | sia = negate (sia); |
||
698 | dia = negate (dia); |
||
699 | s = pix_add_mul (s, dia, d, sia); |
||
700 | *dest = store8888 (s); |
||
701 | |||
702 | ++dest; |
||
703 | ++src; |
||
704 | if (mask) |
||
705 | mask++; |
||
706 | } |
||
707 | _mm_empty (); |
||
708 | } |
||
709 | |||
710 | static void |
||
711 | mmx_combine_add_u (pixman_implementation_t *imp, |
||
712 | pixman_op_t op, |
||
713 | uint32_t * dest, |
||
714 | const uint32_t * src, |
||
715 | const uint32_t * mask, |
||
716 | int width) |
||
717 | { |
||
718 | const uint32_t *end = dest + width; |
||
719 | |||
720 | while (dest < end) |
||
721 | { |
||
722 | __m64 s, d; |
||
723 | |||
724 | s = load8888 (combine (src, mask)); |
||
725 | d = load8888 (*dest); |
||
726 | s = pix_add (s, d); |
||
727 | *dest = store8888 (s); |
||
728 | |||
729 | ++dest; |
||
730 | ++src; |
||
731 | if (mask) |
||
732 | mask++; |
||
733 | } |
||
734 | _mm_empty (); |
||
735 | } |
||
736 | |||
737 | static void |
||
738 | mmx_combine_saturate_u (pixman_implementation_t *imp, |
||
739 | pixman_op_t op, |
||
740 | uint32_t * dest, |
||
741 | const uint32_t * src, |
||
742 | const uint32_t * mask, |
||
743 | int width) |
||
744 | { |
||
745 | const uint32_t *end = dest + width; |
||
746 | |||
747 | while (dest < end) |
||
748 | { |
||
749 | uint32_t s = combine (src, mask); |
||
750 | uint32_t d = *dest; |
||
751 | __m64 ms = load8888 (s); |
||
752 | __m64 md = load8888 (d); |
||
753 | uint32_t sa = s >> 24; |
||
754 | uint32_t da = ~d >> 24; |
||
755 | |||
756 | if (sa > da) |
||
757 | { |
||
758 | __m64 msa = load8888 (DIV_UN8 (da, sa) << 24); |
||
759 | msa = expand_alpha (msa); |
||
760 | ms = pix_multiply (ms, msa); |
||
761 | } |
||
762 | |||
763 | md = pix_add (md, ms); |
||
764 | *dest = store8888 (md); |
||
765 | |||
766 | ++src; |
||
767 | ++dest; |
||
768 | if (mask) |
||
769 | mask++; |
||
770 | } |
||
771 | _mm_empty (); |
||
772 | } |
||
773 | |||
774 | static void |
||
775 | mmx_combine_src_ca (pixman_implementation_t *imp, |
||
776 | pixman_op_t op, |
||
777 | uint32_t * dest, |
||
778 | const uint32_t * src, |
||
779 | const uint32_t * mask, |
||
780 | int width) |
||
781 | { |
||
782 | const uint32_t *end = src + width; |
||
783 | |||
784 | while (src < end) |
||
785 | { |
||
786 | __m64 a = load8888 (*mask); |
||
787 | __m64 s = load8888 (*src); |
||
788 | |||
789 | s = pix_multiply (s, a); |
||
790 | *dest = store8888 (s); |
||
791 | |||
792 | ++src; |
||
793 | ++mask; |
||
794 | ++dest; |
||
795 | } |
||
796 | _mm_empty (); |
||
797 | } |
||
798 | |||
799 | static void |
||
800 | mmx_combine_over_ca (pixman_implementation_t *imp, |
||
801 | pixman_op_t op, |
||
802 | uint32_t * dest, |
||
803 | const uint32_t * src, |
||
804 | const uint32_t * mask, |
||
805 | int width) |
||
806 | { |
||
807 | const uint32_t *end = src + width; |
||
808 | |||
809 | while (src < end) |
||
810 | { |
||
811 | __m64 a = load8888 (*mask); |
||
812 | __m64 s = load8888 (*src); |
||
813 | __m64 d = load8888 (*dest); |
||
814 | __m64 sa = expand_alpha (s); |
||
815 | |||
816 | *dest = store8888 (in_over (s, sa, a, d)); |
||
817 | |||
818 | ++src; |
||
819 | ++dest; |
||
820 | ++mask; |
||
821 | } |
||
822 | _mm_empty (); |
||
823 | } |
||
824 | |||
825 | static void |
||
826 | mmx_combine_over_reverse_ca (pixman_implementation_t *imp, |
||
827 | pixman_op_t op, |
||
828 | uint32_t * dest, |
||
829 | const uint32_t * src, |
||
830 | const uint32_t * mask, |
||
831 | int width) |
||
832 | { |
||
833 | const uint32_t *end = src + width; |
||
834 | |||
835 | while (src < end) |
||
836 | { |
||
837 | __m64 a = load8888 (*mask); |
||
838 | __m64 s = load8888 (*src); |
||
839 | __m64 d = load8888 (*dest); |
||
840 | __m64 da = expand_alpha (d); |
||
841 | |||
842 | *dest = store8888 (over (d, da, in (s, a))); |
||
843 | |||
844 | ++src; |
||
845 | ++dest; |
||
846 | ++mask; |
||
847 | } |
||
848 | _mm_empty (); |
||
849 | } |
||
850 | |||
851 | static void |
||
852 | mmx_combine_in_ca (pixman_implementation_t *imp, |
||
853 | pixman_op_t op, |
||
854 | uint32_t * dest, |
||
855 | const uint32_t * src, |
||
856 | const uint32_t * mask, |
||
857 | int width) |
||
858 | { |
||
859 | const uint32_t *end = src + width; |
||
860 | |||
861 | while (src < end) |
||
862 | { |
||
863 | __m64 a = load8888 (*mask); |
||
864 | __m64 s = load8888 (*src); |
||
865 | __m64 d = load8888 (*dest); |
||
866 | __m64 da = expand_alpha (d); |
||
867 | |||
868 | s = pix_multiply (s, a); |
||
869 | s = pix_multiply (s, da); |
||
870 | *dest = store8888 (s); |
||
871 | |||
872 | ++src; |
||
873 | ++dest; |
||
874 | ++mask; |
||
875 | } |
||
876 | _mm_empty (); |
||
877 | } |
||
878 | |||
879 | static void |
||
880 | mmx_combine_in_reverse_ca (pixman_implementation_t *imp, |
||
881 | pixman_op_t op, |
||
882 | uint32_t * dest, |
||
883 | const uint32_t * src, |
||
884 | const uint32_t * mask, |
||
885 | int width) |
||
886 | { |
||
887 | const uint32_t *end = src + width; |
||
888 | |||
889 | while (src < end) |
||
890 | { |
||
891 | __m64 a = load8888 (*mask); |
||
892 | __m64 s = load8888 (*src); |
||
893 | __m64 d = load8888 (*dest); |
||
894 | __m64 sa = expand_alpha (s); |
||
895 | |||
896 | a = pix_multiply (a, sa); |
||
897 | d = pix_multiply (d, a); |
||
898 | *dest = store8888 (d); |
||
899 | |||
900 | ++src; |
||
901 | ++dest; |
||
902 | ++mask; |
||
903 | } |
||
904 | _mm_empty (); |
||
905 | } |
||
906 | |||
907 | static void |
||
908 | mmx_combine_out_ca (pixman_implementation_t *imp, |
||
909 | pixman_op_t op, |
||
910 | uint32_t * dest, |
||
911 | const uint32_t * src, |
||
912 | const uint32_t * mask, |
||
913 | int width) |
||
914 | { |
||
915 | const uint32_t *end = src + width; |
||
916 | |||
917 | while (src < end) |
||
918 | { |
||
919 | __m64 a = load8888 (*mask); |
||
920 | __m64 s = load8888 (*src); |
||
921 | __m64 d = load8888 (*dest); |
||
922 | __m64 da = expand_alpha (d); |
||
923 | |||
924 | da = negate (da); |
||
925 | s = pix_multiply (s, a); |
||
926 | s = pix_multiply (s, da); |
||
927 | *dest = store8888 (s); |
||
928 | |||
929 | ++src; |
||
930 | ++dest; |
||
931 | ++mask; |
||
932 | } |
||
933 | _mm_empty (); |
||
934 | } |
||
935 | |||
936 | static void |
||
937 | mmx_combine_out_reverse_ca (pixman_implementation_t *imp, |
||
938 | pixman_op_t op, |
||
939 | uint32_t * dest, |
||
940 | const uint32_t * src, |
||
941 | const uint32_t * mask, |
||
942 | int width) |
||
943 | { |
||
944 | const uint32_t *end = src + width; |
||
945 | |||
946 | while (src < end) |
||
947 | { |
||
948 | __m64 a = load8888 (*mask); |
||
949 | __m64 s = load8888 (*src); |
||
950 | __m64 d = load8888 (*dest); |
||
951 | __m64 sa = expand_alpha (s); |
||
952 | |||
953 | a = pix_multiply (a, sa); |
||
954 | a = negate (a); |
||
955 | d = pix_multiply (d, a); |
||
956 | *dest = store8888 (d); |
||
957 | |||
958 | ++src; |
||
959 | ++dest; |
||
960 | ++mask; |
||
961 | } |
||
962 | _mm_empty (); |
||
963 | } |
||
964 | |||
965 | static void |
||
966 | mmx_combine_atop_ca (pixman_implementation_t *imp, |
||
967 | pixman_op_t op, |
||
968 | uint32_t * dest, |
||
969 | const uint32_t * src, |
||
970 | const uint32_t * mask, |
||
971 | int width) |
||
972 | { |
||
973 | const uint32_t *end = src + width; |
||
974 | |||
975 | while (src < end) |
||
976 | { |
||
977 | __m64 a = load8888 (*mask); |
||
978 | __m64 s = load8888 (*src); |
||
979 | __m64 d = load8888 (*dest); |
||
980 | __m64 da = expand_alpha (d); |
||
981 | __m64 sa = expand_alpha (s); |
||
982 | |||
983 | s = pix_multiply (s, a); |
||
984 | a = pix_multiply (a, sa); |
||
985 | a = negate (a); |
||
986 | d = pix_add_mul (d, a, s, da); |
||
987 | *dest = store8888 (d); |
||
988 | |||
989 | ++src; |
||
990 | ++dest; |
||
991 | ++mask; |
||
992 | } |
||
993 | _mm_empty (); |
||
994 | } |
||
995 | |||
996 | static void |
||
997 | mmx_combine_atop_reverse_ca (pixman_implementation_t *imp, |
||
998 | pixman_op_t op, |
||
999 | uint32_t * dest, |
||
1000 | const uint32_t * src, |
||
1001 | const uint32_t * mask, |
||
1002 | int width) |
||
1003 | { |
||
1004 | const uint32_t *end = src + width; |
||
1005 | |||
1006 | while (src < end) |
||
1007 | { |
||
1008 | __m64 a = load8888 (*mask); |
||
1009 | __m64 s = load8888 (*src); |
||
1010 | __m64 d = load8888 (*dest); |
||
1011 | __m64 da = expand_alpha (d); |
||
1012 | __m64 sa = expand_alpha (s); |
||
1013 | |||
1014 | s = pix_multiply (s, a); |
||
1015 | a = pix_multiply (a, sa); |
||
1016 | da = negate (da); |
||
1017 | d = pix_add_mul (d, a, s, da); |
||
1018 | *dest = store8888 (d); |
||
1019 | |||
1020 | ++src; |
||
1021 | ++dest; |
||
1022 | ++mask; |
||
1023 | } |
||
1024 | _mm_empty (); |
||
1025 | } |
||
1026 | |||
1027 | static void |
||
1028 | mmx_combine_xor_ca (pixman_implementation_t *imp, |
||
1029 | pixman_op_t op, |
||
1030 | uint32_t * dest, |
||
1031 | const uint32_t * src, |
||
1032 | const uint32_t * mask, |
||
1033 | int width) |
||
1034 | { |
||
1035 | const uint32_t *end = src + width; |
||
1036 | |||
1037 | while (src < end) |
||
1038 | { |
||
1039 | __m64 a = load8888 (*mask); |
||
1040 | __m64 s = load8888 (*src); |
||
1041 | __m64 d = load8888 (*dest); |
||
1042 | __m64 da = expand_alpha (d); |
||
1043 | __m64 sa = expand_alpha (s); |
||
1044 | |||
1045 | s = pix_multiply (s, a); |
||
1046 | a = pix_multiply (a, sa); |
||
1047 | da = negate (da); |
||
1048 | a = negate (a); |
||
1049 | d = pix_add_mul (d, a, s, da); |
||
1050 | *dest = store8888 (d); |
||
1051 | |||
1052 | ++src; |
||
1053 | ++dest; |
||
1054 | ++mask; |
||
1055 | } |
||
1056 | _mm_empty (); |
||
1057 | } |
||
1058 | |||
1059 | static void |
||
1060 | mmx_combine_add_ca (pixman_implementation_t *imp, |
||
1061 | pixman_op_t op, |
||
1062 | uint32_t * dest, |
||
1063 | const uint32_t * src, |
||
1064 | const uint32_t * mask, |
||
1065 | int width) |
||
1066 | { |
||
1067 | const uint32_t *end = src + width; |
||
1068 | |||
1069 | while (src < end) |
||
1070 | { |
||
1071 | __m64 a = load8888 (*mask); |
||
1072 | __m64 s = load8888 (*src); |
||
1073 | __m64 d = load8888 (*dest); |
||
1074 | |||
1075 | s = pix_multiply (s, a); |
||
1076 | d = pix_add (s, d); |
||
1077 | *dest = store8888 (d); |
||
1078 | |||
1079 | ++src; |
||
1080 | ++dest; |
||
1081 | ++mask; |
||
1082 | } |
||
1083 | _mm_empty (); |
||
1084 | } |
||
1085 | |||
1086 | /* ------------- MMX code paths called from fbpict.c -------------------- */ |
||
1087 | |||
/*
 * OVER composite of a solid source onto an a8r8g8b8 destination.
 * The source image is reduced to a single solid color; the mask image
 * is not referenced by this path.
 */
static void
mmx_composite_over_n_8888 (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           pixman_image_t *         src_image,
                           pixman_image_t *         mask_image,
                           pixman_image_t *         dst_image,
                           int32_t                  src_x,
                           int32_t                  src_y,
                           int32_t                  mask_x,
                           int32_t                  mask_y,
                           int32_t                  dest_x,
                           int32_t                  dest_y,
                           int32_t                  width,
                           int32_t                  height)
{
    uint32_t src;
    uint32_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    /* A fully transparent solid leaves the destination unchanged. */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    /* Expand the solid color and its alpha once, outside the loops. */
    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	CHECKPOINT ();

	/* Head: single pixels until dst is 8-byte aligned. */
	while (w && (unsigned long)dst & 7)
	{
	    *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));

	    w--;
	    dst++;
	}

	/* Body: two pixels at a time through one aligned __m64 load/store. */
	while (w >= 2)
	{
	    __m64 vdest;
	    __m64 dest0, dest1;

	    vdest = *(__m64 *)dst;

	    dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
	    dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));

	    *(__m64 *)dst = pack8888 (dest0, dest1);

	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	/* Tail: at most one remaining pixel after the 2-wide loop. */
	while (w)
	{
	    *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));

	    w--;
	    dst++;
	}
    }

    _mm_empty ();
}
||
1166 | |||
/*
 * OVER composite of a solid source onto an r5g6b5 destination.
 * Each destination pixel is expanded from 565 to 8888, blended, and
 * packed back; the mask image is not referenced by this path.
 */
static void
mmx_composite_over_n_0565 (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           pixman_image_t *         src_image,
                           pixman_image_t *         mask_image,
                           pixman_image_t *         dst_image,
                           int32_t                  src_x,
                           int32_t                  src_y,
                           int32_t                  mask_x,
                           int32_t                  mask_y,
                           int32_t                  dest_x,
                           int32_t                  dest_y,
                           int32_t                  width,
                           int32_t                  height)
{
    uint32_t src;
    uint16_t *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    /* A fully transparent solid leaves the destination unchanged. */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	CHECKPOINT ();

	/* Head: single 16-bit pixels until dst is 8-byte aligned. */
	while (w && (unsigned long)dst & 7)
	{
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	}

	/* Body: four 565 pixels per aligned __m64; each lane is expanded,
	 * blended and packed back into the same register in sequence. */
	while (w >= 4)
	{
	    __m64 vdest;

	    vdest = *(__m64 *)dst;

	    vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0);
	    vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1);
	    vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2);
	    vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3);

	    *(__m64 *)dst = vdest;

	    dst += 4;
	    w -= 4;
	}

	CHECKPOINT ();

	/* Tail: remaining pixels (fewer than four). */
	while (w)
	{
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	}
    }

    _mm_empty ();
}
||
1254 | |||
1255 | static void |
||
1256 | mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp, |
||
1257 | pixman_op_t op, |
||
1258 | pixman_image_t * src_image, |
||
1259 | pixman_image_t * mask_image, |
||
1260 | pixman_image_t * dst_image, |
||
1261 | int32_t src_x, |
||
1262 | int32_t src_y, |
||
1263 | int32_t mask_x, |
||
1264 | int32_t mask_y, |
||
1265 | int32_t dest_x, |
||
1266 | int32_t dest_y, |
||
1267 | int32_t width, |
||
1268 | int32_t height) |
||
1269 | { |
||
1270 | uint32_t src, srca; |
||
1271 | uint32_t *dst_line; |
||
1272 | uint32_t *mask_line; |
||
1273 | int dst_stride, mask_stride; |
||
1274 | __m64 vsrc, vsrca; |
||
1275 | |||
1276 | CHECKPOINT (); |
||
1277 | |||
1278 | src = _pixman_image_get_solid (src_image, dst_image->bits.format); |
||
1279 | |||
1280 | srca = src >> 24; |
||
1281 | if (src == 0) |
||
1282 | return; |
||
1283 | |||
1284 | PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
||
1285 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
||
1286 | |||
1287 | vsrc = load8888 (src); |
||
1288 | vsrca = expand_alpha (vsrc); |
||
1289 | |||
1290 | while (height--) |
||
1291 | { |
||
1292 | int twidth = width; |
||
1293 | uint32_t *p = (uint32_t *)mask_line; |
||
1294 | uint32_t *q = (uint32_t *)dst_line; |
||
1295 | |||
1296 | while (twidth && (unsigned long)q & 7) |
||
1297 | { |
||
1298 | uint32_t m = *(uint32_t *)p; |
||
1299 | |||
1300 | if (m) |
||
1301 | { |
||
1302 | __m64 vdest = load8888 (*q); |
||
1303 | vdest = in_over (vsrc, vsrca, load8888 (m), vdest); |
||
1304 | *q = store8888 (vdest); |
||
1305 | } |
||
1306 | |||
1307 | twidth--; |
||
1308 | p++; |
||
1309 | q++; |
||
1310 | } |
||
1311 | |||
1312 | while (twidth >= 2) |
||
1313 | { |
||
1314 | uint32_t m0, m1; |
||
1315 | m0 = *p; |
||
1316 | m1 = *(p + 1); |
||
1317 | |||
1318 | if (m0 | m1) |
||
1319 | { |
||
1320 | __m64 dest0, dest1; |
||
1321 | __m64 vdest = *(__m64 *)q; |
||
1322 | |||
1323 | dest0 = in_over (vsrc, vsrca, load8888 (m0), |
||
1324 | expand8888 (vdest, 0)); |
||
1325 | dest1 = in_over (vsrc, vsrca, load8888 (m1), |
||
1326 | expand8888 (vdest, 1)); |
||
1327 | |||
1328 | *(__m64 *)q = pack8888 (dest0, dest1); |
||
1329 | } |
||
1330 | |||
1331 | p += 2; |
||
1332 | q += 2; |
||
1333 | twidth -= 2; |
||
1334 | } |
||
1335 | |||
1336 | while (twidth) |
||
1337 | { |
||
1338 | uint32_t m = *(uint32_t *)p; |
||
1339 | |||
1340 | if (m) |
||
1341 | { |
||
1342 | __m64 vdest = load8888 (*q); |
||
1343 | vdest = in_over (vsrc, vsrca, load8888 (m), vdest); |
||
1344 | *q = store8888 (vdest); |
||
1345 | } |
||
1346 | |||
1347 | twidth--; |
||
1348 | p++; |
||
1349 | q++; |
||
1350 | } |
||
1351 | |||
1352 | dst_line += dst_stride; |
||
1353 | mask_line += mask_stride; |
||
1354 | } |
||
1355 | |||
1356 | _mm_empty (); |
||
1357 | } |
||
1358 | |||
1359 | static void |
||
1360 | mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp, |
||
1361 | pixman_op_t op, |
||
1362 | pixman_image_t * src_image, |
||
1363 | pixman_image_t * mask_image, |
||
1364 | pixman_image_t * dst_image, |
||
1365 | int32_t src_x, |
||
1366 | int32_t src_y, |
||
1367 | int32_t mask_x, |
||
1368 | int32_t mask_y, |
||
1369 | int32_t dest_x, |
||
1370 | int32_t dest_y, |
||
1371 | int32_t width, |
||
1372 | int32_t height) |
||
1373 | { |
||
1374 | uint32_t *dst_line, *dst; |
||
1375 | uint32_t *src_line, *src; |
||
1376 | uint32_t mask; |
||
1377 | __m64 vmask; |
||
1378 | int dst_stride, src_stride; |
||
1379 | int32_t w; |
||
1380 | __m64 srca; |
||
1381 | |||
1382 | CHECKPOINT (); |
||
1383 | |||
1384 | PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
||
1385 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
||
1386 | |||
1387 | mask = _pixman_image_get_solid (mask_image, dst_image->bits.format); |
||
1388 | mask &= 0xff000000; |
||
1389 | mask = mask | mask >> 8 | mask >> 16 | mask >> 24; |
||
1390 | vmask = load8888 (mask); |
||
1391 | srca = MC (4x00ff); |
||
1392 | |||
1393 | while (height--) |
||
1394 | { |
||
1395 | dst = dst_line; |
||
1396 | dst_line += dst_stride; |
||
1397 | src = src_line; |
||
1398 | src_line += src_stride; |
||
1399 | w = width; |
||
1400 | |||
1401 | while (w && (unsigned long)dst & 7) |
||
1402 | { |
||
1403 | __m64 s = load8888 (*src); |
||
1404 | __m64 d = load8888 (*dst); |
||
1405 | |||
1406 | *dst = store8888 (in_over (s, expand_alpha (s), vmask, d)); |
||
1407 | |||
1408 | w--; |
||
1409 | dst++; |
||
1410 | src++; |
||
1411 | } |
||
1412 | |||
1413 | while (w >= 2) |
||
1414 | { |
||
1415 | __m64 vs = *(__m64 *)src; |
||
1416 | __m64 vd = *(__m64 *)dst; |
||
1417 | __m64 vsrc0 = expand8888 (vs, 0); |
||
1418 | __m64 vsrc1 = expand8888 (vs, 1); |
||
1419 | |||
1420 | *(__m64 *)dst = pack8888 ( |
||
1421 | in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)), |
||
1422 | in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1))); |
||
1423 | |||
1424 | w -= 2; |
||
1425 | dst += 2; |
||
1426 | src += 2; |
||
1427 | } |
||
1428 | |||
1429 | while (w) |
||
1430 | { |
||
1431 | __m64 s = load8888 (*src); |
||
1432 | __m64 d = load8888 (*dst); |
||
1433 | |||
1434 | *dst = store8888 (in_over (s, expand_alpha (s), vmask, d)); |
||
1435 | |||
1436 | w--; |
||
1437 | dst++; |
||
1438 | src++; |
||
1439 | } |
||
1440 | } |
||
1441 | |||
1442 | _mm_empty (); |
||
1443 | } |
||
1444 | |||
/*
 * OVER composite of an x8r8g8b8 source (alpha channel ignored, treated
 * as opaque), attenuated by a solid mask, onto an a8r8g8b8 destination.
 * Only the mask's alpha is used; it is replicated into all channels.
 */
static void
mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                pixman_op_t              op,
                                pixman_image_t *         src_image,
                                pixman_image_t *         mask_image,
                                pixman_image_t *         dst_image,
                                int32_t                  src_x,
                                int32_t                  src_y,
                                int32_t                  mask_x,
                                int32_t                  mask_y,
                                int32_t                  dest_x,
                                int32_t                  dest_y,
                                int32_t                  width,
                                int32_t                  height)
{
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;
    __m64 srca;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);

    /* Keep only the mask alpha and replicate it into every channel. */
    mask &= 0xff000000;
    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
    vmask = load8888 (mask);
    /* Constant 0xff alpha used for the (opaque) x888 source pixels. */
    srca = MC (4x00ff);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	/* Head: single pixels (forced opaque) until dst is 8-byte aligned. */
	while (w && (unsigned long)dst & 7)
	{
	    __m64 s = load8888 (*src | 0xff000000);
	    __m64 d = load8888 (*dst);

	    *dst = store8888 (in_over (s, srca, vmask, d));

	    w--;
	    dst++;
	    src++;
	}

	/* Body: 16 pixels per iteration, loaded/stored as eight __m64s;
	 * expandx888 forces the alpha lane to 0xff while expanding. */
	while (w >= 16)
	{
	    __m64 vd0 = *(__m64 *)(dst + 0);
	    __m64 vd1 = *(__m64 *)(dst + 2);
	    __m64 vd2 = *(__m64 *)(dst + 4);
	    __m64 vd3 = *(__m64 *)(dst + 6);
	    __m64 vd4 = *(__m64 *)(dst + 8);
	    __m64 vd5 = *(__m64 *)(dst + 10);
	    __m64 vd6 = *(__m64 *)(dst + 12);
	    __m64 vd7 = *(__m64 *)(dst + 14);

	    __m64 vs0 = *(__m64 *)(src + 0);
	    __m64 vs1 = *(__m64 *)(src + 2);
	    __m64 vs2 = *(__m64 *)(src + 4);
	    __m64 vs3 = *(__m64 *)(src + 6);
	    __m64 vs4 = *(__m64 *)(src + 8);
	    __m64 vs5 = *(__m64 *)(src + 10);
	    __m64 vs6 = *(__m64 *)(src + 12);
	    __m64 vs7 = *(__m64 *)(src + 14);

	    vd0 = pack8888 (
	        in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
	        in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));

	    vd1 = pack8888 (
	        in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
	        in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));

	    vd2 = pack8888 (
	        in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
	        in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));

	    vd3 = pack8888 (
	        in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
	        in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));

	    vd4 = pack8888 (
	        in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
	        in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));

	    vd5 = pack8888 (
	        in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
	        in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));

	    vd6 = pack8888 (
	        in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
	        in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));

	    vd7 = pack8888 (
	        in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
	        in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));

	    *(__m64 *)(dst + 0) = vd0;
	    *(__m64 *)(dst + 2) = vd1;
	    *(__m64 *)(dst + 4) = vd2;
	    *(__m64 *)(dst + 6) = vd3;
	    *(__m64 *)(dst + 8) = vd4;
	    *(__m64 *)(dst + 10) = vd5;
	    *(__m64 *)(dst + 12) = vd6;
	    *(__m64 *)(dst + 14) = vd7;

	    w -= 16;
	    dst += 16;
	    src += 16;
	}

	/* Tail: remaining pixels (fewer than 16), one at a time. */
	while (w)
	{
	    __m64 s = load8888 (*src | 0xff000000);
	    __m64 d = load8888 (*dst);

	    *dst = store8888 (in_over (s, srca, vmask, d));

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}
||
1580 | |||
1581 | static void |
||
1582 | mmx_composite_over_8888_8888 (pixman_implementation_t *imp, |
||
1583 | pixman_op_t op, |
||
1584 | pixman_image_t * src_image, |
||
1585 | pixman_image_t * mask_image, |
||
1586 | pixman_image_t * dst_image, |
||
1587 | int32_t src_x, |
||
1588 | int32_t src_y, |
||
1589 | int32_t mask_x, |
||
1590 | int32_t mask_y, |
||
1591 | int32_t dest_x, |
||
1592 | int32_t dest_y, |
||
1593 | int32_t width, |
||
1594 | int32_t height) |
||
1595 | { |
||
1596 | uint32_t *dst_line, *dst; |
||
1597 | uint32_t *src_line, *src; |
||
1598 | uint32_t s; |
||
1599 | int dst_stride, src_stride; |
||
1600 | uint8_t a; |
||
1601 | int32_t w; |
||
1602 | |||
1603 | CHECKPOINT (); |
||
1604 | |||
1605 | PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
||
1606 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
||
1607 | |||
1608 | while (height--) |
||
1609 | { |
||
1610 | dst = dst_line; |
||
1611 | dst_line += dst_stride; |
||
1612 | src = src_line; |
||
1613 | src_line += src_stride; |
||
1614 | w = width; |
||
1615 | |||
1616 | while (w--) |
||
1617 | { |
||
1618 | s = *src++; |
||
1619 | a = s >> 24; |
||
1620 | |||
1621 | if (a == 0xff) |
||
1622 | { |
||
1623 | *dst = s; |
||
1624 | } |
||
1625 | else if (s) |
||
1626 | { |
||
1627 | __m64 ms, sa; |
||
1628 | ms = load8888 (s); |
||
1629 | sa = expand_alpha (ms); |
||
1630 | *dst = store8888 (over (ms, sa, load8888 (*dst))); |
||
1631 | } |
||
1632 | |||
1633 | dst++; |
||
1634 | } |
||
1635 | } |
||
1636 | _mm_empty (); |
||
1637 | } |
||
1638 | |||
/*
 * OVER composite of an a8r8g8b8 source onto an r5g6b5 destination.
 * Destination pixels are expanded from 565, blended, then packed back.
 */
static void
mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
                              pixman_op_t              op,
                              pixman_image_t *         src_image,
                              pixman_image_t *         mask_image,
                              pixman_image_t *         dst_image,
                              int32_t                  src_x,
                              int32_t                  src_y,
                              int32_t                  mask_x,
                              int32_t                  mask_y,
                              int32_t                  dest_x,
                              int32_t                  dest_y,
                              int32_t                  width,
                              int32_t                  height)
{
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	CHECKPOINT ();

	/* Head: single 16-bit pixels until dst is 8-byte aligned. */
	while (w && (unsigned long)dst & 7)
	{
	    __m64 vsrc = load8888 (*src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (
	        over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}

	CHECKPOINT ();

	/* Body: four pixels per aligned __m64; each 565 lane is expanded,
	 * blended against its own source pixel, and packed back in turn. */
	while (w >= 4)
	{
	    __m64 vsrc0, vsrc1, vsrc2, vsrc3;
	    __m64 vdest;

	    vsrc0 = load8888 (*(src + 0));
	    vsrc1 = load8888 (*(src + 1));
	    vsrc2 = load8888 (*(src + 2));
	    vsrc3 = load8888 (*(src + 3));

	    vdest = *(__m64 *)dst;

	    vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0);
	    vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1);
	    vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2);
	    vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3);

	    *(__m64 *)dst = vdest;

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	CHECKPOINT ();

	/* Tail: remaining pixels (fewer than four). */
	while (w)
	{
	    __m64 vsrc = load8888 (*src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}
||
1741 | |||
/*
 * OVER composite of a solid source through an a8 mask onto an a8r8g8b8
 * destination.  Zero mask bytes are skipped; a pair of 0xff mask bytes
 * with an opaque source takes a fast store of the solid color.
 */
static void
mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             pixman_image_t *         src_image,
                             pixman_image_t *         mask_image,
                             pixman_image_t *         dst_image,
                             int32_t                  src_x,
                             int32_t                  src_y,
                             int32_t                  mask_x,
                             int32_t                  mask_y,
                             int32_t                  dest_x,
                             int32_t                  dest_y,
                             int32_t                  width,
                             int32_t                  height)
{
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca;
    uint64_t srcsrc;   /* solid color replicated into both halves of 64 bits */

    CHECKPOINT ();

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    srca = src >> 24;
    /* A fully transparent solid leaves the destination unchanged. */
    if (src == 0)
	return;

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	CHECKPOINT ();

	/* Head: single pixels until dst is 8-byte aligned. */
	while (w && (unsigned long)dst & 7)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		__m64 vdest = in_over (vsrc, vsrca,
		                       expand_alpha_rev (to_m64 (m)),
		                       load8888 (*dst));

		*dst = store8888 (vdest);
	    }

	    w--;
	    mask++;
	    dst++;
	}

	CHECKPOINT ();

	/* Body: two pixels at a time.  Both masks 0xff with an opaque
	 * source stores the replicated solid directly; both masks zero
	 * skips the pair entirely. */
	while (w >= 2)
	{
	    uint64_t m0, m1;

	    m0 = *mask;
	    m1 = *(mask + 1);

	    if (srca == 0xff && (m0 & m1) == 0xff)
	    {
		*(uint64_t *)dst = srcsrc;
	    }
	    else if (m0 | m1)
	    {
		__m64 vdest;
		__m64 dest0, dest1;

		vdest = *(__m64 *)dst;

		dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
		                 expand8888 (vdest, 0));
		dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
		                 expand8888 (vdest, 1));

		*(__m64 *)dst = pack8888 (dest0, dest1);
	    }

	    mask += 2;
	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	/* Tail: at most one remaining pixel. */
	while (w)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		__m64 vdest = load8888 (*dst);

		vdest = in_over (
		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
		*dst = store8888 (vdest);
	    }

	    w--;
	    mask++;
	    dst++;
	}
    }

    _mm_empty ();
}
||
1865 | |||
/*
 * Fill a rectangle of 8, 16 or 32 bpp pixels with a constant value
 * using 64-byte MMX stores in the inner loop.
 *
 * bits/stride describe the destination buffer; stride is in uint32_t
 * units on entry and is converted to bytes below.  xor is the fill
 * value; for 8 and 16 bpp only its low byte/halfword is used and is
 * replicated to 32 bits.  Returns FALSE (leaving the buffer untouched)
 * for any other bpp, TRUE after filling.
 */
pixman_bool_t
pixman_fill_mmx (uint32_t *bits,
                 int       stride,
                 int       bpp,
                 int       x,
                 int       y,
                 int       width,
                 int       height,
                 uint32_t xor)
{
    uint64_t fill;
    __m64 vfill;
    uint32_t byte_width;
    uint8_t *byte_line;

#ifdef __GNUC__
    /* Extra MMX registers holding copies of the fill value (see asm). */
    __m64 v1, v2, v3, v4, v5, v6, v7;
#endif

    if (bpp != 16 && bpp != 32 && bpp != 8)
	return FALSE;

    if (bpp == 8)
    {
	/* Convert stride from uint32_t units to uint8_t units, locate
	 * the first pixel, then convert stride to bytes (no-op here). */
	stride = stride * (int) sizeof (uint32_t) / 1;
	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
	byte_width = width;
	stride *= 1;
	/* Replicate the fill byte into all four bytes of xor. */
	xor = (xor & 0xff) * 0x01010101;
    }
    else if (bpp == 16)
    {
	/* Convert stride from uint32_t units to uint16_t units, locate
	 * the first pixel, then convert stride to bytes. */
	stride = stride * (int) sizeof (uint32_t) / 2;
	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
	byte_width = 2 * width;
	stride *= 2;
	/* Replicate the fill halfword into both halves of xor. */
	xor = (xor & 0xffff) * 0x00010001;
    }
    else
    {
	/* 32 bpp: stride is already in uint32_t units. */
	stride = stride * (int) sizeof (uint32_t) / 4;
	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
	byte_width = 4 * width;
	stride *= 4;
    }

    /* 64-bit fill value: xor in both 32-bit halves. */
    fill = ((uint64_t)xor << 32) | xor;
    vfill = to_m64 (fill);

#ifdef __GNUC__
    /* Copy the fill value into seven more MMX registers so the 64-byte
     * loop below can issue eight back-to-back movq stores. */
    __asm__ (
        "movq %7, %0\n"
        "movq %7, %1\n"
        "movq %7, %2\n"
        "movq %7, %3\n"
        "movq %7, %4\n"
        "movq %7, %5\n"
        "movq %7, %6\n"
        : "=&y" (v1), "=&y" (v2), "=&y" (v3),
          "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
        : "y" (vfill));
#endif

    while (height--)
    {
	int w;
	uint8_t *d = byte_line;

	byte_line += stride;
	w = byte_width;

	/* Alignment head: byte, halfword and word stores until d is
	 * 8-byte aligned (or w runs out). */
	while (w >= 1 && ((unsigned long)d & 1))
	{
	    *(uint8_t *)d = (xor & 0xff);
	    w--;
	    d++;
	}

	while (w >= 2 && ((unsigned long)d & 3))
	{
	    *(uint16_t *)d = xor;
	    w -= 2;
	    d += 2;
	}

	while (w >= 4 && ((unsigned long)d & 7))
	{
	    *(uint32_t *)d = xor;

	    w -= 4;
	    d += 4;
	}

	/* Body: 64 bytes per iteration via eight 8-byte MMX stores. */
	while (w >= 64)
	{
#ifdef __GNUC__
	    __asm__ (
	        "movq %1, (%0)\n"
	        "movq %2, 8(%0)\n"
	        "movq %3, 16(%0)\n"
	        "movq %4, 24(%0)\n"
	        "movq %5, 32(%0)\n"
	        "movq %6, 40(%0)\n"
	        "movq %7, 48(%0)\n"
	        "movq %8, 56(%0)\n"
	        :
	        : "r" (d),
	          "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
	          "y" (v4), "y" (v5), "y" (v6), "y" (v7)
	        : "memory");
#else
	    *(__m64*) (d + 0) = vfill;
	    *(__m64*) (d + 8) = vfill;
	    *(__m64*) (d + 16) = vfill;
	    *(__m64*) (d + 24) = vfill;
	    *(__m64*) (d + 32) = vfill;
	    *(__m64*) (d + 40) = vfill;
	    *(__m64*) (d + 48) = vfill;
	    *(__m64*) (d + 56) = vfill;
#endif
	    w -= 64;
	    d += 64;
	}

	/* Tail: mirror of the head, down to single bytes. */
	while (w >= 4)
	{
	    *(uint32_t *)d = xor;

	    w -= 4;
	    d += 4;
	}
	while (w >= 2)
	{
	    *(uint16_t *)d = xor;
	    w -= 2;
	    d += 2;
	}
	while (w >= 1)
	{
	    *(uint8_t *)d = (xor & 0xff);
	    w--;
	    d++;
	}

    }

    _mm_empty ();
    return TRUE;
}
||
2015 | |||
2016 | static void |
||
2017 | mmx_composite_src_n_8_8888 (pixman_implementation_t *imp, |
||
2018 | pixman_op_t op, |
||
2019 | pixman_image_t * src_image, |
||
2020 | pixman_image_t * mask_image, |
||
2021 | pixman_image_t * dst_image, |
||
2022 | int32_t src_x, |
||
2023 | int32_t src_y, |
||
2024 | int32_t mask_x, |
||
2025 | int32_t mask_y, |
||
2026 | int32_t dest_x, |
||
2027 | int32_t dest_y, |
||
2028 | int32_t width, |
||
2029 | int32_t height) |
||
2030 | { |
||
2031 | uint32_t src, srca; |
||
2032 | uint32_t *dst_line, *dst; |
||
2033 | uint8_t *mask_line, *mask; |
||
2034 | int dst_stride, mask_stride; |
||
2035 | int32_t w; |
||
2036 | __m64 vsrc, vsrca; |
||
2037 | uint64_t srcsrc; |
||
2038 | |||
2039 | CHECKPOINT (); |
||
2040 | |||
2041 | src = _pixman_image_get_solid (src_image, dst_image->bits.format); |
||
2042 | |||
2043 | srca = src >> 24; |
||
2044 | if (src == 0) |
||
2045 | { |
||
2046 | pixman_fill_mmx (dst_image->bits.bits, dst_image->bits.rowstride, |
||
2047 | PIXMAN_FORMAT_BPP (dst_image->bits.format), |
||
2048 | dest_x, dest_y, width, height, 0); |
||
2049 | return; |
||
2050 | } |
||
2051 | |||
2052 | srcsrc = (uint64_t)src << 32 | src; |
||
2053 | |||
2054 | PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
||
2055 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
||
2056 | |||
2057 | vsrc = load8888 (src); |
||
2058 | vsrca = expand_alpha (vsrc); |
||
2059 | |||
2060 | while (height--) |
||
2061 | { |
||
2062 | dst = dst_line; |
||
2063 | dst_line += dst_stride; |
||
2064 | mask = mask_line; |
||
2065 | mask_line += mask_stride; |
||
2066 | w = width; |
||
2067 | |||
2068 | CHECKPOINT (); |
||
2069 | |||
2070 | while (w && (unsigned long)dst & 7) |
||
2071 | { |
||
2072 | uint64_t m = *mask; |
||
2073 | |||
2074 | if (m) |
||
2075 | { |
||
2076 | __m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); |
||
2077 | |||
2078 | *dst = store8888 (vdest); |
||
2079 | } |
||
2080 | else |
||
2081 | { |
||
2082 | *dst = 0; |
||
2083 | } |
||
2084 | |||
2085 | w--; |
||
2086 | mask++; |
||
2087 | dst++; |
||
2088 | } |
||
2089 | |||
2090 | CHECKPOINT (); |
||
2091 | |||
2092 | while (w >= 2) |
||
2093 | { |
||
2094 | uint64_t m0, m1; |
||
2095 | m0 = *mask; |
||
2096 | m1 = *(mask + 1); |
||
2097 | |||
2098 | if (srca == 0xff && (m0 & m1) == 0xff) |
||
2099 | { |
||
2100 | *(uint64_t *)dst = srcsrc; |
||
2101 | } |
||
2102 | else if (m0 | m1) |
||
2103 | { |
||
2104 | __m64 vdest; |
||
2105 | __m64 dest0, dest1; |
||
2106 | |||
2107 | vdest = *(__m64 *)dst; |
||
2108 | |||
2109 | dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0))); |
||
2110 | dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1))); |
||
2111 | |||
2112 | *(__m64 *)dst = pack8888 (dest0, dest1); |
||
2113 | } |
||
2114 | else |
||
2115 | { |
||
2116 | *(uint64_t *)dst = 0; |
||
2117 | } |
||
2118 | |||
2119 | mask += 2; |
||
2120 | dst += 2; |
||
2121 | w -= 2; |
||
2122 | } |
||
2123 | |||
2124 | CHECKPOINT (); |
||
2125 | |||
2126 | while (w) |
||
2127 | { |
||
2128 | uint64_t m = *mask; |
||
2129 | |||
2130 | if (m) |
||
2131 | { |
||
2132 | __m64 vdest = load8888 (*dst); |
||
2133 | |||
2134 | vdest = in (vsrc, expand_alpha_rev (to_m64 (m))); |
||
2135 | *dst = store8888 (vdest); |
||
2136 | } |
||
2137 | else |
||
2138 | { |
||
2139 | *dst = 0; |
||
2140 | } |
||
2141 | |||
2142 | w--; |
||
2143 | mask++; |
||
2144 | dst++; |
||
2145 | } |
||
2146 | } |
||
2147 | |||
2148 | _mm_empty (); |
||
2149 | } |
||
2150 | |||
/* OVER-composite a solid color through an a8 mask onto an r5g6b5 (or
 * b5g6r5) destination.  Each row is handled in three phases: a scalar
 * head until dst is 8-byte aligned, a 4-pixel MMX body, and a scalar
 * tail for the remainder.
 */
static void
mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             pixman_image_t *         src_image,
                             pixman_image_t *         mask_image,
                             pixman_image_t *         dst_image,
                             int32_t                  src_x,
                             int32_t                  src_y,
                             int32_t                  mask_x,
                             int32_t                  mask_y,
                             int32_t                  dest_x,
                             int32_t                  dest_y,
                             int32_t                  width,
                             int32_t                  height)
{
    uint32_t src, srca;
    uint16_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca, tmp;
    uint64_t srcsrcsrcsrc, src16;

    CHECKPOINT ();

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    srca = src >> 24;
    /* A zero (fully transparent, premultiplied) source makes OVER a no-op. */
    if (src == 0)
        return;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    /* Precompute four copies of the source converted to 5:6:5, so a run
     * of four fully-opaque mask bytes can be written with one 64-bit store.
     */
    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
    src16 = to_uint64 (tmp);

    srcsrcsrcsrc =
        (uint64_t)src16 << 48 | (uint64_t)src16 << 32 |
        (uint64_t)src16 << 16 | (uint64_t)src16;

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;
        w = width;

        CHECKPOINT ();

        /* Scalar head: process single pixels until dst is 8-byte aligned. */
        while (w && (unsigned long)dst & 7)
        {
            uint64_t m = *mask;

            if (m)
            {
                uint64_t d = *dst;
                __m64 vd = to_m64 (d);
                __m64 vdest = in_over (
                    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));

                vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
                *dst = to_uint64 (vd);
            }

            w--;
            mask++;
            dst++;
        }

        CHECKPOINT ();

        /* Aligned body: four 16-bit pixels per iteration. */
        while (w >= 4)
        {
            uint64_t m0, m1, m2, m3;
            m0 = *mask;
            m1 = *(mask + 1);
            m2 = *(mask + 2);
            m3 = *(mask + 3);

            if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
            {
                /* Opaque source and fully-set mask: plain 64-bit store. */
                *(uint64_t *)dst = srcsrcsrcsrc;
            }
            else if (m0 | m1 | m2 | m3)
            {
                __m64 vdest;
                __m64 vm0, vm1, vm2, vm3;

                vdest = *(__m64 *)dst;

                /* Blend each of the four 5:6:5 lanes in place, repacking
                 * the result back into the corresponding lane of vdest.
                 */
                vm0 = to_m64 (m0);
                vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0),
                                           expand565 (vdest, 0)), vdest, 0);
                vm1 = to_m64 (m1);
                vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1),
                                           expand565 (vdest, 1)), vdest, 1);
                vm2 = to_m64 (m2);
                vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2),
                                           expand565 (vdest, 2)), vdest, 2);
                vm3 = to_m64 (m3);
                vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3),
                                           expand565 (vdest, 3)), vdest, 3);

                *(__m64 *)dst = vdest;
            }
            /* All four mask bytes zero: destination is left untouched. */

            w -= 4;
            mask += 4;
            dst += 4;
        }

        CHECKPOINT ();

        /* Scalar tail: remaining 1-3 pixels. */
        while (w)
        {
            uint64_t m = *mask;

            if (m)
            {
                uint64_t d = *dst;
                __m64 vd = to_m64 (d);
                __m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
                                       expand565 (vd, 0));
                vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
                *dst = to_uint64 (vd);
            }

            w--;
            mask++;
            dst++;
        }
    }

    _mm_empty ();
}
||
2291 | |||
/* OVER-composite a "pixbuf" source (non-premultiplied; blended with
 * over_rev_non_pre and channel-swapped via invert_colors) onto an
 * r5g6b5 destination.  The mask image is expected to alias the source
 * (see the disabled assert below).  Rows are processed as scalar head,
 * 4-pixel MMX body, scalar tail.
 */
static void
mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                pixman_op_t              op,
                                pixman_image_t *         src_image,
                                pixman_image_t *         mask_image,
                                pixman_image_t *         dst_image,
                                int32_t                  src_x,
                                int32_t                  src_y,
                                int32_t                  mask_x,
                                int32_t                  mask_y,
                                int32_t                  dest_x,
                                int32_t                  dest_y,
                                int32_t                  width,
                                int32_t                  height)
{
    uint16_t *dst_line, *dst;
    uint32_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
        dst = dst_line;
        dst_line += dst_stride;
        src = src_line;
        src_line += src_stride;
        w = width;

        CHECKPOINT ();

        /* Scalar head: single pixels until dst is 8-byte aligned. */
        while (w && (unsigned long)dst & 7)
        {
            __m64 vsrc = load8888 (*src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            w--;
            dst++;
            src++;
        }

        CHECKPOINT ();

        /* Aligned body: four 16-bit pixels per iteration. */
        while (w >= 4)
        {
            uint32_t s0, s1, s2, s3;
            unsigned char a0, a1, a2, a3;

            s0 = *src;
            s1 = *(src + 1);
            s2 = *(src + 2);
            s3 = *(src + 3);

            a0 = (s0 >> 24);
            a1 = (s1 >> 24);
            a2 = (s2 >> 24);
            a3 = (s3 >> 24);

            if ((a0 & a1 & a2 & a3) == 0xFF)
            {
                /* All four source pixels opaque: convert and store
                 * without reading the destination.
                 */
                __m64 vdest;
                vdest = pack_565 (invert_colors (load8888 (s0)), _mm_setzero_si64 (), 0);
                vdest = pack_565 (invert_colors (load8888 (s1)), vdest, 1);
                vdest = pack_565 (invert_colors (load8888 (s2)), vdest, 2);
                vdest = pack_565 (invert_colors (load8888 (s3)), vdest, 3);

                *(__m64 *)dst = vdest;
            }
            else if (s0 | s1 | s2 | s3)
            {
                /* Mixed alpha: blend each 5:6:5 lane individually. */
                __m64 vdest = *(__m64 *)dst;

                vdest = pack_565 (over_rev_non_pre (load8888 (s0), expand565 (vdest, 0)), vdest, 0);
                vdest = pack_565 (over_rev_non_pre (load8888 (s1), expand565 (vdest, 1)), vdest, 1);
                vdest = pack_565 (over_rev_non_pre (load8888 (s2), expand565 (vdest, 2)), vdest, 2);
                vdest = pack_565 (over_rev_non_pre (load8888 (s3), expand565 (vdest, 3)), vdest, 3);

                *(__m64 *)dst = vdest;
            }
            /* All four source pixels zero: destination unchanged. */

            w -= 4;
            dst += 4;
            src += 4;
        }

        CHECKPOINT ();

        /* Scalar tail. */
        while (w)
        {
            __m64 vsrc = load8888 (*src);
            uint64_t d = *dst;
            __m64 vdest = expand565 (to_m64 (d), 0);

            vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

            *dst = to_uint64 (vdest);

            w--;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}
||
2411 | |||
2412 | static void |
||
2413 | mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp, |
||
2414 | pixman_op_t op, |
||
2415 | pixman_image_t * src_image, |
||
2416 | pixman_image_t * mask_image, |
||
2417 | pixman_image_t * dst_image, |
||
2418 | int32_t src_x, |
||
2419 | int32_t src_y, |
||
2420 | int32_t mask_x, |
||
2421 | int32_t mask_y, |
||
2422 | int32_t dest_x, |
||
2423 | int32_t dest_y, |
||
2424 | int32_t width, |
||
2425 | int32_t height) |
||
2426 | { |
||
2427 | uint32_t *dst_line, *dst; |
||
2428 | uint32_t *src_line, *src; |
||
2429 | int dst_stride, src_stride; |
||
2430 | int32_t w; |
||
2431 | |||
2432 | CHECKPOINT (); |
||
2433 | |||
2434 | PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
||
2435 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
||
2436 | |||
2437 | #if 0 |
||
2438 | /* FIXME */ |
||
2439 | assert (src_image->drawable == mask_image->drawable); |
||
2440 | #endif |
||
2441 | |||
2442 | while (height--) |
||
2443 | { |
||
2444 | dst = dst_line; |
||
2445 | dst_line += dst_stride; |
||
2446 | src = src_line; |
||
2447 | src_line += src_stride; |
||
2448 | w = width; |
||
2449 | |||
2450 | while (w && (unsigned long)dst & 7) |
||
2451 | { |
||
2452 | __m64 s = load8888 (*src); |
||
2453 | __m64 d = load8888 (*dst); |
||
2454 | |||
2455 | *dst = store8888 (over_rev_non_pre (s, d)); |
||
2456 | |||
2457 | w--; |
||
2458 | dst++; |
||
2459 | src++; |
||
2460 | } |
||
2461 | |||
2462 | while (w >= 2) |
||
2463 | { |
||
2464 | uint64_t s0, s1; |
||
2465 | unsigned char a0, a1; |
||
2466 | __m64 d0, d1; |
||
2467 | |||
2468 | s0 = *src; |
||
2469 | s1 = *(src + 1); |
||
2470 | |||
2471 | a0 = (s0 >> 24); |
||
2472 | a1 = (s1 >> 24); |
||
2473 | |||
2474 | if ((a0 & a1) == 0xFF) |
||
2475 | { |
||
2476 | d0 = invert_colors (load8888 (s0)); |
||
2477 | d1 = invert_colors (load8888 (s1)); |
||
2478 | |||
2479 | *(__m64 *)dst = pack8888 (d0, d1); |
||
2480 | } |
||
2481 | else if (s0 | s1) |
||
2482 | { |
||
2483 | __m64 vdest = *(__m64 *)dst; |
||
2484 | |||
2485 | d0 = over_rev_non_pre (load8888 (s0), expand8888 (vdest, 0)); |
||
2486 | d1 = over_rev_non_pre (load8888 (s1), expand8888 (vdest, 1)); |
||
2487 | |||
2488 | *(__m64 *)dst = pack8888 (d0, d1); |
||
2489 | } |
||
2490 | |||
2491 | w -= 2; |
||
2492 | dst += 2; |
||
2493 | src += 2; |
||
2494 | } |
||
2495 | |||
2496 | while (w) |
||
2497 | { |
||
2498 | __m64 s = load8888 (*src); |
||
2499 | __m64 d = load8888 (*dst); |
||
2500 | |||
2501 | *dst = store8888 (over_rev_non_pre (s, d)); |
||
2502 | |||
2503 | w--; |
||
2504 | dst++; |
||
2505 | src++; |
||
2506 | } |
||
2507 | } |
||
2508 | |||
2509 | _mm_empty (); |
||
2510 | } |
||
2511 | |||
2512 | static void |
||
2513 | mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp, |
||
2514 | pixman_op_t op, |
||
2515 | pixman_image_t * src_image, |
||
2516 | pixman_image_t * mask_image, |
||
2517 | pixman_image_t * dst_image, |
||
2518 | int32_t src_x, |
||
2519 | int32_t src_y, |
||
2520 | int32_t mask_x, |
||
2521 | int32_t mask_y, |
||
2522 | int32_t dest_x, |
||
2523 | int32_t dest_y, |
||
2524 | int32_t width, |
||
2525 | int32_t height) |
||
2526 | { |
||
2527 | uint32_t src, srca; |
||
2528 | uint16_t *dst_line; |
||
2529 | uint32_t *mask_line; |
||
2530 | int dst_stride, mask_stride; |
||
2531 | __m64 vsrc, vsrca; |
||
2532 | |||
2533 | CHECKPOINT (); |
||
2534 | |||
2535 | src = _pixman_image_get_solid (src_image, dst_image->bits.format); |
||
2536 | |||
2537 | srca = src >> 24; |
||
2538 | if (src == 0) |
||
2539 | return; |
||
2540 | |||
2541 | PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1); |
||
2542 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1); |
||
2543 | |||
2544 | vsrc = load8888 (src); |
||
2545 | vsrca = expand_alpha (vsrc); |
||
2546 | |||
2547 | while (height--) |
||
2548 | { |
||
2549 | int twidth = width; |
||
2550 | uint32_t *p = (uint32_t *)mask_line; |
||
2551 | uint16_t *q = (uint16_t *)dst_line; |
||
2552 | |||
2553 | while (twidth && ((unsigned long)q & 7)) |
||
2554 | { |
||
2555 | uint32_t m = *(uint32_t *)p; |
||
2556 | |||
2557 | if (m) |
||
2558 | { |
||
2559 | uint64_t d = *q; |
||
2560 | __m64 vdest = expand565 (to_m64 (d), 0); |
||
2561 | vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); |
||
2562 | *q = to_uint64 (vdest); |
||
2563 | } |
||
2564 | |||
2565 | twidth--; |
||
2566 | p++; |
||
2567 | q++; |
||
2568 | } |
||
2569 | |||
2570 | while (twidth >= 4) |
||
2571 | { |
||
2572 | uint32_t m0, m1, m2, m3; |
||
2573 | |||
2574 | m0 = *p; |
||
2575 | m1 = *(p + 1); |
||
2576 | m2 = *(p + 2); |
||
2577 | m3 = *(p + 3); |
||
2578 | |||
2579 | if ((m0 | m1 | m2 | m3)) |
||
2580 | { |
||
2581 | __m64 vdest = *(__m64 *)q; |
||
2582 | |||
2583 | vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m0), expand565 (vdest, 0)), vdest, 0); |
||
2584 | vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m1), expand565 (vdest, 1)), vdest, 1); |
||
2585 | vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m2), expand565 (vdest, 2)), vdest, 2); |
||
2586 | vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m3), expand565 (vdest, 3)), vdest, 3); |
||
2587 | |||
2588 | *(__m64 *)q = vdest; |
||
2589 | } |
||
2590 | twidth -= 4; |
||
2591 | p += 4; |
||
2592 | q += 4; |
||
2593 | } |
||
2594 | |||
2595 | while (twidth) |
||
2596 | { |
||
2597 | uint32_t m; |
||
2598 | |||
2599 | m = *(uint32_t *)p; |
||
2600 | if (m) |
||
2601 | { |
||
2602 | uint64_t d = *q; |
||
2603 | __m64 vdest = expand565 (to_m64 (d), 0); |
||
2604 | vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0); |
||
2605 | *q = to_uint64 (vdest); |
||
2606 | } |
||
2607 | |||
2608 | twidth--; |
||
2609 | p++; |
||
2610 | q++; |
||
2611 | } |
||
2612 | |||
2613 | mask_line += mask_stride; |
||
2614 | dst_line += dst_stride; |
||
2615 | } |
||
2616 | |||
2617 | _mm_empty (); |
||
2618 | } |
||
2619 | |||
2620 | static void |
||
2621 | mmx_composite_in_n_8_8 (pixman_implementation_t *imp, |
||
2622 | pixman_op_t op, |
||
2623 | pixman_image_t * src_image, |
||
2624 | pixman_image_t * mask_image, |
||
2625 | pixman_image_t * dst_image, |
||
2626 | int32_t src_x, |
||
2627 | int32_t src_y, |
||
2628 | int32_t mask_x, |
||
2629 | int32_t mask_y, |
||
2630 | int32_t dest_x, |
||
2631 | int32_t dest_y, |
||
2632 | int32_t width, |
||
2633 | int32_t height) |
||
2634 | { |
||
2635 | uint8_t *dst_line, *dst; |
||
2636 | uint8_t *mask_line, *mask; |
||
2637 | int dst_stride, mask_stride; |
||
2638 | int32_t w; |
||
2639 | uint32_t src; |
||
2640 | uint8_t sa; |
||
2641 | __m64 vsrc, vsrca; |
||
2642 | |||
2643 | PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
||
2644 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
||
2645 | |||
2646 | src = _pixman_image_get_solid (src_image, dst_image->bits.format); |
||
2647 | |||
2648 | sa = src >> 24; |
||
2649 | |||
2650 | vsrc = load8888 (src); |
||
2651 | vsrca = expand_alpha (vsrc); |
||
2652 | |||
2653 | while (height--) |
||
2654 | { |
||
2655 | dst = dst_line; |
||
2656 | dst_line += dst_stride; |
||
2657 | mask = mask_line; |
||
2658 | mask_line += mask_stride; |
||
2659 | w = width; |
||
2660 | |||
2661 | if ((((unsigned long)dst_image & 3) == 0) && |
||
2662 | (((unsigned long)src_image & 3) == 0)) |
||
2663 | { |
||
2664 | while (w >= 4) |
||
2665 | { |
||
2666 | uint32_t m; |
||
2667 | __m64 vmask; |
||
2668 | __m64 vdest; |
||
2669 | |||
2670 | m = 0; |
||
2671 | |||
2672 | vmask = load8888 (*(uint32_t *)mask); |
||
2673 | vdest = load8888 (*(uint32_t *)dst); |
||
2674 | |||
2675 | *(uint32_t *)dst = store8888 (in (in (vsrca, vmask), vdest)); |
||
2676 | |||
2677 | dst += 4; |
||
2678 | mask += 4; |
||
2679 | w -= 4; |
||
2680 | } |
||
2681 | } |
||
2682 | |||
2683 | while (w--) |
||
2684 | { |
||
2685 | uint16_t tmp; |
||
2686 | uint8_t a; |
||
2687 | uint32_t m, d; |
||
2688 | |||
2689 | a = *mask++; |
||
2690 | d = *dst; |
||
2691 | |||
2692 | m = MUL_UN8 (sa, a, tmp); |
||
2693 | d = MUL_UN8 (m, d, tmp); |
||
2694 | |||
2695 | *dst++ = d; |
||
2696 | } |
||
2697 | } |
||
2698 | |||
2699 | _mm_empty (); |
||
2700 | } |
||
2701 | |||
2702 | static void |
||
2703 | mmx_composite_in_8_8 (pixman_implementation_t *imp, |
||
2704 | pixman_op_t op, |
||
2705 | pixman_image_t * src_image, |
||
2706 | pixman_image_t * mask_image, |
||
2707 | pixman_image_t * dst_image, |
||
2708 | int32_t src_x, |
||
2709 | int32_t src_y, |
||
2710 | int32_t mask_x, |
||
2711 | int32_t mask_y, |
||
2712 | int32_t dest_x, |
||
2713 | int32_t dest_y, |
||
2714 | int32_t width, |
||
2715 | int32_t height) |
||
2716 | { |
||
2717 | uint8_t *dst_line, *dst; |
||
2718 | uint8_t *src_line, *src; |
||
2719 | int src_stride, dst_stride; |
||
2720 | int32_t w; |
||
2721 | |||
2722 | PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
||
2723 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); |
||
2724 | |||
2725 | while (height--) |
||
2726 | { |
||
2727 | dst = dst_line; |
||
2728 | dst_line += dst_stride; |
||
2729 | src = src_line; |
||
2730 | src_line += src_stride; |
||
2731 | w = width; |
||
2732 | |||
2733 | if ((((unsigned long)dst_image & 3) == 0) && |
||
2734 | (((unsigned long)src_image & 3) == 0)) |
||
2735 | { |
||
2736 | while (w >= 4) |
||
2737 | { |
||
2738 | uint32_t *s = (uint32_t *)src; |
||
2739 | uint32_t *d = (uint32_t *)dst; |
||
2740 | |||
2741 | *d = store8888 (in (load8888 (*s), load8888 (*d))); |
||
2742 | |||
2743 | w -= 4; |
||
2744 | dst += 4; |
||
2745 | src += 4; |
||
2746 | } |
||
2747 | } |
||
2748 | |||
2749 | while (w--) |
||
2750 | { |
||
2751 | uint8_t s, d; |
||
2752 | uint16_t tmp; |
||
2753 | |||
2754 | s = *src; |
||
2755 | d = *dst; |
||
2756 | |||
2757 | *dst = MUL_UN8 (s, d, tmp); |
||
2758 | |||
2759 | src++; |
||
2760 | dst++; |
||
2761 | } |
||
2762 | } |
||
2763 | |||
2764 | _mm_empty (); |
||
2765 | } |
||
2766 | |||
2767 | static void |
||
2768 | mmx_composite_add_n_8_8 (pixman_implementation_t *imp, |
||
2769 | pixman_op_t op, |
||
2770 | pixman_image_t * src_image, |
||
2771 | pixman_image_t * mask_image, |
||
2772 | pixman_image_t * dst_image, |
||
2773 | int32_t src_x, |
||
2774 | int32_t src_y, |
||
2775 | int32_t mask_x, |
||
2776 | int32_t mask_y, |
||
2777 | int32_t dest_x, |
||
2778 | int32_t dest_y, |
||
2779 | int32_t width, |
||
2780 | int32_t height) |
||
2781 | { |
||
2782 | uint8_t *dst_line, *dst; |
||
2783 | uint8_t *mask_line, *mask; |
||
2784 | int dst_stride, mask_stride; |
||
2785 | int32_t w; |
||
2786 | uint32_t src; |
||
2787 | uint8_t sa; |
||
2788 | __m64 vsrc, vsrca; |
||
2789 | |||
2790 | PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
||
2791 | PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1); |
||
2792 | |||
2793 | src = _pixman_image_get_solid (src_image, dst_image->bits.format); |
||
2794 | |||
2795 | sa = src >> 24; |
||
2796 | |||
2797 | if (src == 0) |
||
2798 | return; |
||
2799 | |||
2800 | vsrc = load8888 (src); |
||
2801 | vsrca = expand_alpha (vsrc); |
||
2802 | |||
2803 | while (height--) |
||
2804 | { |
||
2805 | dst = dst_line; |
||
2806 | dst_line += dst_stride; |
||
2807 | mask = mask_line; |
||
2808 | mask_line += mask_stride; |
||
2809 | w = width; |
||
2810 | |||
2811 | if ((((unsigned long)mask_image & 3) == 0) && |
||
2812 | (((unsigned long)dst_image & 3) == 0)) |
||
2813 | { |
||
2814 | while (w >= 4) |
||
2815 | { |
||
2816 | __m64 vmask = load8888 (*(uint32_t *)mask); |
||
2817 | __m64 vdest = load8888 (*(uint32_t *)dst); |
||
2818 | |||
2819 | *(uint32_t *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest)); |
||
2820 | |||
2821 | w -= 4; |
||
2822 | dst += 4; |
||
2823 | mask += 4; |
||
2824 | } |
||
2825 | } |
||
2826 | |||
2827 | while (w--) |
||
2828 | { |
||
2829 | uint16_t tmp; |
||
2830 | uint16_t a; |
||
2831 | uint32_t m, d; |
||
2832 | uint32_t r; |
||
2833 | |||
2834 | a = *mask++; |
||
2835 | d = *dst; |
||
2836 | |||
2837 | m = MUL_UN8 (sa, a, tmp); |
||
2838 | r = ADD_UN8 (m, d, tmp); |
||
2839 | |||
2840 | *dst++ = r; |
||
2841 | } |
||
2842 | } |
||
2843 | |||
2844 | _mm_empty (); |
||
2845 | } |
||
2846 | |||
2847 | static void |
||
2848 | mmx_composite_add_8_8 (pixman_implementation_t *imp, |
||
2849 | pixman_op_t op, |
||
2850 | pixman_image_t * src_image, |
||
2851 | pixman_image_t * mask_image, |
||
2852 | pixman_image_t * dst_image, |
||
2853 | int32_t src_x, |
||
2854 | int32_t src_y, |
||
2855 | int32_t mask_x, |
||
2856 | int32_t mask_y, |
||
2857 | int32_t dest_x, |
||
2858 | int32_t dest_y, |
||
2859 | int32_t width, |
||
2860 | int32_t height) |
||
2861 | { |
||
2862 | uint8_t *dst_line, *dst; |
||
2863 | uint8_t *src_line, *src; |
||
2864 | int dst_stride, src_stride; |
||
2865 | int32_t w; |
||
2866 | uint8_t s, d; |
||
2867 | uint16_t t; |
||
2868 | |||
2869 | CHECKPOINT (); |
||
2870 | |||
2871 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1); |
||
2872 | PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1); |
||
2873 | |||
2874 | while (height--) |
||
2875 | { |
||
2876 | dst = dst_line; |
||
2877 | dst_line += dst_stride; |
||
2878 | src = src_line; |
||
2879 | src_line += src_stride; |
||
2880 | w = width; |
||
2881 | |||
2882 | while (w && (unsigned long)dst & 7) |
||
2883 | { |
||
2884 | s = *src; |
||
2885 | d = *dst; |
||
2886 | t = d + s; |
||
2887 | s = t | (0 - (t >> 8)); |
||
2888 | *dst = s; |
||
2889 | |||
2890 | dst++; |
||
2891 | src++; |
||
2892 | w--; |
||
2893 | } |
||
2894 | |||
2895 | while (w >= 8) |
||
2896 | { |
||
2897 | *(__m64*)dst = _mm_adds_pu8 (*(__m64*)src, *(__m64*)dst); |
||
2898 | dst += 8; |
||
2899 | src += 8; |
||
2900 | w -= 8; |
||
2901 | } |
||
2902 | |||
2903 | while (w) |
||
2904 | { |
||
2905 | s = *src; |
||
2906 | d = *dst; |
||
2907 | t = d + s; |
||
2908 | s = t | (0 - (t >> 8)); |
||
2909 | *dst = s; |
||
2910 | |||
2911 | dst++; |
||
2912 | src++; |
||
2913 | w--; |
||
2914 | } |
||
2915 | } |
||
2916 | |||
2917 | _mm_empty (); |
||
2918 | } |
||
2919 | |||
2920 | static void |
||
2921 | mmx_composite_add_8888_8888 (pixman_implementation_t *imp, |
||
2922 | pixman_op_t op, |
||
2923 | pixman_image_t * src_image, |
||
2924 | pixman_image_t * mask_image, |
||
2925 | pixman_image_t * dst_image, |
||
2926 | int32_t src_x, |
||
2927 | int32_t src_y, |
||
2928 | int32_t mask_x, |
||
2929 | int32_t mask_y, |
||
2930 | int32_t dest_x, |
||
2931 | int32_t dest_y, |
||
2932 | int32_t width, |
||
2933 | int32_t height) |
||
2934 | { |
||
2935 | __m64 dst64; |
||
2936 | uint32_t *dst_line, *dst; |
||
2937 | uint32_t *src_line, *src; |
||
2938 | int dst_stride, src_stride; |
||
2939 | int32_t w; |
||
2940 | |||
2941 | CHECKPOINT (); |
||
2942 | |||
2943 | PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1); |
||
2944 | PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1); |
||
2945 | |||
2946 | while (height--) |
||
2947 | { |
||
2948 | dst = dst_line; |
||
2949 | dst_line += dst_stride; |
||
2950 | src = src_line; |
||
2951 | src_line += src_stride; |
||
2952 | w = width; |
||
2953 | |||
2954 | while (w && (unsigned long)dst & 7) |
||
2955 | { |
||
2956 | *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src), |
||
2957 | _mm_cvtsi32_si64 (*dst))); |
||
2958 | dst++; |
||
2959 | src++; |
||
2960 | w--; |
||
2961 | } |
||
2962 | |||
2963 | while (w >= 2) |
||
2964 | { |
||
2965 | dst64 = _mm_adds_pu8 (*(__m64*)src, *(__m64*)dst); |
||
2966 | *(uint64_t*)dst = to_uint64 (dst64); |
||
2967 | dst += 2; |
||
2968 | src += 2; |
||
2969 | w -= 2; |
||
2970 | } |
||
2971 | |||
2972 | if (w) |
||
2973 | { |
||
2974 | *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src), |
||
2975 | _mm_cvtsi32_si64 (*dst))); |
||
2976 | |||
2977 | } |
||
2978 | } |
||
2979 | |||
2980 | _mm_empty (); |
||
2981 | } |
||
2982 | |||
/* Rectangle blit between two 16bpp or 32bpp buffers of the same depth.
 * Returns FALSE (without copying anything) when the depths differ or
 * are unsupported.  Each row is copied through an alignment ladder:
 * 2-byte and 4-byte chunks until the destination is 8-byte aligned,
 * then 64-byte bursts through the eight MMX registers, then 4- and
 * 2-byte chunks for the remainder.
 */
static pixman_bool_t
pixman_blt_mmx (uint32_t *src_bits,
                uint32_t *dst_bits,
                int       src_stride,
                int       dst_stride,
                int       src_bpp,
                int       dst_bpp,
                int       src_x,
                int       src_y,
                int       dst_x,
                int       dst_y,
                int       width,
                int       height)
{
    uint8_t * src_bytes;
    uint8_t * dst_bytes;
    int byte_width;

    if (src_bpp != dst_bpp)
        return FALSE;

    if (src_bpp == 16)
    {
        /* Strides arrive in uint32_t units; convert to uint16_t units
         * to locate the first pixel, then to bytes for the copy loop.
         */
        src_stride = src_stride * (int) sizeof (uint32_t) / 2;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
        src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
        byte_width = 2 * width;
        src_stride *= 2;
        dst_stride *= 2;
    }
    else if (src_bpp == 32)
    {
        /* Same conversion for 32bpp: uint32_t units, then bytes. */
        src_stride = src_stride * (int) sizeof (uint32_t) / 4;
        dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
        src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
        dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
        byte_width = 4 * width;
        src_stride *= 4;
        dst_stride *= 4;
    }
    else
    {
        return FALSE;
    }

    while (height--)
    {
        int w;
        uint8_t *s = src_bytes;
        uint8_t *d = dst_bytes;
        src_bytes += src_stride;
        dst_bytes += dst_stride;
        w = byte_width;

        /* Copy 2 bytes at a time until d is 4-byte aligned. */
        while (w >= 2 && ((unsigned long)d & 3))
        {
            *(uint16_t *)d = *(uint16_t *)s;
            w -= 2;
            s += 2;
            d += 2;
        }

        /* Copy 4 bytes at a time until d is 8-byte aligned. */
        while (w >= 4 && ((unsigned long)d & 7))
        {
            *(uint32_t *)d = *(uint32_t *)s;

            w -= 4;
            s += 4;
            d += 4;
        }

        /* Main burst: 64 bytes per iteration using all eight MMX regs. */
        while (w >= 64)
        {
#if defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
            __asm__ (
                "movq	(%1),	  %%mm0\n"
                "movq	8(%1),	  %%mm1\n"
                "movq	16(%1),	  %%mm2\n"
                "movq	24(%1),	  %%mm3\n"
                "movq	32(%1),	  %%mm4\n"
                "movq	40(%1),	  %%mm5\n"
                "movq	48(%1),	  %%mm6\n"
                "movq	56(%1),	  %%mm7\n"

                "movq	%%mm0,	  (%0)\n"
                "movq	%%mm1,	  8(%0)\n"
                "movq	%%mm2,	  16(%0)\n"
                "movq	%%mm3,	  24(%0)\n"
                "movq	%%mm4,	  32(%0)\n"
                "movq	%%mm5,	  40(%0)\n"
                "movq	%%mm6,	  48(%0)\n"
                "movq	%%mm7,	  56(%0)\n"
                :
                : "r" (d), "r" (s)
                : "memory",
                  "%mm0", "%mm1", "%mm2", "%mm3",
                  "%mm4", "%mm5", "%mm6", "%mm7");
#else
            /* Portable fallback: same 64-byte copy via __m64 loads/stores. */
            __m64 v0 = *(__m64 *)(s + 0);
            __m64 v1 = *(__m64 *)(s + 8);
            __m64 v2 = *(__m64 *)(s + 16);
            __m64 v3 = *(__m64 *)(s + 24);
            __m64 v4 = *(__m64 *)(s + 32);
            __m64 v5 = *(__m64 *)(s + 40);
            __m64 v6 = *(__m64 *)(s + 48);
            __m64 v7 = *(__m64 *)(s + 56);
            *(__m64 *)(d + 0)  = v0;
            *(__m64 *)(d + 8)  = v1;
            *(__m64 *)(d + 16) = v2;
            *(__m64 *)(d + 24) = v3;
            *(__m64 *)(d + 32) = v4;
            *(__m64 *)(d + 40) = v5;
            *(__m64 *)(d + 48) = v6;
            *(__m64 *)(d + 56) = v7;
#endif

            w -= 64;
            s += 64;
            d += 64;
        }
        /* Tail: remaining 4-byte then 2-byte chunks. */
        while (w >= 4)
        {
            *(uint32_t *)d = *(uint32_t *)s;

            w -= 4;
            s += 4;
            d += 4;
        }
        if (w >= 2)
        {
            *(uint16_t *)d = *(uint16_t *)s;
            w -= 2;
            s += 2;
            d += 2;
        }
    }

    _mm_empty ();

    return TRUE;
}
||
3125 | |||
3126 | static void |
||
3127 | mmx_composite_copy_area (pixman_implementation_t *imp, |
||
3128 | pixman_op_t op, |
||
3129 | pixman_image_t * src_image, |
||
3130 | pixman_image_t * mask_image, |
||
3131 | pixman_image_t * dst_image, |
||
3132 | int32_t src_x, |
||
3133 | int32_t src_y, |
||
3134 | int32_t mask_x, |
||
3135 | int32_t mask_y, |
||
3136 | int32_t dest_x, |
||
3137 | int32_t dest_y, |
||
3138 | int32_t width, |
||
3139 | int32_t height) |
||
3140 | { |
||
3141 | pixman_blt_mmx (src_image->bits.bits, |
||
3142 | dst_image->bits.bits, |
||
3143 | src_image->bits.rowstride, |
||
3144 | dst_image->bits.rowstride, |
||
3145 | PIXMAN_FORMAT_BPP (src_image->bits.format), |
||
3146 | PIXMAN_FORMAT_BPP (dst_image->bits.format), |
||
3147 | src_x, src_y, dest_x, dest_y, width, height); |
||
3148 | } |
||
3149 | |||
#if 0
/* Disabled: OVER-composite an x8r8g8b8 source (alpha forced opaque)
 * through an a8 mask onto an a8r8g8b8 destination, one pixel at a
 * time.  Kept under #if 0 — see the note at the fast-path table:
 * apparently not faster than the generic code.
 */
static void
mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                pixman_op_t              op,
                                pixman_image_t *         src_image,
                                pixman_image_t *         mask_image,
                                pixman_image_t *         dst_image,
                                int32_t                  src_x,
                                int32_t                  src_y,
                                int32_t                  mask_x,
                                int32_t                  mask_y,
                                int32_t                  dest_x,
                                int32_t                  dest_y,
                                int32_t                  width,
                                int32_t                  height)
{
    uint32_t *src, *src_line;
    uint32_t *dst, *dst_line;
    uint8_t *mask, *mask_line;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
        src = src_line;
        src_line += src_stride;
        dst = dst_line;
        dst_line += dst_stride;
        mask = mask_line;
        mask_line += mask_stride;

        w = width;

        while (w--)
        {
            uint64_t m = *mask;

            if (m)
            {
                /* Force the x-channel opaque before blending. */
                __m64 s = load8888 (*src | 0xff000000);

                if (m == 0xff)
                {
                    /* Fully-set mask: straight copy of the opaque source. */
                    *dst = store8888 (s);
                }
                else
                {
                    __m64 sa = expand_alpha (s);
                    __m64 vm = expand_alpha_rev (to_m64 (m));
                    __m64 vdest = in_over (s, sa, vm, load8888 (*dst));

                    *dst = store8888 (vdest);
                }
            }
            /* Zero mask: destination untouched. */

            mask++;
            dst++;
            src++;
        }
    }

    _mm_empty ();
}
#endif
||
3218 | |||
3219 | static const pixman_fast_path_t mmx_fast_paths[] = |
||
3220 | { |
||
3221 | PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, mmx_composite_over_n_8_0565 ), |
||
3222 | PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, mmx_composite_over_n_8_0565 ), |
||
3223 | PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, mmx_composite_over_n_8_8888 ), |
||
3224 | PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, mmx_composite_over_n_8_8888 ), |
||
3225 | PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, mmx_composite_over_n_8_8888 ), |
||
3226 | PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, mmx_composite_over_n_8_8888 ), |
||
3227 | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ), |
||
3228 | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ), |
||
3229 | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, mmx_composite_over_n_8888_0565_ca ), |
||
3230 | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ), |
||
3231 | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ), |
||
3232 | PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, mmx_composite_over_n_8888_0565_ca ), |
||
3233 | PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, mmx_composite_over_pixbuf_8888 ), |
||
3234 | PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, mmx_composite_over_pixbuf_8888 ), |
||
3235 | PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, mmx_composite_over_pixbuf_0565 ), |
||
3236 | PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, mmx_composite_over_pixbuf_8888 ), |
||
3237 | PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, mmx_composite_over_pixbuf_8888 ), |
||
3238 | PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, mmx_composite_over_pixbuf_0565 ), |
||
3239 | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, mmx_composite_over_x888_n_8888 ), |
||
3240 | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, mmx_composite_over_x888_n_8888 ), |
||
3241 | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, mmx_composite_over_x888_n_8888 ), |
||
3242 | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, mmx_composite_over_x888_n_8888 ), |
||
3243 | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, mmx_composite_over_8888_n_8888 ), |
||
3244 | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, mmx_composite_over_8888_n_8888 ), |
||
3245 | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, mmx_composite_over_8888_n_8888 ), |
||
3246 | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, mmx_composite_over_8888_n_8888 ), |
||
3247 | #if 0 |
||
3248 | /* FIXME: This code is commented out since it's apparently |
||
3249 | * not actually faster than the generic code. |
||
3250 | */ |
||
3251 | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, mmx_composite_over_x888_8_8888 ), |
||
3252 | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ), |
||
3253 | PIXMAN_STD_FAST_PATH (OVER, x8b8r8g8, a8, x8b8g8r8, mmx_composite_over_x888_8_8888 ), |
||
3254 | PIXMAN_STD_FAST_PATH (OVER, x8b8r8g8, a8, a8r8g8b8, mmx_composite_over_x888_8_8888 ), |
||
3255 | #endif |
||
3256 | PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, mmx_composite_over_n_8888 ), |
||
3257 | PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, mmx_composite_over_n_8888 ), |
||
3258 | PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, mmx_composite_over_n_0565 ), |
||
3259 | PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), |
||
3260 | PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), |
||
3261 | |||
3262 | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, mmx_composite_over_8888_8888 ), |
||
3263 | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, mmx_composite_over_8888_8888 ), |
||
3264 | PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, mmx_composite_over_8888_0565 ), |
||
3265 | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, mmx_composite_over_8888_8888 ), |
||
3266 | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, mmx_composite_over_8888_8888 ), |
||
3267 | PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, mmx_composite_over_8888_0565 ), |
||
3268 | |||
3269 | PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, mmx_composite_add_8888_8888 ), |
||
3270 | PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, mmx_composite_add_8888_8888 ), |
||
3271 | PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, mmx_composite_add_8_8 ), |
||
3272 | PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, mmx_composite_add_n_8_8 ), |
||
3273 | |||
3274 | PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, mmx_composite_src_n_8_8888 ), |
||
3275 | PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, mmx_composite_src_n_8_8888 ), |
||
3276 | PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, mmx_composite_src_n_8_8888 ), |
||
3277 | PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, mmx_composite_src_n_8_8888 ), |
||
3278 | PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, mmx_composite_copy_area ), |
||
3279 | PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, mmx_composite_copy_area ), |
||
3280 | PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), |
||
3281 | PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), |
||
3282 | PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, mmx_composite_copy_area ), |
||
3283 | PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, mmx_composite_copy_area ), |
||
3284 | PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, mmx_composite_copy_area ), |
||
3285 | PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, mmx_composite_copy_area ), |
||
3286 | |||
3287 | PIXMAN_STD_FAST_PATH (IN, a8, null, a8, mmx_composite_in_8_8 ), |
||
3288 | PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, mmx_composite_in_n_8_8 ), |
||
3289 | |||
3290 | { PIXMAN_OP_NONE }, |
||
3291 | }; |
||
3292 | |||
3293 | static pixman_bool_t |
||
3294 | mmx_blt (pixman_implementation_t *imp, |
||
3295 | uint32_t * src_bits, |
||
3296 | uint32_t * dst_bits, |
||
3297 | int src_stride, |
||
3298 | int dst_stride, |
||
3299 | int src_bpp, |
||
3300 | int dst_bpp, |
||
3301 | int src_x, |
||
3302 | int src_y, |
||
3303 | int dst_x, |
||
3304 | int dst_y, |
||
3305 | int width, |
||
3306 | int height) |
||
3307 | { |
||
3308 | if (!pixman_blt_mmx ( |
||
3309 | src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, |
||
3310 | src_x, src_y, dst_x, dst_y, width, height)) |
||
3311 | |||
3312 | { |
||
3313 | return _pixman_implementation_blt ( |
||
3314 | imp->delegate, |
||
3315 | src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp, |
||
3316 | src_x, src_y, dst_x, dst_y, width, height); |
||
3317 | } |
||
3318 | |||
3319 | return TRUE; |
||
3320 | } |
||
3321 | |||
3322 | static pixman_bool_t |
||
3323 | mmx_fill (pixman_implementation_t *imp, |
||
3324 | uint32_t * bits, |
||
3325 | int stride, |
||
3326 | int bpp, |
||
3327 | int x, |
||
3328 | int y, |
||
3329 | int width, |
||
3330 | int height, |
||
3331 | uint32_t xor) |
||
3332 | { |
||
3333 | if (!pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor)) |
||
3334 | { |
||
3335 | return _pixman_implementation_fill ( |
||
3336 | imp->delegate, bits, stride, bpp, x, y, width, height, xor); |
||
3337 | } |
||
3338 | |||
3339 | return TRUE; |
||
3340 | } |
||
3341 | |||
3342 | pixman_implementation_t * |
||
3343 | _pixman_implementation_create_mmx (void) |
||
3344 | { |
||
3345 | pixman_implementation_t *general = _pixman_implementation_create_fast_path (); |
||
3346 | pixman_implementation_t *imp = _pixman_implementation_create (general, mmx_fast_paths); |
||
3347 | |||
3348 | imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u; |
||
3349 | imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u; |
||
3350 | imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u; |
||
3351 | imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u; |
||
3352 | imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u; |
||
3353 | imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u; |
||
3354 | imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u; |
||
3355 | imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u; |
||
3356 | imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u; |
||
3357 | imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u; |
||
3358 | imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u; |
||
3359 | |||
3360 | imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca; |
||
3361 | imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca; |
||
3362 | imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca; |
||
3363 | imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca; |
||
3364 | imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca; |
||
3365 | imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca; |
||
3366 | imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca; |
||
3367 | imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca; |
||
3368 | imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca; |
||
3369 | imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca; |
||
3370 | imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca; |
||
3371 | |||
3372 | imp->blt = mmx_blt; |
||
3373 | imp->fill = mmx_fill; |
||
3374 | |||
3375 | return imp; |
||
3376 | } |
||
3377 | |||
#endif /* USE_MMX */