/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"
 
static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

static __m128i mask_565_rb;
static __m128i mask_565_pack_multiplier;

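/* Throughout this file pixels are "unpacked" from 8 to 16 bits per channel,
 * so one __m128i holds two unpacked a8r8g8b8 pixels.  unpack_32_1x128 ()
 * zero-extends the four bytes of a single pixel into the low four 16-bit
 * lanes of an __m128i.
 */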
static force_inline __m128i
60
unpack_32_1x128 (uint32_t data)
61
{
62
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
63
}
64
 
65
static force_inline void
66
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
67
{
68
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
69
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
70
}
71
 
72
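/* Expand one r5g6b5 value per 32-bit lane into x8r8g8b8: each field is
 * shifted into its byte, and its high bits are replicated into the low bits
 * (via mask_565_fix_rb / mask_565_fix_g) so that a full 5- or 6-bit field
 * maps to 0xff.
 */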
static force_inline __m128i
73
unpack_565_to_8888 (__m128i lo)
74
{
75
    __m128i r, g, b, rb, t;
76
 
77
    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
78
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
79
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
80
 
81
    rb = _mm_or_si128 (r, b);
82
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
83
    t  = _mm_srli_epi32 (t, 5);
84
    rb = _mm_or_si128 (rb, t);
85
 
86
    t  = _mm_and_si128 (g, mask_565_fix_g);
87
    t  = _mm_srli_epi32 (t, 6);
88
    g  = _mm_or_si128 (g, t);
89
 
90
    return _mm_or_si128 (rb, g);
91
}
92
 
93
static force_inline void
94
unpack_565_128_4x128 (__m128i  data,
95
                      __m128i* data0,
96
                      __m128i* data1,
97
                      __m128i* data2,
98
                      __m128i* data3)
99
{
100
    __m128i lo, hi;
101
 
102
    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
103
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
104
 
105
    lo = unpack_565_to_8888 (lo);
106
    hi = unpack_565_to_8888 (hi);
107
 
108
    unpack_128_2x128 (lo, data0, data1);
109
    unpack_128_2x128 (hi, data2, data3);
110
}
111
 
112
static force_inline uint16_t
113
pack_565_32_16 (uint32_t pixel)
114
{
115
    return (uint16_t) (((pixel >> 8) & 0xf800) |
116
		       ((pixel >> 5) & 0x07e0) |
117
		       ((pixel >> 3) & 0x001f));
118
}
119
 
120
static force_inline __m128i
121
pack_2x128_128 (__m128i lo, __m128i hi)
122
{
123
    return _mm_packus_epi16 (lo, hi);
124
}
125
 
126
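/* Pack eight packed a8r8g8b8 pixels (four per register, one per 32-bit
 * lane) into eight r5g6b5 values.  _mm_madd_epi16 with
 * mask_565_pack_multiplier (set up elsewhere) scales the red and blue words
 * and sums them into a single value per pixel; green is masked and OR'd in,
 * and the closing shift/pack sequence narrows the 32-bit results, emulating
 * the SSE4.1-only _mm_packus_epi32.
 */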
static force_inline __m128i
127
pack_565_2packedx128_128 (__m128i lo, __m128i hi)
128
{
129
    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
130
    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
131
 
132
    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
133
    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
134
 
135
    __m128i g0 = _mm_and_si128 (lo, mask_green);
136
    __m128i g1 = _mm_and_si128 (hi, mask_green);
137
 
138
    t0 = _mm_or_si128 (t0, g0);
139
    t1 = _mm_or_si128 (t1, g1);
140
 
141
    /* Simulates _mm_packus_epi32 */
142
    t0 = _mm_slli_epi32 (t0, 16 - 5);
143
    t1 = _mm_slli_epi32 (t1, 16 - 5);
144
    t0 = _mm_srai_epi32 (t0, 16);
145
    t1 = _mm_srai_epi32 (t1, 16);
146
    return _mm_packs_epi32 (t0, t1);
147
}
148
 
149
static force_inline __m128i
150
pack_565_2x128_128 (__m128i lo, __m128i hi)
151
{
152
    __m128i data;
153
    __m128i r, g1, g2, b;
154
 
155
    data = pack_2x128_128 (lo, hi);
156
 
157
    r  = _mm_and_si128 (data, mask_565_r);
158
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
159
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
160
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
161
 
162
    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
163
}
164
 
165
static force_inline __m128i
166
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
167
{
168
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
169
			     pack_565_2x128_128 (*xmm2, *xmm3));
170
}
171
 
172
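/* is_opaque / is_zero / is_transparent test four packed pixels at once:
 * _mm_movemask_epi8 gathers one bit per byte, and 0x8888 selects the
 * alpha (top) byte of each 32-bit pixel.
 */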
static force_inline int
173
is_opaque (__m128i x)
174
{
175
    __m128i ffs = _mm_cmpeq_epi8 (x, x);
176
 
177
    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
178
}
179
 
180
static force_inline int
181
is_zero (__m128i x)
182
{
183
    return _mm_movemask_epi8 (
184
	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
185
}
186
 
187
static force_inline int
188
is_transparent (__m128i x)
189
{
190
    return (_mm_movemask_epi8 (
191
		_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
192
}
193
 
194
static force_inline __m128i
195
expand_pixel_32_1x128 (uint32_t data)
196
{
197
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
198
}
199
 
200
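/* Broadcast the 16-bit alpha lane of each unpacked pixel to all four of its
 * channel lanes, giving a per-channel multiplier.  The _rev variants do the
 * same for data whose alpha sits in the low lane.
 */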
static force_inline __m128i
201
expand_alpha_1x128 (__m128i data)
202
{
203
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
204
						     _MM_SHUFFLE (3, 3, 3, 3)),
205
				_MM_SHUFFLE (3, 3, 3, 3));
206
}
207
 
208
static force_inline void
209
expand_alpha_2x128 (__m128i  data_lo,
210
                    __m128i  data_hi,
211
                    __m128i* alpha_lo,
212
                    __m128i* alpha_hi)
213
{
214
    __m128i lo, hi;
215
 
216
    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
217
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
218
 
219
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
220
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
221
}
222
 
223
static force_inline void
224
expand_alpha_rev_2x128 (__m128i  data_lo,
225
                        __m128i  data_hi,
226
                        __m128i* alpha_lo,
227
                        __m128i* alpha_hi)
228
{
229
    __m128i lo, hi;
230
 
231
    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
232
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
233
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
234
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
235
}
236
 
237
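/* Per-channel multiply of two unpacked pixels by two unpacked multipliers,
 * with exact rounded division by 255: (x * a + 0x0080) * 0x0101 >> 16.
 * mask_0080 and mask_0101 are initialized elsewhere with those per-lane
 * constants.
 */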
static force_inline void
238
pix_multiply_2x128 (__m128i* data_lo,
239
                    __m128i* data_hi,
240
                    __m128i* alpha_lo,
241
                    __m128i* alpha_hi,
242
                    __m128i* ret_lo,
243
                    __m128i* ret_hi)
244
{
245
    __m128i lo, hi;
246
 
247
    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
248
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
249
    lo = _mm_adds_epu16 (lo, mask_0080);
250
    hi = _mm_adds_epu16 (hi, mask_0080);
251
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
252
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
253
}
254
 
255
static force_inline void
256
pix_add_multiply_2x128 (__m128i* src_lo,
257
                        __m128i* src_hi,
258
                        __m128i* alpha_dst_lo,
259
                        __m128i* alpha_dst_hi,
260
                        __m128i* dst_lo,
261
                        __m128i* dst_hi,
262
                        __m128i* alpha_src_lo,
263
                        __m128i* alpha_src_hi,
264
                        __m128i* ret_lo,
265
                        __m128i* ret_hi)
266
{
267
    __m128i t1_lo, t1_hi;
268
    __m128i t2_lo, t2_hi;
269
 
270
    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
271
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
272
 
273
    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
274
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
275
}
276
 
277
static force_inline void
278
negate_2x128 (__m128i  data_lo,
279
              __m128i  data_hi,
280
              __m128i* neg_lo,
281
              __m128i* neg_hi)
282
{
283
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
284
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
285
}
286
 
287
static force_inline void
288
invert_colors_2x128 (__m128i  data_lo,
289
                     __m128i  data_hi,
290
                     __m128i* inv_lo,
291
                     __m128i* inv_hi)
292
{
293
    __m128i lo, hi;
294
 
295
    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
296
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
297
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
298
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
299
}
300
 
301
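/* Porter-Duff OVER for two unpacked pixels per register:
 * dst = src + dst * (255 - src_alpha), finished with a saturating add.
 */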
static force_inline void
302
over_2x128 (__m128i* src_lo,
303
            __m128i* src_hi,
304
            __m128i* alpha_lo,
305
            __m128i* alpha_hi,
306
            __m128i* dst_lo,
307
            __m128i* dst_hi)
308
{
309
    __m128i t1, t2;
310
 
311
    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
312
 
313
    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
314
 
315
    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
316
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
317
}
318
 
319
static force_inline void
320
over_rev_non_pre_2x128 (__m128i  src_lo,
321
                        __m128i  src_hi,
322
                        __m128i* dst_lo,
323
                        __m128i* dst_hi)
324
{
325
    __m128i lo, hi;
326
    __m128i alpha_lo, alpha_hi;
327
 
328
    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
329
 
330
    lo = _mm_or_si128 (alpha_lo, mask_alpha);
331
    hi = _mm_or_si128 (alpha_hi, mask_alpha);
332
 
333
    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
334
 
335
    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
336
 
337
    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
338
}
339
 
340
static force_inline void
341
in_over_2x128 (__m128i* src_lo,
342
               __m128i* src_hi,
343
               __m128i* alpha_lo,
344
               __m128i* alpha_hi,
345
               __m128i* mask_lo,
346
               __m128i* mask_hi,
347
               __m128i* dst_lo,
348
               __m128i* dst_hi)
349
{
350
    __m128i s_lo, s_hi;
351
    __m128i a_lo, a_hi;
352
 
353
    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
354
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
355
 
356
    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
357
}
358
 
359
/* load 4 pixels from a 16-byte-aligned address */
360
static force_inline __m128i
361
load_128_aligned (__m128i* src)
362
{
363
    return _mm_load_si128 (src);
364
}
365
 
366
/* load 4 pixels from an unaligned address */
367
static force_inline __m128i
368
load_128_unaligned (const __m128i* src)
369
{
370
    return _mm_loadu_si128 (src);
371
}
372
 
373
/* save 4 pixels using write-combining (non-temporal) stores to a
 * 16-byte-aligned address
 */
376
static force_inline void
377
save_128_write_combining (__m128i* dst,
378
                          __m128i  data)
379
{
380
    _mm_stream_si128 (dst, data);
381
}
382
 
383
/* save 4 pixels to a 16-byte-aligned address */
384
static force_inline void
385
save_128_aligned (__m128i* dst,
386
                  __m128i  data)
387
{
388
    _mm_store_si128 (dst, data);
389
}
390
 
391
/* save 4 pixels to an unaligned address */
392
static force_inline void
393
save_128_unaligned (__m128i* dst,
394
                    __m128i  data)
395
{
396
    _mm_storeu_si128 (dst, data);
397
}
398
 
399
static force_inline __m128i
400
load_32_1x128 (uint32_t data)
401
{
402
    return _mm_cvtsi32_si128 (data);
403
}
404
 
405
static force_inline __m128i
406
expand_alpha_rev_1x128 (__m128i data)
407
{
408
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
409
}
410
 
411
static force_inline __m128i
412
expand_pixel_8_1x128 (uint8_t data)
413
{
414
    return _mm_shufflelo_epi16 (
415
	unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
416
}
417
 
418
static force_inline __m128i
419
pix_multiply_1x128 (__m128i data,
420
		    __m128i alpha)
421
{
422
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
423
					    mask_0080),
424
			    mask_0101);
425
}
426
 
427
static force_inline __m128i
428
pix_add_multiply_1x128 (__m128i* src,
429
			__m128i* alpha_dst,
430
			__m128i* dst,
431
			__m128i* alpha_src)
432
{
433
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
434
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
435
 
436
    return _mm_adds_epu8 (t1, t2);
437
}
438
 
439
static force_inline __m128i
440
negate_1x128 (__m128i data)
441
{
442
    return _mm_xor_si128 (data, mask_00ff);
443
}
444
 
445
static force_inline __m128i
446
invert_colors_1x128 (__m128i data)
447
{
448
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
449
}
450
 
451
static force_inline __m128i
452
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
453
{
454
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
455
}
456
 
457
static force_inline __m128i
458
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
459
{
460
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
461
		       pix_multiply_1x128 (*alpha, *mask),
462
		       *dst);
463
}
464
 
465
static force_inline __m128i
466
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
467
{
468
    __m128i alpha = expand_alpha_1x128 (src);
469
 
470
    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
471
					   _mm_or_si128 (alpha, mask_alpha)),
472
		       alpha,
473
		       dst);
474
}
475
 
476
static force_inline uint32_t
477
pack_1x128_32 (__m128i data)
478
{
479
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
480
}
481
 
482
static force_inline __m128i
483
expand565_16_1x128 (uint16_t pixel)
484
{
485
    __m128i m = _mm_cvtsi32_si128 (pixel);
486
 
487
    m = unpack_565_to_8888 (m);
488
 
489
    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
490
}
491
 
492
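/* OVER for a single a8r8g8b8 pixel, with fast paths: an opaque source
 * replaces the destination, a zero source leaves it untouched, and only the
 * remaining case takes the unpack / over / pack route.
 */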
static force_inline uint32_t
493
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
494
{
495
    uint8_t a;
496
    __m128i xmms;
497
 
498
    a = src >> 24;
499
 
500
    if (a == 0xff)
501
    {
502
	return src;
503
    }
504
    else if (src)
505
    {
506
	xmms = unpack_32_1x128 (src);
507
	return pack_1x128_32 (
508
	    over_1x128 (xmms, expand_alpha_1x128 (xmms),
509
			unpack_32_1x128 (dst)));
510
    }
511
 
512
    return dst;
513
}
514
 
515
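/* combine1 / combine4 fetch 1 or 4 source pixels and, when a mask is
 * present, scale them by the mask's expanded alpha before the operator
 * runs.  combine4 also short-circuits a fully transparent mask to zero.
 */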
static force_inline uint32_t
516
combine1 (const uint32_t *ps, const uint32_t *pm)
517
{
518
    uint32_t s = *ps;
519
 
520
    if (pm)
521
    {
522
	__m128i ms, mm;
523
 
524
	mm = unpack_32_1x128 (*pm);
525
	mm = expand_alpha_1x128 (mm);
526
 
527
	ms = unpack_32_1x128 (s);
528
	ms = pix_multiply_1x128 (ms, mm);
529
 
530
	s = pack_1x128_32 (ms);
531
    }
532
 
533
    return s;
534
}
535
 
536
static force_inline __m128i
537
combine4 (const __m128i *ps, const __m128i *pm)
538
{
539
    __m128i xmm_src_lo, xmm_src_hi;
540
    __m128i xmm_msk_lo, xmm_msk_hi;
541
    __m128i s;
542
 
543
    if (pm)
544
    {
545
	xmm_msk_lo = load_128_unaligned (pm);
546
 
547
	if (is_transparent (xmm_msk_lo))
548
	    return _mm_setzero_si128 ();
549
    }
550
 
551
    s = load_128_unaligned (ps);
552
 
553
    if (pm)
554
    {
555
	unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
556
	unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
557
 
558
	expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
559
 
560
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
561
			    &xmm_msk_lo, &xmm_msk_hi,
562
			    &xmm_src_lo, &xmm_src_hi);
563
 
564
	s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
565
    }
566
 
567
    return s;
568
}
569
 
570
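/* Masked OVER loop: handle single pixels until dst reaches a 16-byte
 * boundary, then process 4 pixels per iteration with whole-vector shortcuts
 * (skip when the masked source is zero, plain store when it is opaque),
 * and finish with a scalar tail.
 */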
static force_inline void
571
core_combine_over_u_sse2_mask (uint32_t *	  pd,
572
			       const uint32_t*    ps,
573
			       const uint32_t*    pm,
574
			       int                w)
575
{
576
    uint32_t s, d;
577
 
578
    /* Align dst on a 16-byte boundary */
579
    while (w && ((uintptr_t)pd & 15))
580
    {
581
	d = *pd;
582
	s = combine1 (ps, pm);
583
 
584
	if (s)
585
	    *pd = core_combine_over_u_pixel_sse2 (s, d);
586
	pd++;
587
	ps++;
588
	pm++;
589
	w--;
590
    }
591
 
592
    while (w >= 4)
593
    {
594
	__m128i mask = load_128_unaligned ((__m128i *)pm);
595
 
596
	if (!is_zero (mask))
597
	{
598
	    __m128i src;
599
	    __m128i src_hi, src_lo;
600
	    __m128i mask_hi, mask_lo;
601
	    __m128i alpha_hi, alpha_lo;
602
 
603
	    src = load_128_unaligned ((__m128i *)ps);
604
 
605
	    if (is_opaque (_mm_and_si128 (src, mask)))
606
	    {
607
		save_128_aligned ((__m128i *)pd, src);
608
	    }
609
	    else
610
	    {
611
		__m128i dst = load_128_aligned ((__m128i *)pd);
612
		__m128i dst_hi, dst_lo;
613
 
614
		unpack_128_2x128 (mask, &mask_lo, &mask_hi);
615
		unpack_128_2x128 (src, &src_lo, &src_hi);
616
 
617
		expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
618
		pix_multiply_2x128 (&src_lo, &src_hi,
619
				    &mask_lo, &mask_hi,
620
				    &src_lo, &src_hi);
621
 
622
		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
623
 
624
		expand_alpha_2x128 (src_lo, src_hi,
625
				    &alpha_lo, &alpha_hi);
626
 
627
		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
628
			    &dst_lo, &dst_hi);
629
 
630
		save_128_aligned (
631
		    (__m128i *)pd,
632
		    pack_2x128_128 (dst_lo, dst_hi));
633
	    }
634
	}
635
 
636
	pm += 4;
637
	ps += 4;
638
	pd += 4;
639
	w -= 4;
640
    }
641
    while (w)
642
    {
643
	d = *pd;
644
	s = combine1 (ps, pm);
645
 
646
	if (s)
647
	    *pd = core_combine_over_u_pixel_sse2 (s, d);
648
	pd++;
649
	ps++;
650
	pm++;
651
 
652
	w--;
653
    }
654
}
655
 
656
static force_inline void
657
core_combine_over_u_sse2_no_mask (uint32_t *	  pd,
658
				  const uint32_t*    ps,
659
				  int                w)
660
{
661
    uint32_t s, d;
662
 
663
    /* Align dst on a 16-byte boundary */
664
    while (w && ((uintptr_t)pd & 15))
665
    {
666
	d = *pd;
667
	s = *ps;
668
 
669
	if (s)
670
	    *pd = core_combine_over_u_pixel_sse2 (s, d);
671
	pd++;
672
	ps++;
673
	w--;
674
    }
675
 
676
    while (w >= 4)
677
    {
678
	__m128i src;
679
	__m128i src_hi, src_lo, dst_hi, dst_lo;
680
	__m128i alpha_hi, alpha_lo;
681
 
682
	src = load_128_unaligned ((__m128i *)ps);
683
 
684
	if (!is_zero (src))
685
	{
686
	    if (is_opaque (src))
687
	    {
688
		save_128_aligned ((__m128i *)pd, src);
689
	    }
690
	    else
691
	    {
692
		__m128i dst = load_128_aligned ((__m128i *)pd);
693
 
694
		unpack_128_2x128 (src, &src_lo, &src_hi);
695
		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
696
 
697
		expand_alpha_2x128 (src_lo, src_hi,
698
				    &alpha_lo, &alpha_hi);
699
		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
700
			    &dst_lo, &dst_hi);
701
 
702
		save_128_aligned (
703
		    (__m128i *)pd,
704
		    pack_2x128_128 (dst_lo, dst_hi));
705
	    }
706
	}
707
 
708
	ps += 4;
709
	pd += 4;
710
	w -= 4;
711
    }
712
    while (w)
713
    {
714
	d = *pd;
715
	s = *ps;
716
 
717
	if (s)
718
	    *pd = core_combine_over_u_pixel_sse2 (s, d);
719
	pd++;
720
	ps++;
721
 
722
	w--;
723
    }
724
}
725
 
726
static force_inline void
727
sse2_combine_over_u (pixman_implementation_t *imp,
728
                     pixman_op_t              op,
729
                     uint32_t *               pd,
730
                     const uint32_t *         ps,
731
                     const uint32_t *         pm,
732
                     int                      w)
733
{
734
    if (pm)
735
	core_combine_over_u_sse2_mask (pd, ps, pm, w);
736
    else
737
	core_combine_over_u_sse2_no_mask (pd, ps, w);
738
}
739
 
740
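/* OVER_REVERSE: identical math with source and destination swapped; note
 * the (d, s) argument order passed to the scalar OVER helper.
 */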
static void
741
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
742
                             pixman_op_t              op,
743
                             uint32_t *               pd,
744
                             const uint32_t *         ps,
745
                             const uint32_t *         pm,
746
                             int                      w)
747
{
748
    uint32_t s, d;
749
 
750
    __m128i xmm_dst_lo, xmm_dst_hi;
751
    __m128i xmm_src_lo, xmm_src_hi;
752
    __m128i xmm_alpha_lo, xmm_alpha_hi;
753
 
754
    /* Align dst on a 16-byte boundary */
755
    while (w &&
756
           ((uintptr_t)pd & 15))
757
    {
758
	d = *pd;
759
	s = combine1 (ps, pm);
760
 
761
	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
762
	w--;
763
	ps++;
764
	if (pm)
765
	    pm++;
766
    }
767
 
768
    while (w >= 4)
769
    {
770
	/* I'm loading unaligned because I'm not sure
771
	 * about the address alignment.
772
	 */
773
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
774
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
775
 
776
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
777
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
778
 
779
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
780
			    &xmm_alpha_lo, &xmm_alpha_hi);
781
 
782
	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
783
		    &xmm_alpha_lo, &xmm_alpha_hi,
784
		    &xmm_src_lo, &xmm_src_hi);
785
 
786
	/* rebuild the 4 pixel data and save */
787
	save_128_aligned ((__m128i*)pd,
788
			  pack_2x128_128 (xmm_src_lo, xmm_src_hi));
789
 
790
	w -= 4;
791
	ps += 4;
792
	pd += 4;
793
 
794
	if (pm)
795
	    pm += 4;
796
    }
797
 
798
    while (w)
799
    {
800
	d = *pd;
801
	s = combine1 (ps, pm);
802
 
803
	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
804
	ps++;
805
	w--;
806
	if (pm)
807
	    pm++;
808
    }
809
}
810
 
811
static force_inline uint32_t
812
core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
813
{
814
    uint32_t maska = src >> 24;
815
 
816
    if (maska == 0)
817
    {
818
	return 0;
819
    }
820
    else if (maska != 0xff)
821
    {
822
	return pack_1x128_32 (
823
	    pix_multiply_1x128 (unpack_32_1x128 (dst),
824
				expand_alpha_1x128 (unpack_32_1x128 (src))));
825
    }
826
 
827
    return dst;
828
}
829
 
830
static void
831
sse2_combine_in_u (pixman_implementation_t *imp,
832
                   pixman_op_t              op,
833
                   uint32_t *               pd,
834
                   const uint32_t *         ps,
835
                   const uint32_t *         pm,
836
                   int                      w)
837
{
838
    uint32_t s, d;
839
 
840
    __m128i xmm_src_lo, xmm_src_hi;
841
    __m128i xmm_dst_lo, xmm_dst_hi;
842
 
843
    while (w && ((uintptr_t)pd & 15))
844
    {
845
	s = combine1 (ps, pm);
846
	d = *pd;
847
 
848
	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
849
	w--;
850
	ps++;
851
	if (pm)
852
	    pm++;
853
    }
854
 
855
    while (w >= 4)
856
    {
857
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
858
	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
859
 
860
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
861
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
862
 
863
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
864
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
865
			    &xmm_dst_lo, &xmm_dst_hi,
866
			    &xmm_dst_lo, &xmm_dst_hi);
867
 
868
	save_128_aligned ((__m128i*)pd,
869
			  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
870
 
871
	ps += 4;
872
	pd += 4;
873
	w -= 4;
874
	if (pm)
875
	    pm += 4;
876
    }
877
 
878
    while (w)
879
    {
880
	s = combine1 (ps, pm);
881
	d = *pd;
882
 
883
	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
884
	w--;
885
	ps++;
886
	if (pm)
887
	    pm++;
888
    }
889
}
890
 
891
static void
892
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
893
                           pixman_op_t              op,
894
                           uint32_t *               pd,
895
                           const uint32_t *         ps,
896
                           const uint32_t *         pm,
897
                           int                      w)
898
{
899
    uint32_t s, d;
900
 
901
    __m128i xmm_src_lo, xmm_src_hi;
902
    __m128i xmm_dst_lo, xmm_dst_hi;
903
 
904
    while (w && ((uintptr_t)pd & 15))
905
    {
906
	s = combine1 (ps, pm);
907
	d = *pd;
908
 
909
	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
910
	ps++;
911
	w--;
912
	if (pm)
913
	    pm++;
914
    }
915
 
916
    while (w >= 4)
917
    {
918
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
919
	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
920
 
921
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
922
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
923
 
924
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
925
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
926
			    &xmm_src_lo, &xmm_src_hi,
927
			    &xmm_dst_lo, &xmm_dst_hi);
928
 
929
	save_128_aligned (
930
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
931
 
932
	ps += 4;
933
	pd += 4;
934
	w -= 4;
935
	if (pm)
936
	    pm += 4;
937
    }
938
 
939
    while (w)
940
    {
941
	s = combine1 (ps, pm);
942
	d = *pd;
943
 
944
	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
945
	w--;
946
	ps++;
947
	if (pm)
948
	    pm++;
949
    }
950
}
951
 
952
static void
953
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
954
                            pixman_op_t              op,
955
                            uint32_t *               pd,
956
                            const uint32_t *         ps,
957
                            const uint32_t *         pm,
958
                            int                      w)
959
{
960
    while (w && ((uintptr_t)pd & 15))
961
    {
962
	uint32_t s = combine1 (ps, pm);
963
	uint32_t d = *pd;
964
 
965
	*pd++ = pack_1x128_32 (
966
	    pix_multiply_1x128 (
967
		unpack_32_1x128 (d), negate_1x128 (
968
		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
969
 
970
	if (pm)
971
	    pm++;
972
	ps++;
973
	w--;
974
    }
975
 
976
    while (w >= 4)
977
    {
978
	__m128i xmm_src_lo, xmm_src_hi;
979
	__m128i xmm_dst_lo, xmm_dst_hi;
980
 
981
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
982
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
983
 
984
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
985
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
986
 
987
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
988
	negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
989
 
990
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
991
			    &xmm_src_lo, &xmm_src_hi,
992
			    &xmm_dst_lo, &xmm_dst_hi);
993
 
994
	save_128_aligned (
995
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
996
 
997
	ps += 4;
998
	pd += 4;
999
	if (pm)
1000
	    pm += 4;
1001
 
1002
	w -= 4;
1003
    }
1004
 
1005
    while (w)
1006
    {
1007
	uint32_t s = combine1 (ps, pm);
1008
	uint32_t d = *pd;
1009
 
1010
	*pd++ = pack_1x128_32 (
1011
	    pix_multiply_1x128 (
1012
		unpack_32_1x128 (d), negate_1x128 (
1013
		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
1014
	ps++;
1015
	if (pm)
1016
	    pm++;
1017
	w--;
1018
    }
1019
}
1020
 
1021
static void
1022
sse2_combine_out_u (pixman_implementation_t *imp,
1023
                    pixman_op_t              op,
1024
                    uint32_t *               pd,
1025
                    const uint32_t *         ps,
1026
                    const uint32_t *         pm,
1027
                    int                      w)
1028
{
1029
    while (w && ((uintptr_t)pd & 15))
1030
    {
1031
	uint32_t s = combine1 (ps, pm);
1032
	uint32_t d = *pd;
1033
 
1034
	*pd++ = pack_1x128_32 (
1035
	    pix_multiply_1x128 (
1036
		unpack_32_1x128 (s), negate_1x128 (
1037
		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
1038
	w--;
1039
	ps++;
1040
	if (pm)
1041
	    pm++;
1042
    }
1043
 
1044
    while (w >= 4)
1045
    {
1046
	__m128i xmm_src_lo, xmm_src_hi;
1047
	__m128i xmm_dst_lo, xmm_dst_hi;
1048
 
1049
	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1050
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1051
 
1052
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1053
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1054
 
1055
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1056
	negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1057
 
1058
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1059
			    &xmm_dst_lo, &xmm_dst_hi,
1060
			    &xmm_dst_lo, &xmm_dst_hi);
1061
 
1062
	save_128_aligned (
1063
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1064
 
1065
	ps += 4;
1066
	pd += 4;
1067
	w -= 4;
1068
	if (pm)
1069
	    pm += 4;
1070
    }
1071
 
1072
    while (w)
1073
    {
1074
	uint32_t s = combine1 (ps, pm);
1075
	uint32_t d = *pd;
1076
 
1077
	*pd++ = pack_1x128_32 (
1078
	    pix_multiply_1x128 (
1079
		unpack_32_1x128 (s), negate_1x128 (
1080
		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
1081
	w--;
1082
	ps++;
1083
	if (pm)
1084
	    pm++;
1085
    }
1086
}
1087
 
1088
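/* ATOP for one pixel: result = src * dst_alpha + dst * (255 - src_alpha),
 * computed with a single pix_add_multiply.
 */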
static force_inline uint32_t
1089
core_combine_atop_u_pixel_sse2 (uint32_t src,
1090
                                uint32_t dst)
1091
{
1092
    __m128i s = unpack_32_1x128 (src);
1093
    __m128i d = unpack_32_1x128 (dst);
1094
 
1095
    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
1096
    __m128i da = expand_alpha_1x128 (d);
1097
 
1098
    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1099
}
1100
 
1101
static void
1102
sse2_combine_atop_u (pixman_implementation_t *imp,
1103
                     pixman_op_t              op,
1104
                     uint32_t *               pd,
1105
                     const uint32_t *         ps,
1106
                     const uint32_t *         pm,
1107
                     int                      w)
1108
{
1109
    uint32_t s, d;
1110
 
1111
    __m128i xmm_src_lo, xmm_src_hi;
1112
    __m128i xmm_dst_lo, xmm_dst_hi;
1113
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1114
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1115
 
1116
    while (w && ((uintptr_t)pd & 15))
1117
    {
1118
	s = combine1 (ps, pm);
1119
	d = *pd;
1120
 
1121
	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1122
	w--;
1123
	ps++;
1124
	if (pm)
1125
	    pm++;
1126
    }
1127
 
1128
    while (w >= 4)
1129
    {
1130
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1131
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1132
 
1133
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1134
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1135
 
1136
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1137
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1138
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1139
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1140
 
1141
	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1142
		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1143
 
1144
	pix_add_multiply_2x128 (
1145
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1146
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1147
	    &xmm_dst_lo, &xmm_dst_hi);
1148
 
1149
	save_128_aligned (
1150
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1151
 
1152
	ps += 4;
1153
	pd += 4;
1154
	w -= 4;
1155
	if (pm)
1156
	    pm += 4;
1157
    }
1158
 
1159
    while (w)
1160
    {
1161
	s = combine1 (ps, pm);
1162
	d = *pd;
1163
 
1164
	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1165
	w--;
1166
	ps++;
1167
	if (pm)
1168
	    pm++;
1169
    }
1170
}
1171
 
1172
static force_inline uint32_t
1173
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1174
                                        uint32_t dst)
1175
{
1176
    __m128i s = unpack_32_1x128 (src);
1177
    __m128i d = unpack_32_1x128 (dst);
1178
 
1179
    __m128i sa = expand_alpha_1x128 (s);
1180
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
1181
 
1182
    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1183
}
1184
 
1185
static void
1186
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
1187
                             pixman_op_t              op,
1188
                             uint32_t *               pd,
1189
                             const uint32_t *         ps,
1190
                             const uint32_t *         pm,
1191
                             int                      w)
1192
{
1193
    uint32_t s, d;
1194
 
1195
    __m128i xmm_src_lo, xmm_src_hi;
1196
    __m128i xmm_dst_lo, xmm_dst_hi;
1197
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1198
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1199
 
1200
    while (w && ((uintptr_t)pd & 15))
1201
    {
1202
	s = combine1 (ps, pm);
1203
	d = *pd;
1204
 
1205
	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1206
	ps++;
1207
	w--;
1208
	if (pm)
1209
	    pm++;
1210
    }
1211
 
1212
    while (w >= 4)
1213
    {
1214
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1215
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1216
 
1217
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1218
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1219
 
1220
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1221
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1222
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1223
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1224
 
1225
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1226
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1227
 
1228
	pix_add_multiply_2x128 (
1229
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1230
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1231
	    &xmm_dst_lo, &xmm_dst_hi);
1232
 
1233
	save_128_aligned (
1234
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1235
 
1236
	ps += 4;
1237
	pd += 4;
1238
	w -= 4;
1239
	if (pm)
1240
	    pm += 4;
1241
    }
1242
 
1243
    while (w)
1244
    {
1245
	s = combine1 (ps, pm);
1246
	d = *pd;
1247
 
1248
	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1249
	ps++;
1250
	w--;
1251
	if (pm)
1252
	    pm++;
1253
    }
1254
}
1255
 
1256
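/* XOR for one pixel:
 * result = src * (255 - dst_alpha) + dst * (255 - src_alpha).
 */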
static force_inline uint32_t
1257
core_combine_xor_u_pixel_sse2 (uint32_t src,
1258
                               uint32_t dst)
1259
{
1260
    __m128i s = unpack_32_1x128 (src);
1261
    __m128i d = unpack_32_1x128 (dst);
1262
 
1263
    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
1264
    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
1265
 
1266
    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
1267
}
1268
 
1269
static void
1270
sse2_combine_xor_u (pixman_implementation_t *imp,
1271
                    pixman_op_t              op,
1272
                    uint32_t *               dst,
1273
                    const uint32_t *         src,
1274
                    const uint32_t *         mask,
1275
                    int                      width)
1276
{
1277
    int w = width;
1278
    uint32_t s, d;
1279
    uint32_t* pd = dst;
1280
    const uint32_t* ps = src;
1281
    const uint32_t* pm = mask;
1282
 
1283
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1284
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1285
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1286
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1287
 
1288
    while (w && ((uintptr_t)pd & 15))
1289
    {
1290
	s = combine1 (ps, pm);
1291
	d = *pd;
1292
 
1293
	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1294
	w--;
1295
	ps++;
1296
	if (pm)
1297
	    pm++;
1298
    }
1299
 
1300
    while (w >= 4)
1301
    {
1302
	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1303
	xmm_dst = load_128_aligned ((__m128i*) pd);
1304
 
1305
	unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1306
	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1307
 
1308
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1309
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1310
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1311
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1312
 
1313
	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1314
		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1315
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1316
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1317
 
1318
	pix_add_multiply_2x128 (
1319
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1320
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1321
	    &xmm_dst_lo, &xmm_dst_hi);
1322
 
1323
	save_128_aligned (
1324
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1325
 
1326
	ps += 4;
1327
	pd += 4;
1328
	w -= 4;
1329
	if (pm)
1330
	    pm += 4;
1331
    }
1332
 
1333
    while (w)
1334
    {
1335
	s = combine1 (ps, pm);
1336
	d = *pd;
1337
 
1338
	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1339
	w--;
1340
	ps++;
1341
	if (pm)
1342
	    pm++;
1343
    }
1344
}
1345
 
1346
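/* ADD: saturating per-byte addition of the (optionally masked) source and
 * the destination.
 */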
static force_inline void
1347
sse2_combine_add_u (pixman_implementation_t *imp,
1348
                    pixman_op_t              op,
1349
                    uint32_t *               dst,
1350
                    const uint32_t *         src,
1351
                    const uint32_t *         mask,
1352
                    int                      width)
1353
{
1354
    int w = width;
1355
    uint32_t s, d;
1356
    uint32_t* pd = dst;
1357
    const uint32_t* ps = src;
1358
    const uint32_t* pm = mask;
1359
 
1360
    while (w && (uintptr_t)pd & 15)
1361
    {
1362
	s = combine1 (ps, pm);
1363
	d = *pd;
1364
 
1365
	ps++;
1366
	if (pm)
1367
	    pm++;
1368
	*pd++ = _mm_cvtsi128_si32 (
1369
	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1370
	w--;
1371
    }
1372
 
1373
    while (w >= 4)
1374
    {
1375
	__m128i s;
1376
 
1377
	s = combine4 ((__m128i*)ps, (__m128i*)pm);
1378
 
1379
	save_128_aligned (
1380
	    (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
1381
 
1382
	pd += 4;
1383
	ps += 4;
1384
	if (pm)
1385
	    pm += 4;
1386
	w -= 4;
1387
    }
1388
 
1389
    while (w--)
1390
    {
1391
	s = combine1 (ps, pm);
1392
	d = *pd;
1393
 
1394
	ps++;
1395
	*pd++ = _mm_cvtsi128_si32 (
1396
	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1397
	if (pm)
1398
	    pm++;
1399
    }
1400
}
1401
 
1402
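/* SATURATE for one pixel: if the source alpha exceeds the unused
 * destination alpha (~dst >> 24), scale the source by DIV_UN8 (da, sa)
 * first so the subsequent saturating add cannot overflow the destination.
 */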
static force_inline uint32_t
1403
core_combine_saturate_u_pixel_sse2 (uint32_t src,
1404
                                    uint32_t dst)
1405
{
1406
    __m128i ms = unpack_32_1x128 (src);
1407
    __m128i md = unpack_32_1x128 (dst);
1408
    uint32_t sa = src >> 24;
1409
    uint32_t da = ~dst >> 24;
1410
 
1411
    if (sa > da)
1412
    {
1413
	ms = pix_multiply_1x128 (
1414
	    ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
1415
    }
1416
 
1417
    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
1418
}
1419
 
1420
static void
1421
sse2_combine_saturate_u (pixman_implementation_t *imp,
1422
                         pixman_op_t              op,
1423
                         uint32_t *               pd,
1424
                         const uint32_t *         ps,
1425
                         const uint32_t *         pm,
1426
                         int                      w)
1427
{
1428
    uint32_t s, d;
1429
 
1430
    uint32_t pack_cmp;
1431
    __m128i xmm_src, xmm_dst;
1432
 
1433
    while (w && (uintptr_t)pd & 15)
1434
    {
1435
	s = combine1 (ps, pm);
1436
	d = *pd;
1437
 
1438
	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1439
	w--;
1440
	ps++;
1441
	if (pm)
1442
	    pm++;
1443
    }
1444
 
1445
    while (w >= 4)
1446
    {
1447
	xmm_dst = load_128_aligned  ((__m128i*)pd);
1448
	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1449
 
1450
	pack_cmp = _mm_movemask_epi8 (
1451
	    _mm_cmpgt_epi32 (
1452
		_mm_srli_epi32 (xmm_src, 24),
1453
		_mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1454
 
1455
	/* if any source alpha is greater than the corresponding ~dst alpha */
1456
	if (pack_cmp)
1457
	{
1458
	    s = combine1 (ps++, pm);
1459
	    d = *pd;
1460
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1461
	    if (pm)
1462
		pm++;
1463
 
1464
	    s = combine1 (ps++, pm);
1465
	    d = *pd;
1466
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1467
	    if (pm)
1468
		pm++;
1469
 
1470
	    s = combine1 (ps++, pm);
1471
	    d = *pd;
1472
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1473
	    if (pm)
1474
		pm++;
1475
 
1476
	    s = combine1 (ps++, pm);
1477
	    d = *pd;
1478
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1479
	    if (pm)
1480
		pm++;
1481
	}
1482
	else
1483
	{
1484
	    save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1485
 
1486
	    pd += 4;
1487
	    ps += 4;
1488
	    if (pm)
1489
		pm += 4;
1490
	}
1491
 
1492
	w -= 4;
1493
    }
1494
 
1495
    while (w--)
1496
    {
1497
	s = combine1 (ps, pm);
1498
	d = *pd;
1499
 
1500
	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1501
	ps++;
1502
	if (pm)
1503
	    pm++;
1504
    }
1505
}
1506
 
1507
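/* Component-alpha (_ca) combiners: the mask supplies a separate 8-bit
 * factor per channel, so the whole mask pixel (not just its alpha)
 * multiplies the source.  SRC_ca is simply src * mask.
 */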
static void
1508
sse2_combine_src_ca (pixman_implementation_t *imp,
1509
                     pixman_op_t              op,
1510
                     uint32_t *               pd,
1511
                     const uint32_t *         ps,
1512
                     const uint32_t *         pm,
1513
                     int                      w)
1514
{
1515
    uint32_t s, m;
1516
 
1517
    __m128i xmm_src_lo, xmm_src_hi;
1518
    __m128i xmm_mask_lo, xmm_mask_hi;
1519
    __m128i xmm_dst_lo, xmm_dst_hi;
1520
 
1521
    while (w && (uintptr_t)pd & 15)
1522
    {
1523
	s = *ps++;
1524
	m = *pm++;
1525
	*pd++ = pack_1x128_32 (
1526
	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1527
	w--;
1528
    }
1529
 
1530
    while (w >= 4)
1531
    {
1532
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1533
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1534
 
1535
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1536
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1537
 
1538
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1539
			    &xmm_mask_lo, &xmm_mask_hi,
1540
			    &xmm_dst_lo, &xmm_dst_hi);
1541
 
1542
	save_128_aligned (
1543
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1544
 
1545
	ps += 4;
1546
	pd += 4;
1547
	pm += 4;
1548
	w -= 4;
1549
    }
1550
 
1551
    while (w)
1552
    {
1553
	s = *ps++;
1554
	m = *pm++;
1555
	*pd++ = pack_1x128_32 (
1556
	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1557
	w--;
1558
    }
1559
}
1560
 
1561
static force_inline uint32_t
1562
core_combine_over_ca_pixel_sse2 (uint32_t src,
1563
                                 uint32_t mask,
1564
                                 uint32_t dst)
1565
{
1566
    __m128i s = unpack_32_1x128 (src);
1567
    __m128i expAlpha = expand_alpha_1x128 (s);
1568
    __m128i unpk_mask = unpack_32_1x128 (mask);
1569
    __m128i unpk_dst  = unpack_32_1x128 (dst);
1570
 
1571
    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1572
}
1573
 
1574
static void
1575
sse2_combine_over_ca (pixman_implementation_t *imp,
1576
                      pixman_op_t              op,
1577
                      uint32_t *               pd,
1578
                      const uint32_t *         ps,
1579
                      const uint32_t *         pm,
1580
                      int                      w)
1581
{
1582
    uint32_t s, m, d;
1583
 
1584
    __m128i xmm_alpha_lo, xmm_alpha_hi;
1585
    __m128i xmm_src_lo, xmm_src_hi;
1586
    __m128i xmm_dst_lo, xmm_dst_hi;
1587
    __m128i xmm_mask_lo, xmm_mask_hi;
1588
 
1589
    while (w && (uintptr_t)pd & 15)
1590
    {
1591
	s = *ps++;
1592
	m = *pm++;
1593
	d = *pd;
1594
 
1595
	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1596
	w--;
1597
    }
1598
 
1599
    while (w >= 4)
1600
    {
1601
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1602
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1603
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1604
 
1605
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1606
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1607
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1608
 
1609
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1610
			    &xmm_alpha_lo, &xmm_alpha_hi);
1611
 
1612
	in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1613
		       &xmm_alpha_lo, &xmm_alpha_hi,
1614
		       &xmm_mask_lo, &xmm_mask_hi,
1615
		       &xmm_dst_lo, &xmm_dst_hi);
1616
 
1617
	save_128_aligned (
1618
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1619
 
1620
	ps += 4;
1621
	pd += 4;
1622
	pm += 4;
1623
	w -= 4;
1624
    }
1625
 
1626
    while (w)
1627
    {
1628
	s = *ps++;
1629
	m = *pm++;
1630
	d = *pd;
1631
 
1632
	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1633
	w--;
1634
    }
1635
}
1636
 
1637
static force_inline uint32_t
1638
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1639
                                         uint32_t mask,
1640
                                         uint32_t dst)
1641
{
1642
    __m128i d = unpack_32_1x128 (dst);
1643
 
1644
    return pack_1x128_32 (
1645
	over_1x128 (d, expand_alpha_1x128 (d),
1646
		    pix_multiply_1x128 (unpack_32_1x128 (src),
1647
					unpack_32_1x128 (mask))));
1648
}
1649
 
1650
static void
1651
sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
1652
                              pixman_op_t              op,
1653
                              uint32_t *               pd,
1654
                              const uint32_t *         ps,
1655
                              const uint32_t *         pm,
1656
                              int                      w)
1657
{
1658
    uint32_t s, m, d;
1659
 
1660
    __m128i xmm_alpha_lo, xmm_alpha_hi;
1661
    __m128i xmm_src_lo, xmm_src_hi;
1662
    __m128i xmm_dst_lo, xmm_dst_hi;
1663
    __m128i xmm_mask_lo, xmm_mask_hi;
1664
 
1665
    while (w && (uintptr_t)pd & 15)
1666
    {
1667
	s = *ps++;
1668
	m = *pm++;
1669
	d = *pd;
1670
 
1671
	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1672
	w--;
1673
    }
1674
 
1675
    while (w >= 4)
1676
    {
1677
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1678
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1679
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1680
 
1681
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1682
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1683
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1684
 
1685
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1686
			    &xmm_alpha_lo, &xmm_alpha_hi);
1687
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1688
			    &xmm_mask_lo, &xmm_mask_hi,
1689
			    &xmm_mask_lo, &xmm_mask_hi);
1690
 
1691
	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1692
		    &xmm_alpha_lo, &xmm_alpha_hi,
1693
		    &xmm_mask_lo, &xmm_mask_hi);
1694
 
1695
	save_128_aligned (
1696
	    (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1697
 
1698
	ps += 4;
1699
	pd += 4;
1700
	pm += 4;
1701
	w -= 4;
1702
    }
1703
 
1704
    while (w)
1705
    {
1706
	s = *ps++;
1707
	m = *pm++;
1708
	d = *pd;
1709
 
1710
	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1711
	w--;
1712
    }
1713
}
1714
 
1715
static void
1716
sse2_combine_in_ca (pixman_implementation_t *imp,
1717
                    pixman_op_t              op,
1718
                    uint32_t *               pd,
1719
                    const uint32_t *         ps,
1720
                    const uint32_t *         pm,
1721
                    int                      w)
1722
{
1723
    uint32_t s, m, d;
1724
 
1725
    __m128i xmm_alpha_lo, xmm_alpha_hi;
1726
    __m128i xmm_src_lo, xmm_src_hi;
1727
    __m128i xmm_dst_lo, xmm_dst_hi;
1728
    __m128i xmm_mask_lo, xmm_mask_hi;
1729
 
1730
    while (w && (uintptr_t)pd & 15)
1731
    {
1732
	s = *ps++;
1733
	m = *pm++;
1734
	d = *pd;
1735
 
1736
	*pd++ = pack_1x128_32 (
1737
	    pix_multiply_1x128 (
1738
		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1739
		expand_alpha_1x128 (unpack_32_1x128 (d))));
1740
 
1741
	w--;
1742
    }
1743
 
1744
    while (w >= 4)
1745
    {
1746
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1747
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1748
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1749
 
1750
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1751
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1752
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1753
 
1754
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1755
			    &xmm_alpha_lo, &xmm_alpha_hi);
1756
 
1757
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1758
			    &xmm_mask_lo, &xmm_mask_hi,
1759
			    &xmm_dst_lo, &xmm_dst_hi);
1760
 
1761
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1762
			    &xmm_alpha_lo, &xmm_alpha_hi,
1763
			    &xmm_dst_lo, &xmm_dst_hi);
1764
 
1765
	save_128_aligned (
1766
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1767
 
1768
	ps += 4;
1769
	pd += 4;
1770
	pm += 4;
1771
	w -= 4;
1772
    }
1773
 
1774
    while (w)
1775
    {
1776
	s = *ps++;
1777
	m = *pm++;
1778
	d = *pd;
1779
 
1780
	*pd++ = pack_1x128_32 (
1781
	    pix_multiply_1x128 (
1782
		pix_multiply_1x128 (
1783
		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
1784
		expand_alpha_1x128 (unpack_32_1x128 (d))));
1785
 
1786
	w--;
1787
    }
1788
}
1789
 
1790
static void
1791
sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1792
                            pixman_op_t              op,
1793
                            uint32_t *               pd,
1794
                            const uint32_t *         ps,
1795
                            const uint32_t *         pm,
1796
                            int                      w)
1797
{
1798
    uint32_t s, m, d;
1799
 
1800
    __m128i xmm_alpha_lo, xmm_alpha_hi;
1801
    __m128i xmm_src_lo, xmm_src_hi;
1802
    __m128i xmm_dst_lo, xmm_dst_hi;
1803
    __m128i xmm_mask_lo, xmm_mask_hi;
1804
 
1805
    while (w && (uintptr_t)pd & 15)
1806
    {
1807
	s = *ps++;
1808
	m = *pm++;
1809
	d = *pd;
1810
 
1811
	*pd++ = pack_1x128_32 (
1812
	    pix_multiply_1x128 (
1813
		unpack_32_1x128 (d),
1814
		pix_multiply_1x128 (unpack_32_1x128 (m),
1815
				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
1816
	w--;
1817
    }
1818
 
1819
    while (w >= 4)
1820
    {
1821
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1822
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1823
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1824
 
1825
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1826
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1827
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1828
 
1829
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1830
			    &xmm_alpha_lo, &xmm_alpha_hi);
1831
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1832
			    &xmm_alpha_lo, &xmm_alpha_hi,
1833
			    &xmm_alpha_lo, &xmm_alpha_hi);
1834
 
1835
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1836
			    &xmm_alpha_lo, &xmm_alpha_hi,
1837
			    &xmm_dst_lo, &xmm_dst_hi);
1838
 
1839
	save_128_aligned (
1840
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1841
 
1842
	ps += 4;
1843
	pd += 4;
1844
	pm += 4;
1845
	w -= 4;
1846
    }
1847
 
1848
    while (w)
1849
    {
1850
	s = *ps++;
1851
	m = *pm++;
1852
	d = *pd;
1853
 
1854
	*pd++ = pack_1x128_32 (
1855
	    pix_multiply_1x128 (
1856
		unpack_32_1x128 (d),
1857
		pix_multiply_1x128 (unpack_32_1x128 (m),
1858
				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
1859
	w--;
1860
    }
1861
}
1862
 
1863
static void
1864
sse2_combine_out_ca (pixman_implementation_t *imp,
1865
                     pixman_op_t              op,
1866
                     uint32_t *               pd,
1867
                     const uint32_t *         ps,
1868
                     const uint32_t *         pm,
1869
                     int                      w)
1870
{
1871
    uint32_t s, m, d;
1872
 
1873
    __m128i xmm_alpha_lo, xmm_alpha_hi;
1874
    __m128i xmm_src_lo, xmm_src_hi;
1875
    __m128i xmm_dst_lo, xmm_dst_hi;
1876
    __m128i xmm_mask_lo, xmm_mask_hi;
1877
 
1878
    while (w && (uintptr_t)pd & 15)
1879
    {
1880
	s = *ps++;
1881
	m = *pm++;
1882
	d = *pd;
1883
 
1884
	*pd++ = pack_1x128_32 (
1885
	    pix_multiply_1x128 (
1886
		pix_multiply_1x128 (
1887
		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
1888
		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1889
	w--;
1890
    }
1891
 
1892
    while (w >= 4)
1893
    {
1894
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1895
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1896
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1897
 
1898
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1899
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1900
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1901
 
1902
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1903
			    &xmm_alpha_lo, &xmm_alpha_hi);
1904
	negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1905
		      &xmm_alpha_lo, &xmm_alpha_hi);
1906
 
1907
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1908
			    &xmm_mask_lo, &xmm_mask_hi,
1909
			    &xmm_dst_lo, &xmm_dst_hi);
1910
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1911
			    &xmm_alpha_lo, &xmm_alpha_hi,
1912
			    &xmm_dst_lo, &xmm_dst_hi);
1913
 
1914
	save_128_aligned (
1915
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1916
 
1917
	ps += 4;
1918
	pd += 4;
1919
	pm += 4;
1920
	w -= 4;
1921
    }
1922
 
1923
    while (w)
1924
    {
1925
	s = *ps++;
1926
	m = *pm++;
1927
	d = *pd;
1928
 
1929
	*pd++ = pack_1x128_32 (
1930
	    pix_multiply_1x128 (
1931
		pix_multiply_1x128 (
1932
		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
1933
		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1934
 
1935
	w--;
1936
    }
1937
}
1938
 
1939
static void
1940
sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1941
                             pixman_op_t              op,
1942
                             uint32_t *               pd,
1943
                             const uint32_t *         ps,
1944
                             const uint32_t *         pm,
1945
                             int                      w)
1946
{
1947
    uint32_t s, m, d;
1948
 
1949
    __m128i xmm_alpha_lo, xmm_alpha_hi;
1950
    __m128i xmm_src_lo, xmm_src_hi;
1951
    __m128i xmm_dst_lo, xmm_dst_hi;
1952
    __m128i xmm_mask_lo, xmm_mask_hi;
1953
 
1954
    while (w && (uintptr_t)pd & 15)
1955
    {
1956
	s = *ps++;
1957
	m = *pm++;
1958
	d = *pd;
1959
 
1960
	*pd++ = pack_1x128_32 (
1961
	    pix_multiply_1x128 (
1962
		unpack_32_1x128 (d),
1963
		negate_1x128 (pix_multiply_1x128 (
1964
				 unpack_32_1x128 (m),
1965
				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
1966
	w--;
1967
    }
1968
 
1969
    while (w >= 4)
1970
    {
1971
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1972
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1973
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1974
 
1975
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1976
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1977
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1978
 
1979
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1980
			    &xmm_alpha_lo, &xmm_alpha_hi);
1981
 
1982
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1983
			    &xmm_alpha_lo, &xmm_alpha_hi,
1984
			    &xmm_mask_lo, &xmm_mask_hi);
1985
 
1986
	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1987
		      &xmm_mask_lo, &xmm_mask_hi);
1988
 
1989
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1990
			    &xmm_mask_lo, &xmm_mask_hi,
1991
			    &xmm_dst_lo, &xmm_dst_hi);
1992
 
1993
	save_128_aligned (
1994
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1995
 
1996
	ps += 4;
1997
	pd += 4;
1998
	pm += 4;
1999
	w -= 4;
2000
    }
2001
 
2002
    while (w)
2003
    {
2004
	s = *ps++;
2005
	m = *pm++;
2006
	d = *pd;
2007
 
2008
	*pd++ = pack_1x128_32 (
2009
	    pix_multiply_1x128 (
2010
		unpack_32_1x128 (d),
2011
		negate_1x128 (pix_multiply_1x128 (
2012
				 unpack_32_1x128 (m),
2013
				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
2014
	w--;
2015
    }
2016
}
2017
 
2018
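/*
 * Component-alpha ATOP for a single pixel:
 * dest = src * mask * dest_alpha + dest * (1 - mask * src_alpha).
 */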
static force_inline uint32_t
2019
core_combine_atop_ca_pixel_sse2 (uint32_t src,
2020
                                 uint32_t mask,
2021
                                 uint32_t dst)
2022
{
2023
    __m128i m = unpack_32_1x128 (mask);
2024
    __m128i s = unpack_32_1x128 (src);
2025
    __m128i d = unpack_32_1x128 (dst);
2026
    __m128i sa = expand_alpha_1x128 (s);
2027
    __m128i da = expand_alpha_1x128 (d);
2028
 
2029
    s = pix_multiply_1x128 (s, m);
2030
    m = negate_1x128 (pix_multiply_1x128 (m, sa));
2031
 
2032
    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2033
}
2034
 
2035
static void
2036
sse2_combine_atop_ca (pixman_implementation_t *imp,
2037
                      pixman_op_t              op,
2038
                      uint32_t *               pd,
2039
                      const uint32_t *         ps,
2040
                      const uint32_t *         pm,
2041
                      int                      w)
2042
{
2043
    uint32_t s, m, d;
2044
 
2045
    __m128i xmm_src_lo, xmm_src_hi;
2046
    __m128i xmm_dst_lo, xmm_dst_hi;
2047
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2048
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2049
    __m128i xmm_mask_lo, xmm_mask_hi;
2050
 
2051
    while (w && (uintptr_t)pd & 15)
2052
    {
2053
	s = *ps++;
2054
	m = *pm++;
2055
	d = *pd;
2056
 
2057
	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2058
	w--;
2059
    }
2060
 
2061
    while (w >= 4)
2062
    {
2063
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2064
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2065
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2066
 
2067
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2068
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2069
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2070
 
2071
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2072
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2073
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2074
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2075
 
2076
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2077
			    &xmm_mask_lo, &xmm_mask_hi,
2078
			    &xmm_src_lo, &xmm_src_hi);
2079
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2080
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2081
			    &xmm_mask_lo, &xmm_mask_hi);
2082
 
2083
	negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2084
 
2085
	pix_add_multiply_2x128 (
2086
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2087
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2088
	    &xmm_dst_lo, &xmm_dst_hi);
2089
 
2090
	save_128_aligned (
2091
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2092
 
2093
	ps += 4;
2094
	pd += 4;
2095
	pm += 4;
2096
	w -= 4;
2097
    }
2098
 
2099
    while (w)
2100
    {
2101
	s = *ps++;
2102
	m = *pm++;
2103
	d = *pd;
2104
 
2105
	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2106
	w--;
2107
    }
2108
}
2109
 
2110
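/*
 * Component-alpha ATOP_REVERSE for a single pixel:
 * dest = src * mask * (1 - dest_alpha) + dest * (mask * src_alpha).
 */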
static force_inline uint32_t
2111
core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2112
                                         uint32_t mask,
2113
                                         uint32_t dst)
2114
{
2115
    __m128i m = unpack_32_1x128 (mask);
2116
    __m128i s = unpack_32_1x128 (src);
2117
    __m128i d = unpack_32_1x128 (dst);
2118
 
2119
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2120
    __m128i sa = expand_alpha_1x128 (s);
2121
 
2122
    s = pix_multiply_1x128 (s, m);
2123
    m = pix_multiply_1x128 (m, sa);
2124
 
2125
    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2126
}
2127
 
2128
static void
2129
sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2130
                              pixman_op_t              op,
2131
                              uint32_t *               pd,
2132
                              const uint32_t *         ps,
2133
                              const uint32_t *         pm,
2134
                              int                      w)
2135
{
2136
    uint32_t s, m, d;
2137
 
2138
    __m128i xmm_src_lo, xmm_src_hi;
2139
    __m128i xmm_dst_lo, xmm_dst_hi;
2140
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2141
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2142
    __m128i xmm_mask_lo, xmm_mask_hi;
2143
 
2144
    while (w && (uintptr_t)pd & 15)
2145
    {
2146
	s = *ps++;
2147
	m = *pm++;
2148
	d = *pd;
2149
 
2150
	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2151
	w--;
2152
    }
2153
 
2154
    while (w >= 4)
2155
    {
2156
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2157
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2158
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2159
 
2160
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2161
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2162
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2163
 
2164
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2165
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2166
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2167
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2168
 
2169
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2170
			    &xmm_mask_lo, &xmm_mask_hi,
2171
			    &xmm_src_lo, &xmm_src_hi);
2172
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2173
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2174
			    &xmm_mask_lo, &xmm_mask_hi);
2175
 
2176
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2177
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2178
 
2179
	pix_add_multiply_2x128 (
2180
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2181
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2182
	    &xmm_dst_lo, &xmm_dst_hi);
2183
 
2184
	save_128_aligned (
2185
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2186
 
2187
	ps += 4;
2188
	pd += 4;
2189
	pm += 4;
2190
	w -= 4;
2191
    }
2192
 
2193
    while (w)
2194
    {
2195
	s = *ps++;
2196
	m = *pm++;
2197
	d = *pd;
2198
 
2199
	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2200
	w--;
2201
    }
2202
}
2203
 
2204
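/*
 * Component-alpha XOR for a single pixel:
 * dest = src * mask * (1 - dest_alpha) + dest * (1 - mask * src_alpha).
 */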
static force_inline uint32_t
2205
core_combine_xor_ca_pixel_sse2 (uint32_t src,
2206
                                uint32_t mask,
2207
                                uint32_t dst)
2208
{
2209
    __m128i a = unpack_32_1x128 (mask);
2210
    __m128i s = unpack_32_1x128 (src);
2211
    __m128i d = unpack_32_1x128 (dst);
2212
 
2213
    __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2214
				       a, expand_alpha_1x128 (s)));
2215
    __m128i dest      = pix_multiply_1x128 (s, a);
2216
    __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2217
 
2218
    return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2219
                                                &alpha_dst,
2220
                                                &dest,
2221
                                                &alpha_src));
2222
}
2223
 
2224
static void
2225
sse2_combine_xor_ca (pixman_implementation_t *imp,
2226
                     pixman_op_t              op,
2227
                     uint32_t *               pd,
2228
                     const uint32_t *         ps,
2229
                     const uint32_t *         pm,
2230
                     int                      w)
2231
{
2232
    uint32_t s, m, d;
2233
 
2234
    __m128i xmm_src_lo, xmm_src_hi;
2235
    __m128i xmm_dst_lo, xmm_dst_hi;
2236
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2237
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2238
    __m128i xmm_mask_lo, xmm_mask_hi;
2239
 
2240
    while (w && (uintptr_t)pd & 15)
2241
    {
2242
	s = *ps++;
2243
	m = *pm++;
2244
	d = *pd;
2245
 
2246
	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2247
	w--;
2248
    }
2249
 
2250
    while (w >= 4)
2251
    {
2252
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2253
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2254
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2255
 
2256
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2257
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2258
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2259
 
2260
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2261
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2262
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2263
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2264
 
2265
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2266
			    &xmm_mask_lo, &xmm_mask_hi,
2267
			    &xmm_src_lo, &xmm_src_hi);
2268
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2269
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2270
			    &xmm_mask_lo, &xmm_mask_hi);
2271
 
2272
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2273
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2274
	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2275
		      &xmm_mask_lo, &xmm_mask_hi);
2276
 
2277
	pix_add_multiply_2x128 (
2278
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2279
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2280
	    &xmm_dst_lo, &xmm_dst_hi);
2281
 
2282
	save_128_aligned (
2283
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2284
 
2285
	ps += 4;
2286
	pd += 4;
2287
	pm += 4;
2288
	w -= 4;
2289
    }
2290
 
2291
    while (w)
2292
    {
2293
	s = *ps++;
2294
	m = *pm++;
2295
	d = *pd;
2296
 
2297
	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2298
	w--;
2299
    }
2300
}
2301
 
2302
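/*
 * Component-alpha ADD: dest = clamp (src * mask + dest), using the
 * saturating byte add so each channel is capped at 0xff.
 */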
static void
2303
sse2_combine_add_ca (pixman_implementation_t *imp,
2304
                     pixman_op_t              op,
2305
                     uint32_t *               pd,
2306
                     const uint32_t *         ps,
2307
                     const uint32_t *         pm,
2308
                     int                      w)
2309
{
2310
    uint32_t s, m, d;
2311
 
2312
    __m128i xmm_src_lo, xmm_src_hi;
2313
    __m128i xmm_dst_lo, xmm_dst_hi;
2314
    __m128i xmm_mask_lo, xmm_mask_hi;
2315
 
2316
    while (w && (uintptr_t)pd & 15)
2317
    {
2318
	s = *ps++;
2319
	m = *pm++;
2320
	d = *pd;
2321
 
2322
	*pd++ = pack_1x128_32 (
2323
	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2324
					       unpack_32_1x128 (m)),
2325
			   unpack_32_1x128 (d)));
2326
	w--;
2327
    }
2328
 
2329
    while (w >= 4)
2330
    {
2331
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2332
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2333
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2334
 
2335
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2336
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2337
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2338
 
2339
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2340
			    &xmm_mask_lo, &xmm_mask_hi,
2341
			    &xmm_src_lo, &xmm_src_hi);
2342
 
2343
	save_128_aligned (
2344
	    (__m128i*)pd, pack_2x128_128 (
2345
		_mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2346
		_mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2347
 
2348
	ps += 4;
2349
	pd += 4;
2350
	pm += 4;
2351
	w -= 4;
2352
    }
2353
 
2354
    while (w)
2355
    {
2356
	s = *ps++;
2357
	m = *pm++;
2358
	d = *pd;
2359
 
2360
	*pd++ = pack_1x128_32 (
2361
	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2362
					       unpack_32_1x128 (m)),
2363
			   unpack_32_1x128 (d)));
2364
	w--;
2365
    }
2366
}
2367
 
2368
static force_inline __m128i
2369
create_mask_16_128 (uint16_t mask)
2370
{
2371
    return _mm_set1_epi16 (mask);
2372
}
2373
 
2374
/* Work around a code generation bug in Sun Studio 12. */
2375
#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2376
# define create_mask_2x32_128(mask0, mask1)				\
2377
    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2378
#else
2379
static force_inline __m128i
2380
create_mask_2x32_128 (uint32_t mask0,
2381
                      uint32_t mask1)
2382
{
2383
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2384
}
2385
#endif
2386
 
2387
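/*
 * Solid source OVER an a8r8g8b8 destination.  The source pixel and its
 * expanded alpha are computed once outside the loops, so each iteration
 * only unpacks the destination and applies dest = src + dest * (1 - srca).
 */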
static void
2388
sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2389
                            pixman_composite_info_t *info)
2390
{
2391
    PIXMAN_COMPOSITE_ARGS (info);
2392
    uint32_t src;
2393
    uint32_t    *dst_line, *dst, d;
2394
    int32_t w;
2395
    int dst_stride;
2396
    __m128i xmm_src, xmm_alpha;
2397
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2398
 
2399
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2400
 
2401
    if (src == 0)
2402
	return;
2403
 
2404
    PIXMAN_IMAGE_GET_LINE (
2405
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2406
 
2407
    xmm_src = expand_pixel_32_1x128 (src);
2408
    xmm_alpha = expand_alpha_1x128 (xmm_src);
2409
 
2410
    while (height--)
2411
    {
2412
	dst = dst_line;
2413
 
2414
	dst_line += dst_stride;
2415
	w = width;
2416
 
2417
	while (w && (uintptr_t)dst & 15)
2418
	{
2419
	    d = *dst;
2420
	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2421
						xmm_alpha,
2422
						unpack_32_1x128 (d)));
2423
	    w--;
2424
	}
2425
 
2426
	while (w >= 4)
2427
	{
2428
	    xmm_dst = load_128_aligned ((__m128i*)dst);
2429
 
2430
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2431
 
2432
	    over_2x128 (&xmm_src, &xmm_src,
2433
			&xmm_alpha, &xmm_alpha,
2434
			&xmm_dst_lo, &xmm_dst_hi);
2435
 
2436
	    /* rebuild the 4 pixel data and save */
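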
2437
	    save_128_aligned (
2438
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2439
 
2440
	    w -= 4;
2441
	    dst += 4;
2442
	}
2443
 
2444
	while (w)
2445
	{
2446
	    d = *dst;
2447
	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2448
						xmm_alpha,
2449
						unpack_32_1x128 (d)));
2450
	    w--;
2451
	}
2452
 
2453
    }
2454
}
2455
 
2456
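/*
 * Solid source OVER an r5g6b5 destination: the same OVER as above, but the
 * destination is expanded from 565 to 8888, blended, and packed back,
 * eight pixels per SSE2 iteration.
 */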
static void
2457
sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2458
                            pixman_composite_info_t *info)
2459
{
2460
    PIXMAN_COMPOSITE_ARGS (info);
2461
    uint32_t src;
2462
    uint16_t    *dst_line, *dst, d;
2463
    int32_t w;
2464
    int dst_stride;
2465
    __m128i xmm_src, xmm_alpha;
2466
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2467
 
2468
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2469
 
2470
    if (src == 0)
2471
	return;
2472
 
2473
    PIXMAN_IMAGE_GET_LINE (
2474
	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2475
 
2476
    xmm_src = expand_pixel_32_1x128 (src);
2477
    xmm_alpha = expand_alpha_1x128 (xmm_src);
2478
 
2479
    while (height--)
2480
    {
2481
	dst = dst_line;
2482
 
2483
	dst_line += dst_stride;
2484
	w = width;
2485
 
2486
	while (w && (uintptr_t)dst & 15)
2487
	{
2488
	    d = *dst;
2489
 
2490
	    *dst++ = pack_565_32_16 (
2491
		pack_1x128_32 (over_1x128 (xmm_src,
2492
					   xmm_alpha,
2493
					   expand565_16_1x128 (d))));
2494
	    w--;
2495
	}
2496
 
2497
	while (w >= 8)
2498
	{
2499
	    xmm_dst = load_128_aligned ((__m128i*)dst);
2500
 
2501
	    unpack_565_128_4x128 (xmm_dst,
2502
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2503
 
2504
	    over_2x128 (&xmm_src, &xmm_src,
2505
			&xmm_alpha, &xmm_alpha,
2506
			&xmm_dst0, &xmm_dst1);
2507
	    over_2x128 (&xmm_src, &xmm_src,
2508
			&xmm_alpha, &xmm_alpha,
2509
			&xmm_dst2, &xmm_dst3);
2510
 
2511
	    xmm_dst = pack_565_4x128_128 (
2512
		&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2513
 
2514
	    save_128_aligned ((__m128i*)dst, xmm_dst);
2515
 
2516
	    dst += 8;
2517
	    w -= 8;
2518
	}
2519
 
2520
	while (w--)
2521
	{
2522
	    d = *dst;
2523
	    *dst++ = pack_565_32_16 (
2524
		pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2525
					   expand565_16_1x128 (d))));
2526
	}
2527
    }
2528
 
2529
}
2530
 
2531
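/*
 * Solid source with a component-alpha a8r8g8b8 mask, ADD to an a8r8g8b8
 * destination: dest = clamp (src * mask + dest).  Four-pixel blocks whose
 * mask is entirely zero are skipped without touching the destination.
 */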
static void
2532
sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2533
				   pixman_composite_info_t *info)
2534
{
2535
    PIXMAN_COMPOSITE_ARGS (info);
2536
    uint32_t src;
2537
    uint32_t    *dst_line, d;
2538
    uint32_t    *mask_line, m;
2539
    uint32_t pack_cmp;
2540
    int dst_stride, mask_stride;
2541
 
2542
    __m128i xmm_src;
2543
    __m128i xmm_dst;
2544
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2545
 
2546
    __m128i mmx_src, mmx_mask, mmx_dest;
2547
 
2548
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2549
 
2550
    if (src == 0)
2551
	return;
2552
 
2553
    PIXMAN_IMAGE_GET_LINE (
2554
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2555
    PIXMAN_IMAGE_GET_LINE (
2556
	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2557
 
2558
    xmm_src = _mm_unpacklo_epi8 (
2559
	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2560
    mmx_src   = xmm_src;
2561
 
2562
    while (height--)
2563
    {
2564
	int w = width;
2565
	const uint32_t *pm = (uint32_t *)mask_line;
2566
	uint32_t *pd = (uint32_t *)dst_line;
2567
 
2568
	dst_line += dst_stride;
2569
	mask_line += mask_stride;
2570
 
2571
	while (w && (uintptr_t)pd & 15)
2572
	{
2573
	    m = *pm++;
2574
 
2575
	    if (m)
2576
	    {
2577
		d = *pd;
2578
 
2579
		mmx_mask = unpack_32_1x128 (m);
2580
		mmx_dest = unpack_32_1x128 (d);
2581
 
2582
		*pd = pack_1x128_32 (
2583
		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2584
				   mmx_dest));
2585
	    }
2586
 
2587
	    pd++;
2588
	    w--;
2589
	}
2590
 
2591
	while (w >= 4)
2592
	{
2593
	    xmm_mask = load_128_unaligned ((__m128i*)pm);
2594
 
2595
	    pack_cmp =
2596
		_mm_movemask_epi8 (
2597
		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2598
 
2599
	    /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2600
	    if (pack_cmp != 0xffff)
2601
	    {
2602
		xmm_dst = load_128_aligned ((__m128i*)pd);
2603
 
2604
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2605
 
2606
		pix_multiply_2x128 (&xmm_src, &xmm_src,
2607
				    &xmm_mask_lo, &xmm_mask_hi,
2608
				    &xmm_mask_lo, &xmm_mask_hi);
2609
		xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2610
 
2611
		save_128_aligned (
2612
		    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2613
	    }
2614
 
2615
	    pd += 4;
2616
	    pm += 4;
2617
	    w -= 4;
2618
	}
2619
 
2620
	while (w)
2621
	{
2622
	    m = *pm++;
2623
 
2624
	    if (m)
2625
	    {
2626
		d = *pd;
2627
 
2628
		mmx_mask = unpack_32_1x128 (m);
2629
		mmx_dest = unpack_32_1x128 (d);
2630
 
2631
		*pd = pack_1x128_32 (
2632
		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2633
				   mmx_dest));
2634
	    }
2635
 
2636
	    pd++;
2637
	    w--;
2638
	}
2639
    }
2640
 
2641
}
2642
 
2643
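/*
 * Solid source with a component-alpha a8r8g8b8 mask, OVER an a8r8g8b8
 * destination: dest = src * mask + dest * (1 - mask * srca), again
 * skipping four-pixel blocks whose mask is entirely zero.
 */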
static void
2644
sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2645
                                    pixman_composite_info_t *info)
2646
{
2647
    PIXMAN_COMPOSITE_ARGS (info);
2648
    uint32_t src;
2649
    uint32_t    *dst_line, d;
2650
    uint32_t    *mask_line, m;
2651
    uint32_t pack_cmp;
2652
    int dst_stride, mask_stride;
2653
 
2654
    __m128i xmm_src, xmm_alpha;
2655
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2656
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2657
 
2658
    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2659
 
2660
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2661
 
2662
    if (src == 0)
2663
	return;
2664
 
2665
    PIXMAN_IMAGE_GET_LINE (
2666
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2667
    PIXMAN_IMAGE_GET_LINE (
2668
	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2669
 
2670
    xmm_src = _mm_unpacklo_epi8 (
2671
	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2672
    xmm_alpha = expand_alpha_1x128 (xmm_src);
2673
    mmx_src   = xmm_src;
2674
    mmx_alpha = xmm_alpha;
2675
 
2676
    while (height--)
2677
    {
2678
	int w = width;
2679
	const uint32_t *pm = (uint32_t *)mask_line;
2680
	uint32_t *pd = (uint32_t *)dst_line;
2681
 
2682
	dst_line += dst_stride;
2683
	mask_line += mask_stride;
2684
 
2685
	while (w && (uintptr_t)pd & 15)
2686
	{
2687
	    m = *pm++;
2688
 
2689
	    if (m)
2690
	    {
2691
		d = *pd;
2692
		mmx_mask = unpack_32_1x128 (m);
2693
		mmx_dest = unpack_32_1x128 (d);
2694
 
2695
		*pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2696
		                                  &mmx_alpha,
2697
		                                  &mmx_mask,
2698
		                                  &mmx_dest));
2699
	    }
2700
 
2701
	    pd++;
2702
	    w--;
2703
	}
2704
 
2705
	while (w >= 4)
2706
	{
2707
	    xmm_mask = load_128_unaligned ((__m128i*)pm);
2708
 
2709
	    pack_cmp =
2710
		_mm_movemask_epi8 (
2711
		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2712
 
2713
	    /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2714
	    if (pack_cmp != 0xffff)
2715
	    {
2716
		xmm_dst = load_128_aligned ((__m128i*)pd);
2717
 
2718
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2719
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2720
 
2721
		in_over_2x128 (&xmm_src, &xmm_src,
2722
			       &xmm_alpha, &xmm_alpha,
2723
			       &xmm_mask_lo, &xmm_mask_hi,
2724
			       &xmm_dst_lo, &xmm_dst_hi);
2725
 
2726
		save_128_aligned (
2727
		    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2728
	    }
2729
 
2730
	    pd += 4;
2731
	    pm += 4;
2732
	    w -= 4;
2733
	}
2734
 
2735
	while (w)
2736
	{
2737
	    m = *pm++;
2738
 
2739
	    if (m)
2740
	    {
2741
		d = *pd;
2742
		mmx_mask = unpack_32_1x128 (m);
2743
		mmx_dest = unpack_32_1x128 (d);
2744
 
2745
		*pd = pack_1x128_32 (
2746
		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2747
	    }
2748
 
2749
	    pd++;
2750
	    w--;
2751
	}
2752
    }
2753
 
2754
}
2755
 
2756
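/*
 * a8r8g8b8 source with a constant a8 mask (taken from the top byte of the
 * solid mask pixel) OVER an a8r8g8b8 destination.  Four-pixel source
 * blocks that are entirely zero are skipped.
 */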
static void
2757
sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2758
                                 pixman_composite_info_t *info)
2759
{
2760
    PIXMAN_COMPOSITE_ARGS (info);
2761
    uint32_t    *dst_line, *dst;
2762
    uint32_t    *src_line, *src;
2763
    uint32_t mask;
2764
    int32_t w;
2765
    int dst_stride, src_stride;
2766
 
2767
    __m128i xmm_mask;
2768
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2769
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2770
    __m128i xmm_alpha_lo, xmm_alpha_hi;
2771
 
2772
    PIXMAN_IMAGE_GET_LINE (
2773
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2774
    PIXMAN_IMAGE_GET_LINE (
2775
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2776
 
2777
    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2778
 
2779
    xmm_mask = create_mask_16_128 (mask >> 24);
2780
 
2781
    while (height--)
2782
    {
2783
	dst = dst_line;
2784
	dst_line += dst_stride;
2785
	src = src_line;
2786
	src_line += src_stride;
2787
	w = width;
2788
 
2789
	while (w && (uintptr_t)dst & 15)
2790
	{
2791
	    uint32_t s = *src++;
2792
 
2793
	    if (s)
2794
	    {
2795
		uint32_t d = *dst;
2796
 
2797
		__m128i ms = unpack_32_1x128 (s);
2798
		__m128i alpha    = expand_alpha_1x128 (ms);
2799
		__m128i dest     = xmm_mask;
2800
		__m128i alpha_dst = unpack_32_1x128 (d);
2801
 
2802
		*dst = pack_1x128_32 (
2803
		    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
2804
	    }
2805
	    dst++;
2806
	    w--;
2807
	}
2808
 
2809
	while (w >= 4)
2810
	{
2811
	    xmm_src = load_128_unaligned ((__m128i*)src);
2812
 
2813
	    if (!is_zero (xmm_src))
2814
	    {
2815
		xmm_dst = load_128_aligned ((__m128i*)dst);
2816
 
2817
		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2818
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2819
		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2820
				    &xmm_alpha_lo, &xmm_alpha_hi);
2821
 
2822
		in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2823
			       &xmm_alpha_lo, &xmm_alpha_hi,
2824
			       &xmm_mask, &xmm_mask,
2825
			       &xmm_dst_lo, &xmm_dst_hi);
2826
 
2827
		save_128_aligned (
2828
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2829
	    }
2830
 
2831
	    dst += 4;
2832
	    src += 4;
2833
	    w -= 4;
2834
	}
2835
 
2836
	while (w)
2837
	{
2838
	    uint32_t s = *src++;
2839
 
2840
	    if (s)
2841
	    {
2842
		uint32_t d = *dst;
2843
 
2844
		__m128i ms = unpack_32_1x128 (s);
2845
		__m128i alpha = expand_alpha_1x128 (ms);
2846
		__m128i mask  = xmm_mask;
2847
		__m128i dest  = unpack_32_1x128 (d);
2848
 
2849
		*dst = pack_1x128_32 (
2850
		    in_over_1x128 (&ms, &alpha, &mask, &dest));
2851
	    }
2852
 
2853
	    dst++;
2854
	    w--;
2855
	}
2856
    }
2857
 
2858
}
2859
 
2860
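/*
 * SRC conversion from x8r8g8b8 to r5g6b5: each pixel is simply repacked,
 * eight destination pixels per SSE2 iteration.
 */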
static void
2861
sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
2862
                              pixman_composite_info_t *info)
2863
{
2864
    PIXMAN_COMPOSITE_ARGS (info);
2865
    uint16_t    *dst_line, *dst;
2866
    uint32_t    *src_line, *src, s;
2867
    int dst_stride, src_stride;
2868
    int32_t w;
2869
 
2870
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2871
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2872
 
2873
    while (height--)
2874
    {
2875
	dst = dst_line;
2876
	dst_line += dst_stride;
2877
	src = src_line;
2878
	src_line += src_stride;
2879
	w = width;
2880
 
2881
	while (w && (uintptr_t)dst & 15)
2882
	{
2883
	    s = *src++;
2884
	    *dst = convert_8888_to_0565 (s);
2885
	    dst++;
2886
	    w--;
2887
	}
2888
 
2889
	while (w >= 8)
2890
	{
2891
	    __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
2892
	    __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
2893
 
2894
	    save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
2895
 
2896
	    w -= 8;
2897
	    src += 8;
2898
	    dst += 8;
2899
	}
2900
 
2901
	while (w)
2902
	{
2903
	    s = *src++;
2904
	    *dst = convert_8888_to_0565 (s);
2905
	    dst++;
2906
	    w--;
2907
	}
2908
    }
2909
}
2910
 
2911
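/*
 * SRC copy from x8r8g8b8 to a8r8g8b8: the pixels are copied with the alpha
 * byte forced to 0xff, sixteen pixels per SSE2 iteration.
 */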
static void
2912
sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2913
			      pixman_composite_info_t *info)
2914
{
2915
    PIXMAN_COMPOSITE_ARGS (info);
2916
    uint32_t    *dst_line, *dst;
2917
    uint32_t    *src_line, *src;
2918
    int32_t w;
2919
    int dst_stride, src_stride;
2920
 
2921
 
2922
    PIXMAN_IMAGE_GET_LINE (
2923
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2924
    PIXMAN_IMAGE_GET_LINE (
2925
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2926
 
2927
    while (height--)
2928
    {
2929
	dst = dst_line;
2930
	dst_line += dst_stride;
2931
	src = src_line;
2932
	src_line += src_stride;
2933
	w = width;
2934
 
2935
	while (w && (uintptr_t)dst & 15)
2936
	{
2937
	    *dst++ = *src++ | 0xff000000;
2938
	    w--;
2939
	}
2940
 
2941
	while (w >= 16)
2942
	{
2943
	    __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2944
 
2945
	    xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2946
	    xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2947
	    xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2948
	    xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2949
 
2950
	    save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2951
	    save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2952
	    save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2953
	    save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2954
 
2955
	    dst += 16;
2956
	    src += 16;
2957
	    w -= 16;
2958
	}
2959
 
2960
	while (w)
2961
	{
2962
	    *dst++ = *src++ | 0xff000000;
2963
	    w--;
2964
	}
2965
    }
2966
 
2967
}
2968
 
2969
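/*
 * x8r8g8b8 source (alpha forced to 0xff) with a constant a8 mask OVER an
 * a8r8g8b8 destination; since the source is effectively opaque, the
 * expanded source alpha is simply the constant mask_00ff.
 */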
static void
2970
sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2971
                                 pixman_composite_info_t *info)
2972
{
2973
    PIXMAN_COMPOSITE_ARGS (info);
2974
    uint32_t    *dst_line, *dst;
2975
    uint32_t    *src_line, *src;
2976
    uint32_t mask;
2977
    int dst_stride, src_stride;
2978
    int32_t w;
2979
 
2980
    __m128i xmm_mask, xmm_alpha;
2981
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2982
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2983
 
2984
    PIXMAN_IMAGE_GET_LINE (
2985
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2986
    PIXMAN_IMAGE_GET_LINE (
2987
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2988
 
2989
    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2990
 
2991
    xmm_mask = create_mask_16_128 (mask >> 24);
2992
    xmm_alpha = mask_00ff;
2993
 
2994
    while (height--)
2995
    {
2996
	dst = dst_line;
2997
	dst_line += dst_stride;
2998
	src = src_line;
2999
	src_line += src_stride;
3000
	w = width;
3001
 
3002
	while (w && (uintptr_t)dst & 15)
3003
	{
3004
	    uint32_t s = (*src++) | 0xff000000;
3005
	    uint32_t d = *dst;
3006
 
3007
	    __m128i src   = unpack_32_1x128 (s);
3008
	    __m128i alpha = xmm_alpha;
3009
	    __m128i mask  = xmm_mask;
3010
	    __m128i dest  = unpack_32_1x128 (d);
3011
 
3012
	    *dst++ = pack_1x128_32 (
3013
		in_over_1x128 (&src, &alpha, &mask, &dest));
3014
 
3015
	    w--;
3016
	}
3017
 
3018
	while (w >= 4)
3019
	{
3020
	    xmm_src = _mm_or_si128 (
3021
		load_128_unaligned ((__m128i*)src), mask_ff000000);
3022
	    xmm_dst = load_128_aligned ((__m128i*)dst);
3023
 
3024
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3025
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3026
 
3027
	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3028
			   &xmm_alpha, &xmm_alpha,
3029
			   &xmm_mask, &xmm_mask,
3030
			   &xmm_dst_lo, &xmm_dst_hi);
3031
 
3032
	    save_128_aligned (
3033
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3034
 
3035
	    dst += 4;
3036
	    src += 4;
3037
	    w -= 4;
3038
 
3039
	}
3040
 
3041
	while (w)
3042
	{
3043
	    uint32_t s = (*src++) | 0xff000000;
3044
	    uint32_t d = *dst;
3045
 
3046
	    __m128i src  = unpack_32_1x128 (s);
3047
	    __m128i alpha = xmm_alpha;
3048
	    __m128i mask  = xmm_mask;
3049
	    __m128i dest  = unpack_32_1x128 (d);
3050
 
3051
	    *dst++ = pack_1x128_32 (
3052
		in_over_1x128 (&src, &alpha, &mask, &dest));
3053
 
3054
	    w--;
3055
	}
3056
    }
3057
 
3058
}
3059
 
3060
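/*
 * a8r8g8b8 OVER a8r8g8b8 with no mask: each scanline is handed to the
 * generic sse2_combine_over_u () combiner.
 */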
static void
3061
sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3062
                               pixman_composite_info_t *info)
3063
{
3064
    PIXMAN_COMPOSITE_ARGS (info);
3065
    int dst_stride, src_stride;
3066
    uint32_t    *dst_line, *dst;
3067
    uint32_t    *src_line, *src;
3068
 
3069
    PIXMAN_IMAGE_GET_LINE (
3070
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3071
    PIXMAN_IMAGE_GET_LINE (
3072
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3073
 
3074
    dst = dst_line;
3075
    src = src_line;
3076
 
3077
    while (height--)
3078
    {
3079
	sse2_combine_over_u (imp, op, dst, src, NULL, width);
3080
 
3081
	dst += dst_stride;
3082
	src += src_stride;
3083
    }
3084
}
3085
 
3086
static force_inline uint16_t
3087
composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3088
{
3089
    __m128i ms;
3090
 
3091
    ms = unpack_32_1x128 (src);
3092
    return pack_565_32_16 (
3093
	pack_1x128_32 (
3094
	    over_1x128 (
3095
		ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3096
}
3097
 
3098
static void
3099
sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3100
                               pixman_composite_info_t *info)
3101
{
3102
    PIXMAN_COMPOSITE_ARGS (info);
3103
    uint16_t    *dst_line, *dst, d;
3104
    uint32_t    *src_line, *src, s;
3105
    int dst_stride, src_stride;
3106
    int32_t w;
3107
 
3108
    __m128i xmm_alpha_lo, xmm_alpha_hi;
3109
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3110
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3111
 
3112
    PIXMAN_IMAGE_GET_LINE (
3113
	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3114
    PIXMAN_IMAGE_GET_LINE (
3115
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3116
 
3117
    while (height--)
3118
    {
3119
	dst = dst_line;
3120
	src = src_line;
3121
 
3122
	dst_line += dst_stride;
3123
	src_line += src_stride;
3124
	w = width;
3125
 
3126
	/* Align dst on a 16-byte boundary */
3127
	while (w &&
3128
	       ((uintptr_t)dst & 15))
3129
	{
3130
	    s = *src++;
3131
	    d = *dst;
3132
 
3133
	    *dst++ = composite_over_8888_0565pixel (s, d);
3134
	    w--;
3135
	}
3136
 
3137
	/* It's an 8 pixel loop */
3138
	while (w >= 8)
3139
	{
3140
	    /* I'm loading unaligned because I'm not sure
3141
	     * about the address alignment.
3142
	     */
3143
	    xmm_src = load_128_unaligned ((__m128i*) src);
3144
	    xmm_dst = load_128_aligned ((__m128i*) dst);
3145
 
3146
	    /* Unpacking */
3147
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3148
	    unpack_565_128_4x128 (xmm_dst,
3149
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3150
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3151
				&xmm_alpha_lo, &xmm_alpha_hi);
3152
 
3153
	    /* I'm loading the next 4 pixels from memory
3154
	     * ahead of time to optimize the memory read.
3155
	     */
3156
	    xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3157
 
3158
	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
3159
			&xmm_alpha_lo, &xmm_alpha_hi,
3160
			&xmm_dst0, &xmm_dst1);
3161
 
3162
	    /* Unpacking */
3163
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3164
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3165
				&xmm_alpha_lo, &xmm_alpha_hi);
3166
 
3167
	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
3168
			&xmm_alpha_lo, &xmm_alpha_hi,
3169
			&xmm_dst2, &xmm_dst3);
3170
 
3171
	    save_128_aligned (
3172
		(__m128i*)dst, pack_565_4x128_128 (
3173
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3174
 
3175
	    w -= 8;
3176
	    dst += 8;
3177
	    src += 8;
3178
	}
3179
 
3180
	while (w--)
3181
	{
3182
	    s = *src++;
3183
	    d = *dst;
3184
 
3185
	    *dst++ = composite_over_8888_0565pixel (s, d);
3186
	}
3187
    }
3188
 
3189
}
3190
 
3191
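/*
 * Solid source with an a8 mask OVER an a8r8g8b8 destination.  A block of
 * four 0xff mask bytes with an opaque source is stored directly;
 * otherwise the mask is expanded per channel and
 * dest = src * m + dest * (1 - srca * m) is applied.
 */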
static void
3192
sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3193
                              pixman_composite_info_t *info)
3194
{
3195
    PIXMAN_COMPOSITE_ARGS (info);
3196
    uint32_t src, srca;
3197
    uint32_t *dst_line, *dst;
3198
    uint8_t *mask_line, *mask;
3199
    int dst_stride, mask_stride;
3200
    int32_t w;
3201
    uint32_t m, d;
3202
 
3203
    __m128i xmm_src, xmm_alpha, xmm_def;
3204
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3205
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3206
 
3207
    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3208
 
3209
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3210
 
3211
    srca = src >> 24;
3212
    if (src == 0)
3213
	return;
3214
 
3215
    PIXMAN_IMAGE_GET_LINE (
3216
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3217
    PIXMAN_IMAGE_GET_LINE (
3218
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3219
 
3220
    xmm_def = create_mask_2x32_128 (src, src);
3221
    xmm_src = expand_pixel_32_1x128 (src);
3222
    xmm_alpha = expand_alpha_1x128 (xmm_src);
3223
    mmx_src   = xmm_src;
3224
    mmx_alpha = xmm_alpha;
3225
 
3226
    while (height--)
3227
    {
3228
	dst = dst_line;
3229
	dst_line += dst_stride;
3230
	mask = mask_line;
3231
	mask_line += mask_stride;
3232
	w = width;
3233
 
3234
	while (w && (uintptr_t)dst & 15)
3235
	{
3236
	    uint8_t m = *mask++;
3237
 
3238
	    if (m)
3239
	    {
3240
		d = *dst;
3241
		mmx_mask = expand_pixel_8_1x128 (m);
3242
		mmx_dest = unpack_32_1x128 (d);
3243
 
3244
		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3245
		                                   &mmx_alpha,
3246
		                                   &mmx_mask,
3247
		                                   &mmx_dest));
3248
	    }
3249
 
3250
	    w--;
3251
	    dst++;
3252
	}
3253
 
3254
	while (w >= 4)
3255
	{
3256
	    m = *((uint32_t*)mask);
3257
 
3258
	    if (srca == 0xff && m == 0xffffffff)
3259
	    {
3260
		save_128_aligned ((__m128i*)dst, xmm_def);
3261
	    }
3262
	    else if (m)
3263
	    {
3264
		xmm_dst = load_128_aligned ((__m128i*) dst);
3265
		xmm_mask = unpack_32_1x128 (m);
3266
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3267
 
3268
		/* Unpacking */
3269
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3270
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3271
 
3272
		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3273
					&xmm_mask_lo, &xmm_mask_hi);
3274
 
3275
		in_over_2x128 (&xmm_src, &xmm_src,
3276
			       &xmm_alpha, &xmm_alpha,
3277
			       &xmm_mask_lo, &xmm_mask_hi,
3278
			       &xmm_dst_lo, &xmm_dst_hi);
3279
 
3280
		save_128_aligned (
3281
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3282
	    }
3283
 
3284
	    w -= 4;
3285
	    dst += 4;
3286
	    mask += 4;
3287
	}
3288
 
3289
	while (w)
3290
	{
3291
	    uint8_t m = *mask++;
3292
 
3293
	    if (m)
3294
	    {
3295
		d = *dst;
3296
		mmx_mask = expand_pixel_8_1x128 (m);
3297
		mmx_dest = unpack_32_1x128 (d);
3298
 
3299
		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3300
		                                   &mmx_alpha,
3301
		                                   &mmx_mask,
3302
		                                   &mmx_dest));
3303
	    }
3304
 
3305
	    w--;
3306
	    dst++;
3307
	}
3308
    }
3309
 
3310
}
3311
 
3312
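/*
 * Solid fill for 8, 16 and 32 bpp destinations.  Each row is filled with
 * narrow stores until 16-byte alignment is reached, then in 128/64/32/16
 * byte aligned blocks, with any remainder finished by narrow stores again.
 */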
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
3313
__attribute__((__force_align_arg_pointer__))
3314
#endif
3315
static pixman_bool_t
3316
sse2_fill (pixman_implementation_t *imp,
3317
           uint32_t *               bits,
3318
           int                      stride,
3319
           int                      bpp,
3320
           int                      x,
3321
           int                      y,
3322
           int                      width,
3323
           int                      height,
3324
           uint32_t		    filler)
3325
{
3326
    uint32_t byte_width;
3327
    uint8_t *byte_line;
3328
 
3329
    __m128i xmm_def;
3330
 
3331
    if (bpp == 8)
3332
    {
3333
	uint8_t b;
3334
	uint16_t w;
3335
 
3336
	stride = stride * (int) sizeof (uint32_t) / 1;
3337
	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3338
	byte_width = width;
3339
	stride *= 1;
3340
 
3341
	b = filler & 0xff;
3342
	w = (b << 8) | b;
3343
	filler = (w << 16) | w;
3344
    }
3345
    else if (bpp == 16)
3346
    {
3347
	stride = stride * (int) sizeof (uint32_t) / 2;
3348
	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3349
	byte_width = 2 * width;
3350
	stride *= 2;
3351
 
3352
        filler = (filler & 0xffff) * 0x00010001;
3353
    }
3354
    else if (bpp == 32)
3355
    {
3356
	stride = stride * (int) sizeof (uint32_t) / 4;
3357
	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3358
	byte_width = 4 * width;
3359
	stride *= 4;
3360
    }
3361
    else
3362
    {
3363
	return FALSE;
3364
    }
3365
 
3366
    xmm_def = create_mask_2x32_128 (filler, filler);
3367
 
3368
    while (height--)
3369
    {
3370
	int w;
3371
	uint8_t *d = byte_line;
3372
	byte_line += stride;
3373
	w = byte_width;
3374
 
3375
	if (w >= 1 && ((uintptr_t)d & 1))
3376
	{
3377
	    *(uint8_t *)d = filler;
3378
	    w -= 1;
3379
	    d += 1;
3380
	}
3381
 
3382
	while (w >= 2 && ((uintptr_t)d & 3))
3383
	{
3384
	    *(uint16_t *)d = filler;
3385
	    w -= 2;
3386
	    d += 2;
3387
	}
3388
 
3389
	while (w >= 4 && ((uintptr_t)d & 15))
3390
	{
3391
	    *(uint32_t *)d = filler;
3392
 
3393
	    w -= 4;
3394
	    d += 4;
3395
	}
3396
 
3397
	while (w >= 128)
3398
	{
3399
	    save_128_aligned ((__m128i*)(d),     xmm_def);
3400
	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3401
	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3402
	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3403
	    save_128_aligned ((__m128i*)(d + 64),  xmm_def);
3404
	    save_128_aligned ((__m128i*)(d + 80),  xmm_def);
3405
	    save_128_aligned ((__m128i*)(d + 96),  xmm_def);
3406
	    save_128_aligned ((__m128i*)(d + 112), xmm_def);
3407
 
3408
	    d += 128;
3409
	    w -= 128;
3410
	}
3411
 
3412
	if (w >= 64)
3413
	{
3414
	    save_128_aligned ((__m128i*)(d),     xmm_def);
3415
	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3416
	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3417
	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3418
 
3419
	    d += 64;
3420
	    w -= 64;
3421
	}
3422
 
3423
	if (w >= 32)
3424
	{
3425
	    save_128_aligned ((__m128i*)(d),     xmm_def);
3426
	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3427
 
3428
	    d += 32;
3429
	    w -= 32;
3430
	}
3431
 
3432
	if (w >= 16)
3433
	{
3434
	    save_128_aligned ((__m128i*)(d),     xmm_def);
3435
 
3436
	    d += 16;
3437
	    w -= 16;
3438
	}
3439
 
3440
	while (w >= 4)
3441
	{
3442
	    *(uint32_t *)d = filler;
3443
 
3444
	    w -= 4;
3445
	    d += 4;
3446
	}
3447
 
3448
	if (w >= 2)
3449
	{
3450
	    *(uint16_t *)d = filler;
3451
	    w -= 2;
3452
	    d += 2;
3453
	}
3454
 
3455
	if (w >= 1)
3456
	{
3457
	    *(uint8_t *)d = filler;
3458
	    w -= 1;
3459
	    d += 1;
3460
	}
3461
    }
3462
 
3463
    return TRUE;
3464
}
3465
 
3466
static void
3467
sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3468
                             pixman_composite_info_t *info)
3469
{
3470
    PIXMAN_COMPOSITE_ARGS (info);
3471
    uint32_t src, srca;
3472
    uint32_t    *dst_line, *dst;
3473
    uint8_t     *mask_line, *mask;
3474
    int dst_stride, mask_stride;
3475
    int32_t w;
3476
    uint32_t m;
3477
 
3478
    __m128i xmm_src, xmm_def;
3479
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3480
 
3481
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3482
 
3483
    srca = src >> 24;
3484
    if (src == 0)
3485
    {
3486
	sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
3487
		   PIXMAN_FORMAT_BPP (dest_image->bits.format),
3488
		   dest_x, dest_y, width, height, 0);
3489
	return;
3490
    }
3491
 
3492
    PIXMAN_IMAGE_GET_LINE (
3493
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3494
    PIXMAN_IMAGE_GET_LINE (
3495
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3496
 
3497
    xmm_def = create_mask_2x32_128 (src, src);
3498
    xmm_src = expand_pixel_32_1x128 (src);
3499
 
3500
    while (height--)
3501
    {
3502
	dst = dst_line;
3503
	dst_line += dst_stride;
3504
	mask = mask_line;
3505
	mask_line += mask_stride;
3506
	w = width;
3507
 
3508
	while (w && (uintptr_t)dst & 15)
3509
	{
3510
	    uint8_t m = *mask++;
3511
 
3512
	    if (m)
3513
	    {
3514
		*dst = pack_1x128_32 (
3515
		    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3516
	    }
3517
	    else
3518
	    {
3519
		*dst = 0;
3520
	    }
3521
 
3522
	    w--;
3523
	    dst++;
3524
	}
3525
 
3526
	while (w >= 4)
3527
	{
3528
	    m = *((uint32_t*)mask);
3529
 
3530
	    if (srca == 0xff && m == 0xffffffff)
3531
	    {
3532
		save_128_aligned ((__m128i*)dst, xmm_def);
3533
	    }
3534
	    else if (m)
3535
	    {
3536
		xmm_mask = unpack_32_1x128 (m);
3537
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3538
 
3539
		/* Unpacking */
3540
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3541
 
3542
		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3543
					&xmm_mask_lo, &xmm_mask_hi);
3544
 
3545
		pix_multiply_2x128 (&xmm_src, &xmm_src,
3546
				    &xmm_mask_lo, &xmm_mask_hi,
3547
				    &xmm_mask_lo, &xmm_mask_hi);
3548
 
3549
		save_128_aligned (
3550
		    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3551
	    }
3552
	    else
3553
	    {
3554
		save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3555
	    }
3556
 
3557
	    w -= 4;
3558
	    dst += 4;
3559
	    mask += 4;
3560
	}
3561
 
3562
	while (w)
3563
	{
3564
	    uint8_t m = *mask++;
3565
 
3566
	    if (m)
3567
	    {
3568
		*dst = pack_1x128_32 (
3569
		    pix_multiply_1x128 (
3570
			xmm_src, expand_pixel_8_1x128 (m)));
3571
	    }
3572
	    else
3573
	    {
3574
		*dst = 0;
3575
	    }
3576
 
3577
	    w--;
3578
	    dst++;
3579
	}
3580
    }
3581
 
3582
}
3583
 
3584
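/*
 * Solid source with an a8 mask OVER an r5g6b5 destination: the same
 * in_over blend as the a8r8g8b8 case, with the destination expanded from
 * and packed back to 565, eight pixels per iteration.
 */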
static void
3585
sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3586
                              pixman_composite_info_t *info)
3587
{
3588
    PIXMAN_COMPOSITE_ARGS (info);
3589
    uint32_t src;
3590
    uint16_t    *dst_line, *dst, d;
3591
    uint8_t     *mask_line, *mask;
3592
    int dst_stride, mask_stride;
3593
    int32_t w;
3594
    uint32_t m;
3595
    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3596
 
3597
    __m128i xmm_src, xmm_alpha;
3598
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3599
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3600
 
3601
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3602
 
3603
    if (src == 0)
3604
	return;
3605
 
3606
    PIXMAN_IMAGE_GET_LINE (
3607
	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3608
    PIXMAN_IMAGE_GET_LINE (
3609
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3610
 
3611
    xmm_src = expand_pixel_32_1x128 (src);
3612
    xmm_alpha = expand_alpha_1x128 (xmm_src);
3613
    mmx_src = xmm_src;
3614
    mmx_alpha = xmm_alpha;
3615
 
3616
    while (height--)
3617
    {
3618
	dst = dst_line;
3619
	dst_line += dst_stride;
3620
	mask = mask_line;
3621
	mask_line += mask_stride;
3622
	w = width;
3623
 
3624
	while (w && (uintptr_t)dst & 15)
3625
	{
3626
	    m = *mask++;
3627
 
3628
	    if (m)
3629
	    {
3630
		d = *dst;
3631
		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3632
		mmx_dest = expand565_16_1x128 (d);
3633
 
3634
		*dst = pack_565_32_16 (
3635
		    pack_1x128_32 (
3636
			in_over_1x128 (
3637
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3638
	    }
3639
 
3640
	    w--;
3641
	    dst++;
3642
	}
3643
 
3644
	while (w >= 8)
3645
	{
3646
	    xmm_dst = load_128_aligned ((__m128i*) dst);
3647
	    unpack_565_128_4x128 (xmm_dst,
3648
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3649
 
3650
	    m = *((uint32_t*)mask);
3651
	    mask += 4;
3652
 
3653
	    if (m)
3654
	    {
3655
		xmm_mask = unpack_32_1x128 (m);
3656
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3657
 
3658
		/* Unpacking */
3659
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3660
 
3661
		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3662
					&xmm_mask_lo, &xmm_mask_hi);
3663
 
3664
		in_over_2x128 (&xmm_src, &xmm_src,
3665
			       &xmm_alpha, &xmm_alpha,
3666
			       &xmm_mask_lo, &xmm_mask_hi,
3667
			       &xmm_dst0, &xmm_dst1);
3668
	    }
3669
 
3670
	    m = *((uint32_t*)mask);
3671
	    mask += 4;
3672
 
3673
	    if (m)
3674
	    {
3675
		xmm_mask = unpack_32_1x128 (m);
3676
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3677
 
3678
		/* Unpacking */
3679
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3680
 
3681
		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3682
					&xmm_mask_lo, &xmm_mask_hi);
3683
		in_over_2x128 (&xmm_src, &xmm_src,
3684
			       &xmm_alpha, &xmm_alpha,
3685
			       &xmm_mask_lo, &xmm_mask_hi,
3686
			       &xmm_dst2, &xmm_dst3);
3687
	    }
3688
 
3689
	    save_128_aligned (
3690
		(__m128i*)dst, pack_565_4x128_128 (
3691
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3692
 
3693
	    w -= 8;
3694
	    dst += 8;
3695
	}
3696
 
3697
	while (w)
3698
	{
3699
	    m = *mask++;
3700
 
3701
	    if (m)
3702
	    {
3703
		d = *dst;
3704
		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3705
		mmx_dest = expand565_16_1x128 (d);
3706
 
3707
		*dst = pack_565_32_16 (
3708
		    pack_1x128_32 (
3709
			in_over_1x128 (
3710
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3711
	    }
3712
 
3713
	    w--;
3714
	    dst++;
3715
	}
3716
    }
3717
 
3718
}
3719
 
3720
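/*
 * OVER from a non-premultiplied ("pixbuf") source onto an r5g6b5
 * destination: fully opaque four-pixel blocks only have their colour
 * channels reordered, fully transparent blocks leave the destination
 * untouched, and everything else goes through over_rev_non_pre (), which
 * premultiplies before blending.
 */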
static void
3721
sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3722
                                 pixman_composite_info_t *info)
3723
{
3724
    PIXMAN_COMPOSITE_ARGS (info);
3725
    uint16_t    *dst_line, *dst, d;
3726
    uint32_t    *src_line, *src, s;
3727
    int dst_stride, src_stride;
3728
    int32_t w;
3729
    uint32_t opaque, zero;
3730
 
3731
    __m128i ms;
3732
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3733
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3734
 
3735
    PIXMAN_IMAGE_GET_LINE (
3736
	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3737
    PIXMAN_IMAGE_GET_LINE (
3738
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3739
 
3740
    while (height--)
3741
    {
3742
	dst = dst_line;
3743
	dst_line += dst_stride;
3744
	src = src_line;
3745
	src_line += src_stride;
3746
	w = width;
3747
 
3748
	while (w && (uintptr_t)dst & 15)
3749
	{
3750
	    s = *src++;
3751
	    d = *dst;
3752
 
3753
	    ms = unpack_32_1x128 (s);
3754
 
3755
	    *dst++ = pack_565_32_16 (
3756
		pack_1x128_32 (
3757
		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3758
	    w--;
3759
	}
3760
 
3761
	while (w >= 8)
3762
	{
3763
	    /* First round */
3764
	    xmm_src = load_128_unaligned ((__m128i*)src);
3765
	    xmm_dst = load_128_aligned  ((__m128i*)dst);
3766
 
3767
	    opaque = is_opaque (xmm_src);
3768
	    zero = is_zero (xmm_src);
3769
 
3770
	    unpack_565_128_4x128 (xmm_dst,
3771
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3772
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3773
 
3774
	    /* preload next round */
3775
	    xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3776
 
3777
	    if (opaque)
3778
	    {
3779
		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3780
				     &xmm_dst0, &xmm_dst1);
3781
	    }
3782
	    else if (!zero)
3783
	    {
3784
		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3785
					&xmm_dst0, &xmm_dst1);
3786
	    }
3787
 
3788
	    /* Second round */
3789
	    opaque = is_opaque (xmm_src);
3790
	    zero = is_zero (xmm_src);
3791
 
3792
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3793
 
3794
	    if (opaque)
3795
	    {
3796
		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3797
				     &xmm_dst2, &xmm_dst3);
3798
	    }
3799
	    else if (!zero)
3800
	    {
3801
		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3802
					&xmm_dst2, &xmm_dst3);
3803
	    }
3804
 
3805
	    save_128_aligned (
3806
		(__m128i*)dst, pack_565_4x128_128 (
3807
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3808
 
3809
	    w -= 8;
3810
	    src += 8;
3811
	    dst += 8;
3812
	}
3813
 
3814
	while (w)
3815
	{
3816
	    s = *src++;
3817
	    d = *dst;
3818
 
3819
	    ms = unpack_32_1x128 (s);
3820
 
3821
	    *dst++ = pack_565_32_16 (
3822
		pack_1x128_32 (
3823
		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3824
	    w--;
3825
	}
3826
    }
3827
 
3828
}
3829
 
3830
static void
3831
sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3832
                                 pixman_composite_info_t *info)
3833
{
3834
    PIXMAN_COMPOSITE_ARGS (info);
3835
    uint32_t    *dst_line, *dst, d;
3836
    uint32_t    *src_line, *src, s;
3837
    int dst_stride, src_stride;
3838
    int32_t w;
3839
    uint32_t opaque, zero;
3840
 
3841
    __m128i xmm_src_lo, xmm_src_hi;
3842
    __m128i xmm_dst_lo, xmm_dst_hi;
3843
 
3844
    PIXMAN_IMAGE_GET_LINE (
3845
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3846
    PIXMAN_IMAGE_GET_LINE (
3847
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3848
 
3849
    while (height--)
3850
    {
3851
	dst = dst_line;
3852
	dst_line += dst_stride;
3853
	src = src_line;
3854
	src_line += src_stride;
3855
	w = width;
3856
 
3857
	while (w && (uintptr_t)dst & 15)
3858
	{
3859
	    s = *src++;
3860
	    d = *dst;
3861
 
3862
	    *dst++ = pack_1x128_32 (
3863
		over_rev_non_pre_1x128 (
3864
		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
3865
 
3866
	    w--;
3867
	}
3868
 
3869
	while (w >= 4)
3870
	{
3871
	    xmm_src_hi = load_128_unaligned ((__m128i*)src);
3872
 
3873
	    opaque = is_opaque (xmm_src_hi);
3874
	    zero = is_zero (xmm_src_hi);
3875
 
3876
	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3877
 
3878
	    if (opaque)
3879
	    {
3880
		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3881
				     &xmm_dst_lo, &xmm_dst_hi);
3882
 
3883
		save_128_aligned (
3884
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3885
	    }
3886
	    else if (!zero)
3887
	    {
3888
		xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
3889
 
3890
		unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3891
 
3892
		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3893
					&xmm_dst_lo, &xmm_dst_hi);
3894
 
3895
		save_128_aligned (
3896
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3897
	    }
3898
 
3899
	    w -= 4;
3900
	    dst += 4;
3901
	    src += 4;
3902
	}
3903
 
3904
	while (w)
3905
	{
3906
	    s = *src++;
3907
	    d = *dst;
3908
 
3909
	    *dst++ = pack_1x128_32 (
3910
		over_rev_non_pre_1x128 (
3911
		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
3912
 
3913
	    w--;
3914
	}
3915
    }
3916
 
3917
}
3918
 
3919
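/*
 * Solid source with a component-alpha a8r8g8b8 mask OVER an r5g6b5
 * destination.  Eight destination pixels are processed per iteration in
 * two four-pixel rounds; a round whose mask is entirely zero is left
 * untouched.
 */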
static void
3920
sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3921
                                    pixman_composite_info_t *info)
3922
{
3923
    PIXMAN_COMPOSITE_ARGS (info);
3924
    uint32_t src;
3925
    uint16_t    *dst_line, *dst, d;
3926
    uint32_t    *mask_line, *mask, m;
3927
    int dst_stride, mask_stride;
3928
    int w;
3929
    uint32_t pack_cmp;
3930
 
3931
    __m128i xmm_src, xmm_alpha;
3932
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3933
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3934
 
3935
    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3936
 
3937
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3938
 
3939
    if (src == 0)
3940
	return;
3941
 
3942
    PIXMAN_IMAGE_GET_LINE (
3943
	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3944
    PIXMAN_IMAGE_GET_LINE (
3945
	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3946
 
3947
    xmm_src = expand_pixel_32_1x128 (src);
3948
    xmm_alpha = expand_alpha_1x128 (xmm_src);
3949
    mmx_src = xmm_src;
3950
    mmx_alpha = xmm_alpha;
3951
 
3952
    while (height--)
3953
    {
3954
	w = width;
3955
	mask = mask_line;
3956
	dst = dst_line;
3957
	mask_line += mask_stride;
3958
	dst_line += dst_stride;
3959
 
3960
	while (w && ((uintptr_t)dst & 15))
3961
	{
3962
	    m = *(uint32_t *) mask;
3963
 
3964
	    if (m)
3965
	    {
3966
		d = *dst;
3967
		mmx_mask = unpack_32_1x128 (m);
3968
		mmx_dest = expand565_16_1x128 (d);
3969
 
3970
		*dst = pack_565_32_16 (
3971
		    pack_1x128_32 (
3972
			in_over_1x128 (
3973
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3974
	    }
3975
 
3976
	    w--;
3977
	    dst++;
3978
	    mask++;
3979
	}
3980
 
3981
	while (w >= 8)
3982
	{
3983
	    /* First round */
3984
	    xmm_mask = load_128_unaligned ((__m128i*)mask);
3985
	    xmm_dst = load_128_aligned ((__m128i*)dst);
3986
 
3987
	    pack_cmp = _mm_movemask_epi8 (
3988
		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3989
 
3990
	    unpack_565_128_4x128 (xmm_dst,
3991
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3992
	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3993
 
3994
	    /* preload next round */
3995
	    xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
3996
 
3997
3998
	    if (pack_cmp != 0xffff)
3999
	    {
4000
		in_over_2x128 (&xmm_src, &xmm_src,
4001
			       &xmm_alpha, &xmm_alpha,
4002
			       &xmm_mask_lo, &xmm_mask_hi,
4003
			       &xmm_dst0, &xmm_dst1);
4004
	    }
4005
 
4006
	    /* Second round */
4007
	    pack_cmp = _mm_movemask_epi8 (
4008
		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4009
 
4010
	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4011
 
4012
	    if (pack_cmp != 0xffff)
4013
	    {
4014
		in_over_2x128 (&xmm_src, &xmm_src,
4015
			       &xmm_alpha, &xmm_alpha,
4016
			       &xmm_mask_lo, &xmm_mask_hi,
4017
			       &xmm_dst2, &xmm_dst3);
4018
	    }
4019
 
4020
	    save_128_aligned (
4021
		(__m128i*)dst, pack_565_4x128_128 (
4022
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4023
 
4024
	    w -= 8;
4025
	    dst += 8;
4026
	    mask += 8;
4027
	}
4028
 
4029
	while (w)
4030
	{
4031
	    m = *(uint32_t *) mask;
4032
 
4033
	    if (m)
4034
	    {
4035
		d = *dst;
4036
		mmx_mask = unpack_32_1x128 (m);
4037
		mmx_dest = expand565_16_1x128 (d);
4038
 
4039
		*dst = pack_565_32_16 (
4040
		    pack_1x128_32 (
4041
			in_over_1x128 (
4042
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4043
	    }
4044
 
4045
	    w--;
4046
	    dst++;
4047
	    mask++;
4048
	}
4049
    }
4050
 
4051
}
4052
 
4053
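/* The IN operator with a solid source and an a8 mask on an a8 destination:
 * dst = src.alpha * mask * dst, processed 16 pixels per SSE2 iteration. */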
static void
4054
sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4055
                         pixman_composite_info_t *info)
4056
{
4057
    PIXMAN_COMPOSITE_ARGS (info);
4058
    uint8_t     *dst_line, *dst;
4059
    uint8_t     *mask_line, *mask;
4060
    int dst_stride, mask_stride;
4061
    uint32_t d, m;
4062
    uint32_t src;
4063
    int32_t w;
4064
 
4065
    __m128i xmm_alpha;
4066
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4067
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4068
 
4069
    PIXMAN_IMAGE_GET_LINE (
4070
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4071
    PIXMAN_IMAGE_GET_LINE (
4072
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4073
 
4074
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4075
 
4076
    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4077
 
4078
    while (height--)
4079
    {
4080
	dst = dst_line;
4081
	dst_line += dst_stride;
4082
	mask = mask_line;
4083
	mask_line += mask_stride;
4084
	w = width;
4085
 
4086
	while (w && ((uintptr_t)dst & 15))
4087
	{
4088
	    m = (uint32_t) *mask++;
4089
	    d = (uint32_t) *dst;
4090
 
4091
	    *dst++ = (uint8_t) pack_1x128_32 (
4092
		pix_multiply_1x128 (
4093
		    pix_multiply_1x128 (xmm_alpha,
4094
				       unpack_32_1x128 (m)),
4095
		    unpack_32_1x128 (d)));
4096
	    w--;
4097
	}
4098
 
4099
	while (w >= 16)
4100
	{
4101
	    xmm_mask = load_128_unaligned ((__m128i*)mask);
4102
	    xmm_dst = load_128_aligned ((__m128i*)dst);
4103
 
4104
	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4105
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4106
 
4107
	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4108
				&xmm_mask_lo, &xmm_mask_hi,
4109
				&xmm_mask_lo, &xmm_mask_hi);
4110
 
4111
	    pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4112
				&xmm_dst_lo, &xmm_dst_hi,
4113
				&xmm_dst_lo, &xmm_dst_hi);
4114
 
4115
	    save_128_aligned (
4116
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4117
 
4118
	    mask += 16;
4119
	    dst += 16;
4120
	    w -= 16;
4121
	}
4122
 
4123
	while (w)
4124
	{
4125
	    m = (uint32_t) *mask++;
4126
	    d = (uint32_t) *dst;
4127
 
4128
	    *dst++ = (uint8_t) pack_1x128_32 (
4129
		pix_multiply_1x128 (
4130
		    pix_multiply_1x128 (
4131
			xmm_alpha, unpack_32_1x128 (m)),
4132
		    unpack_32_1x128 (d)));
4133
	    w--;
4134
	}
4135
    }
4136
 
4137
}
4138
 
4139
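/* The IN operator with a solid source and no mask on an a8 destination:
 * dst = src.alpha * dst.  A source alpha of 0xff leaves the destination
 * untouched and 0x00 clears it with pixman_fill(), so both are handled
 * up front. */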
static void
4140
sse2_composite_in_n_8 (pixman_implementation_t *imp,
4141
		       pixman_composite_info_t *info)
4142
{
4143
    PIXMAN_COMPOSITE_ARGS (info);
4144
    uint8_t     *dst_line, *dst;
4145
    int dst_stride;
4146
    uint32_t d;
4147
    uint32_t src;
4148
    int32_t w;
4149
 
4150
    __m128i xmm_alpha;
4151
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4152
 
4153
    PIXMAN_IMAGE_GET_LINE (
4154
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4155
 
4156
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4157
 
4158
    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4159
 
4160
    src = src >> 24;
4161
 
4162
    if (src == 0xff)
4163
	return;
4164
 
4165
    if (src == 0x00)
4166
    {
4167
	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4168
		     8, dest_x, dest_y, width, height, src);
4169
 
4170
	return;
4171
    }
4172
 
4173
    while (height--)
4174
    {
4175
	dst = dst_line;
4176
	dst_line += dst_stride;
4177
	w = width;
4178
 
4179
	while (w && ((uintptr_t)dst & 15))
4180
	{
4181
	    d = (uint32_t) *dst;
4182
 
4183
	    *dst++ = (uint8_t) pack_1x128_32 (
4184
		pix_multiply_1x128 (
4185
		    xmm_alpha,
4186
		    unpack_32_1x128 (d)));
4187
	    w--;
4188
	}
4189
 
4190
	while (w >= 16)
4191
	{
4192
	    xmm_dst = load_128_aligned ((__m128i*)dst);
4193
 
4194
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4195
 
4196
	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4197
				&xmm_dst_lo, &xmm_dst_hi,
4198
				&xmm_dst_lo, &xmm_dst_hi);
4199
 
4200
	    save_128_aligned (
4201
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4202
 
4203
	    dst += 16;
4204
	    w -= 16;
4205
	}
4206
 
4207
	while (w)
4208
	{
4209
	    d = (uint32_t) *dst;
4210
 
4211
	    *dst++ = (uint8_t) pack_1x128_32 (
4212
		pix_multiply_1x128 (
4213
		    xmm_alpha,
4214
		    unpack_32_1x128 (d)));
4215
	    w--;
4216
	}
4217
    }
4218
 
4219
}
4220
 
4221
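/* The IN operator between two a8 images: dst = src * dst, 16 pixels at a time. */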
static void
4222
sse2_composite_in_8_8 (pixman_implementation_t *imp,
4223
                       pixman_composite_info_t *info)
4224
{
4225
    PIXMAN_COMPOSITE_ARGS (info);
4226
    uint8_t     *dst_line, *dst;
4227
    uint8_t     *src_line, *src;
4228
    int src_stride, dst_stride;
4229
    int32_t w;
4230
    uint32_t s, d;
4231
 
4232
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4233
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4234
 
4235
    PIXMAN_IMAGE_GET_LINE (
4236
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4237
    PIXMAN_IMAGE_GET_LINE (
4238
	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4239
 
4240
    while (height--)
4241
    {
4242
	dst = dst_line;
4243
	dst_line += dst_stride;
4244
	src = src_line;
4245
	src_line += src_stride;
4246
	w = width;
4247
 
4248
	while (w && ((uintptr_t)dst & 15))
4249
	{
4250
	    s = (uint32_t) *src++;
4251
	    d = (uint32_t) *dst;
4252
 
4253
	    *dst++ = (uint8_t) pack_1x128_32 (
4254
		pix_multiply_1x128 (
4255
		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
4256
	    w--;
4257
	}
4258
 
4259
	while (w >= 16)
4260
	{
4261
	    xmm_src = load_128_unaligned ((__m128i*)src);
4262
	    xmm_dst = load_128_aligned ((__m128i*)dst);
4263
 
4264
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4265
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4266
 
4267
	    pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4268
				&xmm_dst_lo, &xmm_dst_hi,
4269
				&xmm_dst_lo, &xmm_dst_hi);
4270
 
4271
	    save_128_aligned (
4272
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4273
 
4274
	    src += 16;
4275
	    dst += 16;
4276
	    w -= 16;
4277
	}
4278
 
4279
	while (w)
4280
	{
4281
	    s = (uint32_t) *src++;
4282
	    d = (uint32_t) *dst;
4283
 
4284
	    *dst++ = (uint8_t) pack_1x128_32 (
4285
		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4286
	    w--;
4287
	}
4288
    }
4289
 
4290
}
4291
 
4292
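/* The ADD operator with a solid source and an a8 mask on an a8 destination:
 * dst = saturating_add (src.alpha * mask, dst). */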
static void
4293
sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4294
			  pixman_composite_info_t *info)
4295
{
4296
    PIXMAN_COMPOSITE_ARGS (info);
4297
    uint8_t     *dst_line, *dst;
4298
    uint8_t     *mask_line, *mask;
4299
    int dst_stride, mask_stride;
4300
    int32_t w;
4301
    uint32_t src;
4302
    uint32_t m, d;
4303
 
4304
    __m128i xmm_alpha;
4305
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4306
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4307
 
4308
    PIXMAN_IMAGE_GET_LINE (
4309
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4310
    PIXMAN_IMAGE_GET_LINE (
4311
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4312
 
4313
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4314
 
4315
    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4316
 
4317
    while (height--)
4318
    {
4319
	dst = dst_line;
4320
	dst_line += dst_stride;
4321
	mask = mask_line;
4322
	mask_line += mask_stride;
4323
	w = width;
4324
 
4325
	while (w && ((uintptr_t)dst & 15))
4326
	{
4327
	    m = (uint32_t) *mask++;
4328
	    d = (uint32_t) *dst;
4329
 
4330
	    *dst++ = (uint8_t) pack_1x128_32 (
4331
		_mm_adds_epu16 (
4332
		    pix_multiply_1x128 (
4333
			xmm_alpha, unpack_32_1x128 (m)),
4334
		    unpack_32_1x128 (d)));
4335
	    w--;
4336
	}
4337
 
4338
	while (w >= 16)
4339
	{
4340
	    xmm_mask = load_128_unaligned ((__m128i*)mask);
4341
	    xmm_dst = load_128_aligned ((__m128i*)dst);
4342
 
4343
	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4344
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4345
 
4346
	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4347
				&xmm_mask_lo, &xmm_mask_hi,
4348
				&xmm_mask_lo, &xmm_mask_hi);
4349
 
4350
	    xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4351
	    xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4352
 
4353
	    save_128_aligned (
4354
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4355
 
4356
	    mask += 16;
4357
	    dst += 16;
4358
	    w -= 16;
4359
	}
4360
 
4361
	while (w)
4362
	{
4363
	    m = (uint32_t) *mask++;
4364
	    d = (uint32_t) *dst;
4365
 
4366
	    *dst++ = (uint8_t) pack_1x128_32 (
4367
		_mm_adds_epu16 (
4368
		    pix_multiply_1x128 (
4369
			xmm_alpha, unpack_32_1x128 (m)),
4370
		    unpack_32_1x128 (d)));
4371
 
4372
	    w--;
4373
	}
4374
    }
4375
 
4376
}
4377
 
4378
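/* The ADD operator with a solid source and no mask on an a8 destination.
 * Source alpha 0x00 is a no-op and 0xff saturates the whole rectangle, so
 * both are special-cased; otherwise the alpha byte is replicated into all
 * 16 lanes and added with unsigned saturation. */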
static void
4379
sse2_composite_add_n_8 (pixman_implementation_t *imp,
4380
			pixman_composite_info_t *info)
4381
{
4382
    PIXMAN_COMPOSITE_ARGS (info);
4383
    uint8_t     *dst_line, *dst;
4384
    int dst_stride;
4385
    int32_t w;
4386
    uint32_t src;
4387
 
4388
    __m128i xmm_src;
4389
 
4390
    PIXMAN_IMAGE_GET_LINE (
4391
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4392
 
4393
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4394
 
4395
    src >>= 24;
4396
 
4397
    if (src == 0x00)
4398
	return;
4399
 
4400
    if (src == 0xff)
4401
    {
4402
	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4403
		     8, dest_x, dest_y, width, height, 0xff);
4404
 
4405
	return;
4406
    }
4407
 
4408
    src = (src << 24) | (src << 16) | (src << 8) | src;
4409
    xmm_src = _mm_set_epi32 (src, src, src, src);
4410
 
4411
    while (height--)
4412
    {
4413
	dst = dst_line;
4414
	dst_line += dst_stride;
4415
	w = width;
4416
 
4417
	while (w && ((uintptr_t)dst & 15))
4418
	{
4419
	    *dst = (uint8_t)_mm_cvtsi128_si32 (
4420
		_mm_adds_epu8 (
4421
		    xmm_src,
4422
		    _mm_cvtsi32_si128 (*dst)));
4423
 
4424
	    w--;
4425
	    dst++;
4426
	}
4427
 
4428
	while (w >= 16)
4429
	{
4430
	    save_128_aligned (
4431
		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
4432
 
4433
	    dst += 16;
4434
	    w -= 16;
4435
	}
4436
 
4437
	while (w)
4438
	{
4439
	    *dst = (uint8_t)_mm_cvtsi128_si32 (
4440
		_mm_adds_epu8 (
4441
		    xmm_src,
4442
		    _mm_cvtsi32_si128 (*dst)));
4443
 
4444
	    w--;
4445
	    dst++;
4446
	}
4447
    }
4448
 
4449
}
4450
 
4451
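/* The ADD operator between two a8 images: the unaligned head and tail are
 * added with a saturating scalar expression, the rest is handed to
 * sse2_combine_add_u() four bytes at a time. */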
static void
4452
sse2_composite_add_8_8 (pixman_implementation_t *imp,
4453
			pixman_composite_info_t *info)
4454
{
4455
    PIXMAN_COMPOSITE_ARGS (info);
4456
    uint8_t     *dst_line, *dst;
4457
    uint8_t     *src_line, *src;
4458
    int dst_stride, src_stride;
4459
    int32_t w;
4460
    uint16_t t;
4461
 
4462
    PIXMAN_IMAGE_GET_LINE (
4463
	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4464
    PIXMAN_IMAGE_GET_LINE (
4465
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4466
 
4467
    while (height--)
4468
    {
4469
	dst = dst_line;
4470
	src = src_line;
4471
 
4472
	dst_line += dst_stride;
4473
	src_line += src_stride;
4474
	w = width;
4475
 
4476
	/* Small head */
4477
	while (w && (uintptr_t)dst & 3)
4478
	{
4479
	    t = (*dst) + (*src++);
4480
	    *dst++ = t | (0 - (t >> 8));
4481
	    w--;
4482
	}
4483
 
4484
	sse2_combine_add_u (imp, op,
4485
			    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4486
 
4487
	/* Small tail */
4488
	dst += w & 0xfffc;
4489
	src += w & 0xfffc;
4490
 
4491
	w &= 3;
4492
 
4493
	while (w)
4494
	{
4495
	    t = (*dst) + (*src++);
4496
	    *dst++ = t | (0 - (t >> 8));
4497
	    w--;
4498
	}
4499
    }
4500
 
4501
}
4502
 
4503
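/* The ADD operator between two 8888 images: each scanline is handed
 * directly to sse2_combine_add_u(). */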
static void
4504
sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4505
                              pixman_composite_info_t *info)
4506
{
4507
    PIXMAN_COMPOSITE_ARGS (info);
4508
    uint32_t    *dst_line, *dst;
4509
    uint32_t    *src_line, *src;
4510
    int dst_stride, src_stride;
4511
 
4512
    PIXMAN_IMAGE_GET_LINE (
4513
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4514
    PIXMAN_IMAGE_GET_LINE (
4515
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4516
 
4517
    while (height--)
4518
    {
4519
	dst = dst_line;
4520
	dst_line += dst_stride;
4521
	src = src_line;
4522
	src_line += src_stride;
4523
 
4524
	sse2_combine_add_u (imp, op, dst, src, NULL, width);
4525
    }
4526
}
4527
 
4528
static void
4529
sse2_composite_add_n_8888 (pixman_implementation_t *imp,
4530
			   pixman_composite_info_t *info)
4531
{
4532
    PIXMAN_COMPOSITE_ARGS (info);
4533
    uint32_t *dst_line, *dst, src;
4534
    int dst_stride;
4535
 
4536
    __m128i xmm_src;
4537
 
4538
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4539
 
4540
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4541
    if (src == 0)
4542
	return;
4543
 
4544
    if (src == ~0)
4545
    {
4546
	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
4547
		     dest_x, dest_y, width, height, ~0);
4548
 
4549
	return;
4550
    }
4551
 
4552
    xmm_src = _mm_set_epi32 (src, src, src, src);
4553
    while (height--)
4554
    {
4555
	int w = width;
4556
	uint32_t d;
4557
 
4558
	dst = dst_line;
4559
	dst_line += dst_stride;
4560
 
4561
	while (w && (uintptr_t)dst & 15)
4562
	{
4563
	    d = *dst;
4564
	    *dst++ =
4565
		_mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
4566
	    w--;
4567
	}
4568
 
4569
	while (w >= 4)
4570
	{
4571
	    save_128_aligned
4572
		((__m128i*)dst,
4573
		 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4574
 
4575
	    dst += 4;
4576
	    w -= 4;
4577
	}
4578
 
4579
	while (w--)
4580
	{
4581
	    d = *dst;
4582
	    *dst++ =
4583
		_mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
4584
						  _mm_cvtsi32_si128 (d)));
4585
	}
4586
    }
4587
}
4588
 
4589
static void
4590
sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
4591
			     pixman_composite_info_t *info)
4592
{
4593
    PIXMAN_COMPOSITE_ARGS (info);
4594
    uint32_t     *dst_line, *dst;
4595
    uint8_t     *mask_line, *mask;
4596
    int dst_stride, mask_stride;
4597
    int32_t w;
4598
    uint32_t src;
4599
 
4600
    __m128i xmm_src;
4601
 
4602
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4603
    if (src == 0)
4604
	return;
4605
    xmm_src = expand_pixel_32_1x128 (src);
4606
 
4607
    PIXMAN_IMAGE_GET_LINE (
4608
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4609
    PIXMAN_IMAGE_GET_LINE (
4610
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4611
 
4612
    while (height--)
4613
    {
4614
	dst = dst_line;
4615
	dst_line += dst_stride;
4616
	mask = mask_line;
4617
	mask_line += mask_stride;
4618
	w = width;
4619
 
4620
	while (w && ((uintptr_t)dst & 15))
4621
	{
4622
	    uint8_t m = *mask++;
4623
	    if (m)
4624
	    {
4625
		*dst = pack_1x128_32
4626
		    (_mm_adds_epu16
4627
		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4628
		      unpack_32_1x128 (*dst)));
4629
	    }
4630
	    dst++;
4631
	    w--;
4632
	}
4633
 
4634
	while (w >= 4)
4635
	{
4636
	    uint32_t m = *(uint32_t*)mask;
4637
	    if (m)
4638
	    {
4639
		__m128i xmm_mask_lo, xmm_mask_hi;
4640
		__m128i xmm_dst_lo, xmm_dst_hi;
4641
 
4642
		__m128i xmm_dst = load_128_aligned ((__m128i*)dst);
4643
		__m128i xmm_mask =
4644
		    _mm_unpacklo_epi8 (unpack_32_1x128(m),
4645
				       _mm_setzero_si128 ());
4646
 
4647
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4648
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4649
 
4650
		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4651
					&xmm_mask_lo, &xmm_mask_hi);
4652
 
4653
		pix_multiply_2x128 (&xmm_src, &xmm_src,
4654
				    &xmm_mask_lo, &xmm_mask_hi,
4655
				    &xmm_mask_lo, &xmm_mask_hi);
4656
 
4657
		xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4658
		xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4659
 
4660
		save_128_aligned (
4661
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4662
	    }
4663
 
4664
	    w -= 4;
4665
	    dst += 4;
4666
	    mask += 4;
4667
	}
4668
 
4669
	while (w)
4670
	{
4671
	    uint8_t m = *mask++;
4672
	    if (m)
4673
	    {
4674
		*dst = pack_1x128_32
4675
		    (_mm_adds_epu16
4676
		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4677
		      unpack_32_1x128 (*dst)));
4678
	    }
4679
	    dst++;
4680
	    w--;
4681
	}
4682
    }
4683
}
4684
 
4685
static pixman_bool_t
4686
sse2_blt (pixman_implementation_t *imp,
4687
          uint32_t *               src_bits,
4688
          uint32_t *               dst_bits,
4689
          int                      src_stride,
4690
          int                      dst_stride,
4691
          int                      src_bpp,
4692
          int                      dst_bpp,
4693
          int                      src_x,
4694
          int                      src_y,
4695
          int                      dest_x,
4696
          int                      dest_y,
4697
          int                      width,
4698
          int                      height)
4699
{
4700
    uint8_t *   src_bytes;
4701
    uint8_t *   dst_bytes;
4702
    int byte_width;
4703
 
4704
    if (src_bpp != dst_bpp)
4705
	return FALSE;
4706
 
4707
    if (src_bpp == 16)
4708
    {
4709
	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4710
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4711
	src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4712
	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4713
	byte_width = 2 * width;
4714
	src_stride *= 2;
4715
	dst_stride *= 2;
4716
    }
4717
    else if (src_bpp == 32)
4718
    {
4719
	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4720
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4721
	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4722
	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4723
	byte_width = 4 * width;
4724
	src_stride *= 4;
4725
	dst_stride *= 4;
4726
    }
4727
    else
4728
    {
4729
	return FALSE;
4730
    }
4731
 
4732
    while (height--)
4733
    {
4734
	int w;
4735
	uint8_t *s = src_bytes;
4736
	uint8_t *d = dst_bytes;
4737
	src_bytes += src_stride;
4738
	dst_bytes += dst_stride;
4739
	w = byte_width;
4740
 
4741
	while (w >= 2 && ((uintptr_t)d & 3))
4742
	{
4743
	    *(uint16_t *)d = *(uint16_t *)s;
4744
	    w -= 2;
4745
	    s += 2;
4746
	    d += 2;
4747
	}
4748
 
4749
	while (w >= 4 && ((uintptr_t)d & 15))
4750
	{
4751
	    *(uint32_t *)d = *(uint32_t *)s;
4752
 
4753
	    w -= 4;
4754
	    s += 4;
4755
	    d += 4;
4756
	}
4757
 
4758
	while (w >= 64)
4759
	{
4760
	    __m128i xmm0, xmm1, xmm2, xmm3;
4761
 
4762
	    xmm0 = load_128_unaligned ((__m128i*)(s));
4763
	    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4764
	    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4765
	    xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4766
 
4767
	    save_128_aligned ((__m128i*)(d),    xmm0);
4768
	    save_128_aligned ((__m128i*)(d + 16), xmm1);
4769
	    save_128_aligned ((__m128i*)(d + 32), xmm2);
4770
	    save_128_aligned ((__m128i*)(d + 48), xmm3);
4771
 
4772
	    s += 64;
4773
	    d += 64;
4774
	    w -= 64;
4775
	}
4776
 
4777
	while (w >= 16)
4778
	{
4779
	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4780
 
4781
	    w -= 16;
4782
	    d += 16;
4783
	    s += 16;
4784
	}
4785
 
4786
	while (w >= 4)
4787
	{
4788
	    *(uint32_t *)d = *(uint32_t *)s;
4789
 
4790
	    w -= 4;
4791
	    s += 4;
4792
	    d += 4;
4793
	}
4794
 
4795
	if (w >= 2)
4796
	{
4797
	    *(uint16_t *)d = *(uint16_t *)s;
4798
	    w -= 2;
4799
	    s += 2;
4800
	    d += 2;
4801
	}
4802
    }
4803
 
4804
    return TRUE;
4805
}
4806
 
4807
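/* SRC copy of a rectangle, implemented on top of sse2_blt(). */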
static void
4808
sse2_composite_copy_area (pixman_implementation_t *imp,
4809
                          pixman_composite_info_t *info)
4810
{
4811
    PIXMAN_COMPOSITE_ARGS (info);
4812
    sse2_blt (imp, src_image->bits.bits,
4813
	      dest_image->bits.bits,
4814
	      src_image->bits.rowstride,
4815
	      dest_image->bits.rowstride,
4816
	      PIXMAN_FORMAT_BPP (src_image->bits.format),
4817
	      PIXMAN_FORMAT_BPP (dest_image->bits.format),
4818
	      src_x, src_y, dest_x, dest_y, width, height);
4819
}
4820
 
4821
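/* OVER with an x8r8g8b8 source, an a8 mask and an 8888 destination:
 * the source is forced opaque (its alpha is set to 0xff) before the
 * masked OVER. */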
static void
4822
sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4823
                                 pixman_composite_info_t *info)
4824
{
4825
    PIXMAN_COMPOSITE_ARGS (info);
4826
    uint32_t    *src, *src_line, s;
4827
    uint32_t    *dst, *dst_line, d;
4828
    uint8_t         *mask, *mask_line;
4829
    uint32_t m;
4830
    int src_stride, mask_stride, dst_stride;
4831
    int32_t w;
4832
    __m128i ms;
4833
 
4834
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4835
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4836
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4837
 
4838
    PIXMAN_IMAGE_GET_LINE (
4839
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4840
    PIXMAN_IMAGE_GET_LINE (
4841
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4842
    PIXMAN_IMAGE_GET_LINE (
4843
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4844
 
4845
    while (height--)
4846
    {
4847
        src = src_line;
4848
        src_line += src_stride;
4849
        dst = dst_line;
4850
        dst_line += dst_stride;
4851
        mask = mask_line;
4852
        mask_line += mask_stride;
4853
 
4854
        w = width;
4855
 
4856
        while (w && (uintptr_t)dst & 15)
4857
        {
4858
            s = 0xff000000 | *src++;
4859
            m = (uint32_t) *mask++;
4860
            d = *dst;
4861
            ms = unpack_32_1x128 (s);
4862
 
4863
            if (m != 0xff)
4864
            {
4865
		__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4866
		__m128i md = unpack_32_1x128 (d);
4867
 
4868
                ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4869
            }
4870
 
4871
            *dst++ = pack_1x128_32 (ms);
4872
            w--;
4873
        }
4874
 
4875
        while (w >= 4)
4876
        {
4877
            m = *(uint32_t*) mask;
4878
            xmm_src = _mm_or_si128 (
4879
		load_128_unaligned ((__m128i*)src), mask_ff000000);
4880
 
4881
            if (m == 0xffffffff)
4882
            {
4883
                save_128_aligned ((__m128i*)dst, xmm_src);
4884
            }
4885
            else
4886
            {
4887
                xmm_dst = load_128_aligned ((__m128i*)dst);
4888
 
4889
                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4890
 
4891
                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4892
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4893
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4894
 
4895
                expand_alpha_rev_2x128 (
4896
		    xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4897
 
4898
                in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
4899
			       &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
4900
			       &xmm_dst_lo, &xmm_dst_hi);
4901
 
4902
                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4903
            }
4904
 
4905
            src += 4;
4906
            dst += 4;
4907
            mask += 4;
4908
            w -= 4;
4909
        }
4910
 
4911
        while (w)
4912
        {
4913
            m = (uint32_t) *mask++;
4914
 
4915
            if (m)
4916
            {
4917
                s = 0xff000000 | *src;
4918
 
4919
                if (m == 0xff)
4920
                {
4921
                    *dst = s;
4922
                }
4923
                else
4924
                {
4925
		    __m128i ma, md, ms;
4926
 
4927
                    d = *dst;
4928
 
4929
		    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4930
		    md = unpack_32_1x128 (d);
4931
		    ms = unpack_32_1x128 (s);
4932
 
4933
                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4934
                }
4935
 
4936
            }
4937
 
4938
            src++;
4939
            dst++;
4940
            w--;
4941
        }
4942
    }
4943
 
4944
}
4945
 
4946
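/* OVER with an a8r8g8b8 source, an a8 mask and an 8888 destination;
 * four-pixel groups whose source is opaque and whose four mask bytes
 * are all 0xff are stored directly. */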
static void
4947
sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4948
                                 pixman_composite_info_t *info)
4949
{
4950
    PIXMAN_COMPOSITE_ARGS (info);
4951
    uint32_t    *src, *src_line, s;
4952
    uint32_t    *dst, *dst_line, d;
4953
    uint8_t         *mask, *mask_line;
4954
    uint32_t m;
4955
    int src_stride, mask_stride, dst_stride;
4956
    int32_t w;
4957
 
4958
    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4959
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4960
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4961
 
4962
    PIXMAN_IMAGE_GET_LINE (
4963
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4964
    PIXMAN_IMAGE_GET_LINE (
4965
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4966
    PIXMAN_IMAGE_GET_LINE (
4967
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4968
 
4969
    while (height--)
4970
    {
4971
        src = src_line;
4972
        src_line += src_stride;
4973
        dst = dst_line;
4974
        dst_line += dst_stride;
4975
        mask = mask_line;
4976
        mask_line += mask_stride;
4977
 
4978
        w = width;
4979
 
4980
        while (w && (uintptr_t)dst & 15)
4981
        {
4982
	    uint32_t sa;
4983
 
4984
            s = *src++;
4985
            m = (uint32_t) *mask++;
4986
            d = *dst;
4987
 
4988
	    sa = s >> 24;
4989
 
4990
	    if (m)
4991
	    {
4992
		if (sa == 0xff && m == 0xff)
4993
		{
4994
		    *dst = s;
4995
		}
4996
		else
4997
		{
4998
		    __m128i ms, md, ma, msa;
4999
 
5000
		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5001
		    ms = unpack_32_1x128 (s);
5002
		    md = unpack_32_1x128 (d);
5003
 
5004
		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5005
 
5006
		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5007
		}
5008
	    }
5009
 
5010
	    dst++;
5011
            w--;
5012
        }
5013
 
5014
        while (w >= 4)
5015
        {
5016
            m = *(uint32_t *) mask;
5017
 
5018
	    if (m)
5019
	    {
5020
		xmm_src = load_128_unaligned ((__m128i*)src);
5021
 
5022
		if (m == 0xffffffff && is_opaque (xmm_src))
5023
		{
5024
		    save_128_aligned ((__m128i *)dst, xmm_src);
5025
		}
5026
		else
5027
		{
5028
		    xmm_dst = load_128_aligned ((__m128i *)dst);
5029
 
5030
		    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5031
 
5032
		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5033
		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5034
		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5035
 
5036
		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5037
		    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5038
 
5039
		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5040
				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5041
 
5042
		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5043
		}
5044
	    }
5045
 
5046
            src += 4;
5047
            dst += 4;
5048
            mask += 4;
5049
            w -= 4;
5050
        }
5051
 
5052
        while (w)
5053
        {
5054
	    uint32_t sa;
5055
 
5056
            s = *src++;
5057
            m = (uint32_t) *mask++;
5058
            d = *dst;
5059
 
5060
	    sa = s >> 24;
5061
 
5062
	    if (m)
5063
	    {
5064
		if (sa == 0xff && m == 0xff)
5065
		{
5066
		    *dst = s;
5067
		}
5068
		else
5069
		{
5070
		    __m128i ms, md, ma, msa;
5071
 
5072
		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5073
		    ms = unpack_32_1x128 (s);
5074
		    md = unpack_32_1x128 (d);
5075
 
5076
		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5077
 
5078
		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5079
		}
5080
	    }
5081
 
5082
	    dst++;
5083
            w--;
5084
        }
5085
    }
5086
 
5087
}
5088
 
5089
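/* OVER_REVERSE with a solid source: the destination is composited over
 * the source colour, so dst and src swap roles in over_2x128(). */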
static void
5090
sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5091
				    pixman_composite_info_t *info)
5092
{
5093
    PIXMAN_COMPOSITE_ARGS (info);
5094
    uint32_t src;
5095
    uint32_t    *dst_line, *dst;
5096
    __m128i xmm_src;
5097
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5098
    __m128i xmm_dsta_hi, xmm_dsta_lo;
5099
    int dst_stride;
5100
    int32_t w;
5101
 
5102
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
5103
 
5104
    if (src == 0)
5105
	return;
5106
 
5107
    PIXMAN_IMAGE_GET_LINE (
5108
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5109
 
5110
    xmm_src = expand_pixel_32_1x128 (src);
5111
 
5112
    while (height--)
5113
    {
5114
	dst = dst_line;
5115
 
5116
	dst_line += dst_stride;
5117
	w = width;
5118
 
5119
	while (w && (uintptr_t)dst & 15)
5120
	{
5121
	    __m128i vd;
5122
 
5123
	    vd = unpack_32_1x128 (*dst);
5124
 
5125
	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5126
					      xmm_src));
5127
	    w--;
5128
	    dst++;
5129
	}
5130
 
5131
	while (w >= 4)
5132
	{
5133
	    __m128i tmp_lo, tmp_hi;
5134
 
5135
	    xmm_dst = load_128_aligned ((__m128i*)dst);
5136
 
5137
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5138
	    expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5139
 
5140
	    tmp_lo = xmm_src;
5141
	    tmp_hi = xmm_src;
5142
 
5143
	    over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5144
			&xmm_dsta_lo, &xmm_dsta_hi,
5145
			&tmp_lo, &tmp_hi);
5146
 
5147
	    save_128_aligned (
5148
		(__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5149
 
5150
	    w -= 4;
5151
	    dst += 4;
5152
	}
5153
 
5154
	while (w)
5155
	{
5156
	    __m128i vd;
5157
 
5158
	    vd = unpack_32_1x128 (*dst);
5159
 
5160
	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5161
					      xmm_src));
5162
	    w--;
5163
	    dst++;
5164
	}
5165
 
5166
    }
5167
 
5168
}
5169
 
5170
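/* OVER with an a8r8g8b8 source, an a8r8g8b8 mask (only its alpha channel
 * is used) and an 8888 destination. */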
static void
5171
sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5172
				    pixman_composite_info_t *info)
5173
{
5174
    PIXMAN_COMPOSITE_ARGS (info);
5175
    uint32_t    *src, *src_line, s;
5176
    uint32_t    *dst, *dst_line, d;
5177
    uint32_t    *mask, *mask_line;
5178
    uint32_t    m;
5179
    int src_stride, mask_stride, dst_stride;
5180
    int32_t w;
5181
 
5182
    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5183
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5184
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5185
 
5186
    PIXMAN_IMAGE_GET_LINE (
5187
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5188
    PIXMAN_IMAGE_GET_LINE (
5189
	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5190
    PIXMAN_IMAGE_GET_LINE (
5191
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5192
 
5193
    while (height--)
5194
    {
5195
        src = src_line;
5196
        src_line += src_stride;
5197
        dst = dst_line;
5198
        dst_line += dst_stride;
5199
        mask = mask_line;
5200
        mask_line += mask_stride;
5201
 
5202
        w = width;
5203
 
5204
        while (w && (uintptr_t)dst & 15)
5205
        {
5206
	    uint32_t sa;
5207
 
5208
            s = *src++;
5209
            m = (*mask++) >> 24;
5210
            d = *dst;
5211
 
5212
	    sa = s >> 24;
5213
 
5214
	    if (m)
5215
	    {
5216
		if (sa == 0xff && m == 0xff)
5217
		{
5218
		    *dst = s;
5219
		}
5220
		else
5221
		{
5222
		    __m128i ms, md, ma, msa;
5223
 
5224
		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5225
		    ms = unpack_32_1x128 (s);
5226
		    md = unpack_32_1x128 (d);
5227
 
5228
		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5229
 
5230
		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5231
		}
5232
	    }
5233
 
5234
	    dst++;
5235
            w--;
5236
        }
5237
 
5238
        while (w >= 4)
5239
        {
5240
	    xmm_mask = load_128_unaligned ((__m128i*)mask);
5241
 
5242
	    if (!is_transparent (xmm_mask))
5243
	    {
5244
		xmm_src = load_128_unaligned ((__m128i*)src);
5245
 
5246
		if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5247
		{
5248
		    save_128_aligned ((__m128i *)dst, xmm_src);
5249
		}
5250
		else
5251
		{
5252
		    xmm_dst = load_128_aligned ((__m128i *)dst);
5253
 
5254
		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5255
		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5256
		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5257
 
5258
		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5259
		    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5260
 
5261
		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5262
				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5263
 
5264
		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5265
		}
5266
	    }
5267
 
5268
            src += 4;
5269
            dst += 4;
5270
            mask += 4;
5271
            w -= 4;
5272
        }
5273
 
5274
        while (w)
5275
        {
5276
	    uint32_t sa;
5277
 
5278
            s = *src++;
5279
            m = (*mask++) >> 24;
5280
            d = *dst;
5281
 
5282
	    sa = s >> 24;
5283
 
5284
	    if (m)
5285
	    {
5286
		if (sa == 0xff && m == 0xff)
5287
		{
5288
		    *dst = s;
5289
		}
5290
		else
5291
		{
5292
		    __m128i ms, md, ma, msa;
5293
 
5294
		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5295
		    ms = unpack_32_1x128 (s);
5296
		    md = unpack_32_1x128 (d);
5297
 
5298
		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5299
 
5300
		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5301
		}
5302
	    }
5303
 
5304
	    dst++;
5305
            w--;
5306
        }
5307
    }
5308
 
5309
}
5310
 
5311
/* A variant of 'sse2_combine_over_u' with minor tweaks */
5312
static force_inline void
5313
scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
5314
                                             const uint32_t* ps,
5315
                                             int32_t         w,
5316
                                             pixman_fixed_t  vx,
5317
                                             pixman_fixed_t  unit_x,
5318
                                             pixman_fixed_t  src_width_fixed,
5319
                                             pixman_bool_t   fully_transparent_src)
5320
{
5321
    uint32_t s, d;
5322
    const uint32_t* pm = NULL;
5323
 
5324
    __m128i xmm_dst_lo, xmm_dst_hi;
5325
    __m128i xmm_src_lo, xmm_src_hi;
5326
    __m128i xmm_alpha_lo, xmm_alpha_hi;
5327
 
5328
    if (fully_transparent_src)
5329
	return;
5330
 
5331
    /* Align dst on a 16-byte boundary */
5332
    while (w && ((uintptr_t)pd & 15))
5333
    {
5334
	d = *pd;
5335
	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5336
	vx += unit_x;
5337
	while (vx >= 0)
5338
	    vx -= src_width_fixed;
5339
 
5340
	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
5341
	if (pm)
5342
	    pm++;
5343
	w--;
5344
    }
5345
 
5346
    while (w >= 4)
5347
    {
5348
	__m128i tmp;
5349
	uint32_t tmp1, tmp2, tmp3, tmp4;
5350
 
5351
	tmp1 = *(ps + pixman_fixed_to_int (vx));
5352
	vx += unit_x;
5353
	while (vx >= 0)
5354
	    vx -= src_width_fixed;
5355
	tmp2 = *(ps + pixman_fixed_to_int (vx));
5356
	vx += unit_x;
5357
	while (vx >= 0)
5358
	    vx -= src_width_fixed;
5359
	tmp3 = *(ps + pixman_fixed_to_int (vx));
5360
	vx += unit_x;
5361
	while (vx >= 0)
5362
	    vx -= src_width_fixed;
5363
	tmp4 = *(ps + pixman_fixed_to_int (vx));
5364
	vx += unit_x;
5365
	while (vx >= 0)
5366
	    vx -= src_width_fixed;
5367
 
5368
	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5369
 
5370
	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5371
 
5372
	if (is_opaque (xmm_src_hi))
5373
	{
5374
	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
5375
	}
5376
	else if (!is_zero (xmm_src_hi))
5377
	{
5378
	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5379
 
5380
	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5381
	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5382
 
5383
	    expand_alpha_2x128 (
5384
		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5385
 
5386
	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
5387
			&xmm_alpha_lo, &xmm_alpha_hi,
5388
			&xmm_dst_lo, &xmm_dst_hi);
5389
 
5390
	    /* rebuild the 4 pixel data and save */
5391
	    save_128_aligned ((__m128i*)pd,
5392
			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5393
	}
5394
 
5395
	w -= 4;
5396
	pd += 4;
5397
	if (pm)
5398
	    pm += 4;
5399
    }
5400
 
5401
    while (w)
5402
    {
5403
	d = *pd;
5404
	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5405
	vx += unit_x;
5406
	while (vx >= 0)
5407
	    vx -= src_width_fixed;
5408
 
5409
	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
5410
	if (pm)
5411
	    pm++;
5412
 
5413
	w--;
5414
    }
5415
}
5416
 
5417
FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5418
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
5419
		       uint32_t, uint32_t, COVER)
5420
FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5421
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
5422
		       uint32_t, uint32_t, NONE)
5423
FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5424
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
5425
		       uint32_t, uint32_t, PAD)
5426
FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
5427
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
5428
		       uint32_t, uint32_t, NORMAL)
5429
 
5430
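/* Nearest-neighbour scaled OVER with a solid mask: the mask alpha is
 * expanded once into xmm_mask and applied to every fetched source pixel. */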
static force_inline void
5431
scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5432
					       uint32_t *       dst,
5433
					       const uint32_t * src,
5434
					       int32_t          w,
5435
					       pixman_fixed_t   vx,
5436
					       pixman_fixed_t   unit_x,
5437
					       pixman_fixed_t   src_width_fixed,
5438
					       pixman_bool_t    zero_src)
5439
{
5440
    __m128i xmm_mask;
5441
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5442
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5443
    __m128i xmm_alpha_lo, xmm_alpha_hi;
5444
 
5445
    if (zero_src || (*mask >> 24) == 0)
5446
	return;
5447
 
5448
    xmm_mask = create_mask_16_128 (*mask >> 24);
5449
 
5450
    while (w && (uintptr_t)dst & 15)
5451
    {
5452
	uint32_t s = *(src + pixman_fixed_to_int (vx));
5453
	vx += unit_x;
5454
	while (vx >= 0)
5455
	    vx -= src_width_fixed;
5456
 
5457
	if (s)
5458
	{
5459
	    uint32_t d = *dst;
5460
 
5461
	    __m128i ms = unpack_32_1x128 (s);
5462
	    __m128i alpha     = expand_alpha_1x128 (ms);
5463
	    __m128i dest      = xmm_mask;
5464
	    __m128i alpha_dst = unpack_32_1x128 (d);
5465
 
5466
	    *dst = pack_1x128_32 (
5467
		in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
5468
	}
5469
	dst++;
5470
	w--;
5471
    }
5472
 
5473
    while (w >= 4)
5474
    {
5475
	uint32_t tmp1, tmp2, tmp3, tmp4;
5476
 
5477
	tmp1 = *(src + pixman_fixed_to_int (vx));
5478
	vx += unit_x;
5479
	while (vx >= 0)
5480
	    vx -= src_width_fixed;
5481
	tmp2 = *(src + pixman_fixed_to_int (vx));
5482
	vx += unit_x;
5483
	while (vx >= 0)
5484
	    vx -= src_width_fixed;
5485
	tmp3 = *(src + pixman_fixed_to_int (vx));
5486
	vx += unit_x;
5487
	while (vx >= 0)
5488
	    vx -= src_width_fixed;
5489
	tmp4 = *(src + pixman_fixed_to_int (vx));
5490
	vx += unit_x;
5491
	while (vx >= 0)
5492
	    vx -= src_width_fixed;
5493
 
5494
	xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5495
 
5496
	if (!is_zero (xmm_src))
5497
	{
5498
	    xmm_dst = load_128_aligned ((__m128i*)dst);
5499
 
5500
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5501
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5502
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5503
			        &xmm_alpha_lo, &xmm_alpha_hi);
5504
 
5505
	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5506
			   &xmm_alpha_lo, &xmm_alpha_hi,
5507
			   &xmm_mask, &xmm_mask,
5508
			   &xmm_dst_lo, &xmm_dst_hi);
5509
 
5510
	    save_128_aligned (
5511
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5512
	}
5513
 
5514
	dst += 4;
5515
	w -= 4;
5516
    }
5517
 
5518
    while (w)
5519
    {
5520
	uint32_t s = *(src + pixman_fixed_to_int (vx));
5521
	vx += unit_x;
5522
	while (vx >= 0)
5523
	    vx -= src_width_fixed;
5524
 
5525
	if (s)
5526
	{
5527
	    uint32_t d = *dst;
5528
 
5529
	    __m128i ms = unpack_32_1x128 (s);
5530
	    __m128i alpha = expand_alpha_1x128 (ms);
5531
	    __m128i mask  = xmm_mask;
5532
	    __m128i dest  = unpack_32_1x128 (d);
5533
 
5534
	    *dst = pack_1x128_32 (
5535
		in_over_1x128 (&ms, &alpha, &mask, &dest));
5536
	}
5537
 
5538
	dst++;
5539
	w--;
5540
    }
5541
 
5542
}
5543
 
5544
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5545
			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5546
			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5547
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5548
			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5549
			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5550
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5551
			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5552
			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
5553
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
5554
			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5555
			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
5556
 
5557
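/* Bilinear scaling helpers.  A 2x2 source block is fetched, interpolated
 * vertically with the wt/wb weights and horizontally with weights derived
 * from vx.  Two variable layouts exist: with fewer than 8 interpolation
 * bits the horizontal step can use _mm_madd_epi16, otherwise the wider
 * mullo/mulhi path below is taken. */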
#if BILINEAR_INTERPOLATION_BITS < 8
5558
# define BILINEAR_DECLARE_VARIABLES						\
5559
    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
5560
    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
5561
    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
5562
    const __m128i xmm_ux = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
5563
					  unit_x, -unit_x, unit_x, -unit_x);	\
5564
    const __m128i xmm_zero = _mm_setzero_si128 ();				\
5565
    __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),		\
5566
				   vx, -(vx + 1), vx, -(vx + 1))
5567
#else
5568
# define BILINEAR_DECLARE_VARIABLES						\
5569
    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
5570
    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
5571
    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
5572
    const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,	\
5573
					  -unit_x, -unit_x, -unit_x, -unit_x);	\
5574
    const __m128i xmm_zero = _mm_setzero_si128 ();				\
5575
    __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx,				\
5576
				   -(vx + 1), -(vx + 1), -(vx + 1), -(vx + 1))
5577
#endif
5578
 
5579
#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
5580
do {										\
5581
    __m128i xmm_wh, xmm_lo, xmm_hi, a;						\
5582
    /* fetch 2x2 pixel block into sse2 registers */				\
5583
    __m128i tltr = _mm_loadl_epi64 (						\
5584
			    (__m128i *)&src_top[pixman_fixed_to_int (vx)]);	\
5585
    __m128i blbr = _mm_loadl_epi64 (						\
5586
			    (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]);	\
5587
    vx += unit_x;								\
5588
    /* vertical interpolation */						\
5589
    a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero),	\
5590
					xmm_wt),				\
5591
		       _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero),	\
5592
					xmm_wb));				\
5593
    if (BILINEAR_INTERPOLATION_BITS < 8)					\
5594
    {										\
5595
	/* calculate horizontal weights */					\
5596
	xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
5597
					16 - BILINEAR_INTERPOLATION_BITS));	\
5598
	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
5599
	/* horizontal interpolation */						\
5600
	a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
5601
		a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh);			\
5602
    }										\
5603
    else									\
5604
    {										\
5605
	/* calculate horizontal weights */					\
5606
	xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
5607
					16 - BILINEAR_INTERPOLATION_BITS));	\
5608
	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
5609
	/* horizontal interpolation */						\
5610
	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
5611
	xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\
5612
	a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\
5613
			   _mm_unpackhi_epi16 (xmm_lo, xmm_hi));		\
5614
    }										\
5615
    /* shift and pack the result */						\
5616
    a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2);			\
5617
    a = _mm_packs_epi32 (a, a);							\
5618
    a = _mm_packus_epi16 (a, a);						\
5619
    pix = _mm_cvtsi128_si32 (a);						\
5620
} while (0)
5621
 
5622
#define BILINEAR_SKIP_ONE_PIXEL()						\
5623
do {										\
5624
    vx += unit_x;								\
5625
    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
5626
} while (0)
5627
 
5628
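/* Bilinear-scaled SRC: interpolated pixels are written straight to the
 * destination, four, two and finally one at a time. */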
static force_inline void
5629
scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
5630
					     const uint32_t * mask,
5631
					     const uint32_t * src_top,
5632
					     const uint32_t * src_bottom,
5633
					     int32_t          w,
5634
					     int              wt,
5635
					     int              wb,
5636
					     pixman_fixed_t   vx,
5637
					     pixman_fixed_t   unit_x,
5638
					     pixman_fixed_t   max_vx,
5639
					     pixman_bool_t    zero_src)
5640
{
5641
    BILINEAR_DECLARE_VARIABLES;
5642
    uint32_t pix1, pix2, pix3, pix4;
5643
 
5644
    while ((w -= 4) >= 0)
5645
    {
5646
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5647
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5648
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5649
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5650
	*dst++ = pix1;
5651
	*dst++ = pix2;
5652
	*dst++ = pix3;
5653
	*dst++ = pix4;
5654
    }
5655
 
5656
    if (w & 2)
5657
    {
5658
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5659
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5660
	*dst++ = pix1;
5661
	*dst++ = pix2;
5662
    }
5663
 
5664
    if (w & 1)
5665
    {
5666
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5667
	*dst = pix1;
5668
    }
5669
 
5670
}
5671
 
5672
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
5673
			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
5674
			       uint32_t, uint32_t, uint32_t,
5675
			       COVER, FLAG_NONE)
5676
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
5677
			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
5678
			       uint32_t, uint32_t, uint32_t,
5679
			       PAD, FLAG_NONE)
5680
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
5681
			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
5682
			       uint32_t, uint32_t, uint32_t,
5683
			       NONE, FLAG_NONE)
5684
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
5685
			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
5686
			       uint32_t, uint32_t, uint32_t,
5687
			       NORMAL, FLAG_NONE)
5688
 
5689
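/* Bilinear-scaled OVER: groups of four interpolated pixels that are fully
 * transparent are skipped and fully opaque ones are stored directly. */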
static force_inline void
5690
scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
5691
					      const uint32_t * mask,
5692
					      const uint32_t * src_top,
5693
					      const uint32_t * src_bottom,
5694
					      int32_t          w,
5695
					      int              wt,
5696
					      int              wb,
5697
					      pixman_fixed_t   vx,
5698
					      pixman_fixed_t   unit_x,
5699
					      pixman_fixed_t   max_vx,
5700
					      pixman_bool_t    zero_src)
5701
{
5702
    BILINEAR_DECLARE_VARIABLES;
5703
    uint32_t pix1, pix2, pix3, pix4;
5704
 
5705
    while (w && ((uintptr_t)dst & 15))
5706
    {
5707
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5708
 
5709
	if (pix1)
5710
	{
5711
	    pix2 = *dst;
5712
	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5713
	}
5714
 
5715
	w--;
5716
	dst++;
5717
    }
5718
 
5719
    while (w  >= 4)
5720
    {
5721
	__m128i xmm_src;
5722
	__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
5723
	__m128i xmm_alpha_hi, xmm_alpha_lo;
5724
 
5725
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5726
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5727
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5728
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5729
 
5730
	xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
5731
 
5732
	if (!is_zero (xmm_src))
5733
	{
5734
	    if (is_opaque (xmm_src))
5735
	    {
5736
		save_128_aligned ((__m128i *)dst, xmm_src);
5737
	    }
5738
	    else
5739
	    {
5740
		__m128i xmm_dst = load_128_aligned ((__m128i *)dst);
5741
 
5742
		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5743
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5744
 
5745
		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5746
		over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
5747
			    &xmm_dst_lo, &xmm_dst_hi);
5748
 
5749
		save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5750
	    }
5751
	}
5752
 
5753
	w -= 4;
5754
	dst += 4;
5755
    }
5756
 
5757
    while (w)
5758
    {
5759
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5760
 
5761
	if (pix1)
5762
	{
5763
	    pix2 = *dst;
5764
	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5765
	}
5766
 
5767
	w--;
5768
	dst++;
5769
    }
5770
}
5771
 
5772
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
5773
			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
5774
			       uint32_t, uint32_t, uint32_t,
5775
			       COVER, FLAG_NONE)
5776
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
5777
			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
5778
			       uint32_t, uint32_t, uint32_t,
5779
			       PAD, FLAG_NONE)
5780
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
5781
			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
5782
			       uint32_t, uint32_t, uint32_t,
5783
			       NONE, FLAG_NONE)
5784
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
5785
			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
5786
			       uint32_t, uint32_t, uint32_t,
5787
			       NORMAL, FLAG_NONE)
5788
 
5789
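/* Bilinear-scaled OVER with an a8 mask: zero mask bytes skip the
 * interpolation entirely via BILINEAR_SKIP_ONE_PIXEL(). */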
static force_inline void
5790
scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
5791
						const uint8_t  * mask,
5792
						const uint32_t * src_top,
5793
						const uint32_t * src_bottom,
5794
						int32_t          w,
5795
						int              wt,
5796
						int              wb,
5797
						pixman_fixed_t   vx,
5798
						pixman_fixed_t   unit_x,
5799
						pixman_fixed_t   max_vx,
5800
						pixman_bool_t    zero_src)
5801
{
5802
    BILINEAR_DECLARE_VARIABLES;
5803
    uint32_t pix1, pix2, pix3, pix4;
5804
    uint32_t m;
5805
 
5806
    while (w && ((uintptr_t)dst & 15))
5807
    {
5808
	uint32_t sa;
5809
 
5810
	m = (uint32_t) *mask++;
5811
 
5812
	if (m)
5813
	{
5814
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5815
	    sa = pix1 >> 24;
5816
 
5817
	    if (sa == 0xff && m == 0xff)
5818
	    {
5819
		*dst = pix1;
5820
	    }
5821
	    else
5822
	    {
5823
		__m128i ms, md, ma, msa;
5824
 
5825
		pix2 = *dst;
5826
		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5827
		ms = unpack_32_1x128 (pix1);
5828
		md = unpack_32_1x128 (pix2);
5829
 
5830
		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5831
 
5832
		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5833
	    }
5834
	}
5835
	else
5836
	{
5837
	    BILINEAR_SKIP_ONE_PIXEL ();
5838
	}
5839
 
5840
	w--;
5841
	dst++;
5842
    }
5843
 
5844
    while (w >= 4)
5845
    {
5846
	__m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5847
	__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5848
	__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5849
 
5850
	m = *(uint32_t*)mask;
5851
 
5852
	if (m)
5853
	{
5854
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5855
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5856
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5857
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5858
 
5859
	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
5860
 
5861
	    if (m == 0xffffffff && is_opaque (xmm_src))
5862
	    {
5863
		save_128_aligned ((__m128i *)dst, xmm_src);
5864
	    }
5865
	    else
5866
	    {
5867
		xmm_dst = load_128_aligned ((__m128i *)dst);
5868
 
5869
		xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5870
 
5871
		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5872
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5873
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5874
 
5875
		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5876
		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5877
 
5878
		in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5879
			       &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5880
 
5881
		save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5882
	    }
5883
	}
5884
	else
5885
	{
5886
	    BILINEAR_SKIP_ONE_PIXEL ();
5887
	    BILINEAR_SKIP_ONE_PIXEL ();
5888
	    BILINEAR_SKIP_ONE_PIXEL ();
5889
	    BILINEAR_SKIP_ONE_PIXEL ();
5890
	}
5891
 
5892
	w -= 4;
5893
	dst += 4;
5894
	mask += 4;
5895
    }
5896
 
5897
    while (w)
5898
    {
5899
	uint32_t sa;
5900
 
5901
	m = (uint32_t) *mask++;
5902
 
5903
	if (m)
5904
	{
5905
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5906
	    sa = pix1 >> 24;
5907
 
5908
	    if (sa == 0xff && m == 0xff)
5909
	    {
5910
		*dst = pix1;
5911
	    }
5912
	    else
5913
	    {
5914
		__m128i ms, md, ma, msa;
5915
 
5916
		pix2 = *dst;
5917
		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5918
		ms = unpack_32_1x128 (pix1);
5919
		md = unpack_32_1x128 (pix2);
5920
 
5921
		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5922
 
5923
		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5924
	    }
5925
	}
5926
	else
5927
	{
5928
	    BILINEAR_SKIP_ONE_PIXEL ();
5929
	}
5930
 
5931
	w--;
5932
	dst++;
5933
    }
5934
}
5935
 
5936
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
5937
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
5938
			       uint32_t, uint8_t, uint32_t,
5939
			       COVER, FLAG_HAVE_NON_SOLID_MASK)
5940
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
5941
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
5942
			       uint32_t, uint8_t, uint32_t,
5943
			       PAD, FLAG_HAVE_NON_SOLID_MASK)
5944
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
5945
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
5946
			       uint32_t, uint8_t, uint32_t,
5947
			       NONE, FLAG_HAVE_NON_SOLID_MASK)
5948
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
5949
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
5950
			       uint32_t, uint8_t, uint32_t,
5951
			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
5952
 
5953
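/* Bilinear-scaled OVER with a solid mask, expanded once into xmm_mask;
 * a zero mask or a zero source makes the whole scanline a no-op. */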
static force_inline void
scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
						const uint32_t * mask,
						const uint32_t * src_top,
						const uint32_t * src_bottom,
						int32_t          w,
						int              wt,
						int              wb,
						pixman_fixed_t   vx,
						pixman_fixed_t   unit_x,
						pixman_fixed_t   max_vx,
						pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1, pix2, pix3, pix4;
    __m128i xmm_mask;
 
    if (zero_src || (*mask >> 24) == 0)
	return;
 
    xmm_mask = create_mask_16_128 (*mask >> 24);
 
    while (w && ((uintptr_t)dst & 15))
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	if (pix1)
	{
		uint32_t d = *dst;
 
		__m128i ms = unpack_32_1x128 (pix1);
		__m128i alpha     = expand_alpha_1x128 (ms);
		__m128i dest      = xmm_mask;
		__m128i alpha_dst = unpack_32_1x128 (d);
 
		*dst = pack_1x128_32
			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
	}
 
	dst++;
	w--;
    }
 
    while (w >= 4)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
 
	if (pix1 | pix2 | pix3 | pix4)
	{
	    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
	    __m128i xmm_alpha_lo, xmm_alpha_hi;
 
	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
 
	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
				&xmm_alpha_lo, &xmm_alpha_hi);
 
	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
			   &xmm_alpha_lo, &xmm_alpha_hi,
			   &xmm_mask, &xmm_mask,
			   &xmm_dst_lo, &xmm_dst_hi);
 
	    save_128_aligned
		((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	}
 
	dst += 4;
	w -= 4;
    }
 
    while (w)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	if (pix1)
	{
		uint32_t d = *dst;
 
		__m128i ms = unpack_32_1x128 (pix1);
		__m128i alpha     = expand_alpha_1x128 (ms);
		__m128i dest      = xmm_mask;
		__m128i alpha_dst = unpack_32_1x128 (d);
 
		*dst = pack_1x128_32
			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
	}
 
	dst++;
	w--;
    }
}
 
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_HAVE_SOLID_MASK)
 
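/*
 * Table mapping (operator, source, mask, destination) combinations to the
 * SSE2 composite routines defined above; terminated by PIXMAN_OP_NONE.
 */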
static const pixman_fast_path_t sse2_fast_paths[] =
{
    /* PIXMAN_OP_OVER */
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
 
    /* PIXMAN_OP_OVER_REVERSE */
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
 
    /* PIXMAN_OP_ADD */
    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
 
    /* PIXMAN_OP_SRC */
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
 
    /* PIXMAN_OP_IN */
    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
 
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
 
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
 
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
 
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
 
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
 
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
 
    { PIXMAN_OP_NONE },
};
 
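/*
 * Scanline fetcher for x8r8g8b8 sources: copies pixels into the iterator
 * buffer while forcing the unused alpha byte to 0xff, four pixels at a time
 * once the destination is 16-byte aligned.
 */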
static uint32_t *
sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    __m128i ff000000 = mask_ff000000;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;
 
    iter->bits += iter->stride;
 
    while (w && ((uintptr_t)dst) & 0x0f)
    {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }
 
    while (w >= 4)
    {
	save_128_aligned (
	    (__m128i *)dst, _mm_or_si128 (
		load_128_unaligned ((__m128i *)src), ff000000));
 
	dst += 4;
	src += 4;
	w -= 4;
    }
 
    while (w)
    {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }
 
    return iter->buffer;
}
 
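/*
 * Scanline fetcher for r5g6b5 sources: widens eight 16-bit pixels at a time
 * to a8r8g8b8 via unpack_565_to_8888 and sets the alpha byte to 0xff.
 */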
static uint32_t *
sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;
    __m128i ff000000 = mask_ff000000;
 
    iter->bits += iter->stride;
 
    while (w && ((uintptr_t)dst) & 0x0f)
    {
	uint16_t s = *src++;
 
	*dst++ = convert_0565_to_8888 (s);
	w--;
    }
 
    while (w >= 8)
    {
	__m128i lo, hi, s;
 
	s = _mm_loadu_si128 ((__m128i *)src);
 
	lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
	hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
 
	save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
	save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
 
	dst += 8;
	src += 8;
	w -= 8;
    }
 
    while (w)
    {
	uint16_t s = *src++;
 
	*dst++ = convert_0565_to_8888 (s);
	w--;
    }
 
    return iter->buffer;
}
 
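/*
 * Scanline fetcher for a8 sources: expands sixteen alpha bytes per iteration
 * into the top byte of 32-bit pixels by interleaving them with zeros.
 */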
static uint32_t *
sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
 
    iter->bits += iter->stride;
 
    while (w && (((uintptr_t)dst) & 15))
    {
        *dst++ = *(src++) << 24;
        w--;
    }
 
    while (w >= 16)
    {
	xmm0 = _mm_loadu_si128((__m128i *)src);
 
	xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
	xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
	xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
	xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
	xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
	xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
 
	_mm_store_si128(((__m128i *)(dst +  0)), xmm3);
	_mm_store_si128(((__m128i *)(dst +  4)), xmm4);
	_mm_store_si128(((__m128i *)(dst +  8)), xmm5);
	_mm_store_si128(((__m128i *)(dst + 12)), xmm6);
 
	dst += 16;
	src += 16;
	w -= 16;
    }
 
    while (w)
    {
	*dst++ = *(src++) << 24;
	w--;
    }
 
    return iter->buffer;
}
 
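/* Associates a source format with its SSE2 scanline fetcher. */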
typedef struct
{
    pixman_format_code_t	format;
    pixman_iter_get_scanline_t	get_scanline;
} fetcher_info_t;
 
static const fetcher_info_t fetchers[] =
{
    { PIXMAN_x8r8g8b8,		sse2_fetch_x8r8g8b8 },
    { PIXMAN_r5g6b5,		sse2_fetch_r5g6b5 },
    { PIXMAN_a8,		sse2_fetch_a8 },
    { PIXMAN_null }
};
 
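/*
 * Install one of the fetchers above only for narrow iterators over plain,
 * untransformed bits images whose samples cover the clip; in all other
 * cases return FALSE so the fallback implementation is used.
 */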
static pixman_bool_t
sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
{
    pixman_image_t *image = iter->image;
 
#define FLAGS								\
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
 
    if ((iter->iter_flags & ITER_NARROW)			&&
	(iter->image_flags & FLAGS) == FLAGS)
    {
	const fetcher_info_t *f;
 
	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
	{
	    if (image->common.extended_format_code == f->format)
	    {
		uint8_t *b = (uint8_t *)image->bits.bits;
		int s = image->bits.rowstride * 4;
 
		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
		iter->stride = s;
 
		iter->get_scanline = f->get_scanline;
		return TRUE;
	    }
	}
    }
 
    return FALSE;
}
 
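/*
 * On 32-bit GCC the incoming stack may only be 4-byte aligned, so ask the
 * compiler to realign it before this entry point uses 16-byte SSE2 data.
 */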
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
pixman_implementation_t *
_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
 
    /* SSE2 constants */
    mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
    mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
    mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
 
    /* Set up function pointers */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
 
    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
 
    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
 
    imp->blt = sse2_blt;
    imp->fill = sse2_fill;
 
    imp->src_iter_init = sse2_src_iter_init;
 
    return imp;
}