Subversion Repositories Kolibri OS

Rev

Go to most recent revision | Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
1891 serge 1
/*
2
 * Copyright © 2004, 2005 Red Hat, Inc.
3
 * Copyright © 2004 Nicholas Miell
4
 * Copyright © 2005 Trolltech AS
5
 *
6
 * Permission to use, copy, modify, distribute, and sell this software and its
7
 * documentation for any purpose is hereby granted without fee, provided that
8
 * the above copyright notice appear in all copies and that both that
9
 * copyright notice and this permission notice appear in supporting
10
 * documentation, and that the name of Red Hat not be used in advertising or
11
 * publicity pertaining to distribution of the software without specific,
12
 * written prior permission.  Red Hat makes no representations about the
13
 * suitability of this software for any purpose.  It is provided "as is"
14
 * without express or implied warranty.
15
 *
16
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
17
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
18
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
19
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
20
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
21
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
22
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
23
 * SOFTWARE.
24
 *
25
 * Author:  Søren Sandmann (sandmann@redhat.com)
26
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
27
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
28
 *
29
 * Based on work by Owen Taylor
30
 */
31
 
32
#ifdef HAVE_CONFIG_H
33
#include <config.h>
34
#endif
35
 
36
#ifdef USE_MMX
37
 
38
#include <mmintrin.h>
39
#include "pixman-private.h"
40
#include "pixman-combine32.h"
41
 
42
#define no_vERBOSE
43
 
44
#ifdef VERBOSE
45
#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
46
#else
47
#define CHECKPOINT()
48
#endif
49
 
50
/* Notes about writing mmx code
51
 *
52
 * give memory operands as the second operand. If you give it as the
53
 * first, gcc will first load it into a register, then use that
54
 * register
55
 *
56
 *   ie. use
57
 *
58
 *         _mm_mullo_pi16 (x, mmx_constant);
59
 *
60
 *   not
61
 *
62
 *         _mm_mullo_pi16 (mmx_constant, x);
63
 *
64
 * Also try to minimize dependencies. i.e. when you need a value, try
65
 * to calculate it from a value that was calculated as early as
66
 * possible.
67
 */
68
 
69
/* --------------- MMX primitives ------------------------------------- */
70
 
71
#ifdef __GNUC__
72
typedef uint64_t mmxdatafield;
73
#else
74
typedef __m64 mmxdatafield;
75
/* If __m64 is defined as a struct or union, define M64_MEMBER to be the
76
   name of the member used to access the data */
77
# ifdef _MSC_VER
78
#  define M64_MEMBER m64_u64
79
# elif defined(__SUNPRO_C)
80
#  define M64_MEMBER l_
81
# endif
82
#endif
83
 
84
typedef struct
85
{
86
    mmxdatafield mmx_4x00ff;
87
    mmxdatafield mmx_4x0080;
88
    mmxdatafield mmx_565_rgb;
89
    mmxdatafield mmx_565_unpack_multiplier;
90
    mmxdatafield mmx_565_r;
91
    mmxdatafield mmx_565_g;
92
    mmxdatafield mmx_565_b;
93
    mmxdatafield mmx_mask_0;
94
    mmxdatafield mmx_mask_1;
95
    mmxdatafield mmx_mask_2;
96
    mmxdatafield mmx_mask_3;
97
    mmxdatafield mmx_full_alpha;
98
    mmxdatafield mmx_ffff0000ffff0000;
99
    mmxdatafield mmx_0000ffff00000000;
100
    mmxdatafield mmx_000000000000ffff;
101
} mmx_data_t;
102
 
103
#if defined(_MSC_VER)
104
# define MMXDATA_INIT(field, val) { val ## UI64 }
105
#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
106
# define MMXDATA_INIT(field, val) field =   { val ## ULL }
107
#else                           /* __m64 is an integral type */
108
# define MMXDATA_INIT(field, val) field =   val ## ULL
109
#endif
110
 
111
static const mmx_data_t c =
112
{
113
    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
114
    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
115
    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
116
    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
117
    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
118
    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
119
    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
120
    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
121
    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
122
    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
123
    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
124
    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
125
    MMXDATA_INIT (.mmx_ffff0000ffff0000,         0xffff0000ffff0000),
126
    MMXDATA_INIT (.mmx_0000ffff00000000,         0x0000ffff00000000),
127
    MMXDATA_INIT (.mmx_000000000000ffff,         0x000000000000ffff),
128
};
129
 
130
#ifdef __GNUC__
131
#    ifdef __ICC
132
#        define MC(x) to_m64 (c.mmx_ ## x)
133
#    else
134
#        define MC(x) ((__m64)c.mmx_ ## x)
135
#    endif
136
#else
137
#    define MC(x) c.mmx_ ## x
138
#endif
139
 
140
/* Convert a plain 64-bit integer into an __m64, coping with the three
 * ways compilers model __m64: ICC needs the dedicated conversion
 * intrinsic, some compilers make it a struct/union (M64_MEMBER), and
 * GCC treats it as an integral type that can simply be cast. */
static force_inline __m64
to_m64 (uint64_t x)
{
#ifdef __ICC
    return _mm_cvtsi64_m64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    __m64 res;

    res.M64_MEMBER = x;
    return res;
#else                           /* __m64 is an integral type */
    return (__m64)x;
#endif
}
154
 
155
/* Inverse of to_m64: extract the raw 64 bits of an __m64 value,
 * using whichever representation the compiler gives __m64. */
static force_inline uint64_t
to_uint64 (__m64 x)
{
#ifdef __ICC
    return _mm_cvtm64_si64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    uint64_t res = x.M64_MEMBER;
    return res;
#else                           /* __m64 is an integral type */
    return (uint64_t)x;
#endif
}
167
 
168
static force_inline __m64
169
shift (__m64 v,
170
       int   s)
171
{
172
    if (s > 0)
173
	return _mm_slli_si64 (v, s);
174
    else if (s < 0)
175
	return _mm_srli_si64 (v, -s);
176
    else
177
	return v;
178
}
179
 
180
/* Per-channel complement: returns 0xff - c for each of the four
 * 16-bit-expanded 8-bit channels (XOR with 0x00ff in every word). */
static force_inline __m64
negate (__m64 mask)
{
    return _mm_xor_si64 (mask, MC (4x00ff));
}
185
 
186
/* Per-channel multiply of two expanded pixels: approximates
 * (a * b) / 255 per channel using the rounding identity
 * t = a*b + 0x80; result = (t + (t >> 8)) >> 8, with saturating adds. */
static force_inline __m64
pix_multiply (__m64 a, __m64 b)
{
    __m64 res;

    res = _mm_mullo_pi16 (a, b);
    res = _mm_adds_pu16 (res, MC (4x0080));
    res = _mm_adds_pu16 (res, _mm_srli_pi16 (res, 8));
    res = _mm_srli_pi16 (res, 8);

    return res;
}
198
 
199
/* Per-byte saturating add of two packed pixels. */
static force_inline __m64
pix_add (__m64 a, __m64 b)
{
    return _mm_adds_pu8 (a, b);
}
204
 
205
/* Broadcast the alpha channel (the top 16-bit word) of an expanded
 * pixel into all four words, by shifting it down and then OR-doubling
 * it across the register. */
static force_inline __m64
expand_alpha (__m64 pixel)
{
    __m64 t1, t2;

    t1 = shift (pixel, -48);
    t2 = shift (t1, 16);
    t1 = _mm_or_si64 (t1, t2);
    t2 = shift (t1, 32);
    t1 = _mm_or_si64 (t1, t2);

    return t1;
}
218
 
219
/* Like expand_alpha, but the alpha value lives in the LOW 16-bit word
 * of the expanded pixel; broadcast it into all four words. */
static force_inline __m64
expand_alpha_rev (__m64 pixel)
{
    __m64 t1, t2;

    /* move alpha to low 16 bits and zero the rest */
    t1 = shift (pixel,  48);
    t1 = shift (t1, -48);

    /* OR-double the low word across the register */
    t2 = shift (t1, 16);
    t1 = _mm_or_si64 (t1, t2);
    t2 = shift (t1, 32);
    t1 = _mm_or_si64 (t1, t2);

    return t1;
}
235
 
236
/* Swap the low (word 0) and third (word 2) 16-bit channels of an
 * expanded pixel, leaving words 1 and 3 in place — i.e. exchange the
 * red and blue channels of an expanded ARGB pixel. */
static force_inline __m64
invert_colors (__m64 pixel)
{
    __m64 x, y, z;

    x = y = z = pixel;

    /* x keeps words 3 and 1; y isolates word 0; z isolates word 2 */
    x = _mm_and_si64 (x, MC (ffff0000ffff0000));
    y = _mm_and_si64 (y, MC (000000000000ffff));
    z = _mm_and_si64 (z, MC (0000ffff00000000));

    /* move word 0 up to position 2, word 2 down to position 0 */
    y = shift (y, 32);
    z = shift (z, -32);

    x = _mm_or_si64 (x, y);
    x = _mm_or_si64 (x, z);

    return x;
}
255
 
256
/* Porter-Duff OVER for premultiplied expanded pixels:
 * result = src + dest * (255 - srca), with per-byte saturation. */
static force_inline __m64
over (__m64 src,
      __m64 srca,
      __m64 dest)
{
    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
}
263
 
264
/* OVER for a non-premultiplied, channel-swapped source: swap R/B,
 * premultiply the color channels by alpha (with alpha itself forced
 * to 0xff so it survives the multiply), then do a normal OVER. */
static force_inline __m64
over_rev_non_pre (__m64 src, __m64 dest)
{
    __m64 srca = expand_alpha (src);
    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));

    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
}
272
 
273
/* Porter-Duff IN: per-channel multiply of src by mask. */
static force_inline __m64
in (__m64 src, __m64 mask)
{
    return pix_multiply (src, mask);
}
278
 
279
/* in_over specialised for sources whose alpha is known to be 0xff:
 * force the alpha word on, then (src IN mask) OVER dest using the
 * mask itself as the source alpha. */
static force_inline __m64
in_over_full_src_alpha (__m64 src, __m64 mask, __m64 dest)
{
    src = _mm_or_si64 (src, MC (full_alpha));

    return over (in (src, mask), mask, dest);
}
286
 
287
#ifndef _MSC_VER
/* (src IN mask) OVER dest, with the source alpha also masked:
 * result = src*mask + dest * (255 - srca*mask). */
static force_inline __m64
in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
{
    return over (in (src, mask), pix_multiply (srca, mask), dest);
}

#else

/* MSVC cannot force-inline the function form, so use a macro. */
#define in_over(src, srca, mask, dest)					\
    over (in (src, mask), pix_multiply (srca, mask), dest)

#endif
300
 
301
/* Load a packed a8r8g8b8 pixel and widen each 8-bit channel to a
 * 16-bit word (interleave with zero). */
static force_inline __m64
load8888 (uint32_t v)
{
    return _mm_unpacklo_pi8 (_mm_cvtsi32_si64 (v), _mm_setzero_si64 ());
}
306
 
307
/* Pack two expanded pixels back to packed 8-bit channels with
 * unsigned saturation; @lo becomes the low 32 bits. */
static force_inline __m64
pack8888 (__m64 lo, __m64 hi)
{
    return _mm_packs_pu16 (lo, hi);
}
312
 
313
/* Pack one expanded pixel and return it as a packed a8r8g8b8 value. */
static force_inline uint32_t
store8888 (__m64 v)
{
    return _mm_cvtsi64_si32 (pack8888 (v, _mm_setzero_si64 ()));
}
318
 
319
/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
320
 *
321
 *    00RR00GG00BB
322
 *
323
 * --- Expanding 565 in the low word ---
324
 *
325
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
326
 * m = m & (01f0003f001f);
327
 * m = m * (008404100840);
328
 * m = m >> 8;
329
 *
330
 * Note the trick here - the top word is shifted by another nibble to
331
 * avoid it bumping into the middle word
332
 */
333
/* Expand the r5g6b5 pixel held in 16-bit word @pos (0-3) of @pixel
 * into 00RR00GG00BB form; see the block comment above for the
 * shift/mask/multiply trick used to scale 5/6-bit fields to 8 bits. */
static force_inline __m64
expand565 (__m64 pixel, int pos)
{
    __m64 p = pixel;
    __m64 t1, t2;

    /* move pixel to low 16 bit and zero the rest */
    p = shift (shift (p, (3 - pos) * 16), -48);

    /* spread r into the high word (extra nibble avoids overlap with g) */
    t1 = shift (p, 36 - 11);
    t2 = shift (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, MC (565_rgb));

    /* scale each field up to 8 bits in one multiply */
    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
    return _mm_srli_pi16 (pixel, 8);
}
352
 
353
static force_inline __m64
354
expand8888 (__m64 in, int pos)
355
{
356
    if (pos == 0)
357
	return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
358
    else
359
	return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
360
}
361
 
362
/* Like expand8888, but for x8r8g8b8: force the alpha word to 0xff. */
static force_inline __m64
expandx888 (__m64 in, int pos)
{
    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
}
367
 
368
static force_inline __m64
369
pack_565 (__m64 pixel, __m64 target, int pos)
370
{
371
    __m64 p = pixel;
372
    __m64 t = target;
373
    __m64 r, g, b;
374
 
375
    r = _mm_and_si64 (p, MC (565_r));
376
    g = _mm_and_si64 (p, MC (565_g));
377
    b = _mm_and_si64 (p, MC (565_b));
378
 
379
    r = shift (r, -(32 - 8) + pos * 16);
380
    g = shift (g, -(16 - 3) + pos * 16);
381
    b = shift (b, -(0  + 3) + pos * 16);
382
 
383
    if (pos == 0)
384
	t = _mm_and_si64 (t, MC (mask_0));
385
    else if (pos == 1)
386
	t = _mm_and_si64 (t, MC (mask_1));
387
    else if (pos == 2)
388
	t = _mm_and_si64 (t, MC (mask_2));
389
    else if (pos == 3)
390
	t = _mm_and_si64 (t, MC (mask_3));
391
 
392
    p = _mm_or_si64 (r, t);
393
    p = _mm_or_si64 (g, p);
394
 
395
    return _mm_or_si64 (b, p);
396
}
397
 
398
#ifndef _MSC_VER
399
 
400
static force_inline __m64
401
pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
402
{
403
    x = pix_multiply (x, a);
404
    y = pix_multiply (y, b);
405
 
406
    return pix_add (x, y);
407
}
408
 
409
#else
410
 
411
#define pix_add_mul(x, a, y, b)	 \
412
    ( x = pix_multiply (x, a),	 \
413
      y = pix_multiply (y, a),	 \
414
      pix_add (x, y) )
415
 
416
#endif
417
 
418
/* --------------- MMX code patch for fbcompose.c --------------------- */
419
 
420
/* Apply the (optional) mask to one source pixel: when @mask is
 * non-NULL, multiply the source by the mask's alpha; otherwise return
 * the source unchanged.  Used by all *_u combiners below. */
static force_inline uint32_t
combine (const uint32_t *src, const uint32_t *mask)
{
    uint32_t ssrc = *src;

    if (mask)
    {
	__m64 m = load8888 (*mask);
	__m64 s = load8888 (ssrc);

	m = expand_alpha (m);
	s = pix_multiply (s, m);

	ssrc = store8888 (s);
    }

    return ssrc;
}
438
 
439
/* OVER combiner (unified alpha): dest = src OVER dest for @width
 * pixels, with fast paths for fully opaque and fully transparent
 * source pixels. */
static void
mmx_combine_over_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	uint32_t ssrc = combine (src, mask);
	uint32_t a = ssrc >> 24;

	if (a == 0xff)
	{
	    /* opaque source replaces dest outright */
	    *dest = ssrc;
	}
	else if (ssrc)
	{
	    /* zero source leaves dest untouched */
	    __m64 s, sa;
	    s = load8888 (ssrc);
	    sa = expand_alpha (s);
	    *dest = store8888 (over (s, sa, load8888 (*dest)));
	}

	++dest;
	++src;
	if (mask)
	    ++mask;
    }
    _mm_empty ();
}
473
 
474
static void
475
mmx_combine_over_reverse_u (pixman_implementation_t *imp,
476
                            pixman_op_t              op,
477
                            uint32_t *               dest,
478
                            const uint32_t *         src,
479
                            const uint32_t *         mask,
480
                            int                      width)
481
{
482
    const uint32_t *end = dest + width;
483
 
484
    while (dest < end)
485
    {
486
	__m64 d, da;
487
	uint32_t s = combine (src, mask);
488
 
489
	d = load8888 (*dest);
490
	da = expand_alpha (d);
491
	*dest = store8888 (over (d, da, load8888 (s)));
492
 
493
	++dest;
494
	++src;
495
	if (mask)
496
	    mask++;
497
    }
498
    _mm_empty ();
499
}
500
 
501
/* IN combiner (unified alpha): dest = src * dest.alpha. */
static void
mmx_combine_in_u (pixman_implementation_t *imp,
                  pixman_op_t              op,
                  uint32_t *               dest,
                  const uint32_t *         src,
                  const uint32_t *         mask,
                  int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 x, a;

	x = load8888 (combine (src, mask));
	a = load8888 (*dest);
	a = expand_alpha (a);
	x = pix_multiply (x, a);

	*dest = store8888 (x);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}
529
 
530
/* IN_REVERSE combiner (unified alpha): dest = dest * src.alpha. */
static void
mmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          pixman_op_t              op,
                          uint32_t *               dest,
                          const uint32_t *         src,
                          const uint32_t *         mask,
                          int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 x, a;

	x = load8888 (*dest);
	a = load8888 (combine (src, mask));
	a = expand_alpha (a);
	x = pix_multiply (x, a);
	*dest = store8888 (x);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}
557
 
558
/* OUT combiner (unified alpha): dest = src * (255 - dest.alpha). */
static void
mmx_combine_out_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 x, a;

	x = load8888 (combine (src, mask));
	a = load8888 (*dest);
	a = expand_alpha (a);
	a = negate (a);
	x = pix_multiply (x, a);
	*dest = store8888 (x);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}
586
 
587
/* OUT_REVERSE combiner (unified alpha): dest = dest * (255 - src.alpha). */
static void
mmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 x, a;

	x = load8888 (*dest);
	a = load8888 (combine (src, mask));
	a = expand_alpha (a);
	a = negate (a);
	x = pix_multiply (x, a);

	*dest = store8888 (x);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}
616
 
617
/* ATOP combiner (unified alpha):
 * dest = src * dest.alpha + dest * (255 - src.alpha). */
static void
mmx_combine_atop_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 s, da, d, sia;

	s = load8888 (combine (src, mask));
	d = load8888 (*dest);
	sia = expand_alpha (s);
	sia = negate (sia);
	da = expand_alpha (d);
	s = pix_add_mul (s, da, d, sia);
	*dest = store8888 (s);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}
646
 
647
static void
648
mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
649
                            pixman_op_t              op,
650
                            uint32_t *               dest,
651
                            const uint32_t *         src,
652
                            const uint32_t *         mask,
653
                            int                      width)
654
{
655
    const uint32_t *end;
656
 
657
    end = dest + width;
658
 
659
    while (dest < end)
660
    {
661
	__m64 s, dia, d, sa;
662
 
663
	s = load8888 (combine (src, mask));
664
	d = load8888 (*dest);
665
	sa = expand_alpha (s);
666
	dia = expand_alpha (d);
667
	dia = negate (dia);
668
	s = pix_add_mul (s, dia, d, sa);
669
	*dest = store8888 (s);
670
 
671
	++dest;
672
	++src;
673
	if (mask)
674
	    mask++;
675
    }
676
    _mm_empty ();
677
}
678
 
679
/* XOR combiner (unified alpha):
 * dest = src * (255 - dest.alpha) + dest * (255 - src.alpha). */
static void
mmx_combine_xor_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 s, dia, d, sia;

	s = load8888 (combine (src, mask));
	d = load8888 (*dest);
	sia = expand_alpha (s);
	dia = expand_alpha (d);
	sia = negate (sia);
	dia = negate (dia);
	s = pix_add_mul (s, dia, d, sia);
	*dest = store8888 (s);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}
709
 
710
/* ADD combiner (unified alpha): per-byte saturating dest += src. */
static void
mmx_combine_add_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 s, d;

	s = load8888 (combine (src, mask));
	d = load8888 (*dest);
	s = pix_add (s, d);
	*dest = store8888 (s);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}
736
 
737
/* SATURATE combiner (unified alpha): add src to dest, but when the
 * source alpha exceeds the room left in dest (255 - dest.alpha),
 * first scale the source down by da/sa so the alpha sum cannot
 * overflow. */
static void
mmx_combine_saturate_u (pixman_implementation_t *imp,
                        pixman_op_t              op,
                        uint32_t *               dest,
                        const uint32_t *         src,
                        const uint32_t *         mask,
                        int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	uint32_t s = combine (src, mask);
	uint32_t d = *dest;
	__m64 ms = load8888 (s);
	__m64 md = load8888 (d);
	uint32_t sa = s >> 24;
	uint32_t da = ~d >> 24;	/* remaining headroom: 255 - dest.alpha */

	if (sa > da)
	{
	    /* scale source by da/sa (DIV_UN8 from pixman-combine32.h) */
	    __m64 msa = load8888 (DIV_UN8 (da, sa) << 24);
	    msa = expand_alpha (msa);
	    ms = pix_multiply (ms, msa);
	}

	md = pix_add (md, ms);
	*dest = store8888 (md);

	++src;
	++dest;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}
773
 
774
/* SRC combiner (component alpha): dest = src * mask, per channel. */
static void
mmx_combine_src_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (*mask);
	__m64 s = load8888 (*src);

	s = pix_multiply (s, a);
	*dest = store8888 (s);

	++src;
	++mask;
	++dest;
    }
    _mm_empty ();
}
798
 
799
/* OVER combiner (component alpha):
 * dest = src*mask + dest * (255 - src.alpha*mask), per channel. */
static void
mmx_combine_over_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (*mask);
	__m64 s = load8888 (*src);
	__m64 d = load8888 (*dest);
	__m64 sa = expand_alpha (s);

	*dest = store8888 (in_over (s, sa, a, d));

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
824
 
825
/* OVER_REVERSE combiner (component alpha):
 * dest = dest OVER (src IN mask). */
static void
mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (*mask);
	__m64 s = load8888 (*src);
	__m64 d = load8888 (*dest);
	__m64 da = expand_alpha (d);

	*dest = store8888 (over (d, da, in (s, a)));

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
850
 
851
/* IN combiner (component alpha): dest = src * mask * dest.alpha. */
static void
mmx_combine_in_ca (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (*mask);
	__m64 s = load8888 (*src);
	__m64 d = load8888 (*dest);
	__m64 da = expand_alpha (d);

	s = pix_multiply (s, a);
	s = pix_multiply (s, da);
	*dest = store8888 (s);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
878
 
879
/* IN_REVERSE combiner (component alpha):
 * dest = dest * (mask * src.alpha). */
static void
mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (*mask);
	__m64 s = load8888 (*src);
	__m64 d = load8888 (*dest);
	__m64 sa = expand_alpha (s);

	a = pix_multiply (a, sa);
	d = pix_multiply (d, a);
	*dest = store8888 (d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
906
 
907
/* OUT combiner (component alpha):
 * dest = src * mask * (255 - dest.alpha). */
static void
mmx_combine_out_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (*mask);
	__m64 s = load8888 (*src);
	__m64 d = load8888 (*dest);
	__m64 da = expand_alpha (d);

	da = negate (da);
	s = pix_multiply (s, a);
	s = pix_multiply (s, da);
	*dest = store8888 (s);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
935
 
936
/* OUT_REVERSE combiner (component alpha):
 * dest = dest * (255 - mask * src.alpha). */
static void
mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (*mask);
	__m64 s = load8888 (*src);
	__m64 d = load8888 (*dest);
	__m64 sa = expand_alpha (s);

	a = pix_multiply (a, sa);
	a = negate (a);
	d = pix_multiply (d, a);
	*dest = store8888 (d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
964
 
965
/* ATOP combiner (component alpha):
 * dest = src*mask*dest.alpha + dest * (255 - mask*src.alpha). */
static void
mmx_combine_atop_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (*mask);
	__m64 s = load8888 (*src);
	__m64 d = load8888 (*dest);
	__m64 da = expand_alpha (d);
	__m64 sa = expand_alpha (s);

	s = pix_multiply (s, a);
	a = pix_multiply (a, sa);
	a = negate (a);
	d = pix_add_mul (d, a, s, da);
	*dest = store8888 (d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
995
 
996
/* ATOP_REVERSE combiner (component alpha):
 * dest = src*mask*(255 - dest.alpha) + dest * mask*src.alpha. */
static void
mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (*mask);
	__m64 s = load8888 (*src);
	__m64 d = load8888 (*dest);
	__m64 da = expand_alpha (d);
	__m64 sa = expand_alpha (s);

	s = pix_multiply (s, a);
	a = pix_multiply (a, sa);
	da = negate (da);
	d = pix_add_mul (d, a, s, da);
	*dest = store8888 (d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
1026
 
1027
/* XOR combiner (component alpha):
 * dest = src*mask*(255 - dest.alpha) + dest*(255 - mask*src.alpha). */
static void
mmx_combine_xor_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (*mask);
	__m64 s = load8888 (*src);
	__m64 d = load8888 (*dest);
	__m64 da = expand_alpha (d);
	__m64 sa = expand_alpha (s);

	s = pix_multiply (s, a);
	a = pix_multiply (a, sa);
	da = negate (da);
	a = negate (a);
	d = pix_add_mul (d, a, s, da);
	*dest = store8888 (d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
1058
 
1059
/* ADD combiner (component alpha): dest = saturate (src*mask + dest). */
static void
mmx_combine_add_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (*mask);
	__m64 s = load8888 (*src);
	__m64 d = load8888 (*dest);

	s = pix_multiply (s, a);
	d = pix_add (s, d);
	*dest = store8888 (d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
1085
 
1086
/* ------------- MMX code paths called from fbpict.c -------------------- */
1087
 
1088
/* Composite a solid colour OVER an a8r8g8b8 destination rectangle.
 * Processes each scanline in three phases: single pixels until the
 * destination is 8-byte aligned, then two pixels per MMX register,
 * then the remaining tail pixels. */
static void
mmx_composite_over_n_8888 (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           pixman_image_t *         src_image,
                           pixman_image_t *         mask_image,
                           pixman_image_t *         dst_image,
                           int32_t                  src_x,
                           int32_t                  src_y,
                           int32_t                  mask_x,
                           int32_t                  mask_y,
                           int32_t                  dest_x,
                           int32_t                  dest_y,
                           int32_t                  width,
                           int32_t                  height)
{
    uint32_t src;
    uint32_t    *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (src_image, dst_image->bits.format);

    /* a fully transparent solid source leaves the destination untouched */
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    /* expand the solid colour and its alpha once, outside the loops */
    vsrc = load8888 (src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	CHECKPOINT ();

	/* head: single pixels until dst is 8-byte aligned */
	while (w && (unsigned long)dst & 7)
	{
	    *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));

	    w--;
	    dst++;
	}

	/* body: two pixels at a time from one 64-bit load/store */
	while (w >= 2)
	{
	    __m64 vdest;
	    __m64 dest0, dest1;

	    vdest = *(__m64 *)dst;

	    dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
	    dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));

	    *(__m64 *)dst = pack8888 (dest0, dest1);

	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	/* tail: at most one remaining pixel */
	while (w)
	{
	    *dst = store8888 (over (vsrc, vsrca, load8888 (*dst)));

	    w--;
	    dst++;
	}
    }

    _mm_empty ();
}
1166
 
1167
static void
1168
mmx_composite_over_n_0565 (pixman_implementation_t *imp,
1169
                           pixman_op_t              op,
1170
                           pixman_image_t *         src_image,
1171
                           pixman_image_t *         mask_image,
1172
                           pixman_image_t *         dst_image,
1173
                           int32_t                  src_x,
1174
                           int32_t                  src_y,
1175
                           int32_t                  mask_x,
1176
                           int32_t                  mask_y,
1177
                           int32_t                  dest_x,
1178
                           int32_t                  dest_y,
1179
                           int32_t                  width,
1180
                           int32_t                  height)
1181
{
1182
    uint32_t src;
1183
    uint16_t    *dst_line, *dst;
1184
    int32_t w;
1185
    int dst_stride;
1186
    __m64 vsrc, vsrca;
1187
 
1188
    CHECKPOINT ();
1189
 
1190
    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1191
 
1192
    if (src == 0)
1193
	return;
1194
 
1195
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1196
 
1197
    vsrc = load8888 (src);
1198
    vsrca = expand_alpha (vsrc);
1199
 
1200
    while (height--)
1201
    {
1202
	dst = dst_line;
1203
	dst_line += dst_stride;
1204
	w = width;
1205
 
1206
	CHECKPOINT ();
1207
 
1208
	while (w && (unsigned long)dst & 7)
1209
	{
1210
	    uint64_t d = *dst;
1211
	    __m64 vdest = expand565 (to_m64 (d), 0);
1212
 
1213
	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1214
	    *dst = to_uint64 (vdest);
1215
 
1216
	    w--;
1217
	    dst++;
1218
	}
1219
 
1220
	while (w >= 4)
1221
	{
1222
	    __m64 vdest;
1223
 
1224
	    vdest = *(__m64 *)dst;
1225
 
1226
	    vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 0)), vdest, 0);
1227
	    vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 1)), vdest, 1);
1228
	    vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 2)), vdest, 2);
1229
	    vdest = pack_565 (over (vsrc, vsrca, expand565 (vdest, 3)), vdest, 3);
1230
 
1231
	    *(__m64 *)dst = vdest;
1232
 
1233
	    dst += 4;
1234
	    w -= 4;
1235
	}
1236
 
1237
	CHECKPOINT ();
1238
 
1239
	while (w)
1240
	{
1241
	    uint64_t d = *dst;
1242
	    __m64 vdest = expand565 (to_m64 (d), 0);
1243
 
1244
	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
1245
	    *dst = to_uint64 (vdest);
1246
 
1247
	    w--;
1248
	    dst++;
1249
	}
1250
    }
1251
 
1252
    _mm_empty ();
1253
}
1254
 
1255
static void
1256
mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
1257
                                   pixman_op_t              op,
1258
                                   pixman_image_t *         src_image,
1259
                                   pixman_image_t *         mask_image,
1260
                                   pixman_image_t *         dst_image,
1261
                                   int32_t                  src_x,
1262
                                   int32_t                  src_y,
1263
                                   int32_t                  mask_x,
1264
                                   int32_t                  mask_y,
1265
                                   int32_t                  dest_x,
1266
                                   int32_t                  dest_y,
1267
                                   int32_t                  width,
1268
                                   int32_t                  height)
1269
{
1270
    uint32_t src, srca;
1271
    uint32_t    *dst_line;
1272
    uint32_t    *mask_line;
1273
    int dst_stride, mask_stride;
1274
    __m64 vsrc, vsrca;
1275
 
1276
    CHECKPOINT ();
1277
 
1278
    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1279
 
1280
    srca = src >> 24;
1281
    if (src == 0)
1282
	return;
1283
 
1284
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1285
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
1286
 
1287
    vsrc = load8888 (src);
1288
    vsrca = expand_alpha (vsrc);
1289
 
1290
    while (height--)
1291
    {
1292
	int twidth = width;
1293
	uint32_t *p = (uint32_t *)mask_line;
1294
	uint32_t *q = (uint32_t *)dst_line;
1295
 
1296
	while (twidth && (unsigned long)q & 7)
1297
	{
1298
	    uint32_t m = *(uint32_t *)p;
1299
 
1300
	    if (m)
1301
	    {
1302
		__m64 vdest = load8888 (*q);
1303
		vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
1304
		*q = store8888 (vdest);
1305
	    }
1306
 
1307
	    twidth--;
1308
	    p++;
1309
	    q++;
1310
	}
1311
 
1312
	while (twidth >= 2)
1313
	{
1314
	    uint32_t m0, m1;
1315
	    m0 = *p;
1316
	    m1 = *(p + 1);
1317
 
1318
	    if (m0 | m1)
1319
	    {
1320
		__m64 dest0, dest1;
1321
		__m64 vdest = *(__m64 *)q;
1322
 
1323
		dest0 = in_over (vsrc, vsrca, load8888 (m0),
1324
		                 expand8888 (vdest, 0));
1325
		dest1 = in_over (vsrc, vsrca, load8888 (m1),
1326
		                 expand8888 (vdest, 1));
1327
 
1328
		*(__m64 *)q = pack8888 (dest0, dest1);
1329
	    }
1330
 
1331
	    p += 2;
1332
	    q += 2;
1333
	    twidth -= 2;
1334
	}
1335
 
1336
	while (twidth)
1337
	{
1338
	    uint32_t m = *(uint32_t *)p;
1339
 
1340
	    if (m)
1341
	    {
1342
		__m64 vdest = load8888 (*q);
1343
		vdest = in_over (vsrc, vsrca, load8888 (m), vdest);
1344
		*q = store8888 (vdest);
1345
	    }
1346
 
1347
	    twidth--;
1348
	    p++;
1349
	    q++;
1350
	}
1351
 
1352
	dst_line += dst_stride;
1353
	mask_line += mask_stride;
1354
    }
1355
 
1356
    _mm_empty ();
1357
}
1358
 
1359
static void
1360
mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
1361
                                pixman_op_t              op,
1362
                                pixman_image_t *         src_image,
1363
                                pixman_image_t *         mask_image,
1364
                                pixman_image_t *         dst_image,
1365
                                int32_t                  src_x,
1366
                                int32_t                  src_y,
1367
                                int32_t                  mask_x,
1368
                                int32_t                  mask_y,
1369
                                int32_t                  dest_x,
1370
                                int32_t                  dest_y,
1371
                                int32_t                  width,
1372
                                int32_t                  height)
1373
{
1374
    uint32_t    *dst_line, *dst;
1375
    uint32_t    *src_line, *src;
1376
    uint32_t mask;
1377
    __m64 vmask;
1378
    int dst_stride, src_stride;
1379
    int32_t w;
1380
    __m64 srca;
1381
 
1382
    CHECKPOINT ();
1383
 
1384
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1385
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1386
 
1387
    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
1388
    mask &= 0xff000000;
1389
    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
1390
    vmask = load8888 (mask);
1391
    srca = MC (4x00ff);
1392
 
1393
    while (height--)
1394
    {
1395
	dst = dst_line;
1396
	dst_line += dst_stride;
1397
	src = src_line;
1398
	src_line += src_stride;
1399
	w = width;
1400
 
1401
	while (w && (unsigned long)dst & 7)
1402
	{
1403
	    __m64 s = load8888 (*src);
1404
	    __m64 d = load8888 (*dst);
1405
 
1406
	    *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
1407
 
1408
	    w--;
1409
	    dst++;
1410
	    src++;
1411
	}
1412
 
1413
	while (w >= 2)
1414
	{
1415
	    __m64 vs = *(__m64 *)src;
1416
	    __m64 vd = *(__m64 *)dst;
1417
	    __m64 vsrc0 = expand8888 (vs, 0);
1418
	    __m64 vsrc1 = expand8888 (vs, 1);
1419
 
1420
	    *(__m64 *)dst = pack8888 (
1421
	        in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
1422
	        in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));
1423
 
1424
	    w -= 2;
1425
	    dst += 2;
1426
	    src += 2;
1427
	}
1428
 
1429
	while (w)
1430
	{
1431
	    __m64 s = load8888 (*src);
1432
	    __m64 d = load8888 (*dst);
1433
 
1434
	    *dst = store8888 (in_over (s, expand_alpha (s), vmask, d));
1435
 
1436
	    w--;
1437
	    dst++;
1438
	    src++;
1439
	}
1440
    }
1441
 
1442
    _mm_empty ();
1443
}
1444
 
1445
static void
1446
mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
1447
                                pixman_op_t              op,
1448
                                pixman_image_t *         src_image,
1449
                                pixman_image_t *         mask_image,
1450
                                pixman_image_t *         dst_image,
1451
                                int32_t                  src_x,
1452
                                int32_t                  src_y,
1453
                                int32_t                  mask_x,
1454
                                int32_t                  mask_y,
1455
                                int32_t                  dest_x,
1456
                                int32_t                  dest_y,
1457
                                int32_t                  width,
1458
                                int32_t                  height)
1459
{
1460
    uint32_t *dst_line, *dst;
1461
    uint32_t *src_line, *src;
1462
    uint32_t mask;
1463
    __m64 vmask;
1464
    int dst_stride, src_stride;
1465
    int32_t w;
1466
    __m64 srca;
1467
 
1468
    CHECKPOINT ();
1469
 
1470
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1471
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1472
    mask = _pixman_image_get_solid (mask_image, dst_image->bits.format);
1473
 
1474
    mask &= 0xff000000;
1475
    mask = mask | mask >> 8 | mask >> 16 | mask >> 24;
1476
    vmask = load8888 (mask);
1477
    srca = MC (4x00ff);
1478
 
1479
    while (height--)
1480
    {
1481
	dst = dst_line;
1482
	dst_line += dst_stride;
1483
	src = src_line;
1484
	src_line += src_stride;
1485
	w = width;
1486
 
1487
	while (w && (unsigned long)dst & 7)
1488
	{
1489
	    __m64 s = load8888 (*src | 0xff000000);
1490
	    __m64 d = load8888 (*dst);
1491
 
1492
	    *dst = store8888 (in_over (s, srca, vmask, d));
1493
 
1494
	    w--;
1495
	    dst++;
1496
	    src++;
1497
	}
1498
 
1499
	while (w >= 16)
1500
	{
1501
	    __m64 vd0 = *(__m64 *)(dst + 0);
1502
	    __m64 vd1 = *(__m64 *)(dst + 2);
1503
	    __m64 vd2 = *(__m64 *)(dst + 4);
1504
	    __m64 vd3 = *(__m64 *)(dst + 6);
1505
	    __m64 vd4 = *(__m64 *)(dst + 8);
1506
	    __m64 vd5 = *(__m64 *)(dst + 10);
1507
	    __m64 vd6 = *(__m64 *)(dst + 12);
1508
	    __m64 vd7 = *(__m64 *)(dst + 14);
1509
 
1510
	    __m64 vs0 = *(__m64 *)(src + 0);
1511
	    __m64 vs1 = *(__m64 *)(src + 2);
1512
	    __m64 vs2 = *(__m64 *)(src + 4);
1513
	    __m64 vs3 = *(__m64 *)(src + 6);
1514
	    __m64 vs4 = *(__m64 *)(src + 8);
1515
	    __m64 vs5 = *(__m64 *)(src + 10);
1516
	    __m64 vs6 = *(__m64 *)(src + 12);
1517
	    __m64 vs7 = *(__m64 *)(src + 14);
1518
 
1519
	    vd0 = pack8888 (
1520
	        in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
1521
	        in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));
1522
 
1523
	    vd1 = pack8888 (
1524
	        in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
1525
	        in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));
1526
 
1527
	    vd2 = pack8888 (
1528
	        in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
1529
	        in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));
1530
 
1531
	    vd3 = pack8888 (
1532
	        in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
1533
	        in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));
1534
 
1535
	    vd4 = pack8888 (
1536
	        in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
1537
	        in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));
1538
 
1539
	    vd5 = pack8888 (
1540
	        in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
1541
	        in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));
1542
 
1543
	    vd6 = pack8888 (
1544
	        in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
1545
	        in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));
1546
 
1547
	    vd7 = pack8888 (
1548
	        in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
1549
	        in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));
1550
 
1551
	    *(__m64 *)(dst + 0) = vd0;
1552
	    *(__m64 *)(dst + 2) = vd1;
1553
	    *(__m64 *)(dst + 4) = vd2;
1554
	    *(__m64 *)(dst + 6) = vd3;
1555
	    *(__m64 *)(dst + 8) = vd4;
1556
	    *(__m64 *)(dst + 10) = vd5;
1557
	    *(__m64 *)(dst + 12) = vd6;
1558
	    *(__m64 *)(dst + 14) = vd7;
1559
 
1560
	    w -= 16;
1561
	    dst += 16;
1562
	    src += 16;
1563
	}
1564
 
1565
	while (w)
1566
	{
1567
	    __m64 s = load8888 (*src | 0xff000000);
1568
	    __m64 d = load8888 (*dst);
1569
 
1570
	    *dst = store8888 (in_over (s, srca, vmask, d));
1571
 
1572
	    w--;
1573
	    dst++;
1574
	    src++;
1575
	}
1576
    }
1577
 
1578
    _mm_empty ();
1579
}
1580
 
1581
static void
1582
mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
1583
                              pixman_op_t              op,
1584
                              pixman_image_t *         src_image,
1585
                              pixman_image_t *         mask_image,
1586
                              pixman_image_t *         dst_image,
1587
                              int32_t                  src_x,
1588
                              int32_t                  src_y,
1589
                              int32_t                  mask_x,
1590
                              int32_t                  mask_y,
1591
                              int32_t                  dest_x,
1592
                              int32_t                  dest_y,
1593
                              int32_t                  width,
1594
                              int32_t                  height)
1595
{
1596
    uint32_t *dst_line, *dst;
1597
    uint32_t *src_line, *src;
1598
    uint32_t s;
1599
    int dst_stride, src_stride;
1600
    uint8_t a;
1601
    int32_t w;
1602
 
1603
    CHECKPOINT ();
1604
 
1605
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1606
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1607
 
1608
    while (height--)
1609
    {
1610
	dst = dst_line;
1611
	dst_line += dst_stride;
1612
	src = src_line;
1613
	src_line += src_stride;
1614
	w = width;
1615
 
1616
	while (w--)
1617
	{
1618
	    s = *src++;
1619
	    a = s >> 24;
1620
 
1621
	    if (a == 0xff)
1622
	    {
1623
		*dst = s;
1624
	    }
1625
	    else if (s)
1626
	    {
1627
		__m64 ms, sa;
1628
		ms = load8888 (s);
1629
		sa = expand_alpha (ms);
1630
		*dst = store8888 (over (ms, sa, load8888 (*dst)));
1631
	    }
1632
 
1633
	    dst++;
1634
	}
1635
    }
1636
    _mm_empty ();
1637
}
1638
 
1639
static void
1640
mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
1641
                              pixman_op_t              op,
1642
                              pixman_image_t *         src_image,
1643
                              pixman_image_t *         mask_image,
1644
                              pixman_image_t *         dst_image,
1645
                              int32_t                  src_x,
1646
                              int32_t                  src_y,
1647
                              int32_t                  mask_x,
1648
                              int32_t                  mask_y,
1649
                              int32_t                  dest_x,
1650
                              int32_t                  dest_y,
1651
                              int32_t                  width,
1652
                              int32_t                  height)
1653
{
1654
    uint16_t    *dst_line, *dst;
1655
    uint32_t    *src_line, *src;
1656
    int dst_stride, src_stride;
1657
    int32_t w;
1658
 
1659
    CHECKPOINT ();
1660
 
1661
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
1662
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1663
 
1664
#if 0
1665
    /* FIXME */
1666
    assert (src_image->drawable == mask_image->drawable);
1667
#endif
1668
 
1669
    while (height--)
1670
    {
1671
	dst = dst_line;
1672
	dst_line += dst_stride;
1673
	src = src_line;
1674
	src_line += src_stride;
1675
	w = width;
1676
 
1677
	CHECKPOINT ();
1678
 
1679
	while (w && (unsigned long)dst & 7)
1680
	{
1681
	    __m64 vsrc = load8888 (*src);
1682
	    uint64_t d = *dst;
1683
	    __m64 vdest = expand565 (to_m64 (d), 0);
1684
 
1685
	    vdest = pack_565 (
1686
		over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1687
 
1688
	    *dst = to_uint64 (vdest);
1689
 
1690
	    w--;
1691
	    dst++;
1692
	    src++;
1693
	}
1694
 
1695
	CHECKPOINT ();
1696
 
1697
	while (w >= 4)
1698
	{
1699
	    __m64 vsrc0, vsrc1, vsrc2, vsrc3;
1700
	    __m64 vdest;
1701
 
1702
	    vsrc0 = load8888 (*(src + 0));
1703
	    vsrc1 = load8888 (*(src + 1));
1704
	    vsrc2 = load8888 (*(src + 2));
1705
	    vsrc3 = load8888 (*(src + 3));
1706
 
1707
	    vdest = *(__m64 *)dst;
1708
 
1709
	    vdest = pack_565 (over (vsrc0, expand_alpha (vsrc0), expand565 (vdest, 0)), vdest, 0);
1710
	    vdest = pack_565 (over (vsrc1, expand_alpha (vsrc1), expand565 (vdest, 1)), vdest, 1);
1711
	    vdest = pack_565 (over (vsrc2, expand_alpha (vsrc2), expand565 (vdest, 2)), vdest, 2);
1712
	    vdest = pack_565 (over (vsrc3, expand_alpha (vsrc3), expand565 (vdest, 3)), vdest, 3);
1713
 
1714
	    *(__m64 *)dst = vdest;
1715
 
1716
	    w -= 4;
1717
	    dst += 4;
1718
	    src += 4;
1719
	}
1720
 
1721
	CHECKPOINT ();
1722
 
1723
	while (w)
1724
	{
1725
	    __m64 vsrc = load8888 (*src);
1726
	    uint64_t d = *dst;
1727
	    __m64 vdest = expand565 (to_m64 (d), 0);
1728
 
1729
	    vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);
1730
 
1731
	    *dst = to_uint64 (vdest);
1732
 
1733
	    w--;
1734
	    dst++;
1735
	    src++;
1736
	}
1737
    }
1738
 
1739
    _mm_empty ();
1740
}
1741
 
1742
static void
1743
mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
1744
                             pixman_op_t              op,
1745
                             pixman_image_t *         src_image,
1746
                             pixman_image_t *         mask_image,
1747
                             pixman_image_t *         dst_image,
1748
                             int32_t                  src_x,
1749
                             int32_t                  src_y,
1750
                             int32_t                  mask_x,
1751
                             int32_t                  mask_y,
1752
                             int32_t                  dest_x,
1753
                             int32_t                  dest_y,
1754
                             int32_t                  width,
1755
                             int32_t                  height)
1756
{
1757
    uint32_t src, srca;
1758
    uint32_t *dst_line, *dst;
1759
    uint8_t *mask_line, *mask;
1760
    int dst_stride, mask_stride;
1761
    int32_t w;
1762
    __m64 vsrc, vsrca;
1763
    uint64_t srcsrc;
1764
 
1765
    CHECKPOINT ();
1766
 
1767
    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
1768
 
1769
    srca = src >> 24;
1770
    if (src == 0)
1771
	return;
1772
 
1773
    srcsrc = (uint64_t)src << 32 | src;
1774
 
1775
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1776
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
1777
 
1778
    vsrc = load8888 (src);
1779
    vsrca = expand_alpha (vsrc);
1780
 
1781
    while (height--)
1782
    {
1783
	dst = dst_line;
1784
	dst_line += dst_stride;
1785
	mask = mask_line;
1786
	mask_line += mask_stride;
1787
	w = width;
1788
 
1789
	CHECKPOINT ();
1790
 
1791
	while (w && (unsigned long)dst & 7)
1792
	{
1793
	    uint64_t m = *mask;
1794
 
1795
	    if (m)
1796
	    {
1797
		__m64 vdest = in_over (vsrc, vsrca,
1798
				       expand_alpha_rev (to_m64 (m)),
1799
				       load8888 (*dst));
1800
 
1801
		*dst = store8888 (vdest);
1802
	    }
1803
 
1804
	    w--;
1805
	    mask++;
1806
	    dst++;
1807
	}
1808
 
1809
	CHECKPOINT ();
1810
 
1811
	while (w >= 2)
1812
	{
1813
	    uint64_t m0, m1;
1814
 
1815
	    m0 = *mask;
1816
	    m1 = *(mask + 1);
1817
 
1818
	    if (srca == 0xff && (m0 & m1) == 0xff)
1819
	    {
1820
		*(uint64_t *)dst = srcsrc;
1821
	    }
1822
	    else if (m0 | m1)
1823
	    {
1824
		__m64 vdest;
1825
		__m64 dest0, dest1;
1826
 
1827
		vdest = *(__m64 *)dst;
1828
 
1829
		dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
1830
				 expand8888 (vdest, 0));
1831
		dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
1832
				 expand8888 (vdest, 1));
1833
 
1834
		*(__m64 *)dst = pack8888 (dest0, dest1);
1835
	    }
1836
 
1837
	    mask += 2;
1838
	    dst += 2;
1839
	    w -= 2;
1840
	}
1841
 
1842
	CHECKPOINT ();
1843
 
1844
	while (w)
1845
	{
1846
	    uint64_t m = *mask;
1847
 
1848
	    if (m)
1849
	    {
1850
		__m64 vdest = load8888 (*dst);
1851
 
1852
		vdest = in_over (
1853
		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
1854
		*dst = store8888 (vdest);
1855
	    }
1856
 
1857
	    w--;
1858
	    mask++;
1859
	    dst++;
1860
	}
1861
    }
1862
 
1863
    _mm_empty ();
1864
}
1865
 
1866
pixman_bool_t
1867
pixman_fill_mmx (uint32_t *bits,
1868
                 int       stride,
1869
                 int       bpp,
1870
                 int       x,
1871
                 int       y,
1872
                 int       width,
1873
                 int       height,
1874
                 uint32_t xor)
1875
{
1876
    uint64_t fill;
1877
    __m64 vfill;
1878
    uint32_t byte_width;
1879
    uint8_t     *byte_line;
1880
 
1881
#ifdef __GNUC__
1882
    __m64 v1, v2, v3, v4, v5, v6, v7;
1883
#endif
1884
 
1885
    if (bpp != 16 && bpp != 32 && bpp != 8)
1886
	return FALSE;
1887
 
1888
    if (bpp == 8)
1889
    {
1890
	stride = stride * (int) sizeof (uint32_t) / 1;
1891
	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
1892
	byte_width = width;
1893
	stride *= 1;
1894
        xor = (xor & 0xff) * 0x01010101;
1895
    }
1896
    else if (bpp == 16)
1897
    {
1898
	stride = stride * (int) sizeof (uint32_t) / 2;
1899
	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
1900
	byte_width = 2 * width;
1901
	stride *= 2;
1902
        xor = (xor & 0xffff) * 0x00010001;
1903
    }
1904
    else
1905
    {
1906
	stride = stride * (int) sizeof (uint32_t) / 4;
1907
	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
1908
	byte_width = 4 * width;
1909
	stride *= 4;
1910
    }
1911
 
1912
    fill = ((uint64_t)xor << 32) | xor;
1913
    vfill = to_m64 (fill);
1914
 
1915
#ifdef __GNUC__
1916
    __asm__ (
1917
        "movq		%7,	%0\n"
1918
        "movq		%7,	%1\n"
1919
        "movq		%7,	%2\n"
1920
        "movq		%7,	%3\n"
1921
        "movq		%7,	%4\n"
1922
        "movq		%7,	%5\n"
1923
        "movq		%7,	%6\n"
1924
	: "=&y" (v1), "=&y" (v2), "=&y" (v3),
1925
	  "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
1926
	: "y" (vfill));
1927
#endif
1928
 
1929
    while (height--)
1930
    {
1931
	int w;
1932
	uint8_t *d = byte_line;
1933
 
1934
	byte_line += stride;
1935
	w = byte_width;
1936
 
1937
	while (w >= 1 && ((unsigned long)d & 1))
1938
	{
1939
	    *(uint8_t *)d = (xor & 0xff);
1940
	    w--;
1941
	    d++;
1942
	}
1943
 
1944
	while (w >= 2 && ((unsigned long)d & 3))
1945
	{
1946
	    *(uint16_t *)d = xor;
1947
	    w -= 2;
1948
	    d += 2;
1949
	}
1950
 
1951
	while (w >= 4 && ((unsigned long)d & 7))
1952
	{
1953
	    *(uint32_t *)d = xor;
1954
 
1955
	    w -= 4;
1956
	    d += 4;
1957
	}
1958
 
1959
	while (w >= 64)
1960
	{
1961
#ifdef __GNUC__
1962
	    __asm__ (
1963
	        "movq	%1,	  (%0)\n"
1964
	        "movq	%2,	 8(%0)\n"
1965
	        "movq	%3,	16(%0)\n"
1966
	        "movq	%4,	24(%0)\n"
1967
	        "movq	%5,	32(%0)\n"
1968
	        "movq	%6,	40(%0)\n"
1969
	        "movq	%7,	48(%0)\n"
1970
	        "movq	%8,	56(%0)\n"
1971
		:
1972
		: "r" (d),
1973
		  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
1974
		  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
1975
		: "memory");
1976
#else
1977
	    *(__m64*) (d +  0) = vfill;
1978
	    *(__m64*) (d +  8) = vfill;
1979
	    *(__m64*) (d + 16) = vfill;
1980
	    *(__m64*) (d + 24) = vfill;
1981
	    *(__m64*) (d + 32) = vfill;
1982
	    *(__m64*) (d + 40) = vfill;
1983
	    *(__m64*) (d + 48) = vfill;
1984
	    *(__m64*) (d + 56) = vfill;
1985
#endif
1986
	    w -= 64;
1987
	    d += 64;
1988
	}
1989
 
1990
	while (w >= 4)
1991
	{
1992
	    *(uint32_t *)d = xor;
1993
 
1994
	    w -= 4;
1995
	    d += 4;
1996
	}
1997
	while (w >= 2)
1998
	{
1999
	    *(uint16_t *)d = xor;
2000
	    w -= 2;
2001
	    d += 2;
2002
	}
2003
	while (w >= 1)
2004
	{
2005
	    *(uint8_t *)d = (xor & 0xff);
2006
	    w--;
2007
	    d++;
2008
	}
2009
 
2010
    }
2011
 
2012
    _mm_empty ();
2013
    return TRUE;
2014
}
2015
 
2016
static void
2017
mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
2018
                            pixman_op_t              op,
2019
                            pixman_image_t *         src_image,
2020
                            pixman_image_t *         mask_image,
2021
                            pixman_image_t *         dst_image,
2022
                            int32_t                  src_x,
2023
                            int32_t                  src_y,
2024
                            int32_t                  mask_x,
2025
                            int32_t                  mask_y,
2026
                            int32_t                  dest_x,
2027
                            int32_t                  dest_y,
2028
                            int32_t                  width,
2029
                            int32_t                  height)
2030
{
2031
    uint32_t src, srca;
2032
    uint32_t    *dst_line, *dst;
2033
    uint8_t     *mask_line, *mask;
2034
    int dst_stride, mask_stride;
2035
    int32_t w;
2036
    __m64 vsrc, vsrca;
2037
    uint64_t srcsrc;
2038
 
2039
    CHECKPOINT ();
2040
 
2041
    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2042
 
2043
    srca = src >> 24;
2044
    if (src == 0)
2045
    {
2046
	pixman_fill_mmx (dst_image->bits.bits, dst_image->bits.rowstride,
2047
			 PIXMAN_FORMAT_BPP (dst_image->bits.format),
2048
	                 dest_x, dest_y, width, height, 0);
2049
	return;
2050
    }
2051
 
2052
    srcsrc = (uint64_t)src << 32 | src;
2053
 
2054
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2055
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2056
 
2057
    vsrc = load8888 (src);
2058
    vsrca = expand_alpha (vsrc);
2059
 
2060
    while (height--)
2061
    {
2062
	dst = dst_line;
2063
	dst_line += dst_stride;
2064
	mask = mask_line;
2065
	mask_line += mask_stride;
2066
	w = width;
2067
 
2068
	CHECKPOINT ();
2069
 
2070
	while (w && (unsigned long)dst & 7)
2071
	{
2072
	    uint64_t m = *mask;
2073
 
2074
	    if (m)
2075
	    {
2076
		__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2077
 
2078
		*dst = store8888 (vdest);
2079
	    }
2080
	    else
2081
	    {
2082
		*dst = 0;
2083
	    }
2084
 
2085
	    w--;
2086
	    mask++;
2087
	    dst++;
2088
	}
2089
 
2090
	CHECKPOINT ();
2091
 
2092
	while (w >= 2)
2093
	{
2094
	    uint64_t m0, m1;
2095
	    m0 = *mask;
2096
	    m1 = *(mask + 1);
2097
 
2098
	    if (srca == 0xff && (m0 & m1) == 0xff)
2099
	    {
2100
		*(uint64_t *)dst = srcsrc;
2101
	    }
2102
	    else if (m0 | m1)
2103
	    {
2104
		__m64 vdest;
2105
		__m64 dest0, dest1;
2106
 
2107
		vdest = *(__m64 *)dst;
2108
 
2109
		dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
2110
		dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));
2111
 
2112
		*(__m64 *)dst = pack8888 (dest0, dest1);
2113
	    }
2114
	    else
2115
	    {
2116
		*(uint64_t *)dst = 0;
2117
	    }
2118
 
2119
	    mask += 2;
2120
	    dst += 2;
2121
	    w -= 2;
2122
	}
2123
 
2124
	CHECKPOINT ();
2125
 
2126
	while (w)
2127
	{
2128
	    uint64_t m = *mask;
2129
 
2130
	    if (m)
2131
	    {
2132
		__m64 vdest = load8888 (*dst);
2133
 
2134
		vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));
2135
		*dst = store8888 (vdest);
2136
	    }
2137
	    else
2138
	    {
2139
		*dst = 0;
2140
	    }
2141
 
2142
	    w--;
2143
	    mask++;
2144
	    dst++;
2145
	}
2146
    }
2147
 
2148
    _mm_empty ();
2149
}
2150
 
2151
static void
2152
mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
2153
                             pixman_op_t              op,
2154
                             pixman_image_t *         src_image,
2155
                             pixman_image_t *         mask_image,
2156
                             pixman_image_t *         dst_image,
2157
                             int32_t                  src_x,
2158
                             int32_t                  src_y,
2159
                             int32_t                  mask_x,
2160
                             int32_t                  mask_y,
2161
                             int32_t                  dest_x,
2162
                             int32_t                  dest_y,
2163
                             int32_t                  width,
2164
                             int32_t                  height)
2165
{
2166
    uint32_t src, srca;
2167
    uint16_t *dst_line, *dst;
2168
    uint8_t *mask_line, *mask;
2169
    int dst_stride, mask_stride;
2170
    int32_t w;
2171
    __m64 vsrc, vsrca, tmp;
2172
    uint64_t srcsrcsrcsrc, src16;
2173
 
2174
    CHECKPOINT ();
2175
 
2176
    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2177
 
2178
    srca = src >> 24;
2179
    if (src == 0)
2180
	return;
2181
 
2182
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2183
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2184
 
2185
    vsrc = load8888 (src);
2186
    vsrca = expand_alpha (vsrc);
2187
 
2188
    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
2189
    src16 = to_uint64 (tmp);
2190
 
2191
    srcsrcsrcsrc =
2192
	(uint64_t)src16 << 48 | (uint64_t)src16 << 32 |
2193
	(uint64_t)src16 << 16 | (uint64_t)src16;
2194
 
2195
    while (height--)
2196
    {
2197
	dst = dst_line;
2198
	dst_line += dst_stride;
2199
	mask = mask_line;
2200
	mask_line += mask_stride;
2201
	w = width;
2202
 
2203
	CHECKPOINT ();
2204
 
2205
	while (w && (unsigned long)dst & 7)
2206
	{
2207
	    uint64_t m = *mask;
2208
 
2209
	    if (m)
2210
	    {
2211
		uint64_t d = *dst;
2212
		__m64 vd = to_m64 (d);
2213
		__m64 vdest = in_over (
2214
		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));
2215
 
2216
		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2217
		*dst = to_uint64 (vd);
2218
	    }
2219
 
2220
	    w--;
2221
	    mask++;
2222
	    dst++;
2223
	}
2224
 
2225
	CHECKPOINT ();
2226
 
2227
	while (w >= 4)
2228
	{
2229
	    uint64_t m0, m1, m2, m3;
2230
	    m0 = *mask;
2231
	    m1 = *(mask + 1);
2232
	    m2 = *(mask + 2);
2233
	    m3 = *(mask + 3);
2234
 
2235
	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
2236
	    {
2237
		*(uint64_t *)dst = srcsrcsrcsrc;
2238
	    }
2239
	    else if (m0 | m1 | m2 | m3)
2240
	    {
2241
		__m64 vdest;
2242
		__m64 vm0, vm1, vm2, vm3;
2243
 
2244
		vdest = *(__m64 *)dst;
2245
 
2246
		vm0 = to_m64 (m0);
2247
		vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm0),
2248
					   expand565 (vdest, 0)), vdest, 0);
2249
		vm1 = to_m64 (m1);
2250
		vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm1),
2251
					   expand565 (vdest, 1)), vdest, 1);
2252
		vm2 = to_m64 (m2);
2253
		vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm2),
2254
					   expand565 (vdest, 2)), vdest, 2);
2255
		vm3 = to_m64 (m3);
2256
		vdest = pack_565 (in_over (vsrc, vsrca, expand_alpha_rev (vm3),
2257
					   expand565 (vdest, 3)), vdest, 3);
2258
 
2259
		*(__m64 *)dst = vdest;
2260
	    }
2261
 
2262
	    w -= 4;
2263
	    mask += 4;
2264
	    dst += 4;
2265
	}
2266
 
2267
	CHECKPOINT ();
2268
 
2269
	while (w)
2270
	{
2271
	    uint64_t m = *mask;
2272
 
2273
	    if (m)
2274
	    {
2275
		uint64_t d = *dst;
2276
		__m64 vd = to_m64 (d);
2277
		__m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
2278
				       expand565 (vd, 0));
2279
		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
2280
		*dst = to_uint64 (vd);
2281
	    }
2282
 
2283
	    w--;
2284
	    mask++;
2285
	    dst++;
2286
	}
2287
    }
2288
 
2289
    _mm_empty ();
2290
}
2291
 
2292
/* OVER-composite a non-premultiplied ("pixbuf") 8888 source onto an
 * r5g6b5/b5g6r5 destination using MMX.
 *
 * Each destination row is processed in three phases:
 *   1. a scalar head loop until dst reaches 8-byte alignment,
 *   2. a SIMD loop handling four 16-bit pixels (one __m64) at a time,
 *   3. a scalar tail loop for the remaining (< 4) pixels.
 *
 * over_rev_non_pre() and invert_colors() are helpers defined earlier in
 * this file; invert_colors presumably performs the channel reordering
 * needed for the pixbuf format (see the fast-path table entries mapping
 * pixbuf/rpixbuf sources here).
 */
static void
mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                pixman_op_t              op,
                                pixman_image_t *         src_image,
                                pixman_image_t *         mask_image,
                                pixman_image_t *         dst_image,
                                int32_t                  src_x,
                                int32_t                  src_y,
                                int32_t                  mask_x,
                                int32_t                  mask_y,
                                int32_t                  dest_x,
                                int32_t                  dest_y,
                                int32_t                  width,
                                int32_t                  height)
{
    uint16_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	CHECKPOINT ();

	/* Phase 1: scalar pixels until dst is 8-byte aligned. */
	while (w && (unsigned long)dst & 7)
	{
	    __m64 vsrc = load8888 (*src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}

	CHECKPOINT ();

	/* Phase 2: four pixels per iteration. */
	while (w >= 4)
	{
	    uint32_t s0, s1, s2, s3;
	    unsigned char a0, a1, a2, a3;

	    s0 = *src;
	    s1 = *(src + 1);
	    s2 = *(src + 2);
	    s3 = *(src + 3);

	    a0 = (s0 >> 24);
	    a1 = (s1 >> 24);
	    a2 = (s2 >> 24);
	    a3 = (s3 >> 24);

	    if ((a0 & a1 & a2 & a3) == 0xFF)
	    {
		/* All four sources fully opaque: no blending needed,
		 * just convert and store (lane indices 0..3 pack the
		 * four 565 pixels into one __m64). */
		__m64 vdest;
		vdest = pack_565 (invert_colors (load8888 (s0)), _mm_setzero_si64 (), 0);
		vdest = pack_565 (invert_colors (load8888 (s1)), vdest, 1);
		vdest = pack_565 (invert_colors (load8888 (s2)), vdest, 2);
		vdest = pack_565 (invert_colors (load8888 (s3)), vdest, 3);

		*(__m64 *)dst = vdest;
	    }
	    else if (s0 | s1 | s2 | s3)
	    {
		/* At least one source pixel is non-zero: blend each of
		 * the four lanes in place.  (If all four are zero, OVER
		 * leaves the destination untouched and we skip the
		 * read-modify-write entirely.) */
		__m64 vdest = *(__m64 *)dst;

		vdest = pack_565 (over_rev_non_pre (load8888 (s0), expand565 (vdest, 0)), vdest, 0);
		vdest = pack_565 (over_rev_non_pre (load8888 (s1), expand565 (vdest, 1)), vdest, 1);
		vdest = pack_565 (over_rev_non_pre (load8888 (s2), expand565 (vdest, 2)), vdest, 2);
		vdest = pack_565 (over_rev_non_pre (load8888 (s3), expand565 (vdest, 3)), vdest, 3);

		*(__m64 *)dst = vdest;
	    }

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	CHECKPOINT ();

	/* Phase 3: scalar tail, same per-pixel path as the head loop. */
	while (w)
	{
	    __m64 vsrc = load8888 (*src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}
    }

    /* Leave MMX state so subsequent FPU code works. */
    _mm_empty ();
}
2411
 
2412
static void
2413
mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
2414
                                pixman_op_t              op,
2415
                                pixman_image_t *         src_image,
2416
                                pixman_image_t *         mask_image,
2417
                                pixman_image_t *         dst_image,
2418
                                int32_t                  src_x,
2419
                                int32_t                  src_y,
2420
                                int32_t                  mask_x,
2421
                                int32_t                  mask_y,
2422
                                int32_t                  dest_x,
2423
                                int32_t                  dest_y,
2424
                                int32_t                  width,
2425
                                int32_t                  height)
2426
{
2427
    uint32_t    *dst_line, *dst;
2428
    uint32_t    *src_line, *src;
2429
    int dst_stride, src_stride;
2430
    int32_t w;
2431
 
2432
    CHECKPOINT ();
2433
 
2434
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2435
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2436
 
2437
#if 0
2438
    /* FIXME */
2439
    assert (src_image->drawable == mask_image->drawable);
2440
#endif
2441
 
2442
    while (height--)
2443
    {
2444
	dst = dst_line;
2445
	dst_line += dst_stride;
2446
	src = src_line;
2447
	src_line += src_stride;
2448
	w = width;
2449
 
2450
	while (w && (unsigned long)dst & 7)
2451
	{
2452
	    __m64 s = load8888 (*src);
2453
	    __m64 d = load8888 (*dst);
2454
 
2455
	    *dst = store8888 (over_rev_non_pre (s, d));
2456
 
2457
	    w--;
2458
	    dst++;
2459
	    src++;
2460
	}
2461
 
2462
	while (w >= 2)
2463
	{
2464
	    uint64_t s0, s1;
2465
	    unsigned char a0, a1;
2466
	    __m64 d0, d1;
2467
 
2468
	    s0 = *src;
2469
	    s1 = *(src + 1);
2470
 
2471
	    a0 = (s0 >> 24);
2472
	    a1 = (s1 >> 24);
2473
 
2474
	    if ((a0 & a1) == 0xFF)
2475
	    {
2476
		d0 = invert_colors (load8888 (s0));
2477
		d1 = invert_colors (load8888 (s1));
2478
 
2479
		*(__m64 *)dst = pack8888 (d0, d1);
2480
	    }
2481
	    else if (s0 | s1)
2482
	    {
2483
		__m64 vdest = *(__m64 *)dst;
2484
 
2485
		d0 = over_rev_non_pre (load8888 (s0), expand8888 (vdest, 0));
2486
		d1 = over_rev_non_pre (load8888 (s1), expand8888 (vdest, 1));
2487
 
2488
		*(__m64 *)dst = pack8888 (d0, d1);
2489
	    }
2490
 
2491
	    w -= 2;
2492
	    dst += 2;
2493
	    src += 2;
2494
	}
2495
 
2496
	while (w)
2497
	{
2498
	    __m64 s = load8888 (*src);
2499
	    __m64 d = load8888 (*dst);
2500
 
2501
	    *dst = store8888 (over_rev_non_pre (s, d));
2502
 
2503
	    w--;
2504
	    dst++;
2505
	    src++;
2506
	}
2507
    }
2508
 
2509
    _mm_empty ();
2510
}
2511
 
2512
static void
2513
mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
2514
                                   pixman_op_t              op,
2515
                                   pixman_image_t *         src_image,
2516
                                   pixman_image_t *         mask_image,
2517
                                   pixman_image_t *         dst_image,
2518
                                   int32_t                  src_x,
2519
                                   int32_t                  src_y,
2520
                                   int32_t                  mask_x,
2521
                                   int32_t                  mask_y,
2522
                                   int32_t                  dest_x,
2523
                                   int32_t                  dest_y,
2524
                                   int32_t                  width,
2525
                                   int32_t                  height)
2526
{
2527
    uint32_t src, srca;
2528
    uint16_t    *dst_line;
2529
    uint32_t    *mask_line;
2530
    int dst_stride, mask_stride;
2531
    __m64 vsrc, vsrca;
2532
 
2533
    CHECKPOINT ();
2534
 
2535
    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2536
 
2537
    srca = src >> 24;
2538
    if (src == 0)
2539
	return;
2540
 
2541
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2542
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2543
 
2544
    vsrc = load8888 (src);
2545
    vsrca = expand_alpha (vsrc);
2546
 
2547
    while (height--)
2548
    {
2549
	int twidth = width;
2550
	uint32_t *p = (uint32_t *)mask_line;
2551
	uint16_t *q = (uint16_t *)dst_line;
2552
 
2553
	while (twidth && ((unsigned long)q & 7))
2554
	{
2555
	    uint32_t m = *(uint32_t *)p;
2556
 
2557
	    if (m)
2558
	    {
2559
		uint64_t d = *q;
2560
		__m64 vdest = expand565 (to_m64 (d), 0);
2561
		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
2562
		*q = to_uint64 (vdest);
2563
	    }
2564
 
2565
	    twidth--;
2566
	    p++;
2567
	    q++;
2568
	}
2569
 
2570
	while (twidth >= 4)
2571
	{
2572
	    uint32_t m0, m1, m2, m3;
2573
 
2574
	    m0 = *p;
2575
	    m1 = *(p + 1);
2576
	    m2 = *(p + 2);
2577
	    m3 = *(p + 3);
2578
 
2579
	    if ((m0 | m1 | m2 | m3))
2580
	    {
2581
		__m64 vdest = *(__m64 *)q;
2582
 
2583
		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m0), expand565 (vdest, 0)), vdest, 0);
2584
		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m1), expand565 (vdest, 1)), vdest, 1);
2585
		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m2), expand565 (vdest, 2)), vdest, 2);
2586
		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m3), expand565 (vdest, 3)), vdest, 3);
2587
 
2588
		*(__m64 *)q = vdest;
2589
	    }
2590
	    twidth -= 4;
2591
	    p += 4;
2592
	    q += 4;
2593
	}
2594
 
2595
	while (twidth)
2596
	{
2597
	    uint32_t m;
2598
 
2599
	    m = *(uint32_t *)p;
2600
	    if (m)
2601
	    {
2602
		uint64_t d = *q;
2603
		__m64 vdest = expand565 (to_m64 (d), 0);
2604
		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (m), vdest), vdest, 0);
2605
		*q = to_uint64 (vdest);
2606
	    }
2607
 
2608
	    twidth--;
2609
	    p++;
2610
	    q++;
2611
	}
2612
 
2613
	mask_line += mask_stride;
2614
	dst_line += dst_stride;
2615
    }
2616
 
2617
    _mm_empty ();
2618
}
2619
 
2620
static void
2621
mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
2622
                        pixman_op_t              op,
2623
                        pixman_image_t *         src_image,
2624
                        pixman_image_t *         mask_image,
2625
                        pixman_image_t *         dst_image,
2626
                        int32_t                  src_x,
2627
                        int32_t                  src_y,
2628
                        int32_t                  mask_x,
2629
                        int32_t                  mask_y,
2630
                        int32_t                  dest_x,
2631
                        int32_t                  dest_y,
2632
                        int32_t                  width,
2633
                        int32_t                  height)
2634
{
2635
    uint8_t *dst_line, *dst;
2636
    uint8_t *mask_line, *mask;
2637
    int dst_stride, mask_stride;
2638
    int32_t w;
2639
    uint32_t src;
2640
    uint8_t sa;
2641
    __m64 vsrc, vsrca;
2642
 
2643
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2644
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2645
 
2646
    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2647
 
2648
    sa = src >> 24;
2649
 
2650
    vsrc = load8888 (src);
2651
    vsrca = expand_alpha (vsrc);
2652
 
2653
    while (height--)
2654
    {
2655
	dst = dst_line;
2656
	dst_line += dst_stride;
2657
	mask = mask_line;
2658
	mask_line += mask_stride;
2659
	w = width;
2660
 
2661
	if ((((unsigned long)dst_image & 3) == 0) &&
2662
	    (((unsigned long)src_image & 3) == 0))
2663
	{
2664
	    while (w >= 4)
2665
	    {
2666
		uint32_t m;
2667
		__m64 vmask;
2668
		__m64 vdest;
2669
 
2670
		m = 0;
2671
 
2672
		vmask = load8888 (*(uint32_t *)mask);
2673
		vdest = load8888 (*(uint32_t *)dst);
2674
 
2675
		*(uint32_t *)dst = store8888 (in (in (vsrca, vmask), vdest));
2676
 
2677
		dst += 4;
2678
		mask += 4;
2679
		w -= 4;
2680
	    }
2681
	}
2682
 
2683
	while (w--)
2684
	{
2685
	    uint16_t tmp;
2686
	    uint8_t a;
2687
	    uint32_t m, d;
2688
 
2689
	    a = *mask++;
2690
	    d = *dst;
2691
 
2692
	    m = MUL_UN8 (sa, a, tmp);
2693
	    d = MUL_UN8 (m, d, tmp);
2694
 
2695
	    *dst++ = d;
2696
	}
2697
    }
2698
 
2699
    _mm_empty ();
2700
}
2701
 
2702
static void
2703
mmx_composite_in_8_8 (pixman_implementation_t *imp,
2704
                      pixman_op_t              op,
2705
                      pixman_image_t *         src_image,
2706
                      pixman_image_t *         mask_image,
2707
                      pixman_image_t *         dst_image,
2708
                      int32_t                  src_x,
2709
                      int32_t                  src_y,
2710
                      int32_t                  mask_x,
2711
                      int32_t                  mask_y,
2712
                      int32_t                  dest_x,
2713
                      int32_t                  dest_y,
2714
                      int32_t                  width,
2715
                      int32_t                  height)
2716
{
2717
    uint8_t     *dst_line, *dst;
2718
    uint8_t     *src_line, *src;
2719
    int src_stride, dst_stride;
2720
    int32_t w;
2721
 
2722
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2723
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2724
 
2725
    while (height--)
2726
    {
2727
	dst = dst_line;
2728
	dst_line += dst_stride;
2729
	src = src_line;
2730
	src_line += src_stride;
2731
	w = width;
2732
 
2733
	if ((((unsigned long)dst_image & 3) == 0) &&
2734
	    (((unsigned long)src_image & 3) == 0))
2735
	{
2736
	    while (w >= 4)
2737
	    {
2738
		uint32_t *s = (uint32_t *)src;
2739
		uint32_t *d = (uint32_t *)dst;
2740
 
2741
		*d = store8888 (in (load8888 (*s), load8888 (*d)));
2742
 
2743
		w -= 4;
2744
		dst += 4;
2745
		src += 4;
2746
	    }
2747
	}
2748
 
2749
	while (w--)
2750
	{
2751
	    uint8_t s, d;
2752
	    uint16_t tmp;
2753
 
2754
	    s = *src;
2755
	    d = *dst;
2756
 
2757
	    *dst = MUL_UN8 (s, d, tmp);
2758
 
2759
	    src++;
2760
	    dst++;
2761
	}
2762
    }
2763
 
2764
    _mm_empty ();
2765
}
2766
 
2767
static void
2768
mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
2769
			 pixman_op_t              op,
2770
			 pixman_image_t *         src_image,
2771
			 pixman_image_t *         mask_image,
2772
			 pixman_image_t *         dst_image,
2773
			 int32_t                  src_x,
2774
			 int32_t                  src_y,
2775
			 int32_t                  mask_x,
2776
			 int32_t                  mask_y,
2777
			 int32_t                  dest_x,
2778
			 int32_t                  dest_y,
2779
			 int32_t                  width,
2780
			 int32_t                  height)
2781
{
2782
    uint8_t     *dst_line, *dst;
2783
    uint8_t     *mask_line, *mask;
2784
    int dst_stride, mask_stride;
2785
    int32_t w;
2786
    uint32_t src;
2787
    uint8_t sa;
2788
    __m64 vsrc, vsrca;
2789
 
2790
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2791
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
2792
 
2793
    src = _pixman_image_get_solid (src_image, dst_image->bits.format);
2794
 
2795
    sa = src >> 24;
2796
 
2797
    if (src == 0)
2798
	return;
2799
 
2800
    vsrc = load8888 (src);
2801
    vsrca = expand_alpha (vsrc);
2802
 
2803
    while (height--)
2804
    {
2805
	dst = dst_line;
2806
	dst_line += dst_stride;
2807
	mask = mask_line;
2808
	mask_line += mask_stride;
2809
	w = width;
2810
 
2811
	if ((((unsigned long)mask_image & 3) == 0) &&
2812
	    (((unsigned long)dst_image  & 3) == 0))
2813
	{
2814
	    while (w >= 4)
2815
	    {
2816
		__m64 vmask = load8888 (*(uint32_t *)mask);
2817
		__m64 vdest = load8888 (*(uint32_t *)dst);
2818
 
2819
		*(uint32_t *)dst = store8888 (_mm_adds_pu8 (in (vsrca, vmask), vdest));
2820
 
2821
		w -= 4;
2822
		dst += 4;
2823
		mask += 4;
2824
	    }
2825
	}
2826
 
2827
	while (w--)
2828
	{
2829
	    uint16_t tmp;
2830
	    uint16_t a;
2831
	    uint32_t m, d;
2832
	    uint32_t r;
2833
 
2834
	    a = *mask++;
2835
	    d = *dst;
2836
 
2837
	    m = MUL_UN8 (sa, a, tmp);
2838
	    r = ADD_UN8 (m, d, tmp);
2839
 
2840
	    *dst++ = r;
2841
	}
2842
    }
2843
 
2844
    _mm_empty ();
2845
}
2846
 
2847
static void
2848
mmx_composite_add_8_8 (pixman_implementation_t *imp,
2849
		       pixman_op_t              op,
2850
		       pixman_image_t *         src_image,
2851
		       pixman_image_t *         mask_image,
2852
		       pixman_image_t *         dst_image,
2853
		       int32_t                  src_x,
2854
		       int32_t                  src_y,
2855
		       int32_t                  mask_x,
2856
		       int32_t                  mask_y,
2857
		       int32_t                  dest_x,
2858
		       int32_t                  dest_y,
2859
		       int32_t                  width,
2860
		       int32_t                  height)
2861
{
2862
    uint8_t *dst_line, *dst;
2863
    uint8_t *src_line, *src;
2864
    int dst_stride, src_stride;
2865
    int32_t w;
2866
    uint8_t s, d;
2867
    uint16_t t;
2868
 
2869
    CHECKPOINT ();
2870
 
2871
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
2872
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
2873
 
2874
    while (height--)
2875
    {
2876
	dst = dst_line;
2877
	dst_line += dst_stride;
2878
	src = src_line;
2879
	src_line += src_stride;
2880
	w = width;
2881
 
2882
	while (w && (unsigned long)dst & 7)
2883
	{
2884
	    s = *src;
2885
	    d = *dst;
2886
	    t = d + s;
2887
	    s = t | (0 - (t >> 8));
2888
	    *dst = s;
2889
 
2890
	    dst++;
2891
	    src++;
2892
	    w--;
2893
	}
2894
 
2895
	while (w >= 8)
2896
	{
2897
	    *(__m64*)dst = _mm_adds_pu8 (*(__m64*)src, *(__m64*)dst);
2898
	    dst += 8;
2899
	    src += 8;
2900
	    w -= 8;
2901
	}
2902
 
2903
	while (w)
2904
	{
2905
	    s = *src;
2906
	    d = *dst;
2907
	    t = d + s;
2908
	    s = t | (0 - (t >> 8));
2909
	    *dst = s;
2910
 
2911
	    dst++;
2912
	    src++;
2913
	    w--;
2914
	}
2915
    }
2916
 
2917
    _mm_empty ();
2918
}
2919
 
2920
static void
2921
mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
2922
                             pixman_op_t              op,
2923
                             pixman_image_t *         src_image,
2924
                             pixman_image_t *         mask_image,
2925
                             pixman_image_t *         dst_image,
2926
                             int32_t                  src_x,
2927
                             int32_t                  src_y,
2928
                             int32_t                  mask_x,
2929
                             int32_t                  mask_y,
2930
                             int32_t                  dest_x,
2931
                             int32_t                  dest_y,
2932
                             int32_t                  width,
2933
                             int32_t                  height)
2934
{
2935
    __m64 dst64;
2936
    uint32_t    *dst_line, *dst;
2937
    uint32_t    *src_line, *src;
2938
    int dst_stride, src_stride;
2939
    int32_t w;
2940
 
2941
    CHECKPOINT ();
2942
 
2943
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2944
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2945
 
2946
    while (height--)
2947
    {
2948
	dst = dst_line;
2949
	dst_line += dst_stride;
2950
	src = src_line;
2951
	src_line += src_stride;
2952
	w = width;
2953
 
2954
	while (w && (unsigned long)dst & 7)
2955
	{
2956
	    *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
2957
	                                           _mm_cvtsi32_si64 (*dst)));
2958
	    dst++;
2959
	    src++;
2960
	    w--;
2961
	}
2962
 
2963
	while (w >= 2)
2964
	{
2965
	    dst64 = _mm_adds_pu8 (*(__m64*)src, *(__m64*)dst);
2966
	    *(uint64_t*)dst = to_uint64 (dst64);
2967
	    dst += 2;
2968
	    src += 2;
2969
	    w -= 2;
2970
	}
2971
 
2972
	if (w)
2973
	{
2974
	    *dst = _mm_cvtsi64_si32 (_mm_adds_pu8 (_mm_cvtsi32_si64 (*src),
2975
	                                           _mm_cvtsi32_si64 (*dst)));
2976
 
2977
	}
2978
    }
2979
 
2980
    _mm_empty ();
2981
}
2982
 
2983
/* Rectangle blit (straight copy) between two bit buffers of equal
 * depth, using MMX for the bulk of each row.
 *
 * Strides arrive in uint32_t units (pixman rowstride convention) and
 * are converted to pixel units, then to bytes.  Returns FALSE when the
 * depths differ or are neither 16 nor 32 bpp, leaving the caller to
 * fall back; returns TRUE after copying.
 *
 * Per row the copy is staged: 2-byte then 4-byte scalar copies bring
 * the destination to 8-byte alignment, a 64-byte MMX loop moves the
 * bulk, and 4-/2-byte copies finish the remainder.  Only destination
 * alignment is staged; source accesses may be unaligned.
 */
static pixman_bool_t
pixman_blt_mmx (uint32_t *src_bits,
                uint32_t *dst_bits,
                int       src_stride,
                int       dst_stride,
                int       src_bpp,
                int       dst_bpp,
                int       src_x,
                int       src_y,
                int       dst_x,
                int       dst_y,
                int       width,
                int       height)
{
    uint8_t *   src_bytes;
    uint8_t *   dst_bytes;
    int byte_width;

    if (src_bpp != dst_bpp)
	return FALSE;

    if (src_bpp == 16)
    {
	/* uint32_t-unit stride -> uint16_t pixels, then bytes. */
	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
	byte_width = 2 * width;
	src_stride *= 2;
	dst_stride *= 2;
    }
    else if (src_bpp == 32)
    {
	/* uint32_t-unit stride -> uint32_t pixels, then bytes. */
	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dst_y) + (dst_x));
	byte_width = 4 * width;
	src_stride *= 4;
	dst_stride *= 4;
    }
    else
    {
	return FALSE;
    }

    while (height--)
    {
	int w;
	uint8_t *s = src_bytes;
	uint8_t *d = dst_bytes;
	src_bytes += src_stride;
	dst_bytes += dst_stride;
	w = byte_width;

	/* 2-byte copies until d is 4-byte aligned. */
	while (w >= 2 && ((unsigned long)d & 3))
	{
	    *(uint16_t *)d = *(uint16_t *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}

	/* 4-byte copies until d is 8-byte aligned. */
	while (w >= 4 && ((unsigned long)d & 7))
	{
	    *(uint32_t *)d = *(uint32_t *)s;

	    w -= 4;
	    s += 4;
	    d += 4;
	}

	/* Bulk: 64 bytes per iteration through all eight MMX registers.
	 * The inline-asm variant keeps the eight loads ahead of the
	 * eight stores; the intrinsic fallback expresses the same
	 * pattern for other compilers. */
	while (w >= 64)
	{
#if defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))
	    __asm__ (
	        "movq	  (%1),	  %%mm0\n"
	        "movq	 8(%1),	  %%mm1\n"
	        "movq	16(%1),	  %%mm2\n"
	        "movq	24(%1),	  %%mm3\n"
	        "movq	32(%1),	  %%mm4\n"
	        "movq	40(%1),	  %%mm5\n"
	        "movq	48(%1),	  %%mm6\n"
	        "movq	56(%1),	  %%mm7\n"

	        "movq	%%mm0,	  (%0)\n"
	        "movq	%%mm1,	 8(%0)\n"
	        "movq	%%mm2,	16(%0)\n"
	        "movq	%%mm3,	24(%0)\n"
	        "movq	%%mm4,	32(%0)\n"
	        "movq	%%mm5,	40(%0)\n"
	        "movq	%%mm6,	48(%0)\n"
	        "movq	%%mm7,	56(%0)\n"
		:
		: "r" (d), "r" (s)
		: "memory",
		  "%mm0", "%mm1", "%mm2", "%mm3",
		  "%mm4", "%mm5", "%mm6", "%mm7");
#else
	    __m64 v0 = *(__m64 *)(s + 0);
	    __m64 v1 = *(__m64 *)(s + 8);
	    __m64 v2 = *(__m64 *)(s + 16);
	    __m64 v3 = *(__m64 *)(s + 24);
	    __m64 v4 = *(__m64 *)(s + 32);
	    __m64 v5 = *(__m64 *)(s + 40);
	    __m64 v6 = *(__m64 *)(s + 48);
	    __m64 v7 = *(__m64 *)(s + 56);
	    *(__m64 *)(d + 0)  = v0;
	    *(__m64 *)(d + 8)  = v1;
	    *(__m64 *)(d + 16) = v2;
	    *(__m64 *)(d + 24) = v3;
	    *(__m64 *)(d + 32) = v4;
	    *(__m64 *)(d + 40) = v5;
	    *(__m64 *)(d + 48) = v6;
	    *(__m64 *)(d + 56) = v7;
#endif

	    w -= 64;
	    s += 64;
	    d += 64;
	}
	/* Remainder: 4-byte then one final 2-byte copy. */
	while (w >= 4)
	{
	    *(uint32_t *)d = *(uint32_t *)s;

	    w -= 4;
	    s += 4;
	    d += 4;
	}
	if (w >= 2)
	{
	    *(uint16_t *)d = *(uint16_t *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}
    }

    /* Leave MMX state so subsequent FPU code works. */
    _mm_empty ();

    return TRUE;
}
3125
 
3126
static void
3127
mmx_composite_copy_area (pixman_implementation_t *imp,
3128
                         pixman_op_t              op,
3129
                         pixman_image_t *         src_image,
3130
                         pixman_image_t *         mask_image,
3131
                         pixman_image_t *         dst_image,
3132
                         int32_t                  src_x,
3133
                         int32_t                  src_y,
3134
                         int32_t                  mask_x,
3135
                         int32_t                  mask_y,
3136
                         int32_t                  dest_x,
3137
                         int32_t                  dest_y,
3138
                         int32_t                  width,
3139
                         int32_t                  height)
3140
{
3141
    pixman_blt_mmx (src_image->bits.bits,
3142
                    dst_image->bits.bits,
3143
                    src_image->bits.rowstride,
3144
                    dst_image->bits.rowstride,
3145
                    PIXMAN_FORMAT_BPP (src_image->bits.format),
3146
                    PIXMAN_FORMAT_BPP (dst_image->bits.format),
3147
                    src_x, src_y, dest_x, dest_y, width, height);
3148
}
3149
 
3150
#if 0
3151
static void
3152
mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
3153
                                pixman_op_t              op,
3154
                                pixman_image_t *         src_image,
3155
                                pixman_image_t *         mask_image,
3156
                                pixman_image_t *         dst_image,
3157
                                int32_t                  src_x,
3158
                                int32_t                  src_y,
3159
                                int32_t                  mask_x,
3160
                                int32_t                  mask_y,
3161
                                int32_t                  dest_x,
3162
                                int32_t                  dest_y,
3163
                                int32_t                  width,
3164
                                int32_t                  height)
3165
{
3166
    uint32_t  *src, *src_line;
3167
    uint32_t  *dst, *dst_line;
3168
    uint8_t  *mask, *mask_line;
3169
    int src_stride, mask_stride, dst_stride;
3170
    int32_t w;
3171
 
3172
    PIXMAN_IMAGE_GET_LINE (dst_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3173
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3174
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3175
 
3176
    while (height--)
3177
    {
3178
	src = src_line;
3179
	src_line += src_stride;
3180
	dst = dst_line;
3181
	dst_line += dst_stride;
3182
	mask = mask_line;
3183
	mask_line += mask_stride;
3184
 
3185
	w = width;
3186
 
3187
	while (w--)
3188
	{
3189
	    uint64_t m = *mask;
3190
 
3191
	    if (m)
3192
	    {
3193
		__m64 s = load8888 (*src | 0xff000000);
3194
 
3195
		if (m == 0xff)
3196
		{
3197
		    *dst = store8888 (s);
3198
		}
3199
		else
3200
		{
3201
		    __m64 sa = expand_alpha (s);
3202
		    __m64 vm = expand_alpha_rev (to_m64 (m));
3203
		    __m64 vdest = in_over (s, sa, vm, load8888 (*dst));
3204
 
3205
		    *dst = store8888 (vdest);
3206
		}
3207
	    }
3208
 
3209
	    mask++;
3210
	    dst++;
3211
	    src++;
3212
	}
3213
    }
3214
 
3215
    _mm_empty ();
3216
}
3217
#endif
3218
 
3219
static const pixman_fast_path_t mmx_fast_paths[] =
3220
{
3221
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
3222
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
3223
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
3224
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
3225
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
3226
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
3227
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3228
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
3229
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
3230
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3231
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
3232
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
3233
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
3234
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
3235
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
3236
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
3237
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
3238
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
3239
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
3240
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
3241
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
3242
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
3243
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
3244
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
3245
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
3246
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
3247
#if 0
3248
    /* FIXME: This code is commented out since it's apparently
3249
     * not actually faster than the generic code.
3250
     */
3251
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
3252
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
3253
    PIXMAN_STD_FAST_PATH    (OVER, x8b8r8g8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
3254
    PIXMAN_STD_FAST_PATH    (OVER, x8b8r8g8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
3255
#endif
3256
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
3257
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
3258
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
3259
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
3260
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
3261
 
3262
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
3263
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
3264
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
3265
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
3266
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
3267
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),
3268
 
3269
    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
3270
    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
3271
    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8		   ),
3272
    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),
3273
 
3274
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
3275
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
3276
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
3277
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
3278
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
3279
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
3280
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
3281
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
3282
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
3283
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
3284
    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
3285
    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),
3286
 
3287
    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
3288
    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),
3289
 
3290
    { PIXMAN_OP_NONE },
3291
};
3292
 
3293
static pixman_bool_t
3294
mmx_blt (pixman_implementation_t *imp,
3295
         uint32_t *               src_bits,
3296
         uint32_t *               dst_bits,
3297
         int                      src_stride,
3298
         int                      dst_stride,
3299
         int                      src_bpp,
3300
         int                      dst_bpp,
3301
         int                      src_x,
3302
         int                      src_y,
3303
         int                      dst_x,
3304
         int                      dst_y,
3305
         int                      width,
3306
         int                      height)
3307
{
3308
    if (!pixman_blt_mmx (
3309
            src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
3310
            src_x, src_y, dst_x, dst_y, width, height))
3311
 
3312
    {
3313
	return _pixman_implementation_blt (
3314
	    imp->delegate,
3315
	    src_bits, dst_bits, src_stride, dst_stride, src_bpp, dst_bpp,
3316
	    src_x, src_y, dst_x, dst_y, width, height);
3317
    }
3318
 
3319
    return TRUE;
3320
}
3321
 
3322
static pixman_bool_t
3323
mmx_fill (pixman_implementation_t *imp,
3324
          uint32_t *               bits,
3325
          int                      stride,
3326
          int                      bpp,
3327
          int                      x,
3328
          int                      y,
3329
          int                      width,
3330
          int                      height,
3331
          uint32_t xor)
3332
{
3333
    if (!pixman_fill_mmx (bits, stride, bpp, x, y, width, height, xor))
3334
    {
3335
	return _pixman_implementation_fill (
3336
	    imp->delegate, bits, stride, bpp, x, y, width, height, xor);
3337
    }
3338
 
3339
    return TRUE;
3340
}
3341
 
3342
pixman_implementation_t *
3343
_pixman_implementation_create_mmx (void)
3344
{
3345
    pixman_implementation_t *general = _pixman_implementation_create_fast_path ();
3346
    pixman_implementation_t *imp = _pixman_implementation_create (general, mmx_fast_paths);
3347
 
3348
    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
3349
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
3350
    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
3351
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
3352
    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
3353
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
3354
    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
3355
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
3356
    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
3357
    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
3358
    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;
3359
 
3360
    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
3361
    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
3362
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
3363
    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
3364
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
3365
    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
3366
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
3367
    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
3368
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
3369
    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
3370
    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;
3371
 
3372
    imp->blt = mmx_blt;
3373
    imp->fill = mmx_fill;
3374
 
3375
    return imp;
3376
}
3377
 
3378
#endif /* USE_MMX */