/*
 * Copyright © 2004, 2005 Red Hat, Inc.
 * Copyright © 2004 Nicholas Miell
 * Copyright © 2005 Trolltech AS
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Søren Sandmann (sandmann@redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
 *
 * Based on work by Owen Taylor
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI

#ifdef USE_LOONGSON_MMI
#include <loongson-mmintrin.h>
#else
#include <mmintrin.h>
#endif
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"

#ifdef VERBOSE
#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
#else
#define CHECKPOINT()
#endif

#if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{

}
#endif

#ifdef USE_X86_MMX
# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
#  include <xmmintrin.h>
# else
/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
 * instructions to be generated that we don't want. Just duplicate the
 * functions we want to use.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
    int ret;

    asm ("pmovmskb %1, %0\n\t"
	: "=r" (ret)
	: "y" (__A)
    );

    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
    asm ("pmulhuw %1, %0\n\t"
	: "+y" (__A)
	: "y" (__B)
    );
    return __A;
}

#  ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int8_t const __N)
{
    __m64 ret;

    asm ("pshufw %2, %1, %0\n\t"
	: "=y" (ret)
	: "y" (__A), "K" (__N)
    );

    return ret;
}
#  else
#   define _mm_shuffle_pi16(A, N)					\
    ({									\
	__m64 ret;							\
									\
	asm ("pshufw %2, %1, %0\n\t"					\
	     : "=y" (ret)						\
	     : "y" (A), "K" ((const int8_t)N)				\
	);								\
									\
	ret;								\
    })
#  endif
# endif
#endif

#ifndef _MSC_VER
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
#endif
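
/* _MM_SHUFFLE packs four 2-bit word selectors into a single immediate,
 * highest word first, so _mm_shuffle_pi16 (x, _MM_SHUFFLE (3, 3, 3, 3))
 * broadcasts word 3 of x into all four words; expand_alpha() below relies
 * on exactly this to replicate the alpha channel.  The "K" constraint in
 * the asm versions above requires a compile-time constant, which is why
 * the non-optimizing build uses the statement-expression macro instead.
 */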

/* Notes about writing mmx code
 *
 * give memory operands as the second operand. If you give it as the
 * first, gcc will first load it into a register, then use that
 * register
 *
 *   ie. use
 *
 *         _mm_mullo_pi16 (x, mmx_constant);
 *
 *   not
 *
 *         _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies. i.e. when you need a value, try
 * to calculate it from a value that was calculated as early as
 * possible.
 */

/* --------------- MMX primitives ------------------------------------- */

/* If __m64 is defined as a struct or union, then define M64_MEMBER to be
 * the name of the member used to access the data.
 * If __m64 requires using mm_cvt* intrinsics functions to convert between
 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
 * If __m64 and uint64_t values can just be cast to each other directly,
 * then define USE_M64_CASTS.
 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
 */
#ifdef _MSC_VER
# define M64_MEMBER m64_u64
#elif defined(__ICC)
# define USE_CVT_INTRINSICS
#elif defined(USE_LOONGSON_MMI)
# define USE_M64_DOUBLE
#elif defined(__GNUC__)
# define USE_M64_CASTS
#elif defined(__SUNPRO_C)
# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
 * is defined.   If it is used, then the mm_cvt* intrinsics must be used.
 */
#  define USE_CVT_INTRINSICS
# else
/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
 */
#  define M64_MEMBER l_
# endif
#endif

#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
typedef uint64_t mmxdatafield;
#else
typedef __m64 mmxdatafield;
#endif

typedef struct
{
    mmxdatafield mmx_4x00ff;
    mmxdatafield mmx_4x0080;
    mmxdatafield mmx_565_rgb;
    mmxdatafield mmx_565_unpack_multiplier;
    mmxdatafield mmx_565_pack_multiplier;
    mmxdatafield mmx_565_r;
    mmxdatafield mmx_565_g;
    mmxdatafield mmx_565_b;
    mmxdatafield mmx_packed_565_rb;
    mmxdatafield mmx_packed_565_g;
    mmxdatafield mmx_expand_565_g;
    mmxdatafield mmx_expand_565_b;
    mmxdatafield mmx_expand_565_r;
#ifndef USE_LOONGSON_MMI
    mmxdatafield mmx_mask_0;
    mmxdatafield mmx_mask_1;
    mmxdatafield mmx_mask_2;
    mmxdatafield mmx_mask_3;
#endif
    mmxdatafield mmx_full_alpha;
    mmxdatafield mmx_4x0101;
    mmxdatafield mmx_ff000000;
} mmx_data_t;

#if defined(_MSC_VER)
# define MMXDATA_INIT(field, val) { val ## UI64 }
#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
# define MMXDATA_INIT(field, val) field =   { val ## ULL }
#else                           /* mmxdatafield is an integral type */
# define MMXDATA_INIT(field, val) field =   val ## ULL
#endif

static const mmx_data_t c =
{
    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
    MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
    MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
    MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
    MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0),
    MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f),
    MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800),
#ifndef USE_LOONGSON_MMI
    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
#endif
    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
    MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
    MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000),
};

#ifdef USE_CVT_INTRINSICS
#    define MC(x) to_m64 (c.mmx_ ## x)
#elif defined(USE_M64_CASTS)
#    define MC(x) ((__m64)c.mmx_ ## x)
#elif defined(USE_M64_DOUBLE)
#    define MC(x) (*(__m64 *)&c.mmx_ ## x)
#else
#    define MC(x) c.mmx_ ## x
#endif

static force_inline __m64
to_m64 (uint64_t x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtsi64_m64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    __m64 res;

    res.M64_MEMBER = x;
    return res;
#elif defined USE_M64_DOUBLE
    return *(__m64 *)&x;
#else /* USE_M64_CASTS */
    return (__m64)x;
#endif
}

static force_inline uint64_t
to_uint64 (__m64 x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtm64_si64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    uint64_t res = x.M64_MEMBER;
    return res;
#elif defined USE_M64_DOUBLE
    return *(uint64_t *)&x;
#else /* USE_M64_CASTS */
    return (uint64_t)x;
#endif
}

static force_inline __m64
shift (__m64 v,
       int   s)
{
    if (s > 0)
	return _mm_slli_si64 (v, s);
    else if (s < 0)
	return _mm_srli_si64 (v, -s);
    else
	return v;
}

static force_inline __m64
negate (__m64 mask)
{
    return _mm_xor_si64 (mask, MC (4x00ff));
}

static force_inline __m64
pix_multiply (__m64 a, __m64 b)
{
    __m64 res;

    res = _mm_mullo_pi16 (a, b);
    res = _mm_adds_pu16 (res, MC (4x0080));
    res = _mm_mulhi_pu16 (res, MC (4x0101));

    return res;
}
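
/* pix_multiply computes a rounded per-channel a*b/255 without a divide:
 * with t = a*b + 0x80, taking the high 16 bits of t * 0x0101 evaluates
 * (t + (t >> 8)) >> 8, the standard exact div-255 identity.  For example,
 * a = b = 0xff gives t = 0xfe81, and (0xfe81 * 0x0101) >> 16 is 0xff, so
 * full white times full alpha stays full white.
 */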

static force_inline __m64
pix_add (__m64 a, __m64 b)
{
    return _mm_adds_pu8 (a, b);
}

static force_inline __m64
expand_alpha (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline __m64
expand_alpha_rev (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
invert_colors (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m64
over (__m64 src,
      __m64 srca,
      __m64 dest)
{
    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
}

static force_inline __m64
over_rev_non_pre (__m64 src, __m64 dest)
{
    __m64 srca = expand_alpha (src);
    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));

    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
}

static force_inline __m64
in (__m64 src, __m64 mask)
{
    return pix_multiply (src, mask);
}

#ifndef _MSC_VER
static force_inline __m64
in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
{
    return over (in (src, mask), pix_multiply (srca, mask), dest);
}

#else

#define in_over(src, srca, mask, dest)					\
    over (in (src, mask), pix_multiply (srca, mask), dest)

#endif
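
/* The primitives above are the Porter-Duff operators on premultiplied
 * pixels held as four 16-bit channels:
 *
 *    over (s, sa, d)       = s + (255 - sa) * d / 255
 *    in (s, m)             = s * m / 255
 *    in_over (s, sa, m, d) = over (in (s, m), sa * m / 255, d)
 *
 * where the multiplications are pix_multiply() and negate() forms the
 * 255 - x complement of each channel.
 */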

/* Elemental unaligned loads */

static force_inline __m64 ldq_u(__m64 *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed. */
    return *(__m64 *)p;
#elif defined USE_ARM_IWMMXT
    int align = (uintptr_t)p & 7;
    __m64 *aligned_p;
    if (align == 0)
	return *p;
    aligned_p = (__m64 *)((uintptr_t)p & ~7);
    return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
#else
    struct __una_u64 { __m64 x __attribute__((packed)); };
    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
    return (__m64) ptr->x;
#endif
}

static force_inline uint32_t ldl_u(const uint32_t *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed. */
    return *p;
#else
    struct __una_u32 { uint32_t x __attribute__((packed)); };
    const struct __una_u32 *ptr = (const struct __una_u32 *) p;
    return ptr->x;
#endif
}
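
/* On targets without cheap unaligned access, the packed-struct cast above
 * is the usual GCC idiom: __attribute__((packed)) lowers the member's
 * alignment to 1, so the compiler emits a byte-safe load sequence instead
 * of an aligned load that could fault.  The iwMMXt path instead loads the
 * two neighbouring aligned quadwords and lets _mm_align_si64 extract the
 * misaligned span.
 */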

static force_inline __m64
load (const uint32_t *v)
{
#ifdef USE_LOONGSON_MMI
    __m64 ret;
    asm ("lwc1 %0, %1\n\t"
	: "=f" (ret)
	: "m" (*v)
    );
    return ret;
#else
    return _mm_cvtsi32_si64 (*v);
#endif
}

static force_inline __m64
load8888 (const uint32_t *v)
{
#ifdef USE_LOONGSON_MMI
    return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
#else
    return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
#endif
}

static force_inline __m64
load8888u (const uint32_t *v)
{
    uint32_t l = ldl_u (v);
    return load8888 (&l);
}

static force_inline __m64
pack8888 (__m64 lo, __m64 hi)
{
    return _mm_packs_pu16 (lo, hi);
}

static force_inline void
store (uint32_t *dest, __m64 v)
{
#ifdef USE_LOONGSON_MMI
    asm ("swc1 %1, %0\n\t"
	: "=m" (*dest)
	: "f" (v)
	: "memory"
    );
#else
    *dest = _mm_cvtsi64_si32 (v);
#endif
}

static force_inline void
store8888 (uint32_t *dest, __m64 v)
{
    v = pack8888 (v, _mm_setzero_si64 ());
    store (dest, v);
}

static force_inline pixman_bool_t
is_equal (__m64 a, __m64 b)
{
#ifdef USE_LOONGSON_MMI
    /* __m64 is double, we can compare directly. */
    return a == b;
#else
    return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
#endif
}

static force_inline pixman_bool_t
is_opaque (__m64 v)
{
#ifdef USE_LOONGSON_MMI
    return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
#else
    __m64 ffs = _mm_cmpeq_pi8 (v, v);
    return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
#endif
}

static force_inline pixman_bool_t
is_zero (__m64 v)
{
    return is_equal (v, _mm_setzero_si64 ());
}
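
/* In these predicates v is an unpacked pixel (00AA 00RR 00GG 00BB), so
 * its alpha byte is byte 6.  _mm_movemask_pi8 gathers the top bit of
 * each byte of the comparison result, hence the 0x40 test in is_opaque()
 * asks whether the alpha byte compared equal to 0xff.
 */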

/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 *
 *    00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565 (__m64 pixel, int pos)
{
    __m64 p = pixel;
    __m64 t1, t2;

    /* move pixel to low 16 bit and zero the rest */
#ifdef USE_LOONGSON_MMI
    p = loongson_extract_pi16 (p, pos);
#else
    p = shift (shift (p, (3 - pos) * 16), -48);
#endif

    t1 = shift (p, 36 - 11);
    t2 = shift (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, MC (565_rgb));

    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
    return _mm_srli_pi16 (pixel, 8);
}
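
/* The unpack multiplier replicates each channel's high bits into its low
 * bits, the usual way to widen 5- or 6-bit channels to 8 bits.  Taking
 * blue as an example: a 5-bit value b times 0x0840 is (b << 11) + (b << 6),
 * and after the final shift right by 8 that is (b << 3) | (b >> 2), i.e.
 * approximately b * 255 / 31.
 */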

/* Expand 4 16 bit pixels in an mmx register into two mmx registers of
 *
 *    AARRGGBBRRGGBB
 */
static force_inline void
expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
{
    __m64 t0, t1, alpha = _mm_setzero_si64 ();
    __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
    __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
    __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
    if (full_alpha)
	alpha = _mm_cmpeq_pi32 (alpha, alpha);

    /* Replicate high bits into empty low bits. */
    r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
    g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
    b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));

    r = _mm_packs_pu16 (r, _mm_setzero_si64 ());	/* 00 00 00 00 R3 R2 R1 R0 */
    g = _mm_packs_pu16 (g, _mm_setzero_si64 ());	/* 00 00 00 00 G3 G2 G1 G0 */
    b = _mm_packs_pu16 (b, _mm_setzero_si64 ());	/* 00 00 00 00 B3 B2 B1 B0 */

    t1 = _mm_unpacklo_pi8 (r, alpha);			/* A3 R3 A2 R2 A1 R1 A0 R0 */
    t0 = _mm_unpacklo_pi8 (b, g);			/* G3 B3 G2 B2 G1 B1 G0 B0 */

    *vout0 = _mm_unpacklo_pi16 (t0, t1);		/* A1 R1 G1 B1 A0 R0 G0 B0 */
    *vout1 = _mm_unpackhi_pi16 (t0, t1);		/* A3 R3 G3 B3 A2 R2 G2 B2 */
}

static force_inline __m64
expand8888 (__m64 in, int pos)
{
    if (pos == 0)
	return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
    else
	return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
}

static force_inline __m64
expandx888 (__m64 in, int pos)
{
    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
}

static force_inline void
expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
{
    __m64 v0, v1;
    expand_4xpacked565 (vin, &v0, &v1, full_alpha);
    *vout0 = expand8888 (v0, 0);
    *vout1 = expand8888 (v0, 1);
    *vout2 = expand8888 (v1, 0);
    *vout3 = expand8888 (v1, 1);
}

static force_inline __m64
pack_565 (__m64 pixel, __m64 target, int pos)
{
    __m64 p = pixel;
    __m64 t = target;
    __m64 r, g, b;

    r = _mm_and_si64 (p, MC (565_r));
    g = _mm_and_si64 (p, MC (565_g));
    b = _mm_and_si64 (p, MC (565_b));

#ifdef USE_LOONGSON_MMI
    r = shift (r, -(32 - 8));
    g = shift (g, -(16 - 3));
    b = shift (b, -(0  + 3));

    p = _mm_or_si64 (r, g);
    p = _mm_or_si64 (p, b);
    return loongson_insert_pi16 (t, p, pos);
#else
    r = shift (r, -(32 - 8) + pos * 16);
    g = shift (g, -(16 - 3) + pos * 16);
    b = shift (b, -(0  + 3) + pos * 16);

    if (pos == 0)
	t = _mm_and_si64 (t, MC (mask_0));
    else if (pos == 1)
	t = _mm_and_si64 (t, MC (mask_1));
    else if (pos == 2)
	t = _mm_and_si64 (t, MC (mask_2));
    else if (pos == 3)
	t = _mm_and_si64 (t, MC (mask_3));

    p = _mm_or_si64 (r, t);
    p = _mm_or_si64 (g, p);

    return _mm_or_si64 (b, p);
#endif
}

static force_inline __m64
pack_4xpacked565 (__m64 a, __m64 b)
{
    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));

    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));

    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));

    t0 = _mm_or_si64 (t0, g0);
    t1 = _mm_or_si64 (t1, g1);

    t0 = shift(t0, -5);
#ifdef USE_ARM_IWMMXT
    t1 = shift(t1, -5);
    return _mm_packs_pu32 (t0, t1);
#else
    t1 = shift(t1, -5 + 16);
    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
#endif
}
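
/* The pmaddwd trick above packs red and blue in one step: within each
 * 32-bit pixel the blue byte (masked to 0x00f8) is multiplied by 4 and
 * the red byte (one 16-bit lane higher) by 0x2000, and the two products
 * are summed, leaving red in bits 16-20 and blue in bits 5-9.  After
 * OR-ing in green (bits 10-15) and shifting right by 5, each pixel holds
 * its 565 value in the low 16 bits of its lane.
 */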

#ifndef _MSC_VER

static force_inline __m64
pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
{
    return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
}

static force_inline __m64
pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
{
    x = pix_multiply (x, a);
    y = pix_multiply (y, b);

    return pix_add (x, y);
}

#else

/* MSVC only handles a "pass by register" of up to three SSE intrinsics */

#define pack_4x565(v0, v1, v2, v3) \
    pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))

#define pix_add_mul(x, a, y, b)	 \
    ( x = pix_multiply (x, a),	 \
      y = pix_multiply (y, b),	 \
      pix_add (x, y) )

#endif

/* --------------- MMX code patch for fbcompose.c --------------------- */

static force_inline __m64
combine (const uint32_t *src, const uint32_t *mask)
{
    __m64 vsrc = load8888 (src);

    if (mask)
    {
	__m64 m = load8888 (mask);

	m = expand_alpha (m);
	vsrc = pix_multiply (vsrc, m);
    }

    return vsrc;
}
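
/* combine() produces the effective source pixel for the "unified" (_u)
 * combiners: only the alpha channel of the mask matters here, so the
 * mask pixel's alpha is expanded to all four channels and multiplied
 * into the source.  The component-alpha variants further down use the
 * full mask.
 */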

static force_inline __m64
core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
{
    vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());

    if (is_opaque (vsrc))
    {
	return vsrc;
    }
    else if (!is_zero (vsrc))
    {
	return over (vsrc, expand_alpha (vsrc),
		     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
    }

    return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
}

static void
mmx_combine_over_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 vsrc = combine (src, mask);

	if (is_opaque (vsrc))
	{
	    store8888 (dest, vsrc);
	}
	else if (!is_zero (vsrc))
	{
	    __m64 sa = expand_alpha (vsrc);
	    store8888 (dest, over (vsrc, sa, load8888 (dest)));
	}

	++dest;
	++src;
	if (mask)
	    ++mask;
    }
    _mm_empty ();
}
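
/* The two tests above are the classic OVER shortcuts: a fully opaque
 * source replaces the destination outright and a fully transparent one
 * leaves it untouched, so the destination load and the blend are only
 * paid for genuinely translucent pixels.
 */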

static void
mmx_combine_over_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 d, da;
	__m64 s = combine (src, mask);

	d = load8888 (dest);
	da = expand_alpha (d);
	store8888 (dest, over (d, da, s));

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_in_u (pixman_implementation_t *imp,
                  pixman_op_t              op,
                  uint32_t *               dest,
                  const uint32_t *         src,
                  const uint32_t *         mask,
                  int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 a;
	__m64 x = combine (src, mask);

	a = load8888 (dest);
	a = expand_alpha (a);
	x = pix_multiply (x, a);

	store8888 (dest, x);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          pixman_op_t              op,
                          uint32_t *               dest,
                          const uint32_t *         src,
                          const uint32_t *         mask,
                          int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 a = combine (src, mask);
	__m64 x;

	x = load8888 (dest);
	a = expand_alpha (a);
	x = pix_multiply (x, a);
	store8888 (dest, x);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_out_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 a;
	__m64 x = combine (src, mask);

	a = load8888 (dest);
	a = expand_alpha (a);
	a = negate (a);
	x = pix_multiply (x, a);
	store8888 (dest, x);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 a = combine (src, mask);
	__m64 x;

	x = load8888 (dest);
	a = expand_alpha (a);
	a = negate (a);
	x = pix_multiply (x, a);

	store8888 (dest, x);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 da, d, sia;
	__m64 s = combine (src, mask);

	d = load8888 (dest);
	sia = expand_alpha (s);
	sia = negate (sia);
	da = expand_alpha (d);
	s = pix_add_mul (s, da, d, sia);
	store8888 (dest, s);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end;

    end = dest + width;

    while (dest < end)
    {
	__m64 dia, d, sa;
	__m64 s = combine (src, mask);

	d = load8888 (dest);
	sa = expand_alpha (s);
	dia = expand_alpha (d);
	dia = negate (dia);
	s = pix_add_mul (s, dia, d, sa);
	store8888 (dest, s);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_xor_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 dia, d, sia;
	__m64 s = combine (src, mask);

	d = load8888 (dest);
	sia = expand_alpha (s);
	dia = expand_alpha (d);
	sia = negate (sia);
	dia = negate (dia);
	s = pix_add_mul (s, dia, d, sia);
	store8888 (dest, s);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_add_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 d;
	__m64 s = combine (src, mask);

	d = load8888 (dest);
	s = pix_add (s, d);
	store8888 (dest, s);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_saturate_u (pixman_implementation_t *imp,
                        pixman_op_t              op,
                        uint32_t *               dest,
                        const uint32_t *         src,
                        const uint32_t *         mask,
                        int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	uint32_t s, sa, da;
	uint32_t d = *dest;
	__m64 ms = combine (src, mask);
	__m64 md = load8888 (dest);

	store8888(&s, ms);
	da = ~d >> 24;
	sa = s >> 24;

	if (sa > da)
	{
	    uint32_t quot = DIV_UN8 (da, sa) << 24;
	    __m64 msa = load8888 (&quot);
	    msa = expand_alpha (msa);
	    ms = pix_multiply (ms, msa);
	}

	md = pix_add (md, ms);
	store8888 (dest, md);

	++src;
	++dest;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}
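
/* SATURATE adds as much of the source as the destination can still
 * absorb: when the source alpha exceeds the free destination alpha
 * (~d >> 24), the source is pre-scaled by DIV_UN8 (da, sa), i.e.
 * da * 255 / sa, so the saturating add below cannot overshoot in the
 * alpha channel.
 */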

static void
mmx_combine_src_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);

	s = pix_multiply (s, a);
	store8888 (dest, s);

	++src;
	++mask;
	++dest;
    }
    _mm_empty ();
}

static void
mmx_combine_over_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 sa = expand_alpha (s);

	store8888 (dest, in_over (s, sa, a, d));

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 da = expand_alpha (d);

	store8888 (dest, over (d, da, in (s, a)));

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_in_ca (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 da = expand_alpha (d);

	s = pix_multiply (s, a);
	s = pix_multiply (s, da);
	store8888 (dest, s);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 sa = expand_alpha (s);

	a = pix_multiply (a, sa);
	d = pix_multiply (d, a);
	store8888 (dest, d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_out_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 da = expand_alpha (d);

	da = negate (da);
	s = pix_multiply (s, a);
	s = pix_multiply (s, da);
	store8888 (dest, s);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 sa = expand_alpha (s);

	a = pix_multiply (a, sa);
	a = negate (a);
	d = pix_multiply (d, a);
	store8888 (dest, d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 da = expand_alpha (d);
	__m64 sa = expand_alpha (s);

	s = pix_multiply (s, a);
	a = pix_multiply (a, sa);
	a = negate (a);
	d = pix_add_mul (d, a, s, da);
	store8888 (dest, d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 da = expand_alpha (d);
	__m64 sa = expand_alpha (s);

	s = pix_multiply (s, a);
	a = pix_multiply (a, sa);
	da = negate (da);
	d = pix_add_mul (d, a, s, da);
	store8888 (dest, d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_xor_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 da = expand_alpha (d);
	__m64 sa = expand_alpha (s);

	s = pix_multiply (s, a);
	a = pix_multiply (a, sa);
	da = negate (da);
	a = negate (a);
	d = pix_add_mul (d, a, s, da);
	store8888 (dest, d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_add_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);

	s = pix_multiply (s, a);
	d = pix_add (s, d);
	store8888 (dest, d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
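
/* Unlike the _u combiners, the component-alpha (_ca) variants above use
 * all four mask channels: the source is multiplied by the mask channel
 * by channel, and wherever an operator calls for "source alpha" it uses
 * the per-channel product mask * srca instead of a single scalar alpha.
 */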

/* ------------- MMX code paths called from fbpict.c -------------------- */

static void
mmx_composite_over_n_8888 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t    *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));

	    w--;
	    dst++;
	}

	while (w >= 2)
	{
	    __m64 vdest;
	    __m64 dest0, dest1;

	    vdest = *(__m64 *)dst;

	    dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
	    dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));

	    *(__m64 *)dst = pack8888 (dest0, dest1);

	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	if (w)
	{
	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
	}
    }

    _mm_empty ();
}
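
/* The function above shows the loop shape shared by all the fast paths
 * that follow: a scalar prologue runs until dst reaches 8-byte alignment,
 * the main loop then writes two (or four) pixels per 64-bit store, and a
 * scalar tail finishes the row.  Each path ends with _mm_empty () so the
 * FPU state is usable again after the MMX run.
 */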

static void
mmx_composite_over_n_0565 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t    *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	}

	while (w >= 4)
	{
	    __m64 vdest = *(__m64 *)dst;
	    __m64 v0, v1, v2, v3;

	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

	    v0 = over (vsrc, vsrca, v0);
	    v1 = over (vsrc, vsrca, v1);
	    v2 = over (vsrc, vsrca, v2);
	    v3 = over (vsrc, vsrca, v3);

	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);

	    dst += 4;
	    w -= 4;
	}

	CHECKPOINT ();

	while (w)
	{
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	}
    }

    _mm_empty ();
}

static void
mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t    *dst_line;
    uint32_t    *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	int twidth = width;
	uint32_t *p = (uint32_t *)mask_line;
	uint32_t *q = (uint32_t *)dst_line;

	while (twidth && (uintptr_t)q & 7)
	{
	    uint32_t m = *(uint32_t *)p;

	    if (m)
	    {
		__m64 vdest = load8888 (q);
		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
		store8888 (q, vdest);
	    }

	    twidth--;
	    p++;
	    q++;
	}

	while (twidth >= 2)
	{
	    uint32_t m0, m1;
	    m0 = *p;
	    m1 = *(p + 1);

	    if (m0 | m1)
	    {
		__m64 dest0, dest1;
		__m64 vdest = *(__m64 *)q;

		dest0 = in_over (vsrc, vsrca, load8888 (&m0),
		                 expand8888 (vdest, 0));
		dest1 = in_over (vsrc, vsrca, load8888 (&m1),
		                 expand8888 (vdest, 1));

		*(__m64 *)q = pack8888 (dest0, dest1);
	    }

	    p += 2;
	    q += 2;
	    twidth -= 2;
	}

	if (twidth)
	{
	    uint32_t m = *(uint32_t *)p;

	    if (m)
	    {
		__m64 vdest = load8888 (q);
		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
		store8888 (q, vdest);
	    }

	    twidth--;
	    p++;
	    q++;
	}

	dst_line += dst_stride;
	mask_line += mask_stride;
    }

    _mm_empty ();
}

static void
mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
    vmask = expand_alpha (load8888 (&mask));

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    __m64 s = load8888 (src);
	    __m64 d = load8888 (dst);

	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));

	    w--;
	    dst++;
	    src++;
	}

	while (w >= 2)
	{
	    __m64 vs = ldq_u ((__m64 *)src);
	    __m64 vd = *(__m64 *)dst;
	    __m64 vsrc0 = expand8888 (vs, 0);
	    __m64 vsrc1 = expand8888 (vs, 1);

	    *(__m64 *)dst = pack8888 (
	        in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
	        in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));

	    w -= 2;
	    dst += 2;
	    src += 2;
	}

	if (w)
	{
	    __m64 s = load8888 (src);
	    __m64 d = load8888 (dst);

	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
	}
    }

    _mm_empty ();
}

static void
mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;
    __m64 srca;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);

    vmask = expand_alpha (load8888 (&mask));
    srca = MC (4x00ff);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    uint32_t ssrc = *src | 0xff000000;
	    __m64 s = load8888 (&ssrc);
	    __m64 d = load8888 (dst);

	    store8888 (dst, in_over (s, srca, vmask, d));

	    w--;
	    dst++;
	    src++;
	}

	while (w >= 16)
	{
	    __m64 vd0 = *(__m64 *)(dst + 0);
	    __m64 vd1 = *(__m64 *)(dst + 2);
	    __m64 vd2 = *(__m64 *)(dst + 4);
	    __m64 vd3 = *(__m64 *)(dst + 6);
	    __m64 vd4 = *(__m64 *)(dst + 8);
	    __m64 vd5 = *(__m64 *)(dst + 10);
	    __m64 vd6 = *(__m64 *)(dst + 12);
	    __m64 vd7 = *(__m64 *)(dst + 14);

	    __m64 vs0 = ldq_u ((__m64 *)(src + 0));
	    __m64 vs1 = ldq_u ((__m64 *)(src + 2));
	    __m64 vs2 = ldq_u ((__m64 *)(src + 4));
	    __m64 vs3 = ldq_u ((__m64 *)(src + 6));
	    __m64 vs4 = ldq_u ((__m64 *)(src + 8));
	    __m64 vs5 = ldq_u ((__m64 *)(src + 10));
	    __m64 vs6 = ldq_u ((__m64 *)(src + 12));
	    __m64 vs7 = ldq_u ((__m64 *)(src + 14));

	    vd0 = pack8888 (
	        in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
	        in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));

	    vd1 = pack8888 (
	        in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
	        in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));

	    vd2 = pack8888 (
	        in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
	        in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));

	    vd3 = pack8888 (
	        in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
	        in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));

	    vd4 = pack8888 (
	        in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
	        in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));

	    vd5 = pack8888 (
	        in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
	        in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));

	    vd6 = pack8888 (
	        in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
	        in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));

	    vd7 = pack8888 (
	        in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
	        in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));

	    *(__m64 *)(dst + 0) = vd0;
	    *(__m64 *)(dst + 2) = vd1;
	    *(__m64 *)(dst + 4) = vd2;
	    *(__m64 *)(dst + 6) = vd3;
	    *(__m64 *)(dst + 8) = vd4;
	    *(__m64 *)(dst + 10) = vd5;
	    *(__m64 *)(dst + 12) = vd6;
	    *(__m64 *)(dst + 14) = vd7;

	    w -= 16;
	    dst += 16;
	    src += 16;
	}

	while (w)
	{
	    uint32_t ssrc = *src | 0xff000000;
	    __m64 s = load8888 (&ssrc);
	    __m64 d = load8888 (dst);

	    store8888 (dst, in_over (s, srca, vmask, d));

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}
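
/* The 16-pixel inner loop above trades code size for memory parallelism:
 * eight destination quadwords and eight (possibly unaligned) source
 * quadwords are loaded up front, combined, and stored back, keeping the
 * loads well ahead of the dependent in_over() chains.  Because x888
 * sources carry no alpha, srca is the constant 4x00ff vector rather than
 * an expanded per-pixel alpha.
 */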
1803
 
1804
static void
1805
mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
3931 Serge 1806
                              pixman_composite_info_t *info)
1891 serge 1807
{
3931 Serge 1808
    PIXMAN_COMPOSITE_ARGS (info);
1891 serge 1809
    uint32_t *dst_line, *dst;
1810
    uint32_t *src_line, *src;
1811
    uint32_t s;
1812
    int dst_stride, src_stride;
1813
    uint8_t a;
1814
    int32_t w;
1815
 
1816
    CHECKPOINT ();
1817
 
3931 Serge 1818
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1891 serge 1819
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1820
 
1821
    while (height--)
1822
    {
1823
	dst = dst_line;
1824
	dst_line += dst_stride;
1825
	src = src_line;
1826
	src_line += src_stride;
1827
	w = width;
1828
 
1829
	while (w--)
1830
	{
1831
	    s = *src++;
1832
	    a = s >> 24;
1833
 
1834
	    if (a == 0xff)
1835
	    {
1836
		*dst = s;
1837
	    }
1838
	    else if (s)
1839
	    {
1840
		__m64 ms, sa;
3931 Serge 1841
		ms = load8888 (&s);
1891 serge 1842
		sa = expand_alpha (ms);
3931 Serge 1843
		store8888 (dst, over (ms, sa, load8888 (dst)));
1891 serge 1844
	    }
1845
 
1846
	    dst++;
1847
	}
1848
    }
1849
    _mm_empty ();
1850
}
1851
 
1852
static void
mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    __m64 vsrc = load8888 (src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (
		over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}

	CHECKPOINT ();

	while (w >= 4)
	{
	    __m64 vdest = *(__m64 *)dst;
	    __m64 v0, v1, v2, v3;
	    __m64 vsrc0, vsrc1, vsrc2, vsrc3;

	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

	    vsrc0 = load8888 ((src + 0));
	    vsrc1 = load8888 ((src + 1));
	    vsrc2 = load8888 ((src + 2));
	    vsrc3 = load8888 ((src + 3));

	    v0 = over (vsrc0, expand_alpha (vsrc0), v0);
	    v1 = over (vsrc1, expand_alpha (vsrc1), v1);
	    v2 = over (vsrc2, expand_alpha (vsrc2), v2);
	    v3 = over (vsrc3, expand_alpha (vsrc3), v3);

	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	CHECKPOINT ();

	while (w)
	{
	    __m64 vsrc = load8888 (src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}

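/* Solid source OVER an a8r8g8b8 destination through an a8 mask.  When the
 * source is opaque and two adjacent mask bytes are both 0xff, the pair of
 * destination pixels is overwritten with the precomputed 64-bit pattern in
 * srcsrc, skipping the blend entirely.
 */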
static void
mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca;
    uint64_t srcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
	return;

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		__m64 vdest = in_over (vsrc, vsrca,
				       expand_alpha_rev (to_m64 (m)),
				       load8888 (dst));

		store8888 (dst, vdest);
	    }

	    w--;
	    mask++;
	    dst++;
	}

	CHECKPOINT ();

	while (w >= 2)
	{
	    uint64_t m0, m1;

	    m0 = *mask;
	    m1 = *(mask + 1);

	    if (srca == 0xff && (m0 & m1) == 0xff)
	    {
		*(uint64_t *)dst = srcsrc;
	    }
	    else if (m0 | m1)
	    {
		__m64 vdest;
		__m64 dest0, dest1;

		vdest = *(__m64 *)dst;

		dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
				 expand8888 (vdest, 0));
		dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
				 expand8888 (vdest, 1));

		*(__m64 *)dst = pack8888 (dest0, dest1);
	    }

	    mask += 2;
	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	if (w)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		__m64 vdest = load8888 (dst);

		vdest = in_over (
		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
		store8888 (dst, vdest);
	    }
	}
    }

    _mm_empty ();
}

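/* Solid fill at 8, 16 or 32 bpp.  The filler value is replicated to 32
 * bits, then to the 64-bit vfill.  Each row is written with byte/word/
 * dword stores until the pointer is 8-byte aligned, then in 64-byte bursts
 * (eight movq stores from vfill and its seven preloaded register copies on
 * GCC/x86), and the remainder is peeled off in the reverse order.
 */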
static pixman_bool_t
mmx_fill (pixman_implementation_t *imp,
          uint32_t *               bits,
          int                      stride,
          int                      bpp,
          int                      x,
          int                      y,
          int                      width,
          int                      height,
          uint32_t		   filler)
{
    uint64_t fill;
    __m64 vfill;
    uint32_t byte_width;
    uint8_t     *byte_line;

#if defined __GNUC__ && defined USE_X86_MMX
    __m64 v1, v2, v3, v4, v5, v6, v7;
#endif

    if (bpp != 16 && bpp != 32 && bpp != 8)
	return FALSE;

    if (bpp == 8)
    {
	stride = stride * (int) sizeof (uint32_t) / 1;
	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
	byte_width = width;
	stride *= 1;
        filler = (filler & 0xff) * 0x01010101;
    }
    else if (bpp == 16)
    {
	stride = stride * (int) sizeof (uint32_t) / 2;
	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
	byte_width = 2 * width;
	stride *= 2;
        filler = (filler & 0xffff) * 0x00010001;
    }
    else
    {
	stride = stride * (int) sizeof (uint32_t) / 4;
	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
	byte_width = 4 * width;
	stride *= 4;
    }

    fill = ((uint64_t)filler << 32) | filler;
    vfill = to_m64 (fill);

#if defined __GNUC__ && defined USE_X86_MMX
    __asm__ (
        "movq		%7,	%0\n"
        "movq		%7,	%1\n"
        "movq		%7,	%2\n"
        "movq		%7,	%3\n"
        "movq		%7,	%4\n"
        "movq		%7,	%5\n"
        "movq		%7,	%6\n"
	: "=&y" (v1), "=&y" (v2), "=&y" (v3),
	  "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
	: "y" (vfill));
#endif

    while (height--)
    {
	int w;
	uint8_t *d = byte_line;

	byte_line += stride;
	w = byte_width;

	if (w >= 1 && ((uintptr_t)d & 1))
	{
	    *(uint8_t *)d = (filler & 0xff);
	    w--;
	    d++;
	}

	if (w >= 2 && ((uintptr_t)d & 3))
	{
	    *(uint16_t *)d = filler;
	    w -= 2;
	    d += 2;
	}

	while (w >= 4 && ((uintptr_t)d & 7))
	{
	    *(uint32_t *)d = filler;

	    w -= 4;
	    d += 4;
	}

	while (w >= 64)
	{
#if defined __GNUC__ && defined USE_X86_MMX
	    __asm__ (
	        "movq	%1,	  (%0)\n"
	        "movq	%2,	 8(%0)\n"
	        "movq	%3,	16(%0)\n"
	        "movq	%4,	24(%0)\n"
	        "movq	%5,	32(%0)\n"
	        "movq	%6,	40(%0)\n"
	        "movq	%7,	48(%0)\n"
	        "movq	%8,	56(%0)\n"
		:
		: "r" (d),
		  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
		  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
		: "memory");
#else
	    *(__m64*) (d +  0) = vfill;
	    *(__m64*) (d +  8) = vfill;
	    *(__m64*) (d + 16) = vfill;
	    *(__m64*) (d + 24) = vfill;
	    *(__m64*) (d + 32) = vfill;
	    *(__m64*) (d + 40) = vfill;
	    *(__m64*) (d + 48) = vfill;
	    *(__m64*) (d + 56) = vfill;
#endif
	    w -= 64;
	    d += 64;
	}

	while (w >= 4)
	{
	    *(uint32_t *)d = filler;

	    w -= 4;
	    d += 4;
	}
	if (w >= 2)
	{
	    *(uint16_t *)d = filler;
	    w -= 2;
	    d += 2;
	}
	if (w >= 1)
	{
	    *(uint8_t *)d = (filler & 0xff);
	    w--;
	    d++;
	}
    }

    _mm_empty ();
    return TRUE;
}

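/* SRC conversion from x8r8g8b8 to r5g6b5: no blending, just truncating
 * each 8-bit channel to 5-6-5 and repacking four pixels per 64-bit store.
 */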
static void
mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst;
    uint32_t    *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    s = *src++;
	    *dst = convert_8888_to_0565 (s);
	    dst++;
	    w--;
	}

	while (w >= 4)
	{
	    __m64 vdest;
	    __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
	    __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));

	    vdest = pack_4xpacked565 (vsrc0, vsrc1);

	    *(__m64 *)dst = vdest;

	    w -= 4;
	    src += 4;
	    dst += 4;
	}

	while (w)
	{
	    s = *src++;
	    *dst = convert_8888_to_0565 (s);
	    dst++;
	    w--;
	}
    }

    _mm_empty ();
}

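/* Solid source combined with an a8 mask using SRC: the destination is
 * replaced by src * mask, so zero mask bytes clear the destination
 * outright, and a fully transparent source degenerates to mmx_fill (0).
 */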
static void
mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
                            pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t    *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc;
    uint64_t srcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
    {
	mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
		  PIXMAN_FORMAT_BPP (dest_image->bits.format),
		  dest_x, dest_y, width, height, 0);
	return;
    }

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));

		store8888 (dst, vdest);
	    }
	    else
	    {
		*dst = 0;
	    }

	    w--;
	    mask++;
	    dst++;
	}

	CHECKPOINT ();

	while (w >= 2)
	{
	    uint64_t m0, m1;
	    m0 = *mask;
	    m1 = *(mask + 1);

	    if (srca == 0xff && (m0 & m1) == 0xff)
	    {
		*(uint64_t *)dst = srcsrc;
	    }
	    else if (m0 | m1)
	    {
		__m64 dest0, dest1;

		dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
		dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));

		*(__m64 *)dst = pack8888 (dest0, dest1);
	    }
	    else
	    {
		*(uint64_t *)dst = 0;
	    }

	    mask += 2;
	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	if (w)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));

		store8888 (dst, vdest);
	    }
	    else
	    {
		*dst = 0;
	    }
	}
    }

    _mm_empty ();
}

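/* Solid source OVER an r5g6b5 destination through an a8 mask.  The
 * four-pixel loop expands a 64-bit run of destination pixels, applies
 * in_over per pixel, and repacks.  For the all-opaque shortcut,
 * srcsrcsrcsrc caches the source already converted to four packed 565
 * copies (expand_alpha_rev is reused here to broadcast the low word).
 */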
static void
mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint16_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca, tmp;
    __m64 srcsrcsrcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
    srcsrcsrcsrc = expand_alpha_rev (tmp);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		uint64_t d = *dst;
		__m64 vd = to_m64 (d);
		__m64 vdest = in_over (
		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));

		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
		*dst = to_uint64 (vd);
	    }

	    w--;
	    mask++;
	    dst++;
	}

	CHECKPOINT ();

	while (w >= 4)
	{
	    uint64_t m0, m1, m2, m3;
	    m0 = *mask;
	    m1 = *(mask + 1);
	    m2 = *(mask + 2);
	    m3 = *(mask + 3);

	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
	    {
		*(__m64 *)dst = srcsrcsrcsrc;
	    }
	    else if (m0 | m1 | m2 | m3)
	    {
		__m64 vdest = *(__m64 *)dst;
		__m64 v0, v1, v2, v3;
		__m64 vm0, vm1, vm2, vm3;

		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

		vm0 = to_m64 (m0);
		v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);

		vm1 = to_m64 (m1);
		v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);

		vm2 = to_m64 (m2);
		v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);

		vm3 = to_m64 (m3);
		v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);

		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
	    }

	    w -= 4;
	    mask += 4;
	    dst += 4;
	}

	CHECKPOINT ();

	while (w)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		uint64_t d = *dst;
		__m64 vd = to_m64 (d);
		__m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
				       expand565 (vd, 0));
		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
		*dst = to_uint64 (vd);
	    }

	    w--;
	    mask++;
	    dst++;
	}
    }

    _mm_empty ();
}

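/* The pixbuf paths handle OVER from a non-premultiplied source whose R and
 * B channels are swapped relative to the destination.  over_rev_non_pre in
 * effect undoes both at once: the channels are swapped back (invert_colors)
 * and the pixel is premultiplied by its alpha before the usual OVER blend;
 * fully opaque pixels shortcut to a plain channel swap.
 */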
static void
mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    __m64 vsrc = load8888 (src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}

	CHECKPOINT ();

	while (w >= 4)
	{
	    uint32_t s0, s1, s2, s3;
	    unsigned char a0, a1, a2, a3;

	    s0 = *src;
	    s1 = *(src + 1);
	    s2 = *(src + 2);
	    s3 = *(src + 3);

	    a0 = (s0 >> 24);
	    a1 = (s1 >> 24);
	    a2 = (s2 >> 24);
	    a3 = (s3 >> 24);

	    if ((a0 & a1 & a2 & a3) == 0xFF)
	    {
		__m64 v0 = invert_colors (load8888 (&s0));
		__m64 v1 = invert_colors (load8888 (&s1));
		__m64 v2 = invert_colors (load8888 (&s2));
		__m64 v3 = invert_colors (load8888 (&s3));

		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
	    }
	    else if (s0 | s1 | s2 | s3)
	    {
		__m64 vdest = *(__m64 *)dst;
		__m64 v0, v1, v2, v3;

		__m64 vsrc0 = load8888 (&s0);
		__m64 vsrc1 = load8888 (&s1);
		__m64 vsrc2 = load8888 (&s2);
		__m64 vsrc3 = load8888 (&s3);

		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

		v0 = over_rev_non_pre (vsrc0, v0);
		v1 = over_rev_non_pre (vsrc1, v1);
		v2 = over_rev_non_pre (vsrc2, v2);
		v3 = over_rev_non_pre (vsrc3, v3);

		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
	    }

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	CHECKPOINT ();

	while (w)
	{
	    __m64 vsrc = load8888 (src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}

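/* Same non-premultiplied source, but with an a8r8g8b8 destination: opaque
 * pixel pairs only need the channel swap, everything else goes through
 * over_rev_non_pre two pixels at a time.
 */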
static void
mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    __m64 s = load8888 (src);
	    __m64 d = load8888 (dst);

	    store8888 (dst, over_rev_non_pre (s, d));

	    w--;
	    dst++;
	    src++;
	}

	while (w >= 2)
	{
	    uint32_t s0, s1;
	    unsigned char a0, a1;
	    __m64 d0, d1;

	    s0 = *src;
	    s1 = *(src + 1);

	    a0 = (s0 >> 24);
	    a1 = (s1 >> 24);

	    if ((a0 & a1) == 0xFF)
	    {
		d0 = invert_colors (load8888 (&s0));
		d1 = invert_colors (load8888 (&s1));

		*(__m64 *)dst = pack8888 (d0, d1);
	    }
	    else if (s0 | s1)
	    {
		__m64 vdest = *(__m64 *)dst;

		d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
		d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));

		*(__m64 *)dst = pack8888 (d0, d1);
	    }

	    w -= 2;
	    dst += 2;
	    src += 2;
	}

	if (w)
	{
	    __m64 s = load8888 (src);
	    __m64 d = load8888 (dst);

	    store8888 (dst, over_rev_non_pre (s, d));
	}
    }

    _mm_empty ();
}

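/* Component-alpha variant: the mask is a full a8r8g8b8 image and each of
 * its channels scales the matching source channel, so in_over takes the
 * whole mask pixel instead of a broadcast alpha.  Destination is r5g6b5.
 */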
static void
mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t    *dst_line;
    uint32_t    *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	int twidth = width;
	uint32_t *p = (uint32_t *)mask_line;
	uint16_t *q = (uint16_t *)dst_line;

	while (twidth && ((uintptr_t)q & 7))
	{
	    uint32_t m = *(uint32_t *)p;

	    if (m)
	    {
		uint64_t d = *q;
		__m64 vdest = expand565 (to_m64 (d), 0);
		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
		*q = to_uint64 (vdest);
	    }

	    twidth--;
	    p++;
	    q++;
	}

	while (twidth >= 4)
	{
	    uint32_t m0, m1, m2, m3;

	    m0 = *p;
	    m1 = *(p + 1);
	    m2 = *(p + 2);
	    m3 = *(p + 3);

	    if ((m0 | m1 | m2 | m3))
	    {
		__m64 vdest = *(__m64 *)q;
		__m64 v0, v1, v2, v3;

		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

		v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
		v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
		v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
		v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);

		*(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
	    }
	    twidth -= 4;
	    p += 4;
	    q += 4;
	}

	while (twidth)
	{
	    uint32_t m;

	    m = *(uint32_t *)p;
	    if (m)
	    {
		uint64_t d = *q;
		__m64 vdest = expand565 (to_m64 (d), 0);
		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
		*q = to_uint64 (vdest);
	    }

	    twidth--;
	    p++;
	    q++;
	}

	mask_line += mask_stride;
	dst_line += dst_stride;
    }

    _mm_empty ();
}

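/* IN with a solid source, a8 mask and a8 destination: every destination
 * byte becomes dest * (src.alpha * mask).  The vector loop treats four
 * mask and destination bytes as one 32-bit pixel and multiplies them in
 * one go; MUL_UN8 is the rounded 8-bit multiply used for the unaligned
 * head and tail.
 */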
static void
mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
                        pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint8_t sa;
    __m64 vsrc, vsrca;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    sa = src >> 24;

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    uint16_t tmp;
	    uint8_t a;
	    uint32_t m, d;

	    a = *mask++;
	    d = *dst;

	    m = MUL_UN8 (sa, a, tmp);
	    d = MUL_UN8 (m, d, tmp);

	    *dst++ = d;
	    w--;
	}

	while (w >= 4)
	{
	    __m64 vmask;
	    __m64 vdest;

	    vmask = load8888u ((uint32_t *)mask);
	    vdest = load8888 ((uint32_t *)dst);

	    store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));

	    dst += 4;
	    mask += 4;
	    w -= 4;
	}

	while (w--)
	{
	    uint16_t tmp;
	    uint8_t a;
	    uint32_t m, d;

	    a = *mask++;
	    d = *dst;

	    m = MUL_UN8 (sa, a, tmp);
	    d = MUL_UN8 (m, d, tmp);

	    *dst++ = d;
	}
    }

    _mm_empty ();
}

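/* IN with both operands a8: dest = src * dest per byte, four bytes per
 * iteration in the vector loop.
 */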
static void
mmx_composite_in_8_8 (pixman_implementation_t *imp,
                      pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    uint8_t     *src_line, *src;
    int src_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 3)
	{
	    uint8_t s, d;
	    uint16_t tmp;

	    s = *src;
	    d = *dst;

	    *dst = MUL_UN8 (s, d, tmp);

	    src++;
	    dst++;
	    w--;
	}

	while (w >= 4)
	{
	    uint32_t *s = (uint32_t *)src;
	    uint32_t *d = (uint32_t *)dst;

	    store8888 (d, in (load8888u (s), load8888 (d)));

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	while (w--)
	{
	    uint8_t s, d;
	    uint16_t tmp;

	    s = *src;
	    d = *dst;

	    *dst = MUL_UN8 (s, d, tmp);

	    src++;
	    dst++;
	}
    }

    _mm_empty ();
}

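/* ADD with a solid source and an a8 mask: dest = clamp (dest +
 * src.alpha * mask), using the saturating _mm_adds_pu8 in the vector loop
 * and ADD_UN8 in the scalar head and tail.
 */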
static void
mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
			 pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint8_t sa;
    __m64 vsrc, vsrca;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    sa = src >> 24;

    if (src == 0)
	return;

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	while (w && (uintptr_t)dst & 3)
	{
	    uint16_t tmp;
	    uint16_t a;
	    uint32_t m, d;
	    uint32_t r;

	    a = *mask++;
	    d = *dst;

	    m = MUL_UN8 (sa, a, tmp);
	    r = ADD_UN8 (m, d, tmp);

	    *dst++ = r;
	    w--;
	}

	while (w >= 4)
	{
	    __m64 vmask;
	    __m64 vdest;

	    vmask = load8888u ((uint32_t *)mask);
	    vdest = load8888 ((uint32_t *)dst);

	    store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));

	    dst += 4;
	    mask += 4;
	    w -= 4;
	}

	while (w--)
	{
	    uint16_t tmp;
	    uint16_t a;
	    uint32_t m, d;
	    uint32_t r;

	    a = *mask++;
	    d = *dst;

	    m = MUL_UN8 (sa, a, tmp);
	    r = ADD_UN8 (m, d, tmp);

	    *dst++ = r;
	}
    }

    _mm_empty ();
}

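/* Plain saturating ADD of two a8 images, eight bytes per iteration.  The
 * scalar fallback saturates without a branch: t is the 9-bit sum, t >> 8
 * its carry, and (0 - carry) is all ones when the sum overflowed, so
 * OR-ing it in clamps the result to 255.
 */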
static void
mmx_composite_add_8_8 (pixman_implementation_t *imp,
		       pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint8_t s, d;
    uint16_t t;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    s = *src;
	    d = *dst;
	    t = d + s;
	    s = t | (0 - (t >> 8));
	    *dst = s;

	    dst++;
	    src++;
	    w--;
	}

	while (w >= 8)
	{
	    *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
	    dst += 8;
	    src += 8;
	    w -= 8;
	}

	while (w)
	{
	    s = *src;
	    d = *dst;
	    t = d + s;
	    s = t | (0 - (t >> 8));
	    *dst = s;

	    dst++;
	    src++;
	    w--;
	}
    }

    _mm_empty ();
}

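/* Saturating ADD on r5g6b5: each group of four pixels is widened to 8-bit
 * channels, added with unsigned saturation, and packed back to 565.  The
 * scalar paths do the same via convert_0565_to_8888/UN8x4_ADD_UN8x4.
 */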
static void
mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst;
    uint32_t	d;
    uint16_t    *src_line, *src;
    uint32_t	s;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    s = *src++;
	    if (s)
	    {
		d = *dst;
		s = convert_0565_to_8888 (s);
		if (d)
		{
		    d = convert_0565_to_8888 (d);
		    UN8x4_ADD_UN8x4 (s, d);
		}
		*dst = convert_8888_to_0565 (s);
	    }
	    dst++;
	    w--;
	}

	while (w >= 4)
	{
	    __m64 vdest = *(__m64 *)dst;
	    __m64 vsrc = ldq_u ((__m64 *)src);
	    __m64 vd0, vd1;
	    __m64 vs0, vs1;

	    expand_4xpacked565 (vdest, &vd0, &vd1, 0);
	    expand_4xpacked565 (vsrc, &vs0, &vs1, 0);

	    vd0 = _mm_adds_pu8 (vd0, vs0);
	    vd1 = _mm_adds_pu8 (vd1, vs1);

	    *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);

	    dst += 4;
	    src += 4;
	    w -= 4;
	}

	while (w--)
	{
	    s = *src++;
	    if (s)
	    {
		d = *dst;
		s = convert_0565_to_8888 (s);
		if (d)
		{
		    d = convert_0565_to_8888 (d);
		    UN8x4_ADD_UN8x4 (s, d);
		}
		*dst = convert_8888_to_0565 (s);
	    }
	    dst++;
	}
    }

    _mm_empty ();
}

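/* Saturating ADD of two a8r8g8b8 images, two pixels per 64-bit iteration. */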
static void
mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
	                              load ((const uint32_t *)dst)));
	    dst++;
	    src++;
	    w--;
	}

	while (w >= 2)
	{
	    *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
	    dst += 2;
	    src += 2;
	    w -= 2;
	}

	if (w)
	{
	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
	                              load ((const uint32_t *)dst)));
	}
    }

    _mm_empty ();
}

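/* Raw rectangle copy for 16 and 32 bpp surfaces.  Rows are copied with
 * byte/word/dword stores until the destination is 8-byte aligned, then in
 * 64-byte bursts through the eight MMX registers, with smaller copies
 * mopping up the tail.  Source reads go through ldq_u/ldl_u, so only the
 * destination needs alignment.
 */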
static pixman_bool_t
mmx_blt (pixman_implementation_t *imp,
         uint32_t *               src_bits,
         uint32_t *               dst_bits,
         int                      src_stride,
         int                      dst_stride,
         int                      src_bpp,
         int                      dst_bpp,
         int                      src_x,
         int                      src_y,
         int                      dest_x,
         int                      dest_y,
         int                      width,
         int                      height)
{
    uint8_t *   src_bytes;
    uint8_t *   dst_bytes;
    int byte_width;

    if (src_bpp != dst_bpp)
	return FALSE;

    if (src_bpp == 16)
    {
	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
	byte_width = 2 * width;
	src_stride *= 2;
	dst_stride *= 2;
    }
    else if (src_bpp == 32)
    {
	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
	byte_width = 4 * width;
	src_stride *= 4;
	dst_stride *= 4;
    }
    else
    {
	return FALSE;
    }

    while (height--)
    {
	int w;
	uint8_t *s = src_bytes;
	uint8_t *d = dst_bytes;
	src_bytes += src_stride;
	dst_bytes += dst_stride;
	w = byte_width;

	if (w >= 1 && ((uintptr_t)d & 1))
	{
	    *(uint8_t *)d = *(uint8_t *)s;
	    w -= 1;
	    s += 1;
	    d += 1;
	}

	if (w >= 2 && ((uintptr_t)d & 3))
	{
	    *(uint16_t *)d = *(uint16_t *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}

	while (w >= 4 && ((uintptr_t)d & 7))
	{
	    *(uint32_t *)d = ldl_u ((uint32_t *)s);

	    w -= 4;
	    s += 4;
	    d += 4;
	}

	while (w >= 64)
	{
#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
	    __asm__ (
	        "movq	  (%1),	  %%mm0\n"
	        "movq	 8(%1),	  %%mm1\n"
	        "movq	16(%1),	  %%mm2\n"
	        "movq	24(%1),	  %%mm3\n"
	        "movq	32(%1),	  %%mm4\n"
	        "movq	40(%1),	  %%mm5\n"
	        "movq	48(%1),	  %%mm6\n"
	        "movq	56(%1),	  %%mm7\n"

	        "movq	%%mm0,	  (%0)\n"
	        "movq	%%mm1,	 8(%0)\n"
	        "movq	%%mm2,	16(%0)\n"
	        "movq	%%mm3,	24(%0)\n"
	        "movq	%%mm4,	32(%0)\n"
	        "movq	%%mm5,	40(%0)\n"
	        "movq	%%mm6,	48(%0)\n"
	        "movq	%%mm7,	56(%0)\n"
		:
		: "r" (d), "r" (s)
		: "memory",
		  "%mm0", "%mm1", "%mm2", "%mm3",
		  "%mm4", "%mm5", "%mm6", "%mm7");
#else
	    __m64 v0 = ldq_u ((__m64 *)(s + 0));
	    __m64 v1 = ldq_u ((__m64 *)(s + 8));
	    __m64 v2 = ldq_u ((__m64 *)(s + 16));
	    __m64 v3 = ldq_u ((__m64 *)(s + 24));
	    __m64 v4 = ldq_u ((__m64 *)(s + 32));
	    __m64 v5 = ldq_u ((__m64 *)(s + 40));
	    __m64 v6 = ldq_u ((__m64 *)(s + 48));
	    __m64 v7 = ldq_u ((__m64 *)(s + 56));
	    *(__m64 *)(d + 0)  = v0;
	    *(__m64 *)(d + 8)  = v1;
	    *(__m64 *)(d + 16) = v2;
	    *(__m64 *)(d + 24) = v3;
	    *(__m64 *)(d + 32) = v4;
	    *(__m64 *)(d + 40) = v5;
	    *(__m64 *)(d + 48) = v6;
	    *(__m64 *)(d + 56) = v7;
#endif

	    w -= 64;
	    s += 64;
	    d += 64;
	}
	while (w >= 4)
	{
	    *(uint32_t *)d = ldl_u ((uint32_t *)s);

	    w -= 4;
	    s += 4;
	    d += 4;
	}
	if (w >= 2)
	{
	    *(uint16_t *)d = *(uint16_t *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}
    }

    _mm_empty ();

    return TRUE;
}

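/* SRC copy entry point: just forwards the composite rectangle to mmx_blt. */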
static void
mmx_composite_copy_area (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);

    mmx_blt (imp, src_image->bits.bits,
	     dest_image->bits.bits,
	     src_image->bits.rowstride,
	     dest_image->bits.rowstride,
	     PIXMAN_FORMAT_BPP (src_image->bits.format),
	     PIXMAN_FORMAT_BPP (dest_image->bits.format),
	     src_x, src_y, dest_x, dest_y, width, height);
}

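/* OVER from an x8r8g8b8 source through an a8 mask: the undefined alpha
 * byte is forced to 0xff, fully opaque mask bytes store the source
 * directly, and partial coverage goes through in_over one pixel at a time.
 */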
static void
mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t  *src, *src_line;
    uint32_t  *dst, *dst_line;
    uint8_t  *mask, *mask_line;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	src = src_line;
	src_line += src_stride;
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;

	w = width;

	while (w--)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		uint32_t ssrc = *src | 0xff000000;
		__m64 s = load8888 (&ssrc);

		if (m == 0xff)
		{
		    store8888 (dst, s);
		}
		else
		{
		    __m64 sa = expand_alpha (s);
		    __m64 vm = expand_alpha_rev (to_m64 (m));
		    __m64 vdest = in_over (s, sa, vm, load8888 (dst));

		    store8888 (dst, vdest);
		}
	    }

	    mask++;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}

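/* OVER_REVERSE with a solid source: the destination stays on top, so each
 * pixel becomes dest + (1 - dest.alpha) * src.
 */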
static void
mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t    *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    __m64 vdest = load8888 (dst);

	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));

	    w--;
	    dst++;
	}

	while (w >= 2)
	{
	    __m64 vdest = *(__m64 *)dst;
	    __m64 dest0 = expand8888 (vdest, 0);
	    __m64 dest1 = expand8888 (vdest, 1);

	    dest0 = over (dest0, expand_alpha (dest0), vsrc);
	    dest1 = over (dest1, expand_alpha (dest1), vsrc);

	    *(__m64 *)dst = pack8888 (dest0, dest1);

	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	if (w)
	{
	    __m64 vdest = load8888 (dst);

	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
	}
    }

    _mm_empty ();
}

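/* Bilinear scaling support.  For one output pixel with vertical weights
 * wt/wb and horizontal fraction wx (the top BILINEAR_INTERPOLATION_BITS
 * bits of vx), the macros compute, per channel and modulo rounding:
 *
 *   pix = ((t0 * wt + b0 * wb) * (BSHIFT - wx) +
 *          (t1 * wt + b1 * wb) * wx) >> (2 * BILINEAR_INTERPOLATION_BITS)
 *
 * where t0/t1 and b0/b1 are the two top and bottom source pixels.  With
 * fewer than 8 interpolation bits the weighted horizontal sum fits the
 * signed 16-bit range of a single pmaddwd; otherwise it needs the 32-bit
 * mullo/mulhi combination in the else branch below.
 */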
#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
#define BMSK (BSHIFT - 1)

#define BILINEAR_DECLARE_VARIABLES						\
    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);				\
    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);				\
    const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT);	\
    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);				\
    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);			\
    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);		\
    const __m64 mm_zero = _mm_setzero_si64 ();					\
    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)

#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
do {										\
    /* fetch 2x2 pixel block into 2 mmx registers */				\
    __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);		\
    __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);		\
    /* vertical interpolation */						\
    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);		\
    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);		\
    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);		\
    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);		\
    __m64 hi = _mm_add_pi16 (t_hi, b_hi);					\
    __m64 lo = _mm_add_pi16 (t_lo, b_lo);					\
    vx += unit_x;								\
    if (BILINEAR_INTERPOLATION_BITS < 8)					\
    {										\
	/* calculate horizontal weights */					\
	__m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,		\
			  _mm_srli_pi16 (mm_x,					\
					 16 - BILINEAR_INTERPOLATION_BITS)));	\
	/* horizontal interpolation */						\
	__m64 p = _mm_unpacklo_pi16 (lo, hi);					\
	__m64 q = _mm_unpackhi_pi16 (lo, hi);					\
	lo = _mm_madd_pi16 (p, mm_wh);						\
	hi = _mm_madd_pi16 (q, mm_wh);						\
    }										\
    else									\
    {										\
	/* calculate horizontal weights */					\
	__m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x,		\
					16 - BILINEAR_INTERPOLATION_BITS));	\
	__m64 mm_wh_hi = _mm_srli_pi16 (mm_x,					\
					16 - BILINEAR_INTERPOLATION_BITS);	\
	/* horizontal interpolation */						\
	__m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo);				\
	__m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi);				\
	__m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo);				\
	__m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi);				\
	lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo),		\
			   _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi));		\
	hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo),		\
			   _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi));		\
    }										\
    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
    /* shift and pack the result */						\
    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);			\
    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);			\
    lo = _mm_packs_pi32 (lo, hi);						\
    lo = _mm_packs_pu16 (lo, lo);						\
    pix = lo;									\
} while (0)

#define BILINEAR_SKIP_ONE_PIXEL()						\
do {										\
    vx += unit_x;								\
    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
} while(0)

static force_inline void
scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
					    const uint32_t * mask,
					    const uint32_t * src_top,
					    const uint32_t * src_bottom,
					    int32_t          w,
					    int              wt,
					    int              wb,
					    pixman_fixed_t   vx,
					    pixman_fixed_t   unit_x,
					    pixman_fixed_t   max_vx,
					    pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix;

    while (w--)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
	store (dst, pix);
	dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)

static force_inline void
scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
					     const uint32_t * mask,
					     const uint32_t * src_top,
					     const uint32_t * src_bottom,
					     int32_t          w,
					     int              wt,
					     int              wb,
					     pixman_fixed_t   vx,
					     pixman_fixed_t   unit_x,
					     pixman_fixed_t   max_vx,
					     pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix1, pix2;

    while (w)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

	if (!is_zero (pix1))
	{
	    pix2 = load (dst);
	    store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
	}

	w--;
	dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)

static force_inline void
scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
					       const uint8_t  * mask,
					       const uint32_t * src_top,
					       const uint32_t * src_bottom,
					       int32_t          w,
					       int              wt,
					       int              wb,
					       pixman_fixed_t   vx,
					       pixman_fixed_t   unit_x,
					       pixman_fixed_t   max_vx,
					       pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix1, pix2;
    uint32_t m;

    while (w)
    {
	m = (uint32_t) *mask++;

	if (m)
	{
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

	    if (m == 0xff && is_opaque (pix1))
	    {
		store (dst, pix1);
	    }
	    else
	    {
		__m64 ms, md, ma, msa;

		pix2 = load (dst);
		ma = expand_alpha_rev (to_m64 (m));
		ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
		md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());

		msa = expand_alpha (ms);

		store8888 (dst, (in_over (ms, msa, ma, md)));
	    }
	}
	else
	{
	    BILINEAR_SKIP_ONE_PIXEL ();
	}

	w--;
	dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       COVER, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       PAD, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NONE, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)

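/* Narrow-scanline source iterators: each fetcher converts one scanline of
 * the underlying image to a8r8g8b8 in iter->buffer and advances iter->bits
 * by one stride, so the general pipeline only ever sees 32-bit pixels.
 */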
static uint32_t *
3771
mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
3772
{
3773
    int w = iter->width;
3774
    uint32_t *dst = iter->buffer;
3775
    uint32_t *src = (uint32_t *)iter->bits;
3776
 
3777
    iter->bits += iter->stride;
3778
 
3779
    while (w && ((uintptr_t)dst) & 7)
3780
    {
3781
	*dst++ = (*src++) | 0xff000000;
3782
	w--;
3783
    }
3784
 
3785
    while (w >= 8)
3786
    {
3787
	__m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
3788
	__m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
3789
	__m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
3790
	__m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
3791
 
3792
	*(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
3793
	*(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
3794
	*(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
3795
	*(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
3796
 
3797
	dst += 8;
3798
	src += 8;
3799
	w -= 8;
3800
    }
3801
 
3802
    while (w)
3803
    {
3804
	*dst++ = (*src++) | 0xff000000;
3805
	w--;
3806
    }
3807
 
3808
    _mm_empty ();
3809
    return iter->buffer;
3810
}
3811
 
3812
static uint32_t *
3813
mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
3814
{
3815
    int w = iter->width;
3816
    uint32_t *dst = iter->buffer;
3817
    uint16_t *src = (uint16_t *)iter->bits;
3818
 
3819
    iter->bits += iter->stride;
3820
 
3821
    while (w && ((uintptr_t)dst) & 0x0f)
3822
    {
3823
	uint16_t s = *src++;
3824
 
3825
	*dst++ = convert_0565_to_8888 (s);
3826
	w--;
3827
    }
3828
 
3829
    while (w >= 4)
3830
    {
3831
	__m64 vsrc = ldq_u ((__m64 *)src);
3832
	__m64 mm0, mm1;
3833
 
3834
	expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
3835
 
3836
	*(__m64 *)(dst + 0) = mm0;
3837
	*(__m64 *)(dst + 2) = mm1;
3838
 
3839
	dst += 4;
3840
	src += 4;
3841
	w -= 4;
3842
    }
3843
 
3844
    while (w)
3845
    {
3846
	uint16_t s = *src++;
3847
 
3848
	*dst++ = convert_0565_to_8888 (s);
3849
	w--;
3850
    }
3851
 
3852
    _mm_empty ();
3853
    return iter->buffer;
3854
}
3855
 
3856
static uint32_t *
3857
mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
3858
{
3859
    int w = iter->width;
3860
    uint32_t *dst = iter->buffer;
3861
    uint8_t *src = iter->bits;
3862
 
3863
    iter->bits += iter->stride;
3864
 
3865
    while (w && (((uintptr_t)dst) & 15))
3866
    {
3867
        *dst++ = *(src++) << 24;
3868
        w--;
3869
    }
3870
 
3871
    while (w >= 8)
3872
    {
3873
	__m64 mm0 = ldq_u ((__m64 *)src);
3874
 
3875
	__m64 mm1 = _mm_unpacklo_pi8  (_mm_setzero_si64(), mm0);
	__m64 mm2 = _mm_unpackhi_pi8  (_mm_setzero_si64(), mm0);
	__m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
	__m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
	__m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
	__m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);

	*(__m64 *)(dst + 0) = mm3;
	*(__m64 *)(dst + 2) = mm4;
	*(__m64 *)(dst + 4) = mm5;
	*(__m64 *)(dst + 6) = mm6;

	dst += 8;
	src += 8;
	w -= 8;
    }

    while (w)
    {
	*dst++ = *(src++) << 24;
	w--;
    }

    _mm_empty ();
    return iter->buffer;
}

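/* Table mapping source formats to their scanline fetchers; terminated
 * by PIXMAN_null.
 */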
typedef struct
{
    pixman_format_code_t	format;
    pixman_iter_get_scanline_t	get_scanline;
} fetcher_info_t;

static const fetcher_info_t fetchers[] =
{
    { PIXMAN_x8r8g8b8,		mmx_fetch_x8r8g8b8 },
    { PIXMAN_r5g6b5,		mmx_fetch_r5g6b5 },
    { PIXMAN_a8,		mmx_fetch_a8 },
    { PIXMAN_null }
};

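/* Install an MMX fetcher for narrow (8888) iterators over
 * untransformed bits images whose samples cover the clip; returning
 * FALSE lets pixman fall back to the delegate implementation.
 */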
static pixman_bool_t
mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
{
    pixman_image_t *image = iter->image;

#define FLAGS								\
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

    if ((iter->iter_flags & ITER_NARROW)			&&
	(iter->image_flags & FLAGS) == FLAGS)
    {
	const fetcher_info_t *f;

	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
	{
	    if (image->common.extended_format_code == f->format)
	    {
		uint8_t *b = (uint8_t *)image->bits.bits;
		int s = image->bits.rowstride * 4;

		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
		iter->stride = s;

		iter->get_scanline = f->get_scanline;
		return TRUE;
	    }
	}
    }

    return FALSE;
}

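/* Composite fast paths: (op, src format, mask format, dest format)
 * tuples dispatched to the MMX compositing routines above; terminated
 * by PIXMAN_OP_NONE.
 */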
static const pixman_fast_path_t mmx_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),

    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),

    PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8             ),
    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),

    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),

    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          a8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          a8b8g8r8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),

    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         x8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         x8b8g8r8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         a8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         a8b8g8r8, mmx_8888_8888                     ),

    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888                   ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888                   ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888                   ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888                   ),

    { PIXMAN_OP_NONE },
};

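/* Create the MMX implementation.  Anything it cannot handle is
 * delegated to FALLBACK.  A minimal usage sketch, assuming a
 * CPUID-style feature check named have_mmx (illustrative only; the
 * real dispatch lives in pixman's CPU-detection code):
 *
 *     pixman_implementation_t *imp;
 *
 *     imp = _pixman_implementation_create_general ();
 *     imp = _pixman_implementation_create_fast_path (imp);
 *     if (have_mmx)
 *         imp = _pixman_implementation_create_mmx (imp);
 */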
pixman_implementation_t *
_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);

    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;

    imp->blt = mmx_blt;
    imp->fill = mmx_fill;

    imp->src_iter_init = mmx_src_iter_init;

    return imp;
}

#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */