/*
 * Copyright © 2004, 2005 Red Hat, Inc.
 * Copyright © 2004 Nicholas Miell
 * Copyright © 2005 Trolltech AS
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Søren Sandmann (sandmann@redhat.com)
 * Minor Improvements: Nicholas Miell (nmiell@gmail.com)
 * MMX code paths for fbcompose.c by Lars Knoll (lars@trolltech.com)
 *
 * Based on work by Owen Taylor
 */

#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#if defined USE_X86_MMX || defined USE_ARM_IWMMXT || defined USE_LOONGSON_MMI

#ifdef USE_LOONGSON_MMI
#include <loongson-mmintrin.h>
#else
#include <mmintrin.h>
#endif
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"

#ifdef VERBOSE
#define CHECKPOINT() error_f ("at %s %d\n", __FUNCTION__, __LINE__)
#else
#define CHECKPOINT()
#endif

#if defined USE_ARM_IWMMXT && __GNUC__ == 4 && __GNUC_MINOR__ < 8
/* Empty the multimedia state. For some reason, ARM's mmintrin.h doesn't provide this.  */
extern __inline void __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_empty (void)
{

}
#endif

#ifdef USE_X86_MMX
# if (defined(__SUNPRO_C) || defined(_MSC_VER) || defined(_WIN64))
#  include <xmmintrin.h>
# else
/* We have to compile with -msse to use xmmintrin.h, but that causes SSE
 * instructions to be generated that we don't want. Just duplicate the
 * functions we want to use.  */
extern __inline int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_movemask_pi8 (__m64 __A)
{
    int ret;

    asm ("pmovmskb %1, %0\n\t"
	: "=r" (ret)
	: "y" (__A)
    );

    return ret;
}

extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_mulhi_pu16 (__m64 __A, __m64 __B)
{
    asm ("pmulhuw %1, %0\n\t"
	: "+y" (__A)
	: "y" (__B)
    );
    return __A;
}

#  ifdef __OPTIMIZE__
extern __inline __m64 __attribute__((__gnu_inline__, __always_inline__, __artificial__))
_mm_shuffle_pi16 (__m64 __A, int8_t const __N)
{
    __m64 ret;

    asm ("pshufw %2, %1, %0\n\t"
	: "=y" (ret)
	: "y" (__A), "K" (__N)
    );

    return ret;
}
#  else
#   define _mm_shuffle_pi16(A, N)					\
    ({									\
	__m64 ret;							\
									\
	asm ("pshufw %2, %1, %0\n\t"					\
	     : "=y" (ret)						\
	     : "y" (A), "K" ((const int8_t)N)				\
	);								\
									\
	ret;								\
    })
#  endif
# endif
#endif

#ifndef _MSC_VER
#define _MM_SHUFFLE(fp3,fp2,fp1,fp0) \
 (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | (fp0))
#endif
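
/* _MM_SHUFFLE packs four 2-bit word selectors into a single immediate,
 * highest word first, so _mm_shuffle_pi16 (x, _MM_SHUFFLE (3, 3, 3, 3))
 * broadcasts word 3 of x into all four words; expand_alpha() below relies
 * on exactly this to replicate the alpha channel.  The "K" constraint in
 * the asm versions above requires a compile-time constant, which is why
 * the non-optimizing build uses the statement-expression macro instead.
 */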

/* Notes about writing mmx code
 *
 * give memory operands as the second operand. If you give it as the
 * first, gcc will first load it into a register, then use that
 * register
 *
 *   ie. use
 *
 *         _mm_mullo_pi16 (x, mmx_constant);
 *
 *   not
 *
 *         _mm_mullo_pi16 (mmx_constant, x);
 *
 * Also try to minimize dependencies. i.e. when you need a value, try
 * to calculate it from a value that was calculated as early as
 * possible.
 */

/* --------------- MMX primitives ------------------------------------- */

/* If __m64 is defined as a struct or union, then define M64_MEMBER to be
 * the name of the member used to access the data.
 * If __m64 requires using mm_cvt* intrinsics functions to convert between
 * uint64_t and __m64 values, then define USE_CVT_INTRINSICS.
 * If __m64 and uint64_t values can just be cast to each other directly,
 * then define USE_M64_CASTS.
 * If __m64 is a double datatype, then define USE_M64_DOUBLE.
 */
#ifdef _MSC_VER
# define M64_MEMBER m64_u64
#elif defined(__ICC)
# define USE_CVT_INTRINSICS
#elif defined(USE_LOONGSON_MMI)
# define USE_M64_DOUBLE
#elif defined(__GNUC__)
# define USE_M64_CASTS
#elif defined(__SUNPRO_C)
# if (__SUNPRO_C >= 0x5120) && !defined(__NOVECTORSIZE__)
/* Solaris Studio 12.3 (Sun C 5.12) introduces __attribute__(__vector_size__)
 * support, and defaults to using it to define __m64, unless __NOVECTORSIZE__
 * is defined.   If it is used, then the mm_cvt* intrinsics must be used.
 */
#  define USE_CVT_INTRINSICS
# else
/* For Studio 12.2 or older, or when __attribute__(__vector_size__) is
 * disabled, __m64 is defined as a struct containing "unsigned long long l_".
 */
#  define M64_MEMBER l_
# endif
#endif

#if defined(USE_M64_CASTS) || defined(USE_CVT_INTRINSICS) || defined(USE_M64_DOUBLE)
typedef uint64_t mmxdatafield;
#else
typedef __m64 mmxdatafield;
#endif

typedef struct
{
    mmxdatafield mmx_4x00ff;
    mmxdatafield mmx_4x0080;
    mmxdatafield mmx_565_rgb;
    mmxdatafield mmx_565_unpack_multiplier;
    mmxdatafield mmx_565_pack_multiplier;
    mmxdatafield mmx_565_r;
    mmxdatafield mmx_565_g;
    mmxdatafield mmx_565_b;
    mmxdatafield mmx_packed_565_rb;
    mmxdatafield mmx_packed_565_g;
    mmxdatafield mmx_expand_565_g;
    mmxdatafield mmx_expand_565_b;
    mmxdatafield mmx_expand_565_r;
#ifndef USE_LOONGSON_MMI
    mmxdatafield mmx_mask_0;
    mmxdatafield mmx_mask_1;
    mmxdatafield mmx_mask_2;
    mmxdatafield mmx_mask_3;
#endif
    mmxdatafield mmx_full_alpha;
    mmxdatafield mmx_4x0101;
    mmxdatafield mmx_ff000000;
} mmx_data_t;

#if defined(_MSC_VER)
# define MMXDATA_INIT(field, val) { val ## UI64 }
#elif defined(M64_MEMBER)       /* __m64 is a struct, not an integral type */
# define MMXDATA_INIT(field, val) field =   { val ## ULL }
#else                           /* mmxdatafield is an integral type */
# define MMXDATA_INIT(field, val) field =   val ## ULL
#endif

static const mmx_data_t c =
{
    MMXDATA_INIT (.mmx_4x00ff,                   0x00ff00ff00ff00ff),
    MMXDATA_INIT (.mmx_4x0080,                   0x0080008000800080),
    MMXDATA_INIT (.mmx_565_rgb,                  0x000001f0003f001f),
    MMXDATA_INIT (.mmx_565_unpack_multiplier,    0x0000008404100840),
    MMXDATA_INIT (.mmx_565_pack_multiplier,      0x2000000420000004),
    MMXDATA_INIT (.mmx_565_r,                    0x000000f800000000),
    MMXDATA_INIT (.mmx_565_g,                    0x0000000000fc0000),
    MMXDATA_INIT (.mmx_565_b,                    0x00000000000000f8),
    MMXDATA_INIT (.mmx_packed_565_rb,            0x00f800f800f800f8),
    MMXDATA_INIT (.mmx_packed_565_g,             0x0000fc000000fc00),
    MMXDATA_INIT (.mmx_expand_565_g,             0x07e007e007e007e0),
    MMXDATA_INIT (.mmx_expand_565_b,             0x001f001f001f001f),
    MMXDATA_INIT (.mmx_expand_565_r,             0xf800f800f800f800),
#ifndef USE_LOONGSON_MMI
    MMXDATA_INIT (.mmx_mask_0,                   0xffffffffffff0000),
    MMXDATA_INIT (.mmx_mask_1,                   0xffffffff0000ffff),
    MMXDATA_INIT (.mmx_mask_2,                   0xffff0000ffffffff),
    MMXDATA_INIT (.mmx_mask_3,                   0x0000ffffffffffff),
#endif
    MMXDATA_INIT (.mmx_full_alpha,               0x00ff000000000000),
    MMXDATA_INIT (.mmx_4x0101,                   0x0101010101010101),
    MMXDATA_INIT (.mmx_ff000000,                 0xff000000ff000000),
};

#ifdef USE_CVT_INTRINSICS
#    define MC(x) to_m64 (c.mmx_ ## x)
#elif defined(USE_M64_CASTS)
#    define MC(x) ((__m64)c.mmx_ ## x)
#elif defined(USE_M64_DOUBLE)
#    define MC(x) (*(__m64 *)&c.mmx_ ## x)
#else
#    define MC(x) c.mmx_ ## x
#endif

static force_inline __m64
to_m64 (uint64_t x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtsi64_m64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    __m64 res;

    res.M64_MEMBER = x;
    return res;
#elif defined USE_M64_DOUBLE
    return *(__m64 *)&x;
#else /* USE_M64_CASTS */
    return (__m64)x;
#endif
}

static force_inline uint64_t
to_uint64 (__m64 x)
{
#ifdef USE_CVT_INTRINSICS
    return _mm_cvtm64_si64 (x);
#elif defined M64_MEMBER        /* __m64 is a struct, not an integral type */
    uint64_t res = x.M64_MEMBER;
    return res;
#elif defined USE_M64_DOUBLE
    return *(uint64_t *)&x;
#else /* USE_M64_CASTS */
    return (uint64_t)x;
#endif
}

static force_inline __m64
shift (__m64 v,
       int   s)
{
    if (s > 0)
	return _mm_slli_si64 (v, s);
    else if (s < 0)
	return _mm_srli_si64 (v, -s);
    else
	return v;
}

static force_inline __m64
negate (__m64 mask)
{
    return _mm_xor_si64 (mask, MC (4x00ff));
}

static force_inline __m64
pix_multiply (__m64 a, __m64 b)
{
    __m64 res;

    res = _mm_mullo_pi16 (a, b);
    res = _mm_adds_pu16 (res, MC (4x0080));
    res = _mm_mulhi_pu16 (res, MC (4x0101));

    return res;
}
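
/* pix_multiply computes a rounded per-channel a*b/255 without a divide:
 * with t = a*b + 0x80, taking the high 16 bits of t * 0x0101 evaluates
 * (t + (t >> 8)) >> 8, the standard exact div-255 identity.  For example,
 * a = b = 0xff gives t = 0xfe81, and (0xfe81 * 0x0101) >> 16 is 0xff, so
 * full white times full alpha stays full white.
 */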

static force_inline __m64
pix_add (__m64 a, __m64 b)
{
    return _mm_adds_pu8 (a, b);
}

static force_inline __m64
expand_alpha (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 3, 3, 3));
}

static force_inline __m64
expand_alpha_rev (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (0, 0, 0, 0));
}

static force_inline __m64
invert_colors (__m64 pixel)
{
    return _mm_shuffle_pi16 (pixel, _MM_SHUFFLE (3, 0, 1, 2));
}

static force_inline __m64
over (__m64 src,
      __m64 srca,
      __m64 dest)
{
    return _mm_adds_pu8 (src, pix_multiply (dest, negate (srca)));
}

static force_inline __m64
over_rev_non_pre (__m64 src, __m64 dest)
{
    __m64 srca = expand_alpha (src);
    __m64 srcfaaa = _mm_or_si64 (srca, MC (full_alpha));

    return over (pix_multiply (invert_colors (src), srcfaaa), srca, dest);
}

static force_inline __m64
in (__m64 src, __m64 mask)
{
    return pix_multiply (src, mask);
}

#ifndef _MSC_VER
static force_inline __m64
in_over (__m64 src, __m64 srca, __m64 mask, __m64 dest)
{
    return over (in (src, mask), pix_multiply (srca, mask), dest);
}

#else

#define in_over(src, srca, mask, dest)					\
    over (in (src, mask), pix_multiply (srca, mask), dest)

#endif
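
/* The primitives above are the Porter-Duff operators on premultiplied
 * pixels held as four 16-bit channels:
 *
 *    over (s, sa, d)       = s + (255 - sa) * d / 255
 *    in (s, m)             = s * m / 255
 *    in_over (s, sa, m, d) = over (in (s, m), sa * m / 255, d)
 *
 * where the multiplications are pix_multiply() and negate() forms the
 * 255 - x complement of each channel.
 */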

/* Elemental unaligned loads */

static force_inline __m64 ldq_u(__m64 *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed. */
    return *(__m64 *)p;
#elif defined USE_ARM_IWMMXT
    int align = (uintptr_t)p & 7;
    __m64 *aligned_p;
    if (align == 0)
	return *p;
    aligned_p = (__m64 *)((uintptr_t)p & ~7);
    return (__m64) _mm_align_si64 (aligned_p[0], aligned_p[1], align);
#else
    struct __una_u64 { __m64 x __attribute__((packed)); };
    const struct __una_u64 *ptr = (const struct __una_u64 *) p;
    return (__m64) ptr->x;
#endif
}

static force_inline uint32_t ldl_u(const uint32_t *p)
{
#ifdef USE_X86_MMX
    /* x86's alignment restrictions are very relaxed. */
    return *p;
#else
    struct __una_u32 { uint32_t x __attribute__((packed)); };
    const struct __una_u32 *ptr = (const struct __una_u32 *) p;
    return ptr->x;
#endif
}
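
/* On targets without cheap unaligned access, the packed-struct cast above
 * is the usual GCC idiom: __attribute__((packed)) lowers the member's
 * alignment to 1, so the compiler emits a byte-safe load sequence instead
 * of an aligned load that could fault.  The iwMMXt path instead loads the
 * two neighbouring aligned quadwords and lets _mm_align_si64 extract the
 * misaligned span.
 */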

static force_inline __m64
load (const uint32_t *v)
{
#ifdef USE_LOONGSON_MMI
    __m64 ret;
    asm ("lwc1 %0, %1\n\t"
	: "=f" (ret)
	: "m" (*v)
    );
    return ret;
#else
    return _mm_cvtsi32_si64 (*v);
#endif
}

static force_inline __m64
load8888 (const uint32_t *v)
{
#ifdef USE_LOONGSON_MMI
    return _mm_unpacklo_pi8_f (*(__m32 *)v, _mm_setzero_si64 ());
#else
    return _mm_unpacklo_pi8 (load (v), _mm_setzero_si64 ());
#endif
}

static force_inline __m64
load8888u (const uint32_t *v)
{
    uint32_t l = ldl_u (v);
    return load8888 (&l);
}

static force_inline __m64
pack8888 (__m64 lo, __m64 hi)
{
    return _mm_packs_pu16 (lo, hi);
}

static force_inline void
store (uint32_t *dest, __m64 v)
{
#ifdef USE_LOONGSON_MMI
    asm ("swc1 %1, %0\n\t"
	: "=m" (*dest)
	: "f" (v)
	: "memory"
    );
#else
    *dest = _mm_cvtsi64_si32 (v);
#endif
}

static force_inline void
store8888 (uint32_t *dest, __m64 v)
{
    v = pack8888 (v, _mm_setzero_si64 ());
    store (dest, v);
}

static force_inline pixman_bool_t
is_equal (__m64 a, __m64 b)
{
#ifdef USE_LOONGSON_MMI
    /* __m64 is double, we can compare directly. */
    return a == b;
#else
    return _mm_movemask_pi8 (_mm_cmpeq_pi8 (a, b)) == 0xff;
#endif
}

static force_inline pixman_bool_t
is_opaque (__m64 v)
{
#ifdef USE_LOONGSON_MMI
    return is_equal (_mm_and_si64 (v, MC (full_alpha)), MC (full_alpha));
#else
    __m64 ffs = _mm_cmpeq_pi8 (v, v);
    return (_mm_movemask_pi8 (_mm_cmpeq_pi8 (v, ffs)) & 0x40);
#endif
}

static force_inline pixman_bool_t
is_zero (__m64 v)
{
    return is_equal (v, _mm_setzero_si64 ());
}
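
/* In these predicates v is an unpacked pixel (00AA 00RR 00GG 00BB), so
 * its alpha byte is byte 6.  _mm_movemask_pi8 gathers the top bit of
 * each byte of the comparison result, hence the 0x40 test in is_opaque()
 * asks whether the alpha byte compared equal to 0xff.
 */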

/* Expand 16 bits positioned at @pos (0-3) of a mmx register into
 *
 *    00RR00GG00BB
 *
 * --- Expanding 565 in the low word ---
 *
 * m = (m << (32 - 3)) | (m << (16 - 5)) | m;
 * m = m & (01f0003f001f);
 * m = m * (008404100840);
 * m = m >> 8;
 *
 * Note the trick here - the top word is shifted by another nibble to
 * avoid it bumping into the middle word
 */
static force_inline __m64
expand565 (__m64 pixel, int pos)
{
    __m64 p = pixel;
    __m64 t1, t2;

    /* move pixel to low 16 bit and zero the rest */
#ifdef USE_LOONGSON_MMI
    p = loongson_extract_pi16 (p, pos);
#else
    p = shift (shift (p, (3 - pos) * 16), -48);
#endif

    t1 = shift (p, 36 - 11);
    t2 = shift (p, 16 - 5);

    p = _mm_or_si64 (t1, p);
    p = _mm_or_si64 (t2, p);
    p = _mm_and_si64 (p, MC (565_rgb));

    pixel = _mm_mullo_pi16 (p, MC (565_unpack_multiplier));
    return _mm_srli_pi16 (pixel, 8);
}
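
/* The unpack multiplier replicates each channel's high bits into its low
 * bits, the usual way to widen 5- or 6-bit channels to 8 bits.  Taking
 * blue as an example: a 5-bit value b times 0x0840 is (b << 11) + (b << 6),
 * and after the final shift right by 8 that is (b << 3) | (b >> 2), i.e.
 * approximately b * 255 / 31.
 */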

/* Expand 4 16 bit pixels in an mmx register into two mmx registers of
 *
 *    AARRGGBBRRGGBB
 */
static force_inline void
expand_4xpacked565 (__m64 vin, __m64 *vout0, __m64 *vout1, int full_alpha)
{
    __m64 t0, t1, alpha = _mm_setzero_si64 ();
    __m64 r = _mm_and_si64 (vin, MC (expand_565_r));
    __m64 g = _mm_and_si64 (vin, MC (expand_565_g));
    __m64 b = _mm_and_si64 (vin, MC (expand_565_b));
    if (full_alpha)
	alpha = _mm_cmpeq_pi32 (alpha, alpha);

    /* Replicate high bits into empty low bits. */
    r = _mm_or_si64 (_mm_srli_pi16 (r, 8), _mm_srli_pi16 (r, 13));
    g = _mm_or_si64 (_mm_srli_pi16 (g, 3), _mm_srli_pi16 (g, 9));
    b = _mm_or_si64 (_mm_slli_pi16 (b, 3), _mm_srli_pi16 (b, 2));

    r = _mm_packs_pu16 (r, _mm_setzero_si64 ());	/* 00 00 00 00 R3 R2 R1 R0 */
    g = _mm_packs_pu16 (g, _mm_setzero_si64 ());	/* 00 00 00 00 G3 G2 G1 G0 */
    b = _mm_packs_pu16 (b, _mm_setzero_si64 ());	/* 00 00 00 00 B3 B2 B1 B0 */

    t1 = _mm_unpacklo_pi8 (r, alpha);			/* A3 R3 A2 R2 A1 R1 A0 R0 */
    t0 = _mm_unpacklo_pi8 (b, g);			/* G3 B3 G2 B2 G1 B1 G0 B0 */

    *vout0 = _mm_unpacklo_pi16 (t0, t1);		/* A1 R1 G1 B1 A0 R0 G0 B0 */
    *vout1 = _mm_unpackhi_pi16 (t0, t1);		/* A3 R3 G3 B3 A2 R2 G2 B2 */
}

static force_inline __m64
expand8888 (__m64 in, int pos)
{
    if (pos == 0)
	return _mm_unpacklo_pi8 (in, _mm_setzero_si64 ());
    else
	return _mm_unpackhi_pi8 (in, _mm_setzero_si64 ());
}

static force_inline __m64
expandx888 (__m64 in, int pos)
{
    return _mm_or_si64 (expand8888 (in, pos), MC (full_alpha));
}

static force_inline void
expand_4x565 (__m64 vin, __m64 *vout0, __m64 *vout1, __m64 *vout2, __m64 *vout3, int full_alpha)
{
    __m64 v0, v1;
    expand_4xpacked565 (vin, &v0, &v1, full_alpha);
    *vout0 = expand8888 (v0, 0);
    *vout1 = expand8888 (v0, 1);
    *vout2 = expand8888 (v1, 0);
    *vout3 = expand8888 (v1, 1);
}

static force_inline __m64
pack_565 (__m64 pixel, __m64 target, int pos)
{
    __m64 p = pixel;
    __m64 t = target;
    __m64 r, g, b;

    r = _mm_and_si64 (p, MC (565_r));
    g = _mm_and_si64 (p, MC (565_g));
    b = _mm_and_si64 (p, MC (565_b));

#ifdef USE_LOONGSON_MMI
    r = shift (r, -(32 - 8));
    g = shift (g, -(16 - 3));
    b = shift (b, -(0  + 3));

    p = _mm_or_si64 (r, g);
    p = _mm_or_si64 (p, b);
    return loongson_insert_pi16 (t, p, pos);
#else
    r = shift (r, -(32 - 8) + pos * 16);
    g = shift (g, -(16 - 3) + pos * 16);
    b = shift (b, -(0  + 3) + pos * 16);

    if (pos == 0)
	t = _mm_and_si64 (t, MC (mask_0));
    else if (pos == 1)
	t = _mm_and_si64 (t, MC (mask_1));
    else if (pos == 2)
	t = _mm_and_si64 (t, MC (mask_2));
    else if (pos == 3)
	t = _mm_and_si64 (t, MC (mask_3));

    p = _mm_or_si64 (r, t);
    p = _mm_or_si64 (g, p);

    return _mm_or_si64 (b, p);
#endif
}

static force_inline __m64
pack_4xpacked565 (__m64 a, __m64 b)
{
    __m64 rb0 = _mm_and_si64 (a, MC (packed_565_rb));
    __m64 rb1 = _mm_and_si64 (b, MC (packed_565_rb));

    __m64 t0 = _mm_madd_pi16 (rb0, MC (565_pack_multiplier));
    __m64 t1 = _mm_madd_pi16 (rb1, MC (565_pack_multiplier));

    __m64 g0 = _mm_and_si64 (a, MC (packed_565_g));
    __m64 g1 = _mm_and_si64 (b, MC (packed_565_g));

    t0 = _mm_or_si64 (t0, g0);
    t1 = _mm_or_si64 (t1, g1);

    t0 = shift(t0, -5);
#ifdef USE_ARM_IWMMXT
    t1 = shift(t1, -5);
    return _mm_packs_pu32 (t0, t1);
#else
    t1 = shift(t1, -5 + 16);
    return _mm_shuffle_pi16 (_mm_or_si64 (t0, t1), _MM_SHUFFLE (3, 1, 2, 0));
#endif
}
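
/* The pmaddwd trick above packs red and blue in one step: within each
 * 32-bit pixel the blue byte (masked to 0x00f8) is multiplied by 4 and
 * the red byte (one 16-bit lane higher) by 0x2000, and the two products
 * are summed, leaving red in bits 16-20 and blue in bits 5-9.  After
 * OR-ing in green (bits 10-15) and shifting right by 5, each pixel holds
 * its 565 value in the low 16 bits of its lane.
 */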

#ifndef _MSC_VER

static force_inline __m64
pack_4x565 (__m64 v0, __m64 v1, __m64 v2, __m64 v3)
{
    return pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3));
}

static force_inline __m64
pix_add_mul (__m64 x, __m64 a, __m64 y, __m64 b)
{
    x = pix_multiply (x, a);
    y = pix_multiply (y, b);

    return pix_add (x, y);
}

#else

/* MSVC only handles a "pass by register" of up to three SSE intrinsics */

#define pack_4x565(v0, v1, v2, v3) \
    pack_4xpacked565 (pack8888 (v0, v1), pack8888 (v2, v3))

#define pix_add_mul(x, a, y, b)	 \
    ( x = pix_multiply (x, a),	 \
      y = pix_multiply (y, b),	 \
      pix_add (x, y) )

#endif

/* --------------- MMX code patch for fbcompose.c --------------------- */

static force_inline __m64
combine (const uint32_t *src, const uint32_t *mask)
{
    __m64 vsrc = load8888 (src);

    if (mask)
    {
	__m64 m = load8888 (mask);

	m = expand_alpha (m);
	vsrc = pix_multiply (vsrc, m);
    }

    return vsrc;
}
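
/* combine() produces the effective source pixel for the "unified" (_u)
 * combiners: only the alpha channel of the mask matters here, so the
 * mask pixel's alpha is expanded to all four channels and multiplied
 * into the source.  The component-alpha variants further down use the
 * full mask.
 */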

static force_inline __m64
core_combine_over_u_pixel_mmx (__m64 vsrc, __m64 vdst)
{
    vsrc = _mm_unpacklo_pi8 (vsrc, _mm_setzero_si64 ());

    if (is_opaque (vsrc))
    {
	return vsrc;
    }
    else if (!is_zero (vsrc))
    {
	return over (vsrc, expand_alpha (vsrc),
		     _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ()));
    }

    return _mm_unpacklo_pi8 (vdst, _mm_setzero_si64 ());
}

static void
mmx_combine_over_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 vsrc = combine (src, mask);

	if (is_opaque (vsrc))
	{
	    store8888 (dest, vsrc);
	}
	else if (!is_zero (vsrc))
	{
	    __m64 sa = expand_alpha (vsrc);
	    store8888 (dest, over (vsrc, sa, load8888 (dest)));
	}

	++dest;
	++src;
	if (mask)
	    ++mask;
    }
    _mm_empty ();
}
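
/* The two tests above are the classic OVER shortcuts: a fully opaque
 * source replaces the destination outright and a fully transparent one
 * leaves it untouched, so the destination load and the blend are only
 * paid for genuinely translucent pixels.
 */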

static void
mmx_combine_over_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 d, da;
	__m64 s = combine (src, mask);

	d = load8888 (dest);
	da = expand_alpha (d);
	store8888 (dest, over (d, da, s));

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_in_u (pixman_implementation_t *imp,
                  pixman_op_t              op,
                  uint32_t *               dest,
                  const uint32_t *         src,
                  const uint32_t *         mask,
                  int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 a;
	__m64 x = combine (src, mask);

	a = load8888 (dest);
	a = expand_alpha (a);
	x = pix_multiply (x, a);

	store8888 (dest, x);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_in_reverse_u (pixman_implementation_t *imp,
                          pixman_op_t              op,
                          uint32_t *               dest,
                          const uint32_t *         src,
                          const uint32_t *         mask,
                          int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 a = combine (src, mask);
	__m64 x;

	x = load8888 (dest);
	a = expand_alpha (a);
	x = pix_multiply (x, a);
	store8888 (dest, x);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_out_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 a;
	__m64 x = combine (src, mask);

	a = load8888 (dest);
	a = expand_alpha (a);
	a = negate (a);
	x = pix_multiply (x, a);
	store8888 (dest, x);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_out_reverse_u (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 a = combine (src, mask);
	__m64 x;

	x = load8888 (dest);
	a = expand_alpha (a);
	a = negate (a);
	x = pix_multiply (x, a);

	store8888 (dest, x);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_u (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 da, d, sia;
	__m64 s = combine (src, mask);

	d = load8888 (dest);
	sia = expand_alpha (s);
	sia = negate (sia);
	da = expand_alpha (d);
	s = pix_add_mul (s, da, d, sia);
	store8888 (dest, s);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_reverse_u (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end;

    end = dest + width;

    while (dest < end)
    {
	__m64 dia, d, sa;
	__m64 s = combine (src, mask);

	d = load8888 (dest);
	sa = expand_alpha (s);
	dia = expand_alpha (d);
	dia = negate (dia);
	s = pix_add_mul (s, dia, d, sa);
	store8888 (dest, s);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_xor_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 dia, d, sia;
	__m64 s = combine (src, mask);

	d = load8888 (dest);
	sia = expand_alpha (s);
	dia = expand_alpha (d);
	sia = negate (sia);
	dia = negate (dia);
	s = pix_add_mul (s, dia, d, sia);
	store8888 (dest, s);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_add_u (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	__m64 d;
	__m64 s = combine (src, mask);

	d = load8888 (dest);
	s = pix_add (s, d);
	store8888 (dest, s);

	++dest;
	++src;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}

static void
mmx_combine_saturate_u (pixman_implementation_t *imp,
                        pixman_op_t              op,
                        uint32_t *               dest,
                        const uint32_t *         src,
                        const uint32_t *         mask,
                        int                      width)
{
    const uint32_t *end = dest + width;

    while (dest < end)
    {
	uint32_t s, sa, da;
	uint32_t d = *dest;
	__m64 ms = combine (src, mask);
	__m64 md = load8888 (dest);

	store8888(&s, ms);
	da = ~d >> 24;
	sa = s >> 24;

	if (sa > da)
	{
	    uint32_t quot = DIV_UN8 (da, sa) << 24;
	    __m64 msa = load8888 (&quot);
	    msa = expand_alpha (msa);
	    ms = pix_multiply (ms, msa);
	}

	md = pix_add (md, ms);
	store8888 (dest, md);

	++src;
	++dest;
	if (mask)
	    mask++;
    }
    _mm_empty ();
}
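
/* SATURATE adds as much of the source as the destination can still
 * absorb: when the source alpha exceeds the free destination alpha
 * (~d >> 24), the source is pre-scaled by DIV_UN8 (da, sa), i.e.
 * da * 255 / sa, so the saturating add below cannot overshoot in the
 * alpha channel.
 */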

static void
mmx_combine_src_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);

	s = pix_multiply (s, a);
	store8888 (dest, s);

	++src;
	++mask;
	++dest;
    }
    _mm_empty ();
}

static void
mmx_combine_over_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 sa = expand_alpha (s);

	store8888 (dest, in_over (s, sa, a, d));

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_over_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 da = expand_alpha (d);

	store8888 (dest, over (d, da, in (s, a)));

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_in_ca (pixman_implementation_t *imp,
                   pixman_op_t              op,
                   uint32_t *               dest,
                   const uint32_t *         src,
                   const uint32_t *         mask,
                   int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 da = expand_alpha (d);

	s = pix_multiply (s, a);
	s = pix_multiply (s, da);
	store8888 (dest, s);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_in_reverse_ca (pixman_implementation_t *imp,
                           pixman_op_t              op,
                           uint32_t *               dest,
                           const uint32_t *         src,
                           const uint32_t *         mask,
                           int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 sa = expand_alpha (s);

	a = pix_multiply (a, sa);
	d = pix_multiply (d, a);
	store8888 (dest, d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_out_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 da = expand_alpha (d);

	da = negate (da);
	s = pix_multiply (s, a);
	s = pix_multiply (s, da);
	store8888 (dest, s);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_out_reverse_ca (pixman_implementation_t *imp,
                            pixman_op_t              op,
                            uint32_t *               dest,
                            const uint32_t *         src,
                            const uint32_t *         mask,
                            int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 sa = expand_alpha (s);

	a = pix_multiply (a, sa);
	a = negate (a);
	d = pix_multiply (d, a);
	store8888 (dest, d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_ca (pixman_implementation_t *imp,
                     pixman_op_t              op,
                     uint32_t *               dest,
                     const uint32_t *         src,
                     const uint32_t *         mask,
                     int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 da = expand_alpha (d);
	__m64 sa = expand_alpha (s);

	s = pix_multiply (s, a);
	a = pix_multiply (a, sa);
	a = negate (a);
	d = pix_add_mul (d, a, s, da);
	store8888 (dest, d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_atop_reverse_ca (pixman_implementation_t *imp,
                             pixman_op_t              op,
                             uint32_t *               dest,
                             const uint32_t *         src,
                             const uint32_t *         mask,
                             int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 da = expand_alpha (d);
	__m64 sa = expand_alpha (s);

	s = pix_multiply (s, a);
	a = pix_multiply (a, sa);
	da = negate (da);
	d = pix_add_mul (d, a, s, da);
	store8888 (dest, d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_xor_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);
	__m64 da = expand_alpha (d);
	__m64 sa = expand_alpha (s);

	s = pix_multiply (s, a);
	a = pix_multiply (a, sa);
	da = negate (da);
	a = negate (a);
	d = pix_add_mul (d, a, s, da);
	store8888 (dest, d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}

static void
mmx_combine_add_ca (pixman_implementation_t *imp,
                    pixman_op_t              op,
                    uint32_t *               dest,
                    const uint32_t *         src,
                    const uint32_t *         mask,
                    int                      width)
{
    const uint32_t *end = src + width;

    while (src < end)
    {
	__m64 a = load8888 (mask);
	__m64 s = load8888 (src);
	__m64 d = load8888 (dest);

	s = pix_multiply (s, a);
	d = pix_add (s, d);
	store8888 (dest, d);

	++src;
	++dest;
	++mask;
    }
    _mm_empty ();
}
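
/* Unlike the _u combiners, the component-alpha (_ca) variants above use
 * all four mask channels: the source is multiplied by the mask channel
 * by channel, and wherever an operator calls for "source alpha" it uses
 * the per-channel product mask * srca instead of a single scalar alpha.
 */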

/* ------------- MMX code paths called from fbpict.c -------------------- */

static void
mmx_composite_over_n_8888 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t    *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));

	    w--;
	    dst++;
	}

	while (w >= 2)
	{
	    __m64 vdest;
	    __m64 dest0, dest1;

	    vdest = *(__m64 *)dst;

	    dest0 = over (vsrc, vsrca, expand8888 (vdest, 0));
	    dest1 = over (vsrc, vsrca, expand8888 (vdest, 1));

	    *(__m64 *)dst = pack8888 (dest0, dest1);

	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	if (w)
	{
	    store8888 (dst, over (vsrc, vsrca, load8888 (dst)));
	}
    }

    _mm_empty ();
}
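
/* The function above shows the loop shape shared by all the fast paths
 * that follow: a scalar prologue runs until dst reaches 8-byte alignment,
 * the main loop then writes two (or four) pixels per 64-bit store, and a
 * scalar tail finishes the row.  Each path ends with _mm_empty () so the
 * FPU state is usable again after the MMX run.
 */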

static void
mmx_composite_over_n_0565 (pixman_implementation_t *imp,
                           pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t    *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	}

	while (w >= 4)
	{
	    __m64 vdest = *(__m64 *)dst;
	    __m64 v0, v1, v2, v3;

	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

	    v0 = over (vsrc, vsrca, v0);
	    v1 = over (vsrc, vsrca, v1);
	    v2 = over (vsrc, vsrca, v2);
	    v3 = over (vsrc, vsrca, v3);

	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);

	    dst += 4;
	    w -= 4;
	}

	CHECKPOINT ();

	while (w)
	{
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over (vsrc, vsrca, vdest), vdest, 0);
	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	}
    }

    _mm_empty ();
}

static void
mmx_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t    *dst_line;
    uint32_t    *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	int twidth = width;
	uint32_t *p = (uint32_t *)mask_line;
	uint32_t *q = (uint32_t *)dst_line;

	while (twidth && (uintptr_t)q & 7)
	{
	    uint32_t m = *(uint32_t *)p;

	    if (m)
	    {
		__m64 vdest = load8888 (q);
		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
		store8888 (q, vdest);
	    }

	    twidth--;
	    p++;
	    q++;
	}

	while (twidth >= 2)
	{
	    uint32_t m0, m1;
	    m0 = *p;
	    m1 = *(p + 1);

	    if (m0 | m1)
	    {
		__m64 dest0, dest1;
		__m64 vdest = *(__m64 *)q;

		dest0 = in_over (vsrc, vsrca, load8888 (&m0),
		                 expand8888 (vdest, 0));
		dest1 = in_over (vsrc, vsrca, load8888 (&m1),
		                 expand8888 (vdest, 1));

		*(__m64 *)q = pack8888 (dest0, dest1);
	    }

	    p += 2;
	    q += 2;
	    twidth -= 2;
	}

	if (twidth)
	{
	    uint32_t m = *(uint32_t *)p;

	    if (m)
	    {
		__m64 vdest = load8888 (q);
		vdest = in_over (vsrc, vsrca, load8888 (&m), vdest);
		store8888 (q, vdest);
	    }

	    twidth--;
	    p++;
	    q++;
	}

	dst_line += dst_stride;
	mask_line += mask_stride;
    }

    _mm_empty ();
}

static void
mmx_composite_over_8888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);
    vmask = expand_alpha (load8888 (&mask));

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    __m64 s = load8888 (src);
	    __m64 d = load8888 (dst);

	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));

	    w--;
	    dst++;
	    src++;
	}

	while (w >= 2)
	{
	    __m64 vs = ldq_u ((__m64 *)src);
	    __m64 vd = *(__m64 *)dst;
	    __m64 vsrc0 = expand8888 (vs, 0);
	    __m64 vsrc1 = expand8888 (vs, 1);

	    *(__m64 *)dst = pack8888 (
	        in_over (vsrc0, expand_alpha (vsrc0), vmask, expand8888 (vd, 0)),
	        in_over (vsrc1, expand_alpha (vsrc1), vmask, expand8888 (vd, 1)));

	    w -= 2;
	    dst += 2;
	    src += 2;
	}

	if (w)
	{
	    __m64 s = load8888 (src);
	    __m64 d = load8888 (dst);

	    store8888 (dst, in_over (s, expand_alpha (s), vmask, d));
	}
    }

    _mm_empty ();
}

static void
mmx_composite_over_x888_n_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t *dst_line, *dst;
    uint32_t *src_line, *src;
    uint32_t mask;
    __m64 vmask;
    int dst_stride, src_stride;
    int32_t w;
    __m64 srca;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    mask = _pixman_image_get_solid (imp, mask_image, dest_image->bits.format);

    vmask = expand_alpha (load8888 (&mask));
    srca = MC (4x00ff);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    uint32_t ssrc = *src | 0xff000000;
	    __m64 s = load8888 (&ssrc);
	    __m64 d = load8888 (dst);

	    store8888 (dst, in_over (s, srca, vmask, d));

	    w--;
	    dst++;
	    src++;
	}

	while (w >= 16)
	{
	    __m64 vd0 = *(__m64 *)(dst + 0);
	    __m64 vd1 = *(__m64 *)(dst + 2);
	    __m64 vd2 = *(__m64 *)(dst + 4);
	    __m64 vd3 = *(__m64 *)(dst + 6);
	    __m64 vd4 = *(__m64 *)(dst + 8);
	    __m64 vd5 = *(__m64 *)(dst + 10);
	    __m64 vd6 = *(__m64 *)(dst + 12);
	    __m64 vd7 = *(__m64 *)(dst + 14);

	    __m64 vs0 = ldq_u ((__m64 *)(src + 0));
	    __m64 vs1 = ldq_u ((__m64 *)(src + 2));
	    __m64 vs2 = ldq_u ((__m64 *)(src + 4));
	    __m64 vs3 = ldq_u ((__m64 *)(src + 6));
	    __m64 vs4 = ldq_u ((__m64 *)(src + 8));
	    __m64 vs5 = ldq_u ((__m64 *)(src + 10));
	    __m64 vs6 = ldq_u ((__m64 *)(src + 12));
	    __m64 vs7 = ldq_u ((__m64 *)(src + 14));

	    vd0 = pack8888 (
	        in_over (expandx888 (vs0, 0), srca, vmask, expand8888 (vd0, 0)),
	        in_over (expandx888 (vs0, 1), srca, vmask, expand8888 (vd0, 1)));

	    vd1 = pack8888 (
	        in_over (expandx888 (vs1, 0), srca, vmask, expand8888 (vd1, 0)),
	        in_over (expandx888 (vs1, 1), srca, vmask, expand8888 (vd1, 1)));

	    vd2 = pack8888 (
	        in_over (expandx888 (vs2, 0), srca, vmask, expand8888 (vd2, 0)),
	        in_over (expandx888 (vs2, 1), srca, vmask, expand8888 (vd2, 1)));

	    vd3 = pack8888 (
	        in_over (expandx888 (vs3, 0), srca, vmask, expand8888 (vd3, 0)),
	        in_over (expandx888 (vs3, 1), srca, vmask, expand8888 (vd3, 1)));

	    vd4 = pack8888 (
	        in_over (expandx888 (vs4, 0), srca, vmask, expand8888 (vd4, 0)),
	        in_over (expandx888 (vs4, 1), srca, vmask, expand8888 (vd4, 1)));

	    vd5 = pack8888 (
	        in_over (expandx888 (vs5, 0), srca, vmask, expand8888 (vd5, 0)),
	        in_over (expandx888 (vs5, 1), srca, vmask, expand8888 (vd5, 1)));

	    vd6 = pack8888 (
	        in_over (expandx888 (vs6, 0), srca, vmask, expand8888 (vd6, 0)),
	        in_over (expandx888 (vs6, 1), srca, vmask, expand8888 (vd6, 1)));

	    vd7 = pack8888 (
	        in_over (expandx888 (vs7, 0), srca, vmask, expand8888 (vd7, 0)),
	        in_over (expandx888 (vs7, 1), srca, vmask, expand8888 (vd7, 1)));

	    *(__m64 *)(dst + 0) = vd0;
	    *(__m64 *)(dst + 2) = vd1;
	    *(__m64 *)(dst + 4) = vd2;
	    *(__m64 *)(dst + 6) = vd3;
	    *(__m64 *)(dst + 8) = vd4;
	    *(__m64 *)(dst + 10) = vd5;
	    *(__m64 *)(dst + 12) = vd6;
	    *(__m64 *)(dst + 14) = vd7;

	    w -= 16;
	    dst += 16;
	    src += 16;
	}

	while (w)
	{
	    uint32_t ssrc = *src | 0xff000000;
	    __m64 s = load8888 (&ssrc);
	    __m64 d = load8888 (dst);

	    store8888 (dst, in_over (s, srca, vmask, d));

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}
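
/* The 16-pixel inner loop above trades code size for memory parallelism:
 * eight destination quadwords and eight (possibly unaligned) source
 * quadwords are loaded up front, combined, and stored back, keeping the
 * loads well ahead of the dependent in_over() chains.  Because x888
 * sources carry no alpha, srca is the constant 4x00ff vector rather than
 * an expanded per-pixel alpha.
 */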
1803
 
1804
static void
1805
mmx_composite_over_8888_8888 (pixman_implementation_t *imp,
3931 Serge 1806
                              pixman_composite_info_t *info)
1891 serge 1807
{
3931 Serge 1808
    PIXMAN_COMPOSITE_ARGS (info);
1891 serge 1809
    uint32_t *dst_line, *dst;
1810
    uint32_t *src_line, *src;
1811
    uint32_t s;
1812
    int dst_stride, src_stride;
1813
    uint8_t a;
1814
    int32_t w;
1815
 
1816
    CHECKPOINT ();
1817
 
3931 Serge 1818
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
1891 serge 1819
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
1820
 
1821
    while (height--)
1822
    {
1823
	dst = dst_line;
1824
	dst_line += dst_stride;
1825
	src = src_line;
1826
	src_line += src_stride;
1827
	w = width;
1828
 
1829
	while (w--)
1830
	{
1831
	    s = *src++;
1832
	    a = s >> 24;
1833
 
1834
	    if (a == 0xff)
1835
	    {
1836
		*dst = s;
1837
	    }
1838
	    else if (s)
1839
	    {
1840
		__m64 ms, sa;
3931 Serge 1841
		ms = load8888 (&s);
1891 serge 1842
		sa = expand_alpha (ms);
3931 Serge 1843
		store8888 (dst, over (ms, sa, load8888 (dst)));
1891 serge 1844
	    }
1845
 
1846
	    dst++;
1847
	}
1848
    }
1849
    _mm_empty ();
1850
}
1851
 
1852
static void
mmx_composite_over_8888_0565 (pixman_implementation_t *imp,
                              pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    __m64 vsrc = load8888 (src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (
		over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}

	CHECKPOINT ();

	while (w >= 4)
	{
	    __m64 vdest = *(__m64 *)dst;
	    __m64 v0, v1, v2, v3;
	    __m64 vsrc0, vsrc1, vsrc2, vsrc3;

	    expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

	    vsrc0 = load8888 ((src + 0));
	    vsrc1 = load8888 ((src + 1));
	    vsrc2 = load8888 ((src + 2));
	    vsrc3 = load8888 ((src + 3));

	    v0 = over (vsrc0, expand_alpha (vsrc0), v0);
	    v1 = over (vsrc1, expand_alpha (vsrc1), v1);
	    v2 = over (vsrc2, expand_alpha (vsrc2), v2);
	    v3 = over (vsrc3, expand_alpha (vsrc3), v3);

	    *(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	CHECKPOINT ();

	while (w)
	{
	    __m64 vsrc = load8888 (src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over (vsrc, expand_alpha (vsrc), vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}

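/* Solid source OVER an a8r8g8b8 destination through an a8 mask.  When the
 * source is opaque and two adjacent mask bytes are both 0xff, the pair of
 * destination pixels is overwritten with the precomputed 64-bit pattern in
 * srcsrc, skipping the blend entirely.
 */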
static void
mmx_composite_over_n_8_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca;
    uint64_t srcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
	return;

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		__m64 vdest = in_over (vsrc, vsrca,
				       expand_alpha_rev (to_m64 (m)),
				       load8888 (dst));

		store8888 (dst, vdest);
	    }

	    w--;
	    mask++;
	    dst++;
	}

	CHECKPOINT ();

	while (w >= 2)
	{
	    uint64_t m0, m1;

	    m0 = *mask;
	    m1 = *(mask + 1);

	    if (srca == 0xff && (m0 & m1) == 0xff)
	    {
		*(uint64_t *)dst = srcsrc;
	    }
	    else if (m0 | m1)
	    {
		__m64 vdest;
		__m64 dest0, dest1;

		vdest = *(__m64 *)dst;

		dest0 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m0)),
				 expand8888 (vdest, 0));
		dest1 = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m1)),
				 expand8888 (vdest, 1));

		*(__m64 *)dst = pack8888 (dest0, dest1);
	    }

	    mask += 2;
	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	if (w)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		__m64 vdest = load8888 (dst);

		vdest = in_over (
		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), vdest);
		store8888 (dst, vdest);
	    }
	}
    }

    _mm_empty ();
}

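/* Solid fill at 8, 16 or 32 bpp.  The filler value is replicated to 32
 * bits, then to the 64-bit vfill.  Each row is written with byte/word/
 * dword stores until the pointer is 8-byte aligned, then in 64-byte bursts
 * (eight movq stores from vfill and its seven preloaded register copies on
 * GCC/x86), and the remainder is peeled off in the reverse order.
 */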
static pixman_bool_t
mmx_fill (pixman_implementation_t *imp,
          uint32_t *               bits,
          int                      stride,
          int                      bpp,
          int                      x,
          int                      y,
          int                      width,
          int                      height,
          uint32_t		   filler)
{
    uint64_t fill;
    __m64 vfill;
    uint32_t byte_width;
    uint8_t     *byte_line;

#if defined __GNUC__ && defined USE_X86_MMX
    __m64 v1, v2, v3, v4, v5, v6, v7;
#endif

    if (bpp != 16 && bpp != 32 && bpp != 8)
	return FALSE;

    if (bpp == 8)
    {
	stride = stride * (int) sizeof (uint32_t) / 1;
	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
	byte_width = width;
	stride *= 1;
        filler = (filler & 0xff) * 0x01010101;
    }
    else if (bpp == 16)
    {
	stride = stride * (int) sizeof (uint32_t) / 2;
	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
	byte_width = 2 * width;
	stride *= 2;
        filler = (filler & 0xffff) * 0x00010001;
    }
    else
    {
	stride = stride * (int) sizeof (uint32_t) / 4;
	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
	byte_width = 4 * width;
	stride *= 4;
    }

    fill = ((uint64_t)filler << 32) | filler;
    vfill = to_m64 (fill);

#if defined __GNUC__ && defined USE_X86_MMX
    __asm__ (
        "movq		%7,	%0\n"
        "movq		%7,	%1\n"
        "movq		%7,	%2\n"
        "movq		%7,	%3\n"
        "movq		%7,	%4\n"
        "movq		%7,	%5\n"
        "movq		%7,	%6\n"
	: "=&y" (v1), "=&y" (v2), "=&y" (v3),
	  "=&y" (v4), "=&y" (v5), "=&y" (v6), "=y" (v7)
	: "y" (vfill));
#endif

    while (height--)
    {
	int w;
	uint8_t *d = byte_line;

	byte_line += stride;
	w = byte_width;

	if (w >= 1 && ((uintptr_t)d & 1))
	{
	    *(uint8_t *)d = (filler & 0xff);
	    w--;
	    d++;
	}

	if (w >= 2 && ((uintptr_t)d & 3))
	{
	    *(uint16_t *)d = filler;
	    w -= 2;
	    d += 2;
	}

	while (w >= 4 && ((uintptr_t)d & 7))
	{
	    *(uint32_t *)d = filler;

	    w -= 4;
	    d += 4;
	}

	while (w >= 64)
	{
#if defined __GNUC__ && defined USE_X86_MMX
	    __asm__ (
	        "movq	%1,	  (%0)\n"
	        "movq	%2,	 8(%0)\n"
	        "movq	%3,	16(%0)\n"
	        "movq	%4,	24(%0)\n"
	        "movq	%5,	32(%0)\n"
	        "movq	%6,	40(%0)\n"
	        "movq	%7,	48(%0)\n"
	        "movq	%8,	56(%0)\n"
		:
		: "r" (d),
		  "y" (vfill), "y" (v1), "y" (v2), "y" (v3),
		  "y" (v4), "y" (v5), "y" (v6), "y" (v7)
		: "memory");
#else
	    *(__m64*) (d +  0) = vfill;
	    *(__m64*) (d +  8) = vfill;
	    *(__m64*) (d + 16) = vfill;
	    *(__m64*) (d + 24) = vfill;
	    *(__m64*) (d + 32) = vfill;
	    *(__m64*) (d + 40) = vfill;
	    *(__m64*) (d + 48) = vfill;
	    *(__m64*) (d + 56) = vfill;
#endif
	    w -= 64;
	    d += 64;
	}

	while (w >= 4)
	{
	    *(uint32_t *)d = filler;

	    w -= 4;
	    d += 4;
	}
	if (w >= 2)
	{
	    *(uint16_t *)d = filler;
	    w -= 2;
	    d += 2;
	}
	if (w >= 1)
	{
	    *(uint8_t *)d = (filler & 0xff);
	    w--;
	    d++;
	}
    }

    _mm_empty ();
    return TRUE;
}

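/* SRC conversion from x8r8g8b8 to r5g6b5: no blending, just truncating
 * each 8-bit channel to 5-6-5 and repacking four pixels per 64-bit store.
 */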
static void
mmx_composite_src_x888_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst;
    uint32_t    *src_line, *src, s;
    int dst_stride, src_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    s = *src++;
	    *dst = convert_8888_to_0565 (s);
	    dst++;
	    w--;
	}

	while (w >= 4)
	{
	    __m64 vdest;
	    __m64 vsrc0 = ldq_u ((__m64 *)(src + 0));
	    __m64 vsrc1 = ldq_u ((__m64 *)(src + 2));

	    vdest = pack_4xpacked565 (vsrc0, vsrc1);

	    *(__m64 *)dst = vdest;

	    w -= 4;
	    src += 4;
	    dst += 4;
	}

	while (w)
	{
	    s = *src++;
	    *dst = convert_8888_to_0565 (s);
	    dst++;
	    w--;
	}
    }

    _mm_empty ();
}

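/* Solid source combined with an a8 mask using SRC: the destination is
 * replaced by src * mask, so zero mask bytes clear the destination
 * outright, and a fully transparent source degenerates to mmx_fill (0).
 */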
static void
mmx_composite_src_n_8_8888 (pixman_implementation_t *imp,
                            pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint32_t    *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc;
    uint64_t srcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
    {
	mmx_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
		  PIXMAN_FORMAT_BPP (dest_image->bits.format),
		  dest_x, dest_y, width, height, 0);
	return;
    }

    srcsrc = (uint64_t)src << 32 | src;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));

		store8888 (dst, vdest);
	    }
	    else
	    {
		*dst = 0;
	    }

	    w--;
	    mask++;
	    dst++;
	}

	CHECKPOINT ();

	while (w >= 2)
	{
	    uint64_t m0, m1;
	    m0 = *mask;
	    m1 = *(mask + 1);

	    if (srca == 0xff && (m0 & m1) == 0xff)
	    {
		*(uint64_t *)dst = srcsrc;
	    }
	    else if (m0 | m1)
	    {
		__m64 dest0, dest1;

		dest0 = in (vsrc, expand_alpha_rev (to_m64 (m0)));
		dest1 = in (vsrc, expand_alpha_rev (to_m64 (m1)));

		*(__m64 *)dst = pack8888 (dest0, dest1);
	    }
	    else
	    {
		*(uint64_t *)dst = 0;
	    }

	    mask += 2;
	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	if (w)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		__m64 vdest = in (vsrc, expand_alpha_rev (to_m64 (m)));

		store8888 (dst, vdest);
	    }
	    else
	    {
		*dst = 0;
	    }
	}
    }

    _mm_empty ();
}

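/* Solid source OVER an r5g6b5 destination through an a8 mask.  The
 * four-pixel loop expands a 64-bit run of destination pixels, applies
 * in_over per pixel, and repacks.  For the all-opaque shortcut,
 * srcsrcsrcsrc caches the source already converted to four packed 565
 * copies (expand_alpha_rev is reused here to broadcast the low word).
 */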
static void
mmx_composite_over_n_8_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src, srca;
    uint16_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    __m64 vsrc, vsrca, tmp;
    __m64 srcsrcsrcsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    srca = src >> 24;
    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    tmp = pack_565 (vsrc, _mm_setzero_si64 (), 0);
    srcsrcsrcsrc = expand_alpha_rev (tmp);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		uint64_t d = *dst;
		__m64 vd = to_m64 (d);
		__m64 vdest = in_over (
		    vsrc, vsrca, expand_alpha_rev (to_m64 (m)), expand565 (vd, 0));

		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
		*dst = to_uint64 (vd);
	    }

	    w--;
	    mask++;
	    dst++;
	}

	CHECKPOINT ();

	while (w >= 4)
	{
	    uint64_t m0, m1, m2, m3;
	    m0 = *mask;
	    m1 = *(mask + 1);
	    m2 = *(mask + 2);
	    m3 = *(mask + 3);

	    if (srca == 0xff && (m0 & m1 & m2 & m3) == 0xff)
	    {
		*(__m64 *)dst = srcsrcsrcsrc;
	    }
	    else if (m0 | m1 | m2 | m3)
	    {
		__m64 vdest = *(__m64 *)dst;
		__m64 v0, v1, v2, v3;
		__m64 vm0, vm1, vm2, vm3;

		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

		vm0 = to_m64 (m0);
		v0 = in_over (vsrc, vsrca, expand_alpha_rev (vm0), v0);

		vm1 = to_m64 (m1);
		v1 = in_over (vsrc, vsrca, expand_alpha_rev (vm1), v1);

		vm2 = to_m64 (m2);
		v2 = in_over (vsrc, vsrca, expand_alpha_rev (vm2), v2);

		vm3 = to_m64 (m3);
		v3 = in_over (vsrc, vsrca, expand_alpha_rev (vm3), v3);

		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
	    }

	    w -= 4;
	    mask += 4;
	    dst += 4;
	}

	CHECKPOINT ();

	while (w)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		uint64_t d = *dst;
		__m64 vd = to_m64 (d);
		__m64 vdest = in_over (vsrc, vsrca, expand_alpha_rev (to_m64 (m)),
				       expand565 (vd, 0));
		vd = pack_565 (vdest, _mm_setzero_si64 (), 0);
		*dst = to_uint64 (vd);
	    }

	    w--;
	    mask++;
	    dst++;
	}
    }

    _mm_empty ();
}

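/* The pixbuf paths handle OVER from a non-premultiplied source whose R and
 * B channels are swapped relative to the destination.  over_rev_non_pre in
 * effect undoes both at once: the channels are swapped back (invert_colors)
 * and the pixel is premultiplied by its alpha before the usual OVER blend;
 * fully opaque pixels shortcut to a plain channel swap.
 */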
static void
mmx_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    __m64 vsrc = load8888 (src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}

	CHECKPOINT ();

	while (w >= 4)
	{
	    uint32_t s0, s1, s2, s3;
	    unsigned char a0, a1, a2, a3;

	    s0 = *src;
	    s1 = *(src + 1);
	    s2 = *(src + 2);
	    s3 = *(src + 3);

	    a0 = (s0 >> 24);
	    a1 = (s1 >> 24);
	    a2 = (s2 >> 24);
	    a3 = (s3 >> 24);

	    if ((a0 & a1 & a2 & a3) == 0xFF)
	    {
		__m64 v0 = invert_colors (load8888 (&s0));
		__m64 v1 = invert_colors (load8888 (&s1));
		__m64 v2 = invert_colors (load8888 (&s2));
		__m64 v3 = invert_colors (load8888 (&s3));

		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
	    }
	    else if (s0 | s1 | s2 | s3)
	    {
		__m64 vdest = *(__m64 *)dst;
		__m64 v0, v1, v2, v3;

		__m64 vsrc0 = load8888 (&s0);
		__m64 vsrc1 = load8888 (&s1);
		__m64 vsrc2 = load8888 (&s2);
		__m64 vsrc3 = load8888 (&s3);

		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

		v0 = over_rev_non_pre (vsrc0, v0);
		v1 = over_rev_non_pre (vsrc1, v1);
		v2 = over_rev_non_pre (vsrc2, v2);
		v3 = over_rev_non_pre (vsrc3, v3);

		*(__m64 *)dst = pack_4x565 (v0, v1, v2, v3);
	    }

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	CHECKPOINT ();

	while (w)
	{
	    __m64 vsrc = load8888 (src);
	    uint64_t d = *dst;
	    __m64 vdest = expand565 (to_m64 (d), 0);

	    vdest = pack_565 (over_rev_non_pre (vsrc, vdest), vdest, 0);

	    *dst = to_uint64 (vdest);

	    w--;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}

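/* Same non-premultiplied source, but with an a8r8g8b8 destination: opaque
 * pixel pairs only need the channel swap, everything else goes through
 * over_rev_non_pre two pixels at a time.
 */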
static void
mmx_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

#if 0
    /* FIXME */
    assert (src_image->drawable == mask_image->drawable);
#endif

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    __m64 s = load8888 (src);
	    __m64 d = load8888 (dst);

	    store8888 (dst, over_rev_non_pre (s, d));

	    w--;
	    dst++;
	    src++;
	}

	while (w >= 2)
	{
	    uint32_t s0, s1;
	    unsigned char a0, a1;
	    __m64 d0, d1;

	    s0 = *src;
	    s1 = *(src + 1);

	    a0 = (s0 >> 24);
	    a1 = (s1 >> 24);

	    if ((a0 & a1) == 0xFF)
	    {
		d0 = invert_colors (load8888 (&s0));
		d1 = invert_colors (load8888 (&s1));

		*(__m64 *)dst = pack8888 (d0, d1);
	    }
	    else if (s0 | s1)
	    {
		__m64 vdest = *(__m64 *)dst;

		d0 = over_rev_non_pre (load8888 (&s0), expand8888 (vdest, 0));
		d1 = over_rev_non_pre (load8888 (&s1), expand8888 (vdest, 1));

		*(__m64 *)dst = pack8888 (d0, d1);
	    }

	    w -= 2;
	    dst += 2;
	    src += 2;
	}

	if (w)
	{
	    __m64 s = load8888 (src);
	    __m64 d = load8888 (dst);

	    store8888 (dst, over_rev_non_pre (s, d));
	}
    }

    _mm_empty ();
}

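/* Component-alpha variant: the mask is a full a8r8g8b8 image and each of
 * its channels scales the matching source channel, so in_over takes the
 * whole mask pixel instead of a broadcast alpha.  Destination is r5g6b5.
 */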
static void
mmx_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint16_t    *dst_line;
    uint32_t    *mask_line;
    int dst_stride, mask_stride;
    __m64 vsrc, vsrca;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	int twidth = width;
	uint32_t *p = (uint32_t *)mask_line;
	uint16_t *q = (uint16_t *)dst_line;

	while (twidth && ((uintptr_t)q & 7))
	{
	    uint32_t m = *(uint32_t *)p;

	    if (m)
	    {
		uint64_t d = *q;
		__m64 vdest = expand565 (to_m64 (d), 0);
		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
		*q = to_uint64 (vdest);
	    }

	    twidth--;
	    p++;
	    q++;
	}

	while (twidth >= 4)
	{
	    uint32_t m0, m1, m2, m3;

	    m0 = *p;
	    m1 = *(p + 1);
	    m2 = *(p + 2);
	    m3 = *(p + 3);

	    if ((m0 | m1 | m2 | m3))
	    {
		__m64 vdest = *(__m64 *)q;
		__m64 v0, v1, v2, v3;

		expand_4x565 (vdest, &v0, &v1, &v2, &v3, 0);

		v0 = in_over (vsrc, vsrca, load8888 (&m0), v0);
		v1 = in_over (vsrc, vsrca, load8888 (&m1), v1);
		v2 = in_over (vsrc, vsrca, load8888 (&m2), v2);
		v3 = in_over (vsrc, vsrca, load8888 (&m3), v3);

		*(__m64 *)q = pack_4x565 (v0, v1, v2, v3);
	    }
	    twidth -= 4;
	    p += 4;
	    q += 4;
	}

	while (twidth)
	{
	    uint32_t m;

	    m = *(uint32_t *)p;
	    if (m)
	    {
		uint64_t d = *q;
		__m64 vdest = expand565 (to_m64 (d), 0);
		vdest = pack_565 (in_over (vsrc, vsrca, load8888 (&m), vdest), vdest, 0);
		*q = to_uint64 (vdest);
	    }

	    twidth--;
	    p++;
	    q++;
	}

	mask_line += mask_stride;
	dst_line += dst_stride;
    }

    _mm_empty ();
}

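/* IN with a solid source, a8 mask and a8 destination: every destination
 * byte becomes dest * (src.alpha * mask).  The vector loop treats four
 * mask and destination bytes as one 32-bit pixel and multiplies them in
 * one go; MUL_UN8 is the rounded 8-bit multiply used for the unaligned
 * head and tail.
 */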
static void
mmx_composite_in_n_8_8 (pixman_implementation_t *imp,
                        pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint8_t sa;
    __m64 vsrc, vsrca;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    sa = src >> 24;

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    uint16_t tmp;
	    uint8_t a;
	    uint32_t m, d;

	    a = *mask++;
	    d = *dst;

	    m = MUL_UN8 (sa, a, tmp);
	    d = MUL_UN8 (m, d, tmp);

	    *dst++ = d;
	    w--;
	}

	while (w >= 4)
	{
	    __m64 vmask;
	    __m64 vdest;

	    vmask = load8888u ((uint32_t *)mask);
	    vdest = load8888 ((uint32_t *)dst);

	    store8888 ((uint32_t *)dst, in (in (vsrca, vmask), vdest));

	    dst += 4;
	    mask += 4;
	    w -= 4;
	}

	while (w--)
	{
	    uint16_t tmp;
	    uint8_t a;
	    uint32_t m, d;

	    a = *mask++;
	    d = *dst;

	    m = MUL_UN8 (sa, a, tmp);
	    d = MUL_UN8 (m, d, tmp);

	    *dst++ = d;
	}
    }

    _mm_empty ();
}

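/* IN with both operands a8: dest = src * dest per byte, four bytes per
 * iteration in the vector loop.
 */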
static void
mmx_composite_in_8_8 (pixman_implementation_t *imp,
                      pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    uint8_t     *src_line, *src;
    int src_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 3)
	{
	    uint8_t s, d;
	    uint16_t tmp;

	    s = *src;
	    d = *dst;

	    *dst = MUL_UN8 (s, d, tmp);

	    src++;
	    dst++;
	    w--;
	}

	while (w >= 4)
	{
	    uint32_t *s = (uint32_t *)src;
	    uint32_t *d = (uint32_t *)dst;

	    store8888 (d, in (load8888u (s), load8888 (d)));

	    w -= 4;
	    dst += 4;
	    src += 4;
	}

	while (w--)
	{
	    uint8_t s, d;
	    uint16_t tmp;

	    s = *src;
	    d = *dst;

	    *dst = MUL_UN8 (s, d, tmp);

	    src++;
	    dst++;
	}
    }

    _mm_empty ();
}

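/* ADD with a solid source and an a8 mask: dest = clamp (dest +
 * src.alpha * mask), using the saturating _mm_adds_pu8 in the vector loop
 * and ADD_UN8 in the scalar head and tail.
 */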
static void
mmx_composite_add_n_8_8 (pixman_implementation_t *imp,
			 pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t     *dst_line, *dst;
    uint8_t     *mask_line, *mask;
    int dst_stride, mask_stride;
    int32_t w;
    uint32_t src;
    uint8_t sa;
    __m64 vsrc, vsrca;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    sa = src >> 24;

    if (src == 0)
	return;

    vsrc = load8888 (&src);
    vsrca = expand_alpha (vsrc);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;
	w = width;

	while (w && (uintptr_t)dst & 3)
	{
	    uint16_t tmp;
	    uint16_t a;
	    uint32_t m, d;
	    uint32_t r;

	    a = *mask++;
	    d = *dst;

	    m = MUL_UN8 (sa, a, tmp);
	    r = ADD_UN8 (m, d, tmp);

	    *dst++ = r;
	    w--;
	}

	while (w >= 4)
	{
	    __m64 vmask;
	    __m64 vdest;

	    vmask = load8888u ((uint32_t *)mask);
	    vdest = load8888 ((uint32_t *)dst);

	    store8888 ((uint32_t *)dst, _mm_adds_pu8 (in (vsrca, vmask), vdest));

	    dst += 4;
	    mask += 4;
	    w -= 4;
	}

	while (w--)
	{
	    uint16_t tmp;
	    uint16_t a;
	    uint32_t m, d;
	    uint32_t r;

	    a = *mask++;
	    d = *dst;

	    m = MUL_UN8 (sa, a, tmp);
	    r = ADD_UN8 (m, d, tmp);

	    *dst++ = r;
	}
    }

    _mm_empty ();
}

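/* Plain saturating ADD of two a8 images, eight bytes per iteration.  The
 * scalar fallback saturates without a branch: t is the 9-bit sum, t >> 8
 * its carry, and (0 - carry) is all ones when the sum overflowed, so
 * OR-ing it in clamps the result to 255.
 */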
static void
mmx_composite_add_8_8 (pixman_implementation_t *imp,
		       pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint8_t *dst_line, *dst;
    uint8_t *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;
    uint8_t s, d;
    uint16_t t;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    s = *src;
	    d = *dst;
	    t = d + s;
	    s = t | (0 - (t >> 8));
	    *dst = s;

	    dst++;
	    src++;
	    w--;
	}

	while (w >= 8)
	{
	    *(__m64*)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
	    dst += 8;
	    src += 8;
	    w -= 8;
	}

	while (w)
	{
	    s = *src;
	    d = *dst;
	    t = d + s;
	    s = t | (0 - (t >> 8));
	    *dst = s;

	    dst++;
	    src++;
	    w--;
	}
    }

    _mm_empty ();
}

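/* Saturating ADD on r5g6b5: each group of four pixels is widened to 8-bit
 * channels, added with unsigned saturation, and packed back to 565.  The
 * scalar paths do the same via convert_0565_to_8888/UN8x4_ADD_UN8x4.
 */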
static void
mmx_composite_add_0565_0565 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint16_t    *dst_line, *dst;
    uint32_t	d;
    uint16_t    *src_line, *src;
    uint32_t	s;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint16_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    s = *src++;
	    if (s)
	    {
		d = *dst;
		s = convert_0565_to_8888 (s);
		if (d)
		{
		    d = convert_0565_to_8888 (d);
		    UN8x4_ADD_UN8x4 (s, d);
		}
		*dst = convert_8888_to_0565 (s);
	    }
	    dst++;
	    w--;
	}

	while (w >= 4)
	{
	    __m64 vdest = *(__m64 *)dst;
	    __m64 vsrc = ldq_u ((__m64 *)src);
	    __m64 vd0, vd1;
	    __m64 vs0, vs1;

	    expand_4xpacked565 (vdest, &vd0, &vd1, 0);
	    expand_4xpacked565 (vsrc, &vs0, &vs1, 0);

	    vd0 = _mm_adds_pu8 (vd0, vs0);
	    vd1 = _mm_adds_pu8 (vd1, vs1);

	    *(__m64 *)dst = pack_4xpacked565 (vd0, vd1);

	    dst += 4;
	    src += 4;
	    w -= 4;
	}

	while (w--)
	{
	    s = *src++;
	    if (s)
	    {
		d = *dst;
		s = convert_0565_to_8888 (s);
		if (d)
		{
		    d = convert_0565_to_8888 (d);
		    UN8x4_ADD_UN8x4 (s, d);
		}
		*dst = convert_8888_to_0565 (s);
	    }
	    dst++;
	}
    }

    _mm_empty ();
}

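/* Saturating ADD of two a8r8g8b8 images, two pixels per 64-bit iteration. */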
static void
mmx_composite_add_8888_8888 (pixman_implementation_t *imp,
                             pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t    *dst_line, *dst;
    uint32_t    *src_line, *src;
    int dst_stride, src_stride;
    int32_t w;

    CHECKPOINT ();

    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	src = src_line;
	src_line += src_stride;
	w = width;

	while (w && (uintptr_t)dst & 7)
	{
	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
	                              load ((const uint32_t *)dst)));
	    dst++;
	    src++;
	    w--;
	}

	while (w >= 2)
	{
	    *(__m64 *)dst = _mm_adds_pu8 (ldq_u ((__m64 *)src), *(__m64*)dst);
	    dst += 2;
	    src += 2;
	    w -= 2;
	}

	if (w)
	{
	    store (dst, _mm_adds_pu8 (load ((const uint32_t *)src),
	                              load ((const uint32_t *)dst)));
	}
    }

    _mm_empty ();
}

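/* Raw rectangle copy for 16 and 32 bpp surfaces.  Rows are copied with
 * byte/word/dword stores until the destination is 8-byte aligned, then in
 * 64-byte bursts through the eight MMX registers, with smaller copies
 * mopping up the tail.  Source reads go through ldq_u/ldl_u, so only the
 * destination needs alignment.
 */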
static pixman_bool_t
mmx_blt (pixman_implementation_t *imp,
         uint32_t *               src_bits,
         uint32_t *               dst_bits,
         int                      src_stride,
         int                      dst_stride,
         int                      src_bpp,
         int                      dst_bpp,
         int                      src_x,
         int                      src_y,
         int                      dest_x,
         int                      dest_y,
         int                      width,
         int                      height)
{
    uint8_t *   src_bytes;
    uint8_t *   dst_bytes;
    int byte_width;

    if (src_bpp != dst_bpp)
	return FALSE;

    if (src_bpp == 16)
    {
	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
	src_bytes = (uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
	byte_width = 2 * width;
	src_stride *= 2;
	dst_stride *= 2;
    }
    else if (src_bpp == 32)
    {
	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
	byte_width = 4 * width;
	src_stride *= 4;
	dst_stride *= 4;
    }
    else
    {
	return FALSE;
    }

    while (height--)
    {
	int w;
	uint8_t *s = src_bytes;
	uint8_t *d = dst_bytes;
	src_bytes += src_stride;
	dst_bytes += dst_stride;
	w = byte_width;

	if (w >= 1 && ((uintptr_t)d & 1))
	{
	    *(uint8_t *)d = *(uint8_t *)s;
	    w -= 1;
	    s += 1;
	    d += 1;
	}

	if (w >= 2 && ((uintptr_t)d & 3))
	{
	    *(uint16_t *)d = *(uint16_t *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}

	while (w >= 4 && ((uintptr_t)d & 7))
	{
	    *(uint32_t *)d = ldl_u ((uint32_t *)s);

	    w -= 4;
	    s += 4;
	    d += 4;
	}

	while (w >= 64)
	{
#if (defined (__GNUC__) || (defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590))) && defined USE_X86_MMX
	    __asm__ (
	        "movq	  (%1),	  %%mm0\n"
	        "movq	 8(%1),	  %%mm1\n"
	        "movq	16(%1),	  %%mm2\n"
	        "movq	24(%1),	  %%mm3\n"
	        "movq	32(%1),	  %%mm4\n"
	        "movq	40(%1),	  %%mm5\n"
	        "movq	48(%1),	  %%mm6\n"
	        "movq	56(%1),	  %%mm7\n"

	        "movq	%%mm0,	  (%0)\n"
	        "movq	%%mm1,	 8(%0)\n"
	        "movq	%%mm2,	16(%0)\n"
	        "movq	%%mm3,	24(%0)\n"
	        "movq	%%mm4,	32(%0)\n"
	        "movq	%%mm5,	40(%0)\n"
	        "movq	%%mm6,	48(%0)\n"
	        "movq	%%mm7,	56(%0)\n"
		:
		: "r" (d), "r" (s)
		: "memory",
		  "%mm0", "%mm1", "%mm2", "%mm3",
		  "%mm4", "%mm5", "%mm6", "%mm7");
#else
	    __m64 v0 = ldq_u ((__m64 *)(s + 0));
	    __m64 v1 = ldq_u ((__m64 *)(s + 8));
	    __m64 v2 = ldq_u ((__m64 *)(s + 16));
	    __m64 v3 = ldq_u ((__m64 *)(s + 24));
	    __m64 v4 = ldq_u ((__m64 *)(s + 32));
	    __m64 v5 = ldq_u ((__m64 *)(s + 40));
	    __m64 v6 = ldq_u ((__m64 *)(s + 48));
	    __m64 v7 = ldq_u ((__m64 *)(s + 56));
	    *(__m64 *)(d + 0)  = v0;
	    *(__m64 *)(d + 8)  = v1;
	    *(__m64 *)(d + 16) = v2;
	    *(__m64 *)(d + 24) = v3;
	    *(__m64 *)(d + 32) = v4;
	    *(__m64 *)(d + 40) = v5;
	    *(__m64 *)(d + 48) = v6;
	    *(__m64 *)(d + 56) = v7;
#endif

	    w -= 64;
	    s += 64;
	    d += 64;
	}
	while (w >= 4)
	{
	    *(uint32_t *)d = ldl_u ((uint32_t *)s);

	    w -= 4;
	    s += 4;
	    d += 4;
	}
	if (w >= 2)
	{
	    *(uint16_t *)d = *(uint16_t *)s;
	    w -= 2;
	    s += 2;
	    d += 2;
	}
    }

    _mm_empty ();

    return TRUE;
}

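/* SRC copy entry point: just forwards the composite rectangle to mmx_blt. */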
static void
mmx_composite_copy_area (pixman_implementation_t *imp,
                         pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);

    mmx_blt (imp, src_image->bits.bits,
	     dest_image->bits.bits,
	     src_image->bits.rowstride,
	     dest_image->bits.rowstride,
	     PIXMAN_FORMAT_BPP (src_image->bits.format),
	     PIXMAN_FORMAT_BPP (dest_image->bits.format),
	     src_x, src_y, dest_x, dest_y, width, height);
}

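/* OVER from an x8r8g8b8 source through an a8 mask: the undefined alpha
 * byte is forced to 0xff, fully opaque mask bytes store the source
 * directly, and partial coverage goes through in_over one pixel at a time.
 */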
static void
mmx_composite_over_x888_8_8888 (pixman_implementation_t *imp,
                                pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t  *src, *src_line;
    uint32_t  *dst, *dst_line;
    uint8_t  *mask, *mask_line;
    int src_stride, mask_stride, dst_stride;
    int32_t w;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
    PIXMAN_IMAGE_GET_LINE (mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);

    while (height--)
    {
	src = src_line;
	src_line += src_stride;
	dst = dst_line;
	dst_line += dst_stride;
	mask = mask_line;
	mask_line += mask_stride;

	w = width;

	while (w--)
	{
	    uint64_t m = *mask;

	    if (m)
	    {
		uint32_t ssrc = *src | 0xff000000;
		__m64 s = load8888 (&ssrc);

		if (m == 0xff)
		{
		    store8888 (dst, s);
		}
		else
		{
		    __m64 sa = expand_alpha (s);
		    __m64 vm = expand_alpha_rev (to_m64 (m));
		    __m64 vdest = in_over (s, sa, vm, load8888 (dst));

		    store8888 (dst, vdest);
		}
	    }

	    mask++;
	    dst++;
	    src++;
	}
    }

    _mm_empty ();
}

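/* OVER_REVERSE with a solid source: the destination stays on top, so each
 * pixel becomes dest + (1 - dest.alpha) * src.
 */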
static void
mmx_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
                                   pixman_composite_info_t *info)
{
    PIXMAN_COMPOSITE_ARGS (info);
    uint32_t src;
    uint32_t    *dst_line, *dst;
    int32_t w;
    int dst_stride;
    __m64 vsrc;

    CHECKPOINT ();

    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);

    if (src == 0)
	return;

    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);

    vsrc = load8888 (&src);

    while (height--)
    {
	dst = dst_line;
	dst_line += dst_stride;
	w = width;

	CHECKPOINT ();

	while (w && (uintptr_t)dst & 7)
	{
	    __m64 vdest = load8888 (dst);

	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));

	    w--;
	    dst++;
	}

	while (w >= 2)
	{
	    __m64 vdest = *(__m64 *)dst;
	    __m64 dest0 = expand8888 (vdest, 0);
	    __m64 dest1 = expand8888 (vdest, 1);

	    dest0 = over (dest0, expand_alpha (dest0), vsrc);
	    dest1 = over (dest1, expand_alpha (dest1), vsrc);

	    *(__m64 *)dst = pack8888 (dest0, dest1);

	    dst += 2;
	    w -= 2;
	}

	CHECKPOINT ();

	if (w)
	{
	    __m64 vdest = load8888 (dst);

	    store8888 (dst, over (vdest, expand_alpha (vdest), vsrc));
	}
    }

    _mm_empty ();
}

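/* Bilinear scaling support.  For one output pixel with vertical weights
 * wt/wb and horizontal fraction wx (the top BILINEAR_INTERPOLATION_BITS
 * bits of vx), the macros compute, per channel and modulo rounding:
 *
 *   pix = ((t0 * wt + b0 * wb) * (BSHIFT - wx) +
 *          (t1 * wt + b1 * wb) * wx) >> (2 * BILINEAR_INTERPOLATION_BITS)
 *
 * where t0/t1 and b0/b1 are the two top and bottom source pixels.  With
 * fewer than 8 interpolation bits the weighted horizontal sum fits the
 * signed 16-bit range of a single pmaddwd; otherwise it needs the 32-bit
 * mullo/mulhi combination in the else branch below.
 */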
#define BSHIFT ((1 << BILINEAR_INTERPOLATION_BITS))
#define BMSK (BSHIFT - 1)

#define BILINEAR_DECLARE_VARIABLES						\
    const __m64 mm_wt = _mm_set_pi16 (wt, wt, wt, wt);				\
    const __m64 mm_wb = _mm_set_pi16 (wb, wb, wb, wb);				\
    const __m64 mm_BSHIFT = _mm_set_pi16 (BSHIFT, BSHIFT, BSHIFT, BSHIFT);	\
    const __m64 mm_addc7 = _mm_set_pi16 (0, 1, 0, 1);				\
    const __m64 mm_xorc7 = _mm_set_pi16 (0, BMSK, 0, BMSK);			\
    const __m64 mm_ux = _mm_set_pi16 (unit_x, unit_x, unit_x, unit_x);		\
    const __m64 mm_zero = _mm_setzero_si64 ();					\
    __m64 mm_x = _mm_set_pi16 (vx, vx, vx, vx)

#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
do {										\
    /* fetch 2x2 pixel block into 2 mmx registers */				\
    __m64 t = ldq_u ((__m64 *)&src_top [pixman_fixed_to_int (vx)]);		\
    __m64 b = ldq_u ((__m64 *)&src_bottom [pixman_fixed_to_int (vx)]);		\
    /* vertical interpolation */						\
    __m64 t_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (t, mm_zero), mm_wt);		\
    __m64 t_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (t, mm_zero), mm_wt);		\
    __m64 b_hi = _mm_mullo_pi16 (_mm_unpackhi_pi8 (b, mm_zero), mm_wb);		\
    __m64 b_lo = _mm_mullo_pi16 (_mm_unpacklo_pi8 (b, mm_zero), mm_wb);		\
    __m64 hi = _mm_add_pi16 (t_hi, b_hi);					\
    __m64 lo = _mm_add_pi16 (t_lo, b_lo);					\
    vx += unit_x;								\
    if (BILINEAR_INTERPOLATION_BITS < 8)					\
    {										\
	/* calculate horizontal weights */					\
	__m64 mm_wh = _mm_add_pi16 (mm_addc7, _mm_xor_si64 (mm_xorc7,		\
			  _mm_srli_pi16 (mm_x,					\
					 16 - BILINEAR_INTERPOLATION_BITS)));	\
	/* horizontal interpolation */						\
	__m64 p = _mm_unpacklo_pi16 (lo, hi);					\
	__m64 q = _mm_unpackhi_pi16 (lo, hi);					\
	lo = _mm_madd_pi16 (p, mm_wh);						\
	hi = _mm_madd_pi16 (q, mm_wh);						\
    }										\
    else									\
    {										\
	/* calculate horizontal weights */					\
	__m64 mm_wh_lo = _mm_sub_pi16 (mm_BSHIFT, _mm_srli_pi16 (mm_x,		\
					16 - BILINEAR_INTERPOLATION_BITS));	\
	__m64 mm_wh_hi = _mm_srli_pi16 (mm_x,					\
					16 - BILINEAR_INTERPOLATION_BITS);	\
	/* horizontal interpolation */						\
	__m64 mm_lo_lo = _mm_mullo_pi16 (lo, mm_wh_lo);				\
	__m64 mm_lo_hi = _mm_mullo_pi16 (hi, mm_wh_hi);				\
	__m64 mm_hi_lo = _mm_mulhi_pu16 (lo, mm_wh_lo);				\
	__m64 mm_hi_hi = _mm_mulhi_pu16 (hi, mm_wh_hi);				\
	lo = _mm_add_pi32 (_mm_unpacklo_pi16 (mm_lo_lo, mm_hi_lo),		\
			   _mm_unpacklo_pi16 (mm_lo_hi, mm_hi_hi));		\
	hi = _mm_add_pi32 (_mm_unpackhi_pi16 (mm_lo_lo, mm_hi_lo),		\
			   _mm_unpackhi_pi16 (mm_lo_hi, mm_hi_hi));		\
    }										\
    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
    /* shift and pack the result */						\
    hi = _mm_srli_pi32 (hi, BILINEAR_INTERPOLATION_BITS * 2);			\
    lo = _mm_srli_pi32 (lo, BILINEAR_INTERPOLATION_BITS * 2);			\
    lo = _mm_packs_pi32 (lo, hi);						\
    lo = _mm_packs_pu16 (lo, lo);						\
    pix = lo;									\
} while (0)

#define BILINEAR_SKIP_ONE_PIXEL()						\
do {										\
    vx += unit_x;								\
    mm_x = _mm_add_pi16 (mm_x, mm_ux);						\
} while(0)

static force_inline void
scaled_bilinear_scanline_mmx_8888_8888_SRC (uint32_t *       dst,
					    const uint32_t * mask,
					    const uint32_t * src_top,
					    const uint32_t * src_bottom,
					    int32_t          w,
					    int              wt,
					    int              wb,
					    pixman_fixed_t   vx,
					    pixman_fixed_t   unit_x,
					    pixman_fixed_t   max_vx,
					    pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix;

    while (w--)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix);
	store (dst, pix);
	dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_SRC,
			       scaled_bilinear_scanline_mmx_8888_8888_SRC,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)

static force_inline void
scaled_bilinear_scanline_mmx_8888_8888_OVER (uint32_t *       dst,
					     const uint32_t * mask,
					     const uint32_t * src_top,
					     const uint32_t * src_bottom,
					     int32_t          w,
					     int              wt,
					     int              wb,
					     pixman_fixed_t   vx,
					     pixman_fixed_t   unit_x,
					     pixman_fixed_t   max_vx,
					     pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix1, pix2;

    while (w)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

	if (!is_zero (pix1))
	{
	    pix2 = load (dst);
	    store8888 (dst, core_combine_over_u_pixel_mmx (pix1, pix2));
	}

	w--;
	dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_cover_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_pad_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_none_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_NONE)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8888_normal_OVER,
			       scaled_bilinear_scanline_mmx_8888_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_NONE)

static force_inline void
scaled_bilinear_scanline_mmx_8888_8_8888_OVER (uint32_t *       dst,
					       const uint8_t  * mask,
					       const uint32_t * src_top,
					       const uint32_t * src_bottom,
					       int32_t          w,
					       int              wt,
					       int              wb,
					       pixman_fixed_t   vx,
					       pixman_fixed_t   unit_x,
					       pixman_fixed_t   max_vx,
					       pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    __m64 pix1, pix2;
    uint32_t m;

    while (w)
    {
	m = (uint32_t) *mask++;

	if (m)
	{
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);

	    if (m == 0xff && is_opaque (pix1))
	    {
		store (dst, pix1);
	    }
	    else
	    {
		__m64 ms, md, ma, msa;

		pix2 = load (dst);
		ma = expand_alpha_rev (to_m64 (m));
		ms = _mm_unpacklo_pi8 (pix1, _mm_setzero_si64 ());
		md = _mm_unpacklo_pi8 (pix2, _mm_setzero_si64 ());

		msa = expand_alpha (ms);

		store8888 (dst, (in_over (ms, msa, ma, md)));
	    }
	}
	else
	{
	    BILINEAR_SKIP_ONE_PIXEL ();
	}

	w--;
	dst++;
    }

    _mm_empty ();
}

FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_cover_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       COVER, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_pad_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       PAD, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_none_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NONE, FLAG_HAVE_NON_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (mmx_8888_8_8888_normal_OVER,
			       scaled_bilinear_scanline_mmx_8888_8_8888_OVER,
			       uint32_t, uint8_t, uint32_t,
			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)

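/* Narrow-scanline source iterators: each fetcher converts one scanline of
 * the underlying image to a8r8g8b8 in iter->buffer and advances iter->bits
 * by one stride, so the general pipeline only ever sees 32-bit pixels.
 */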
static uint32_t *
3771
mmx_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
3772
{
3773
    int w = iter->width;
3774
    uint32_t *dst = iter->buffer;
3775
    uint32_t *src = (uint32_t *)iter->bits;
3776
 
3777
    iter->bits += iter->stride;
3778
 
3779
    while (w && ((uintptr_t)dst) & 7)
3780
    {
3781
	*dst++ = (*src++) | 0xff000000;
3782
	w--;
3783
    }
3784
 
3785
    while (w >= 8)
3786
    {
3787
	__m64 vsrc1 = ldq_u ((__m64 *)(src + 0));
3788
	__m64 vsrc2 = ldq_u ((__m64 *)(src + 2));
3789
	__m64 vsrc3 = ldq_u ((__m64 *)(src + 4));
3790
	__m64 vsrc4 = ldq_u ((__m64 *)(src + 6));
3791
 
3792
	*(__m64 *)(dst + 0) = _mm_or_si64 (vsrc1, MC (ff000000));
3793
	*(__m64 *)(dst + 2) = _mm_or_si64 (vsrc2, MC (ff000000));
3794
	*(__m64 *)(dst + 4) = _mm_or_si64 (vsrc3, MC (ff000000));
3795
	*(__m64 *)(dst + 6) = _mm_or_si64 (vsrc4, MC (ff000000));
3796
 
3797
	dst += 8;
3798
	src += 8;
3799
	w -= 8;
3800
    }
3801
 
3802
    while (w)
3803
    {
3804
	*dst++ = (*src++) | 0xff000000;
3805
	w--;
3806
    }
3807
 
3808
    _mm_empty ();
3809
    return iter->buffer;
3810
}
3811
 
3812
static uint32_t *
3813
mmx_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
3814
{
3815
    int w = iter->width;
3816
    uint32_t *dst = iter->buffer;
3817
    uint16_t *src = (uint16_t *)iter->bits;
3818
 
3819
    iter->bits += iter->stride;
3820
 
3821
    while (w && ((uintptr_t)dst) & 0x0f)
3822
    {
3823
	uint16_t s = *src++;
3824
 
3825
	*dst++ = convert_0565_to_8888 (s);
3826
	w--;
3827
    }
3828
 
3829
    while (w >= 4)
3830
    {
3831
	__m64 vsrc = ldq_u ((__m64 *)src);
3832
	__m64 mm0, mm1;
3833
 
3834
	expand_4xpacked565 (vsrc, &mm0, &mm1, 1);
3835
 
3836
	*(__m64 *)(dst + 0) = mm0;
3837
	*(__m64 *)(dst + 2) = mm1;
3838
 
3839
	dst += 4;
3840
	src += 4;
3841
	w -= 4;
3842
    }
3843
 
3844
    while (w)
3845
    {
3846
	uint16_t s = *src++;
3847
 
3848
	*dst++ = convert_0565_to_8888 (s);
3849
	w--;
3850
    }
3851
 
3852
    _mm_empty ();
3853
    return iter->buffer;
3854
}
3855
 
3856
static uint32_t *
3857
mmx_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
3858
{
3859
    int w = iter->width;
3860
    uint32_t *dst = iter->buffer;
3861
    uint8_t *src = iter->bits;
3862
 
3863
    iter->bits += iter->stride;
3864
 
3865
    while (w && (((uintptr_t)dst) & 15))
3866
    {
3867
        *dst++ = *(src++) << 24;
3868
        w--;
3869
    }
3870
 
3871
    while (w >= 8)
3872
    {
3873
	__m64 mm0 = ldq_u ((__m64 *)src);
3874
 
3875
	__m64 mm1 = _mm_unpacklo_pi8  (_mm_setzero_si64(), mm0);
	__m64 mm2 = _mm_unpackhi_pi8  (_mm_setzero_si64(), mm0);
	__m64 mm3 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm1);
	__m64 mm4 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm1);
	__m64 mm5 = _mm_unpacklo_pi16 (_mm_setzero_si64(), mm2);
	__m64 mm6 = _mm_unpackhi_pi16 (_mm_setzero_si64(), mm2);

	*(__m64 *)(dst + 0) = mm3;
	*(__m64 *)(dst + 2) = mm4;
	*(__m64 *)(dst + 4) = mm5;
	*(__m64 *)(dst + 6) = mm6;

	dst += 8;
	src += 8;
	w -= 8;
    }

    while (w)
    {
	*dst++ = *(src++) << 24;
	w--;
    }

    _mm_empty ();
    return iter->buffer;
}

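/* Table mapping source formats to their scanline fetchers; terminated
 * by PIXMAN_null.
 */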
typedef struct
{
    pixman_format_code_t	format;
    pixman_iter_get_scanline_t	get_scanline;
} fetcher_info_t;

static const fetcher_info_t fetchers[] =
{
    { PIXMAN_x8r8g8b8,		mmx_fetch_x8r8g8b8 },
    { PIXMAN_r5g6b5,		mmx_fetch_r5g6b5 },
    { PIXMAN_a8,		mmx_fetch_a8 },
    { PIXMAN_null }
};

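/* Install an MMX fetcher for narrow (8888) iterators over
 * untransformed bits images whose samples cover the clip; returning
 * FALSE lets pixman fall back to the delegate implementation.
 */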
static pixman_bool_t
mmx_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
{
    pixman_image_t *image = iter->image;

#define FLAGS								\
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)

    if ((iter->iter_flags & ITER_NARROW)			&&
	(iter->image_flags & FLAGS) == FLAGS)
    {
	const fetcher_info_t *f;

	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
	{
	    if (image->common.extended_format_code == f->format)
	    {
		uint8_t *b = (uint8_t *)image->bits.bits;
		int s = image->bits.rowstride * 4;

		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
		iter->stride = s;

		iter->get_scanline = f->get_scanline;
		return TRUE;
	    }
	}
    }

    return FALSE;
}

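/* Composite fast paths: (op, src format, mask format, dest format)
 * tuples dispatched to the MMX compositing routines above; terminated
 * by PIXMAN_OP_NONE.
 */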
static const pixman_fast_path_t mmx_fast_paths[] =
{
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       r5g6b5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       b5g6r5,   mmx_composite_over_n_8_0565       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8r8g8b8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       a8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    a8,       x8b8g8r8, mmx_composite_over_n_8_8888       ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, a8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, x8r8g8b8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8r8g8b8, r5g6b5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, a8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, x8b8g8r8, mmx_composite_over_n_8888_8888_ca ),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid,    a8b8g8r8, b5g6r5,   mmx_composite_over_n_8888_0565_ca ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   a8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   x8r8g8b8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, pixbuf,   pixbuf,   r5g6b5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  a8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  x8b8g8r8, mmx_composite_over_pixbuf_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, rpixbuf,  rpixbuf,  b5g6r5,   mmx_composite_over_pixbuf_0565    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_x888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    a8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, solid,    x8r8g8b8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    a8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, solid,    x8b8g8r8, mmx_composite_over_8888_n_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       x8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, a8,       a8r8g8b8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       x8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, a8,       a8b8g8r8, mmx_composite_over_x888_8_8888    ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     a8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     x8r8g8b8, mmx_composite_over_n_8888         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     r5g6b5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, solid,    null,     b5g6r5,   mmx_composite_over_n_0565         ),
    PIXMAN_STD_FAST_PATH    (OVER, x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (OVER, x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     a8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     x8r8g8b8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8r8g8b8, null,     r5g6b5,   mmx_composite_over_8888_0565      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     a8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     x8b8g8r8, mmx_composite_over_8888_8888      ),
    PIXMAN_STD_FAST_PATH    (OVER, a8b8g8r8, null,     b5g6r5,   mmx_composite_over_8888_0565      ),

    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8r8g8b8, mmx_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH    (OVER_REVERSE, solid, null, a8b8g8r8, mmx_composite_over_reverse_n_8888),

    PIXMAN_STD_FAST_PATH    (ADD,  r5g6b5,   null,     r5g6b5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  b5g6r5,   null,     b5g6r5,   mmx_composite_add_0565_0565       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_add_8888_8888       ),
    PIXMAN_STD_FAST_PATH    (ADD,  a8,       null,     a8,       mmx_composite_add_8_8             ),
    PIXMAN_STD_FAST_PATH    (ADD,  solid,    a8,       a8,       mmx_composite_add_n_8_8           ),

    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     r5g6b5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     b5g6r5,   mmx_composite_src_x888_0565       ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8r8g8b8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       a8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  solid,    a8,       x8b8g8r8, mmx_composite_src_n_8_8888        ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     a8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     a8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  a8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8r8g8b8, null,     x8r8g8b8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  x8b8g8r8, null,     x8b8g8r8, mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  r5g6b5,   null,     r5g6b5,   mmx_composite_copy_area           ),
    PIXMAN_STD_FAST_PATH    (SRC,  b5g6r5,   null,     b5g6r5,   mmx_composite_copy_area           ),

    PIXMAN_STD_FAST_PATH    (IN,   a8,       null,     a8,       mmx_composite_in_8_8              ),
    PIXMAN_STD_FAST_PATH    (IN,   solid,    a8,       a8,       mmx_composite_in_n_8_8            ),

    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          a8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8,          x8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          a8b8g8r8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8,          x8b8g8r8, mmx_8888_8888                     ),

    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         x8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         x8b8g8r8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8,         a8r8g8b8, mmx_8888_8888                     ),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8,         a8b8g8r8, mmx_8888_8888                     ),

    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, mmx_8888_8_8888                   ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, mmx_8888_8_8888                   ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, mmx_8888_8_8888                   ),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, mmx_8888_8_8888                   ),

    { PIXMAN_OP_NONE },
};

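/* Create the MMX implementation.  Anything it cannot handle is
 * delegated to FALLBACK.  A minimal usage sketch, assuming a
 * CPUID-style feature check named have_mmx (illustrative only; the
 * real dispatch lives in pixman's CPU-detection code):
 *
 *     pixman_implementation_t *imp;
 *
 *     imp = _pixman_implementation_create_general ();
 *     imp = _pixman_implementation_create_fast_path (imp);
 *     if (have_mmx)
 *         imp = _pixman_implementation_create_mmx (imp);
 */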
pixman_implementation_t *
_pixman_implementation_create_mmx (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, mmx_fast_paths);

    imp->combine_32[PIXMAN_OP_OVER] = mmx_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = mmx_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = mmx_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = mmx_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = mmx_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = mmx_combine_add_u;
    imp->combine_32[PIXMAN_OP_SATURATE] = mmx_combine_saturate_u;

    imp->combine_32_ca[PIXMAN_OP_SRC] = mmx_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = mmx_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = mmx_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = mmx_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = mmx_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = mmx_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = mmx_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = mmx_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = mmx_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = mmx_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = mmx_combine_add_ca;

    imp->blt = mmx_blt;
    imp->fill = mmx_fill;

    imp->src_iter_init = mmx_src_iter_init;

    return imp;
}

#endif /* USE_X86_MMX || USE_ARM_IWMMXT || USE_LOONGSON_MMI */