/*
 * Copyright © 2008 Rodrigo Kumpera
 * Copyright © 2008 André Tupinambá
 *
 * Permission to use, copy, modify, distribute, and sell this software and its
 * documentation for any purpose is hereby granted without fee, provided that
 * the above copyright notice appear in all copies and that both that
 * copyright notice and this permission notice appear in supporting
 * documentation, and that the name of Red Hat not be used in advertising or
 * publicity pertaining to distribution of the software without specific,
 * written prior permission.  Red Hat makes no representations about the
 * suitability of this software for any purpose.  It is provided "as is"
 * without express or implied warranty.
 *
 * THE COPYRIGHT HOLDERS DISCLAIM ALL WARRANTIES WITH REGARD TO THIS
 * SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
 * FITNESS, IN NO EVENT SHALL THE COPYRIGHT HOLDERS BE LIABLE FOR ANY
 * SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN
 * AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING
 * OUT OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS
 * SOFTWARE.
 *
 * Author:  Rodrigo Kumpera (kumpera@gmail.com)
 *          André Tupinambá (andrelrt@gmail.com)
 *
 * Based on work by Owen Taylor and Søren Sandmann
 */
#ifdef HAVE_CONFIG_H
#include <config.h>
#endif

#include <xmmintrin.h> /* for _mm_shuffle_pi16 and _MM_SHUFFLE */
#include <emmintrin.h> /* for SSE2 intrinsics */
#include "pixman-private.h"
#include "pixman-combine32.h"
#include "pixman-inlines.h"
 
static __m128i mask_0080;
static __m128i mask_00ff;
static __m128i mask_0101;
static __m128i mask_ffff;
static __m128i mask_ff000000;
static __m128i mask_alpha;

static __m128i mask_565_r;
static __m128i mask_565_g1, mask_565_g2;
static __m128i mask_565_b;
static __m128i mask_red;
static __m128i mask_green;
static __m128i mask_blue;

static __m128i mask_565_fix_rb;
static __m128i mask_565_fix_g;

static __m128i mask_565_rb;
static __m128i mask_565_pack_multiplier;

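/* Throughout this file pixels are "unpacked" from 8 to 16 bits per channel,
 * so one __m128i holds two unpacked a8r8g8b8 pixels.  unpack_32_1x128 ()
 * zero-extends the four bytes of a single pixel into the low four 16-bit
 * lanes of an __m128i.
 */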
static force_inline __m128i
60
unpack_32_1x128 (uint32_t data)
61
{
62
    return _mm_unpacklo_epi8 (_mm_cvtsi32_si128 (data), _mm_setzero_si128 ());
63
}
64
 
65
static force_inline void
66
unpack_128_2x128 (__m128i data, __m128i* data_lo, __m128i* data_hi)
67
{
68
    *data_lo = _mm_unpacklo_epi8 (data, _mm_setzero_si128 ());
69
    *data_hi = _mm_unpackhi_epi8 (data, _mm_setzero_si128 ());
70
}
71
 
72
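/* Expand one r5g6b5 value per 32-bit lane into x8r8g8b8: each field is
 * shifted into its byte, and its high bits are replicated into the low bits
 * (via mask_565_fix_rb / mask_565_fix_g) so that a full 5- or 6-bit field
 * maps to 0xff.
 */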
static force_inline __m128i
73
unpack_565_to_8888 (__m128i lo)
74
{
75
    __m128i r, g, b, rb, t;
76
 
77
    r = _mm_and_si128 (_mm_slli_epi32 (lo, 8), mask_red);
78
    g = _mm_and_si128 (_mm_slli_epi32 (lo, 5), mask_green);
79
    b = _mm_and_si128 (_mm_slli_epi32 (lo, 3), mask_blue);
80
 
81
    rb = _mm_or_si128 (r, b);
82
    t  = _mm_and_si128 (rb, mask_565_fix_rb);
83
    t  = _mm_srli_epi32 (t, 5);
84
    rb = _mm_or_si128 (rb, t);
85
 
86
    t  = _mm_and_si128 (g, mask_565_fix_g);
87
    t  = _mm_srli_epi32 (t, 6);
88
    g  = _mm_or_si128 (g, t);
89
 
90
    return _mm_or_si128 (rb, g);
91
}
92
 
93
static force_inline void
94
unpack_565_128_4x128 (__m128i  data,
95
                      __m128i* data0,
96
                      __m128i* data1,
97
                      __m128i* data2,
98
                      __m128i* data3)
99
{
100
    __m128i lo, hi;
101
 
102
    lo = _mm_unpacklo_epi16 (data, _mm_setzero_si128 ());
103
    hi = _mm_unpackhi_epi16 (data, _mm_setzero_si128 ());
104
 
105
    lo = unpack_565_to_8888 (lo);
106
    hi = unpack_565_to_8888 (hi);
107
 
108
    unpack_128_2x128 (lo, data0, data1);
109
    unpack_128_2x128 (hi, data2, data3);
110
}
111
 
112
static force_inline uint16_t
113
pack_565_32_16 (uint32_t pixel)
114
{
115
    return (uint16_t) (((pixel >> 8) & 0xf800) |
116
		       ((pixel >> 5) & 0x07e0) |
117
		       ((pixel >> 3) & 0x001f));
118
}
119
 
120
static force_inline __m128i
121
pack_2x128_128 (__m128i lo, __m128i hi)
122
{
123
    return _mm_packus_epi16 (lo, hi);
124
}
125
 
126
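/* Pack eight packed a8r8g8b8 pixels (four per register, one per 32-bit
 * lane) into eight r5g6b5 values.  _mm_madd_epi16 with
 * mask_565_pack_multiplier (set up elsewhere) scales the red and blue words
 * and sums them into a single value per pixel; green is masked and OR'd in,
 * and the closing shift/pack sequence narrows the 32-bit results, emulating
 * the SSE4.1-only _mm_packus_epi32.
 */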
static force_inline __m128i
127
pack_565_2packedx128_128 (__m128i lo, __m128i hi)
128
{
129
    __m128i rb0 = _mm_and_si128 (lo, mask_565_rb);
130
    __m128i rb1 = _mm_and_si128 (hi, mask_565_rb);
131
 
132
    __m128i t0 = _mm_madd_epi16 (rb0, mask_565_pack_multiplier);
133
    __m128i t1 = _mm_madd_epi16 (rb1, mask_565_pack_multiplier);
134
 
135
    __m128i g0 = _mm_and_si128 (lo, mask_green);
136
    __m128i g1 = _mm_and_si128 (hi, mask_green);
137
 
138
    t0 = _mm_or_si128 (t0, g0);
139
    t1 = _mm_or_si128 (t1, g1);
140
 
141
    /* Simulates _mm_packus_epi32 */
142
    t0 = _mm_slli_epi32 (t0, 16 - 5);
143
    t1 = _mm_slli_epi32 (t1, 16 - 5);
144
    t0 = _mm_srai_epi32 (t0, 16);
145
    t1 = _mm_srai_epi32 (t1, 16);
146
    return _mm_packs_epi32 (t0, t1);
147
}
148
 
149
static force_inline __m128i
150
pack_565_2x128_128 (__m128i lo, __m128i hi)
151
{
152
    __m128i data;
153
    __m128i r, g1, g2, b;
154
 
155
    data = pack_2x128_128 (lo, hi);
156
 
157
    r  = _mm_and_si128 (data, mask_565_r);
158
    g1 = _mm_and_si128 (_mm_slli_epi32 (data, 3), mask_565_g1);
159
    g2 = _mm_and_si128 (_mm_srli_epi32 (data, 5), mask_565_g2);
160
    b  = _mm_and_si128 (_mm_srli_epi32 (data, 3), mask_565_b);
161
 
162
    return _mm_or_si128 (_mm_or_si128 (_mm_or_si128 (r, g1), g2), b);
163
}
164
 
165
static force_inline __m128i
166
pack_565_4x128_128 (__m128i* xmm0, __m128i* xmm1, __m128i* xmm2, __m128i* xmm3)
167
{
168
    return _mm_packus_epi16 (pack_565_2x128_128 (*xmm0, *xmm1),
169
			     pack_565_2x128_128 (*xmm2, *xmm3));
170
}
171
 
172
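/* is_opaque / is_zero / is_transparent test four packed pixels at once:
 * _mm_movemask_epi8 gathers one bit per byte, and 0x8888 selects the
 * alpha (top) byte of each 32-bit pixel.
 */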
static force_inline int
173
is_opaque (__m128i x)
174
{
175
    __m128i ffs = _mm_cmpeq_epi8 (x, x);
176
 
177
    return (_mm_movemask_epi8 (_mm_cmpeq_epi8 (x, ffs)) & 0x8888) == 0x8888;
178
}
179
 
180
static force_inline int
181
is_zero (__m128i x)
182
{
183
    return _mm_movemask_epi8 (
184
	_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) == 0xffff;
185
}
186
 
187
static force_inline int
188
is_transparent (__m128i x)
189
{
190
    return (_mm_movemask_epi8 (
191
		_mm_cmpeq_epi8 (x, _mm_setzero_si128 ())) & 0x8888) == 0x8888;
192
}
193
 
194
static force_inline __m128i
195
expand_pixel_32_1x128 (uint32_t data)
196
{
197
    return _mm_shuffle_epi32 (unpack_32_1x128 (data), _MM_SHUFFLE (1, 0, 1, 0));
198
}
199
 
200
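/* Broadcast the 16-bit alpha lane of each unpacked pixel to all four of its
 * channel lanes, giving a per-channel multiplier.  The _rev variants do the
 * same for data whose alpha sits in the low lane.
 */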
static force_inline __m128i
201
expand_alpha_1x128 (__m128i data)
202
{
203
    return _mm_shufflehi_epi16 (_mm_shufflelo_epi16 (data,
204
						     _MM_SHUFFLE (3, 3, 3, 3)),
205
				_MM_SHUFFLE (3, 3, 3, 3));
206
}
207
 
208
static force_inline void
209
expand_alpha_2x128 (__m128i  data_lo,
210
                    __m128i  data_hi,
211
                    __m128i* alpha_lo,
212
                    __m128i* alpha_hi)
213
{
214
    __m128i lo, hi;
215
 
216
    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 3, 3, 3));
217
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 3, 3, 3));
218
 
219
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 3, 3, 3));
220
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 3, 3, 3));
221
}
222
 
223
static force_inline void
224
expand_alpha_rev_2x128 (__m128i  data_lo,
225
                        __m128i  data_hi,
226
                        __m128i* alpha_lo,
227
                        __m128i* alpha_hi)
228
{
229
    __m128i lo, hi;
230
 
231
    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (0, 0, 0, 0));
232
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (0, 0, 0, 0));
233
    *alpha_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (0, 0, 0, 0));
234
    *alpha_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (0, 0, 0, 0));
235
}
236
 
237
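/* Per-channel multiply of two unpacked pixels by two unpacked multipliers,
 * with exact rounded division by 255: (x * a + 0x0080) * 0x0101 >> 16.
 * mask_0080 and mask_0101 are initialized elsewhere with those per-lane
 * constants.
 */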
static force_inline void
238
pix_multiply_2x128 (__m128i* data_lo,
239
                    __m128i* data_hi,
240
                    __m128i* alpha_lo,
241
                    __m128i* alpha_hi,
242
                    __m128i* ret_lo,
243
                    __m128i* ret_hi)
244
{
245
    __m128i lo, hi;
246
 
247
    lo = _mm_mullo_epi16 (*data_lo, *alpha_lo);
248
    hi = _mm_mullo_epi16 (*data_hi, *alpha_hi);
249
    lo = _mm_adds_epu16 (lo, mask_0080);
250
    hi = _mm_adds_epu16 (hi, mask_0080);
251
    *ret_lo = _mm_mulhi_epu16 (lo, mask_0101);
252
    *ret_hi = _mm_mulhi_epu16 (hi, mask_0101);
253
}
254
 
255
static force_inline void
256
pix_add_multiply_2x128 (__m128i* src_lo,
257
                        __m128i* src_hi,
258
                        __m128i* alpha_dst_lo,
259
                        __m128i* alpha_dst_hi,
260
                        __m128i* dst_lo,
261
                        __m128i* dst_hi,
262
                        __m128i* alpha_src_lo,
263
                        __m128i* alpha_src_hi,
264
                        __m128i* ret_lo,
265
                        __m128i* ret_hi)
266
{
267
    __m128i t1_lo, t1_hi;
268
    __m128i t2_lo, t2_hi;
269
 
270
    pix_multiply_2x128 (src_lo, src_hi, alpha_dst_lo, alpha_dst_hi, &t1_lo, &t1_hi);
271
    pix_multiply_2x128 (dst_lo, dst_hi, alpha_src_lo, alpha_src_hi, &t2_lo, &t2_hi);
272
 
273
    *ret_lo = _mm_adds_epu8 (t1_lo, t2_lo);
274
    *ret_hi = _mm_adds_epu8 (t1_hi, t2_hi);
275
}
276
 
277
static force_inline void
278
negate_2x128 (__m128i  data_lo,
279
              __m128i  data_hi,
280
              __m128i* neg_lo,
281
              __m128i* neg_hi)
282
{
283
    *neg_lo = _mm_xor_si128 (data_lo, mask_00ff);
284
    *neg_hi = _mm_xor_si128 (data_hi, mask_00ff);
285
}
286
 
287
static force_inline void
288
invert_colors_2x128 (__m128i  data_lo,
289
                     __m128i  data_hi,
290
                     __m128i* inv_lo,
291
                     __m128i* inv_hi)
292
{
293
    __m128i lo, hi;
294
 
295
    lo = _mm_shufflelo_epi16 (data_lo, _MM_SHUFFLE (3, 0, 1, 2));
296
    hi = _mm_shufflelo_epi16 (data_hi, _MM_SHUFFLE (3, 0, 1, 2));
297
    *inv_lo = _mm_shufflehi_epi16 (lo, _MM_SHUFFLE (3, 0, 1, 2));
298
    *inv_hi = _mm_shufflehi_epi16 (hi, _MM_SHUFFLE (3, 0, 1, 2));
299
}
300
 
301
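/* Porter-Duff OVER for two unpacked pixels per register:
 * dst = src + dst * (255 - src_alpha), finished with a saturating add.
 */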
static force_inline void
302
over_2x128 (__m128i* src_lo,
303
            __m128i* src_hi,
304
            __m128i* alpha_lo,
305
            __m128i* alpha_hi,
306
            __m128i* dst_lo,
307
            __m128i* dst_hi)
308
{
309
    __m128i t1, t2;
310
 
311
    negate_2x128 (*alpha_lo, *alpha_hi, &t1, &t2);
312
 
313
    pix_multiply_2x128 (dst_lo, dst_hi, &t1, &t2, dst_lo, dst_hi);
314
 
315
    *dst_lo = _mm_adds_epu8 (*src_lo, *dst_lo);
316
    *dst_hi = _mm_adds_epu8 (*src_hi, *dst_hi);
317
}
318
 
319
static force_inline void
320
over_rev_non_pre_2x128 (__m128i  src_lo,
321
                        __m128i  src_hi,
322
                        __m128i* dst_lo,
323
                        __m128i* dst_hi)
324
{
325
    __m128i lo, hi;
326
    __m128i alpha_lo, alpha_hi;
327
 
328
    expand_alpha_2x128 (src_lo, src_hi, &alpha_lo, &alpha_hi);
329
 
330
    lo = _mm_or_si128 (alpha_lo, mask_alpha);
331
    hi = _mm_or_si128 (alpha_hi, mask_alpha);
332
 
333
    invert_colors_2x128 (src_lo, src_hi, &src_lo, &src_hi);
334
 
335
    pix_multiply_2x128 (&src_lo, &src_hi, &lo, &hi, &lo, &hi);
336
 
337
    over_2x128 (&lo, &hi, &alpha_lo, &alpha_hi, dst_lo, dst_hi);
338
}
339
 
340
static force_inline void
341
in_over_2x128 (__m128i* src_lo,
342
               __m128i* src_hi,
343
               __m128i* alpha_lo,
344
               __m128i* alpha_hi,
345
               __m128i* mask_lo,
346
               __m128i* mask_hi,
347
               __m128i* dst_lo,
348
               __m128i* dst_hi)
349
{
350
    __m128i s_lo, s_hi;
351
    __m128i a_lo, a_hi;
352
 
353
    pix_multiply_2x128 (src_lo,   src_hi, mask_lo, mask_hi, &s_lo, &s_hi);
354
    pix_multiply_2x128 (alpha_lo, alpha_hi, mask_lo, mask_hi, &a_lo, &a_hi);
355
 
356
    over_2x128 (&s_lo, &s_hi, &a_lo, &a_hi, dst_lo, dst_hi);
357
}
358
 
359
/* load 4 pixels from a 16-byte-aligned address */
360
static force_inline __m128i
361
load_128_aligned (__m128i* src)
362
{
363
    return _mm_load_si128 (src);
364
}
365
 
366
/* load 4 pixels from an unaligned address */
367
static force_inline __m128i
368
load_128_unaligned (const __m128i* src)
369
{
370
    return _mm_loadu_si128 (src);
371
}
372
 
373
/* save 4 pixels using write-combining (non-temporal) stores to a
 * 16-byte-aligned address
 */
376
static force_inline void
377
save_128_write_combining (__m128i* dst,
378
                          __m128i  data)
379
{
380
    _mm_stream_si128 (dst, data);
381
}
382
 
383
/* save 4 pixels to a 16-byte-aligned address */
384
static force_inline void
385
save_128_aligned (__m128i* dst,
386
                  __m128i  data)
387
{
388
    _mm_store_si128 (dst, data);
389
}
390
 
391
/* save 4 pixels to an unaligned address */
392
static force_inline void
393
save_128_unaligned (__m128i* dst,
394
                    __m128i  data)
395
{
396
    _mm_storeu_si128 (dst, data);
397
}
398
 
399
static force_inline __m128i
400
load_32_1x128 (uint32_t data)
401
{
402
    return _mm_cvtsi32_si128 (data);
403
}
404
 
405
static force_inline __m128i
406
expand_alpha_rev_1x128 (__m128i data)
407
{
408
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (0, 0, 0, 0));
409
}
410
 
411
static force_inline __m128i
412
expand_pixel_8_1x128 (uint8_t data)
413
{
414
    return _mm_shufflelo_epi16 (
415
	unpack_32_1x128 ((uint32_t)data), _MM_SHUFFLE (0, 0, 0, 0));
416
}
417
 
418
static force_inline __m128i
419
pix_multiply_1x128 (__m128i data,
420
		    __m128i alpha)
421
{
422
    return _mm_mulhi_epu16 (_mm_adds_epu16 (_mm_mullo_epi16 (data, alpha),
423
					    mask_0080),
424
			    mask_0101);
425
}
426
 
427
static force_inline __m128i
428
pix_add_multiply_1x128 (__m128i* src,
429
			__m128i* alpha_dst,
430
			__m128i* dst,
431
			__m128i* alpha_src)
432
{
433
    __m128i t1 = pix_multiply_1x128 (*src, *alpha_dst);
434
    __m128i t2 = pix_multiply_1x128 (*dst, *alpha_src);
435
 
436
    return _mm_adds_epu8 (t1, t2);
437
}
438
 
439
static force_inline __m128i
440
negate_1x128 (__m128i data)
441
{
442
    return _mm_xor_si128 (data, mask_00ff);
443
}
444
 
445
static force_inline __m128i
446
invert_colors_1x128 (__m128i data)
447
{
448
    return _mm_shufflelo_epi16 (data, _MM_SHUFFLE (3, 0, 1, 2));
449
}
450
 
451
static force_inline __m128i
452
over_1x128 (__m128i src, __m128i alpha, __m128i dst)
453
{
454
    return _mm_adds_epu8 (src, pix_multiply_1x128 (dst, negate_1x128 (alpha)));
455
}
456
 
457
static force_inline __m128i
458
in_over_1x128 (__m128i* src, __m128i* alpha, __m128i* mask, __m128i* dst)
459
{
460
    return over_1x128 (pix_multiply_1x128 (*src, *mask),
461
		       pix_multiply_1x128 (*alpha, *mask),
462
		       *dst);
463
}
464
 
465
static force_inline __m128i
466
over_rev_non_pre_1x128 (__m128i src, __m128i dst)
467
{
468
    __m128i alpha = expand_alpha_1x128 (src);
469
 
470
    return over_1x128 (pix_multiply_1x128 (invert_colors_1x128 (src),
471
					   _mm_or_si128 (alpha, mask_alpha)),
472
		       alpha,
473
		       dst);
474
}
475
 
476
static force_inline uint32_t
477
pack_1x128_32 (__m128i data)
478
{
479
    return _mm_cvtsi128_si32 (_mm_packus_epi16 (data, _mm_setzero_si128 ()));
480
}
481
 
482
static force_inline __m128i
483
expand565_16_1x128 (uint16_t pixel)
484
{
485
    __m128i m = _mm_cvtsi32_si128 (pixel);
486
 
487
    m = unpack_565_to_8888 (m);
488
 
489
    return _mm_unpacklo_epi8 (m, _mm_setzero_si128 ());
490
}
491
 
492
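/* OVER for a single a8r8g8b8 pixel, with fast paths: an opaque source
 * replaces the destination, a zero source leaves it untouched, and only the
 * remaining case takes the unpack / over / pack route.
 */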
static force_inline uint32_t
493
core_combine_over_u_pixel_sse2 (uint32_t src, uint32_t dst)
494
{
495
    uint8_t a;
496
    __m128i xmms;
497
 
498
    a = src >> 24;
499
 
500
    if (a == 0xff)
501
    {
502
	return src;
503
    }
504
    else if (src)
505
    {
506
	xmms = unpack_32_1x128 (src);
507
	return pack_1x128_32 (
508
	    over_1x128 (xmms, expand_alpha_1x128 (xmms),
509
			unpack_32_1x128 (dst)));
510
    }
511
 
512
    return dst;
513
}
514
 
515
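/* combine1 / combine4 fetch 1 or 4 source pixels and, when a mask is
 * present, scale them by the mask's expanded alpha before the operator
 * runs.  combine4 also short-circuits a fully transparent mask to zero.
 */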
static force_inline uint32_t
516
combine1 (const uint32_t *ps, const uint32_t *pm)
517
{
518
    uint32_t s = *ps;
519
 
520
    if (pm)
521
    {
522
	__m128i ms, mm;
523
 
524
	mm = unpack_32_1x128 (*pm);
525
	mm = expand_alpha_1x128 (mm);
526
 
527
	ms = unpack_32_1x128 (s);
528
	ms = pix_multiply_1x128 (ms, mm);
529
 
530
	s = pack_1x128_32 (ms);
531
    }
532
 
533
    return s;
534
}
535
 
536
static force_inline __m128i
537
combine4 (const __m128i *ps, const __m128i *pm)
538
{
539
    __m128i xmm_src_lo, xmm_src_hi;
540
    __m128i xmm_msk_lo, xmm_msk_hi;
541
    __m128i s;
542
 
543
    if (pm)
544
    {
545
	xmm_msk_lo = load_128_unaligned (pm);
546
 
547
	if (is_transparent (xmm_msk_lo))
548
	    return _mm_setzero_si128 ();
549
    }
550
 
551
    s = load_128_unaligned (ps);
552
 
553
    if (pm)
554
    {
555
	unpack_128_2x128 (s, &xmm_src_lo, &xmm_src_hi);
556
	unpack_128_2x128 (xmm_msk_lo, &xmm_msk_lo, &xmm_msk_hi);
557
 
558
	expand_alpha_2x128 (xmm_msk_lo, xmm_msk_hi, &xmm_msk_lo, &xmm_msk_hi);
559
 
560
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
561
			    &xmm_msk_lo, &xmm_msk_hi,
562
			    &xmm_src_lo, &xmm_src_hi);
563
 
564
	s = pack_2x128_128 (xmm_src_lo, xmm_src_hi);
565
    }
566
 
567
    return s;
568
}
569
 
570
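/* Masked OVER loop: handle single pixels until dst reaches a 16-byte
 * boundary, then process 4 pixels per iteration with whole-vector shortcuts
 * (skip when the masked source is zero, plain store when it is opaque),
 * and finish with a scalar tail.
 */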
static force_inline void
571
core_combine_over_u_sse2_mask (uint32_t *	  pd,
572
			       const uint32_t*    ps,
573
			       const uint32_t*    pm,
574
			       int                w)
575
{
576
    uint32_t s, d;
577
 
578
    /* Align dst on a 16-byte boundary */
579
    while (w && ((uintptr_t)pd & 15))
580
    {
581
	d = *pd;
582
	s = combine1 (ps, pm);
583
 
584
	if (s)
585
	    *pd = core_combine_over_u_pixel_sse2 (s, d);
586
	pd++;
587
	ps++;
588
	pm++;
589
	w--;
590
    }
591
 
592
    while (w >= 4)
593
    {
594
	__m128i mask = load_128_unaligned ((__m128i *)pm);
595
 
596
	if (!is_zero (mask))
597
	{
598
	    __m128i src;
599
	    __m128i src_hi, src_lo;
600
	    __m128i mask_hi, mask_lo;
601
	    __m128i alpha_hi, alpha_lo;
602
 
603
	    src = load_128_unaligned ((__m128i *)ps);
604
 
605
	    if (is_opaque (_mm_and_si128 (src, mask)))
606
	    {
607
		save_128_aligned ((__m128i *)pd, src);
608
	    }
609
	    else
610
	    {
611
		__m128i dst = load_128_aligned ((__m128i *)pd);
612
		__m128i dst_hi, dst_lo;
613
 
614
		unpack_128_2x128 (mask, &mask_lo, &mask_hi);
615
		unpack_128_2x128 (src, &src_lo, &src_hi);
616
 
617
		expand_alpha_2x128 (mask_lo, mask_hi, &mask_lo, &mask_hi);
618
		pix_multiply_2x128 (&src_lo, &src_hi,
619
				    &mask_lo, &mask_hi,
620
				    &src_lo, &src_hi);
621
 
622
		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
623
 
624
		expand_alpha_2x128 (src_lo, src_hi,
625
				    &alpha_lo, &alpha_hi);
626
 
627
		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
628
			    &dst_lo, &dst_hi);
629
 
630
		save_128_aligned (
631
		    (__m128i *)pd,
632
		    pack_2x128_128 (dst_lo, dst_hi));
633
	    }
634
	}
635
 
636
	pm += 4;
637
	ps += 4;
638
	pd += 4;
639
	w -= 4;
640
    }
641
    while (w)
642
    {
643
	d = *pd;
644
	s = combine1 (ps, pm);
645
 
646
	if (s)
647
	    *pd = core_combine_over_u_pixel_sse2 (s, d);
648
	pd++;
649
	ps++;
650
	pm++;
651
 
652
	w--;
653
    }
654
}
655
 
656
static force_inline void
657
core_combine_over_u_sse2_no_mask (uint32_t *	  pd,
658
				  const uint32_t*    ps,
659
				  int                w)
660
{
661
    uint32_t s, d;
662
 
663
    /* Align dst on a 16-byte boundary */
664
    while (w && ((uintptr_t)pd & 15))
665
    {
666
	d = *pd;
667
	s = *ps;
668
 
669
	if (s)
670
	    *pd = core_combine_over_u_pixel_sse2 (s, d);
671
	pd++;
672
	ps++;
673
	w--;
674
    }
675
 
676
    while (w >= 4)
677
    {
678
	__m128i src;
679
	__m128i src_hi, src_lo, dst_hi, dst_lo;
680
	__m128i alpha_hi, alpha_lo;
681
 
682
	src = load_128_unaligned ((__m128i *)ps);
683
 
684
	if (!is_zero (src))
685
	{
686
	    if (is_opaque (src))
687
	    {
688
		save_128_aligned ((__m128i *)pd, src);
689
	    }
690
	    else
691
	    {
692
		__m128i dst = load_128_aligned ((__m128i *)pd);
693
 
694
		unpack_128_2x128 (src, &src_lo, &src_hi);
695
		unpack_128_2x128 (dst, &dst_lo, &dst_hi);
696
 
697
		expand_alpha_2x128 (src_lo, src_hi,
698
				    &alpha_lo, &alpha_hi);
699
		over_2x128 (&src_lo, &src_hi, &alpha_lo, &alpha_hi,
700
			    &dst_lo, &dst_hi);
701
 
702
		save_128_aligned (
703
		    (__m128i *)pd,
704
		    pack_2x128_128 (dst_lo, dst_hi));
705
	    }
706
	}
707
 
708
	ps += 4;
709
	pd += 4;
710
	w -= 4;
711
    }
712
    while (w)
713
    {
714
	d = *pd;
715
	s = *ps;
716
 
717
	if (s)
718
	    *pd = core_combine_over_u_pixel_sse2 (s, d);
719
	pd++;
720
	ps++;
721
 
722
	w--;
723
    }
724
}
725
 
726
static force_inline void
727
sse2_combine_over_u (pixman_implementation_t *imp,
728
                     pixman_op_t              op,
729
                     uint32_t *               pd,
730
                     const uint32_t *         ps,
731
                     const uint32_t *         pm,
732
                     int                      w)
733
{
734
    if (pm)
735
	core_combine_over_u_sse2_mask (pd, ps, pm, w);
736
    else
737
	core_combine_over_u_sse2_no_mask (pd, ps, w);
738
}
739
 
740
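/* OVER_REVERSE: identical math with source and destination swapped; note
 * the (d, s) argument order passed to the scalar OVER helper.
 */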
static void
741
sse2_combine_over_reverse_u (pixman_implementation_t *imp,
742
                             pixman_op_t              op,
743
                             uint32_t *               pd,
744
                             const uint32_t *         ps,
745
                             const uint32_t *         pm,
746
                             int                      w)
747
{
748
    uint32_t s, d;
749
 
750
    __m128i xmm_dst_lo, xmm_dst_hi;
751
    __m128i xmm_src_lo, xmm_src_hi;
752
    __m128i xmm_alpha_lo, xmm_alpha_hi;
753
 
754
    /* Align dst on a 16-byte boundary */
755
    while (w &&
756
           ((uintptr_t)pd & 15))
757
    {
758
	d = *pd;
759
	s = combine1 (ps, pm);
760
 
761
	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
762
	w--;
763
	ps++;
764
	if (pm)
765
	    pm++;
766
    }
767
 
768
    while (w >= 4)
769
    {
770
	/* I'm loading unaligned because I'm not sure
771
	 * about the address alignment.
772
	 */
773
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
774
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
775
 
776
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
777
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
778
 
779
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
780
			    &xmm_alpha_lo, &xmm_alpha_hi);
781
 
782
	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
783
		    &xmm_alpha_lo, &xmm_alpha_hi,
784
		    &xmm_src_lo, &xmm_src_hi);
785
 
786
	/* rebuild the 4 pixel data and save */
787
	save_128_aligned ((__m128i*)pd,
788
			  pack_2x128_128 (xmm_src_lo, xmm_src_hi));
789
 
790
	w -= 4;
791
	ps += 4;
792
	pd += 4;
793
 
794
	if (pm)
795
	    pm += 4;
796
    }
797
 
798
    while (w)
799
    {
800
	d = *pd;
801
	s = combine1 (ps, pm);
802
 
803
	*pd++ = core_combine_over_u_pixel_sse2 (d, s);
804
	ps++;
805
	w--;
806
	if (pm)
807
	    pm++;
808
    }
809
}
810
 
811
static force_inline uint32_t
812
core_combine_in_u_pixel_sse2 (uint32_t src, uint32_t dst)
813
{
814
    uint32_t maska = src >> 24;
815
 
816
    if (maska == 0)
817
    {
818
	return 0;
819
    }
820
    else if (maska != 0xff)
821
    {
822
	return pack_1x128_32 (
823
	    pix_multiply_1x128 (unpack_32_1x128 (dst),
824
				expand_alpha_1x128 (unpack_32_1x128 (src))));
825
    }
826
 
827
    return dst;
828
}
829
 
830
static void
831
sse2_combine_in_u (pixman_implementation_t *imp,
832
                   pixman_op_t              op,
833
                   uint32_t *               pd,
834
                   const uint32_t *         ps,
835
                   const uint32_t *         pm,
836
                   int                      w)
837
{
838
    uint32_t s, d;
839
 
840
    __m128i xmm_src_lo, xmm_src_hi;
841
    __m128i xmm_dst_lo, xmm_dst_hi;
842
 
843
    while (w && ((uintptr_t)pd & 15))
844
    {
845
	s = combine1 (ps, pm);
846
	d = *pd;
847
 
848
	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
849
	w--;
850
	ps++;
851
	if (pm)
852
	    pm++;
853
    }
854
 
855
    while (w >= 4)
856
    {
857
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
858
	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*) pm);
859
 
860
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
861
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
862
 
863
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
864
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
865
			    &xmm_dst_lo, &xmm_dst_hi,
866
			    &xmm_dst_lo, &xmm_dst_hi);
867
 
868
	save_128_aligned ((__m128i*)pd,
869
			  pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
870
 
871
	ps += 4;
872
	pd += 4;
873
	w -= 4;
874
	if (pm)
875
	    pm += 4;
876
    }
877
 
878
    while (w)
879
    {
880
	s = combine1 (ps, pm);
881
	d = *pd;
882
 
883
	*pd++ = core_combine_in_u_pixel_sse2 (d, s);
884
	w--;
885
	ps++;
886
	if (pm)
887
	    pm++;
888
    }
889
}
890
 
891
static void
892
sse2_combine_in_reverse_u (pixman_implementation_t *imp,
893
                           pixman_op_t              op,
894
                           uint32_t *               pd,
895
                           const uint32_t *         ps,
896
                           const uint32_t *         pm,
897
                           int                      w)
898
{
899
    uint32_t s, d;
900
 
901
    __m128i xmm_src_lo, xmm_src_hi;
902
    __m128i xmm_dst_lo, xmm_dst_hi;
903
 
904
    while (w && ((uintptr_t)pd & 15))
905
    {
906
	s = combine1 (ps, pm);
907
	d = *pd;
908
 
909
	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
910
	ps++;
911
	w--;
912
	if (pm)
913
	    pm++;
914
    }
915
 
916
    while (w >= 4)
917
    {
918
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
919
	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
920
 
921
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
922
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
923
 
924
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
925
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
926
			    &xmm_src_lo, &xmm_src_hi,
927
			    &xmm_dst_lo, &xmm_dst_hi);
928
 
929
	save_128_aligned (
930
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
931
 
932
	ps += 4;
933
	pd += 4;
934
	w -= 4;
935
	if (pm)
936
	    pm += 4;
937
    }
938
 
939
    while (w)
940
    {
941
	s = combine1 (ps, pm);
942
	d = *pd;
943
 
944
	*pd++ = core_combine_in_u_pixel_sse2 (s, d);
945
	w--;
946
	ps++;
947
	if (pm)
948
	    pm++;
949
    }
950
}
951
 
952
static void
953
sse2_combine_out_reverse_u (pixman_implementation_t *imp,
954
                            pixman_op_t              op,
955
                            uint32_t *               pd,
956
                            const uint32_t *         ps,
957
                            const uint32_t *         pm,
958
                            int                      w)
959
{
960
    while (w && ((uintptr_t)pd & 15))
961
    {
962
	uint32_t s = combine1 (ps, pm);
963
	uint32_t d = *pd;
964
 
965
	*pd++ = pack_1x128_32 (
966
	    pix_multiply_1x128 (
967
		unpack_32_1x128 (d), negate_1x128 (
968
		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
969
 
970
	if (pm)
971
	    pm++;
972
	ps++;
973
	w--;
974
    }
975
 
976
    while (w >= 4)
977
    {
978
	__m128i xmm_src_lo, xmm_src_hi;
979
	__m128i xmm_dst_lo, xmm_dst_hi;
980
 
981
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
982
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
983
 
984
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
985
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
986
 
987
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
988
	negate_2x128       (xmm_src_lo, xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
989
 
990
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
991
			    &xmm_src_lo, &xmm_src_hi,
992
			    &xmm_dst_lo, &xmm_dst_hi);
993
 
994
	save_128_aligned (
995
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
996
 
997
	ps += 4;
998
	pd += 4;
999
	if (pm)
1000
	    pm += 4;
1001
 
1002
	w -= 4;
1003
    }
1004
 
1005
    while (w)
1006
    {
1007
	uint32_t s = combine1 (ps, pm);
1008
	uint32_t d = *pd;
1009
 
1010
	*pd++ = pack_1x128_32 (
1011
	    pix_multiply_1x128 (
1012
		unpack_32_1x128 (d), negate_1x128 (
1013
		    expand_alpha_1x128 (unpack_32_1x128 (s)))));
1014
	ps++;
1015
	if (pm)
1016
	    pm++;
1017
	w--;
1018
    }
1019
}
1020
 
1021
static void
1022
sse2_combine_out_u (pixman_implementation_t *imp,
1023
                    pixman_op_t              op,
1024
                    uint32_t *               pd,
1025
                    const uint32_t *         ps,
1026
                    const uint32_t *         pm,
1027
                    int                      w)
1028
{
1029
    while (w && ((uintptr_t)pd & 15))
1030
    {
1031
	uint32_t s = combine1 (ps, pm);
1032
	uint32_t d = *pd;
1033
 
1034
	*pd++ = pack_1x128_32 (
1035
	    pix_multiply_1x128 (
1036
		unpack_32_1x128 (s), negate_1x128 (
1037
		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
1038
	w--;
1039
	ps++;
1040
	if (pm)
1041
	    pm++;
1042
    }
1043
 
1044
    while (w >= 4)
1045
    {
1046
	__m128i xmm_src_lo, xmm_src_hi;
1047
	__m128i xmm_dst_lo, xmm_dst_hi;
1048
 
1049
	xmm_src_hi = combine4 ((__m128i*) ps, (__m128i*)pm);
1050
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1051
 
1052
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1053
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1054
 
1055
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1056
	negate_2x128       (xmm_dst_lo, xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1057
 
1058
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1059
			    &xmm_dst_lo, &xmm_dst_hi,
1060
			    &xmm_dst_lo, &xmm_dst_hi);
1061
 
1062
	save_128_aligned (
1063
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1064
 
1065
	ps += 4;
1066
	pd += 4;
1067
	w -= 4;
1068
	if (pm)
1069
	    pm += 4;
1070
    }
1071
 
1072
    while (w)
1073
    {
1074
	uint32_t s = combine1 (ps, pm);
1075
	uint32_t d = *pd;
1076
 
1077
	*pd++ = pack_1x128_32 (
1078
	    pix_multiply_1x128 (
1079
		unpack_32_1x128 (s), negate_1x128 (
1080
		    expand_alpha_1x128 (unpack_32_1x128 (d)))));
1081
	w--;
1082
	ps++;
1083
	if (pm)
1084
	    pm++;
1085
    }
1086
}
1087
 
1088
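/* ATOP for one pixel: result = src * dst_alpha + dst * (255 - src_alpha),
 * computed with a single pix_add_multiply.
 */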
static force_inline uint32_t
1089
core_combine_atop_u_pixel_sse2 (uint32_t src,
1090
                                uint32_t dst)
1091
{
1092
    __m128i s = unpack_32_1x128 (src);
1093
    __m128i d = unpack_32_1x128 (dst);
1094
 
1095
    __m128i sa = negate_1x128 (expand_alpha_1x128 (s));
1096
    __m128i da = expand_alpha_1x128 (d);
1097
 
1098
    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1099
}
1100
 
1101
static void
1102
sse2_combine_atop_u (pixman_implementation_t *imp,
1103
                     pixman_op_t              op,
1104
                     uint32_t *               pd,
1105
                     const uint32_t *         ps,
1106
                     const uint32_t *         pm,
1107
                     int                      w)
1108
{
1109
    uint32_t s, d;
1110
 
1111
    __m128i xmm_src_lo, xmm_src_hi;
1112
    __m128i xmm_dst_lo, xmm_dst_hi;
1113
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1114
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1115
 
1116
    while (w && ((uintptr_t)pd & 15))
1117
    {
1118
	s = combine1 (ps, pm);
1119
	d = *pd;
1120
 
1121
	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1122
	w--;
1123
	ps++;
1124
	if (pm)
1125
	    pm++;
1126
    }
1127
 
1128
    while (w >= 4)
1129
    {
1130
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1131
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1132
 
1133
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1134
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1135
 
1136
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1137
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1138
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1139
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1140
 
1141
	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1142
		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1143
 
1144
	pix_add_multiply_2x128 (
1145
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1146
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1147
	    &xmm_dst_lo, &xmm_dst_hi);
1148
 
1149
	save_128_aligned (
1150
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1151
 
1152
	ps += 4;
1153
	pd += 4;
1154
	w -= 4;
1155
	if (pm)
1156
	    pm += 4;
1157
    }
1158
 
1159
    while (w)
1160
    {
1161
	s = combine1 (ps, pm);
1162
	d = *pd;
1163
 
1164
	*pd++ = core_combine_atop_u_pixel_sse2 (s, d);
1165
	w--;
1166
	ps++;
1167
	if (pm)
1168
	    pm++;
1169
    }
1170
}
1171
 
1172
static force_inline uint32_t
1173
core_combine_reverse_atop_u_pixel_sse2 (uint32_t src,
1174
                                        uint32_t dst)
1175
{
1176
    __m128i s = unpack_32_1x128 (src);
1177
    __m128i d = unpack_32_1x128 (dst);
1178
 
1179
    __m128i sa = expand_alpha_1x128 (s);
1180
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
1181
 
1182
    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &da, &d, &sa));
1183
}
1184
 
1185
static void
1186
sse2_combine_atop_reverse_u (pixman_implementation_t *imp,
1187
                             pixman_op_t              op,
1188
                             uint32_t *               pd,
1189
                             const uint32_t *         ps,
1190
                             const uint32_t *         pm,
1191
                             int                      w)
1192
{
1193
    uint32_t s, d;
1194
 
1195
    __m128i xmm_src_lo, xmm_src_hi;
1196
    __m128i xmm_dst_lo, xmm_dst_hi;
1197
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1198
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1199
 
1200
    while (w && ((uintptr_t)pd & 15))
1201
    {
1202
	s = combine1 (ps, pm);
1203
	d = *pd;
1204
 
1205
	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1206
	ps++;
1207
	w--;
1208
	if (pm)
1209
	    pm++;
1210
    }
1211
 
1212
    while (w >= 4)
1213
    {
1214
	xmm_src_hi = combine4 ((__m128i*)ps, (__m128i*)pm);
1215
	xmm_dst_hi = load_128_aligned ((__m128i*) pd);
1216
 
1217
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1218
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1219
 
1220
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1221
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1222
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1223
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1224
 
1225
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1226
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1227
 
1228
	pix_add_multiply_2x128 (
1229
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1230
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1231
	    &xmm_dst_lo, &xmm_dst_hi);
1232
 
1233
	save_128_aligned (
1234
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1235
 
1236
	ps += 4;
1237
	pd += 4;
1238
	w -= 4;
1239
	if (pm)
1240
	    pm += 4;
1241
    }
1242
 
1243
    while (w)
1244
    {
1245
	s = combine1 (ps, pm);
1246
	d = *pd;
1247
 
1248
	*pd++ = core_combine_reverse_atop_u_pixel_sse2 (s, d);
1249
	ps++;
1250
	w--;
1251
	if (pm)
1252
	    pm++;
1253
    }
1254
}
1255
 
1256
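/* XOR for one pixel:
 * result = src * (255 - dst_alpha) + dst * (255 - src_alpha).
 */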
static force_inline uint32_t
1257
core_combine_xor_u_pixel_sse2 (uint32_t src,
1258
                               uint32_t dst)
1259
{
1260
    __m128i s = unpack_32_1x128 (src);
1261
    __m128i d = unpack_32_1x128 (dst);
1262
 
1263
    __m128i neg_d = negate_1x128 (expand_alpha_1x128 (d));
1264
    __m128i neg_s = negate_1x128 (expand_alpha_1x128 (s));
1265
 
1266
    return pack_1x128_32 (pix_add_multiply_1x128 (&s, &neg_d, &d, &neg_s));
1267
}
1268
 
1269
static void
1270
sse2_combine_xor_u (pixman_implementation_t *imp,
1271
                    pixman_op_t              op,
1272
                    uint32_t *               dst,
1273
                    const uint32_t *         src,
1274
                    const uint32_t *         mask,
1275
                    int                      width)
1276
{
1277
    int w = width;
1278
    uint32_t s, d;
1279
    uint32_t* pd = dst;
1280
    const uint32_t* ps = src;
1281
    const uint32_t* pm = mask;
1282
 
1283
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
1284
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
1285
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
1286
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
1287
 
1288
    while (w && ((uintptr_t)pd & 15))
1289
    {
1290
	s = combine1 (ps, pm);
1291
	d = *pd;
1292
 
1293
	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1294
	w--;
1295
	ps++;
1296
	if (pm)
1297
	    pm++;
1298
    }
1299
 
1300
    while (w >= 4)
1301
    {
1302
	xmm_src = combine4 ((__m128i*) ps, (__m128i*) pm);
1303
	xmm_dst = load_128_aligned ((__m128i*) pd);
1304
 
1305
	unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
1306
	unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
1307
 
1308
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1309
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1310
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1311
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1312
 
1313
	negate_2x128 (xmm_alpha_src_lo, xmm_alpha_src_hi,
1314
		      &xmm_alpha_src_lo, &xmm_alpha_src_hi);
1315
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
1316
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
1317
 
1318
	pix_add_multiply_2x128 (
1319
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
1320
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_alpha_src_lo, &xmm_alpha_src_hi,
1321
	    &xmm_dst_lo, &xmm_dst_hi);
1322
 
1323
	save_128_aligned (
1324
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1325
 
1326
	ps += 4;
1327
	pd += 4;
1328
	w -= 4;
1329
	if (pm)
1330
	    pm += 4;
1331
    }
1332
 
1333
    while (w)
1334
    {
1335
	s = combine1 (ps, pm);
1336
	d = *pd;
1337
 
1338
	*pd++ = core_combine_xor_u_pixel_sse2 (s, d);
1339
	w--;
1340
	ps++;
1341
	if (pm)
1342
	    pm++;
1343
    }
1344
}
1345
 
1346
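/* ADD: saturating per-byte addition of the (optionally masked) source and
 * the destination.
 */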
static force_inline void
1347
sse2_combine_add_u (pixman_implementation_t *imp,
1348
                    pixman_op_t              op,
1349
                    uint32_t *               dst,
1350
                    const uint32_t *         src,
1351
                    const uint32_t *         mask,
1352
                    int                      width)
1353
{
1354
    int w = width;
1355
    uint32_t s, d;
1356
    uint32_t* pd = dst;
1357
    const uint32_t* ps = src;
1358
    const uint32_t* pm = mask;
1359
 
1360
    while (w && (uintptr_t)pd & 15)
1361
    {
1362
	s = combine1 (ps, pm);
1363
	d = *pd;
1364
 
1365
	ps++;
1366
	if (pm)
1367
	    pm++;
1368
	*pd++ = _mm_cvtsi128_si32 (
1369
	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1370
	w--;
1371
    }
1372
 
1373
    while (w >= 4)
1374
    {
1375
	__m128i s;
1376
 
1377
	s = combine4 ((__m128i*)ps, (__m128i*)pm);
1378
 
1379
	save_128_aligned (
1380
	    (__m128i*)pd, _mm_adds_epu8 (s, load_128_aligned  ((__m128i*)pd)));
1381
 
1382
	pd += 4;
1383
	ps += 4;
1384
	if (pm)
1385
	    pm += 4;
1386
	w -= 4;
1387
    }
1388
 
1389
    while (w--)
1390
    {
1391
	s = combine1 (ps, pm);
1392
	d = *pd;
1393
 
1394
	ps++;
1395
	*pd++ = _mm_cvtsi128_si32 (
1396
	    _mm_adds_epu8 (_mm_cvtsi32_si128 (s), _mm_cvtsi32_si128 (d)));
1397
	if (pm)
1398
	    pm++;
1399
    }
1400
}
1401
 
1402
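/* SATURATE for one pixel: if the source alpha exceeds the unused
 * destination alpha (~dst >> 24), scale the source by DIV_UN8 (da, sa)
 * first so the subsequent saturating add cannot overflow the destination.
 */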
static force_inline uint32_t
1403
core_combine_saturate_u_pixel_sse2 (uint32_t src,
1404
                                    uint32_t dst)
1405
{
1406
    __m128i ms = unpack_32_1x128 (src);
1407
    __m128i md = unpack_32_1x128 (dst);
1408
    uint32_t sa = src >> 24;
1409
    uint32_t da = ~dst >> 24;
1410
 
1411
    if (sa > da)
1412
    {
1413
	ms = pix_multiply_1x128 (
1414
	    ms, expand_alpha_1x128 (unpack_32_1x128 (DIV_UN8 (da, sa) << 24)));
1415
    }
1416
 
1417
    return pack_1x128_32 (_mm_adds_epu16 (md, ms));
1418
}
1419
 
1420
static void
1421
sse2_combine_saturate_u (pixman_implementation_t *imp,
1422
                         pixman_op_t              op,
1423
                         uint32_t *               pd,
1424
                         const uint32_t *         ps,
1425
                         const uint32_t *         pm,
1426
                         int                      w)
1427
{
1428
    uint32_t s, d;
1429
 
1430
    uint32_t pack_cmp;
1431
    __m128i xmm_src, xmm_dst;
1432
 
1433
    while (w && (uintptr_t)pd & 15)
1434
    {
1435
	s = combine1 (ps, pm);
1436
	d = *pd;
1437
 
1438
	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1439
	w--;
1440
	ps++;
1441
	if (pm)
1442
	    pm++;
1443
    }
1444
 
1445
    while (w >= 4)
1446
    {
1447
	xmm_dst = load_128_aligned  ((__m128i*)pd);
1448
	xmm_src = combine4 ((__m128i*)ps, (__m128i*)pm);
1449
 
1450
	pack_cmp = _mm_movemask_epi8 (
1451
	    _mm_cmpgt_epi32 (
1452
		_mm_srli_epi32 (xmm_src, 24),
1453
		_mm_srli_epi32 (_mm_xor_si128 (xmm_dst, mask_ff000000), 24)));
1454
 
1455
	/* if any source alpha is greater than the corresponding ~dst alpha */
1456
	if (pack_cmp)
1457
	{
1458
	    s = combine1 (ps++, pm);
1459
	    d = *pd;
1460
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1461
	    if (pm)
1462
		pm++;
1463
 
1464
	    s = combine1 (ps++, pm);
1465
	    d = *pd;
1466
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1467
	    if (pm)
1468
		pm++;
1469
 
1470
	    s = combine1 (ps++, pm);
1471
	    d = *pd;
1472
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1473
	    if (pm)
1474
		pm++;
1475
 
1476
	    s = combine1 (ps++, pm);
1477
	    d = *pd;
1478
	    *pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1479
	    if (pm)
1480
		pm++;
1481
	}
1482
	else
1483
	{
1484
	    save_128_aligned ((__m128i*)pd, _mm_adds_epu8 (xmm_dst, xmm_src));
1485
 
1486
	    pd += 4;
1487
	    ps += 4;
1488
	    if (pm)
1489
		pm += 4;
1490
	}
1491
 
1492
	w -= 4;
1493
    }
1494
 
1495
    while (w--)
1496
    {
1497
	s = combine1 (ps, pm);
1498
	d = *pd;
1499
 
1500
	*pd++ = core_combine_saturate_u_pixel_sse2 (s, d);
1501
	ps++;
1502
	if (pm)
1503
	    pm++;
1504
    }
1505
}
1506
 
1507
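/* Component-alpha (_ca) combiners: the mask supplies a separate 8-bit
 * factor per channel, so the whole mask pixel (not just its alpha)
 * multiplies the source.  SRC_ca is simply src * mask.
 */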
static void
1508
sse2_combine_src_ca (pixman_implementation_t *imp,
1509
                     pixman_op_t              op,
1510
                     uint32_t *               pd,
1511
                     const uint32_t *         ps,
1512
                     const uint32_t *         pm,
1513
                     int                      w)
1514
{
1515
    uint32_t s, m;
1516
 
1517
    __m128i xmm_src_lo, xmm_src_hi;
1518
    __m128i xmm_mask_lo, xmm_mask_hi;
1519
    __m128i xmm_dst_lo, xmm_dst_hi;
1520
 
1521
    while (w && (uintptr_t)pd & 15)
1522
    {
1523
	s = *ps++;
1524
	m = *pm++;
1525
	*pd++ = pack_1x128_32 (
1526
	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1527
	w--;
1528
    }
1529
 
1530
    while (w >= 4)
1531
    {
1532
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1533
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1534
 
1535
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1536
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1537
 
1538
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1539
			    &xmm_mask_lo, &xmm_mask_hi,
1540
			    &xmm_dst_lo, &xmm_dst_hi);
1541
 
1542
	save_128_aligned (
1543
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1544
 
1545
	ps += 4;
1546
	pd += 4;
1547
	pm += 4;
1548
	w -= 4;
1549
    }
1550
 
1551
    while (w)
1552
    {
1553
	s = *ps++;
1554
	m = *pm++;
1555
	*pd++ = pack_1x128_32 (
1556
	    pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)));
1557
	w--;
1558
    }
1559
}
1560
 
1561
static force_inline uint32_t
1562
core_combine_over_ca_pixel_sse2 (uint32_t src,
1563
                                 uint32_t mask,
1564
                                 uint32_t dst)
1565
{
1566
    __m128i s = unpack_32_1x128 (src);
1567
    __m128i expAlpha = expand_alpha_1x128 (s);
1568
    __m128i unpk_mask = unpack_32_1x128 (mask);
1569
    __m128i unpk_dst  = unpack_32_1x128 (dst);
1570
 
1571
    return pack_1x128_32 (in_over_1x128 (&s, &expAlpha, &unpk_mask, &unpk_dst));
1572
}
1573
 
1574
static void
1575
sse2_combine_over_ca (pixman_implementation_t *imp,
1576
                      pixman_op_t              op,
1577
                      uint32_t *               pd,
1578
                      const uint32_t *         ps,
1579
                      const uint32_t *         pm,
1580
                      int                      w)
1581
{
1582
    uint32_t s, m, d;
1583
 
1584
    __m128i xmm_alpha_lo, xmm_alpha_hi;
1585
    __m128i xmm_src_lo, xmm_src_hi;
1586
    __m128i xmm_dst_lo, xmm_dst_hi;
1587
    __m128i xmm_mask_lo, xmm_mask_hi;
1588
 
1589
    while (w && (uintptr_t)pd & 15)
1590
    {
1591
	s = *ps++;
1592
	m = *pm++;
1593
	d = *pd;
1594
 
1595
	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1596
	w--;
1597
    }
1598
 
1599
    while (w >= 4)
1600
    {
1601
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1602
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1603
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1604
 
1605
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1606
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1607
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1608
 
1609
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1610
			    &xmm_alpha_lo, &xmm_alpha_hi);
1611
 
1612
	in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
1613
		       &xmm_alpha_lo, &xmm_alpha_hi,
1614
		       &xmm_mask_lo, &xmm_mask_hi,
1615
		       &xmm_dst_lo, &xmm_dst_hi);
1616
 
1617
	save_128_aligned (
1618
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1619
 
1620
	ps += 4;
1621
	pd += 4;
1622
	pm += 4;
1623
	w -= 4;
1624
    }
1625
 
1626
    while (w)
1627
    {
1628
	s = *ps++;
1629
	m = *pm++;
1630
	d = *pd;
1631
 
1632
	*pd++ = core_combine_over_ca_pixel_sse2 (s, m, d);
1633
	w--;
1634
    }
1635
}
1636
 
1637
static force_inline uint32_t
1638
core_combine_over_reverse_ca_pixel_sse2 (uint32_t src,
1639
                                         uint32_t mask,
1640
                                         uint32_t dst)
1641
{
1642
    __m128i d = unpack_32_1x128 (dst);
1643
 
1644
    return pack_1x128_32 (
1645
	over_1x128 (d, expand_alpha_1x128 (d),
1646
		    pix_multiply_1x128 (unpack_32_1x128 (src),
1647
					unpack_32_1x128 (mask))));
1648
}
1649
 
1650
static void
1651
sse2_combine_over_reverse_ca (pixman_implementation_t *imp,
1652
                              pixman_op_t              op,
1653
                              uint32_t *               pd,
1654
                              const uint32_t *         ps,
1655
                              const uint32_t *         pm,
1656
                              int                      w)
1657
{
1658
    uint32_t s, m, d;
1659
 
1660
    __m128i xmm_alpha_lo, xmm_alpha_hi;
1661
    __m128i xmm_src_lo, xmm_src_hi;
1662
    __m128i xmm_dst_lo, xmm_dst_hi;
1663
    __m128i xmm_mask_lo, xmm_mask_hi;
1664
 
1665
    while (w && (uintptr_t)pd & 15)
1666
    {
1667
	s = *ps++;
1668
	m = *pm++;
1669
	d = *pd;
1670
 
1671
	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1672
	w--;
1673
    }
1674
 
1675
    while (w >= 4)
1676
    {
1677
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1678
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1679
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1680
 
1681
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1682
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1683
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1684
 
1685
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1686
			    &xmm_alpha_lo, &xmm_alpha_hi);
1687
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1688
			    &xmm_mask_lo, &xmm_mask_hi,
1689
			    &xmm_mask_lo, &xmm_mask_hi);
1690
 
1691
	over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1692
		    &xmm_alpha_lo, &xmm_alpha_hi,
1693
		    &xmm_mask_lo, &xmm_mask_hi);
1694
 
1695
	save_128_aligned (
1696
	    (__m128i*)pd, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
1697
 
1698
	ps += 4;
1699
	pd += 4;
1700
	pm += 4;
1701
	w -= 4;
1702
    }
1703
 
1704
    while (w)
1705
    {
1706
	s = *ps++;
1707
	m = *pm++;
1708
	d = *pd;
1709
 
1710
	*pd++ = core_combine_over_reverse_ca_pixel_sse2 (s, m, d);
1711
	w--;
1712
    }
1713
}
1714
 
1715
static void
1716
sse2_combine_in_ca (pixman_implementation_t *imp,
1717
                    pixman_op_t              op,
1718
                    uint32_t *               pd,
1719
                    const uint32_t *         ps,
1720
                    const uint32_t *         pm,
1721
                    int                      w)
1722
{
1723
    uint32_t s, m, d;
1724
 
1725
    __m128i xmm_alpha_lo, xmm_alpha_hi;
1726
    __m128i xmm_src_lo, xmm_src_hi;
1727
    __m128i xmm_dst_lo, xmm_dst_hi;
1728
    __m128i xmm_mask_lo, xmm_mask_hi;
1729
 
1730
    while (w && (uintptr_t)pd & 15)
1731
    {
1732
	s = *ps++;
1733
	m = *pm++;
1734
	d = *pd;
1735
 
1736
	*pd++ = pack_1x128_32 (
1737
	    pix_multiply_1x128 (
1738
		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (m)),
1739
		expand_alpha_1x128 (unpack_32_1x128 (d))));
1740
 
1741
	w--;
1742
    }
1743
 
1744
    while (w >= 4)
1745
    {
1746
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1747
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1748
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1749
 
1750
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1751
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1752
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1753
 
1754
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1755
			    &xmm_alpha_lo, &xmm_alpha_hi);
1756
 
1757
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1758
			    &xmm_mask_lo, &xmm_mask_hi,
1759
			    &xmm_dst_lo, &xmm_dst_hi);
1760
 
1761
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1762
			    &xmm_alpha_lo, &xmm_alpha_hi,
1763
			    &xmm_dst_lo, &xmm_dst_hi);
1764
 
1765
	save_128_aligned (
1766
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1767
 
1768
	ps += 4;
1769
	pd += 4;
1770
	pm += 4;
1771
	w -= 4;
1772
    }
1773
 
1774
    while (w)
1775
    {
1776
	s = *ps++;
1777
	m = *pm++;
1778
	d = *pd;
1779
 
1780
	*pd++ = pack_1x128_32 (
1781
	    pix_multiply_1x128 (
1782
		pix_multiply_1x128 (
1783
		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
1784
		expand_alpha_1x128 (unpack_32_1x128 (d))));
1785
 
1786
	w--;
1787
    }
1788
}
1789
 
1790
static void
1791
sse2_combine_in_reverse_ca (pixman_implementation_t *imp,
1792
                            pixman_op_t              op,
1793
                            uint32_t *               pd,
1794
                            const uint32_t *         ps,
1795
                            const uint32_t *         pm,
1796
                            int                      w)
1797
{
1798
    uint32_t s, m, d;
1799
 
1800
    __m128i xmm_alpha_lo, xmm_alpha_hi;
1801
    __m128i xmm_src_lo, xmm_src_hi;
1802
    __m128i xmm_dst_lo, xmm_dst_hi;
1803
    __m128i xmm_mask_lo, xmm_mask_hi;
1804
 
1805
    while (w && (uintptr_t)pd & 15)
1806
    {
1807
	s = *ps++;
1808
	m = *pm++;
1809
	d = *pd;
1810
 
1811
	*pd++ = pack_1x128_32 (
1812
	    pix_multiply_1x128 (
1813
		unpack_32_1x128 (d),
1814
		pix_multiply_1x128 (unpack_32_1x128 (m),
1815
				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
1816
	w--;
1817
    }
1818
 
1819
    while (w >= 4)
1820
    {
1821
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1822
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1823
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1824
 
1825
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1826
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1827
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1828
 
1829
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1830
			    &xmm_alpha_lo, &xmm_alpha_hi);
1831
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1832
			    &xmm_alpha_lo, &xmm_alpha_hi,
1833
			    &xmm_alpha_lo, &xmm_alpha_hi);
1834
 
1835
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1836
			    &xmm_alpha_lo, &xmm_alpha_hi,
1837
			    &xmm_dst_lo, &xmm_dst_hi);
1838
 
1839
	save_128_aligned (
1840
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1841
 
1842
	ps += 4;
1843
	pd += 4;
1844
	pm += 4;
1845
	w -= 4;
1846
    }
1847
 
1848
    while (w)
1849
    {
1850
	s = *ps++;
1851
	m = *pm++;
1852
	d = *pd;
1853
 
1854
	*pd++ = pack_1x128_32 (
1855
	    pix_multiply_1x128 (
1856
		unpack_32_1x128 (d),
1857
		pix_multiply_1x128 (unpack_32_1x128 (m),
1858
				   expand_alpha_1x128 (unpack_32_1x128 (s)))));
1859
	w--;
1860
    }
1861
}
1862
 
1863
static void
1864
sse2_combine_out_ca (pixman_implementation_t *imp,
1865
                     pixman_op_t              op,
1866
                     uint32_t *               pd,
1867
                     const uint32_t *         ps,
1868
                     const uint32_t *         pm,
1869
                     int                      w)
1870
{
1871
    uint32_t s, m, d;
1872
 
1873
    __m128i xmm_alpha_lo, xmm_alpha_hi;
1874
    __m128i xmm_src_lo, xmm_src_hi;
1875
    __m128i xmm_dst_lo, xmm_dst_hi;
1876
    __m128i xmm_mask_lo, xmm_mask_hi;
1877
 
1878
    while (w && (uintptr_t)pd & 15)
1879
    {
1880
	s = *ps++;
1881
	m = *pm++;
1882
	d = *pd;
1883
 
1884
	*pd++ = pack_1x128_32 (
1885
	    pix_multiply_1x128 (
1886
		pix_multiply_1x128 (
1887
		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
1888
		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1889
	w--;
1890
    }
1891
 
1892
    while (w >= 4)
1893
    {
1894
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1895
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1896
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1897
 
1898
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1899
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1900
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1901
 
1902
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
1903
			    &xmm_alpha_lo, &xmm_alpha_hi);
1904
	negate_2x128 (xmm_alpha_lo, xmm_alpha_hi,
1905
		      &xmm_alpha_lo, &xmm_alpha_hi);
1906
 
1907
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
1908
			    &xmm_mask_lo, &xmm_mask_hi,
1909
			    &xmm_dst_lo, &xmm_dst_hi);
1910
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1911
			    &xmm_alpha_lo, &xmm_alpha_hi,
1912
			    &xmm_dst_lo, &xmm_dst_hi);
1913
 
1914
	save_128_aligned (
1915
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1916
 
1917
	ps += 4;
1918
	pd += 4;
1919
	pm += 4;
1920
	w -= 4;
1921
    }
1922
 
1923
    while (w)
1924
    {
1925
	s = *ps++;
1926
	m = *pm++;
1927
	d = *pd;
1928
 
1929
	*pd++ = pack_1x128_32 (
1930
	    pix_multiply_1x128 (
1931
		pix_multiply_1x128 (
1932
		    unpack_32_1x128 (s), unpack_32_1x128 (m)),
1933
		negate_1x128 (expand_alpha_1x128 (unpack_32_1x128 (d)))));
1934
 
1935
	w--;
1936
    }
1937
}
1938
 
1939
static void
1940
sse2_combine_out_reverse_ca (pixman_implementation_t *imp,
1941
                             pixman_op_t              op,
1942
                             uint32_t *               pd,
1943
                             const uint32_t *         ps,
1944
                             const uint32_t *         pm,
1945
                             int                      w)
1946
{
1947
    uint32_t s, m, d;
1948
 
1949
    __m128i xmm_alpha_lo, xmm_alpha_hi;
1950
    __m128i xmm_src_lo, xmm_src_hi;
1951
    __m128i xmm_dst_lo, xmm_dst_hi;
1952
    __m128i xmm_mask_lo, xmm_mask_hi;
1953
 
1954
    while (w && (uintptr_t)pd & 15)
1955
    {
1956
	s = *ps++;
1957
	m = *pm++;
1958
	d = *pd;
1959
 
1960
	*pd++ = pack_1x128_32 (
1961
	    pix_multiply_1x128 (
1962
		unpack_32_1x128 (d),
1963
		negate_1x128 (pix_multiply_1x128 (
1964
				 unpack_32_1x128 (m),
1965
				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
1966
	w--;
1967
    }
1968
 
1969
    while (w >= 4)
1970
    {
1971
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
1972
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
1973
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
1974
 
1975
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
1976
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
1977
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
1978
 
1979
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
1980
			    &xmm_alpha_lo, &xmm_alpha_hi);
1981
 
1982
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
1983
			    &xmm_alpha_lo, &xmm_alpha_hi,
1984
			    &xmm_mask_lo, &xmm_mask_hi);
1985
 
1986
	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
1987
		      &xmm_mask_lo, &xmm_mask_hi);
1988
 
1989
	pix_multiply_2x128 (&xmm_dst_lo, &xmm_dst_hi,
1990
			    &xmm_mask_lo, &xmm_mask_hi,
1991
			    &xmm_dst_lo, &xmm_dst_hi);
1992
 
1993
	save_128_aligned (
1994
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
1995
 
1996
	ps += 4;
1997
	pd += 4;
1998
	pm += 4;
1999
	w -= 4;
2000
    }
2001
 
2002
    while (w)
2003
    {
2004
	s = *ps++;
2005
	m = *pm++;
2006
	d = *pd;
2007
 
2008
	*pd++ = pack_1x128_32 (
2009
	    pix_multiply_1x128 (
2010
		unpack_32_1x128 (d),
2011
		negate_1x128 (pix_multiply_1x128 (
2012
				 unpack_32_1x128 (m),
2013
				 expand_alpha_1x128 (unpack_32_1x128 (s))))));
2014
	w--;
2015
    }
2016
}
2017
 
2018
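/*
 * Component-alpha ATOP for a single pixel:
 * dest = src * mask * dest_alpha + dest * (1 - mask * src_alpha).
 */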
static force_inline uint32_t
2019
core_combine_atop_ca_pixel_sse2 (uint32_t src,
2020
                                 uint32_t mask,
2021
                                 uint32_t dst)
2022
{
2023
    __m128i m = unpack_32_1x128 (mask);
2024
    __m128i s = unpack_32_1x128 (src);
2025
    __m128i d = unpack_32_1x128 (dst);
2026
    __m128i sa = expand_alpha_1x128 (s);
2027
    __m128i da = expand_alpha_1x128 (d);
2028
 
2029
    s = pix_multiply_1x128 (s, m);
2030
    m = negate_1x128 (pix_multiply_1x128 (m, sa));
2031
 
2032
    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2033
}
2034
 
2035
static void
2036
sse2_combine_atop_ca (pixman_implementation_t *imp,
2037
                      pixman_op_t              op,
2038
                      uint32_t *               pd,
2039
                      const uint32_t *         ps,
2040
                      const uint32_t *         pm,
2041
                      int                      w)
2042
{
2043
    uint32_t s, m, d;
2044
 
2045
    __m128i xmm_src_lo, xmm_src_hi;
2046
    __m128i xmm_dst_lo, xmm_dst_hi;
2047
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2048
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2049
    __m128i xmm_mask_lo, xmm_mask_hi;
2050
 
2051
    while (w && (uintptr_t)pd & 15)
2052
    {
2053
	s = *ps++;
2054
	m = *pm++;
2055
	d = *pd;
2056
 
2057
	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2058
	w--;
2059
    }
2060
 
2061
    while (w >= 4)
2062
    {
2063
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2064
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2065
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2066
 
2067
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2068
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2069
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2070
 
2071
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2072
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2073
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2074
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2075
 
2076
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2077
			    &xmm_mask_lo, &xmm_mask_hi,
2078
			    &xmm_src_lo, &xmm_src_hi);
2079
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2080
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2081
			    &xmm_mask_lo, &xmm_mask_hi);
2082
 
2083
	negate_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2084
 
2085
	pix_add_multiply_2x128 (
2086
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2087
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2088
	    &xmm_dst_lo, &xmm_dst_hi);
2089
 
2090
	save_128_aligned (
2091
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2092
 
2093
	ps += 4;
2094
	pd += 4;
2095
	pm += 4;
2096
	w -= 4;
2097
    }
2098
 
2099
    while (w)
2100
    {
2101
	s = *ps++;
2102
	m = *pm++;
2103
	d = *pd;
2104
 
2105
	*pd++ = core_combine_atop_ca_pixel_sse2 (s, m, d);
2106
	w--;
2107
    }
2108
}
2109
 
2110
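/*
 * Component-alpha ATOP_REVERSE for a single pixel:
 * dest = src * mask * (1 - dest_alpha) + dest * (mask * src_alpha).
 */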
static force_inline uint32_t
2111
core_combine_reverse_atop_ca_pixel_sse2 (uint32_t src,
2112
                                         uint32_t mask,
2113
                                         uint32_t dst)
2114
{
2115
    __m128i m = unpack_32_1x128 (mask);
2116
    __m128i s = unpack_32_1x128 (src);
2117
    __m128i d = unpack_32_1x128 (dst);
2118
 
2119
    __m128i da = negate_1x128 (expand_alpha_1x128 (d));
2120
    __m128i sa = expand_alpha_1x128 (s);
2121
 
2122
    s = pix_multiply_1x128 (s, m);
2123
    m = pix_multiply_1x128 (m, sa);
2124
 
2125
    return pack_1x128_32 (pix_add_multiply_1x128 (&d, &m, &s, &da));
2126
}
2127
 
2128
static void
2129
sse2_combine_atop_reverse_ca (pixman_implementation_t *imp,
2130
                              pixman_op_t              op,
2131
                              uint32_t *               pd,
2132
                              const uint32_t *         ps,
2133
                              const uint32_t *         pm,
2134
                              int                      w)
2135
{
2136
    uint32_t s, m, d;
2137
 
2138
    __m128i xmm_src_lo, xmm_src_hi;
2139
    __m128i xmm_dst_lo, xmm_dst_hi;
2140
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2141
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2142
    __m128i xmm_mask_lo, xmm_mask_hi;
2143
 
2144
    while (w && (uintptr_t)pd & 15)
2145
    {
2146
	s = *ps++;
2147
	m = *pm++;
2148
	d = *pd;
2149
 
2150
	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2151
	w--;
2152
    }
2153
 
2154
    while (w >= 4)
2155
    {
2156
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2157
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2158
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2159
 
2160
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2161
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2162
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2163
 
2164
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2165
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2166
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2167
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2168
 
2169
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2170
			    &xmm_mask_lo, &xmm_mask_hi,
2171
			    &xmm_src_lo, &xmm_src_hi);
2172
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2173
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2174
			    &xmm_mask_lo, &xmm_mask_hi);
2175
 
2176
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2177
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2178
 
2179
	pix_add_multiply_2x128 (
2180
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2181
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2182
	    &xmm_dst_lo, &xmm_dst_hi);
2183
 
2184
	save_128_aligned (
2185
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2186
 
2187
	ps += 4;
2188
	pd += 4;
2189
	pm += 4;
2190
	w -= 4;
2191
    }
2192
 
2193
    while (w)
2194
    {
2195
	s = *ps++;
2196
	m = *pm++;
2197
	d = *pd;
2198
 
2199
	*pd++ = core_combine_reverse_atop_ca_pixel_sse2 (s, m, d);
2200
	w--;
2201
    }
2202
}
2203
 
2204
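/*
 * Component-alpha XOR for a single pixel:
 * dest = src * mask * (1 - dest_alpha) + dest * (1 - mask * src_alpha).
 */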
static force_inline uint32_t
2205
core_combine_xor_ca_pixel_sse2 (uint32_t src,
2206
                                uint32_t mask,
2207
                                uint32_t dst)
2208
{
2209
    __m128i a = unpack_32_1x128 (mask);
2210
    __m128i s = unpack_32_1x128 (src);
2211
    __m128i d = unpack_32_1x128 (dst);
2212
 
2213
    __m128i alpha_dst = negate_1x128 (pix_multiply_1x128 (
2214
				       a, expand_alpha_1x128 (s)));
2215
    __m128i dest      = pix_multiply_1x128 (s, a);
2216
    __m128i alpha_src = negate_1x128 (expand_alpha_1x128 (d));
2217
 
2218
    return pack_1x128_32 (pix_add_multiply_1x128 (&d,
2219
                                                &alpha_dst,
2220
                                                &dest,
2221
                                                &alpha_src));
2222
}
2223
 
2224
static void
2225
sse2_combine_xor_ca (pixman_implementation_t *imp,
2226
                     pixman_op_t              op,
2227
                     uint32_t *               pd,
2228
                     const uint32_t *         ps,
2229
                     const uint32_t *         pm,
2230
                     int                      w)
2231
{
2232
    uint32_t s, m, d;
2233
 
2234
    __m128i xmm_src_lo, xmm_src_hi;
2235
    __m128i xmm_dst_lo, xmm_dst_hi;
2236
    __m128i xmm_alpha_src_lo, xmm_alpha_src_hi;
2237
    __m128i xmm_alpha_dst_lo, xmm_alpha_dst_hi;
2238
    __m128i xmm_mask_lo, xmm_mask_hi;
2239
 
2240
    while (w && (uintptr_t)pd & 15)
2241
    {
2242
	s = *ps++;
2243
	m = *pm++;
2244
	d = *pd;
2245
 
2246
	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2247
	w--;
2248
    }
2249
 
2250
    while (w >= 4)
2251
    {
2252
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2253
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2254
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2255
 
2256
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2257
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2258
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2259
 
2260
	expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2261
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi);
2262
	expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi,
2263
			    &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2264
 
2265
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2266
			    &xmm_mask_lo, &xmm_mask_hi,
2267
			    &xmm_src_lo, &xmm_src_hi);
2268
	pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
2269
			    &xmm_alpha_src_lo, &xmm_alpha_src_hi,
2270
			    &xmm_mask_lo, &xmm_mask_hi);
2271
 
2272
	negate_2x128 (xmm_alpha_dst_lo, xmm_alpha_dst_hi,
2273
		      &xmm_alpha_dst_lo, &xmm_alpha_dst_hi);
2274
	negate_2x128 (xmm_mask_lo, xmm_mask_hi,
2275
		      &xmm_mask_lo, &xmm_mask_hi);
2276
 
2277
	pix_add_multiply_2x128 (
2278
	    &xmm_dst_lo, &xmm_dst_hi, &xmm_mask_lo, &xmm_mask_hi,
2279
	    &xmm_src_lo, &xmm_src_hi, &xmm_alpha_dst_lo, &xmm_alpha_dst_hi,
2280
	    &xmm_dst_lo, &xmm_dst_hi);
2281
 
2282
	save_128_aligned (
2283
	    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2284
 
2285
	ps += 4;
2286
	pd += 4;
2287
	pm += 4;
2288
	w -= 4;
2289
    }
2290
 
2291
    while (w)
2292
    {
2293
	s = *ps++;
2294
	m = *pm++;
2295
	d = *pd;
2296
 
2297
	*pd++ = core_combine_xor_ca_pixel_sse2 (s, m, d);
2298
	w--;
2299
    }
2300
}
2301
 
2302
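/*
 * Component-alpha ADD: dest = clamp (src * mask + dest), using the
 * saturating byte add so each channel is capped at 0xff.
 */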
static void
2303
sse2_combine_add_ca (pixman_implementation_t *imp,
2304
                     pixman_op_t              op,
2305
                     uint32_t *               pd,
2306
                     const uint32_t *         ps,
2307
                     const uint32_t *         pm,
2308
                     int                      w)
2309
{
2310
    uint32_t s, m, d;
2311
 
2312
    __m128i xmm_src_lo, xmm_src_hi;
2313
    __m128i xmm_dst_lo, xmm_dst_hi;
2314
    __m128i xmm_mask_lo, xmm_mask_hi;
2315
 
2316
    while (w && (uintptr_t)pd & 15)
2317
    {
2318
	s = *ps++;
2319
	m = *pm++;
2320
	d = *pd;
2321
 
2322
	*pd++ = pack_1x128_32 (
2323
	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2324
					       unpack_32_1x128 (m)),
2325
			   unpack_32_1x128 (d)));
2326
	w--;
2327
    }
2328
 
2329
    while (w >= 4)
2330
    {
2331
	xmm_src_hi = load_128_unaligned ((__m128i*)ps);
2332
	xmm_mask_hi = load_128_unaligned ((__m128i*)pm);
2333
	xmm_dst_hi = load_128_aligned ((__m128i*)pd);
2334
 
2335
	unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
2336
	unpack_128_2x128 (xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
2337
	unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
2338
 
2339
	pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
2340
			    &xmm_mask_lo, &xmm_mask_hi,
2341
			    &xmm_src_lo, &xmm_src_hi);
2342
 
2343
	save_128_aligned (
2344
	    (__m128i*)pd, pack_2x128_128 (
2345
		_mm_adds_epu8 (xmm_src_lo, xmm_dst_lo),
2346
		_mm_adds_epu8 (xmm_src_hi, xmm_dst_hi)));
2347
 
2348
	ps += 4;
2349
	pd += 4;
2350
	pm += 4;
2351
	w -= 4;
2352
    }
2353
 
2354
    while (w)
2355
    {
2356
	s = *ps++;
2357
	m = *pm++;
2358
	d = *pd;
2359
 
2360
	*pd++ = pack_1x128_32 (
2361
	    _mm_adds_epu8 (pix_multiply_1x128 (unpack_32_1x128 (s),
2362
					       unpack_32_1x128 (m)),
2363
			   unpack_32_1x128 (d)));
2364
	w--;
2365
    }
2366
}
2367
 
2368
static force_inline __m128i
2369
create_mask_16_128 (uint16_t mask)
2370
{
2371
    return _mm_set1_epi16 (mask);
2372
}
2373
 
2374
/* Work around a code generation bug in Sun Studio 12. */
2375
#if defined(__SUNPRO_C) && (__SUNPRO_C >= 0x590)
2376
# define create_mask_2x32_128(mask0, mask1)				\
2377
    (_mm_set_epi32 ((mask0), (mask1), (mask0), (mask1)))
2378
#else
2379
static force_inline __m128i
2380
create_mask_2x32_128 (uint32_t mask0,
2381
                      uint32_t mask1)
2382
{
2383
    return _mm_set_epi32 (mask0, mask1, mask0, mask1);
2384
}
2385
#endif
2386
 
2387
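/*
 * Solid source OVER an a8r8g8b8 destination.  The source pixel and its
 * expanded alpha are computed once outside the loops, so each iteration
 * only unpacks the destination and applies dest = src + dest * (1 - srca).
 */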
static void
2388
sse2_composite_over_n_8888 (pixman_implementation_t *imp,
2389
                            pixman_composite_info_t *info)
2390
{
2391
    PIXMAN_COMPOSITE_ARGS (info);
2392
    uint32_t src;
2393
    uint32_t    *dst_line, *dst, d;
2394
    int32_t w;
2395
    int dst_stride;
2396
    __m128i xmm_src, xmm_alpha;
2397
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2398
 
2399
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2400
 
2401
    if (src == 0)
2402
	return;
2403
 
2404
    PIXMAN_IMAGE_GET_LINE (
2405
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2406
 
2407
    xmm_src = expand_pixel_32_1x128 (src);
2408
    xmm_alpha = expand_alpha_1x128 (xmm_src);
2409
 
2410
    while (height--)
2411
    {
2412
	dst = dst_line;
2413
 
2414
	dst_line += dst_stride;
2415
	w = width;
2416
 
2417
	while (w && (uintptr_t)dst & 15)
2418
	{
2419
	    d = *dst;
2420
	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2421
						xmm_alpha,
2422
						unpack_32_1x128 (d)));
2423
	    w--;
2424
	}
2425
 
2426
	while (w >= 4)
2427
	{
2428
	    xmm_dst = load_128_aligned ((__m128i*)dst);
2429
 
2430
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2431
 
2432
	    over_2x128 (&xmm_src, &xmm_src,
2433
			&xmm_alpha, &xmm_alpha,
2434
			&xmm_dst_lo, &xmm_dst_hi);
2435
 
2436
	    /* rebuild the 4 pixel data and save */
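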
2437
	    save_128_aligned (
2438
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2439
 
2440
	    w -= 4;
2441
	    dst += 4;
2442
	}
2443
 
2444
	while (w)
2445
	{
2446
	    d = *dst;
2447
	    *dst++ = pack_1x128_32 (over_1x128 (xmm_src,
2448
						xmm_alpha,
2449
						unpack_32_1x128 (d)));
2450
	    w--;
2451
	}
2452
 
2453
    }
2454
}
2455
 
2456
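/*
 * Solid source OVER an r5g6b5 destination: the same OVER as above, but the
 * destination is expanded from 565 to 8888, blended, and packed back,
 * eight pixels per SSE2 iteration.
 */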
static void
2457
sse2_composite_over_n_0565 (pixman_implementation_t *imp,
2458
                            pixman_composite_info_t *info)
2459
{
2460
    PIXMAN_COMPOSITE_ARGS (info);
2461
    uint32_t src;
2462
    uint16_t    *dst_line, *dst, d;
2463
    int32_t w;
2464
    int dst_stride;
2465
    __m128i xmm_src, xmm_alpha;
2466
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
2467
 
2468
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2469
 
2470
    if (src == 0)
2471
	return;
2472
 
2473
    PIXMAN_IMAGE_GET_LINE (
2474
	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2475
 
2476
    xmm_src = expand_pixel_32_1x128 (src);
2477
    xmm_alpha = expand_alpha_1x128 (xmm_src);
2478
 
2479
    while (height--)
2480
    {
2481
	dst = dst_line;
2482
 
2483
	dst_line += dst_stride;
2484
	w = width;
2485
 
2486
	while (w && (uintptr_t)dst & 15)
2487
	{
2488
	    d = *dst;
2489
 
2490
	    *dst++ = pack_565_32_16 (
2491
		pack_1x128_32 (over_1x128 (xmm_src,
2492
					   xmm_alpha,
2493
					   expand565_16_1x128 (d))));
2494
	    w--;
2495
	}
2496
 
2497
	while (w >= 8)
2498
	{
2499
	    xmm_dst = load_128_aligned ((__m128i*)dst);
2500
 
2501
	    unpack_565_128_4x128 (xmm_dst,
2502
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2503
 
2504
	    over_2x128 (&xmm_src, &xmm_src,
2505
			&xmm_alpha, &xmm_alpha,
2506
			&xmm_dst0, &xmm_dst1);
2507
	    over_2x128 (&xmm_src, &xmm_src,
2508
			&xmm_alpha, &xmm_alpha,
2509
			&xmm_dst2, &xmm_dst3);
2510
 
2511
	    xmm_dst = pack_565_4x128_128 (
2512
		&xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
2513
 
2514
	    save_128_aligned ((__m128i*)dst, xmm_dst);
2515
 
2516
	    dst += 8;
2517
	    w -= 8;
2518
	}
2519
 
2520
	while (w--)
2521
	{
2522
	    d = *dst;
2523
	    *dst++ = pack_565_32_16 (
2524
		pack_1x128_32 (over_1x128 (xmm_src, xmm_alpha,
2525
					   expand565_16_1x128 (d))));
2526
	}
2527
    }
2528
 
2529
}
2530
 
2531
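/*
 * Solid source with a component-alpha a8r8g8b8 mask, ADD to an a8r8g8b8
 * destination: dest = clamp (src * mask + dest).  Four-pixel blocks whose
 * mask is entirely zero are skipped without touching the destination.
 */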
static void
2532
sse2_composite_add_n_8888_8888_ca (pixman_implementation_t *imp,
2533
				   pixman_composite_info_t *info)
2534
{
2535
    PIXMAN_COMPOSITE_ARGS (info);
2536
    uint32_t src;
2537
    uint32_t    *dst_line, d;
2538
    uint32_t    *mask_line, m;
2539
    uint32_t pack_cmp;
2540
    int dst_stride, mask_stride;
2541
 
2542
    __m128i xmm_src;
2543
    __m128i xmm_dst;
2544
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2545
 
2546
    __m128i mmx_src, mmx_mask, mmx_dest;
2547
 
2548
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2549
 
2550
    if (src == 0)
2551
	return;
2552
 
2553
    PIXMAN_IMAGE_GET_LINE (
2554
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2555
    PIXMAN_IMAGE_GET_LINE (
2556
	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2557
 
2558
    xmm_src = _mm_unpacklo_epi8 (
2559
	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2560
    mmx_src   = xmm_src;
2561
 
2562
    while (height--)
2563
    {
2564
	int w = width;
2565
	const uint32_t *pm = (uint32_t *)mask_line;
2566
	uint32_t *pd = (uint32_t *)dst_line;
2567
 
2568
	dst_line += dst_stride;
2569
	mask_line += mask_stride;
2570
 
2571
	while (w && (uintptr_t)pd & 15)
2572
	{
2573
	    m = *pm++;
2574
 
2575
	    if (m)
2576
	    {
2577
		d = *pd;
2578
 
2579
		mmx_mask = unpack_32_1x128 (m);
2580
		mmx_dest = unpack_32_1x128 (d);
2581
 
2582
		*pd = pack_1x128_32 (
2583
		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2584
				   mmx_dest));
2585
	    }
2586
 
2587
	    pd++;
2588
	    w--;
2589
	}
2590
 
2591
	while (w >= 4)
2592
	{
2593
	    xmm_mask = load_128_unaligned ((__m128i*)pm);
2594
 
2595
	    pack_cmp =
2596
		_mm_movemask_epi8 (
2597
		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2598
 
2599
	    /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2600
	    if (pack_cmp != 0xffff)
2601
	    {
2602
		xmm_dst = load_128_aligned ((__m128i*)pd);
2603
 
2604
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2605
 
2606
		pix_multiply_2x128 (&xmm_src, &xmm_src,
2607
				    &xmm_mask_lo, &xmm_mask_hi,
2608
				    &xmm_mask_lo, &xmm_mask_hi);
2609
		xmm_mask_hi = pack_2x128_128 (xmm_mask_lo, xmm_mask_hi);
2610
 
2611
		save_128_aligned (
2612
		    (__m128i*)pd, _mm_adds_epu8 (xmm_mask_hi, xmm_dst));
2613
	    }
2614
 
2615
	    pd += 4;
2616
	    pm += 4;
2617
	    w -= 4;
2618
	}
2619
 
2620
	while (w)
2621
	{
2622
	    m = *pm++;
2623
 
2624
	    if (m)
2625
	    {
2626
		d = *pd;
2627
 
2628
		mmx_mask = unpack_32_1x128 (m);
2629
		mmx_dest = unpack_32_1x128 (d);
2630
 
2631
		*pd = pack_1x128_32 (
2632
		    _mm_adds_epu8 (pix_multiply_1x128 (mmx_mask, mmx_src),
2633
				   mmx_dest));
2634
	    }
2635
 
2636
	    pd++;
2637
	    w--;
2638
	}
2639
    }
2640
 
2641
}
2642
 
2643
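/*
 * Solid source with a component-alpha a8r8g8b8 mask, OVER an a8r8g8b8
 * destination: dest = src * mask + dest * (1 - mask * srca), again
 * skipping four-pixel blocks whose mask is entirely zero.
 */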
static void
2644
sse2_composite_over_n_8888_8888_ca (pixman_implementation_t *imp,
2645
                                    pixman_composite_info_t *info)
2646
{
2647
    PIXMAN_COMPOSITE_ARGS (info);
2648
    uint32_t src;
2649
    uint32_t    *dst_line, d;
2650
    uint32_t    *mask_line, m;
2651
    uint32_t pack_cmp;
2652
    int dst_stride, mask_stride;
2653
 
2654
    __m128i xmm_src, xmm_alpha;
2655
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2656
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
2657
 
2658
    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
2659
 
2660
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
2661
 
2662
    if (src == 0)
2663
	return;
2664
 
2665
    PIXMAN_IMAGE_GET_LINE (
2666
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2667
    PIXMAN_IMAGE_GET_LINE (
2668
	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
2669
 
2670
    xmm_src = _mm_unpacklo_epi8 (
2671
	create_mask_2x32_128 (src, src), _mm_setzero_si128 ());
2672
    xmm_alpha = expand_alpha_1x128 (xmm_src);
2673
    mmx_src   = xmm_src;
2674
    mmx_alpha = xmm_alpha;
2675
 
2676
    while (height--)
2677
    {
2678
	int w = width;
2679
	const uint32_t *pm = (uint32_t *)mask_line;
2680
	uint32_t *pd = (uint32_t *)dst_line;
2681
 
2682
	dst_line += dst_stride;
2683
	mask_line += mask_stride;
2684
 
2685
	while (w && (uintptr_t)pd & 15)
2686
	{
2687
	    m = *pm++;
2688
 
2689
	    if (m)
2690
	    {
2691
		d = *pd;
2692
		mmx_mask = unpack_32_1x128 (m);
2693
		mmx_dest = unpack_32_1x128 (d);
2694
 
2695
		*pd = pack_1x128_32 (in_over_1x128 (&mmx_src,
2696
		                                  &mmx_alpha,
2697
		                                  &mmx_mask,
2698
		                                  &mmx_dest));
2699
	    }
2700
 
2701
	    pd++;
2702
	    w--;
2703
	}
2704
 
2705
	while (w >= 4)
2706
	{
2707
	    xmm_mask = load_128_unaligned ((__m128i*)pm);
2708
 
2709
	    pack_cmp =
2710
		_mm_movemask_epi8 (
2711
		    _mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
2712
 
2713
	    /* if all bits in mask are zero, pack_cmp is equal to 0xffff */
2714
	    if (pack_cmp != 0xffff)
2715
	    {
2716
		xmm_dst = load_128_aligned ((__m128i*)pd);
2717
 
2718
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
2719
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2720
 
2721
		in_over_2x128 (&xmm_src, &xmm_src,
2722
			       &xmm_alpha, &xmm_alpha,
2723
			       &xmm_mask_lo, &xmm_mask_hi,
2724
			       &xmm_dst_lo, &xmm_dst_hi);
2725
 
2726
		save_128_aligned (
2727
		    (__m128i*)pd, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2728
	    }
2729
 
2730
	    pd += 4;
2731
	    pm += 4;
2732
	    w -= 4;
2733
	}
2734
 
2735
	while (w)
2736
	{
2737
	    m = *pm++;
2738
 
2739
	    if (m)
2740
	    {
2741
		d = *pd;
2742
		mmx_mask = unpack_32_1x128 (m);
2743
		mmx_dest = unpack_32_1x128 (d);
2744
 
2745
		*pd = pack_1x128_32 (
2746
		    in_over_1x128 (&mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest));
2747
	    }
2748
 
2749
	    pd++;
2750
	    w--;
2751
	}
2752
    }
2753
 
2754
}
2755
 
2756
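/*
 * a8r8g8b8 source with a constant a8 mask (taken from the top byte of the
 * solid mask pixel) OVER an a8r8g8b8 destination.  Four-pixel source
 * blocks that are entirely zero are skipped.
 */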
static void
2757
sse2_composite_over_8888_n_8888 (pixman_implementation_t *imp,
2758
                                 pixman_composite_info_t *info)
2759
{
2760
    PIXMAN_COMPOSITE_ARGS (info);
2761
    uint32_t    *dst_line, *dst;
2762
    uint32_t    *src_line, *src;
2763
    uint32_t mask;
2764
    int32_t w;
2765
    int dst_stride, src_stride;
2766
 
2767
    __m128i xmm_mask;
2768
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2769
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2770
    __m128i xmm_alpha_lo, xmm_alpha_hi;
2771
 
2772
    PIXMAN_IMAGE_GET_LINE (
2773
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2774
    PIXMAN_IMAGE_GET_LINE (
2775
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2776
 
2777
    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2778
 
2779
    xmm_mask = create_mask_16_128 (mask >> 24);
2780
 
2781
    while (height--)
2782
    {
2783
	dst = dst_line;
2784
	dst_line += dst_stride;
2785
	src = src_line;
2786
	src_line += src_stride;
2787
	w = width;
2788
 
2789
	while (w && (uintptr_t)dst & 15)
2790
	{
2791
	    uint32_t s = *src++;
2792
 
2793
	    if (s)
2794
	    {
2795
		uint32_t d = *dst;
2796
 
2797
		__m128i ms = unpack_32_1x128 (s);
2798
		__m128i alpha    = expand_alpha_1x128 (ms);
2799
		__m128i dest     = xmm_mask;
2800
		__m128i alpha_dst = unpack_32_1x128 (d);
2801
 
2802
		*dst = pack_1x128_32 (
2803
		    in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
2804
	    }
2805
	    dst++;
2806
	    w--;
2807
	}
2808
 
2809
	while (w >= 4)
2810
	{
2811
	    xmm_src = load_128_unaligned ((__m128i*)src);
2812
 
2813
	    if (!is_zero (xmm_src))
2814
	    {
2815
		xmm_dst = load_128_aligned ((__m128i*)dst);
2816
 
2817
		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
2818
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
2819
		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
2820
				    &xmm_alpha_lo, &xmm_alpha_hi);
2821
 
2822
		in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
2823
			       &xmm_alpha_lo, &xmm_alpha_hi,
2824
			       &xmm_mask, &xmm_mask,
2825
			       &xmm_dst_lo, &xmm_dst_hi);
2826
 
2827
		save_128_aligned (
2828
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
2829
	    }
2830
 
2831
	    dst += 4;
2832
	    src += 4;
2833
	    w -= 4;
2834
	}
2835
 
2836
	while (w)
2837
	{
2838
	    uint32_t s = *src++;
2839
 
2840
	    if (s)
2841
	    {
2842
		uint32_t d = *dst;
2843
 
2844
		__m128i ms = unpack_32_1x128 (s);
2845
		__m128i alpha = expand_alpha_1x128 (ms);
2846
		__m128i mask  = xmm_mask;
2847
		__m128i dest  = unpack_32_1x128 (d);
2848
 
2849
		*dst = pack_1x128_32 (
2850
		    in_over_1x128 (&ms, &alpha, &mask, &dest));
2851
	    }
2852
 
2853
	    dst++;
2854
	    w--;
2855
	}
2856
    }
2857
 
2858
}
2859
 
2860
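/*
 * SRC conversion from x8r8g8b8 to r5g6b5: each pixel is simply repacked,
 * eight destination pixels per SSE2 iteration.
 */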
static void
2861
sse2_composite_src_x888_0565 (pixman_implementation_t *imp,
2862
                              pixman_composite_info_t *info)
2863
{
2864
    PIXMAN_COMPOSITE_ARGS (info);
2865
    uint16_t    *dst_line, *dst;
2866
    uint32_t    *src_line, *src, s;
2867
    int dst_stride, src_stride;
2868
    int32_t w;
2869
 
2870
    PIXMAN_IMAGE_GET_LINE (src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2871
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
2872
 
2873
    while (height--)
2874
    {
2875
	dst = dst_line;
2876
	dst_line += dst_stride;
2877
	src = src_line;
2878
	src_line += src_stride;
2879
	w = width;
2880
 
2881
	while (w && (uintptr_t)dst & 15)
2882
	{
2883
	    s = *src++;
2884
	    *dst = convert_8888_to_0565 (s);
2885
	    dst++;
2886
	    w--;
2887
	}
2888
 
2889
	while (w >= 8)
2890
	{
2891
	    __m128i xmm_src0 = load_128_unaligned ((__m128i *)src + 0);
2892
	    __m128i xmm_src1 = load_128_unaligned ((__m128i *)src + 1);
2893
 
2894
	    save_128_aligned ((__m128i*)dst, pack_565_2packedx128_128 (xmm_src0, xmm_src1));
2895
 
2896
	    w -= 8;
2897
	    src += 8;
2898
	    dst += 8;
2899
	}
2900
 
2901
	while (w)
2902
	{
2903
	    s = *src++;
2904
	    *dst = convert_8888_to_0565 (s);
2905
	    dst++;
2906
	    w--;
2907
	}
2908
    }
2909
}
2910
 
2911
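/*
 * SRC copy from x8r8g8b8 to a8r8g8b8: the pixels are copied with the alpha
 * byte forced to 0xff, sixteen pixels per SSE2 iteration.
 */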
static void
2912
sse2_composite_src_x888_8888 (pixman_implementation_t *imp,
2913
			      pixman_composite_info_t *info)
2914
{
2915
    PIXMAN_COMPOSITE_ARGS (info);
2916
    uint32_t    *dst_line, *dst;
2917
    uint32_t    *src_line, *src;
2918
    int32_t w;
2919
    int dst_stride, src_stride;
2920
 
2921
 
2922
    PIXMAN_IMAGE_GET_LINE (
2923
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2924
    PIXMAN_IMAGE_GET_LINE (
2925
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2926
 
2927
    while (height--)
2928
    {
2929
	dst = dst_line;
2930
	dst_line += dst_stride;
2931
	src = src_line;
2932
	src_line += src_stride;
2933
	w = width;
2934
 
2935
	while (w && (uintptr_t)dst & 15)
2936
	{
2937
	    *dst++ = *src++ | 0xff000000;
2938
	    w--;
2939
	}
2940
 
2941
	while (w >= 16)
2942
	{
2943
	    __m128i xmm_src1, xmm_src2, xmm_src3, xmm_src4;
2944
 
2945
	    xmm_src1 = load_128_unaligned ((__m128i*)src + 0);
2946
	    xmm_src2 = load_128_unaligned ((__m128i*)src + 1);
2947
	    xmm_src3 = load_128_unaligned ((__m128i*)src + 2);
2948
	    xmm_src4 = load_128_unaligned ((__m128i*)src + 3);
2949
 
2950
	    save_128_aligned ((__m128i*)dst + 0, _mm_or_si128 (xmm_src1, mask_ff000000));
2951
	    save_128_aligned ((__m128i*)dst + 1, _mm_or_si128 (xmm_src2, mask_ff000000));
2952
	    save_128_aligned ((__m128i*)dst + 2, _mm_or_si128 (xmm_src3, mask_ff000000));
2953
	    save_128_aligned ((__m128i*)dst + 3, _mm_or_si128 (xmm_src4, mask_ff000000));
2954
 
2955
	    dst += 16;
2956
	    src += 16;
2957
	    w -= 16;
2958
	}
2959
 
2960
	while (w)
2961
	{
2962
	    *dst++ = *src++ | 0xff000000;
2963
	    w--;
2964
	}
2965
    }
2966
 
2967
}
2968
 
2969
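/*
 * x8r8g8b8 source (alpha forced to 0xff) with a constant a8 mask OVER an
 * a8r8g8b8 destination; since the source is effectively opaque, the
 * expanded source alpha is simply the constant mask_00ff.
 */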
static void
2970
sse2_composite_over_x888_n_8888 (pixman_implementation_t *imp,
2971
                                 pixman_composite_info_t *info)
2972
{
2973
    PIXMAN_COMPOSITE_ARGS (info);
2974
    uint32_t    *dst_line, *dst;
2975
    uint32_t    *src_line, *src;
2976
    uint32_t mask;
2977
    int dst_stride, src_stride;
2978
    int32_t w;
2979
 
2980
    __m128i xmm_mask, xmm_alpha;
2981
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
2982
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
2983
 
2984
    PIXMAN_IMAGE_GET_LINE (
2985
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
2986
    PIXMAN_IMAGE_GET_LINE (
2987
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
2988
 
2989
    mask = _pixman_image_get_solid (imp, mask_image, PIXMAN_a8r8g8b8);
2990
 
2991
    xmm_mask = create_mask_16_128 (mask >> 24);
2992
    xmm_alpha = mask_00ff;
2993
 
2994
    while (height--)
2995
    {
2996
	dst = dst_line;
2997
	dst_line += dst_stride;
2998
	src = src_line;
2999
	src_line += src_stride;
3000
	w = width;
3001
 
3002
	while (w && (uintptr_t)dst & 15)
3003
	{
3004
	    uint32_t s = (*src++) | 0xff000000;
3005
	    uint32_t d = *dst;
3006
 
3007
	    __m128i src   = unpack_32_1x128 (s);
3008
	    __m128i alpha = xmm_alpha;
3009
	    __m128i mask  = xmm_mask;
3010
	    __m128i dest  = unpack_32_1x128 (d);
3011
 
3012
	    *dst++ = pack_1x128_32 (
3013
		in_over_1x128 (&src, &alpha, &mask, &dest));
3014
 
3015
	    w--;
3016
	}
3017
 
3018
	while (w >= 4)
3019
	{
3020
	    xmm_src = _mm_or_si128 (
3021
		load_128_unaligned ((__m128i*)src), mask_ff000000);
3022
	    xmm_dst = load_128_aligned ((__m128i*)dst);
3023
 
3024
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3025
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3026
 
3027
	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
3028
			   &xmm_alpha, &xmm_alpha,
3029
			   &xmm_mask, &xmm_mask,
3030
			   &xmm_dst_lo, &xmm_dst_hi);
3031
 
3032
	    save_128_aligned (
3033
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3034
 
3035
	    dst += 4;
3036
	    src += 4;
3037
	    w -= 4;
3038
 
3039
	}
3040
 
3041
	while (w)
3042
	{
3043
	    uint32_t s = (*src++) | 0xff000000;
3044
	    uint32_t d = *dst;
3045
 
3046
	    __m128i src  = unpack_32_1x128 (s);
3047
	    __m128i alpha = xmm_alpha;
3048
	    __m128i mask  = xmm_mask;
3049
	    __m128i dest  = unpack_32_1x128 (d);
3050
 
3051
	    *dst++ = pack_1x128_32 (
3052
		in_over_1x128 (&src, &alpha, &mask, &dest));
3053
 
3054
	    w--;
3055
	}
3056
    }
3057
 
3058
}
3059
 
3060
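/*
 * a8r8g8b8 OVER a8r8g8b8 with no mask: each scanline is handed to the
 * generic sse2_combine_over_u () combiner.
 */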
static void
3061
sse2_composite_over_8888_8888 (pixman_implementation_t *imp,
3062
                               pixman_composite_info_t *info)
3063
{
3064
    PIXMAN_COMPOSITE_ARGS (info);
3065
    int dst_stride, src_stride;
3066
    uint32_t    *dst_line, *dst;
3067
    uint32_t    *src_line, *src;
3068
 
3069
    PIXMAN_IMAGE_GET_LINE (
3070
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3071
    PIXMAN_IMAGE_GET_LINE (
3072
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3073
 
3074
    dst = dst_line;
3075
    src = src_line;
3076
 
3077
    while (height--)
3078
    {
3079
	sse2_combine_over_u (imp, op, dst, src, NULL, width);
3080
 
3081
	dst += dst_stride;
3082
	src += src_stride;
3083
    }
3084
}
3085
 
3086
static force_inline uint16_t
3087
composite_over_8888_0565pixel (uint32_t src, uint16_t dst)
3088
{
3089
    __m128i ms;
3090
 
3091
    ms = unpack_32_1x128 (src);
3092
    return pack_565_32_16 (
3093
	pack_1x128_32 (
3094
	    over_1x128 (
3095
		ms, expand_alpha_1x128 (ms), expand565_16_1x128 (dst))));
3096
}
3097
 
3098
static void
3099
sse2_composite_over_8888_0565 (pixman_implementation_t *imp,
3100
                               pixman_composite_info_t *info)
3101
{
3102
    PIXMAN_COMPOSITE_ARGS (info);
3103
    uint16_t    *dst_line, *dst, d;
3104
    uint32_t    *src_line, *src, s;
3105
    int dst_stride, src_stride;
3106
    int32_t w;
3107
 
3108
    __m128i xmm_alpha_lo, xmm_alpha_hi;
3109
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3110
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3111
 
3112
    PIXMAN_IMAGE_GET_LINE (
3113
	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3114
    PIXMAN_IMAGE_GET_LINE (
3115
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3116
 
3117
    while (height--)
3118
    {
3119
	dst = dst_line;
3120
	src = src_line;
3121
 
3122
	dst_line += dst_stride;
3123
	src_line += src_stride;
3124
	w = width;
3125
 
3126
	/* Align dst on a 16-byte boundary */
3127
	while (w &&
3128
	       ((uintptr_t)dst & 15))
3129
	{
3130
	    s = *src++;
3131
	    d = *dst;
3132
 
3133
	    *dst++ = composite_over_8888_0565pixel (s, d);
3134
	    w--;
3135
	}
3136
 
3137
	/* It's an 8 pixel loop */
3138
	while (w >= 8)
3139
	{
3140
	    /* I'm loading unaligned because I'm not sure
3141
	     * about the address alignment.
3142
	     */
3143
	    xmm_src = load_128_unaligned ((__m128i*) src);
3144
	    xmm_dst = load_128_aligned ((__m128i*) dst);
3145
 
3146
	    /* Unpacking */
3147
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3148
	    unpack_565_128_4x128 (xmm_dst,
3149
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3150
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3151
				&xmm_alpha_lo, &xmm_alpha_hi);
3152
 
3153
	    /* I'm loading the next 4 pixels from memory
3154
	     * ahead of time to optimize the memory read.
3155
	     */
3156
	    xmm_src = load_128_unaligned ((__m128i*) (src + 4));
3157
 
3158
	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
3159
			&xmm_alpha_lo, &xmm_alpha_hi,
3160
			&xmm_dst0, &xmm_dst1);
3161
 
3162
	    /* Unpacking */
3163
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3164
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
3165
				&xmm_alpha_lo, &xmm_alpha_hi);
3166
 
3167
	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
3168
			&xmm_alpha_lo, &xmm_alpha_hi,
3169
			&xmm_dst2, &xmm_dst3);
3170
 
3171
	    save_128_aligned (
3172
		(__m128i*)dst, pack_565_4x128_128 (
3173
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3174
 
3175
	    w -= 8;
3176
	    dst += 8;
3177
	    src += 8;
3178
	}
3179
 
3180
	while (w--)
3181
	{
3182
	    s = *src++;
3183
	    d = *dst;
3184
 
3185
	    *dst++ = composite_over_8888_0565pixel (s, d);
3186
	}
3187
    }
3188
 
3189
}
3190
 
3191
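/*
 * Solid source with an a8 mask OVER an a8r8g8b8 destination.  A block of
 * four 0xff mask bytes with an opaque source is stored directly;
 * otherwise the mask is expanded per channel and
 * dest = src * m + dest * (1 - srca * m) is applied.
 */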
static void
3192
sse2_composite_over_n_8_8888 (pixman_implementation_t *imp,
3193
                              pixman_composite_info_t *info)
3194
{
3195
    PIXMAN_COMPOSITE_ARGS (info);
3196
    uint32_t src, srca;
3197
    uint32_t *dst_line, *dst;
3198
    uint8_t *mask_line, *mask;
3199
    int dst_stride, mask_stride;
3200
    int32_t w;
3201
    uint32_t m, d;
3202
 
3203
    __m128i xmm_src, xmm_alpha, xmm_def;
3204
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
3205
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3206
 
3207
    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3208
 
3209
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3210
 
3211
    srca = src >> 24;
3212
    if (src == 0)
3213
	return;
3214
 
3215
    PIXMAN_IMAGE_GET_LINE (
3216
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3217
    PIXMAN_IMAGE_GET_LINE (
3218
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3219
 
3220
    xmm_def = create_mask_2x32_128 (src, src);
3221
    xmm_src = expand_pixel_32_1x128 (src);
3222
    xmm_alpha = expand_alpha_1x128 (xmm_src);
3223
    mmx_src   = xmm_src;
3224
    mmx_alpha = xmm_alpha;
3225
 
3226
    while (height--)
3227
    {
3228
	dst = dst_line;
3229
	dst_line += dst_stride;
3230
	mask = mask_line;
3231
	mask_line += mask_stride;
3232
	w = width;
3233
 
3234
	while (w && (uintptr_t)dst & 15)
3235
	{
3236
	    uint8_t m = *mask++;
3237
 
3238
	    if (m)
3239
	    {
3240
		d = *dst;
3241
		mmx_mask = expand_pixel_8_1x128 (m);
3242
		mmx_dest = unpack_32_1x128 (d);
3243
 
3244
		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3245
		                                   &mmx_alpha,
3246
		                                   &mmx_mask,
3247
		                                   &mmx_dest));
3248
	    }
3249
 
3250
	    w--;
3251
	    dst++;
3252
	}
3253
 
3254
	while (w >= 4)
3255
	{
3256
	    m = *((uint32_t*)mask);
3257
 
3258
	    if (srca == 0xff && m == 0xffffffff)
3259
	    {
3260
		save_128_aligned ((__m128i*)dst, xmm_def);
3261
	    }
3262
	    else if (m)
3263
	    {
3264
		xmm_dst = load_128_aligned ((__m128i*) dst);
3265
		xmm_mask = unpack_32_1x128 (m);
3266
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3267
 
3268
		/* Unpacking */
3269
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
3270
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3271
 
3272
		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3273
					&xmm_mask_lo, &xmm_mask_hi);
3274
 
3275
		in_over_2x128 (&xmm_src, &xmm_src,
3276
			       &xmm_alpha, &xmm_alpha,
3277
			       &xmm_mask_lo, &xmm_mask_hi,
3278
			       &xmm_dst_lo, &xmm_dst_hi);
3279
 
3280
		save_128_aligned (
3281
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3282
	    }
3283
 
3284
	    w -= 4;
3285
	    dst += 4;
3286
	    mask += 4;
3287
	}
3288
 
3289
	while (w)
3290
	{
3291
	    uint8_t m = *mask++;
3292
 
3293
	    if (m)
3294
	    {
3295
		d = *dst;
3296
		mmx_mask = expand_pixel_8_1x128 (m);
3297
		mmx_dest = unpack_32_1x128 (d);
3298
 
3299
		*dst = pack_1x128_32 (in_over_1x128 (&mmx_src,
3300
		                                   &mmx_alpha,
3301
		                                   &mmx_mask,
3302
		                                   &mmx_dest));
3303
	    }
3304
 
3305
	    w--;
3306
	    dst++;
3307
	}
3308
    }
3309
 
3310
}
3311
 
3312
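/*
 * Solid fill for 8, 16 and 32 bpp destinations.  Each row is filled with
 * narrow stores until 16-byte alignment is reached, then in 128/64/32/16
 * byte aligned blocks, with any remainder finished by narrow stores again.
 */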
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
3313
__attribute__((__force_align_arg_pointer__))
3314
#endif
3315
static pixman_bool_t
3316
sse2_fill (pixman_implementation_t *imp,
3317
           uint32_t *               bits,
3318
           int                      stride,
3319
           int                      bpp,
3320
           int                      x,
3321
           int                      y,
3322
           int                      width,
3323
           int                      height,
3324
           uint32_t		    filler)
3325
{
3326
    uint32_t byte_width;
3327
    uint8_t *byte_line;
3328
 
3329
    __m128i xmm_def;
3330
 
3331
    if (bpp == 8)
3332
    {
3333
	uint8_t b;
3334
	uint16_t w;
3335
 
3336
	stride = stride * (int) sizeof (uint32_t) / 1;
3337
	byte_line = (uint8_t *)(((uint8_t *)bits) + stride * y + x);
3338
	byte_width = width;
3339
	stride *= 1;
3340
 
3341
	b = filler & 0xff;
3342
	w = (b << 8) | b;
3343
	filler = (w << 16) | w;
3344
    }
3345
    else if (bpp == 16)
3346
    {
3347
	stride = stride * (int) sizeof (uint32_t) / 2;
3348
	byte_line = (uint8_t *)(((uint16_t *)bits) + stride * y + x);
3349
	byte_width = 2 * width;
3350
	stride *= 2;
3351
 
3352
        filler = (filler & 0xffff) * 0x00010001;
3353
    }
3354
    else if (bpp == 32)
3355
    {
3356
	stride = stride * (int) sizeof (uint32_t) / 4;
3357
	byte_line = (uint8_t *)(((uint32_t *)bits) + stride * y + x);
3358
	byte_width = 4 * width;
3359
	stride *= 4;
3360
    }
3361
    else
3362
    {
3363
	return FALSE;
3364
    }
3365
 
3366
    xmm_def = create_mask_2x32_128 (filler, filler);
3367
 
3368
    while (height--)
3369
    {
3370
	int w;
3371
	uint8_t *d = byte_line;
3372
	byte_line += stride;
3373
	w = byte_width;
3374
 
3375
	if (w >= 1 && ((uintptr_t)d & 1))
3376
	{
3377
	    *(uint8_t *)d = filler;
3378
	    w -= 1;
3379
	    d += 1;
3380
	}
3381
 
3382
	while (w >= 2 && ((uintptr_t)d & 3))
3383
	{
3384
	    *(uint16_t *)d = filler;
3385
	    w -= 2;
3386
	    d += 2;
3387
	}
3388
 
3389
	while (w >= 4 && ((uintptr_t)d & 15))
3390
	{
3391
	    *(uint32_t *)d = filler;
3392
 
3393
	    w -= 4;
3394
	    d += 4;
3395
	}
3396
 
3397
	while (w >= 128)
3398
	{
3399
	    save_128_aligned ((__m128i*)(d),     xmm_def);
3400
	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3401
	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3402
	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3403
	    save_128_aligned ((__m128i*)(d + 64),  xmm_def);
3404
	    save_128_aligned ((__m128i*)(d + 80),  xmm_def);
3405
	    save_128_aligned ((__m128i*)(d + 96),  xmm_def);
3406
	    save_128_aligned ((__m128i*)(d + 112), xmm_def);
3407
 
3408
	    d += 128;
3409
	    w -= 128;
3410
	}
3411
 
3412
	if (w >= 64)
3413
	{
3414
	    save_128_aligned ((__m128i*)(d),     xmm_def);
3415
	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3416
	    save_128_aligned ((__m128i*)(d + 32),  xmm_def);
3417
	    save_128_aligned ((__m128i*)(d + 48),  xmm_def);
3418
 
3419
	    d += 64;
3420
	    w -= 64;
3421
	}
3422
 
3423
	if (w >= 32)
3424
	{
3425
	    save_128_aligned ((__m128i*)(d),     xmm_def);
3426
	    save_128_aligned ((__m128i*)(d + 16),  xmm_def);
3427
 
3428
	    d += 32;
3429
	    w -= 32;
3430
	}
3431
 
3432
	if (w >= 16)
3433
	{
3434
	    save_128_aligned ((__m128i*)(d),     xmm_def);
3435
 
3436
	    d += 16;
3437
	    w -= 16;
3438
	}
3439
 
3440
	while (w >= 4)
3441
	{
3442
	    *(uint32_t *)d = filler;
3443
 
3444
	    w -= 4;
3445
	    d += 4;
3446
	}
3447
 
3448
	if (w >= 2)
3449
	{
3450
	    *(uint16_t *)d = filler;
3451
	    w -= 2;
3452
	    d += 2;
3453
	}
3454
 
3455
	if (w >= 1)
3456
	{
3457
	    *(uint8_t *)d = filler;
3458
	    w -= 1;
3459
	    d += 1;
3460
	}
3461
    }
3462
 
3463
    return TRUE;
3464
}
3465
 
3466
static void
3467
sse2_composite_src_n_8_8888 (pixman_implementation_t *imp,
3468
                             pixman_composite_info_t *info)
3469
{
3470
    PIXMAN_COMPOSITE_ARGS (info);
3471
    uint32_t src, srca;
3472
    uint32_t    *dst_line, *dst;
3473
    uint8_t     *mask_line, *mask;
3474
    int dst_stride, mask_stride;
3475
    int32_t w;
3476
    uint32_t m;
3477
 
3478
    __m128i xmm_src, xmm_def;
3479
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3480
 
3481
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3482
 
3483
    srca = src >> 24;
3484
    if (src == 0)
3485
    {
3486
	sse2_fill (imp, dest_image->bits.bits, dest_image->bits.rowstride,
3487
		   PIXMAN_FORMAT_BPP (dest_image->bits.format),
3488
		   dest_x, dest_y, width, height, 0);
3489
	return;
3490
    }
3491
 
3492
    PIXMAN_IMAGE_GET_LINE (
3493
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3494
    PIXMAN_IMAGE_GET_LINE (
3495
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3496
 
3497
    xmm_def = create_mask_2x32_128 (src, src);
3498
    xmm_src = expand_pixel_32_1x128 (src);
3499
 
3500
    while (height--)
3501
    {
3502
	dst = dst_line;
3503
	dst_line += dst_stride;
3504
	mask = mask_line;
3505
	mask_line += mask_stride;
3506
	w = width;
3507
 
3508
	while (w && (uintptr_t)dst & 15)
3509
	{
3510
	    uint8_t m = *mask++;
3511
 
3512
	    if (m)
3513
	    {
3514
		*dst = pack_1x128_32 (
3515
		    pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)));
3516
	    }
3517
	    else
3518
	    {
3519
		*dst = 0;
3520
	    }
3521
 
3522
	    w--;
3523
	    dst++;
3524
	}
3525
 
3526
	while (w >= 4)
3527
	{
3528
	    m = *((uint32_t*)mask);
3529
 
3530
	    if (srca == 0xff && m == 0xffffffff)
3531
	    {
3532
		save_128_aligned ((__m128i*)dst, xmm_def);
3533
	    }
3534
	    else if (m)
3535
	    {
3536
		xmm_mask = unpack_32_1x128 (m);
3537
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3538
 
3539
		/* Unpacking */
3540
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3541
 
3542
		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3543
					&xmm_mask_lo, &xmm_mask_hi);
3544
 
3545
		pix_multiply_2x128 (&xmm_src, &xmm_src,
3546
				    &xmm_mask_lo, &xmm_mask_hi,
3547
				    &xmm_mask_lo, &xmm_mask_hi);
3548
 
3549
		save_128_aligned (
3550
		    (__m128i*)dst, pack_2x128_128 (xmm_mask_lo, xmm_mask_hi));
3551
	    }
3552
	    else
3553
	    {
3554
		save_128_aligned ((__m128i*)dst, _mm_setzero_si128 ());
3555
	    }
3556
 
3557
	    w -= 4;
3558
	    dst += 4;
3559
	    mask += 4;
3560
	}
3561
 
3562
	while (w)
3563
	{
3564
	    uint8_t m = *mask++;
3565
 
3566
	    if (m)
3567
	    {
3568
		*dst = pack_1x128_32 (
3569
		    pix_multiply_1x128 (
3570
			xmm_src, expand_pixel_8_1x128 (m)));
3571
	    }
3572
	    else
3573
	    {
3574
		*dst = 0;
3575
	    }
3576
 
3577
	    w--;
3578
	    dst++;
3579
	}
3580
    }
3581
 
3582
}
3583
 
3584
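/*
 * Solid source with an a8 mask OVER an r5g6b5 destination: the same
 * in_over blend as the a8r8g8b8 case, with the destination expanded from
 * and packed back to 565, eight pixels per iteration.
 */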
static void
3585
sse2_composite_over_n_8_0565 (pixman_implementation_t *imp,
3586
                              pixman_composite_info_t *info)
3587
{
3588
    PIXMAN_COMPOSITE_ARGS (info);
3589
    uint32_t src;
3590
    uint16_t    *dst_line, *dst, d;
3591
    uint8_t     *mask_line, *mask;
3592
    int dst_stride, mask_stride;
3593
    int32_t w;
3594
    uint32_t m;
3595
    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3596
 
3597
    __m128i xmm_src, xmm_alpha;
3598
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3599
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3600
 
3601
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3602
 
3603
    if (src == 0)
3604
	return;
3605
 
3606
    PIXMAN_IMAGE_GET_LINE (
3607
	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3608
    PIXMAN_IMAGE_GET_LINE (
3609
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
3610
 
3611
    xmm_src = expand_pixel_32_1x128 (src);
3612
    xmm_alpha = expand_alpha_1x128 (xmm_src);
3613
    mmx_src = xmm_src;
3614
    mmx_alpha = xmm_alpha;
3615
 
3616
    while (height--)
3617
    {
3618
	dst = dst_line;
3619
	dst_line += dst_stride;
3620
	mask = mask_line;
3621
	mask_line += mask_stride;
3622
	w = width;
3623
 
3624
	while (w && (uintptr_t)dst & 15)
3625
	{
3626
	    m = *mask++;
3627
 
3628
	    if (m)
3629
	    {
3630
		d = *dst;
3631
		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3632
		mmx_dest = expand565_16_1x128 (d);
3633
 
3634
		*dst = pack_565_32_16 (
3635
		    pack_1x128_32 (
3636
			in_over_1x128 (
3637
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3638
	    }
3639
 
3640
	    w--;
3641
	    dst++;
3642
	}
3643
 
3644
	while (w >= 8)
3645
	{
3646
	    xmm_dst = load_128_aligned ((__m128i*) dst);
3647
	    unpack_565_128_4x128 (xmm_dst,
3648
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3649
 
3650
	    m = *((uint32_t*)mask);
3651
	    mask += 4;
3652
 
3653
	    if (m)
3654
	    {
3655
		xmm_mask = unpack_32_1x128 (m);
3656
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3657
 
3658
		/* Unpacking */
3659
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3660
 
3661
		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3662
					&xmm_mask_lo, &xmm_mask_hi);
3663
 
3664
		in_over_2x128 (&xmm_src, &xmm_src,
3665
			       &xmm_alpha, &xmm_alpha,
3666
			       &xmm_mask_lo, &xmm_mask_hi,
3667
			       &xmm_dst0, &xmm_dst1);
3668
	    }
3669
 
3670
	    m = *((uint32_t*)mask);
3671
	    mask += 4;
3672
 
3673
	    if (m)
3674
	    {
3675
		xmm_mask = unpack_32_1x128 (m);
3676
		xmm_mask = _mm_unpacklo_epi8 (xmm_mask, _mm_setzero_si128 ());
3677
 
3678
		/* Unpacking */
3679
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3680
 
3681
		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
3682
					&xmm_mask_lo, &xmm_mask_hi);
3683
		in_over_2x128 (&xmm_src, &xmm_src,
3684
			       &xmm_alpha, &xmm_alpha,
3685
			       &xmm_mask_lo, &xmm_mask_hi,
3686
			       &xmm_dst2, &xmm_dst3);
3687
	    }
3688
 
3689
	    save_128_aligned (
3690
		(__m128i*)dst, pack_565_4x128_128 (
3691
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3692
 
3693
	    w -= 8;
3694
	    dst += 8;
3695
	}
3696
 
3697
	while (w)
3698
	{
3699
	    m = *mask++;
3700
 
3701
	    if (m)
3702
	    {
3703
		d = *dst;
3704
		mmx_mask = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
3705
		mmx_dest = expand565_16_1x128 (d);
3706
 
3707
		*dst = pack_565_32_16 (
3708
		    pack_1x128_32 (
3709
			in_over_1x128 (
3710
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3711
	    }
3712
 
3713
	    w--;
3714
	    dst++;
3715
	}
3716
    }
3717
 
3718
}
3719
 
3720
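/*
 * OVER from a non-premultiplied ("pixbuf") source onto an r5g6b5
 * destination: fully opaque four-pixel blocks only have their colour
 * channels reordered, fully transparent blocks leave the destination
 * untouched, and everything else goes through over_rev_non_pre (), which
 * premultiplies before blending.
 */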
static void
3721
sse2_composite_over_pixbuf_0565 (pixman_implementation_t *imp,
3722
                                 pixman_composite_info_t *info)
3723
{
3724
    PIXMAN_COMPOSITE_ARGS (info);
3725
    uint16_t    *dst_line, *dst, d;
3726
    uint32_t    *src_line, *src, s;
3727
    int dst_stride, src_stride;
3728
    int32_t w;
3729
    uint32_t opaque, zero;
3730
 
3731
    __m128i ms;
3732
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
3733
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3734
 
3735
    PIXMAN_IMAGE_GET_LINE (
3736
	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3737
    PIXMAN_IMAGE_GET_LINE (
3738
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3739
 
3740
    while (height--)
3741
    {
3742
	dst = dst_line;
3743
	dst_line += dst_stride;
3744
	src = src_line;
3745
	src_line += src_stride;
3746
	w = width;
3747
 
3748
	while (w && (uintptr_t)dst & 15)
3749
	{
3750
	    s = *src++;
3751
	    d = *dst;
3752
 
3753
	    ms = unpack_32_1x128 (s);
3754
 
3755
	    *dst++ = pack_565_32_16 (
3756
		pack_1x128_32 (
3757
		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3758
	    w--;
3759
	}
3760
 
3761
	while (w >= 8)
3762
	{
3763
	    /* First round */
3764
	    xmm_src = load_128_unaligned ((__m128i*)src);
3765
	    xmm_dst = load_128_aligned  ((__m128i*)dst);
3766
 
3767
	    opaque = is_opaque (xmm_src);
3768
	    zero = is_zero (xmm_src);
3769
 
3770
	    unpack_565_128_4x128 (xmm_dst,
3771
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3772
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3773
 
3774
	    /* preload next round */
3775
	    xmm_src = load_128_unaligned ((__m128i*)(src + 4));
3776
 
3777
	    if (opaque)
3778
	    {
3779
		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3780
				     &xmm_dst0, &xmm_dst1);
3781
	    }
3782
	    else if (!zero)
3783
	    {
3784
		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3785
					&xmm_dst0, &xmm_dst1);
3786
	    }
3787
 
3788
	    /* Second round */
3789
	    opaque = is_opaque (xmm_src);
3790
	    zero = is_zero (xmm_src);
3791
 
3792
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
3793
 
3794
	    if (opaque)
3795
	    {
3796
		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3797
				     &xmm_dst2, &xmm_dst3);
3798
	    }
3799
	    else if (!zero)
3800
	    {
3801
		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3802
					&xmm_dst2, &xmm_dst3);
3803
	    }
3804
 
3805
	    save_128_aligned (
3806
		(__m128i*)dst, pack_565_4x128_128 (
3807
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
3808
 
3809
	    w -= 8;
3810
	    src += 8;
3811
	    dst += 8;
3812
	}
3813
 
3814
	while (w)
3815
	{
3816
	    s = *src++;
3817
	    d = *dst;
3818
 
3819
	    ms = unpack_32_1x128 (s);
3820
 
3821
	    *dst++ = pack_565_32_16 (
3822
		pack_1x128_32 (
3823
		    over_rev_non_pre_1x128 (ms, expand565_16_1x128 (d))));
3824
	    w--;
3825
	}
3826
    }
3827
 
3828
}
3829
 
3830
static void
3831
sse2_composite_over_pixbuf_8888 (pixman_implementation_t *imp,
3832
                                 pixman_composite_info_t *info)
3833
{
3834
    PIXMAN_COMPOSITE_ARGS (info);
3835
    uint32_t    *dst_line, *dst, d;
3836
    uint32_t    *src_line, *src, s;
3837
    int dst_stride, src_stride;
3838
    int32_t w;
3839
    uint32_t opaque, zero;
3840
 
3841
    __m128i xmm_src_lo, xmm_src_hi;
3842
    __m128i xmm_dst_lo, xmm_dst_hi;
3843
 
3844
    PIXMAN_IMAGE_GET_LINE (
3845
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
3846
    PIXMAN_IMAGE_GET_LINE (
3847
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
3848
 
3849
    while (height--)
3850
    {
3851
	dst = dst_line;
3852
	dst_line += dst_stride;
3853
	src = src_line;
3854
	src_line += src_stride;
3855
	w = width;
3856
 
3857
	while (w && (uintptr_t)dst & 15)
3858
	{
3859
	    s = *src++;
3860
	    d = *dst;
3861
 
3862
	    *dst++ = pack_1x128_32 (
3863
		over_rev_non_pre_1x128 (
3864
		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
3865
 
3866
	    w--;
3867
	}
3868
 
3869
	while (w >= 4)
3870
	{
3871
	    xmm_src_hi = load_128_unaligned ((__m128i*)src);
3872
 
3873
	    opaque = is_opaque (xmm_src_hi);
3874
	    zero = is_zero (xmm_src_hi);
3875
 
3876
	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
3877
 
3878
	    if (opaque)
3879
	    {
3880
		invert_colors_2x128 (xmm_src_lo, xmm_src_hi,
3881
				     &xmm_dst_lo, &xmm_dst_hi);
3882
 
3883
		save_128_aligned (
3884
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3885
	    }
3886
	    else if (!zero)
3887
	    {
3888
		xmm_dst_hi = load_128_aligned  ((__m128i*)dst);
3889
 
3890
		unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
3891
 
3892
		over_rev_non_pre_2x128 (xmm_src_lo, xmm_src_hi,
3893
					&xmm_dst_lo, &xmm_dst_hi);
3894
 
3895
		save_128_aligned (
3896
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
3897
	    }
3898
 
3899
	    w -= 4;
3900
	    dst += 4;
3901
	    src += 4;
3902
	}
3903
 
3904
	while (w)
3905
	{
3906
	    s = *src++;
3907
	    d = *dst;
3908
 
3909
	    *dst++ = pack_1x128_32 (
3910
		over_rev_non_pre_1x128 (
3911
		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
3912
 
3913
	    w--;
3914
	}
3915
    }
3916
 
3917
}
3918
 
3919
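/*
 * Solid source with a component-alpha a8r8g8b8 mask OVER an r5g6b5
 * destination.  Eight destination pixels are processed per iteration in
 * two four-pixel rounds; a round whose mask is entirely zero is left
 * untouched.
 */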
static void
3920
sse2_composite_over_n_8888_0565_ca (pixman_implementation_t *imp,
3921
                                    pixman_composite_info_t *info)
3922
{
3923
    PIXMAN_COMPOSITE_ARGS (info);
3924
    uint32_t src;
3925
    uint16_t    *dst_line, *dst, d;
3926
    uint32_t    *mask_line, *mask, m;
3927
    int dst_stride, mask_stride;
3928
    int w;
3929
    uint32_t pack_cmp;
3930
 
3931
    __m128i xmm_src, xmm_alpha;
3932
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
3933
    __m128i xmm_dst, xmm_dst0, xmm_dst1, xmm_dst2, xmm_dst3;
3934
 
3935
    __m128i mmx_src, mmx_alpha, mmx_mask, mmx_dest;
3936
 
3937
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
3938
 
3939
    if (src == 0)
3940
	return;
3941
 
3942
    PIXMAN_IMAGE_GET_LINE (
3943
	dest_image, dest_x, dest_y, uint16_t, dst_stride, dst_line, 1);
3944
    PIXMAN_IMAGE_GET_LINE (
3945
	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
3946
 
3947
    xmm_src = expand_pixel_32_1x128 (src);
3948
    xmm_alpha = expand_alpha_1x128 (xmm_src);
3949
    mmx_src = xmm_src;
3950
    mmx_alpha = xmm_alpha;
3951
 
3952
    while (height--)
3953
    {
3954
	w = width;
3955
	mask = mask_line;
3956
	dst = dst_line;
3957
	mask_line += mask_stride;
3958
	dst_line += dst_stride;
3959
 
3960
	while (w && ((uintptr_t)dst & 15))
3961
	{
3962
	    m = *(uint32_t *) mask;
3963
 
3964
	    if (m)
3965
	    {
3966
		d = *dst;
3967
		mmx_mask = unpack_32_1x128 (m);
3968
		mmx_dest = expand565_16_1x128 (d);
3969
 
3970
		*dst = pack_565_32_16 (
3971
		    pack_1x128_32 (
3972
			in_over_1x128 (
3973
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
3974
	    }
3975
 
3976
	    w--;
3977
	    dst++;
3978
	    mask++;
3979
	}
3980
 
3981
	while (w >= 8)
3982
	{
3983
	    /* First round */
3984
	    xmm_mask = load_128_unaligned ((__m128i*)mask);
3985
	    xmm_dst = load_128_aligned ((__m128i*)dst);
3986
 
3987
	    pack_cmp = _mm_movemask_epi8 (
3988
		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
3989
 
3990
	    unpack_565_128_4x128 (xmm_dst,
3991
				  &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3);
3992
	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
3993
 
3994
	    /* preload next round */
3995
	    xmm_mask = load_128_unaligned ((__m128i*)(mask + 4));
3996
 
3997
3998
	    if (pack_cmp != 0xffff)
3999
	    {
4000
		in_over_2x128 (&xmm_src, &xmm_src,
4001
			       &xmm_alpha, &xmm_alpha,
4002
			       &xmm_mask_lo, &xmm_mask_hi,
4003
			       &xmm_dst0, &xmm_dst1);
4004
	    }
4005
 
4006
	    /* Second round */
4007
	    pack_cmp = _mm_movemask_epi8 (
4008
		_mm_cmpeq_epi32 (xmm_mask, _mm_setzero_si128 ()));
4009
 
4010
	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4011
 
4012
	    if (pack_cmp != 0xffff)
4013
	    {
4014
		in_over_2x128 (&xmm_src, &xmm_src,
4015
			       &xmm_alpha, &xmm_alpha,
4016
			       &xmm_mask_lo, &xmm_mask_hi,
4017
			       &xmm_dst2, &xmm_dst3);
4018
	    }
4019
 
4020
	    save_128_aligned (
4021
		(__m128i*)dst, pack_565_4x128_128 (
4022
		    &xmm_dst0, &xmm_dst1, &xmm_dst2, &xmm_dst3));
4023
 
4024
	    w -= 8;
4025
	    dst += 8;
4026
	    mask += 8;
4027
	}
4028
 
4029
	while (w)
4030
	{
4031
	    m = *(uint32_t *) mask;
4032
 
4033
	    if (m)
4034
	    {
4035
		d = *dst;
4036
		mmx_mask = unpack_32_1x128 (m);
4037
		mmx_dest = expand565_16_1x128 (d);
4038
 
4039
		*dst = pack_565_32_16 (
4040
		    pack_1x128_32 (
4041
			in_over_1x128 (
4042
			    &mmx_src, &mmx_alpha, &mmx_mask, &mmx_dest)));
4043
	    }
4044
 
4045
	    w--;
4046
	    dst++;
4047
	    mask++;
4048
	}
4049
    }
4050
 
4051
}
4052
 
4053
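/* The IN operator with a solid source and an a8 mask on an a8 destination:
 * dst = src.alpha * mask * dst, processed 16 pixels per SSE2 iteration. */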
static void
4054
sse2_composite_in_n_8_8 (pixman_implementation_t *imp,
4055
                         pixman_composite_info_t *info)
4056
{
4057
    PIXMAN_COMPOSITE_ARGS (info);
4058
    uint8_t     *dst_line, *dst;
4059
    uint8_t     *mask_line, *mask;
4060
    int dst_stride, mask_stride;
4061
    uint32_t d, m;
4062
    uint32_t src;
4063
    int32_t w;
4064
 
4065
    __m128i xmm_alpha;
4066
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4067
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4068
 
4069
    PIXMAN_IMAGE_GET_LINE (
4070
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4071
    PIXMAN_IMAGE_GET_LINE (
4072
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4073
 
4074
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4075
 
4076
    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4077
 
4078
    while (height--)
4079
    {
4080
	dst = dst_line;
4081
	dst_line += dst_stride;
4082
	mask = mask_line;
4083
	mask_line += mask_stride;
4084
	w = width;
4085
 
4086
	while (w && ((uintptr_t)dst & 15))
4087
	{
4088
	    m = (uint32_t) *mask++;
4089
	    d = (uint32_t) *dst;
4090
 
4091
	    *dst++ = (uint8_t) pack_1x128_32 (
4092
		pix_multiply_1x128 (
4093
		    pix_multiply_1x128 (xmm_alpha,
4094
				       unpack_32_1x128 (m)),
4095
		    unpack_32_1x128 (d)));
4096
	    w--;
4097
	}
4098
 
4099
	while (w >= 16)
4100
	{
4101
	    xmm_mask = load_128_unaligned ((__m128i*)mask);
4102
	    xmm_dst = load_128_aligned ((__m128i*)dst);
4103
 
4104
	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4105
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4106
 
4107
	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4108
				&xmm_mask_lo, &xmm_mask_hi,
4109
				&xmm_mask_lo, &xmm_mask_hi);
4110
 
4111
	    pix_multiply_2x128 (&xmm_mask_lo, &xmm_mask_hi,
4112
				&xmm_dst_lo, &xmm_dst_hi,
4113
				&xmm_dst_lo, &xmm_dst_hi);
4114
 
4115
	    save_128_aligned (
4116
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4117
 
4118
	    mask += 16;
4119
	    dst += 16;
4120
	    w -= 16;
4121
	}
4122
 
4123
	while (w)
4124
	{
4125
	    m = (uint32_t) *mask++;
4126
	    d = (uint32_t) *dst;
4127
 
4128
	    *dst++ = (uint8_t) pack_1x128_32 (
4129
		pix_multiply_1x128 (
4130
		    pix_multiply_1x128 (
4131
			xmm_alpha, unpack_32_1x128 (m)),
4132
		    unpack_32_1x128 (d)));
4133
	    w--;
4134
	}
4135
    }
4136
 
4137
}
4138
 
4139
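/* The IN operator with a solid source and no mask on an a8 destination:
 * dst = src.alpha * dst.  A source alpha of 0xff leaves the destination
 * untouched and 0x00 clears it with pixman_fill(), so both are handled
 * up front. */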
static void
4140
sse2_composite_in_n_8 (pixman_implementation_t *imp,
4141
		       pixman_composite_info_t *info)
4142
{
4143
    PIXMAN_COMPOSITE_ARGS (info);
4144
    uint8_t     *dst_line, *dst;
4145
    int dst_stride;
4146
    uint32_t d;
4147
    uint32_t src;
4148
    int32_t w;
4149
 
4150
    __m128i xmm_alpha;
4151
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4152
 
4153
    PIXMAN_IMAGE_GET_LINE (
4154
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4155
 
4156
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4157
 
4158
    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4159
 
4160
    src = src >> 24;
4161
 
4162
    if (src == 0xff)
4163
	return;
4164
 
4165
    if (src == 0x00)
4166
    {
4167
	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4168
		     8, dest_x, dest_y, width, height, src);
4169
 
4170
	return;
4171
    }
4172
 
4173
    while (height--)
4174
    {
4175
	dst = dst_line;
4176
	dst_line += dst_stride;
4177
	w = width;
4178
 
4179
	while (w && ((uintptr_t)dst & 15))
4180
	{
4181
	    d = (uint32_t) *dst;
4182
 
4183
	    *dst++ = (uint8_t) pack_1x128_32 (
4184
		pix_multiply_1x128 (
4185
		    xmm_alpha,
4186
		    unpack_32_1x128 (d)));
4187
	    w--;
4188
	}
4189
 
4190
	while (w >= 16)
4191
	{
4192
	    xmm_dst = load_128_aligned ((__m128i*)dst);
4193
 
4194
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4195
 
4196
	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4197
				&xmm_dst_lo, &xmm_dst_hi,
4198
				&xmm_dst_lo, &xmm_dst_hi);
4199
 
4200
	    save_128_aligned (
4201
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4202
 
4203
	    dst += 16;
4204
	    w -= 16;
4205
	}
4206
 
4207
	while (w)
4208
	{
4209
	    d = (uint32_t) *dst;
4210
 
4211
	    *dst++ = (uint8_t) pack_1x128_32 (
4212
		pix_multiply_1x128 (
4213
		    xmm_alpha,
4214
		    unpack_32_1x128 (d)));
4215
	    w--;
4216
	}
4217
    }
4218
 
4219
}
4220
 
4221
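/* The IN operator between two a8 images: dst = src * dst, 16 pixels at a time. */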
static void
4222
sse2_composite_in_8_8 (pixman_implementation_t *imp,
4223
                       pixman_composite_info_t *info)
4224
{
4225
    PIXMAN_COMPOSITE_ARGS (info);
4226
    uint8_t     *dst_line, *dst;
4227
    uint8_t     *src_line, *src;
4228
    int src_stride, dst_stride;
4229
    int32_t w;
4230
    uint32_t s, d;
4231
 
4232
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4233
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4234
 
4235
    PIXMAN_IMAGE_GET_LINE (
4236
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4237
    PIXMAN_IMAGE_GET_LINE (
4238
	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4239
 
4240
    while (height--)
4241
    {
4242
	dst = dst_line;
4243
	dst_line += dst_stride;
4244
	src = src_line;
4245
	src_line += src_stride;
4246
	w = width;
4247
 
4248
	while (w && ((uintptr_t)dst & 15))
4249
	{
4250
	    s = (uint32_t) *src++;
4251
	    d = (uint32_t) *dst;
4252
 
4253
	    *dst++ = (uint8_t) pack_1x128_32 (
4254
		pix_multiply_1x128 (
4255
		    unpack_32_1x128 (s), unpack_32_1x128 (d)));
4256
	    w--;
4257
	}
4258
 
4259
	while (w >= 16)
4260
	{
4261
	    xmm_src = load_128_unaligned ((__m128i*)src);
4262
	    xmm_dst = load_128_aligned ((__m128i*)dst);
4263
 
4264
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4265
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4266
 
4267
	    pix_multiply_2x128 (&xmm_src_lo, &xmm_src_hi,
4268
				&xmm_dst_lo, &xmm_dst_hi,
4269
				&xmm_dst_lo, &xmm_dst_hi);
4270
 
4271
	    save_128_aligned (
4272
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4273
 
4274
	    src += 16;
4275
	    dst += 16;
4276
	    w -= 16;
4277
	}
4278
 
4279
	while (w)
4280
	{
4281
	    s = (uint32_t) *src++;
4282
	    d = (uint32_t) *dst;
4283
 
4284
	    *dst++ = (uint8_t) pack_1x128_32 (
4285
		pix_multiply_1x128 (unpack_32_1x128 (s), unpack_32_1x128 (d)));
4286
	    w--;
4287
	}
4288
    }
4289
 
4290
}
4291
 
4292
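/* The ADD operator with a solid source and an a8 mask on an a8 destination:
 * dst = saturating_add (src.alpha * mask, dst). */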
static void
4293
sse2_composite_add_n_8_8 (pixman_implementation_t *imp,
4294
			  pixman_composite_info_t *info)
4295
{
4296
    PIXMAN_COMPOSITE_ARGS (info);
4297
    uint8_t     *dst_line, *dst;
4298
    uint8_t     *mask_line, *mask;
4299
    int dst_stride, mask_stride;
4300
    int32_t w;
4301
    uint32_t src;
4302
    uint32_t m, d;
4303
 
4304
    __m128i xmm_alpha;
4305
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4306
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4307
 
4308
    PIXMAN_IMAGE_GET_LINE (
4309
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4310
    PIXMAN_IMAGE_GET_LINE (
4311
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4312
 
4313
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4314
 
4315
    xmm_alpha = expand_alpha_1x128 (expand_pixel_32_1x128 (src));
4316
 
4317
    while (height--)
4318
    {
4319
	dst = dst_line;
4320
	dst_line += dst_stride;
4321
	mask = mask_line;
4322
	mask_line += mask_stride;
4323
	w = width;
4324
 
4325
	while (w && ((uintptr_t)dst & 15))
4326
	{
4327
	    m = (uint32_t) *mask++;
4328
	    d = (uint32_t) *dst;
4329
 
4330
	    *dst++ = (uint8_t) pack_1x128_32 (
4331
		_mm_adds_epu16 (
4332
		    pix_multiply_1x128 (
4333
			xmm_alpha, unpack_32_1x128 (m)),
4334
		    unpack_32_1x128 (d)));
4335
	    w--;
4336
	}
4337
 
4338
	while (w >= 16)
4339
	{
4340
	    xmm_mask = load_128_unaligned ((__m128i*)mask);
4341
	    xmm_dst = load_128_aligned ((__m128i*)dst);
4342
 
4343
	    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4344
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4345
 
4346
	    pix_multiply_2x128 (&xmm_alpha, &xmm_alpha,
4347
				&xmm_mask_lo, &xmm_mask_hi,
4348
				&xmm_mask_lo, &xmm_mask_hi);
4349
 
4350
	    xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4351
	    xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4352
 
4353
	    save_128_aligned (
4354
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4355
 
4356
	    mask += 16;
4357
	    dst += 16;
4358
	    w -= 16;
4359
	}
4360
 
4361
	while (w)
4362
	{
4363
	    m = (uint32_t) *mask++;
4364
	    d = (uint32_t) *dst;
4365
 
4366
	    *dst++ = (uint8_t) pack_1x128_32 (
4367
		_mm_adds_epu16 (
4368
		    pix_multiply_1x128 (
4369
			xmm_alpha, unpack_32_1x128 (m)),
4370
		    unpack_32_1x128 (d)));
4371
 
4372
	    w--;
4373
	}
4374
    }
4375
 
4376
}
4377
 
4378
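/* The ADD operator with a solid source and no mask on an a8 destination.
 * Source alpha 0x00 is a no-op and 0xff saturates the whole rectangle, so
 * both are special-cased; otherwise the alpha byte is replicated into all
 * 16 lanes and added with unsigned saturation. */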
static void
4379
sse2_composite_add_n_8 (pixman_implementation_t *imp,
4380
			pixman_composite_info_t *info)
4381
{
4382
    PIXMAN_COMPOSITE_ARGS (info);
4383
    uint8_t     *dst_line, *dst;
4384
    int dst_stride;
4385
    int32_t w;
4386
    uint32_t src;
4387
 
4388
    __m128i xmm_src;
4389
 
4390
    PIXMAN_IMAGE_GET_LINE (
4391
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4392
 
4393
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4394
 
4395
    src >>= 24;
4396
 
4397
    if (src == 0x00)
4398
	return;
4399
 
4400
    if (src == 0xff)
4401
    {
4402
	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride,
4403
		     8, dest_x, dest_y, width, height, 0xff);
4404
 
4405
	return;
4406
    }
4407
 
4408
    src = (src << 24) | (src << 16) | (src << 8) | src;
4409
    xmm_src = _mm_set_epi32 (src, src, src, src);
4410
 
4411
    while (height--)
4412
    {
4413
	dst = dst_line;
4414
	dst_line += dst_stride;
4415
	w = width;
4416
 
4417
	while (w && ((uintptr_t)dst & 15))
4418
	{
4419
	    *dst = (uint8_t)_mm_cvtsi128_si32 (
4420
		_mm_adds_epu8 (
4421
		    xmm_src,
4422
		    _mm_cvtsi32_si128 (*dst)));
4423
 
4424
	    w--;
4425
	    dst++;
4426
	}
4427
 
4428
	while (w >= 16)
4429
	{
4430
	    save_128_aligned (
4431
		(__m128i*)dst, _mm_adds_epu8 (xmm_src, load_128_aligned  ((__m128i*)dst)));
4432
 
4433
	    dst += 16;
4434
	    w -= 16;
4435
	}
4436
 
4437
	while (w)
4438
	{
4439
	    *dst = (uint8_t)_mm_cvtsi128_si32 (
4440
		_mm_adds_epu8 (
4441
		    xmm_src,
4442
		    _mm_cvtsi32_si128 (*dst)));
4443
 
4444
	    w--;
4445
	    dst++;
4446
	}
4447
    }
4448
 
4449
}
4450
 
4451
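/* The ADD operator between two a8 images: the unaligned head and tail are
 * added with a saturating scalar expression, the rest is handed to
 * sse2_combine_add_u() four bytes at a time. */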
static void
4452
sse2_composite_add_8_8 (pixman_implementation_t *imp,
4453
			pixman_composite_info_t *info)
4454
{
4455
    PIXMAN_COMPOSITE_ARGS (info);
4456
    uint8_t     *dst_line, *dst;
4457
    uint8_t     *src_line, *src;
4458
    int dst_stride, src_stride;
4459
    int32_t w;
4460
    uint16_t t;
4461
 
4462
    PIXMAN_IMAGE_GET_LINE (
4463
	src_image, src_x, src_y, uint8_t, src_stride, src_line, 1);
4464
    PIXMAN_IMAGE_GET_LINE (
4465
	dest_image, dest_x, dest_y, uint8_t, dst_stride, dst_line, 1);
4466
 
4467
    while (height--)
4468
    {
4469
	dst = dst_line;
4470
	src = src_line;
4471
 
4472
	dst_line += dst_stride;
4473
	src_line += src_stride;
4474
	w = width;
4475
 
4476
	/* Small head */
4477
	while (w && (uintptr_t)dst & 3)
4478
	{
4479
	    t = (*dst) + (*src++);
4480
	    *dst++ = t | (0 - (t >> 8));
4481
	    w--;
4482
	}
4483
 
4484
	sse2_combine_add_u (imp, op,
4485
			    (uint32_t*)dst, (uint32_t*)src, NULL, w >> 2);
4486
 
4487
	/* Small tail */
4488
	dst += w & 0xfffc;
4489
	src += w & 0xfffc;
4490
 
4491
	w &= 3;
4492
 
4493
	while (w)
4494
	{
4495
	    t = (*dst) + (*src++);
4496
	    *dst++ = t | (0 - (t >> 8));
4497
	    w--;
4498
	}
4499
    }
4500
 
4501
}
4502
 
4503
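/* The ADD operator between two 8888 images: each scanline is handed
 * directly to sse2_combine_add_u(). */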
static void
4504
sse2_composite_add_8888_8888 (pixman_implementation_t *imp,
4505
                              pixman_composite_info_t *info)
4506
{
4507
    PIXMAN_COMPOSITE_ARGS (info);
4508
    uint32_t    *dst_line, *dst;
4509
    uint32_t    *src_line, *src;
4510
    int dst_stride, src_stride;
4511
 
4512
    PIXMAN_IMAGE_GET_LINE (
4513
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4514
    PIXMAN_IMAGE_GET_LINE (
4515
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4516
 
4517
    while (height--)
4518
    {
4519
	dst = dst_line;
4520
	dst_line += dst_stride;
4521
	src = src_line;
4522
	src_line += src_stride;
4523
 
4524
	sse2_combine_add_u (imp, op, dst, src, NULL, width);
4525
    }
4526
}
4527
 
4528
static void
4529
sse2_composite_add_n_8888 (pixman_implementation_t *imp,
4530
			   pixman_composite_info_t *info)
4531
{
4532
    PIXMAN_COMPOSITE_ARGS (info);
4533
    uint32_t *dst_line, *dst, src;
4534
    int dst_stride;
4535
 
4536
    __m128i xmm_src;
4537
 
4538
    PIXMAN_IMAGE_GET_LINE (dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4539
 
4540
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4541
    if (src == 0)
4542
	return;
4543
 
4544
    if (src == ~0)
4545
    {
4546
	pixman_fill (dest_image->bits.bits, dest_image->bits.rowstride, 32,
4547
		     dest_x, dest_y, width, height, ~0);
4548
 
4549
	return;
4550
    }
4551
 
4552
    xmm_src = _mm_set_epi32 (src, src, src, src);
4553
    while (height--)
4554
    {
4555
	int w = width;
4556
	uint32_t d;
4557
 
4558
	dst = dst_line;
4559
	dst_line += dst_stride;
4560
 
4561
	while (w && (uintptr_t)dst & 15)
4562
	{
4563
	    d = *dst;
4564
	    *dst++ =
4565
		_mm_cvtsi128_si32 ( _mm_adds_epu8 (xmm_src, _mm_cvtsi32_si128 (d)));
4566
	    w--;
4567
	}
4568
 
4569
	while (w >= 4)
4570
	{
4571
	    save_128_aligned
4572
		((__m128i*)dst,
4573
		 _mm_adds_epu8 (xmm_src, load_128_aligned ((__m128i*)dst)));
4574
 
4575
	    dst += 4;
4576
	    w -= 4;
4577
	}
4578
 
4579
	while (w--)
4580
	{
4581
	    d = *dst;
4582
	    *dst++ =
4583
		_mm_cvtsi128_si32 (_mm_adds_epu8 (xmm_src,
4584
						  _mm_cvtsi32_si128 (d)));
4585
	}
4586
    }
4587
}
4588
 
4589
static void
4590
sse2_composite_add_n_8_8888 (pixman_implementation_t *imp,
4591
			     pixman_composite_info_t *info)
4592
{
4593
    PIXMAN_COMPOSITE_ARGS (info);
4594
    uint32_t     *dst_line, *dst;
4595
    uint8_t     *mask_line, *mask;
4596
    int dst_stride, mask_stride;
4597
    int32_t w;
4598
    uint32_t src;
4599
 
4600
    __m128i xmm_src;
4601
 
4602
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
4603
    if (src == 0)
4604
	return;
4605
    xmm_src = expand_pixel_32_1x128 (src);
4606
 
4607
    PIXMAN_IMAGE_GET_LINE (
4608
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4609
    PIXMAN_IMAGE_GET_LINE (
4610
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4611
 
4612
    while (height--)
4613
    {
4614
	dst = dst_line;
4615
	dst_line += dst_stride;
4616
	mask = mask_line;
4617
	mask_line += mask_stride;
4618
	w = width;
4619
 
4620
	while (w && ((uintptr_t)dst & 15))
4621
	{
4622
	    uint8_t m = *mask++;
4623
	    if (m)
4624
	    {
4625
		*dst = pack_1x128_32
4626
		    (_mm_adds_epu16
4627
		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4628
		      unpack_32_1x128 (*dst)));
4629
	    }
4630
	    dst++;
4631
	    w--;
4632
	}
4633
 
4634
	while (w >= 4)
4635
	{
4636
	    uint32_t m = *(uint32_t*)mask;
4637
	    if (m)
4638
	    {
4639
		__m128i xmm_mask_lo, xmm_mask_hi;
4640
		__m128i xmm_dst_lo, xmm_dst_hi;
4641
 
4642
		__m128i xmm_dst = load_128_aligned ((__m128i*)dst);
4643
		__m128i xmm_mask =
4644
		    _mm_unpacklo_epi8 (unpack_32_1x128(m),
4645
				       _mm_setzero_si128 ());
4646
 
4647
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4648
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4649
 
4650
		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi,
4651
					&xmm_mask_lo, &xmm_mask_hi);
4652
 
4653
		pix_multiply_2x128 (&xmm_src, &xmm_src,
4654
				    &xmm_mask_lo, &xmm_mask_hi,
4655
				    &xmm_mask_lo, &xmm_mask_hi);
4656
 
4657
		xmm_dst_lo = _mm_adds_epu16 (xmm_mask_lo, xmm_dst_lo);
4658
		xmm_dst_hi = _mm_adds_epu16 (xmm_mask_hi, xmm_dst_hi);
4659
 
4660
		save_128_aligned (
4661
		    (__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4662
	    }
4663
 
4664
	    w -= 4;
4665
	    dst += 4;
4666
	    mask += 4;
4667
	}
4668
 
4669
	while (w)
4670
	{
4671
	    uint8_t m = *mask++;
4672
	    if (m)
4673
	    {
4674
		*dst = pack_1x128_32
4675
		    (_mm_adds_epu16
4676
		     (pix_multiply_1x128 (xmm_src, expand_pixel_8_1x128 (m)),
4677
		      unpack_32_1x128 (*dst)));
4678
	    }
4679
	    dst++;
4680
	    w--;
4681
	}
4682
    }
4683
}
4684
 
4685
static pixman_bool_t
4686
sse2_blt (pixman_implementation_t *imp,
4687
          uint32_t *               src_bits,
4688
          uint32_t *               dst_bits,
4689
          int                      src_stride,
4690
          int                      dst_stride,
4691
          int                      src_bpp,
4692
          int                      dst_bpp,
4693
          int                      src_x,
4694
          int                      src_y,
4695
          int                      dest_x,
4696
          int                      dest_y,
4697
          int                      width,
4698
          int                      height)
4699
{
4700
    uint8_t *   src_bytes;
4701
    uint8_t *   dst_bytes;
4702
    int byte_width;
4703
 
4704
    if (src_bpp != dst_bpp)
4705
	return FALSE;
4706
 
4707
    if (src_bpp == 16)
4708
    {
4709
	src_stride = src_stride * (int) sizeof (uint32_t) / 2;
4710
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 2;
4711
	src_bytes =(uint8_t *)(((uint16_t *)src_bits) + src_stride * (src_y) + (src_x));
4712
	dst_bytes = (uint8_t *)(((uint16_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4713
	byte_width = 2 * width;
4714
	src_stride *= 2;
4715
	dst_stride *= 2;
4716
    }
4717
    else if (src_bpp == 32)
4718
    {
4719
	src_stride = src_stride * (int) sizeof (uint32_t) / 4;
4720
	dst_stride = dst_stride * (int) sizeof (uint32_t) / 4;
4721
	src_bytes = (uint8_t *)(((uint32_t *)src_bits) + src_stride * (src_y) + (src_x));
4722
	dst_bytes = (uint8_t *)(((uint32_t *)dst_bits) + dst_stride * (dest_y) + (dest_x));
4723
	byte_width = 4 * width;
4724
	src_stride *= 4;
4725
	dst_stride *= 4;
4726
    }
4727
    else
4728
    {
4729
	return FALSE;
4730
    }
4731
 
4732
    while (height--)
4733
    {
4734
	int w;
4735
	uint8_t *s = src_bytes;
4736
	uint8_t *d = dst_bytes;
4737
	src_bytes += src_stride;
4738
	dst_bytes += dst_stride;
4739
	w = byte_width;
4740
 
4741
	while (w >= 2 && ((uintptr_t)d & 3))
4742
	{
4743
	    *(uint16_t *)d = *(uint16_t *)s;
4744
	    w -= 2;
4745
	    s += 2;
4746
	    d += 2;
4747
	}
4748
 
4749
	while (w >= 4 && ((uintptr_t)d & 15))
4750
	{
4751
	    *(uint32_t *)d = *(uint32_t *)s;
4752
 
4753
	    w -= 4;
4754
	    s += 4;
4755
	    d += 4;
4756
	}
4757
 
4758
	while (w >= 64)
4759
	{
4760
	    __m128i xmm0, xmm1, xmm2, xmm3;
4761
 
4762
	    xmm0 = load_128_unaligned ((__m128i*)(s));
4763
	    xmm1 = load_128_unaligned ((__m128i*)(s + 16));
4764
	    xmm2 = load_128_unaligned ((__m128i*)(s + 32));
4765
	    xmm3 = load_128_unaligned ((__m128i*)(s + 48));
4766
 
4767
	    save_128_aligned ((__m128i*)(d),    xmm0);
4768
	    save_128_aligned ((__m128i*)(d + 16), xmm1);
4769
	    save_128_aligned ((__m128i*)(d + 32), xmm2);
4770
	    save_128_aligned ((__m128i*)(d + 48), xmm3);
4771
 
4772
	    s += 64;
4773
	    d += 64;
4774
	    w -= 64;
4775
	}
4776
 
4777
	while (w >= 16)
4778
	{
4779
	    save_128_aligned ((__m128i*)d, load_128_unaligned ((__m128i*)s) );
4780
 
4781
	    w -= 16;
4782
	    d += 16;
4783
	    s += 16;
4784
	}
4785
 
4786
	while (w >= 4)
4787
	{
4788
	    *(uint32_t *)d = *(uint32_t *)s;
4789
 
4790
	    w -= 4;
4791
	    s += 4;
4792
	    d += 4;
4793
	}
4794
 
4795
	if (w >= 2)
4796
	{
4797
	    *(uint16_t *)d = *(uint16_t *)s;
4798
	    w -= 2;
4799
	    s += 2;
4800
	    d += 2;
4801
	}
4802
    }
4803
 
4804
    return TRUE;
4805
}
4806
 
4807
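/* SRC copy of a rectangle, implemented on top of sse2_blt(). */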
static void
4808
sse2_composite_copy_area (pixman_implementation_t *imp,
4809
                          pixman_composite_info_t *info)
4810
{
4811
    PIXMAN_COMPOSITE_ARGS (info);
4812
    sse2_blt (imp, src_image->bits.bits,
4813
	      dest_image->bits.bits,
4814
	      src_image->bits.rowstride,
4815
	      dest_image->bits.rowstride,
4816
	      PIXMAN_FORMAT_BPP (src_image->bits.format),
4817
	      PIXMAN_FORMAT_BPP (dest_image->bits.format),
4818
	      src_x, src_y, dest_x, dest_y, width, height);
4819
}
4820
 
4821
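/* OVER with an x8r8g8b8 source, an a8 mask and an 8888 destination:
 * the source is forced opaque (its alpha is set to 0xff) before the
 * masked OVER. */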
static void
4822
sse2_composite_over_x888_8_8888 (pixman_implementation_t *imp,
4823
                                 pixman_composite_info_t *info)
4824
{
4825
    PIXMAN_COMPOSITE_ARGS (info);
4826
    uint32_t    *src, *src_line, s;
4827
    uint32_t    *dst, *dst_line, d;
4828
    uint8_t         *mask, *mask_line;
4829
    uint32_t m;
4830
    int src_stride, mask_stride, dst_stride;
4831
    int32_t w;
4832
    __m128i ms;
4833
 
4834
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
4835
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4836
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4837
 
4838
    PIXMAN_IMAGE_GET_LINE (
4839
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4840
    PIXMAN_IMAGE_GET_LINE (
4841
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4842
    PIXMAN_IMAGE_GET_LINE (
4843
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4844
 
4845
    while (height--)
4846
    {
4847
        src = src_line;
4848
        src_line += src_stride;
4849
        dst = dst_line;
4850
        dst_line += dst_stride;
4851
        mask = mask_line;
4852
        mask_line += mask_stride;
4853
 
4854
        w = width;
4855
 
4856
        while (w && (uintptr_t)dst & 15)
4857
        {
4858
            s = 0xff000000 | *src++;
4859
            m = (uint32_t) *mask++;
4860
            d = *dst;
4861
            ms = unpack_32_1x128 (s);
4862
 
4863
            if (m != 0xff)
4864
            {
4865
		__m128i ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4866
		__m128i md = unpack_32_1x128 (d);
4867
 
4868
                ms = in_over_1x128 (&ms, &mask_00ff, &ma, &md);
4869
            }
4870
 
4871
            *dst++ = pack_1x128_32 (ms);
4872
            w--;
4873
        }
4874
 
4875
        while (w >= 4)
4876
        {
4877
            m = *(uint32_t*) mask;
4878
            xmm_src = _mm_or_si128 (
4879
		load_128_unaligned ((__m128i*)src), mask_ff000000);
4880
 
4881
            if (m == 0xffffffff)
4882
            {
4883
                save_128_aligned ((__m128i*)dst, xmm_src);
4884
            }
4885
            else
4886
            {
4887
                xmm_dst = load_128_aligned ((__m128i*)dst);
4888
 
4889
                xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
4890
 
4891
                unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
4892
                unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
4893
                unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
4894
 
4895
                expand_alpha_rev_2x128 (
4896
		    xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
4897
 
4898
                in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
4899
			       &mask_00ff, &mask_00ff, &xmm_mask_lo, &xmm_mask_hi,
4900
			       &xmm_dst_lo, &xmm_dst_hi);
4901
 
4902
                save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
4903
            }
4904
 
4905
            src += 4;
4906
            dst += 4;
4907
            mask += 4;
4908
            w -= 4;
4909
        }
4910
 
4911
        while (w)
4912
        {
4913
            m = (uint32_t) *mask++;
4914
 
4915
            if (m)
4916
            {
4917
                s = 0xff000000 | *src;
4918
 
4919
                if (m == 0xff)
4920
                {
4921
                    *dst = s;
4922
                }
4923
                else
4924
                {
4925
		    __m128i ma, md, ms;
4926
 
4927
                    d = *dst;
4928
 
4929
		    ma = expand_alpha_rev_1x128 (unpack_32_1x128 (m));
4930
		    md = unpack_32_1x128 (d);
4931
		    ms = unpack_32_1x128 (s);
4932
 
4933
                    *dst = pack_1x128_32 (in_over_1x128 (&ms, &mask_00ff, &ma, &md));
4934
                }
4935
 
4936
            }
4937
 
4938
            src++;
4939
            dst++;
4940
            w--;
4941
        }
4942
    }
4943
 
4944
}
4945
 
4946
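/* OVER with an a8r8g8b8 source, an a8 mask and an 8888 destination;
 * four-pixel groups whose source is opaque and whose four mask bytes
 * are all 0xff are stored directly. */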
static void
4947
sse2_composite_over_8888_8_8888 (pixman_implementation_t *imp,
4948
                                 pixman_composite_info_t *info)
4949
{
4950
    PIXMAN_COMPOSITE_ARGS (info);
4951
    uint32_t    *src, *src_line, s;
4952
    uint32_t    *dst, *dst_line, d;
4953
    uint8_t         *mask, *mask_line;
4954
    uint32_t m;
4955
    int src_stride, mask_stride, dst_stride;
4956
    int32_t w;
4957
 
4958
    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
4959
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
4960
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
4961
 
4962
    PIXMAN_IMAGE_GET_LINE (
4963
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
4964
    PIXMAN_IMAGE_GET_LINE (
4965
	mask_image, mask_x, mask_y, uint8_t, mask_stride, mask_line, 1);
4966
    PIXMAN_IMAGE_GET_LINE (
4967
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
4968
 
4969
    while (height--)
4970
    {
4971
        src = src_line;
4972
        src_line += src_stride;
4973
        dst = dst_line;
4974
        dst_line += dst_stride;
4975
        mask = mask_line;
4976
        mask_line += mask_stride;
4977
 
4978
        w = width;
4979
 
4980
        while (w && (uintptr_t)dst & 15)
4981
        {
4982
	    uint32_t sa;
4983
 
4984
            s = *src++;
4985
            m = (uint32_t) *mask++;
4986
            d = *dst;
4987
 
4988
	    sa = s >> 24;
4989
 
4990
	    if (m)
4991
	    {
4992
		if (sa == 0xff && m == 0xff)
4993
		{
4994
		    *dst = s;
4995
		}
4996
		else
4997
		{
4998
		    __m128i ms, md, ma, msa;
4999
 
5000
		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5001
		    ms = unpack_32_1x128 (s);
5002
		    md = unpack_32_1x128 (d);
5003
 
5004
		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5005
 
5006
		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5007
		}
5008
	    }
5009
 
5010
	    dst++;
5011
            w--;
5012
        }
5013
 
5014
        while (w >= 4)
5015
        {
5016
            m = *(uint32_t *) mask;
5017
 
5018
	    if (m)
5019
	    {
5020
		xmm_src = load_128_unaligned ((__m128i*)src);
5021
 
5022
		if (m == 0xffffffff && is_opaque (xmm_src))
5023
		{
5024
		    save_128_aligned ((__m128i *)dst, xmm_src);
5025
		}
5026
		else
5027
		{
5028
		    xmm_dst = load_128_aligned ((__m128i *)dst);
5029
 
5030
		    xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5031
 
5032
		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5033
		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5034
		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5035
 
5036
		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5037
		    expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5038
 
5039
		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5040
				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5041
 
5042
		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5043
		}
5044
	    }
5045
 
5046
            src += 4;
5047
            dst += 4;
5048
            mask += 4;
5049
            w -= 4;
5050
        }
5051
 
5052
        while (w)
5053
        {
5054
	    uint32_t sa;
5055
 
5056
            s = *src++;
5057
            m = (uint32_t) *mask++;
5058
            d = *dst;
5059
 
5060
	    sa = s >> 24;
5061
 
5062
	    if (m)
5063
	    {
5064
		if (sa == 0xff && m == 0xff)
5065
		{
5066
		    *dst = s;
5067
		}
5068
		else
5069
		{
5070
		    __m128i ms, md, ma, msa;
5071
 
5072
		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5073
		    ms = unpack_32_1x128 (s);
5074
		    md = unpack_32_1x128 (d);
5075
 
5076
		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5077
 
5078
		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5079
		}
5080
	    }
5081
 
5082
	    dst++;
5083
            w--;
5084
        }
5085
    }
5086
 
5087
}
5088
 
5089
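/* OVER_REVERSE with a solid source: the destination is composited over
 * the source colour, so dst and src swap roles in over_2x128(). */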
static void
5090
sse2_composite_over_reverse_n_8888 (pixman_implementation_t *imp,
5091
				    pixman_composite_info_t *info)
5092
{
5093
    PIXMAN_COMPOSITE_ARGS (info);
5094
    uint32_t src;
5095
    uint32_t    *dst_line, *dst;
5096
    __m128i xmm_src;
5097
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5098
    __m128i xmm_dsta_hi, xmm_dsta_lo;
5099
    int dst_stride;
5100
    int32_t w;
5101
 
5102
    src = _pixman_image_get_solid (imp, src_image, dest_image->bits.format);
5103
 
5104
    if (src == 0)
5105
	return;
5106
 
5107
    PIXMAN_IMAGE_GET_LINE (
5108
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5109
 
5110
    xmm_src = expand_pixel_32_1x128 (src);
5111
 
5112
    while (height--)
5113
    {
5114
	dst = dst_line;
5115
 
5116
	dst_line += dst_stride;
5117
	w = width;
5118
 
5119
	while (w && (uintptr_t)dst & 15)
5120
	{
5121
	    __m128i vd;
5122
 
5123
	    vd = unpack_32_1x128 (*dst);
5124
 
5125
	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5126
					      xmm_src));
5127
	    w--;
5128
	    dst++;
5129
	}
5130
 
5131
	while (w >= 4)
5132
	{
5133
	    __m128i tmp_lo, tmp_hi;
5134
 
5135
	    xmm_dst = load_128_aligned ((__m128i*)dst);
5136
 
5137
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5138
	    expand_alpha_2x128 (xmm_dst_lo, xmm_dst_hi, &xmm_dsta_lo, &xmm_dsta_hi);
5139
 
5140
	    tmp_lo = xmm_src;
5141
	    tmp_hi = xmm_src;
5142
 
5143
	    over_2x128 (&xmm_dst_lo, &xmm_dst_hi,
5144
			&xmm_dsta_lo, &xmm_dsta_hi,
5145
			&tmp_lo, &tmp_hi);
5146
 
5147
	    save_128_aligned (
5148
		(__m128i*)dst, pack_2x128_128 (tmp_lo, tmp_hi));
5149
 
5150
	    w -= 4;
5151
	    dst += 4;
5152
	}
5153
 
5154
	while (w)
5155
	{
5156
	    __m128i vd;
5157
 
5158
	    vd = unpack_32_1x128 (*dst);
5159
 
5160
	    *dst = pack_1x128_32 (over_1x128 (vd, expand_alpha_1x128 (vd),
5161
					      xmm_src));
5162
	    w--;
5163
	    dst++;
5164
	}
5165
 
5166
    }
5167
 
5168
}
5169
 
5170
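/* OVER with an a8r8g8b8 source, an a8r8g8b8 mask (only its alpha channel
 * is used) and an 8888 destination. */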
static void
5171
sse2_composite_over_8888_8888_8888 (pixman_implementation_t *imp,
5172
				    pixman_composite_info_t *info)
5173
{
5174
    PIXMAN_COMPOSITE_ARGS (info);
5175
    uint32_t    *src, *src_line, s;
5176
    uint32_t    *dst, *dst_line, d;
5177
    uint32_t    *mask, *mask_line;
5178
    uint32_t    m;
5179
    int src_stride, mask_stride, dst_stride;
5180
    int32_t w;
5181
 
5182
    __m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5183
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5184
    __m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5185
 
5186
    PIXMAN_IMAGE_GET_LINE (
5187
	dest_image, dest_x, dest_y, uint32_t, dst_stride, dst_line, 1);
5188
    PIXMAN_IMAGE_GET_LINE (
5189
	mask_image, mask_x, mask_y, uint32_t, mask_stride, mask_line, 1);
5190
    PIXMAN_IMAGE_GET_LINE (
5191
	src_image, src_x, src_y, uint32_t, src_stride, src_line, 1);
5192
 
5193
    while (height--)
5194
    {
5195
        src = src_line;
5196
        src_line += src_stride;
5197
        dst = dst_line;
5198
        dst_line += dst_stride;
5199
        mask = mask_line;
5200
        mask_line += mask_stride;
5201
 
5202
        w = width;
5203
 
5204
        while (w && (uintptr_t)dst & 15)
5205
        {
5206
	    uint32_t sa;
5207
 
5208
            s = *src++;
5209
            m = (*mask++) >> 24;
5210
            d = *dst;
5211
 
5212
	    sa = s >> 24;
5213
 
5214
	    if (m)
5215
	    {
5216
		if (sa == 0xff && m == 0xff)
5217
		{
5218
		    *dst = s;
5219
		}
5220
		else
5221
		{
5222
		    __m128i ms, md, ma, msa;
5223
 
5224
		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5225
		    ms = unpack_32_1x128 (s);
5226
		    md = unpack_32_1x128 (d);
5227
 
5228
		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5229
 
5230
		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5231
		}
5232
	    }
5233
 
5234
	    dst++;
5235
            w--;
5236
        }
5237
 
5238
        while (w >= 4)
5239
        {
5240
	    xmm_mask = load_128_unaligned ((__m128i*)mask);
5241
 
5242
	    if (!is_transparent (xmm_mask))
5243
	    {
5244
		xmm_src = load_128_unaligned ((__m128i*)src);
5245
 
5246
		if (is_opaque (xmm_mask) && is_opaque (xmm_src))
5247
		{
5248
		    save_128_aligned ((__m128i *)dst, xmm_src);
5249
		}
5250
		else
5251
		{
5252
		    xmm_dst = load_128_aligned ((__m128i *)dst);
5253
 
5254
		    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5255
		    unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5256
		    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5257
 
5258
		    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5259
		    expand_alpha_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5260
 
5261
		    in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5262
				   &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5263
 
5264
		    save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5265
		}
5266
	    }
5267
 
5268
            src += 4;
5269
            dst += 4;
5270
            mask += 4;
5271
            w -= 4;
5272
        }
5273
 
5274
        while (w)
5275
        {
5276
	    uint32_t sa;
5277
 
5278
            s = *src++;
5279
            m = (*mask++) >> 24;
5280
            d = *dst;
5281
 
5282
	    sa = s >> 24;
5283
 
5284
	    if (m)
5285
	    {
5286
		if (sa == 0xff && m == 0xff)
5287
		{
5288
		    *dst = s;
5289
		}
5290
		else
5291
		{
5292
		    __m128i ms, md, ma, msa;
5293
 
5294
		    ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5295
		    ms = unpack_32_1x128 (s);
5296
		    md = unpack_32_1x128 (d);
5297
 
5298
		    msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5299
 
5300
		    *dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5301
		}
5302
	    }
5303
 
5304
	    dst++;
5305
            w--;
5306
        }
5307
    }
5308
 
5309
}
5310
 
5311
/* A variant of 'sse2_combine_over_u' with minor tweaks */
5312
static force_inline void
5313
scaled_nearest_scanline_sse2_8888_8888_OVER (uint32_t*       pd,
5314
                                             const uint32_t* ps,
5315
                                             int32_t         w,
5316
                                             pixman_fixed_t  vx,
5317
                                             pixman_fixed_t  unit_x,
5318
                                             pixman_fixed_t  src_width_fixed,
5319
                                             pixman_bool_t   fully_transparent_src)
5320
{
5321
    uint32_t s, d;
5322
    const uint32_t* pm = NULL;
5323
 
5324
    __m128i xmm_dst_lo, xmm_dst_hi;
5325
    __m128i xmm_src_lo, xmm_src_hi;
5326
    __m128i xmm_alpha_lo, xmm_alpha_hi;
5327
 
5328
    if (fully_transparent_src)
5329
	return;
5330
 
5331
    /* Align dst on a 16-byte boundary */
5332
    while (w && ((uintptr_t)pd & 15))
5333
    {
5334
	d = *pd;
5335
	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5336
	vx += unit_x;
5337
	while (vx >= 0)
5338
	    vx -= src_width_fixed;
5339
 
5340
	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
5341
	if (pm)
5342
	    pm++;
5343
	w--;
5344
    }
5345
 
5346
    while (w >= 4)
5347
    {
5348
	__m128i tmp;
5349
	uint32_t tmp1, tmp2, tmp3, tmp4;
5350
 
5351
	tmp1 = *(ps + pixman_fixed_to_int (vx));
5352
	vx += unit_x;
5353
	while (vx >= 0)
5354
	    vx -= src_width_fixed;
5355
	tmp2 = *(ps + pixman_fixed_to_int (vx));
5356
	vx += unit_x;
5357
	while (vx >= 0)
5358
	    vx -= src_width_fixed;
5359
	tmp3 = *(ps + pixman_fixed_to_int (vx));
5360
	vx += unit_x;
5361
	while (vx >= 0)
5362
	    vx -= src_width_fixed;
5363
	tmp4 = *(ps + pixman_fixed_to_int (vx));
5364
	vx += unit_x;
5365
	while (vx >= 0)
5366
	    vx -= src_width_fixed;
5367
 
5368
	tmp = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5369
 
5370
	xmm_src_hi = combine4 ((__m128i*)&tmp, (__m128i*)pm);
5371
 
5372
	if (is_opaque (xmm_src_hi))
5373
	{
5374
	    save_128_aligned ((__m128i*)pd, xmm_src_hi);
5375
	}
5376
	else if (!is_zero (xmm_src_hi))
5377
	{
5378
	    xmm_dst_hi = load_128_aligned ((__m128i*) pd);
5379
 
5380
	    unpack_128_2x128 (xmm_src_hi, &xmm_src_lo, &xmm_src_hi);
5381
	    unpack_128_2x128 (xmm_dst_hi, &xmm_dst_lo, &xmm_dst_hi);
5382
 
5383
	    expand_alpha_2x128 (
5384
		xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5385
 
5386
	    over_2x128 (&xmm_src_lo, &xmm_src_hi,
5387
			&xmm_alpha_lo, &xmm_alpha_hi,
5388
			&xmm_dst_lo, &xmm_dst_hi);
5389
 
5390
	    /* rebuild the 4 pixel data and save */
5391
	    save_128_aligned ((__m128i*)pd,
5392
			      pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5393
	}
5394
 
5395
	w -= 4;
5396
	pd += 4;
5397
	if (pm)
5398
	    pm += 4;
5399
    }
5400
 
5401
    while (w)
5402
    {
5403
	d = *pd;
5404
	s = combine1 (ps + pixman_fixed_to_int (vx), pm);
5405
	vx += unit_x;
5406
	while (vx >= 0)
5407
	    vx -= src_width_fixed;
5408
 
5409
	*pd++ = core_combine_over_u_pixel_sse2 (s, d);
5410
	if (pm)
5411
	    pm++;
5412
 
5413
	w--;
5414
    }
5415
}
5416
 
5417
FAST_NEAREST_MAINLOOP (sse2_8888_8888_cover_OVER,
5418
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
5419
		       uint32_t, uint32_t, COVER)
5420
FAST_NEAREST_MAINLOOP (sse2_8888_8888_none_OVER,
5421
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
5422
		       uint32_t, uint32_t, NONE)
5423
FAST_NEAREST_MAINLOOP (sse2_8888_8888_pad_OVER,
5424
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
5425
		       uint32_t, uint32_t, PAD)
5426
FAST_NEAREST_MAINLOOP (sse2_8888_8888_normal_OVER,
5427
		       scaled_nearest_scanline_sse2_8888_8888_OVER,
5428
		       uint32_t, uint32_t, NORMAL)
5429
 
5430
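/* Nearest-neighbour scaled OVER with a solid mask: the mask alpha is
 * expanded once into xmm_mask and applied to every fetched source pixel. */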
static force_inline void
5431
scaled_nearest_scanline_sse2_8888_n_8888_OVER (const uint32_t * mask,
5432
					       uint32_t *       dst,
5433
					       const uint32_t * src,
5434
					       int32_t          w,
5435
					       pixman_fixed_t   vx,
5436
					       pixman_fixed_t   unit_x,
5437
					       pixman_fixed_t   src_width_fixed,
5438
					       pixman_bool_t    zero_src)
5439
{
5440
    __m128i xmm_mask;
5441
    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
5442
    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5443
    __m128i xmm_alpha_lo, xmm_alpha_hi;
5444
 
5445
    if (zero_src || (*mask >> 24) == 0)
5446
	return;
5447
 
5448
    xmm_mask = create_mask_16_128 (*mask >> 24);
5449
 
5450
    while (w && (uintptr_t)dst & 15)
5451
    {
5452
	uint32_t s = *(src + pixman_fixed_to_int (vx));
5453
	vx += unit_x;
5454
	while (vx >= 0)
5455
	    vx -= src_width_fixed;
5456
 
5457
	if (s)
5458
	{
5459
	    uint32_t d = *dst;
5460
 
5461
	    __m128i ms = unpack_32_1x128 (s);
5462
	    __m128i alpha     = expand_alpha_1x128 (ms);
5463
	    __m128i dest      = xmm_mask;
5464
	    __m128i alpha_dst = unpack_32_1x128 (d);
5465
 
5466
	    *dst = pack_1x128_32 (
5467
		in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
5468
	}
5469
	dst++;
5470
	w--;
5471
    }
5472
 
5473
    while (w >= 4)
5474
    {
5475
	uint32_t tmp1, tmp2, tmp3, tmp4;
5476
 
5477
	tmp1 = *(src + pixman_fixed_to_int (vx));
5478
	vx += unit_x;
5479
	while (vx >= 0)
5480
	    vx -= src_width_fixed;
5481
	tmp2 = *(src + pixman_fixed_to_int (vx));
5482
	vx += unit_x;
5483
	while (vx >= 0)
5484
	    vx -= src_width_fixed;
5485
	tmp3 = *(src + pixman_fixed_to_int (vx));
5486
	vx += unit_x;
5487
	while (vx >= 0)
5488
	    vx -= src_width_fixed;
5489
	tmp4 = *(src + pixman_fixed_to_int (vx));
5490
	vx += unit_x;
5491
	while (vx >= 0)
5492
	    vx -= src_width_fixed;
5493
 
5494
	xmm_src = _mm_set_epi32 (tmp4, tmp3, tmp2, tmp1);
5495
 
5496
	if (!is_zero (xmm_src))
5497
	{
5498
	    xmm_dst = load_128_aligned ((__m128i*)dst);
5499
 
5500
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5501
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5502
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
5503
			        &xmm_alpha_lo, &xmm_alpha_hi);
5504
 
5505
	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
5506
			   &xmm_alpha_lo, &xmm_alpha_hi,
5507
			   &xmm_mask, &xmm_mask,
5508
			   &xmm_dst_lo, &xmm_dst_hi);
5509
 
5510
	    save_128_aligned (
5511
		(__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5512
	}
5513
 
5514
	dst += 4;
5515
	w -= 4;
5516
    }
5517
 
5518
    while (w)
5519
    {
5520
	uint32_t s = *(src + pixman_fixed_to_int (vx));
5521
	vx += unit_x;
5522
	while (vx >= 0)
5523
	    vx -= src_width_fixed;
5524
 
5525
	if (s)
5526
	{
5527
	    uint32_t d = *dst;
5528
 
5529
	    __m128i ms = unpack_32_1x128 (s);
5530
	    __m128i alpha = expand_alpha_1x128 (ms);
5531
	    __m128i mask  = xmm_mask;
5532
	    __m128i dest  = unpack_32_1x128 (d);
5533
 
5534
	    *dst = pack_1x128_32 (
5535
		in_over_1x128 (&ms, &alpha, &mask, &dest));
5536
	}
5537
 
5538
	dst++;
5539
	w--;
5540
    }
5541
 
5542
}
5543
 
5544
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
5545
			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5546
			      uint32_t, uint32_t, uint32_t, COVER, TRUE, TRUE)
5547
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
5548
			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5549
			      uint32_t, uint32_t, uint32_t, PAD, TRUE, TRUE)
5550
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
5551
			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5552
			      uint32_t, uint32_t, uint32_t, NONE, TRUE, TRUE)
5553
FAST_NEAREST_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
5554
			      scaled_nearest_scanline_sse2_8888_n_8888_OVER,
5555
			      uint32_t, uint32_t, uint32_t, NORMAL, TRUE, TRUE)
5556
 
5557
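/* Bilinear scaling helpers.  A 2x2 source block is fetched, interpolated
 * vertically with the wt/wb weights and horizontally with weights derived
 * from vx.  Two variable layouts exist: with fewer than 8 interpolation
 * bits the horizontal step can use _mm_madd_epi16, otherwise the wider
 * mullo/mulhi path below is taken. */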
#if BILINEAR_INTERPOLATION_BITS < 8
5558
# define BILINEAR_DECLARE_VARIABLES						\
5559
    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
5560
    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
5561
    const __m128i xmm_addc = _mm_set_epi16 (0, 1, 0, 1, 0, 1, 0, 1);		\
5562
    const __m128i xmm_ux = _mm_set_epi16 (unit_x, -unit_x, unit_x, -unit_x,	\
5563
					  unit_x, -unit_x, unit_x, -unit_x);	\
5564
    const __m128i xmm_zero = _mm_setzero_si128 ();				\
5565
    __m128i xmm_x = _mm_set_epi16 (vx, -(vx + 1), vx, -(vx + 1),		\
5566
				   vx, -(vx + 1), vx, -(vx + 1))
5567
#else
5568
# define BILINEAR_DECLARE_VARIABLES						\
5569
    const __m128i xmm_wt = _mm_set_epi16 (wt, wt, wt, wt, wt, wt, wt, wt);	\
5570
    const __m128i xmm_wb = _mm_set_epi16 (wb, wb, wb, wb, wb, wb, wb, wb);	\
5571
    const __m128i xmm_addc = _mm_set_epi16 (0, 0, 0, 0, 1, 1, 1, 1);		\
5572
    const __m128i xmm_ux = _mm_set_epi16 (unit_x, unit_x, unit_x, unit_x,	\
5573
					  -unit_x, -unit_x, -unit_x, -unit_x);	\
5574
    const __m128i xmm_zero = _mm_setzero_si128 ();				\
5575
    __m128i xmm_x = _mm_set_epi16 (vx, vx, vx, vx,				\
5576
				   -(vx + 1), -(vx + 1), -(vx + 1), -(vx + 1))
5577
#endif
5578
 
5579
#define BILINEAR_INTERPOLATE_ONE_PIXEL(pix)					\
5580
do {										\
5581
    __m128i xmm_wh, xmm_lo, xmm_hi, a;						\
5582
    /* fetch 2x2 pixel block into sse2 registers */				\
5583
    __m128i tltr = _mm_loadl_epi64 (						\
5584
			    (__m128i *)&src_top[pixman_fixed_to_int (vx)]);	\
5585
    __m128i blbr = _mm_loadl_epi64 (						\
5586
			    (__m128i *)&src_bottom[pixman_fixed_to_int (vx)]);	\
5587
    vx += unit_x;								\
5588
    /* vertical interpolation */						\
5589
    a = _mm_add_epi16 (_mm_mullo_epi16 (_mm_unpacklo_epi8 (tltr, xmm_zero),	\
5590
					xmm_wt),				\
5591
		       _mm_mullo_epi16 (_mm_unpacklo_epi8 (blbr, xmm_zero),	\
5592
					xmm_wb));				\
5593
    if (BILINEAR_INTERPOLATION_BITS < 8)					\
5594
    {										\
5595
	/* calculate horizontal weights */					\
5596
	xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
5597
					16 - BILINEAR_INTERPOLATION_BITS));	\
5598
	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
5599
	/* horizontal interpolation */						\
5600
	a = _mm_madd_epi16 (_mm_unpackhi_epi16 (_mm_shuffle_epi32 (		\
5601
		a, _MM_SHUFFLE (1, 0, 3, 2)), a), xmm_wh);			\
5602
    }										\
5603
    else									\
5604
    {										\
5605
	/* calculate horizontal weights */					\
5606
	xmm_wh = _mm_add_epi16 (xmm_addc, _mm_srli_epi16 (xmm_x,		\
5607
					16 - BILINEAR_INTERPOLATION_BITS));	\
5608
	xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
5609
	/* horizontal interpolation */						\
5610
	xmm_lo = _mm_mullo_epi16 (a, xmm_wh);					\
5611
	xmm_hi = _mm_mulhi_epu16 (a, xmm_wh);					\
5612
	a = _mm_add_epi32 (_mm_unpacklo_epi16 (xmm_lo, xmm_hi),			\
5613
			   _mm_unpackhi_epi16 (xmm_lo, xmm_hi));		\
5614
    }										\
5615
    /* shift and pack the result */						\
5616
    a = _mm_srli_epi32 (a, BILINEAR_INTERPOLATION_BITS * 2);			\
5617
    a = _mm_packs_epi32 (a, a);							\
5618
    a = _mm_packus_epi16 (a, a);						\
5619
    pix = _mm_cvtsi128_si32 (a);						\
5620
} while (0)
5621
 
5622
#define BILINEAR_SKIP_ONE_PIXEL()						\
5623
do {										\
5624
    vx += unit_x;								\
5625
    xmm_x = _mm_add_epi16 (xmm_x, xmm_ux);					\
5626
} while (0)
5627
 
5628
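/* Bilinear-scaled SRC: interpolated pixels are written straight to the
 * destination, four, two and finally one at a time. */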
static force_inline void
5629
scaled_bilinear_scanline_sse2_8888_8888_SRC (uint32_t *       dst,
5630
					     const uint32_t * mask,
5631
					     const uint32_t * src_top,
5632
					     const uint32_t * src_bottom,
5633
					     int32_t          w,
5634
					     int              wt,
5635
					     int              wb,
5636
					     pixman_fixed_t   vx,
5637
					     pixman_fixed_t   unit_x,
5638
					     pixman_fixed_t   max_vx,
5639
					     pixman_bool_t    zero_src)
5640
{
5641
    BILINEAR_DECLARE_VARIABLES;
5642
    uint32_t pix1, pix2, pix3, pix4;
5643
 
5644
    while ((w -= 4) >= 0)
5645
    {
5646
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5647
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5648
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5649
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5650
	*dst++ = pix1;
5651
	*dst++ = pix2;
5652
	*dst++ = pix3;
5653
	*dst++ = pix4;
5654
    }
5655
 
5656
    if (w & 2)
5657
    {
5658
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5659
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5660
	*dst++ = pix1;
5661
	*dst++ = pix2;
5662
    }
5663
 
5664
    if (w & 1)
5665
    {
5666
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5667
	*dst = pix1;
5668
    }
5669
 
5670
}
5671
 
5672
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_SRC,
5673
			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
5674
			       uint32_t, uint32_t, uint32_t,
5675
			       COVER, FLAG_NONE)
5676
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_SRC,
5677
			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
5678
			       uint32_t, uint32_t, uint32_t,
5679
			       PAD, FLAG_NONE)
5680
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_SRC,
5681
			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
5682
			       uint32_t, uint32_t, uint32_t,
5683
			       NONE, FLAG_NONE)
5684
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_SRC,
5685
			       scaled_bilinear_scanline_sse2_8888_8888_SRC,
5686
			       uint32_t, uint32_t, uint32_t,
5687
			       NORMAL, FLAG_NONE)
5688
 
5689
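/* Bilinear-scaled OVER: groups of four interpolated pixels that are fully
 * transparent are skipped and fully opaque ones are stored directly. */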
static force_inline void
5690
scaled_bilinear_scanline_sse2_8888_8888_OVER (uint32_t *       dst,
5691
					      const uint32_t * mask,
5692
					      const uint32_t * src_top,
5693
					      const uint32_t * src_bottom,
5694
					      int32_t          w,
5695
					      int              wt,
5696
					      int              wb,
5697
					      pixman_fixed_t   vx,
5698
					      pixman_fixed_t   unit_x,
5699
					      pixman_fixed_t   max_vx,
5700
					      pixman_bool_t    zero_src)
5701
{
5702
    BILINEAR_DECLARE_VARIABLES;
5703
    uint32_t pix1, pix2, pix3, pix4;
5704
 
5705
    while (w && ((uintptr_t)dst & 15))
5706
    {
5707
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5708
 
5709
	if (pix1)
5710
	{
5711
	    pix2 = *dst;
5712
	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5713
	}
5714
 
5715
	w--;
5716
	dst++;
5717
    }
5718
 
5719
    while (w  >= 4)
5720
    {
5721
	__m128i xmm_src;
5722
	__m128i xmm_src_hi, xmm_src_lo, xmm_dst_hi, xmm_dst_lo;
5723
	__m128i xmm_alpha_hi, xmm_alpha_lo;
5724
 
5725
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5726
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5727
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5728
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5729
 
5730
	xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
5731
 
5732
	if (!is_zero (xmm_src))
5733
	{
5734
	    if (is_opaque (xmm_src))
5735
	    {
5736
		save_128_aligned ((__m128i *)dst, xmm_src);
5737
	    }
5738
	    else
5739
	    {
5740
		__m128i xmm_dst = load_128_aligned ((__m128i *)dst);
5741
 
5742
		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5743
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5744
 
5745
		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi);
5746
		over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_alpha_lo, &xmm_alpha_hi,
5747
			    &xmm_dst_lo, &xmm_dst_hi);
5748
 
5749
		save_128_aligned ((__m128i *)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5750
	    }
5751
	}
5752
 
5753
	w -= 4;
5754
	dst += 4;
5755
    }
5756
 
5757
    while (w)
5758
    {
5759
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5760
 
5761
	if (pix1)
5762
	{
5763
	    pix2 = *dst;
5764
	    *dst = core_combine_over_u_pixel_sse2 (pix1, pix2);
5765
	}
5766
 
5767
	w--;
5768
	dst++;
5769
    }
5770
}
5771
 
5772
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_cover_OVER,
5773
			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
5774
			       uint32_t, uint32_t, uint32_t,
5775
			       COVER, FLAG_NONE)
5776
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_pad_OVER,
5777
			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
5778
			       uint32_t, uint32_t, uint32_t,
5779
			       PAD, FLAG_NONE)
5780
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_none_OVER,
5781
			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
5782
			       uint32_t, uint32_t, uint32_t,
5783
			       NONE, FLAG_NONE)
5784
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8888_normal_OVER,
5785
			       scaled_bilinear_scanline_sse2_8888_8888_OVER,
5786
			       uint32_t, uint32_t, uint32_t,
5787
			       NORMAL, FLAG_NONE)
5788
 
5789
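/* Bilinear-scaled OVER with an a8 mask: zero mask bytes skip the
 * interpolation entirely via BILINEAR_SKIP_ONE_PIXEL(). */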
static force_inline void
5790
scaled_bilinear_scanline_sse2_8888_8_8888_OVER (uint32_t *       dst,
5791
						const uint8_t  * mask,
5792
						const uint32_t * src_top,
5793
						const uint32_t * src_bottom,
5794
						int32_t          w,
5795
						int              wt,
5796
						int              wb,
5797
						pixman_fixed_t   vx,
5798
						pixman_fixed_t   unit_x,
5799
						pixman_fixed_t   max_vx,
5800
						pixman_bool_t    zero_src)
5801
{
5802
    BILINEAR_DECLARE_VARIABLES;
5803
    uint32_t pix1, pix2, pix3, pix4;
5804
    uint32_t m;
5805
 
5806
    while (w && ((uintptr_t)dst & 15))
5807
    {
5808
	uint32_t sa;
5809
 
5810
	m = (uint32_t) *mask++;
5811
 
5812
	if (m)
5813
	{
5814
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5815
	    sa = pix1 >> 24;
5816
 
5817
	    if (sa == 0xff && m == 0xff)
5818
	    {
5819
		*dst = pix1;
5820
	    }
5821
	    else
5822
	    {
5823
		__m128i ms, md, ma, msa;
5824
 
5825
		pix2 = *dst;
5826
		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5827
		ms = unpack_32_1x128 (pix1);
5828
		md = unpack_32_1x128 (pix2);
5829
 
5830
		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5831
 
5832
		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5833
	    }
5834
	}
5835
	else
5836
	{
5837
	    BILINEAR_SKIP_ONE_PIXEL ();
5838
	}
5839
 
5840
	w--;
5841
	dst++;
5842
    }
5843
 
5844
    while (w >= 4)
5845
    {
5846
	__m128i xmm_src, xmm_src_lo, xmm_src_hi, xmm_srca_lo, xmm_srca_hi;
5847
	__m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
5848
	__m128i xmm_mask, xmm_mask_lo, xmm_mask_hi;
5849
 
5850
	m = *(uint32_t*)mask;
5851
 
5852
	if (m)
5853
	{
5854
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5855
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
5856
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
5857
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
5858
 
5859
	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
5860
 
5861
	    if (m == 0xffffffff && is_opaque (xmm_src))
5862
	    {
5863
		save_128_aligned ((__m128i *)dst, xmm_src);
5864
	    }
5865
	    else
5866
	    {
5867
		xmm_dst = load_128_aligned ((__m128i *)dst);
5868
 
5869
		xmm_mask = _mm_unpacklo_epi16 (unpack_32_1x128 (m), _mm_setzero_si128());
5870
 
5871
		unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
5872
		unpack_128_2x128 (xmm_mask, &xmm_mask_lo, &xmm_mask_hi);
5873
		unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
5874
 
5875
		expand_alpha_2x128 (xmm_src_lo, xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi);
5876
		expand_alpha_rev_2x128 (xmm_mask_lo, xmm_mask_hi, &xmm_mask_lo, &xmm_mask_hi);
5877
 
5878
		in_over_2x128 (&xmm_src_lo, &xmm_src_hi, &xmm_srca_lo, &xmm_srca_hi,
5879
			       &xmm_mask_lo, &xmm_mask_hi, &xmm_dst_lo, &xmm_dst_hi);
5880
 
5881
		save_128_aligned ((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
5882
	    }
5883
	}
5884
	else
5885
	{
5886
	    BILINEAR_SKIP_ONE_PIXEL ();
5887
	    BILINEAR_SKIP_ONE_PIXEL ();
5888
	    BILINEAR_SKIP_ONE_PIXEL ();
5889
	    BILINEAR_SKIP_ONE_PIXEL ();
5890
	}
5891
 
5892
	w -= 4;
5893
	dst += 4;
5894
	mask += 4;
5895
    }
5896
 
5897
    while (w)
5898
    {
5899
	uint32_t sa;
5900
 
5901
	m = (uint32_t) *mask++;
5902
 
5903
	if (m)
5904
	{
5905
	    BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
5906
	    sa = pix1 >> 24;
5907
 
5908
	    if (sa == 0xff && m == 0xff)
5909
	    {
5910
		*dst = pix1;
5911
	    }
5912
	    else
5913
	    {
5914
		__m128i ms, md, ma, msa;
5915
 
5916
		pix2 = *dst;
5917
		ma = expand_alpha_rev_1x128 (load_32_1x128 (m));
5918
		ms = unpack_32_1x128 (pix1);
5919
		md = unpack_32_1x128 (pix2);
5920
 
5921
		msa = expand_alpha_rev_1x128 (load_32_1x128 (sa));
5922
 
5923
		*dst = pack_1x128_32 (in_over_1x128 (&ms, &msa, &ma, &md));
5924
	    }
5925
	}
5926
	else
5927
	{
5928
	    BILINEAR_SKIP_ONE_PIXEL ();
5929
	}
5930
 
5931
	w--;
5932
	dst++;
5933
    }
5934
}
5935
 
5936
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_cover_OVER,
5937
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
5938
			       uint32_t, uint8_t, uint32_t,
5939
			       COVER, FLAG_HAVE_NON_SOLID_MASK)
5940
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_pad_OVER,
5941
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
5942
			       uint32_t, uint8_t, uint32_t,
5943
			       PAD, FLAG_HAVE_NON_SOLID_MASK)
5944
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_none_OVER,
5945
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
5946
			       uint32_t, uint8_t, uint32_t,
5947
			       NONE, FLAG_HAVE_NON_SOLID_MASK)
5948
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_8_8888_normal_OVER,
5949
			       scaled_bilinear_scanline_sse2_8888_8_8888_OVER,
5950
			       uint32_t, uint8_t, uint32_t,
5951
			       NORMAL, FLAG_HAVE_NON_SOLID_MASK)
5952
 
5953
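/* Bilinear-scaled OVER with a solid mask, expanded once into xmm_mask;
 * a zero mask or a zero source makes the whole scanline a no-op. */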
static force_inline void
scaled_bilinear_scanline_sse2_8888_n_8888_OVER (uint32_t *       dst,
						const uint32_t * mask,
						const uint32_t * src_top,
						const uint32_t * src_bottom,
						int32_t          w,
						int              wt,
						int              wb,
						pixman_fixed_t   vx,
						pixman_fixed_t   unit_x,
						pixman_fixed_t   max_vx,
						pixman_bool_t    zero_src)
{
    BILINEAR_DECLARE_VARIABLES;
    uint32_t pix1, pix2, pix3, pix4;
    __m128i xmm_mask;
 
    if (zero_src || (*mask >> 24) == 0)
	return;
 
    xmm_mask = create_mask_16_128 (*mask >> 24);
 
    while (w && ((uintptr_t)dst & 15))
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	if (pix1)
	{
		uint32_t d = *dst;
 
		__m128i ms = unpack_32_1x128 (pix1);
		__m128i alpha     = expand_alpha_1x128 (ms);
		__m128i dest      = xmm_mask;
		__m128i alpha_dst = unpack_32_1x128 (d);
 
		*dst = pack_1x128_32
			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
	}
 
	dst++;
	w--;
    }
 
    while (w >= 4)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix2);
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix3);
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix4);
 
	if (pix1 | pix2 | pix3 | pix4)
	{
	    __m128i xmm_src, xmm_src_lo, xmm_src_hi;
	    __m128i xmm_dst, xmm_dst_lo, xmm_dst_hi;
	    __m128i xmm_alpha_lo, xmm_alpha_hi;
 
	    xmm_src = _mm_set_epi32 (pix4, pix3, pix2, pix1);
 
	    xmm_dst = load_128_aligned ((__m128i*)dst);
 
	    unpack_128_2x128 (xmm_src, &xmm_src_lo, &xmm_src_hi);
	    unpack_128_2x128 (xmm_dst, &xmm_dst_lo, &xmm_dst_hi);
	    expand_alpha_2x128 (xmm_src_lo, xmm_src_hi,
				&xmm_alpha_lo, &xmm_alpha_hi);
 
	    in_over_2x128 (&xmm_src_lo, &xmm_src_hi,
			   &xmm_alpha_lo, &xmm_alpha_hi,
			   &xmm_mask, &xmm_mask,
			   &xmm_dst_lo, &xmm_dst_hi);
 
	    save_128_aligned
		((__m128i*)dst, pack_2x128_128 (xmm_dst_lo, xmm_dst_hi));
	}
 
	dst += 4;
	w -= 4;
    }
 
    while (w)
    {
	BILINEAR_INTERPOLATE_ONE_PIXEL (pix1);
	if (pix1)
	{
		uint32_t d = *dst;
 
		__m128i ms = unpack_32_1x128 (pix1);
		__m128i alpha     = expand_alpha_1x128 (ms);
		__m128i dest      = xmm_mask;
		__m128i alpha_dst = unpack_32_1x128 (d);
 
		*dst = pack_1x128_32
			(in_over_1x128 (&ms, &alpha, &dest, &alpha_dst));
	}
 
	dst++;
	w--;
    }
}
 
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_cover_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       COVER, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_pad_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       PAD, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_none_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NONE, FLAG_HAVE_SOLID_MASK)
FAST_BILINEAR_MAINLOOP_COMMON (sse2_8888_n_8888_normal_OVER,
			       scaled_bilinear_scanline_sse2_8888_n_8888_OVER,
			       uint32_t, uint32_t, uint32_t,
			       NORMAL, FLAG_HAVE_SOLID_MASK)
 
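/*
 * Table mapping (operator, source, mask, destination) combinations to the
 * SSE2 composite routines defined above; terminated by PIXMAN_OP_NONE.
 */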
static const pixman_fast_path_t sse2_fast_paths[] =
{
    /* PIXMAN_OP_OVER */
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, r5g6b5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, b5g6r5, sse2_composite_over_n_8_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, a8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, x8r8g8b8, sse2_composite_over_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, r5g6b5, sse2_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, null, b5g6r5, sse2_composite_over_n_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, a8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, x8r8g8b8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, a8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, x8b8g8r8, sse2_composite_over_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, null, r5g6b5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, null, b5g6r5, sse2_composite_over_8888_0565),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8r8g8b8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, a8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, solid, a8, x8b8g8r8, sse2_composite_over_n_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, a8r8g8b8, sse2_composite_over_8888_8888_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, x8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, a8, a8r8g8b8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, x8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, a8, a8b8g8r8, sse2_composite_over_8888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, x8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, a8, a8r8g8b8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, x8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, a8, a8b8g8r8, sse2_composite_over_x888_8_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, a8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, solid, x8r8g8b8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, a8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, solid, x8b8g8r8, sse2_composite_over_x888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, a8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8r8g8b8, solid, x8r8g8b8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, a8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH (OVER, a8b8g8r8, solid, x8b8g8r8, sse2_composite_over_8888_n_8888),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, a8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, x8r8g8b8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, a8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, x8b8g8r8, sse2_composite_over_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8r8g8b8, r5g6b5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH_CA (OVER, solid, a8b8g8r8, b5g6r5, sse2_composite_over_n_8888_0565_ca),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, a8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, x8r8g8b8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, a8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, x8b8g8r8, sse2_composite_over_pixbuf_8888),
    PIXMAN_STD_FAST_PATH (OVER, pixbuf, pixbuf, r5g6b5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, rpixbuf, rpixbuf, b5g6r5, sse2_composite_over_pixbuf_0565),
    PIXMAN_STD_FAST_PATH (OVER, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (OVER, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
 
    /* PIXMAN_OP_OVER_REVERSE */
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8r8g8b8, sse2_composite_over_reverse_n_8888),
    PIXMAN_STD_FAST_PATH (OVER_REVERSE, solid, null, a8b8g8r8, sse2_composite_over_reverse_n_8888),
 
    /* PIXMAN_OP_ADD */
    PIXMAN_STD_FAST_PATH_CA (ADD, solid, a8r8g8b8, a8r8g8b8, sse2_composite_add_n_8888_8888_ca),
    PIXMAN_STD_FAST_PATH (ADD, a8, null, a8, sse2_composite_add_8_8),
    PIXMAN_STD_FAST_PATH (ADD, a8r8g8b8, null, a8r8g8b8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, a8b8g8r8, null, a8b8g8r8, sse2_composite_add_8888_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8, sse2_composite_add_n_8_8),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8, sse2_composite_add_n_8),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8r8g8b8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8r8g8b8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, x8b8g8r8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, null, a8b8g8r8, sse2_composite_add_n_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8r8g8b8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8r8g8b8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, x8b8g8r8, sse2_composite_add_n_8_8888),
    PIXMAN_STD_FAST_PATH (ADD, solid, a8, a8b8g8r8, sse2_composite_add_n_8_8888),
 
    /* PIXMAN_OP_SRC */
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8r8g8b8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, a8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, solid, a8, x8b8g8r8, sse2_composite_src_n_8_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, r5g6b5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, b5g6r5, sse2_composite_src_x888_0565),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, a8r8g8b8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, a8b8g8r8, sse2_composite_src_x888_8888),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, a8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, a8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, a8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8r8g8b8, null, x8r8g8b8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, x8b8g8r8, null, x8b8g8r8, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, r5g6b5, null, r5g6b5, sse2_composite_copy_area),
    PIXMAN_STD_FAST_PATH (SRC, b5g6r5, null, b5g6r5, sse2_composite_copy_area),
 
    /* PIXMAN_OP_IN */
    PIXMAN_STD_FAST_PATH (IN, a8, null, a8, sse2_composite_in_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, a8, a8, sse2_composite_in_n_8_8),
    PIXMAN_STD_FAST_PATH (IN, solid, null, a8, sse2_composite_in_n_8),
 
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_COVER (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NONE (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_PAD (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_NEAREST_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
 
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_NEAREST_SOLID_MASK_FAST_PATH_NORMAL (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
 
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (SRC, x8b8g8r8, x8b8g8r8, sse2_8888_8888),
 
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8888),
    SIMPLE_BILINEAR_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8888),
 
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_n_8888),
    SIMPLE_BILINEAR_SOLID_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_n_8888),
 
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, x8r8g8b8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, x8b8g8r8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8r8g8b8, a8r8g8b8, sse2_8888_8_8888),
    SIMPLE_BILINEAR_A8_MASK_FAST_PATH (OVER, a8b8g8r8, a8b8g8r8, sse2_8888_8_8888),
 
    { PIXMAN_OP_NONE },
};
 
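/*
 * Scanline fetcher for x8r8g8b8 sources: copies pixels into the iterator
 * buffer while forcing the unused alpha byte to 0xff, four pixels at a time
 * once the destination is 16-byte aligned.
 */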
static uint32_t *
sse2_fetch_x8r8g8b8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    __m128i ff000000 = mask_ff000000;
    uint32_t *dst = iter->buffer;
    uint32_t *src = (uint32_t *)iter->bits;
 
    iter->bits += iter->stride;
 
    while (w && ((uintptr_t)dst) & 0x0f)
    {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }
 
    while (w >= 4)
    {
	save_128_aligned (
	    (__m128i *)dst, _mm_or_si128 (
		load_128_unaligned ((__m128i *)src), ff000000));
 
	dst += 4;
	src += 4;
	w -= 4;
    }
 
    while (w)
    {
	*dst++ = (*src++) | 0xff000000;
	w--;
    }
 
    return iter->buffer;
}
 
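/*
 * Scanline fetcher for r5g6b5 sources: widens eight 16-bit pixels at a time
 * to a8r8g8b8 via unpack_565_to_8888 and sets the alpha byte to 0xff.
 */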
static uint32_t *
sse2_fetch_r5g6b5 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint16_t *src = (uint16_t *)iter->bits;
    __m128i ff000000 = mask_ff000000;
 
    iter->bits += iter->stride;
 
    while (w && ((uintptr_t)dst) & 0x0f)
    {
	uint16_t s = *src++;
 
	*dst++ = convert_0565_to_8888 (s);
	w--;
    }
 
    while (w >= 8)
    {
	__m128i lo, hi, s;
 
	s = _mm_loadu_si128 ((__m128i *)src);
 
	lo = unpack_565_to_8888 (_mm_unpacklo_epi16 (s, _mm_setzero_si128 ()));
	hi = unpack_565_to_8888 (_mm_unpackhi_epi16 (s, _mm_setzero_si128 ()));
 
	save_128_aligned ((__m128i *)(dst + 0), _mm_or_si128 (lo, ff000000));
	save_128_aligned ((__m128i *)(dst + 4), _mm_or_si128 (hi, ff000000));
 
	dst += 8;
	src += 8;
	w -= 8;
    }
 
    while (w)
    {
	uint16_t s = *src++;
 
	*dst++ = convert_0565_to_8888 (s);
	w--;
    }
 
    return iter->buffer;
}
 
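/*
 * Scanline fetcher for a8 sources: expands sixteen alpha bytes per iteration
 * into the top byte of 32-bit pixels by interleaving them with zeros.
 */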
static uint32_t *
sse2_fetch_a8 (pixman_iter_t *iter, const uint32_t *mask)
{
    int w = iter->width;
    uint32_t *dst = iter->buffer;
    uint8_t *src = iter->bits;
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6;
 
    iter->bits += iter->stride;
 
    while (w && (((uintptr_t)dst) & 15))
    {
        *dst++ = *(src++) << 24;
        w--;
    }
 
    while (w >= 16)
    {
	xmm0 = _mm_loadu_si128((__m128i *)src);
 
	xmm1 = _mm_unpacklo_epi8  (_mm_setzero_si128(), xmm0);
	xmm2 = _mm_unpackhi_epi8  (_mm_setzero_si128(), xmm0);
	xmm3 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm1);
	xmm4 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm1);
	xmm5 = _mm_unpacklo_epi16 (_mm_setzero_si128(), xmm2);
	xmm6 = _mm_unpackhi_epi16 (_mm_setzero_si128(), xmm2);
 
	_mm_store_si128(((__m128i *)(dst +  0)), xmm3);
	_mm_store_si128(((__m128i *)(dst +  4)), xmm4);
	_mm_store_si128(((__m128i *)(dst +  8)), xmm5);
	_mm_store_si128(((__m128i *)(dst + 12)), xmm6);
 
	dst += 16;
	src += 16;
	w -= 16;
    }
 
    while (w)
    {
	*dst++ = *(src++) << 24;
	w--;
    }
 
    return iter->buffer;
}
 
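/* Associates a source format with its SSE2 scanline fetcher. */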
typedef struct
{
    pixman_format_code_t	format;
    pixman_iter_get_scanline_t	get_scanline;
} fetcher_info_t;
 
static const fetcher_info_t fetchers[] =
{
    { PIXMAN_x8r8g8b8,		sse2_fetch_x8r8g8b8 },
    { PIXMAN_r5g6b5,		sse2_fetch_r5g6b5 },
    { PIXMAN_a8,		sse2_fetch_a8 },
    { PIXMAN_null }
};
 
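/*
 * Install one of the fetchers above only for narrow iterators over plain,
 * untransformed bits images whose samples cover the clip; in all other
 * cases return FALSE so the fallback implementation is used.
 */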
static pixman_bool_t
sse2_src_iter_init (pixman_implementation_t *imp, pixman_iter_t *iter)
{
    pixman_image_t *image = iter->image;
 
#define FLAGS								\
    (FAST_PATH_STANDARD_FLAGS | FAST_PATH_ID_TRANSFORM |		\
     FAST_PATH_BITS_IMAGE | FAST_PATH_SAMPLES_COVER_CLIP_NEAREST)
 
    if ((iter->iter_flags & ITER_NARROW)			&&
	(iter->image_flags & FLAGS) == FLAGS)
    {
	const fetcher_info_t *f;
 
	for (f = &fetchers[0]; f->format != PIXMAN_null; f++)
	{
	    if (image->common.extended_format_code == f->format)
	    {
		uint8_t *b = (uint8_t *)image->bits.bits;
		int s = image->bits.rowstride * 4;
 
		iter->bits = b + s * iter->y + iter->x * PIXMAN_FORMAT_BPP (f->format) / 8;
		iter->stride = s;
 
		iter->get_scanline = f->get_scanline;
		return TRUE;
	    }
	}
    }
 
    return FALSE;
}
 
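/*
 * On 32-bit GCC the incoming stack may only be 4-byte aligned, so ask the
 * compiler to realign it before this entry point uses 16-byte SSE2 data.
 */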
#if defined(__GNUC__) && !defined(__x86_64__) && !defined(__amd64__)
__attribute__((__force_align_arg_pointer__))
#endif
pixman_implementation_t *
_pixman_implementation_create_sse2 (pixman_implementation_t *fallback)
{
    pixman_implementation_t *imp = _pixman_implementation_create (fallback, sse2_fast_paths);
 
    /* SSE2 constants */
    mask_565_r  = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_565_g1 = create_mask_2x32_128 (0x00070000, 0x00070000);
    mask_565_g2 = create_mask_2x32_128 (0x000000e0, 0x000000e0);
    mask_565_b  = create_mask_2x32_128 (0x0000001f, 0x0000001f);
    mask_red   = create_mask_2x32_128 (0x00f80000, 0x00f80000);
    mask_green = create_mask_2x32_128 (0x0000fc00, 0x0000fc00);
    mask_blue  = create_mask_2x32_128 (0x000000f8, 0x000000f8);
    mask_565_fix_rb = create_mask_2x32_128 (0x00e000e0, 0x00e000e0);
    mask_565_fix_g = create_mask_2x32_128  (0x0000c000, 0x0000c000);
    mask_0080 = create_mask_16_128 (0x0080);
    mask_00ff = create_mask_16_128 (0x00ff);
    mask_0101 = create_mask_16_128 (0x0101);
    mask_ffff = create_mask_16_128 (0xffff);
    mask_ff000000 = create_mask_2x32_128 (0xff000000, 0xff000000);
    mask_alpha = create_mask_2x32_128 (0x00ff0000, 0x00000000);
    mask_565_rb = create_mask_2x32_128 (0x00f800f8, 0x00f800f8);
    mask_565_pack_multiplier = create_mask_2x32_128 (0x20000004, 0x20000004);
 
    /* Set up function pointers */
    imp->combine_32[PIXMAN_OP_OVER] = sse2_combine_over_u;
    imp->combine_32[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_u;
    imp->combine_32[PIXMAN_OP_IN] = sse2_combine_in_u;
    imp->combine_32[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_u;
    imp->combine_32[PIXMAN_OP_OUT] = sse2_combine_out_u;
    imp->combine_32[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_u;
    imp->combine_32[PIXMAN_OP_ATOP] = sse2_combine_atop_u;
    imp->combine_32[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_u;
    imp->combine_32[PIXMAN_OP_XOR] = sse2_combine_xor_u;
    imp->combine_32[PIXMAN_OP_ADD] = sse2_combine_add_u;
 
    imp->combine_32[PIXMAN_OP_SATURATE] = sse2_combine_saturate_u;
 
    imp->combine_32_ca[PIXMAN_OP_SRC] = sse2_combine_src_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER] = sse2_combine_over_ca;
    imp->combine_32_ca[PIXMAN_OP_OVER_REVERSE] = sse2_combine_over_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_IN] = sse2_combine_in_ca;
    imp->combine_32_ca[PIXMAN_OP_IN_REVERSE] = sse2_combine_in_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT] = sse2_combine_out_ca;
    imp->combine_32_ca[PIXMAN_OP_OUT_REVERSE] = sse2_combine_out_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP] = sse2_combine_atop_ca;
    imp->combine_32_ca[PIXMAN_OP_ATOP_REVERSE] = sse2_combine_atop_reverse_ca;
    imp->combine_32_ca[PIXMAN_OP_XOR] = sse2_combine_xor_ca;
    imp->combine_32_ca[PIXMAN_OP_ADD] = sse2_combine_add_ca;
 
    imp->blt = sse2_blt;
    imp->fill = sse2_fill;
 
    imp->src_iter_init = sse2_src_iter_init;
 
    return imp;
}