WebSVN – Kolibri OS – Blame – /contrib/sdk/sources/Mesa/mesa-10.6.0/src/gallium/auxiliary/util/u_sse.h

Rev	Author	Line No.	Line
5564	serge	1	/**************************************************************************
		2	*
		3	* Copyright 2008 VMware, Inc.
		4	* All Rights Reserved.
		5	*
		6	* Permission is hereby granted, free of charge, to any person obtaining a
		7	* copy of this software and associated documentation files (the
		8	* "Software"), to deal in the Software without restriction, including
		9	* without limitation the rights to use, copy, modify, merge, publish,
		10	* distribute, sub license, and/or sell copies of the Software, and to
		11	* permit persons to whom the Software is furnished to do so, subject to
		12	* the following conditions:
		13	*
		14	* The above copyright notice and this permission notice (including the
		15	* next paragraph) shall be included in all copies or substantial portions
		16	* of the Software.
		17	*
		18	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
		19	* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
		20	* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
		21	* IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
		22	* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
		23	* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
		24	* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
		25	*
		26	**************************************************************************/
		27
		28	/**
		29	* @file
		30	* SSE intrinsics portability header.
		31	*
		32	* Although the SSE intrinsics are supported by all modern x86 and x86-64
		33	* compilers, there are some intrinsics missing in some implementations
		34	* (especially older MSVC versions). This header abstracts that away.
		35	*/
		36
		37	#ifndef U_SSE_H_
		38	#define U_SSE_H_
		39
		40	#include "pipe/p_config.h"
		41
		42	#if defined(PIPE_ARCH_SSE)
		43
#include <emmintrin.h>
		45
		46
		47	union m128i {
		48	__m128i m;
		49	ubyte ub[16];
		50	ushort us[8];
		51	uint ui[4];
		52	};
		53
		54	static INLINE void u_print_epi8(const char *name, __m128i r)
		55	{
		56	union { __m128i m; ubyte ub[16]; } u;
		57	u.m = r;
		58
		59	debug_printf("%s: "
		60	"%02x/"
		61	"%02x/"
		62	"%02x/"
		63	"%02x/"
		64	"%02x/"
		65	"%02x/"
		66	"%02x/"
		67	"%02x/"
		68	"%02x/"
		69	"%02x/"
		70	"%02x/"
		71	"%02x/"
		72	"%02x/"
		73	"%02x/"
		74	"%02x/"
		75	"%02x\n",
		76	name,
		77	u.ub[0], u.ub[1], u.ub[2], u.ub[3],
		78	u.ub[4], u.ub[5], u.ub[6], u.ub[7],
		79	u.ub[8], u.ub[9], u.ub[10], u.ub[11],
		80	u.ub[12], u.ub[13], u.ub[14], u.ub[15]);
		81	}
		82
		83	static INLINE void u_print_epi16(const char *name, __m128i r)
		84	{
		85	union { __m128i m; ushort us[8]; } u;
		86	u.m = r;
		87
		88	debug_printf("%s: "
		89	"%04x/"
		90	"%04x/"
		91	"%04x/"
		92	"%04x/"
		93	"%04x/"
		94	"%04x/"
		95	"%04x/"
		96	"%04x\n",
		97	name,
		98	u.us[0], u.us[1], u.us[2], u.us[3],
		99	u.us[4], u.us[5], u.us[6], u.us[7]);
		100	}
		101
		102	static INLINE void u_print_epi32(const char *name, __m128i r)
		103	{
		104	union { __m128i m; uint ui[4]; } u;
		105	u.m = r;
		106
		107	debug_printf("%s: "
		108	"%08x/"
		109	"%08x/"
		110	"%08x/"
		111	"%08x\n",
		112	name,
		113	u.ui[0], u.ui[1], u.ui[2], u.ui[3]);
		114	}
		115
		116	static INLINE void u_print_ps(const char *name, __m128 r)
		117	{
		118	union { __m128 m; float f[4]; } u;
		119	u.m = r;
		120
		121	debug_printf("%s: "
		122	"%f/"
		123	"%f/"
		124	"%f/"
		125	"%f\n",
		126	name,
		127	u.f[0], u.f[1], u.f[2], u.f[3]);
		128	}
		129
		130
		131	#define U_DUMP_EPI32(a) u_print_epi32(#a, a)
		132	#define U_DUMP_EPI16(a) u_print_epi16(#a, a)
		133	#define U_DUMP_EPI8(a) u_print_epi8(#a, a)
		134	#define U_DUMP_PS(a) u_print_ps(#a, a)
		135
		136
		137
		138	#if defined(PIPE_ARCH_SSSE3)
		139
		140	#include
		141
		142	#else /* !PIPE_ARCH_SSSE3 */
		143
		144	/**
		145	* Describe _mm_shuffle_epi8() with gcc extended inline assembly, for cases
		146	* where -mssse3 is not supported/enabled.
		147	*
		148	* MSVC will never get in here as its intrinsics support do not rely on
		149	* compiler command line options.
		150	*/
		151	static __inline __m128i
		152	#ifdef __clang__
		153	__attribute__((__always_inline__, __nodebug__))
		154	#else
		155	__attribute__((__gnu_inline__, __always_inline__, __artificial__))
		156	#endif
		157	_mm_shuffle_epi8(__m128i a, __m128i mask)
		158	{
		159	__m128i result;
		160	__asm__("pshufb %1, %0"
		161	: "=x" (result)
		162	: "xm" (mask), "0" (a));
		163	return result;
		164	}
		165
		166	#endif /* !PIPE_ARCH_SSSE3 */
		167
		168
		169
		170
		171	/* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of
		172	* _mm_mul_epu32().
		173	*
		174	* I suspect this works fine for us because one of our operands is
		175	* always positive, but not sure that this can be used for general
		176	* signed integer multiplication.
		177	*
		178	* This seems close enough to the speed of SSE4 and the real
		179	* _mm_mullo_epi32() intrinsic as to not justify adding an sse4
		180	* dependency at this point.
		181	*/
		182	static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b)
		183	{
		184	__m128i a4 = _mm_srli_epi64(a, 32); /* shift by one dword */
		185	__m128i b4 = _mm_srli_epi64(b, 32); /* shift by one dword */
		186	__m128i ba = _mm_mul_epu32(b, a); /* multply dwords 0, 2 */
		187	__m128i b4a4 = _mm_mul_epu32(b4, a4); /* multiply dwords 1, 3 */
		188
		189	/* Interleave the results, either with shuffles or (slightly
		190	* faster) direct bit operations:
		191	*/
		192	#if 0
		193	__m128i ba8 = _mm_shuffle_epi32(ba, 8);
		194	__m128i b4a48 = _mm_shuffle_epi32(b4a4, 8);
		195	__m128i result = _mm_unpacklo_epi32(ba8, b4a48);
		196	#else
		197	__m128i mask = _mm_setr_epi32(~0,0,~0,0);
		198	__m128i ba_mask = _mm_and_si128(ba, mask);
		199	__m128i b4a4_mask_shift = _mm_slli_epi64(b4a4, 32);
		200	__m128i result = _mm_or_si128(ba_mask, b4a4_mask_shift);
		201	#endif
		202
		203	return result;
		204	}
		205
		206
		207	static INLINE void
		208	transpose4_epi32(const __m128i * restrict a,
		209	const __m128i * restrict b,
		210	const __m128i * restrict c,
		211	const __m128i * restrict d,
		212	__m128i * restrict o,
		213	__m128i * restrict p,
		214	__m128i * restrict q,
		215	__m128i * restrict r)
		216	{
		217	__m128i t0 = _mm_unpacklo_epi32(a, b);
		218	__m128i t1 = _mm_unpacklo_epi32(c, d);
		219	__m128i t2 = _mm_unpackhi_epi32(a, b);
		220	__m128i t3 = _mm_unpackhi_epi32(c, d);
		221
		222	*o = _mm_unpacklo_epi64(t0, t1);
		223	*p = _mm_unpackhi_epi64(t0, t1);
		224	*q = _mm_unpacklo_epi64(t2, t3);
		225	*r = _mm_unpackhi_epi64(t2, t3);
		226	}
		227
		228	#define SCALAR_EPI32(m, i) _mm_shuffle_epi32((m), _MM_SHUFFLE(i,i,i,i))
		229
		230
		231	#endif /* PIPE_ARCH_SSE */
		232
		233	#endif /* U_SSE_H_ */

Subversion Repositories Kolibri OS

(root)/contrib/sdk/sources/Mesa/mesa-10.6.0/src/gallium/auxiliary/util/u_sse.h – Rev 5564