Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
5564 | serge | 1 | /************************************************************************** |
2 | * |
||
3 | * Copyright 2008 VMware, Inc. |
||
4 | * All Rights Reserved. |
||
5 | * |
||
6 | * Permission is hereby granted, free of charge, to any person obtaining a |
||
7 | * copy of this software and associated documentation files (the |
||
8 | * "Software"), to deal in the Software without restriction, including |
||
9 | * without limitation the rights to use, copy, modify, merge, publish, |
||
10 | * distribute, sub license, and/or sell copies of the Software, and to |
||
11 | * permit persons to whom the Software is furnished to do so, subject to |
||
12 | * the following conditions: |
||
13 | * |
||
14 | * The above copyright notice and this permission notice (including the |
||
15 | * next paragraph) shall be included in all copies or substantial portions |
||
16 | * of the Software. |
||
17 | * |
||
18 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS |
||
19 | * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF |
||
20 | * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. |
||
21 | * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR |
||
22 | * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, |
||
23 | * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE |
||
24 | * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. |
||
25 | * |
||
26 | **************************************************************************/ |
||
27 | |||
28 | /** |
||
29 | * @file |
||
30 | * SSE intrinsics portability header. |
||
31 | * |
||
32 | * Although the SSE intrinsics are supported by all modern x86 and x86-64 |
||
33 | * compilers, there are some intrinsics missing in some implementations |
||
34 | * (especially older MSVC versions). This header abstracts that away. |
||
35 | */ |
||
36 | |||
37 | #ifndef U_SSE_H_ |
||
38 | #define U_SSE_H_ |
||
39 | |||
40 | #include "pipe/p_config.h" |
||
41 | |||
42 | #if defined(PIPE_ARCH_SSE) |
||
43 | |||
44 | #include <emmintrin.h> |
||
45 | |||
46 | |||
47 | union m128i { |
||
48 | __m128i m; |
||
49 | ubyte ub[16]; |
||
50 | ushort us[8]; |
||
51 | uint ui[4]; |
||
52 | }; |
||
53 | |||
54 | static INLINE void u_print_epi8(const char *name, __m128i r) |
||
55 | { |
||
56 | union { __m128i m; ubyte ub[16]; } u; |
||
57 | u.m = r; |
||
58 | |||
59 | debug_printf("%s: " |
||
60 | "%02x/" |
||
61 | "%02x/" |
||
62 | "%02x/" |
||
63 | "%02x/" |
||
64 | "%02x/" |
||
65 | "%02x/" |
||
66 | "%02x/" |
||
67 | "%02x/" |
||
68 | "%02x/" |
||
69 | "%02x/" |
||
70 | "%02x/" |
||
71 | "%02x/" |
||
72 | "%02x/" |
||
73 | "%02x/" |
||
74 | "%02x/" |
||
75 | "%02x\n", |
||
76 | name, |
||
77 | u.ub[0], u.ub[1], u.ub[2], u.ub[3], |
||
78 | u.ub[4], u.ub[5], u.ub[6], u.ub[7], |
||
79 | u.ub[8], u.ub[9], u.ub[10], u.ub[11], |
||
80 | u.ub[12], u.ub[13], u.ub[14], u.ub[15]); |
||
81 | } |
||
82 | |||
83 | static INLINE void u_print_epi16(const char *name, __m128i r) |
||
84 | { |
||
85 | union { __m128i m; ushort us[8]; } u; |
||
86 | u.m = r; |
||
87 | |||
88 | debug_printf("%s: " |
||
89 | "%04x/" |
||
90 | "%04x/" |
||
91 | "%04x/" |
||
92 | "%04x/" |
||
93 | "%04x/" |
||
94 | "%04x/" |
||
95 | "%04x/" |
||
96 | "%04x\n", |
||
97 | name, |
||
98 | u.us[0], u.us[1], u.us[2], u.us[3], |
||
99 | u.us[4], u.us[5], u.us[6], u.us[7]); |
||
100 | } |
||
101 | |||
102 | static INLINE void u_print_epi32(const char *name, __m128i r) |
||
103 | { |
||
104 | union { __m128i m; uint ui[4]; } u; |
||
105 | u.m = r; |
||
106 | |||
107 | debug_printf("%s: " |
||
108 | "%08x/" |
||
109 | "%08x/" |
||
110 | "%08x/" |
||
111 | "%08x\n", |
||
112 | name, |
||
113 | u.ui[0], u.ui[1], u.ui[2], u.ui[3]); |
||
114 | } |
||
115 | |||
116 | static INLINE void u_print_ps(const char *name, __m128 r) |
||
117 | { |
||
118 | union { __m128 m; float f[4]; } u; |
||
119 | u.m = r; |
||
120 | |||
121 | debug_printf("%s: " |
||
122 | "%f/" |
||
123 | "%f/" |
||
124 | "%f/" |
||
125 | "%f\n", |
||
126 | name, |
||
127 | u.f[0], u.f[1], u.f[2], u.f[3]); |
||
128 | } |
||
129 | |||
130 | |||
/*
 * Convenience wrappers around the u_print_*() helpers: the expression passed
 * in is stringified and used as the label, then dumped with the matching
 * element type.
 */
#define U_DUMP_EPI8(a)  u_print_epi8(#a, (a))
#define U_DUMP_EPI16(a) u_print_epi16(#a, (a))
#define U_DUMP_EPI32(a) u_print_epi32(#a, (a))
#define U_DUMP_PS(a)    u_print_ps(#a, (a))
||
135 | |||
136 | |||
137 | |||
138 | #if defined(PIPE_ARCH_SSSE3) |
||
139 | |||
140 | #include <tmmintrin.h> |
||
141 | |||
142 | #else /* !PIPE_ARCH_SSSE3 */ |
||
143 | |||
/**
 * Fallback definition of _mm_shuffle_epi8() written with gcc extended inline
 * assembly, used when -mssse3 is not supported/enabled and the compiler
 * therefore does not provide the intrinsic itself.
 *
 * MSVC will never get in here, as its intrinsics support does not rely on
 * compiler command line options.
 *
 * @param a     vector whose bytes are shuffled
 * @param mask  per-byte shuffle control, passed to PSHUFB as source operand
 * @return      the result of "pshufb mask, a"
 */
static __inline __m128i
#ifndef __clang__
__attribute__((__gnu_inline__, __always_inline__, __artificial__))
#else
__attribute__((__always_inline__, __nodebug__))
#endif
_mm_shuffle_epi8(__m128i a, __m128i mask)
{
   __m128i shuffled;

   /*
    * PSHUFB overwrites its destination operand, so `a` is tied to the output
    * operand with the "0" matching constraint. The mask may be either in an
    * SSE register or in memory ("xm").
    */
   __asm__("pshufb %1, %0"
           : "=x" (shuffled)
           : "xm" (mask), "0" (a));

   return shuffled;
}
||
165 | |||
166 | #endif /* !PIPE_ARCH_SSSE3 */ |
||
167 | |||
168 | |||
169 | |||
170 | |||
171 | /* Provide an SSE2 implementation of _mm_mullo_epi32() in terms of |
||
172 | * _mm_mul_epu32(). |
||
173 | * |
||
174 | * I suspect this works fine for us because one of our operands is |
||
175 | * always positive, but not sure that this can be used for general |
||
176 | * signed integer multiplication. |
||
177 | * |
||
178 | * This seems close enough to the speed of SSE4 and the real |
||
179 | * _mm_mullo_epi32() intrinsic as to not justify adding an sse4 |
||
180 | * dependency at this point. |
||
181 | */ |
||
182 | static INLINE __m128i mm_mullo_epi32(const __m128i a, const __m128i b) |
||
183 | { |
||
184 | __m128i a4 = _mm_srli_epi64(a, 32); /* shift by one dword */ |
||
185 | __m128i b4 = _mm_srli_epi64(b, 32); /* shift by one dword */ |
||
186 | __m128i ba = _mm_mul_epu32(b, a); /* multply dwords 0, 2 */ |
||
187 | __m128i b4a4 = _mm_mul_epu32(b4, a4); /* multiply dwords 1, 3 */ |
||
188 | |||
189 | /* Interleave the results, either with shuffles or (slightly |
||
190 | * faster) direct bit operations: |
||
191 | */ |
||
192 | #if 0 |
||
193 | __m128i ba8 = _mm_shuffle_epi32(ba, 8); |
||
194 | __m128i b4a48 = _mm_shuffle_epi32(b4a4, 8); |
||
195 | __m128i result = _mm_unpacklo_epi32(ba8, b4a48); |
||
196 | #else |
||
197 | __m128i mask = _mm_setr_epi32(~0,0,~0,0); |
||
198 | __m128i ba_mask = _mm_and_si128(ba, mask); |
||
199 | __m128i b4a4_mask_shift = _mm_slli_epi64(b4a4, 32); |
||
200 | __m128i result = _mm_or_si128(ba_mask, b4a4_mask_shift); |
||
201 | #endif |
||
202 | |||
203 | return result; |
||
204 | } |
||
205 | |||
206 | |||
207 | static INLINE void |
||
208 | transpose4_epi32(const __m128i * restrict a, |
||
209 | const __m128i * restrict b, |
||
210 | const __m128i * restrict c, |
||
211 | const __m128i * restrict d, |
||
212 | __m128i * restrict o, |
||
213 | __m128i * restrict p, |
||
214 | __m128i * restrict q, |
||
215 | __m128i * restrict r) |
||
216 | { |
||
217 | __m128i t0 = _mm_unpacklo_epi32(*a, *b); |
||
218 | __m128i t1 = _mm_unpacklo_epi32(*c, *d); |
||
219 | __m128i t2 = _mm_unpackhi_epi32(*a, *b); |
||
220 | __m128i t3 = _mm_unpackhi_epi32(*c, *d); |
||
221 | |||
222 | *o = _mm_unpacklo_epi64(t0, t1); |
||
223 | *p = _mm_unpackhi_epi64(t0, t1); |
||
224 | *q = _mm_unpacklo_epi64(t2, t3); |
||
225 | *r = _mm_unpackhi_epi64(t2, t3); |
||
226 | } |
||
227 | |||
/**
 * Replicate 32-bit element i of vector m into all four elements of the
 * result. The index i must be a compile-time constant.
 */
#define SCALAR_EPI32(m, i) \
   _mm_shuffle_epi32((m), _MM_SHUFFLE((i), (i), (i), (i)))
||
229 | |||
230 | |||
231 | #endif /* PIPE_ARCH_SSE */ |
||
232 | |||
233 | #endif /* U_SSE_H_ */ |