/*
 * Copyright 2003 Tungsten Graphics, inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *   Keith Whitwell
 */
27 | |||
28 | |||
29 | #include "pipe/p_config.h" |
||
30 | #include "pipe/p_compiler.h" |
||
31 | #include "util/u_memory.h" |
||
32 | #include "util/u_math.h" |
||
33 | #include "util/u_format.h" |
||
34 | |||
35 | #include "translate.h" |
||
36 | |||
37 | |||
38 | #if (defined(PIPE_ARCH_X86) || (defined(PIPE_ARCH_X86_64) && !defined(__MINGW32__))) && !defined(PIPE_SUBSYSTEM_EMBEDDED) |
||
39 | |||
40 | #include "rtasm/rtasm_cpu.h" |
||
41 | #include "rtasm/rtasm_x86sse.h" |
||
42 | |||
43 | |||
44 | #define X 0 |
||
45 | #define Y 1 |
||
46 | #define Z 2 |
||
47 | #define W 3 |
||
48 | |||
49 | |||
/* Description of one source vertex buffer: its start address, the byte
 * stride between consecutive elements, and the maximum element index
 * (NOTE(review): presumably used to clamp out-of-range fetches — confirm
 * against the fetch-emission code).
 */
struct translate_buffer {
   const void *base_ptr;
   uintptr_t stride;
   unsigned max_index;
};
||
55 | |||
/* A view onto a translate_buffer.  Several variants may reference the
 * same underlying buffer with different instance divisors.
 */
struct translate_buffer_variant {
   unsigned buffer_index;       /* index into translate_sse::buffer */
   unsigned instance_divisor;   /* NOTE(review): 0 appears to mean per-vertex data — confirm */
   void *ptr;                   /* updated either per vertex or per instance */
};
||
61 | |||
62 | |||
63 | #define ELEMENT_BUFFER_INSTANCE_ID 1001 |
||
64 | |||
#define NUM_CONSTS 7

/* Indices into the consts[] table below (and into
 * translate_sse::const_to_reg / the per-struct copy of the table).
 */
enum
{
   CONST_IDENTITY,         /* {0, 0, 0, 1} — identity pad for the w channel */
   CONST_INV_127,          /* 1/127:        snorm8  -> float scale */
   CONST_INV_255,          /* 1/255:        unorm8  -> float scale */
   CONST_INV_32767,        /* 1/32767:      snorm16 -> float scale */
   CONST_INV_65535,        /* 1/65535:      unorm16 -> float scale */
   CONST_INV_2147483647,   /* 1/(2^31 - 1): 32-bit normalized scale */
   CONST_255               /* 255.0 broadcast */
};

/* Broadcast a scalar into all four lanes of an XMM-sized vector. */
#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
/* Read-only master table of SSE constants; it is copied into each
 * translate_sse instance so the generated code can address it at a fixed
 * displacement.  Declared const so it lives in read-only storage.
 */
static const float consts[NUM_CONSTS][4] = {
   {0, 0, 0, 1},
   C(1.0 / 127.0),
   C(1.0 / 255.0),
   C(1.0 / 32767.0),
   C(1.0 / 65535.0),
   C(1.0 / 2147483647.0),
   C(255.0)
};
#undef C
||
89 | |||
/* State for the run-time SSE/SSE2 vertex-translation code generator.
 * Holds the generated entry points (one per element index width plus a
 * linear variant), a small XMM constant cache, and the buffer/element
 * layout being translated.
 */
struct translate_sse {
   struct translate translate;

   /* Generated machine-code functions. */
   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function elt16_func;
   struct x86_function elt8_func;
   struct x86_function *func;    /* the function currently being emitted into */

   /* Per-instance copy of the constant table, 16-byte aligned so the
    * generated movaps in get_const() can load from it.
    */
   PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
   int8_t reg_to_const[16];         /* XMM reg -> CONST_* id, or -1 if free */
   int8_t const_to_reg[NUM_CONSTS]; /* CONST_* id -> XMM reg, or -1 if not resident */

   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer variants can map to a single buffer. */
   struct translate_buffer_variant buffer_variant[PIPE_MAX_ATTRIBS];
   unsigned nr_buffer_variants;

   /* Multiple elements can map to a single buffer variant. */
   unsigned element_to_buffer_variant[PIPE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;
   unsigned start_instance;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg tmp2_EDX;
   struct x86_reg src_ECX;
   struct x86_reg idx_ESI;      /* either start+i or &elt[i] */
   struct x86_reg machine_EDI;
   struct x86_reg outbuf_EBX;
   struct x86_reg count_EBP;    /* decrements to zero */
};
||
128 | |||
/* Signed byte distance from 'a' to 'b' (b - a).  Used to compute
 * displacements of struct members relative to the struct base.
 */
static int get_offset( const void *a, const void *b )
{
   const char *base = (const char *)a;
   const char *member = (const char *)b;
   return (int)(member - base);
}
||
133 | |||
/* Return an XMM register holding the four-float constant identified by
 * 'id' (a CONST_* index into p->consts).  Constants are cached in
 * XMM2..XMM7 (XMM0/XMM1 serve as scratch in the emit_* helpers); if the
 * constant is not already resident, a free — or, failing that, victim —
 * register is chosen and a movaps from the translate_sse struct
 * (addressed via machine_EDI) is emitted into the current function.
 */
static struct x86_reg get_const( struct translate_sse *p, unsigned id)
{
   struct x86_reg reg;
   unsigned i;

   /* Fast path: constant already lives in a register. */
   if(p->const_to_reg[id] >= 0)
      return x86_make_reg(file_XMM, p->const_to_reg[id]);

   /* Scan XMM2..XMM7 for a register with no constant assigned. */
   for(i = 2; i < 8; ++i)
   {
      if(p->reg_to_const[i] < 0)
         break;
   }

   /* TODO: be smarter here */
   if(i == 8)
      --i; /* none free: always evict XMM7 */

   reg = x86_make_reg(file_XMM, i);

   /* Invalidate the mapping of the constant being evicted, if any. */
   if(p->reg_to_const[i] >= 0)
      p->const_to_reg[p->reg_to_const[i]] = -1;

   p->reg_to_const[i] = id;
   p->const_to_reg[id] = i;

   /* TODO: this should happen outside the loop, if possible */
   sse_movaps(p->func, reg,
              x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->consts[id][0])));

   return reg;
}
||
167 | |||
/* Emit code that loads 'size' bytes from memory operand 'src' into the
 * low part of XMM register 'data', padding the rest with zeros.  Odd
 * sizes (3, 6, 12) are assembled from multiple narrower loads.
 * Clobbers tmp_EAX and XMM1.  Returns FALSE for unsupported sizes.
 */
static boolean emit_load_sse2( struct translate_sse *p,
                               struct x86_reg data,
                               struct x86_reg src,
                               unsigned size)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   struct x86_reg tmp = p->tmp_EAX;
   switch(size)
   {
   case 1:
      x86_movzx8(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 2:
      x86_movzx16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 3:
      /* Assemble 3 bytes in tmp: byte 2 shifted into bits 16..23,
       * then the low word overwrites bits 0..15.
       */
      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
      x86_shl_imm(p->func, tmp, 16);
      x86_mov16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 4:
      sse2_movd(p->func, data, src);
      break;
   case 6:
      /* dword + trailing word, merged with punpckldq */
      sse2_movd(p->func, data, src);
      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
      sse2_movd(p->func, tmpXMM, tmp);
      sse2_punpckldq(p->func, data, tmpXMM);
      break;
   case 8:
      sse2_movq(p->func, data, src);
      break;
   case 12:
      /* qword + trailing dword, merged with punpcklqdq */
      sse2_movq(p->func, data, src);
      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
      sse2_punpcklqdq(p->func, data, tmpXMM);
      break;
   case 16:
      sse2_movdqu(p->func, data, src);
      break;
   default:
      return FALSE;
   }
   return TRUE;
}
||
217 | |||
/* this value can be passed for the out_chans argument */
#define CHANNELS_0001 5

/* Emit code that loads 'chans' float values from 'arg0' into XMM
 * register 'data', padding the register with zeroes at least up to
 * out_chans channels.
 *
 * If out_chans is set to CHANNELS_0001, then the fourth value will be
 * padded with 1.  Only pass this value if chans < 4 or results are
 * undefined.  May allocate a constant register via get_const().
 */
static void emit_load_float32( struct translate_sse *p,
                               struct x86_reg data,
                               struct x86_reg arg0,
                               unsigned out_chans,
                               unsigned chans)
{
   switch(chans)
   {
   case 1:
      /* Register contents after each step:
       * a 0 0 0
       * a 0 0 1
       */
      sse_movss(p->func, data, arg0);
      if(out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
      break;
   case 2:
      /* Register contents after each step:
       * 0 0 0 1
       * a b 0 1
       */
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
      else if(out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
      sse_movlps(p->func, data, arg0);
      break;
   case 3:
      /* Have to jump through some hoops:
       *
       * c 0 0 0
       * c 0 0 1 if out_chans == CHANNELS_0001
       * 0 0 c 0/1
       * a b c 0/1
       */
      sse_movss(p->func, data, x86_make_disp(arg0, 8));
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) );
      sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
      sse_movlps(p->func, data, arg0);
      break;
   case 4:
      /* Full vector: single unaligned load, no padding needed. */
      sse_movups(p->func, data, arg0);
      break;
   }
}
||
273 | |||
/* This function behaves like emit_load_float32, but loads 64-bit
 * floating point numbers, converting them to 32-bit ones (precision is
 * lost).  Clobbers XMM1 for chans >= 3; requires SSE2.
 */
static void emit_load_float64to32( struct translate_sse *p,
                                   struct x86_reg data,
                                   struct x86_reg arg0,
                                   unsigned out_chans,
                                   unsigned chans)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   switch(chans)
   {
   case 1:
      sse2_movsd(p->func, data, arg0);
      /* cvtpd2ps zeroes the upper lanes, cvtsd2ss leaves them alone;
       * use the former when padding is required.
       */
      if(out_chans > 1)
         sse2_cvtpd2ps(p->func, data, data);
      else
         sse2_cvtsd2ss(p->func, data, data);
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
      break;
   case 2:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      if(out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
      else if(out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
      break;
   case 3:
      /* Convert the first two doubles in 'data', the third in tmpXMM,
       * then merge the halves with movlhps.
       */
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      if(out_chans > 3)
         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      else
         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      if(out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
      break;
   case 4:
      /* Two double-pair loads/conversions merged into one vector. */
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      break;
   }
}
||
324 | |||
325 | static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr, struct x86_reg src_xmm) |
||
326 | { |
||
327 | if(x86_target(p->func) != X86_32) |
||
328 | x64_mov64(p->func, dst_gpr, src_gpr); |
||
329 | else |
||
330 | { |
||
331 | /* TODO: when/on which CPUs is SSE2 actually better than SSE? */ |
||
332 | if(x86_target_caps(p->func) & X86_SSE2) |
||
333 | sse2_movq(p->func, dst_xmm, src_xmm); |
||
334 | else |
||
335 | sse_movlps(p->func, dst_xmm, src_xmm); |
||
336 | } |
||
337 | } |
||
338 | |||
/* Emit a 64-bit load from memory operand 'src' into dst_gpr (64-bit
 * targets) or dst_xmm (32-bit targets); see emit_mov64.
 */
static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src)
{
   emit_mov64(p, dst_gpr, dst_xmm, src, src);
}
||
343 | |||
/* Emit a 64-bit store of src_gpr (64-bit targets) or src_xmm (32-bit
 * targets) to memory operand 'dst'; see emit_mov64.
 */
static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm)
{
   emit_mov64(p, dst, dst, src_gpr, src_xmm);
}
||
348 | |||
349 | static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src) |
||
350 | { |
||
351 | if(x86_target_caps(p->func) & X86_SSE2) |
||
352 | sse2_movdqu(p->func, dst, src); |
||
353 | else |
||
354 | sse_movups(p->func, dst, src); |
||
355 | } |
||
356 | |||
/* Emit code that copies 'size' bytes from memory operand 'src' to 'dst'.
 * Small sizes go through GPRs (tmp_EAX/tmp2_EDX); sizes >= 8 use
 * XMM0/XMM1 when SSE is available, otherwise a 4-byte GPR loop.  Only
 * the sizes produced by the supported vertex formats are handled.
 *
 * TODO: this uses unaligned accesses liberally, which is great on Nehalem,
 * but may or may not be good on older processors
 * TODO: may perhaps want to use non-temporal stores here if possible
 */
static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size)
{
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
   struct x86_reg dataGPR = p->tmp_EAX;
   struct x86_reg dataGPR2 = p->tmp2_EDX;

   if(size < 8)
   {
      /* Sub-qword copies: assemble from 1/2/4-byte GPR moves. */
      switch (size)
      {
      case 1:
         x86_mov8(p->func, dataGPR, src);
         x86_mov8(p->func, dst, dataGPR);
         break;
      case 2:
         x86_mov16(p->func, dataGPR, src);
         x86_mov16(p->func, dst, dataGPR);
         break;
      case 3:
         x86_mov16(p->func, dataGPR, src);
         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
         x86_mov16(p->func, dst, dataGPR);
         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
         break;
      case 4:
         x86_mov(p->func, dataGPR, src);
         x86_mov(p->func, dst, dataGPR);
         break;
      case 6:
         x86_mov(p->func, dataGPR, src);
         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
         x86_mov(p->func, dst, dataGPR);
         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
         break;
      }
   }
   else if(!(x86_target_caps(p->func) & X86_SSE))
   {
      /* No SSE at all: emit an unrolled dword-at-a-time copy. */
      unsigned i = 0;
      assert((size & 3) == 0);
      for(i = 0; i < size; i += 4)
      {
         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
      }
   }
   else
   {
      /* SSE path: copy in 8/16-byte chunks. */
      switch(size)
      {
      case 8:
         emit_load64(p, dataGPR, dataXMM, src);
         emit_store64(p, dst, dataGPR, dataXMM);
         break;
      case 12:
         emit_load64(p, dataGPR2, dataXMM, src);
         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
         emit_store64(p, dst, dataGPR2, dataXMM);
         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
         break;
      case 16:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dst, dataXMM);
         break;
      case 24:
         emit_mov128(p, dataXMM, src);
         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
         break;
      case 32:
         emit_mov128(p, dataXMM, src);
         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
         emit_mov128(p, dst, dataXMM);
         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
         break;
      default:
         assert(0);
      }
   }
}
||
443 | |||
444 | static boolean translate_attr_convert( struct translate_sse *p, |
||
445 | const struct translate_element *a, |
||
446 | struct x86_reg src, |
||
447 | struct x86_reg dst) |
||
448 | |||
449 | { |
||
450 | const struct util_format_description* input_desc = util_format_description(a->input_format); |
||
451 | const struct util_format_description* output_desc = util_format_description(a->output_format); |
||
452 | unsigned i; |
||
453 | boolean id_swizzle = TRUE; |
||
454 | unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE}; |
||
455 | unsigned needed_chans = 0; |
||
456 | unsigned imms[2] = {0, 0x3f800000}; |
||
457 | |||
458 | if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE) |
||
459 | return FALSE; |
||
460 | |||
461 | if(input_desc->channel[0].size & 7) |
||
462 | return FALSE; |
||
463 | |||
464 | if(input_desc->colorspace != output_desc->colorspace) |
||
465 | return FALSE; |
||
466 | |||
467 | for(i = 1; i < input_desc->nr_channels; ++i) |
||
468 | { |
||
469 | if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0]))) |
||
470 | return FALSE; |
||
471 | } |
||
472 | |||
473 | for(i = 1; i < output_desc->nr_channels; ++i) |
||
474 | { |
||
475 | if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0]))) |
||
476 | return FALSE; |
||
477 | } |
||
478 | |||
479 | for(i = 0; i < output_desc->nr_channels; ++i) |
||
480 | { |
||
481 | if(output_desc->swizzle[i] < 4) |
||
482 | swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i]; |
||
483 | } |
||
484 | |||
485 | if((x86_target_caps(p->func) & X86_SSE) && (0 |
||
486 | || a->output_format == PIPE_FORMAT_R32_FLOAT |
||
487 | || a->output_format == PIPE_FORMAT_R32G32_FLOAT |
||
488 | || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT |
||
489 | || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) |
||
490 | { |
||
491 | struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); |
||
492 | |||
493 | for(i = 0; i < output_desc->nr_channels; ++i) |
||
494 | { |
||
495 | if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels) |
||
496 | swizzle[i] = i; |
||
497 | } |
||
498 | |||
499 | for(i = 0; i < output_desc->nr_channels; ++i) |
||
500 | { |
||
501 | if(swizzle[i] < 4) |
||
502 | needed_chans = MAX2(needed_chans, swizzle[i] + 1); |
||
503 | if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) |
||
504 | id_swizzle = FALSE; |
||
505 | } |
||
506 | |||
507 | if(needed_chans > 0) |
||
508 | { |
||
509 | switch(input_desc->channel[0].type) |
||
510 | { |
||
511 | case UTIL_FORMAT_TYPE_UNSIGNED: |
||
512 | if(!(x86_target_caps(p->func) & X86_SSE2)) |
||
513 | return FALSE; |
||
514 | emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); |
||
515 | |||
516 | /* TODO: add support for SSE4.1 pmovzx */ |
||
517 | switch(input_desc->channel[0].size) |
||
518 | { |
||
519 | case 8: |
||
520 | /* TODO: this may be inefficient due to get_identity() being used both as a float and integer register */ |
||
521 | sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); |
||
522 | sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); |
||
523 | break; |
||
524 | case 16: |
||
525 | sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY)); |
||
526 | break; |
||
527 | case 32: /* we lose precision here */ |
||
528 | sse2_psrld_imm(p->func, dataXMM, 1); |
||
529 | break; |
||
530 | default: |
||
531 | return FALSE; |
||
532 | } |
||
533 | sse2_cvtdq2ps(p->func, dataXMM, dataXMM); |
||
534 | if(input_desc->channel[0].normalized) |
||
535 | { |
||
536 | struct x86_reg factor; |
||
537 | switch(input_desc->channel[0].size) |
||
538 | { |
||
539 | case 8: |
||
540 | factor = get_const(p, CONST_INV_255); |
||
541 | break; |
||
542 | case 16: |
||
543 | factor = get_const(p, CONST_INV_65535); |
||
544 | break; |
||
545 | case 32: |
||
546 | factor = get_const(p, CONST_INV_2147483647); |
||
547 | break; |
||
548 | default: |
||
549 | assert(0); |
||
550 | factor.disp = 0; |
||
551 | factor.file = 0; |
||
552 | factor.idx = 0; |
||
553 | factor.mod = 0; |
||
554 | break; |
||
555 | } |
||
556 | sse_mulps(p->func, dataXMM, factor); |
||
557 | } |
||
558 | else if(input_desc->channel[0].size == 32) |
||
559 | sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */ |
||
560 | break; |
||
561 | case UTIL_FORMAT_TYPE_SIGNED: |
||
562 | if(!(x86_target_caps(p->func) & X86_SSE2)) |
||
563 | return FALSE; |
||
564 | emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); |
||
565 | |||
566 | /* TODO: add support for SSE4.1 pmovsx */ |
||
567 | switch(input_desc->channel[0].size) |
||
568 | { |
||
569 | case 8: |
||
570 | sse2_punpcklbw(p->func, dataXMM, dataXMM); |
||
571 | sse2_punpcklbw(p->func, dataXMM, dataXMM); |
||
572 | sse2_psrad_imm(p->func, dataXMM, 24); |
||
573 | break; |
||
574 | case 16: |
||
575 | sse2_punpcklwd(p->func, dataXMM, dataXMM); |
||
576 | sse2_psrad_imm(p->func, dataXMM, 16); |
||
577 | break; |
||
578 | case 32: /* we lose precision here */ |
||
579 | break; |
||
580 | default: |
||
581 | return FALSE; |
||
582 | } |
||
583 | sse2_cvtdq2ps(p->func, dataXMM, dataXMM); |
||
584 | if(input_desc->channel[0].normalized) |
||
585 | { |
||
586 | struct x86_reg factor; |
||
587 | switch(input_desc->channel[0].size) |
||
588 | { |
||
589 | case 8: |
||
590 | factor = get_const(p, CONST_INV_127); |
||
591 | break; |
||
592 | case 16: |
||
593 | factor = get_const(p, CONST_INV_32767); |
||
594 | break; |
||
595 | case 32: |
||
596 | factor = get_const(p, CONST_INV_2147483647); |
||
597 | break; |
||
598 | default: |
||
599 | assert(0); |
||
600 | factor.disp = 0; |
||
601 | factor.file = 0; |
||
602 | factor.idx = 0; |
||
603 | factor.mod = 0; |
||
604 | break; |
||
605 | } |
||
606 | sse_mulps(p->func, dataXMM, factor); |
||
607 | } |
||
608 | break; |
||
609 | |||
610 | break; |
||
611 | case UTIL_FORMAT_TYPE_FLOAT: |
||
612 | if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64) |
||
613 | return FALSE; |
||
614 | if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3) |
||
615 | { |
||
616 | swizzle[3] = UTIL_FORMAT_SWIZZLE_W; |
||
617 | needed_chans = CHANNELS_0001; |
||
618 | } |
||
619 | switch(input_desc->channel[0].size) |
||
620 | { |
||
621 | case 32: |
||
622 | emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels); |
||
623 | break; |
||
624 | case 64: /* we lose precision here */ |
||
625 | if(!(x86_target_caps(p->func) & X86_SSE2)) |
||
626 | return FALSE; |
||
627 | emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels); |
||
628 | break; |
||
629 | default: |
||
630 | return FALSE; |
||
631 | } |
||
632 | break; |
||
633 | default: |
||
634 | return FALSE; |
||
635 | } |
||
636 | |||
637 | if(!id_swizzle) |
||
638 | sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) ); |
||
639 | } |
||
640 | |||
641 | if(output_desc->nr_channels >= 4 |
||
642 | && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 |
||
643 | && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 |
||
644 | && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 |
||
645 | && swizzle[3] < UTIL_FORMAT_SWIZZLE_0 |
||
646 | ) |
||
647 | sse_movups(p->func, dst, dataXMM); |
||
648 | else |
||
649 | { |
||
650 | if(output_desc->nr_channels >= 2 |
||
651 | && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 |
||
652 | && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) |
||
653 | sse_movlps(p->func, dst, dataXMM); |
||
654 | else |
||
655 | { |
||
656 | if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0) |
||
657 | sse_movss(p->func, dst, dataXMM); |
||
658 | else |
||
659 | x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); |
||
660 | |||
661 | if(output_desc->nr_channels >= 2) |
||
662 | { |
||
663 | if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0) |
||
664 | { |
||
665 | sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3)); |
||
666 | sse_movss(p->func, x86_make_disp(dst, 4), dataXMM); |
||
667 | } |
||
668 | else |
||
669 | x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); |
||
670 | } |
||
671 | } |
||
672 | |||
673 | if(output_desc->nr_channels >= 3) |
||
674 | { |
||
675 | if(output_desc->nr_channels >= 4 |
||
676 | && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 |
||
677 | && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) |
||
678 | sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM); |
||
679 | else |
||
680 | { |
||
681 | if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0) |
||
682 | { |
||
683 | sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3)); |
||
684 | sse_movss(p->func, x86_make_disp(dst, 8), dataXMM); |
||
685 | } |
||
686 | else |
||
687 | x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); |
||
688 | |||
689 | if(output_desc->nr_channels >= 4) |
||
690 | { |
||
691 | if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0) |
||
692 | { |
||
693 | sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3)); |
||
694 | sse_movss(p->func, x86_make_disp(dst, 12), dataXMM); |
||
695 | } |
||
696 | else |
||
697 | x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); |
||
698 | } |
||
699 | } |
||
700 | } |
||
701 | } |
||
702 | return TRUE; |
||
703 | } |
||
704 | else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16 |
||
705 | && output_desc->channel[0].normalized == input_desc->channel[0].normalized |
||
706 | && (0 |
||
707 | || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) |
||
708 | || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) |
||
709 | || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) |
||
710 | )) |
||
711 | { |
||
712 | struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); |
||
713 | struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); |
||
714 | struct x86_reg tmp = p->tmp_EAX; |
||
715 | unsigned imms[2] = {0, 1}; |
||
716 | |||
717 | for(i = 0; i < output_desc->nr_channels; ++i) |
||
718 | { |
||
719 | if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels) |
||
720 | swizzle[i] = i; |
||
721 | } |
||
722 | |||
723 | for(i = 0; i < output_desc->nr_channels; ++i) |
||
724 | { |
||
725 | if(swizzle[i] < 4) |
||
726 | needed_chans = MAX2(needed_chans, swizzle[i] + 1); |
||
727 | if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) |
||
728 | id_swizzle = FALSE; |
||
729 | } |
||
730 | |||
731 | if(needed_chans > 0) |
||
732 | { |
||
733 | emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3); |
||
734 | |||
735 | switch(input_desc->channel[0].type) |
||
736 | { |
||
737 | case UTIL_FORMAT_TYPE_UNSIGNED: |
||
738 | if(input_desc->channel[0].normalized) |
||
739 | { |
||
740 | sse2_punpcklbw(p->func, dataXMM, dataXMM); |
||
741 | if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) |
||
742 | sse2_psrlw_imm(p->func, dataXMM, 1); |
||
743 | } |
||
744 | else |
||
745 | sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); |
||
746 | break; |
||
747 | case UTIL_FORMAT_TYPE_SIGNED: |
||
748 | if(input_desc->channel[0].normalized) |
||
749 | { |
||
750 | sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY)); |
||
751 | sse2_punpcklbw(p->func, tmpXMM, dataXMM); |
||
752 | sse2_psllw_imm(p->func, dataXMM, 9); |
||
753 | sse2_psrlw_imm(p->func, dataXMM, 8); |
||
754 | sse2_por(p->func, tmpXMM, dataXMM); |
||
755 | sse2_psrlw_imm(p->func, dataXMM, 7); |
||
756 | sse2_por(p->func, tmpXMM, dataXMM); |
||
757 | { |
||
758 | struct x86_reg t = dataXMM; |
||
759 | dataXMM = tmpXMM; |
||
760 | tmpXMM = t; |
||
761 | } |
||
762 | } |
||
763 | else |
||
764 | { |
||
765 | sse2_punpcklbw(p->func, dataXMM, dataXMM); |
||
766 | sse2_psraw_imm(p->func, dataXMM, 8); |
||
767 | } |
||
768 | break; |
||
769 | default: |
||
770 | assert(0); |
||
771 | } |
||
772 | |||
773 | if(output_desc->channel[0].normalized) |
||
774 | imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff; |
||
775 | |||
776 | if(!id_swizzle) |
||
777 | sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6)); |
||
778 | } |
||
779 | |||
780 | if(output_desc->nr_channels >= 4 |
||
781 | && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 |
||
782 | && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 |
||
783 | && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 |
||
784 | && swizzle[3] < UTIL_FORMAT_SWIZZLE_0 |
||
785 | ) |
||
786 | sse2_movq(p->func, dst, dataXMM); |
||
787 | else |
||
788 | { |
||
789 | if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0) |
||
790 | { |
||
791 | if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) |
||
792 | sse2_movd(p->func, dst, dataXMM); |
||
793 | else |
||
794 | { |
||
795 | sse2_movd(p->func, tmp, dataXMM); |
||
796 | x86_mov16(p->func, dst, tmp); |
||
797 | if(output_desc->nr_channels >= 2) |
||
798 | x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); |
||
799 | } |
||
800 | } |
||
801 | else |
||
802 | { |
||
803 | if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0) |
||
804 | x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); |
||
805 | else |
||
806 | { |
||
807 | x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); |
||
808 | if(output_desc->nr_channels >= 2) |
||
809 | { |
||
810 | sse2_movd(p->func, tmp, dataXMM); |
||
811 | x86_shr_imm(p->func, tmp, 16); |
||
812 | x86_mov16(p->func, x86_make_disp(dst, 2), tmp); |
||
813 | } |
||
814 | } |
||
815 | } |
||
816 | |||
817 | if(output_desc->nr_channels >= 3) |
||
818 | { |
||
819 | if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0) |
||
820 | { |
||
821 | if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) |
||
822 | { |
||
823 | sse2_psrlq_imm(p->func, dataXMM, 32); |
||
824 | sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM); |
||
825 | } |
||
826 | else |
||
827 | { |
||
828 | sse2_psrlq_imm(p->func, dataXMM, 32); |
||
829 | sse2_movd(p->func, tmp, dataXMM); |
||
830 | x86_mov16(p->func, x86_make_disp(dst, 4), tmp); |
||
831 | if(output_desc->nr_channels >= 4) |
||
832 | { |
||
833 | x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); |
||
834 | } |
||
835 | } |
||
836 | } |
||
837 | else |
||
838 | { |
||
839 | if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0) |
||
840 | x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); |
||
841 | else |
||
842 | { |
||
843 | x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); |
||
844 | |||
845 | if(output_desc->nr_channels >= 4) |
||
846 | { |
||
847 | sse2_psrlq_imm(p->func, dataXMM, 48); |
||
848 | sse2_movd(p->func, tmp, dataXMM); |
||
849 | x86_mov16(p->func, x86_make_disp(dst, 6), tmp); |
||
850 | } |
||
851 | } |
||
852 | } |
||
853 | } |
||
854 | } |
||
855 | return TRUE; |
||
856 | } |
||
857 | else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0]))) |
||
858 | { |
||
859 | struct x86_reg tmp = p->tmp_EAX; |
||
860 | unsigned i; |
||
861 | if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4 |
||
862 | && swizzle[0] == UTIL_FORMAT_SWIZZLE_W |
||
863 | && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z |
||
864 | && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y |
||
865 | && swizzle[3] == UTIL_FORMAT_SWIZZLE_X) |
||
866 | { |
||
867 | /* TODO: support movbe */ |
||
868 | x86_mov(p->func, tmp, src); |
||
869 | x86_bswap(p->func, tmp); |
||
870 | x86_mov(p->func, dst, tmp); |
||
871 | return TRUE; |
||
872 | } |
||
873 | |||
874 | for(i = 0; i < output_desc->nr_channels; ++i) |
||
875 | { |
||
876 | switch(output_desc->channel[0].size) |
||
877 | { |
||
878 | case 8: |
||
879 | if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) |
||
880 | { |
||
881 | unsigned v = 0; |
||
882 | if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) |
||
883 | { |
||
884 | switch(output_desc->channel[0].type) |
||
885 | { |
||
886 | case UTIL_FORMAT_TYPE_UNSIGNED: |
||
887 | v = output_desc->channel[0].normalized ? 0xff : 1; |
||
888 | break; |
||
889 | case UTIL_FORMAT_TYPE_SIGNED: |
||
890 | v = output_desc->channel[0].normalized ? 0x7f : 1; |
||
891 | break; |
||
892 | default: |
||
893 | return FALSE; |
||
894 | } |
||
895 | } |
||
896 | x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v); |
||
897 | } |
||
898 | else |
||
899 | { |
||
900 | x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1)); |
||
901 | x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp); |
||
902 | } |
||
903 | break; |
||
904 | case 16: |
||
905 | if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) |
||
906 | { |
||
907 | unsigned v = 0; |
||
908 | if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) |
||
909 | { |
||
910 | switch(output_desc->channel[1].type) |
||
911 | { |
||
912 | case UTIL_FORMAT_TYPE_UNSIGNED: |
||
913 | v = output_desc->channel[1].normalized ? 0xffff : 1; |
||
914 | break; |
||
915 | case UTIL_FORMAT_TYPE_SIGNED: |
||
916 | v = output_desc->channel[1].normalized ? 0x7fff : 1; |
||
917 | break; |
||
918 | case UTIL_FORMAT_TYPE_FLOAT: |
||
919 | v = 0x3c00; |
||
920 | break; |
||
921 | default: |
||
922 | return FALSE; |
||
923 | } |
||
924 | } |
||
925 | x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v); |
||
926 | } |
||
927 | else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0) |
||
928 | x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0); |
||
929 | else |
||
930 | { |
||
931 | x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2)); |
||
932 | x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp); |
||
933 | } |
||
934 | break; |
||
935 | case 32: |
||
936 | if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) |
||
937 | { |
||
938 | unsigned v = 0; |
||
939 | if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) |
||
940 | { |
||
941 | switch(output_desc->channel[1].type) |
||
942 | { |
||
943 | case UTIL_FORMAT_TYPE_UNSIGNED: |
||
944 | v = output_desc->channel[1].normalized ? 0xffffffff : 1; |
||
945 | break; |
||
946 | case UTIL_FORMAT_TYPE_SIGNED: |
||
947 | v = output_desc->channel[1].normalized ? 0x7fffffff : 1; |
||
948 | break; |
||
949 | case UTIL_FORMAT_TYPE_FLOAT: |
||
950 | v = 0x3f800000; |
||
951 | break; |
||
952 | default: |
||
953 | return FALSE; |
||
954 | } |
||
955 | } |
||
956 | x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v); |
||
957 | } |
||
958 | else |
||
959 | { |
||
960 | x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4)); |
||
961 | x86_mov(p->func, x86_make_disp(dst, i * 4), tmp); |
||
962 | } |
||
963 | break; |
||
964 | case 64: |
||
965 | if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) |
||
966 | { |
||
967 | unsigned l = 0; |
||
968 | unsigned h = 0; |
||
969 | if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1) |
||
970 | { |
||
971 | switch(output_desc->channel[1].type) |
||
972 | { |
||
973 | case UTIL_FORMAT_TYPE_UNSIGNED: |
||
974 | h = output_desc->channel[1].normalized ? 0xffffffff : 0; |
||
975 | l = output_desc->channel[1].normalized ? 0xffffffff : 1; |
||
976 | break; |
||
977 | case UTIL_FORMAT_TYPE_SIGNED: |
||
978 | h = output_desc->channel[1].normalized ? 0x7fffffff : 0; |
||
979 | l = output_desc->channel[1].normalized ? 0xffffffff : 1; |
||
980 | break; |
||
981 | case UTIL_FORMAT_TYPE_FLOAT: |
||
982 | h = 0x3ff00000; |
||
983 | l = 0; |
||
984 | break; |
||
985 | default: |
||
986 | return FALSE; |
||
987 | } |
||
988 | } |
||
989 | x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l); |
||
990 | x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h); |
||
991 | } |
||
992 | else |
||
993 | { |
||
994 | if(x86_target_caps(p->func) & X86_SSE) |
||
995 | { |
||
996 | struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0); |
||
997 | emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8)); |
||
998 | emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM); |
||
999 | } |
||
1000 | else |
||
1001 | { |
||
1002 | x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8)); |
||
1003 | x86_mov(p->func, x86_make_disp(dst, i * 8), tmp); |
||
1004 | x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4)); |
||
1005 | x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp); |
||
1006 | } |
||
1007 | } |
||
1008 | break; |
||
1009 | default: |
||
1010 | return FALSE; |
||
1011 | } |
||
1012 | } |
||
1013 | return TRUE; |
||
1014 | } |
||
1015 | /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */ |
||
1016 | else if((x86_target_caps(p->func) & X86_SSE2) && |
||
1017 | a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0 |
||
1018 | || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM |
||
1019 | || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM |
||
1020 | )) |
||
1021 | { |
||
1022 | struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); |
||
1023 | |||
1024 | /* load */ |
||
1025 | sse_movups(p->func, dataXMM, src); |
||
1026 | |||
1027 | if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) |
||
1028 | sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3)); |
||
1029 | |||
1030 | /* scale by 255.0 */ |
||
1031 | sse_mulps(p->func, dataXMM, get_const(p, CONST_255)); |
||
1032 | |||
1033 | /* pack and emit */ |
||
1034 | sse2_cvtps2dq(p->func, dataXMM, dataXMM); |
||
1035 | sse2_packssdw(p->func, dataXMM, dataXMM); |
||
1036 | sse2_packuswb(p->func, dataXMM, dataXMM); |
||
1037 | sse2_movd(p->func, dst, dataXMM); |
||
1038 | |||
1039 | return TRUE; |
||
1040 | } |
||
1041 | |||
1042 | return FALSE; |
||
1043 | } |
||
1044 | |||
1045 | static boolean translate_attr( struct translate_sse *p, |
||
1046 | const struct translate_element *a, |
||
1047 | struct x86_reg src, |
||
1048 | struct x86_reg dst) |
||
1049 | { |
||
1050 | if(a->input_format == a->output_format) |
||
1051 | { |
||
1052 | emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1)); |
||
1053 | return TRUE; |
||
1054 | } |
||
1055 | |||
1056 | return translate_attr_convert(p, a, src, dst); |
||
1057 | } |
||
1058 | |||
/* Emit prologue code computing, for every buffer variant that needs it,
 * the pointer to its first attribute.
 *
 * Runs once before the vertex loop.  Only linear (non-indexed) runs and
 * instanced arrays get a precomputed pointer; for indexed non-instanced
 * arrays the pointer is recomputed per vertex in get_buffer_ptr().
 *
 * \param index_size  0 for linear runs, else byte width of the index type.
 * \return TRUE (never fails).
 */
static boolean init_inputs( struct translate_sse *p,
                            unsigned index_size )
{
   unsigned i;
   /* Machine-struct slots holding the current instance id and the draw's
    * start_instance, addressed relative to machine_EDI.
    */
   struct x86_reg instance_id = x86_make_disp(p->machine_EDI,
                                              get_offset(p, &p->instance_id));
   struct x86_reg start_instance = x86_make_disp(p->machine_EDI,
                                                 get_offset(p, &p->start_instance));

   for (i = 0; i < p->nr_buffer_variants; i++) {
      struct translate_buffer_variant *variant = &p->buffer_variant[i];
      struct translate_buffer *buffer = &p->buffer[variant->buffer_index];

      if (!index_size || variant->instance_divisor) {
         struct x86_reg buf_max_index = x86_make_disp(p->machine_EDI,
                                                      get_offset(p, &buffer->max_index));
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
                                                   get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                get_offset(p, &variant->ptr));
         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI,
                                                     get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_ESI;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *   base_ptr + stride * index, where index depends on instance divisor
          */
         if (variant->instance_divisor) {
            /* Start with instance = instance_id
             * which is true if divisor is 1.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (variant->instance_divisor != 1) {
               struct x86_reg tmp_EDX = p->tmp2_EDX;
               struct x86_reg tmp_ECX = p->src_ECX;

               /* instance_num = instance_id - start_instance */
               x86_mov(p->func, tmp_EDX, start_instance);
               x86_sub(p->func, tmp_EAX, tmp_EDX);

               /* TODO: Add x86_shr() to rtasm and use it whenever
                * instance divisor is power of two.
                */
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
               x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */

               /* instance = (instance_id - start_instance) / divisor +
                *            start_instance
                */
               x86_mov(p->func, tmp_EDX, start_instance);
               x86_add(p->func, tmp_EAX, tmp_EDX);
            }

            /* XXX we need to clamp the index here too, but to a
             * per-array max value, not the draw->pt.max_index value
             * that's being given to us via translate->set_buffer().
             */
         } else {
            x86_mov(p->func, tmp_EAX, elt);

            /* Clamp to max_index
             */
            x86_cmp(p->func, tmp_EAX, buf_max_index);
            x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
         }

         /* ptr = base_ptr + index * stride (64-bit add on x86-64). */
         x86_imul(p->func, tmp_EAX, buf_stride);
         x64_rexw(p->func);
         x86_add(p->func, tmp_EAX, buf_base_ptr);

         /* NOTE(review): the flags set by this compare do not appear to be
          * consumed before they are clobbered by later emitted code --
          * possibly dead; confirm against the generated-code consumers.
          */
         x86_cmp(p->func, p->count_EBP, p->tmp_EAX);

         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (!index_size && p->nr_buffer_variants == 1)
         {
            x64_rexw(p->func);
            x86_mov(p->func, elt, tmp_EAX);
         }
         else
         {
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, tmp_EAX);
         }
      }
   }

   return TRUE;
}
||
1152 | |||
1153 | |||
/* Emit code producing a register (or memory operand) that addresses the
 * current vertex's attribute data for buffer variant \p var_idx.
 *
 * Cases, in order:
 *  - instance-id pseudo buffer: return the machine-struct slot directly;
 *  - single linear buffer: idx_ESI already holds the attrib pointer;
 *  - linear or instanced variant: reload the pointer precomputed by
 *    init_inputs() into src_ECX;
 *  - indexed fetch: load/zero-extend the index from \p elt, clamp it to
 *    the buffer's max_index, then compute base_ptr + index * stride.
 *
 * \param index_size  0 for linear runs, else 1/2/4 byte index width.
 * \param elt         operand holding the current index (deref'd idx_ESI).
 */
static struct x86_reg get_buffer_ptr( struct translate_sse *p,
                                      unsigned index_size,
                                      unsigned var_idx,
                                      struct x86_reg elt )
{
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      /* "Source" is the stored instance id itself, not a vertex buffer. */
      return x86_make_disp(p->machine_EDI,
                           get_offset(p, &p->instance_id));
   }
   if (!index_size && p->nr_buffer_variants == 1) {
      /* Linear single-buffer fast path: ESI is the attrib pointer. */
      return p->idx_ESI;
   }
   else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
      /* Pointer was computed up front by init_inputs(); just reload it. */
      struct x86_reg ptr = p->src_ECX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer_variant[var_idx].ptr));

      x64_rexw(p->func);
      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      struct x86_reg ptr = p->src_ECX;
      const struct translate_buffer_variant *variant = &p->buffer_variant[var_idx];

      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].stride));

      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].base_ptr));

      struct x86_reg buf_max_index =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].max_index));

      /* Calculate pointer to current attrib:
       */
      switch(index_size)
      {
      case 1:
         /* ubyte indices: zero-extend to 32 bits. */
         x86_movzx8(p->func, ptr, elt);
         break;
      case 2:
         /* ushort indices: zero-extend to 32 bits. */
         x86_movzx16(p->func, ptr, elt);
         break;
      case 4:
         x86_mov(p->func, ptr, elt);
         break;
      }

      /* Clamp to max_index
       */
      x86_cmp(p->func, ptr, buf_max_index);
      x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);

      /* ptr = base_ptr + index * stride (pointer add is 64-bit on x86-64). */
      x86_imul(p->func, ptr, buf_stride);
      x64_rexw(p->func);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}
||
1220 | |||
1221 | |||
1222 | |||
/* Emit per-iteration code advancing the input position to the next vertex.
 *
 * Linear single-buffer runs bump the attrib pointer held in idx_ESI by the
 * stride; linear multi-buffer runs advance each non-instanced variant's
 * stored pointer; indexed runs simply step idx_ESI to the next index.
 *
 * \param index_size  0 for linear runs, else byte width of the index type.
 * \return TRUE (never fails).
 */
static boolean incr_inputs( struct translate_sse *p,
                            unsigned index_size )
{
   if (!index_size && p->nr_buffer_variants == 1) {
      struct x86_reg stride = x86_make_disp(p->machine_EDI,
                                            get_offset(p, &p->buffer[0].stride));

      /* Instanced arrays keep the same pointer for the whole run, so only
       * advance when the divisor is zero.
       */
      if (p->buffer_variant[0].instance_divisor == 0) {
         x64_rexw(p->func);
         x86_add(p->func, p->idx_ESI, stride);
         /* Prefetch a cache line well ahead of the read position. */
         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
      }
   }
   else if (!index_size) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_variants; i++) {
         struct translate_buffer_variant *variant = &p->buffer_variant[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                get_offset(p, &variant->ptr));
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
                                                   get_offset(p, &p->buffer[variant->buffer_index].stride));

         if (variant->instance_divisor == 0) {
            /* ptr += stride, done through tmp_EAX since both operands
             * live in memory.
             */
            x86_mov(p->func, p->tmp_EAX, buf_stride);
            x64_rexw(p->func);
            x86_add(p->func, p->tmp_EAX, buf_ptr);
            if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      /* Indexed path: step to the next index element. */
      x64_rexw(p->func);
      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
   }

   return TRUE;
}
||
1265 | |||
1266 | |||
1267 | /* Build run( struct translate *machine, |
||
1268 | * unsigned start, |
||
1269 | * unsigned count, |
||
1270 | * void *output_buffer ) |
||
1271 | * or |
||
1272 | * run_elts( struct translate *machine, |
||
1273 | * unsigned *elts, |
||
1274 | * unsigned count, |
||
1275 | * void *output_buffer ) |
||
1276 | * |
||
1277 | * Lots of hardcoding |
||
1278 | * |
||
1279 | * EAX -- pointer to current output vertex |
||
1280 | * ECX -- pointer to current attribute |
||
1281 | * |
||
1282 | */ |
||
/* Build run( struct translate *machine,
 *            unsigned start,
 *            unsigned count,
 *            void *output_buffer )
 * or
 *  run_elts( struct translate *machine,
 *            unsigned *elts,
 *            unsigned count,
 *            void *output_buffer )
 *
 * Lots of hardcoding
 *
 * EAX -- pointer to current output vertex
 * ECX -- pointer to current attribute
 *
 * Generates the whole translate function into \p func: ABI prologue,
 * per-variant input setup, the main per-vertex loop (one translate_attr
 * per element), and the epilogue.  \p index_size is 0 for the linear
 * variant, else the index byte width (1/2/4).  Returns FALSE if any
 * element's format conversion is unsupported.
 */
static boolean build_vertex_emit( struct translate_sse *p,
                                  struct x86_function *func,
                                  unsigned index_size )
{
   int fixup, label;
   unsigned j;

   /* Invalidate the XMM constant cache: no register holds any constant
    * at the start of a freshly generated function.
    */
   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));

   /* Fixed register assignment used throughout the generated code. */
   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
   p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
   p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
   p->count_EBP = x86_make_reg(file_REG32, reg_BP);
   p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
   p->src_ECX = x86_make_reg(file_REG32, reg_CX);

   p->func = func;

   x86_init_func(p->func);

   if(x86_target(p->func) == X86_64_WIN64_ABI)
   {
      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */
      /* XMM6/XMM7 are callee-saved on Win64; spill them there. */
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6));
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7));
   }

   x86_push(p->func, p->outbuf_EBX);
   x86_push(p->func, p->count_EBP);

   /* on non-Win64 x86-64, these are already in the right registers */
   if(x86_target(p->func) != X86_64_STD_ABI)
   {
      x86_push(p->func, p->machine_EDI);
      x86_push(p->func, p->idx_ESI);

      /* arg1 = machine struct, arg2 = start index / elts pointer */
      x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
      x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
   }

   /* arg3 = vertex count */
   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));

   /* arg6 = output buffer; needs a 64-bit move on x86-64. */
   if(x86_target(p->func) != X86_32)
      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
   else
      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      /* arg4 = start_instance, stored into the machine struct. */
      x86_mov(p->func,
              p->tmp2_EDX,
              x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance)),
              p->tmp2_EDX);

      /* arg5 = instance_id, stored into the machine struct. */
      x86_mov(p->func,
              p->tmp_EAX,
              x86_fn_arg(p->func, 5));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, index_size);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      /* Linear runs keep the pointer/index in ESI directly; indexed runs
       * dereference ESI to fetch the current element index.
       */
      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
      int last_variant = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned variant = p->element_to_buffer_variant[j];

         /* Figure out source pointer address:
          * (only recompute when the element switches buffer variant)
          */
         if (variant != last_variant) {
            last_variant = variant;
            vb = get_buffer_ptr(p, index_size, variant, elt);
         }

         if (!translate_attr( p, a,
                              x86_make_disp(vb, a->input_offset),
                              x86_make_disp(p->outbuf_EBX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x64_rexw(p->func);
      x86_lea(p->func,
              p->outbuf_EBX,
              x86_make_disp(p->outbuf_EBX,
                            p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs( p, index_size );
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_EBP);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */

   if(x86_target(p->func) != X86_64_STD_ABI)
   {
      x86_pop(p->func, p->idx_ESI);
      x86_pop(p->func, p->machine_EDI);
   }

   x86_pop(p->func, p->count_EBP);
   x86_pop(p->func, p->outbuf_EBX);

   if(x86_target(p->func) == X86_64_WIN64_ABI)
   {
      /* Restore the callee-saved XMM registers spilled in the prologue. */
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
   }
   x86_ret(p->func);

   return TRUE;
}
||
1433 | |||
1434 | |||
1435 | |||
1436 | |||
1437 | |||
1438 | |||
1439 | |||
1440 | static void translate_sse_set_buffer( struct translate *translate, |
||
1441 | unsigned buf, |
||
1442 | const void *ptr, |
||
1443 | unsigned stride, |
||
1444 | unsigned max_index ) |
||
1445 | { |
||
1446 | struct translate_sse *p = (struct translate_sse *)translate; |
||
1447 | |||
1448 | if (buf < p->nr_buffers) { |
||
1449 | p->buffer[buf].base_ptr = (char *)ptr; |
||
1450 | p->buffer[buf].stride = stride; |
||
1451 | p->buffer[buf].max_index = max_index; |
||
1452 | } |
||
1453 | |||
1454 | if (0) debug_printf("%s %d/%d: %p %d\n", |
||
1455 | __FUNCTION__, buf, |
||
1456 | p->nr_buffers, |
||
1457 | ptr, stride); |
||
1458 | } |
||
1459 | |||
1460 | |||
1461 | static void translate_sse_release( struct translate *translate ) |
||
1462 | { |
||
1463 | struct translate_sse *p = (struct translate_sse *)translate; |
||
1464 | |||
1465 | x86_release_func( &p->elt8_func ); |
||
1466 | x86_release_func( &p->elt16_func ); |
||
1467 | x86_release_func( &p->elt_func ); |
||
1468 | x86_release_func( &p->linear_func ); |
||
1469 | |||
1470 | os_free_aligned(p); |
||
1471 | } |
||
1472 | |||
1473 | |||
1474 | struct translate *translate_sse2_create( const struct translate_key *key ) |
||
1475 | { |
||
1476 | struct translate_sse *p = NULL; |
||
1477 | unsigned i; |
||
1478 | |||
1479 | /* this is misnamed, it actually refers to whether rtasm is enabled or not */ |
||
1480 | if (!rtasm_cpu_has_sse()) |
||
1481 | goto fail; |
||
1482 | |||
1483 | p = os_malloc_aligned(sizeof(struct translate_sse), 16); |
||
1484 | if (p == NULL) |
||
1485 | goto fail; |
||
1486 | memset(p, 0, sizeof(*p)); |
||
1487 | memcpy(p->consts, consts, sizeof(consts)); |
||
1488 | |||
1489 | p->translate.key = *key; |
||
1490 | p->translate.release = translate_sse_release; |
||
1491 | p->translate.set_buffer = translate_sse_set_buffer; |
||
1492 | |||
1493 | for (i = 0; i < key->nr_elements; i++) { |
||
1494 | if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) { |
||
1495 | unsigned j; |
||
1496 | |||
1497 | p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1); |
||
1498 | |||
1499 | if (key->element[i].instance_divisor) { |
||
1500 | p->use_instancing = TRUE; |
||
1501 | } |
||
1502 | |||
1503 | /* |
||
1504 | * Map vertex element to vertex buffer variant. |
||
1505 | */ |
||
1506 | for (j = 0; j < p->nr_buffer_variants; j++) { |
||
1507 | if (p->buffer_variant[j].buffer_index == key->element[i].input_buffer && |
||
1508 | p->buffer_variant[j].instance_divisor == key->element[i].instance_divisor) { |
||
1509 | break; |
||
1510 | } |
||
1511 | } |
||
1512 | if (j == p->nr_buffer_variants) { |
||
1513 | p->buffer_variant[j].buffer_index = key->element[i].input_buffer; |
||
1514 | p->buffer_variant[j].instance_divisor = key->element[i].instance_divisor; |
||
1515 | p->nr_buffer_variants++; |
||
1516 | } |
||
1517 | p->element_to_buffer_variant[i] = j; |
||
1518 | } else { |
||
1519 | assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID); |
||
1520 | |||
1521 | p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID; |
||
1522 | } |
||
1523 | } |
||
1524 | |||
1525 | if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers); |
||
1526 | |||
1527 | if (!build_vertex_emit(p, &p->linear_func, 0)) |
||
1528 | goto fail; |
||
1529 | |||
1530 | if (!build_vertex_emit(p, &p->elt_func, 4)) |
||
1531 | goto fail; |
||
1532 | |||
1533 | if (!build_vertex_emit(p, &p->elt16_func, 2)) |
||
1534 | goto fail; |
||
1535 | |||
1536 | if (!build_vertex_emit(p, &p->elt8_func, 1)) |
||
1537 | goto fail; |
||
1538 | |||
1539 | p->translate.run = (run_func) x86_get_func(&p->linear_func); |
||
1540 | if (p->translate.run == NULL) |
||
1541 | goto fail; |
||
1542 | |||
1543 | p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func); |
||
1544 | if (p->translate.run_elts == NULL) |
||
1545 | goto fail; |
||
1546 | |||
1547 | p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func); |
||
1548 | if (p->translate.run_elts16 == NULL) |
||
1549 | goto fail; |
||
1550 | |||
1551 | p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func); |
||
1552 | if (p->translate.run_elts8 == NULL) |
||
1553 | goto fail; |
||
1554 | |||
1555 | return &p->translate; |
||
1556 | |||
1557 | fail: |
||
1558 | if (p) |
||
1559 | translate_sse_release( &p->translate ); |
||
1560 | |||
1561 | return NULL; |
||
1562 | } |
||
1563 | |||
1564 | |||
1565 | |||
1566 | #else |
||
1567 | |||
/* Stub for builds without rtasm/SSE support (non-x86 targets or embedded
 * subsystem): always fails, so callers fall back to the generic
 * translate implementation.
 */
struct translate *translate_sse2_create( const struct translate_key *key )
{
   return NULL;
}
||
1572 | |||
#endif