Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
5564 | serge | 1 | /* |
2 | * Copyright 2003 VMware, Inc. |
||
3 | * All Rights Reserved. |
||
4 | * |
||
5 | * Permission is hereby granted, free of charge, to any person obtaining a |
||
6 | * copy of this software and associated documentation files (the "Software"), |
||
7 | * to deal in the Software without restriction, including without limitation |
||
8 | * on the rights to use, copy, modify, merge, publish, distribute, sub |
||
9 | * license, and/or sell copies of the Software, and to permit persons to whom |
||
10 | * the Software is furnished to do so, subject to the following conditions: |
||
11 | * |
||
12 | * The above copyright notice and this permission notice (including the next |
||
13 | * paragraph) shall be included in all copies or substantial portions of the |
||
14 | * Software. |
||
15 | * |
||
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL |
||
19 | * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, |
||
20 | * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR |
||
21 | * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE |
||
22 | * USE OR OTHER DEALINGS IN THE SOFTWARE. |
||
23 | * |
||
24 | * Authors: |
||
25 | * Keith Whitwell |
||
26 | */ |
||
27 | |||
28 | |||
29 | #include "pipe/p_config.h" |
||
30 | #include "pipe/p_compiler.h" |
||
31 | #include "util/u_memory.h" |
||
32 | #include "util/u_math.h" |
||
33 | #include "util/u_format.h" |
||
34 | |||
35 | #include "translate.h" |
||
36 | |||
37 | |||
38 | #if (defined(PIPE_ARCH_X86) || defined(PIPE_ARCH_X86_64)) && !defined(PIPE_SUBSYSTEM_EMBEDDED) |
||
39 | |||
40 | #include "rtasm/rtasm_cpu.h" |
||
41 | #include "rtasm/rtasm_x86sse.h" |
||
42 | |||
43 | |||
/* Component indices used to build SHUF() shuffle-immediate arguments. */
#define X 0
#define Y 1
#define Z 2
#define W 3
||
48 | |||
49 | |||
/* Per-vertex-buffer state referenced by the generated code:
 * base pointer, byte stride between vertices, and the largest
 * index that may be fetched (NOTE(review): presumably used to
 * clamp out-of-range fetches -- confirm in the run functions).
 */
struct translate_buffer
{
   const void *base_ptr;
   uintptr_t stride;
   unsigned max_index;
};
||
56 | |||
/* A (buffer, instance divisor) pair.  Several vertex elements may share
 * one variant, and several variants may reference the same buffer.
 */
struct translate_buffer_variant
{
   unsigned buffer_index;
   unsigned instance_divisor;
   void *ptr;                   /* updated either per vertex or per instance */
};
||
63 | |||
64 | |||
/* Sentinel buffer index (NOTE(review): appears to mark an element whose
 * source is the instance ID rather than a vertex buffer -- confirm against
 * the element setup code).
 */
#define ELEMENT_BUFFER_INSTANCE_ID 1001

/* Number of entries in the consts[] table below. */
#define NUM_CONSTS 7

/* Indices into consts[]; each names one 4-wide float constant that the
 * generated code can load via get_const().
 */
enum
{
   CONST_IDENTITY,              /* {0, 0, 0, 1} */
   CONST_INV_127,               /* 1/127:  scale for normalized int8 */
   CONST_INV_255,               /* 1/255:  scale for normalized uint8 */
   CONST_INV_32767,             /* 1/32767: scale for normalized int16 */
   CONST_INV_65535,             /* 1/65535: scale for normalized uint16 */
   CONST_INV_2147483647,        /* 1/INT32_MAX: scale for normalized 32-bit */
   CONST_255
};

/* Broadcast a scalar into all four lanes. */
#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
static float consts[NUM_CONSTS][4] = {
   {0, 0, 0, 1},                /* CONST_IDENTITY */
   C(1.0 / 127.0),
   C(1.0 / 255.0),
   C(1.0 / 32767.0),
   C(1.0 / 65535.0),
   C(1.0 / 2147483647.0),
   C(255.0)
};

#undef C
||
92 | |||
/* State for the SSE translate backend: the generated functions plus the
 * run-time data they reach through the "machine" pointer register.
 */
struct translate_sse
{
   struct translate translate;

   /* One generated function per fetch mode (linear range, 32/16/8-bit
    * element indices); func points at whichever one is being emitted.
    */
   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function elt16_func;
   struct x86_function elt8_func;
   struct x86_function *func;

   /* Copy of the consts[] table, 16-byte aligned so the generated code
    * can load entries with movaps (see get_const()).
    */
   PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
   /* Two-way map between XMM register index and cached constant id;
    * -1 means "no mapping" (see get_const()).
    */
   int8_t reg_to_const[16];
   int8_t const_to_reg[NUM_CONSTS];

   struct translate_buffer buffer[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer variants can map to a single buffer. */
   struct translate_buffer_variant buffer_variant[TRANSLATE_MAX_ATTRIBS];
   unsigned nr_buffer_variants;

   /* Multiple elements can map to a single buffer variant. */
   unsigned element_to_buffer_variant[TRANSLATE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;
   unsigned start_instance;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg tmp2_EDX;
   struct x86_reg src_ECX;
   struct x86_reg idx_ESI;      /* either start+i or &elt[i] */
   struct x86_reg machine_EDI;
   struct x86_reg outbuf_EBX;
   struct x86_reg count_EBP;    /* decrements to zero */
};
||
132 | |||
133 | |||
/* Byte offset from a to b, as a signed displacement suitable for
 * x86_make_disp().
 */
static int
get_offset(const void *a, const void *b)
{
   const char *base = (const char *) a;
   const char *target = (const char *) b;
   return (int) (target - base);
}
||
139 | |||
140 | |||
141 | static struct x86_reg |
||
142 | get_const(struct translate_sse *p, unsigned id) |
||
143 | { |
||
144 | struct x86_reg reg; |
||
145 | unsigned i; |
||
146 | |||
147 | if (p->const_to_reg[id] >= 0) |
||
148 | return x86_make_reg(file_XMM, p->const_to_reg[id]); |
||
149 | |||
150 | for (i = 2; i < 8; ++i) { |
||
151 | if (p->reg_to_const[i] < 0) |
||
152 | break; |
||
153 | } |
||
154 | |||
155 | /* TODO: be smarter here */ |
||
156 | if (i == 8) |
||
157 | --i; |
||
158 | |||
159 | reg = x86_make_reg(file_XMM, i); |
||
160 | |||
161 | if (p->reg_to_const[i] >= 0) |
||
162 | p->const_to_reg[p->reg_to_const[i]] = -1; |
||
163 | |||
164 | p->reg_to_const[i] = id; |
||
165 | p->const_to_reg[id] = i; |
||
166 | |||
167 | /* TODO: this should happen outside the loop, if possible */ |
||
168 | sse_movaps(p->func, reg, |
||
169 | x86_make_disp(p->machine_EDI, |
||
170 | get_offset(p, &p->consts[id][0]))); |
||
171 | |||
172 | return reg; |
||
173 | } |
||
174 | |||
175 | |||
176 | /* load the data in a SSE2 register, padding with zeros */ |
||
177 | static boolean |
||
178 | emit_load_sse2(struct translate_sse *p, |
||
179 | struct x86_reg data, struct x86_reg src, unsigned size) |
||
180 | { |
||
181 | struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); |
||
182 | struct x86_reg tmp = p->tmp_EAX; |
||
183 | switch (size) { |
||
184 | case 1: |
||
185 | x86_movzx8(p->func, tmp, src); |
||
186 | sse2_movd(p->func, data, tmp); |
||
187 | break; |
||
188 | case 2: |
||
189 | x86_movzx16(p->func, tmp, src); |
||
190 | sse2_movd(p->func, data, tmp); |
||
191 | break; |
||
192 | case 3: |
||
193 | x86_movzx8(p->func, tmp, x86_make_disp(src, 2)); |
||
194 | x86_shl_imm(p->func, tmp, 16); |
||
195 | x86_mov16(p->func, tmp, src); |
||
196 | sse2_movd(p->func, data, tmp); |
||
197 | break; |
||
198 | case 4: |
||
199 | sse2_movd(p->func, data, src); |
||
200 | break; |
||
201 | case 6: |
||
202 | sse2_movd(p->func, data, src); |
||
203 | x86_movzx16(p->func, tmp, x86_make_disp(src, 4)); |
||
204 | sse2_movd(p->func, tmpXMM, tmp); |
||
205 | sse2_punpckldq(p->func, data, tmpXMM); |
||
206 | break; |
||
207 | case 8: |
||
208 | sse2_movq(p->func, data, src); |
||
209 | break; |
||
210 | case 12: |
||
211 | sse2_movq(p->func, data, src); |
||
212 | sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8)); |
||
213 | sse2_punpcklqdq(p->func, data, tmpXMM); |
||
214 | break; |
||
215 | case 16: |
||
216 | sse2_movdqu(p->func, data, src); |
||
217 | break; |
||
218 | default: |
||
219 | return FALSE; |
||
220 | } |
||
221 | return TRUE; |
||
222 | } |
||
223 | |||
224 | |||
/* this value can be passed for the out_chans argument */
#define CHANNELS_0001 5


/* this function will load #chans float values, and will
 * pad the register with zeroes at least up to out_chans.
 *
 * If out_chans is set to CHANNELS_0001, then the fourth
 * value will be padded with 1. Only pass this value if
 * chans < 4 or results are undefined.
 *
 * Clobbers no registers other than `data` (plus whatever
 * get_const() may load into the XMM constant cache).
 */
static void
emit_load_float32(struct translate_sse *p, struct x86_reg data,
                  struct x86_reg arg0, unsigned out_chans, unsigned chans)
{
   switch (chans) {
   case 1:
      /* a 0 0 0
       * a 0 0 1
       */
      sse_movss(p->func, data, arg0);
      /* OR in {0,0,0,1} to set the w lane to 1 */
      if (out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 2:
      /* 0 0 0 1
       * a b 0 1
       */
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      else if (out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
      /* low two lanes loaded last so they overwrite the padding setup */
      sse_movlps(p->func, data, arg0);
      break;
   case 3:
      /* Have to jump through some hoops:
       *
       * c 0 0 0
       * c 0 0 1 if out_chans == CHANNELS_0001
       * 0 0 c 0/1
       * a b c 0/1
       */
      sse_movss(p->func, data, x86_make_disp(arg0, 8));
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      sse_shufps(p->func, data, data, SHUF(Y, Z, X, W));
      sse_movlps(p->func, data, arg0);
      break;
   case 4:
      sse_movups(p->func, data, arg0);
      break;
   }
}
||
280 | |||
/* this function behaves like emit_load_float32, but loads
   64-bit floating point numbers, converting them to 32-bit
   ones.  Clobbers XMM1 for chans == 3 and 4. */
static void
emit_load_float64to32(struct translate_sse *p, struct x86_reg data,
                      struct x86_reg arg0, unsigned out_chans, unsigned chans)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   switch (chans) {
   case 1:
      sse2_movsd(p->func, data, arg0);
      /* NOTE(review): cvtpd2ps vs cvtsd2ss appear to be chosen for their
       * differing treatment of the upper destination lanes -- confirm.
       */
      if (out_chans > 1)
         sse2_cvtpd2ps(p->func, data, data);
      else
         sse2_cvtsd2ss(p->func, data, data);
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      break;
   case 2:
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      if (out_chans == CHANNELS_0001)
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY),
                    SHUF(X, Y, Z, W));
      else if (out_chans > 2)
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 3:
      /* first two doubles as a pair, the third separately, then merge */
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      if (out_chans > 3)
         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      else
         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      if (out_chans == CHANNELS_0001)
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY));
      break;
   case 4:
      /* two double-pairs converted independently, then merged */
      sse2_movupd(p->func, data, arg0);
      sse2_cvtpd2ps(p->func, data, data);
      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
      sse_movlhps(p->func, data, tmpXMM);
      break;
   }
}
||
330 | |||
331 | |||
332 | static void |
||
333 | emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, |
||
334 | struct x86_reg dst_xmm, struct x86_reg src_gpr, |
||
335 | struct x86_reg src_xmm) |
||
336 | { |
||
337 | if (x86_target(p->func) != X86_32) |
||
338 | x64_mov64(p->func, dst_gpr, src_gpr); |
||
339 | else { |
||
340 | /* TODO: when/on which CPUs is SSE2 actually better than SSE? */ |
||
341 | if (x86_target_caps(p->func) & X86_SSE2) |
||
342 | sse2_movq(p->func, dst_xmm, src_xmm); |
||
343 | else |
||
344 | sse_movlps(p->func, dst_xmm, src_xmm); |
||
345 | } |
||
346 | } |
||
347 | |||
348 | |||
/* Emit an 8-byte load from memory into dst_gpr (x86-64) or dst_xmm
 * (x86-32); thin wrapper over emit_mov64 with src in both roles.
 */
static void
emit_load64(struct translate_sse *p, struct x86_reg dst_gpr,
            struct x86_reg dst_xmm, struct x86_reg src)
{
   emit_mov64(p, dst_gpr, dst_xmm, src, src);
}
||
355 | |||
356 | |||
/* Emit an 8-byte store to memory from src_gpr (x86-64) or src_xmm
 * (x86-32); thin wrapper over emit_mov64 with dst in both roles.
 */
static void
emit_store64(struct translate_sse *p, struct x86_reg dst,
             struct x86_reg src_gpr, struct x86_reg src_xmm)
{
   emit_mov64(p, dst, dst, src_gpr, src_xmm);
}
||
363 | |||
364 | |||
365 | static void |
||
366 | emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src) |
||
367 | { |
||
368 | if (x86_target_caps(p->func) & X86_SSE2) |
||
369 | sse2_movdqu(p->func, dst, src); |
||
370 | else |
||
371 | sse_movups(p->func, dst, src); |
||
372 | } |
||
373 | |||
374 | |||
375 | /* TODO: this uses unaligned accesses liberally, which is great on Nehalem, |
||
376 | * but may or may not be good on older processors |
||
377 | * TODO: may perhaps want to use non-temporal stores here if possible |
||
378 | */ |
||
379 | static void |
||
380 | emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, |
||
381 | unsigned size) |
||
382 | { |
||
383 | struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); |
||
384 | struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1); |
||
385 | struct x86_reg dataGPR = p->tmp_EAX; |
||
386 | struct x86_reg dataGPR2 = p->tmp2_EDX; |
||
387 | |||
388 | if (size < 8) { |
||
389 | switch (size) { |
||
390 | case 1: |
||
391 | x86_mov8(p->func, dataGPR, src); |
||
392 | x86_mov8(p->func, dst, dataGPR); |
||
393 | break; |
||
394 | case 2: |
||
395 | x86_mov16(p->func, dataGPR, src); |
||
396 | x86_mov16(p->func, dst, dataGPR); |
||
397 | break; |
||
398 | case 3: |
||
399 | x86_mov16(p->func, dataGPR, src); |
||
400 | x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2)); |
||
401 | x86_mov16(p->func, dst, dataGPR); |
||
402 | x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2); |
||
403 | break; |
||
404 | case 4: |
||
405 | x86_mov(p->func, dataGPR, src); |
||
406 | x86_mov(p->func, dst, dataGPR); |
||
407 | break; |
||
408 | case 6: |
||
409 | x86_mov(p->func, dataGPR, src); |
||
410 | x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4)); |
||
411 | x86_mov(p->func, dst, dataGPR); |
||
412 | x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2); |
||
413 | break; |
||
414 | } |
||
415 | } |
||
416 | else if (!(x86_target_caps(p->func) & X86_SSE)) { |
||
417 | unsigned i = 0; |
||
418 | assert((size & 3) == 0); |
||
419 | for (i = 0; i < size; i += 4) { |
||
420 | x86_mov(p->func, dataGPR, x86_make_disp(src, i)); |
||
421 | x86_mov(p->func, x86_make_disp(dst, i), dataGPR); |
||
422 | } |
||
423 | } |
||
424 | else { |
||
425 | switch (size) { |
||
426 | case 8: |
||
427 | emit_load64(p, dataGPR, dataXMM, src); |
||
428 | emit_store64(p, dst, dataGPR, dataXMM); |
||
429 | break; |
||
430 | case 12: |
||
431 | emit_load64(p, dataGPR2, dataXMM, src); |
||
432 | x86_mov(p->func, dataGPR, x86_make_disp(src, 8)); |
||
433 | emit_store64(p, dst, dataGPR2, dataXMM); |
||
434 | x86_mov(p->func, x86_make_disp(dst, 8), dataGPR); |
||
435 | break; |
||
436 | case 16: |
||
437 | emit_mov128(p, dataXMM, src); |
||
438 | emit_mov128(p, dst, dataXMM); |
||
439 | break; |
||
440 | case 24: |
||
441 | emit_mov128(p, dataXMM, src); |
||
442 | emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16)); |
||
443 | emit_mov128(p, dst, dataXMM); |
||
444 | emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2); |
||
445 | break; |
||
446 | case 32: |
||
447 | emit_mov128(p, dataXMM, src); |
||
448 | emit_mov128(p, dataXMM2, x86_make_disp(src, 16)); |
||
449 | emit_mov128(p, dst, dataXMM); |
||
450 | emit_mov128(p, x86_make_disp(dst, 16), dataXMM2); |
||
451 | break; |
||
452 | default: |
||
453 | assert(0); |
||
454 | } |
||
455 | } |
||
456 | } |
||
457 | |||
458 | static boolean |
||
459 | translate_attr_convert(struct translate_sse *p, |
||
460 | const struct translate_element *a, |
||
461 | struct x86_reg src, struct x86_reg dst) |
||
462 | { |
||
463 | const struct util_format_description *input_desc = |
||
464 | util_format_description(a->input_format); |
||
465 | const struct util_format_description *output_desc = |
||
466 | util_format_description(a->output_format); |
||
467 | unsigned i; |
||
468 | boolean id_swizzle = TRUE; |
||
469 | unsigned swizzle[4] = |
||
470 | { UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, |
||
471 | UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE }; |
||
472 | unsigned needed_chans = 0; |
||
473 | unsigned imms[2] = { 0, 0x3f800000 }; |
||
474 | |||
475 | if (a->output_format == PIPE_FORMAT_NONE |
||
476 | || a->input_format == PIPE_FORMAT_NONE) |
||
477 | return FALSE; |
||
478 | |||
479 | if (input_desc->channel[0].size & 7) |
||
480 | return FALSE; |
||
481 | |||
482 | if (input_desc->colorspace != output_desc->colorspace) |
||
483 | return FALSE; |
||
484 | |||
485 | for (i = 1; i < input_desc->nr_channels; ++i) { |
||
486 | if (memcmp |
||
487 | (&input_desc->channel[i], &input_desc->channel[0], |
||
488 | sizeof(input_desc->channel[0]))) |
||
489 | return FALSE; |
||
490 | } |
||
491 | |||
492 | for (i = 1; i < output_desc->nr_channels; ++i) { |
||
493 | if (memcmp |
||
494 | (&output_desc->channel[i], &output_desc->channel[0], |
||
495 | sizeof(output_desc->channel[0]))) { |
||
496 | return FALSE; |
||
497 | } |
||
498 | } |
||
499 | |||
500 | for (i = 0; i < output_desc->nr_channels; ++i) { |
||
501 | if (output_desc->swizzle[i] < 4) |
||
502 | swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i]; |
||
503 | } |
||
504 | |||
505 | if ((x86_target_caps(p->func) & X86_SSE) && |
||
506 | (0 || a->output_format == PIPE_FORMAT_R32_FLOAT |
||
507 | || a->output_format == PIPE_FORMAT_R32G32_FLOAT |
||
508 | || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT |
||
509 | || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) { |
||
510 | struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); |
||
511 | |||
512 | for (i = 0; i < output_desc->nr_channels; ++i) { |
||
513 | if (swizzle[i] == UTIL_FORMAT_SWIZZLE_0 |
||
514 | && i >= input_desc->nr_channels) |
||
515 | swizzle[i] = i; |
||
516 | } |
||
517 | |||
518 | for (i = 0; i < output_desc->nr_channels; ++i) { |
||
519 | if (swizzle[i] < 4) |
||
520 | needed_chans = MAX2(needed_chans, swizzle[i] + 1); |
||
521 | if (swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) |
||
522 | id_swizzle = FALSE; |
||
523 | } |
||
524 | |||
525 | if (needed_chans > 0) { |
||
526 | switch (input_desc->channel[0].type) { |
||
527 | case UTIL_FORMAT_TYPE_UNSIGNED: |
||
528 | if (!(x86_target_caps(p->func) & X86_SSE2)) |
||
529 | return FALSE; |
||
530 | emit_load_sse2(p, dataXMM, src, |
||
531 | input_desc->channel[0].size * |
||
532 | input_desc->nr_channels >> 3); |
||
533 | |||
534 | /* TODO: add support for SSE4.1 pmovzx */ |
||
535 | switch (input_desc->channel[0].size) { |
||
536 | case 8: |
||
537 | /* TODO: this may be inefficient due to get_identity() being |
||
538 | * used both as a float and integer register. |
||
539 | */ |
||
540 | sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); |
||
541 | sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); |
||
542 | break; |
||
543 | case 16: |
||
544 | sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY)); |
||
545 | break; |
||
546 | case 32: /* we lose precision here */ |
||
547 | sse2_psrld_imm(p->func, dataXMM, 1); |
||
548 | break; |
||
549 | default: |
||
550 | return FALSE; |
||
551 | } |
||
552 | sse2_cvtdq2ps(p->func, dataXMM, dataXMM); |
||
553 | if (input_desc->channel[0].normalized) { |
||
554 | struct x86_reg factor; |
||
555 | switch (input_desc->channel[0].size) { |
||
556 | case 8: |
||
557 | factor = get_const(p, CONST_INV_255); |
||
558 | break; |
||
559 | case 16: |
||
560 | factor = get_const(p, CONST_INV_65535); |
||
561 | break; |
||
562 | case 32: |
||
563 | factor = get_const(p, CONST_INV_2147483647); |
||
564 | break; |
||
565 | default: |
||
566 | assert(0); |
||
567 | factor.disp = 0; |
||
568 | factor.file = 0; |
||
569 | factor.idx = 0; |
||
570 | factor.mod = 0; |
||
571 | break; |
||
572 | } |
||
573 | sse_mulps(p->func, dataXMM, factor); |
||
574 | } |
||
575 | else if (input_desc->channel[0].size == 32) |
||
576 | /* compensate for the bit we threw away to fit u32 into s32 */ |
||
577 | sse_addps(p->func, dataXMM, dataXMM); |
||
578 | break; |
||
579 | case UTIL_FORMAT_TYPE_SIGNED: |
||
580 | if (!(x86_target_caps(p->func) & X86_SSE2)) |
||
581 | return FALSE; |
||
582 | emit_load_sse2(p, dataXMM, src, |
||
583 | input_desc->channel[0].size * |
||
584 | input_desc->nr_channels >> 3); |
||
585 | |||
586 | /* TODO: add support for SSE4.1 pmovsx */ |
||
587 | switch (input_desc->channel[0].size) { |
||
588 | case 8: |
||
589 | sse2_punpcklbw(p->func, dataXMM, dataXMM); |
||
590 | sse2_punpcklbw(p->func, dataXMM, dataXMM); |
||
591 | sse2_psrad_imm(p->func, dataXMM, 24); |
||
592 | break; |
||
593 | case 16: |
||
594 | sse2_punpcklwd(p->func, dataXMM, dataXMM); |
||
595 | sse2_psrad_imm(p->func, dataXMM, 16); |
||
596 | break; |
||
597 | case 32: /* we lose precision here */ |
||
598 | break; |
||
599 | default: |
||
600 | return FALSE; |
||
601 | } |
||
602 | sse2_cvtdq2ps(p->func, dataXMM, dataXMM); |
||
603 | if (input_desc->channel[0].normalized) { |
||
604 | struct x86_reg factor; |
||
605 | switch (input_desc->channel[0].size) { |
||
606 | case 8: |
||
607 | factor = get_const(p, CONST_INV_127); |
||
608 | break; |
||
609 | case 16: |
||
610 | factor = get_const(p, CONST_INV_32767); |
||
611 | break; |
||
612 | case 32: |
||
613 | factor = get_const(p, CONST_INV_2147483647); |
||
614 | break; |
||
615 | default: |
||
616 | assert(0); |
||
617 | factor.disp = 0; |
||
618 | factor.file = 0; |
||
619 | factor.idx = 0; |
||
620 | factor.mod = 0; |
||
621 | break; |
||
622 | } |
||
623 | sse_mulps(p->func, dataXMM, factor); |
||
624 | } |
||
625 | break; |
||
626 | |||
627 | break; |
||
628 | case UTIL_FORMAT_TYPE_FLOAT: |
||
629 | if (input_desc->channel[0].size != 32 |
||
630 | && input_desc->channel[0].size != 64) { |
||
631 | return FALSE; |
||
632 | } |
||
633 | if (swizzle[3] == UTIL_FORMAT_SWIZZLE_1 |
||
634 | && input_desc->nr_channels <= 3) { |
||
635 | swizzle[3] = UTIL_FORMAT_SWIZZLE_W; |
||
636 | needed_chans = CHANNELS_0001; |
||
637 | } |
||
638 | switch (input_desc->channel[0].size) { |
||
639 | case 32: |
||
640 | emit_load_float32(p, dataXMM, src, needed_chans, |
||
641 | input_desc->nr_channels); |
||
642 | break; |
||
643 | case 64: /* we lose precision here */ |
||
644 | if (!(x86_target_caps(p->func) & X86_SSE2)) |
||
645 | return FALSE; |
||
646 | emit_load_float64to32(p, dataXMM, src, needed_chans, |
||
647 | input_desc->nr_channels); |
||
648 | break; |
||
649 | default: |
||
650 | return FALSE; |
||
651 | } |
||
652 | break; |
||
653 | default: |
||
654 | return FALSE; |
||
655 | } |
||
656 | |||
657 | if (!id_swizzle) { |
||
658 | sse_shufps(p->func, dataXMM, dataXMM, |
||
659 | SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3])); |
||
660 | } |
||
661 | } |
||
662 | |||
663 | if (output_desc->nr_channels >= 4 |
||
664 | && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 |
||
665 | && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 |
||
666 | && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 |
||
667 | && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) { |
||
668 | sse_movups(p->func, dst, dataXMM); |
||
669 | } |
||
670 | else { |
||
671 | if (output_desc->nr_channels >= 2 |
||
672 | && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 |
||
673 | && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) { |
||
674 | sse_movlps(p->func, dst, dataXMM); |
||
675 | } |
||
676 | else { |
||
677 | if (swizzle[0] < UTIL_FORMAT_SWIZZLE_0) { |
||
678 | sse_movss(p->func, dst, dataXMM); |
||
679 | } |
||
680 | else { |
||
681 | x86_mov_imm(p->func, dst, |
||
682 | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); |
||
683 | } |
||
684 | |||
685 | if (output_desc->nr_channels >= 2) { |
||
686 | if (swizzle[1] < UTIL_FORMAT_SWIZZLE_0) { |
||
687 | sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3)); |
||
688 | sse_movss(p->func, x86_make_disp(dst, 4), dataXMM); |
||
689 | } |
||
690 | else { |
||
691 | x86_mov_imm(p->func, x86_make_disp(dst, 4), |
||
692 | imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); |
||
693 | } |
||
694 | } |
||
695 | } |
||
696 | |||
697 | if (output_desc->nr_channels >= 3) { |
||
698 | if (output_desc->nr_channels >= 4 |
||
699 | && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 |
||
700 | && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) { |
||
701 | sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM); |
||
702 | } |
||
703 | else { |
||
704 | if (swizzle[2] < UTIL_FORMAT_SWIZZLE_0) { |
||
705 | sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3)); |
||
706 | sse_movss(p->func, x86_make_disp(dst, 8), dataXMM); |
||
707 | } |
||
708 | else { |
||
709 | x86_mov_imm(p->func, x86_make_disp(dst, 8), |
||
710 | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); |
||
711 | } |
||
712 | |||
713 | if (output_desc->nr_channels >= 4) { |
||
714 | if (swizzle[3] < UTIL_FORMAT_SWIZZLE_0) { |
||
715 | sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3)); |
||
716 | sse_movss(p->func, x86_make_disp(dst, 12), dataXMM); |
||
717 | } |
||
718 | else { |
||
719 | x86_mov_imm(p->func, x86_make_disp(dst, 12), |
||
720 | imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); |
||
721 | } |
||
722 | } |
||
723 | } |
||
724 | } |
||
725 | } |
||
726 | return TRUE; |
||
727 | } |
||
728 | else if ((x86_target_caps(p->func) & X86_SSE2) |
||
729 | && input_desc->channel[0].size == 8 |
||
730 | && output_desc->channel[0].size == 16 |
||
731 | && output_desc->channel[0].normalized == |
||
732 | input_desc->channel[0].normalized && |
||
733 | (0 || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED |
||
734 | && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) |
||
735 | || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED |
||
736 | && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) |
||
737 | || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED |
||
738 | && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED))) { |
||
739 | struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); |
||
740 | struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1); |
||
741 | struct x86_reg tmp = p->tmp_EAX; |
||
742 | unsigned imms[2] = { 0, 1 }; |
||
743 | |||
744 | for (i = 0; i < output_desc->nr_channels; ++i) { |
||
745 | if (swizzle[i] == UTIL_FORMAT_SWIZZLE_0 |
||
746 | && i >= input_desc->nr_channels) { |
||
747 | swizzle[i] = i; |
||
748 | } |
||
749 | } |
||
750 | |||
751 | for (i = 0; i < output_desc->nr_channels; ++i) { |
||
752 | if (swizzle[i] < 4) |
||
753 | needed_chans = MAX2(needed_chans, swizzle[i] + 1); |
||
754 | if (swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i) |
||
755 | id_swizzle = FALSE; |
||
756 | } |
||
757 | |||
758 | if (needed_chans > 0) { |
||
759 | emit_load_sse2(p, dataXMM, src, |
||
760 | input_desc->channel[0].size * |
||
761 | input_desc->nr_channels >> 3); |
||
762 | |||
763 | switch (input_desc->channel[0].type) { |
||
764 | case UTIL_FORMAT_TYPE_UNSIGNED: |
||
765 | if (input_desc->channel[0].normalized) { |
||
766 | sse2_punpcklbw(p->func, dataXMM, dataXMM); |
||
767 | if (output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED) |
||
768 | sse2_psrlw_imm(p->func, dataXMM, 1); |
||
769 | } |
||
770 | else |
||
771 | sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY)); |
||
772 | break; |
||
773 | case UTIL_FORMAT_TYPE_SIGNED: |
||
774 | if (input_desc->channel[0].normalized) { |
||
775 | sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY)); |
||
776 | sse2_punpcklbw(p->func, tmpXMM, dataXMM); |
||
777 | sse2_psllw_imm(p->func, dataXMM, 9); |
||
778 | sse2_psrlw_imm(p->func, dataXMM, 8); |
||
779 | sse2_por(p->func, tmpXMM, dataXMM); |
||
780 | sse2_psrlw_imm(p->func, dataXMM, 7); |
||
781 | sse2_por(p->func, tmpXMM, dataXMM); |
||
782 | { |
||
783 | struct x86_reg t = dataXMM; |
||
784 | dataXMM = tmpXMM; |
||
785 | tmpXMM = t; |
||
786 | } |
||
787 | } |
||
788 | else { |
||
789 | sse2_punpcklbw(p->func, dataXMM, dataXMM); |
||
790 | sse2_psraw_imm(p->func, dataXMM, 8); |
||
791 | } |
||
792 | break; |
||
793 | default: |
||
794 | assert(0); |
||
795 | } |
||
796 | |||
797 | if (output_desc->channel[0].normalized) |
||
798 | imms[1] = |
||
799 | (output_desc->channel[0].type == |
||
800 | UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7ffff; |
||
801 | |||
802 | if (!id_swizzle) |
||
803 | sse2_pshuflw(p->func, dataXMM, dataXMM, |
||
804 | (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | |
||
805 | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6)); |
||
806 | } |
||
807 | |||
808 | if (output_desc->nr_channels >= 4 |
||
809 | && swizzle[0] < UTIL_FORMAT_SWIZZLE_0 |
||
810 | && swizzle[1] < UTIL_FORMAT_SWIZZLE_0 |
||
811 | && swizzle[2] < UTIL_FORMAT_SWIZZLE_0 |
||
812 | && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) { |
||
813 | sse2_movq(p->func, dst, dataXMM); |
||
814 | } |
||
815 | else { |
||
816 | if (swizzle[0] < UTIL_FORMAT_SWIZZLE_0) { |
||
817 | if (output_desc->nr_channels >= 2 |
||
818 | && swizzle[1] < UTIL_FORMAT_SWIZZLE_0) { |
||
819 | sse2_movd(p->func, dst, dataXMM); |
||
820 | } |
||
821 | else { |
||
822 | sse2_movd(p->func, tmp, dataXMM); |
||
823 | x86_mov16(p->func, dst, tmp); |
||
824 | if (output_desc->nr_channels >= 2) |
||
825 | x86_mov16_imm(p->func, x86_make_disp(dst, 2), |
||
826 | imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]); |
||
827 | } |
||
828 | } |
||
829 | else { |
||
830 | if (output_desc->nr_channels >= 2 |
||
831 | && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0) { |
||
832 | x86_mov_imm(p->func, dst, |
||
833 | (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | |
||
834 | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); |
||
835 | } |
||
836 | else { |
||
837 | x86_mov16_imm(p->func, dst, |
||
838 | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]); |
||
839 | if (output_desc->nr_channels >= 2) { |
||
840 | sse2_movd(p->func, tmp, dataXMM); |
||
841 | x86_shr_imm(p->func, tmp, 16); |
||
842 | x86_mov16(p->func, x86_make_disp(dst, 2), tmp); |
||
843 | } |
||
844 | } |
||
845 | } |
||
846 | |||
847 | if (output_desc->nr_channels >= 3) { |
||
848 | if (swizzle[2] < UTIL_FORMAT_SWIZZLE_0) { |
||
849 | if (output_desc->nr_channels >= 4 |
||
850 | && swizzle[3] < UTIL_FORMAT_SWIZZLE_0) { |
||
851 | sse2_psrlq_imm(p->func, dataXMM, 32); |
||
852 | sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM); |
||
853 | } |
||
854 | else { |
||
855 | sse2_psrlq_imm(p->func, dataXMM, 32); |
||
856 | sse2_movd(p->func, tmp, dataXMM); |
||
857 | x86_mov16(p->func, x86_make_disp(dst, 4), tmp); |
||
858 | if (output_desc->nr_channels >= 4) { |
||
859 | x86_mov16_imm(p->func, x86_make_disp(dst, 6), |
||
860 | imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]); |
||
861 | } |
||
862 | } |
||
863 | } |
||
864 | else { |
||
865 | if (output_desc->nr_channels >= 4 |
||
866 | && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0) { |
||
867 | x86_mov_imm(p->func, x86_make_disp(dst, 4), |
||
868 | (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) |
||
869 | | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); |
||
870 | } |
||
871 | else { |
||
872 | x86_mov16_imm(p->func, x86_make_disp(dst, 4), |
||
873 | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]); |
||
874 | |||
875 | if (output_desc->nr_channels >= 4) { |
||
876 | sse2_psrlq_imm(p->func, dataXMM, 48); |
||
877 | sse2_movd(p->func, tmp, dataXMM); |
||
878 | x86_mov16(p->func, x86_make_disp(dst, 6), tmp); |
||
879 | } |
||
880 | } |
||
881 | } |
||
882 | } |
||
883 | } |
||
884 | return TRUE; |
||
885 | } |
||
886 | else if (!memcmp(&output_desc->channel[0], &input_desc->channel[0], |
||
887 | sizeof(output_desc->channel[0]))) { |
||
888 | struct x86_reg tmp = p->tmp_EAX; |
||
889 | unsigned i; |
||
890 | |||
891 | if (input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 |
||
892 | && output_desc->nr_channels == 4 |
||
893 | && swizzle[0] == UTIL_FORMAT_SWIZZLE_W |
||
894 | && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z |
||
895 | && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y |
||
896 | && swizzle[3] == UTIL_FORMAT_SWIZZLE_X) { |
||
897 | /* TODO: support movbe */ |
||
898 | x86_mov(p->func, tmp, src); |
||
899 | x86_bswap(p->func, tmp); |
||
900 | x86_mov(p->func, dst, tmp); |
||
901 | return TRUE; |
||
902 | } |
||
903 | |||
904 | for (i = 0; i < output_desc->nr_channels; ++i) { |
||
905 | switch (output_desc->channel[0].size) { |
||
906 | case 8: |
||
907 | if (swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) { |
||
908 | unsigned v = 0; |
||
909 | if (swizzle[i] == UTIL_FORMAT_SWIZZLE_1) { |
||
910 | switch (output_desc->channel[0].type) { |
||
911 | case UTIL_FORMAT_TYPE_UNSIGNED: |
||
912 | v = output_desc->channel[0].normalized ? 0xff : 1; |
||
913 | break; |
||
914 | case UTIL_FORMAT_TYPE_SIGNED: |
||
915 | v = output_desc->channel[0].normalized ? 0x7f : 1; |
||
916 | break; |
||
917 | default: |
||
918 | return FALSE; |
||
919 | } |
||
920 | } |
||
921 | x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v); |
||
922 | } |
||
923 | else { |
||
924 | x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1)); |
||
925 | x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp); |
||
926 | } |
||
927 | break; |
||
928 | case 16: |
||
929 | if (swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) { |
||
930 | unsigned v = 0; |
||
931 | if (swizzle[i] == UTIL_FORMAT_SWIZZLE_1) { |
||
932 | switch (output_desc->channel[1].type) { |
||
933 | case UTIL_FORMAT_TYPE_UNSIGNED: |
||
934 | v = output_desc->channel[1].normalized ? 0xffff : 1; |
||
935 | break; |
||
936 | case UTIL_FORMAT_TYPE_SIGNED: |
||
937 | v = output_desc->channel[1].normalized ? 0x7fff : 1; |
||
938 | break; |
||
939 | case UTIL_FORMAT_TYPE_FLOAT: |
||
940 | v = 0x3c00; |
||
941 | break; |
||
942 | default: |
||
943 | return FALSE; |
||
944 | } |
||
945 | } |
||
946 | x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v); |
||
947 | } |
||
948 | else if (swizzle[i] == UTIL_FORMAT_SWIZZLE_0) { |
||
949 | x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0); |
||
950 | } |
||
951 | else { |
||
952 | x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2)); |
||
953 | x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp); |
||
954 | } |
||
955 | break; |
||
956 | case 32: |
||
957 | if (swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) { |
||
958 | unsigned v = 0; |
||
959 | if (swizzle[i] == UTIL_FORMAT_SWIZZLE_1) { |
||
960 | switch (output_desc->channel[1].type) { |
||
961 | case UTIL_FORMAT_TYPE_UNSIGNED: |
||
962 | v = output_desc->channel[1].normalized ? 0xffffffff : 1; |
||
963 | break; |
||
964 | case UTIL_FORMAT_TYPE_SIGNED: |
||
965 | v = output_desc->channel[1].normalized ? 0x7fffffff : 1; |
||
966 | break; |
||
967 | case UTIL_FORMAT_TYPE_FLOAT: |
||
968 | v = 0x3f800000; |
||
969 | break; |
||
970 | default: |
||
971 | return FALSE; |
||
972 | } |
||
973 | } |
||
974 | x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v); |
||
975 | } |
||
976 | else { |
||
977 | x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4)); |
||
978 | x86_mov(p->func, x86_make_disp(dst, i * 4), tmp); |
||
979 | } |
||
980 | break; |
||
981 | case 64: |
||
982 | if (swizzle[i] >= UTIL_FORMAT_SWIZZLE_0) { |
||
983 | unsigned l = 0; |
||
984 | unsigned h = 0; |
||
985 | if (swizzle[i] == UTIL_FORMAT_SWIZZLE_1) { |
||
986 | switch (output_desc->channel[1].type) { |
||
987 | case UTIL_FORMAT_TYPE_UNSIGNED: |
||
988 | h = output_desc->channel[1].normalized ? 0xffffffff : 0; |
||
989 | l = output_desc->channel[1].normalized ? 0xffffffff : 1; |
||
990 | break; |
||
991 | case UTIL_FORMAT_TYPE_SIGNED: |
||
992 | h = output_desc->channel[1].normalized ? 0x7fffffff : 0; |
||
993 | l = output_desc->channel[1].normalized ? 0xffffffff : 1; |
||
994 | break; |
||
995 | case UTIL_FORMAT_TYPE_FLOAT: |
||
996 | h = 0x3ff00000; |
||
997 | l = 0; |
||
998 | break; |
||
999 | default: |
||
1000 | return FALSE; |
||
1001 | } |
||
1002 | } |
||
1003 | x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l); |
||
1004 | x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h); |
||
1005 | } |
||
1006 | else { |
||
1007 | if (x86_target_caps(p->func) & X86_SSE) { |
||
1008 | struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0); |
||
1009 | emit_load64(p, tmp, tmpXMM, |
||
1010 | x86_make_disp(src, swizzle[i] * 8)); |
||
1011 | emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM); |
||
1012 | } |
||
1013 | else { |
||
1014 | x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8)); |
||
1015 | x86_mov(p->func, x86_make_disp(dst, i * 8), tmp); |
||
1016 | x86_mov(p->func, tmp, |
||
1017 | x86_make_disp(src, swizzle[i] * 8 + 4)); |
||
1018 | x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp); |
||
1019 | } |
||
1020 | } |
||
1021 | break; |
||
1022 | default: |
||
1023 | return FALSE; |
||
1024 | } |
||
1025 | } |
||
1026 | return TRUE; |
||
1027 | } |
||
1028 | /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */ |
||
1029 | else if ((x86_target_caps(p->func) & X86_SSE2) && |
||
1030 | a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && |
||
1031 | (0 || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM |
||
1032 | || a-> output_format == PIPE_FORMAT_R8G8B8A8_UNORM)) { |
||
1033 | struct x86_reg dataXMM = x86_make_reg(file_XMM, 0); |
||
1034 | |||
1035 | /* load */ |
||
1036 | sse_movups(p->func, dataXMM, src); |
||
1037 | |||
1038 | if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM) { |
||
1039 | sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 1, 0, 3)); |
||
1040 | } |
||
1041 | |||
1042 | /* scale by 255.0 */ |
||
1043 | sse_mulps(p->func, dataXMM, get_const(p, CONST_255)); |
||
1044 | |||
1045 | /* pack and emit */ |
||
1046 | sse2_cvtps2dq(p->func, dataXMM, dataXMM); |
||
1047 | sse2_packssdw(p->func, dataXMM, dataXMM); |
||
1048 | sse2_packuswb(p->func, dataXMM, dataXMM); |
||
1049 | sse2_movd(p->func, dst, dataXMM); |
||
1050 | |||
1051 | return TRUE; |
||
1052 | } |
||
1053 | |||
1054 | return FALSE; |
||
1055 | } |
||
1056 | |||
1057 | |||
1058 | static boolean |
||
1059 | translate_attr(struct translate_sse *p, |
||
1060 | const struct translate_element *a, |
||
1061 | struct x86_reg src, struct x86_reg dst) |
||
1062 | { |
||
1063 | if (a->input_format == a->output_format) { |
||
1064 | emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1)); |
||
1065 | return TRUE; |
||
1066 | } |
||
1067 | |||
1068 | return translate_attr_convert(p, a, src, dst); |
||
1069 | } |
||
1070 | |||
1071 | |||
/* Emit the one-time (pre-loop) setup for every buffer variant.
 *
 * For linear runs (index_size == 0) and for instanced attributes, the
 * source pointer does not depend on a per-vertex index fetched from an
 * element list, so it can be computed once here instead of per vertex:
 * ptr = base_ptr + stride * index, where the index comes either from the
 * loop position or from the instance id divided by the instance divisor.
 * Indexed, non-instanced variants are skipped; they are resolved per
 * vertex in get_buffer_ptr().
 */
static boolean
init_inputs(struct translate_sse *p, unsigned index_size)
{
   unsigned i;
   /* Memory operands for the machine state stored behind machine_EDI. */
   struct x86_reg instance_id =
      x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
   struct x86_reg start_instance =
      x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance));

   for (i = 0; i < p->nr_buffer_variants; i++) {
      struct translate_buffer_variant *variant = &p->buffer_variant[i];
      struct translate_buffer *buffer = &p->buffer[variant->buffer_index];

      if (!index_size || variant->instance_divisor) {
         struct x86_reg buf_max_index =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->max_index));
         struct x86_reg buf_stride =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &variant->ptr));
         struct x86_reg buf_base_ptr =
            x86_make_disp(p->machine_EDI, get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_ESI;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          * base_ptr + stride * index, where index depends on instance divisor
          */
         if (variant->instance_divisor) {
            /* Start with instance = instance_id
             * which is true if divisor is 1.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (variant->instance_divisor != 1) {
               struct x86_reg tmp_EDX = p->tmp2_EDX;
               struct x86_reg tmp_ECX = p->src_ECX;

               /* TODO: Add x86_shr() to rtasm and use it whenever
                * instance divisor is power of two.
                */
               /* Unsigned divide: zero-extend EAX into EDX:EAX first. */
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
               x86_div(p->func, tmp_ECX); /* EAX = EDX:EAX / ECX */

               /* instance = (instance_id - start_instance) / divisor +
                * start_instance
                */
               x86_mov(p->func, tmp_EDX, start_instance);
               x86_add(p->func, tmp_EAX, tmp_EDX);
            }

            /* XXX we need to clamp the index here too, but to a
             * per-array max value, not the draw->pt.max_index value
             * that's being given to us via translate->set_buffer().
             */
         }
         else {
            /* Linear path: the running element counter is the index. */
            x86_mov(p->func, tmp_EAX, elt);

            /* Clamp to max_index
             */
            x86_cmp(p->func, tmp_EAX, buf_max_index);
            x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
         }

         /* ptr = base_ptr + index * stride (pointer-width math on x86-64,
          * hence the REX.W prefixes).
          */
         x86_mov(p->func, p->tmp2_EDX, buf_stride);
         x64_rexw(p->func);
         x86_imul(p->func, tmp_EAX, p->tmp2_EDX);
         x64_rexw(p->func);
         x86_add(p->func, tmp_EAX, buf_base_ptr);

         /* NOTE(review): no conditional branch in this function consumes
          * the flags from this compare — presumably leftover; confirm.
          */
         x86_cmp(p->func, p->count_EBP, p->tmp_EAX);

         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (!index_size && p->nr_buffer_variants == 1) {
            x64_rexw(p->func);
            x86_mov(p->func, elt, tmp_EAX);
         }
         else {
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, tmp_EAX);
         }
      }
   }

   return TRUE;
}
||
1162 | |||
1163 | |||
/* Return an x86 register (or memory operand) holding the source pointer
 * for buffer variant var_idx of the current vertex.
 *
 * Fast cases return a precomputed location: the instance-id pseudo
 * buffer returns its memory slot directly; the single-variant linear
 * case keeps the advancing pointer in ESI; other linear/instanced
 * variants reload the pointer that init_inputs() stored.  The general
 * indexed case emits code to fetch the element index, clamp it, and
 * compute base_ptr + index * stride into src_ECX.
 */
static struct x86_reg
get_buffer_ptr(struct translate_sse *p,
               unsigned index_size, unsigned var_idx, struct x86_reg elt)
{
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      /* Pseudo-buffer: the attribute is the instance id itself. */
      return x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id));
   }
   if (!index_size && p->nr_buffer_variants == 1) {
      /* Single linear buffer: ESI already holds the advancing pointer. */
      return p->idx_ESI;
   }
   else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
      /* Pointer was precomputed by init_inputs(); reload it. */
      struct x86_reg ptr = p->src_ECX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer_variant[var_idx].ptr));

      x64_rexw(p->func);
      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      /* Indexed, non-instanced: compute the pointer from the element. */
      struct x86_reg ptr = p->src_ECX;
      const struct translate_buffer_variant *variant =
         &p->buffer_variant[var_idx];
      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].stride));
      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].base_ptr));
      struct x86_reg buf_max_index =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].max_index));

      /* Calculate pointer to current attrib:
       */
      /* Zero-extend the index according to the element size (1/2/4 bytes). */
      switch (index_size) {
      case 1:
         x86_movzx8(p->func, ptr, elt);
         break;
      case 2:
         x86_movzx16(p->func, ptr, elt);
         break;
      case 4:
         x86_mov(p->func, ptr, elt);
         break;
      }

      /* Clamp to max_index
       */
      x86_cmp(p->func, ptr, buf_max_index);
      x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);

      /* ptr = base_ptr + index * stride (REX.W for 64-bit pointer math). */
      x86_mov(p->func, p->tmp2_EDX, buf_stride);
      x64_rexw(p->func);
      x86_imul(p->func, ptr, p->tmp2_EDX);
      x64_rexw(p->func);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}
||
1225 | |||
1226 | |||
/* Emit the per-iteration input advance at the bottom of the vertex loop.
 *
 * Linear single-buffer runs bump the pointer in ESI by the stride;
 * linear multi-variant runs advance every non-instanced variant pointer
 * stored in the machine state; indexed runs simply step ESI over the
 * element list by index_size bytes.  Instanced variants never advance
 * per vertex.
 */
static boolean
incr_inputs(struct translate_sse *p, unsigned index_size)
{
   if (!index_size && p->nr_buffer_variants == 1) {
      const unsigned buffer_index = p->buffer_variant[0].buffer_index;
      struct x86_reg stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[buffer_index].stride));

      if (p->buffer_variant[0].instance_divisor == 0) {
         /* Advance the single linear pointer and prefetch ahead. */
         x64_rexw(p->func);
         x86_add(p->func, p->idx_ESI, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
      }
   }
   else if (!index_size) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_variants; i++) {
         struct translate_buffer_variant *variant = &p->buffer_variant[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                get_offset(p, &variant->ptr));
         struct x86_reg buf_stride =
            x86_make_disp(p->machine_EDI,
                          get_offset(p, &p->buffer[variant->buffer_index].stride));

         if (variant->instance_divisor == 0) {
            /* variant->ptr += stride, via EAX; prefetch only once. */
            x86_mov(p->func, p->tmp_EAX, buf_stride);
            x64_rexw(p->func);
            x86_add(p->func, p->tmp_EAX, buf_ptr);
            if (i == 0)
               sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      /* Indexed: step ESI to the next element in the index list. */
      x64_rexw(p->func);
      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
   }

   return TRUE;
}
||
1273 | |||
1274 | |||
1275 | /* Build run( struct translate *machine, |
||
1276 | * unsigned start, |
||
1277 | * unsigned count, |
||
1278 | * void *output_buffer ) |
||
1279 | * or |
||
1280 | * run_elts( struct translate *machine, |
||
1281 | * unsigned *elts, |
||
1282 | * unsigned count, |
||
1283 | * void *output_buffer ) |
||
1284 | * |
||
1285 | * Lots of hardcoding |
||
1286 | * |
||
1287 | * EAX -- pointer to current output vertex |
||
1288 | * ECX -- pointer to current attribute |
||
1289 | * |
||
1290 | */ |
||
/* Generate the complete run()/run_elts() machine-code function into
 * *func for the given element index size (0 = linear, 1/2/4 = byte,
 * short, int element lists).  Handles prologue/epilogue for the X86-32,
 * SysV x86-64 and Win64 ABIs, then emits the per-vertex translate loop.
 * Returns FALSE if any attribute conversion is unsupported.
 */
static boolean
build_vertex_emit(struct translate_sse *p,
                  struct x86_function *func, unsigned index_size)
{
   int fixup, label;
   unsigned j;

   /* Invalidate the XMM-constant cache (0xff == no constant loaded). */
   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));

   /* Fixed register assignment used by all the emit helpers. */
   p->tmp_EAX = x86_make_reg(file_REG32, reg_AX);
   p->idx_ESI = x86_make_reg(file_REG32, reg_SI);
   p->outbuf_EBX = x86_make_reg(file_REG32, reg_BX);
   p->machine_EDI = x86_make_reg(file_REG32, reg_DI);
   p->count_EBP = x86_make_reg(file_REG32, reg_BP);
   p->tmp2_EDX = x86_make_reg(file_REG32, reg_DX);
   p->src_ECX = x86_make_reg(file_REG32, reg_CX);

   p->func = func;

   x86_init_func(p->func);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space"
       * above the return address
       */
      /* Save callee-saved XMM6/XMM7 in the shadow space. */
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8),
                  x86_make_reg(file_XMM, 6));
      sse2_movdqa(p->func,
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24),
                  x86_make_reg(file_XMM, 7));
   }

   x86_push(p->func, p->outbuf_EBX);
   x86_push(p->func, p->count_EBP);

   /* on non-Win64 x86-64, these are already in the right registers */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_push(p->func, p->machine_EDI);
      x86_push(p->func, p->idx_ESI);

      if (x86_target(p->func) != X86_32) {
         x64_mov64(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x64_mov64(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
      else {
         x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
         x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
      }
   }

   /* Arg 3: vertex count.  Arg 6: output buffer pointer. */
   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));

   if (x86_target(p->func) != X86_32)
      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
   else
      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      /* Arg 4: start_instance, arg 5: instance_id -> machine state. */
      x86_mov(p->func, p->tmp2_EDX, x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI,
                            get_offset(p, &p->start_instance)), p->tmp2_EDX);

      x86_mov(p->func, p->tmp_EAX, x86_fn_arg(p->func, 5));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, index_size);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      /* Linear: ESI is the index itself; indexed: dereference ESI. */
      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
      int last_variant = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned variant = p->element_to_buffer_variant[j];

         /* Figure out source pointer address:
          */
         if (variant != last_variant) {
            last_variant = variant;
            vb = get_buffer_ptr(p, index_size, variant, elt);
         }

         if (!translate_attr(p, a,
                             x86_make_disp(vb, a->input_offset),
                             x86_make_disp(p->outbuf_EBX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x64_rexw(p->func);
      x86_lea(p->func, p->outbuf_EBX,
              x86_make_disp(p->outbuf_EBX, p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs(p, index_size);
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_EBP);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */
   if (x86_target(p->func) != X86_64_STD_ABI) {
      x86_pop(p->func, p->idx_ESI);
      x86_pop(p->func, p->machine_EDI);
   }

   x86_pop(p->func, p->count_EBP);
   x86_pop(p->func, p->outbuf_EBX);

   if (x86_target(p->func) == X86_64_WIN64_ABI) {
      /* Restore callee-saved XMM6/XMM7 from the shadow space. */
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7),
                  x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
   }
   x86_ret(p->func);

   return TRUE;
}
||
1443 | |||
1444 | |||
1445 | static void |
||
1446 | translate_sse_set_buffer(struct translate *translate, |
||
1447 | unsigned buf, |
||
1448 | const void *ptr, unsigned stride, unsigned max_index) |
||
1449 | { |
||
1450 | struct translate_sse *p = (struct translate_sse *) translate; |
||
1451 | |||
1452 | if (buf < p->nr_buffers) { |
||
1453 | p->buffer[buf].base_ptr = (char *) ptr; |
||
1454 | p->buffer[buf].stride = stride; |
||
1455 | p->buffer[buf].max_index = max_index; |
||
1456 | } |
||
1457 | |||
1458 | if (0) |
||
1459 | debug_printf("%s %d/%d: %p %d\n", |
||
1460 | __FUNCTION__, buf, p->nr_buffers, ptr, stride); |
||
1461 | } |
||
1462 | |||
1463 | |||
1464 | static void |
||
1465 | translate_sse_release(struct translate *translate) |
||
1466 | { |
||
1467 | struct translate_sse *p = (struct translate_sse *) translate; |
||
1468 | |||
1469 | x86_release_func(&p->elt8_func); |
||
1470 | x86_release_func(&p->elt16_func); |
||
1471 | x86_release_func(&p->elt_func); |
||
1472 | x86_release_func(&p->linear_func); |
||
1473 | |||
1474 | os_free_aligned(p); |
||
1475 | } |
||
1476 | |||
1477 | |||
/* Create an SSE-accelerated translate object for the given key.
 *
 * Builds buffer-variant bookkeeping from the key's elements, then JIT
 * compiles four entry points (linear and 8/16/32-bit indexed runs).
 * Returns NULL if rtasm is unavailable, allocation fails, or any
 * attribute conversion is unsupported; all partial state is released
 * on the failure path.
 */
struct translate *
translate_sse2_create(const struct translate_key *key)
{
   struct translate_sse *p = NULL;
   unsigned i;

   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
   if (!rtasm_cpu_has_sse())
      goto fail;

   /* 16-byte alignment so the embedded XMM constants can be movaps'd. */
   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
   if (p == NULL)
      goto fail;

   memset(p, 0, sizeof(*p));
   memcpy(p->consts, consts, sizeof(consts));

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;

   assert(key->nr_elements <= TRANSLATE_MAX_ATTRIBS);

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         unsigned j;

         p->nr_buffers =
            MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = TRUE;
         }

         /*
          * Map vertex element to vertex buffer variant.
          */
         /* Reuse an existing (buffer, divisor) variant if present. */
         for (j = 0; j < p->nr_buffer_variants; j++) {
            if (p->buffer_variant[j].buffer_index ==
                key->element[i].input_buffer
                && p->buffer_variant[j].instance_divisor ==
                key->element[i].instance_divisor) {
               break;
            }
         }
         if (j == p->nr_buffer_variants) {
            /* No match: append a new variant. */
            p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
            p->buffer_variant[j].instance_divisor =
               key->element[i].instance_divisor;
            p->nr_buffer_variants++;
         }
         p->element_to_buffer_variant[i] = j;
      }
      else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   if (0)
      debug_printf("nr_buffers: %d\n", p->nr_buffers);

   /* JIT the four entry points: linear and 32/16/8-bit indexed. */
   if (!build_vertex_emit(p, &p->linear_func, 0))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, 4))
      goto fail;

   if (!build_vertex_emit(p, &p->elt16_func, 2))
      goto fail;

   if (!build_vertex_emit(p, &p->elt8_func, 1))
      goto fail;

   p->translate.run = (run_func) x86_get_func(&p->linear_func);
   if (p->translate.run == NULL)
      goto fail;

   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
   if (p->translate.run_elts == NULL)
      goto fail;

   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
   if (p->translate.run_elts16 == NULL)
      goto fail;

   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
   if (p->translate.run_elts8 == NULL)
      goto fail;

   return &p->translate;

 fail:
   /* Releasing a zeroed/partially built object is safe here: p was
    * memset to zero before any code generation.
    */
   if (p)
      translate_sse_release(&p->translate);

   return NULL;
}
||
1577 | |||
1578 | |||
1579 | #else |
||
1580 | |||
/* Stub used when rtasm/SSE support is compiled out (non-x86 targets or
 * embedded builds): always returns NULL so callers use another backend.
 */
struct translate *
translate_sse2_create(const struct translate_key *key)
{
   return NULL;
}
||
1586 | |||
1587 | #endif |