Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
5564 | serge | 1 | /* |
2 | * Copyright 2003 VMware, Inc. |
||
3 | * All Rights Reserved. |
||
4 | * |
||
5 | * Permission is hereby granted, free of charge, to any person obtaining a |
||
6 | * copy of this software and associated documentation files (the "Software"), |
||
7 | * to deal in the Software without restriction, including without limitation |
||
8 | * on the rights to use, copy, modify, merge, publish, distribute, sub |
||
9 | * license, and/or sell copies of the Software, and to permit persons to whom |
||
10 | * the Software is furnished to do so, subject to the following conditions: |
||
11 | * |
||
12 | * The above copyright notice and this permission notice (including the next |
||
13 | * paragraph) shall be included in all copies or substantial portions of the |
||
14 | * Software. |
||
15 | * |
||
16 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||
17 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||
18 | * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL |
||
19 | * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM, |
||
20 | * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR |
||
21 | * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE |
||
22 | * USE OR OTHER DEALINGS IN THE SOFTWARE. |
||
23 | * |
||
24 | * Authors: |
||
25 | * Keith Whitwell |
||
26 | */ |
||
27 | |||
28 | #include |
||
29 | |||
30 | #include "main/glheader.h" |
||
31 | #include "main/context.h" |
||
32 | #include "util/simple_list.h" |
||
33 | #include "main/enums.h" |
||
34 | #include "swrast/s_chan.h" |
||
35 | #include "t_context.h" |
||
36 | #include "t_vertex.h" |
||
37 | |||
38 | #if defined(USE_SSE_ASM) |
||
39 | |||
40 | #include "x86/rtasm/x86sse.h" |
||
41 | #include "x86/common_x86_asm.h" |
||
42 | |||
43 | |||
44 | /** |
||
45 | * Number of bytes to allocate for generated SSE functions |
||
46 | */ |
||
47 | #define MAX_SSE_CODE_SIZE 1024 |
||
48 | |||
49 | |||
50 | #define X 0 |
||
51 | #define Y 1 |
||
52 | #define Z 2 |
||
53 | #define W 3 |
||
54 | |||
55 | |||
56 | struct x86_program { |
||
57 | struct x86_function func; |
||
58 | |||
59 | struct gl_context *ctx; |
||
60 | GLboolean inputs_safe; |
||
61 | GLboolean outputs_safe; |
||
62 | GLboolean have_sse2; |
||
63 | |||
64 | struct x86_reg identity; |
||
65 | struct x86_reg chan0; |
||
66 | }; |
||
67 | |||
68 | |||
/* Return the XMM register that build_vertex_emit() preloads with the
 * "identity" vector (loaded from vtx->identity; used to pad missing
 * vector components -- the load helpers' comments imply it holds
 * (0, 0, 0, 1)).
 */
static struct x86_reg get_identity( struct x86_program *p )
{
   return p->identity;
}
||
73 | |||
/* Load 4 floats from memory at arg0 into XMM reg dest with a single
 * unaligned 16-byte move.
 */
static void emit_load4f_4( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   sse_movups(&p->func, dest, arg0);
}
||
80 | |||
81 | static void emit_load4f_3( struct x86_program *p, |
||
82 | struct x86_reg dest, |
||
83 | struct x86_reg arg0 ) |
||
84 | { |
||
85 | /* Have to jump through some hoops: |
||
86 | * |
||
87 | * c 0 0 0 |
||
88 | * c 0 0 1 |
||
89 | * 0 0 c 1 |
||
90 | * a b c 1 |
||
91 | */ |
||
92 | sse_movss(&p->func, dest, x86_make_disp(arg0, 8)); |
||
93 | sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) ); |
||
94 | sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) ); |
||
95 | sse_movlps(&p->func, dest, arg0); |
||
96 | } |
||
97 | |||
98 | static void emit_load4f_2( struct x86_program *p, |
||
99 | struct x86_reg dest, |
||
100 | struct x86_reg arg0 ) |
||
101 | { |
||
102 | /* Initialize from identity, then pull in low two words: |
||
103 | */ |
||
104 | sse_movups(&p->func, dest, get_identity(p)); |
||
105 | sse_movlps(&p->func, dest, arg0); |
||
106 | } |
||
107 | |||
108 | static void emit_load4f_1( struct x86_program *p, |
||
109 | struct x86_reg dest, |
||
110 | struct x86_reg arg0 ) |
||
111 | { |
||
112 | /* Pull in low word, then swizzle in identity */ |
||
113 | sse_movss(&p->func, dest, arg0); |
||
114 | sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) ); |
||
115 | } |
||
116 | |||
117 | |||
118 | |||
119 | static void emit_load3f_3( struct x86_program *p, |
||
120 | struct x86_reg dest, |
||
121 | struct x86_reg arg0 ) |
||
122 | { |
||
123 | /* Over-reads by 1 dword - potential SEGV if input is a vertex |
||
124 | * array. |
||
125 | */ |
||
126 | if (p->inputs_safe) { |
||
127 | sse_movups(&p->func, dest, arg0); |
||
128 | } |
||
129 | else { |
||
130 | /* c 0 0 0 |
||
131 | * c c c c |
||
132 | * a b c c |
||
133 | */ |
||
134 | sse_movss(&p->func, dest, x86_make_disp(arg0, 8)); |
||
135 | sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X)); |
||
136 | sse_movlps(&p->func, dest, arg0); |
||
137 | } |
||
138 | } |
||
139 | |||
/* Load 2 floats when only 3 are needed: delegate to the 4f_2 helper.
 * The extra w component it fills from the identity vector is harmless
 * since the caller only stores 3 components.
 */
static void emit_load3f_2( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   emit_load4f_2(p, dest, arg0);
}
||
146 | |||
/* Load 1 float when 3 are needed.
 *
 * Loading from memory erases the upper bits (movss from memory zeroes
 * the upper three lanes), so y and z end up 0 rather than coming from
 * the identity vector.
 */
static void emit_load3f_1( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   sse_movss(&p->func, dest, arg0);
}
||
154 | |||
/* Load 2 floats with a single 8-byte move into the low half of dest. */
static void emit_load2f_2( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   sse_movlps(&p->func, dest, arg0);
}
||
161 | |||
/* Load 1 float when 2 are needed.
 *
 * Loading from memory erases the upper bits, so y becomes 0.
 */
static void emit_load2f_1( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   sse_movss(&p->func, dest, arg0);
}
||
169 | |||
/* Load a single float with a scalar move. */
static void emit_load1f_1( struct x86_program *p,
                           struct x86_reg dest,
                           struct x86_reg arg0 )
{
   sse_movss(&p->func, dest, arg0);
}
||
176 | |||
177 | static void (*load[4][4])( struct x86_program *p, |
||
178 | struct x86_reg dest, |
||
179 | struct x86_reg arg0 ) = { |
||
180 | { emit_load1f_1, |
||
181 | emit_load1f_1, |
||
182 | emit_load1f_1, |
||
183 | emit_load1f_1 }, |
||
184 | |||
185 | { emit_load2f_1, |
||
186 | emit_load2f_2, |
||
187 | emit_load2f_2, |
||
188 | emit_load2f_2 }, |
||
189 | |||
190 | { emit_load3f_1, |
||
191 | emit_load3f_2, |
||
192 | emit_load3f_3, |
||
193 | emit_load3f_3 }, |
||
194 | |||
195 | { emit_load4f_1, |
||
196 | emit_load4f_2, |
||
197 | emit_load4f_3, |
||
198 | emit_load4f_4 } |
||
199 | }; |
||
200 | |||
201 | static void emit_load( struct x86_program *p, |
||
202 | struct x86_reg dest, |
||
203 | GLuint sz, |
||
204 | struct x86_reg src, |
||
205 | GLuint src_sz) |
||
206 | { |
||
207 | load[sz-1][src_sz-1](p, dest, src); |
||
208 | } |
||
209 | |||
/* Store 4 floats with a single unaligned 16-byte move. */
static void emit_store4f( struct x86_program *p,
                          struct x86_reg dest,
                          struct x86_reg arg0 )
{
   sse_movups(&p->func, dest, arg0);
}
||
216 | |||
217 | static void emit_store3f( struct x86_program *p, |
||
218 | struct x86_reg dest, |
||
219 | struct x86_reg arg0 ) |
||
220 | { |
||
221 | if (p->outputs_safe) { |
||
222 | /* Emit the extra dword anyway. This may hurt writecombining, |
||
223 | * may cause other problems. |
||
224 | */ |
||
225 | sse_movups(&p->func, dest, arg0); |
||
226 | } |
||
227 | else { |
||
228 | /* Alternate strategy - emit two, shuffle, emit one. |
||
229 | */ |
||
230 | sse_movlps(&p->func, dest, arg0); |
||
231 | sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */ |
||
232 | sse_movss(&p->func, x86_make_disp(dest,8), arg0); |
||
233 | } |
||
234 | } |
||
235 | |||
/* Store the low 2 floats of arg0 with an 8-byte move. */
static void emit_store2f( struct x86_program *p,
                          struct x86_reg dest,
                          struct x86_reg arg0 )
{
   sse_movlps(&p->func, dest, arg0);
}
||
242 | |||
/* Store the low float of arg0 with a scalar move. */
static void emit_store1f( struct x86_program *p,
                          struct x86_reg dest,
                          struct x86_reg arg0 )
{
   sse_movss(&p->func, dest, arg0);
}
||
249 | |||
250 | |||
251 | static void (*store[4])( struct x86_program *p, |
||
252 | struct x86_reg dest, |
||
253 | struct x86_reg arg0 ) = |
||
254 | { |
||
255 | emit_store1f, |
||
256 | emit_store2f, |
||
257 | emit_store3f, |
||
258 | emit_store4f |
||
259 | }; |
||
260 | |||
261 | static void emit_store( struct x86_program *p, |
||
262 | struct x86_reg dest, |
||
263 | GLuint sz, |
||
264 | struct x86_reg temp ) |
||
265 | |||
266 | { |
||
267 | store[sz-1](p, dest, temp); |
||
268 | } |
||
269 | |||
270 | static void emit_pack_store_4ub( struct x86_program *p, |
||
271 | struct x86_reg dest, |
||
272 | struct x86_reg temp ) |
||
273 | { |
||
274 | /* Scale by 255.0 |
||
275 | */ |
||
276 | sse_mulps(&p->func, temp, p->chan0); |
||
277 | |||
278 | if (p->have_sse2) { |
||
279 | sse2_cvtps2dq(&p->func, temp, temp); |
||
280 | sse2_packssdw(&p->func, temp, temp); |
||
281 | sse2_packuswb(&p->func, temp, temp); |
||
282 | sse_movss(&p->func, dest, temp); |
||
283 | } |
||
284 | else { |
||
285 | struct x86_reg mmx0 = x86_make_reg(file_MMX, 0); |
||
286 | struct x86_reg mmx1 = x86_make_reg(file_MMX, 1); |
||
287 | sse_cvtps2pi(&p->func, mmx0, temp); |
||
288 | sse_movhlps(&p->func, temp, temp); |
||
289 | sse_cvtps2pi(&p->func, mmx1, temp); |
||
290 | mmx_packssdw(&p->func, mmx0, mmx1); |
||
291 | mmx_packuswb(&p->func, mmx0, mmx0); |
||
292 | mmx_movd(&p->func, dest, mmx0); |
||
293 | } |
||
294 | } |
||
295 | |||
/* Byte offset of `b` relative to `a` -- used to build displacement
 * addressing modes for struct members.
 *
 * NOTE(review): pointer subtraction is only defined within a single
 * object; every caller here passes `b` pointing inside the struct at
 * `a`, which keeps this well-defined in practice.
 */
static GLint get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}
||
300 | |||
301 | /* Not much happens here. Eventually use this function to try and |
||
302 | * avoid saving/reloading the source pointers each vertex (if some of |
||
303 | * them can fit in registers). |
||
304 | */ |
||
305 | static void get_src_ptr( struct x86_program *p, |
||
306 | struct x86_reg srcREG, |
||
307 | struct x86_reg vtxREG, |
||
308 | struct tnl_clipspace_attr *a ) |
||
309 | { |
||
310 | struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx); |
||
311 | struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr)); |
||
312 | |||
313 | /* Load current a[j].inputptr |
||
314 | */ |
||
315 | x86_mov(&p->func, srcREG, ptr_to_src); |
||
316 | } |
||
317 | |||
318 | static void update_src_ptr( struct x86_program *p, |
||
319 | struct x86_reg srcREG, |
||
320 | struct x86_reg vtxREG, |
||
321 | struct tnl_clipspace_attr *a ) |
||
322 | { |
||
323 | if (a->inputstride) { |
||
324 | struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx); |
||
325 | struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr)); |
||
326 | |||
327 | /* add a[j].inputstride (hardcoded value - could just as easily |
||
328 | * pull the stride value from memory each time). |
||
329 | */ |
||
330 | x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride)); |
||
331 | |||
332 | /* save new value of a[j].inputptr |
||
333 | */ |
||
334 | x86_mov(&p->func, ptr_to_src, srcREG); |
||
335 | } |
||
336 | } |
||
337 | |||
338 | |||
/* Generate the machine-code fast path that converts one run of
 * clip-space attributes into hardware vertices.
 *
 * The generated function's arguments (via x86_fn_arg) are:
 *   1 -- struct gl_context *        (dereferenced to reach clipspace state)
 *   2 -- vertex count
 *   3 -- pointer to the output vertex buffer
 *
 * Lots of hardcoding
 *
 * EAX -- pointer to current output vertex
 * ECX -- pointer to current attribute
 *
 * Returns GL_TRUE and installs vtx->emit on success; GL_FALSE if any
 * attribute layout cannot be handled (caller then releases the
 * partially built function).
 */
static GLboolean build_vertex_emit( struct x86_program *p )
{
   struct gl_context *ctx = p->ctx;
   TNLcontext *tnl = TNL_CONTEXT(ctx);
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
   GLuint j = 0;

   /* Register assignments for the generated code: */
   struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
   struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
   struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
   struct x86_reg vtxESI = x86_make_reg(file_REG32, reg_SI);
   struct x86_reg temp = x86_make_reg(file_XMM, 0);
   struct x86_reg vp0 = x86_make_reg(file_XMM, 1);
   struct x86_reg vp1 = x86_make_reg(file_XMM, 2);
   struct x86_reg temp2 = x86_make_reg(file_XMM, 3);
   GLubyte *fixup, *label;

   /* Push a few regs?
    */
   x86_push(&p->func, countEBP);
   x86_push(&p->func, vtxESI);

   /* Get vertex count, compare to zero
    */
   x86_xor(&p->func, srcECX, srcECX);
   x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
   x86_cmp(&p->func, countEBP, srcECX);
   fixup = x86_jcc_forward(&p->func, cc_E);   /* skip loop when count == 0 */

   /* Initialize destination register.
    */
   x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));

   /* Dereference ctx to get tnl, then vtx:
    */
   x86_mov(&p->func, vtxESI, x86_fn_arg(&p->func, 1));
   x86_mov(&p->func, vtxESI, x86_make_disp(vtxESI, get_offset(ctx, &ctx->swtnl_context)));
   vtxESI = x86_make_disp(vtxESI, get_offset(tnl, &tnl->clipspace));

   /* Possibly load vp0, vp1 for viewport calcs:
    */
   if (vtx->need_viewport) {
      sse_movups(&p->func, vp0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_scale[0])));
      sse_movups(&p->func, vp1, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_xlate[0])));
   }

   /* always load, needed or not:
    */
   sse_movups(&p->func, p->chan0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->chan_scale[0])));
   sse_movups(&p->func, p->identity, x86_make_disp(vtxESI, get_offset(vtx, &vtx->identity[0])));

   /* Note address for loop jump */
   label = x86_get_label(&p->func);

   /* Emit code for each of the attributes.  Currently routes
    * everything through SSE registers, even when it might be more
    * efficient to stick with regular old x86.  No optimization or
    * other tricks - enough new ground to cover here just getting
    * things working.
    */
   while (j < vtx->attr_count) {
      struct tnl_clipspace_attr *a = &vtx->attr[j];
      struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);

      /* Now, load an XMM reg from src, perhaps transform, then save.
       * Could be shortcircuited in specific cases:
       */
      switch (a->format) {
      case EMIT_1F:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
         emit_store(p, dest, 1, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_2F:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
         emit_store(p, dest, 2, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_3F:
         /* Potentially the worst case - hardcode 2+1 copying:
          */
         /* NOTE(review): the straightforward 3-component path below is
          * deliberately disabled (if (0)); the 2+1 split avoids the
          * over-read/over-write hazards of the 3f helpers.
          */
         if (0) {
            get_src_ptr(p, srcECX, vtxESI, a);
            emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
            emit_store(p, dest, 3, temp);
            update_src_ptr(p, srcECX, vtxESI, a);
         }
         else {
            get_src_ptr(p, srcECX, vtxESI, a);
            emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
            emit_store(p, dest, 2, temp);
            if (a->inputsize > 2) {
               /* Copy the third component separately. */
               emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
               emit_store(p, x86_make_disp(dest,8), 1, temp);
            }
            else {
               /* Source has no z: pad from the identity vector. */
               sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
            }
            update_src_ptr(p, srcECX, vtxESI, a);
         }
         break;
      case EMIT_4F:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
         emit_store(p, dest, 4, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_2F_VIEWPORT:
         /* Apply viewport scale and translate before storing. */
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
         sse_mulps(&p->func, temp, vp0);
         sse_addps(&p->func, temp, vp1);
         emit_store(p, dest, 2, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_3F_VIEWPORT:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
         sse_mulps(&p->func, temp, vp0);
         sse_addps(&p->func, temp, vp1);
         emit_store(p, dest, 3, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_4F_VIEWPORT:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
         sse_mulps(&p->func, temp, vp0);
         sse_addps(&p->func, temp, vp1);
         emit_store(p, dest, 4, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_3F_XYW:
         /* Emit (x, y, w): load 4, move w into the third slot. */
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
         sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
         emit_store(p, dest, 3, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;

      case EMIT_1UB_1F:
         /* Test for PAD3 + 1UB:
          */
         if (j > 0 &&
             a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
         {
            /* The 4-byte pack store writes 3 pad bytes below dest,
             * which the layout test above has proven are unused.
             */
            get_src_ptr(p, srcECX, vtxESI, a);
            emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
            sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
            emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
            update_src_ptr(p, srcECX, vtxESI, a);
         }
         else {
            printf("Can't emit 1ub %x %x %d\n", a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
            return GL_FALSE;
         }
         break;
      case EMIT_3UB_3F_RGB:
      case EMIT_3UB_3F_BGR:
         /* Test for 3UB + PAD1:
          */
         if (j == vtx->attr_count - 1 ||
             a[1].vertoffset >= a->vertoffset + 4) {
            /* 4-byte store is safe: the 4th byte is padding. */
            get_src_ptr(p, srcECX, vtxESI, a);
            emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
            if (a->format == EMIT_3UB_3F_BGR)
               sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
            emit_pack_store_4ub(p, dest, temp);
            update_src_ptr(p, srcECX, vtxESI, a);
         }
         /* Test for 3UB + 1UB:
          */
         else if (j < vtx->attr_count - 1 &&
                  a[1].format == EMIT_1UB_1F &&
                  a[1].vertoffset == a->vertoffset + 3) {
            /* Fuse this attribute with the following 1UB one and emit
             * all four bytes in a single pack store.
             */
            get_src_ptr(p, srcECX, vtxESI, a);
            emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
            update_src_ptr(p, srcECX, vtxESI, a);

            /* Make room for incoming value:
             */
            sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));

            get_src_ptr(p, srcECX, vtxESI, &a[1]);
            emit_load(p, temp2, 1, x86_deref(srcECX), a[1].inputsize);
            sse_movss(&p->func, temp, temp2);
            update_src_ptr(p, srcECX, vtxESI, &a[1]);

            /* Rearrange and possibly do BGR conversion:
             */
            if (a->format == EMIT_3UB_3F_BGR)
               sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
            else
               sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));

            emit_pack_store_4ub(p, dest, temp);
            j++;                /* NOTE: two attrs consumed */
         }
         else {
            printf("Can't emit 3ub\n");
            return GL_FALSE;    /* add this later */
         }
         break;

      case EMIT_4UB_4F_RGBA:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
         emit_pack_store_4ub(p, dest, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_4UB_4F_BGRA:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
         sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
         emit_pack_store_4ub(p, dest, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_4UB_4F_ARGB:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
         sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
         emit_pack_store_4ub(p, dest, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_4UB_4F_ABGR:
         get_src_ptr(p, srcECX, vtxESI, a);
         emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
         sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
         emit_pack_store_4ub(p, dest, temp);
         update_src_ptr(p, srcECX, vtxESI, a);
         break;
      case EMIT_4CHAN_4F_RGBA:
         /* Channel layout depends on the build-time GLchan type. */
         switch (CHAN_TYPE) {
         case GL_UNSIGNED_BYTE:
            get_src_ptr(p, srcECX, vtxESI, a);
            emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
            emit_pack_store_4ub(p, dest, temp);
            update_src_ptr(p, srcECX, vtxESI, a);
            break;
         case GL_FLOAT:
            get_src_ptr(p, srcECX, vtxESI, a);
            emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
            emit_store(p, dest, 4, temp);
            update_src_ptr(p, srcECX, vtxESI, a);
            break;
         case GL_UNSIGNED_SHORT:   /* not implemented -- fall through */
         default:
            printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
            return GL_FALSE;
         }
         break;
      default:
         printf("unknown a[%d].format %d\n", j, a->format);
         return GL_FALSE;       /* catch any new opcodes */
      }

      /* Increment j by at least 1 - may have been incremented above also:
       */
      j++;
   }

   /* Next vertex:
    */
   x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vtx->vertex_size));

   /* decr count, loop if not zero
    */
   x86_dec(&p->func, countEBP);
   x86_test(&p->func, countEBP, countEBP);
   x86_jcc(&p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func.need_emms)
      mmx_emms(&p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(&p->func, fixup);

   /* Pop regs and return
    */
   x86_pop(&p->func, x86_get_base_reg(vtxESI));
   x86_pop(&p->func, countEBP);
   x86_ret(&p->func);

   assert(!vtx->emit);
   vtx->emit = (tnl_emit_func)x86_get_func(&p->func);

   /* Verify we stayed inside the buffer allocated by the caller. */
   assert( (char *) p->func.csr - (char *) p->func.store <= MAX_SSE_CODE_SIZE );
   return GL_TRUE;
}
||
640 | |||
641 | |||
642 | |||
/* Public entry point: attempt to code-generate an SSE vertex-emit
 * fast path for the current vertex layout, registering success or
 * failure so the attempt is not repeated for the same state.
 */
void _tnl_generate_sse_emit( struct gl_context *ctx )
{
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
   struct x86_program p;

   /* Plain SSE is the baseline requirement for any codegen. */
   if (!cpu_has_xmm) {
      vtx->codegen_emit = NULL;
      return;
   }

   memset(&p, 0, sizeof(p));

   p.ctx = ctx;
   p.inputs_safe = 0;           /* for now */
   p.outputs_safe = 0;          /* for now */
   p.have_sse2 = cpu_has_xmm2;
   /* XMM6/XMM7 are reserved throughout the generated function for the
    * identity vector and the channel scale.
    */
   p.identity = x86_make_reg(file_XMM, 6);
   p.chan0 = x86_make_reg(file_XMM, 7);

   if (!x86_init_func_size(&p.func, MAX_SSE_CODE_SIZE)) {
      /* Out of memory for the code buffer. */
      vtx->emit = NULL;
      return;
   }

   if (build_vertex_emit(&p)) {
      _tnl_register_fastpath( vtx, GL_TRUE );
   }
   else {
      /* Note the failure so that we don't keep trying to codegen an
       * impossible state:
       */
      _tnl_register_fastpath( vtx, GL_FALSE );
      x86_release_func(&p.func);
   }
}
||
678 | |||
679 | #else |
||
680 | |||
/* Dummy version for when USE_SSE_ASM not defined */
void _tnl_generate_sse_emit( struct gl_context *ctx )
{
   (void) ctx;   /* silence unused-parameter warnings */
}
||
685 | |||
686 | #endif=>>=>> |