/*
 * Copyright 2003 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * VMWARE AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell <keithw@tungstengraphics.com>
 */

#include <stdio.h>

#include "main/glheader.h"
#include "main/context.h"
#include "util/simple_list.h"
#include "main/enums.h"
#include "swrast/s_chan.h"
#include "t_context.h"
#include "t_vertex.h"

#if defined(USE_SSE_ASM)

#include "x86/rtasm/x86sse.h"
#include "x86/common_x86_asm.h"


/**
 * Number of bytes to allocate for generated SSE functions
 */
#define MAX_SSE_CODE_SIZE 1024


#define X    0
#define Y    1
#define Z    2
#define W    3
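
/* X/Y/Z/W above are component indices for the SHUF() helper from
 * x86sse.h, which as far as I can tell packs four 2-bit lane selectors
 * into the 8-bit immediate that sse_shufps() emits: SHUF(X,X,X,X)
 * broadcasts the low float, and SHUF(Z,Y,X,W) swaps the first and third
 * components (used below for the BGR<->RGB swizzles).
 */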


struct x86_program {
   struct x86_function func;

   struct gl_context *ctx;
   GLboolean inputs_safe;
   GLboolean outputs_safe;
   GLboolean have_sse2;

   struct x86_reg identity;
   struct x86_reg chan0;
};
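
/* Note: identity and chan0 are XMM registers (XMM6/XMM7, assigned in
 * _tnl_generate_sse_emit below) that build_vertex_emit() preloads with
 * the attribute identity vector and the channel scale (255.0 per
 * component), so both stay live as constants across the whole generated
 * vertex loop.
 */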


static struct x86_reg get_identity( struct x86_program *p )
{
   return p->identity;
}

static void emit_load4f_4( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movups(&p->func, dest, arg0);
}

static void emit_load4f_3( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Have to jump through some hoops:
    *
    * c 0 0 0
    * c 0 0 1
    * 0 0 c 1
    * a b c 1
    */
   sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
   sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) );
   sse_movlps(&p->func, dest, arg0);
}
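
/* The trick above relies on two SSE details: movss from memory clears
 * the upper three lanes (giving "c 0 0 0"), and movlps only replaces the
 * low two lanes, so the "c 1" shuffled into the high half survives while
 * "a b" is loaded underneath it.
 */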

static void emit_load4f_2( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Initialize from identity, then pull in low two words:
    */
   sse_movups(&p->func, dest, get_identity(p));
   sse_movlps(&p->func, dest, arg0);
}

static void emit_load4f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Pull in low word, then swizzle in identity */
   sse_movss(&p->func, dest, arg0);
   sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
}



static void emit_load3f_3( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Over-reads by 1 dword - potential SEGV if input is a vertex
    * array.
    */
   if (p->inputs_safe) {
      sse_movups(&p->func, dest, arg0);
   }
   else {
      /* c 0 0 0
       * c c c c
       * a b c c
       */
      sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
      sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X));
      sse_movlps(&p->func, dest, arg0);
   }
}

static void emit_load3f_2( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   emit_load4f_2(p, dest, arg0);
}

static void emit_load3f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Loading from memory erases the upper bits. */
   sse_movss(&p->func, dest, arg0);
}

static void emit_load2f_2( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movlps(&p->func, dest, arg0);
}

static void emit_load2f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   /* Loading from memory erases the upper bits. */
   sse_movss(&p->func, dest, arg0);
}

static void emit_load1f_1( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movss(&p->func, dest, arg0);
}

static void (*load[4][4])( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 ) = {
   { emit_load1f_1,
     emit_load1f_1,
     emit_load1f_1,
     emit_load1f_1 },

   { emit_load2f_1,
     emit_load2f_2,
     emit_load2f_2,
     emit_load2f_2 },

   { emit_load3f_1,
     emit_load3f_2,
     emit_load3f_3,
     emit_load3f_3 },

   { emit_load4f_1,
     emit_load4f_2,
     emit_load4f_3,
     emit_load4f_4 }
};
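
/* Dispatch table: load[dest_size-1][src_size-1] picks the loader for a
 * given output width and input width.  Rows with repeated entries cover
 * the cases where surplus source components can simply be ignored, e.g.
 * load[0][2] == emit_load1f_1 reads just the first float of a 3-float
 * input.
 */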

static void emit_load( struct x86_program *p,
		       struct x86_reg dest,
		       GLuint sz,
		       struct x86_reg src,
		       GLuint src_sz)
{
   load[sz-1][src_sz-1](p, dest, src);
}

static void emit_store4f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   sse_movups(&p->func, dest, arg0);
}

static void emit_store3f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   if (p->outputs_safe) {
      /* Emit the extra dword anyway.  This may hurt writecombining,
       * may cause other problems.
       */
      sse_movups(&p->func, dest, arg0);
   }
   else {
      /* Alternate strategy - emit two, shuffle, emit one.
       */
      sse_movlps(&p->func, dest, arg0);
      sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
      sse_movss(&p->func, x86_make_disp(dest,8), arg0);
   }
}
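
/* The unsafe path writes exactly 12 bytes: movlps stores the low 8,
 * shufps broadcasts the Z component into the low lane (clobbering arg0,
 * hence the NOTE), and movss stores the remaining 4 bytes at offset 8.
 */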

static void emit_store2f( struct x86_program *p,
			   struct x86_reg dest,
			   struct x86_reg arg0 )
{
   sse_movlps(&p->func, dest, arg0);
}

static void emit_store1f( struct x86_program *p,
			  struct x86_reg dest,
			  struct x86_reg arg0 )
{
   sse_movss(&p->func, dest, arg0);
}


static void (*store[4])( struct x86_program *p,
			 struct x86_reg dest,
			 struct x86_reg arg0 ) =
{
   emit_store1f,
   emit_store2f,
   emit_store3f,
   emit_store4f
};

static void emit_store( struct x86_program *p,
			struct x86_reg dest,
			GLuint sz,
			struct x86_reg temp )

{
   store[sz-1](p, dest, temp);
}

static void emit_pack_store_4ub( struct x86_program *p,
				 struct x86_reg dest,
				 struct x86_reg temp )
{
   /* Scale by 255.0
    */
   sse_mulps(&p->func, temp, p->chan0);

   if (p->have_sse2) {
      sse2_cvtps2dq(&p->func, temp, temp);
      sse2_packssdw(&p->func, temp, temp);
      sse2_packuswb(&p->func, temp, temp);
      sse_movss(&p->func, dest, temp);
   }
   else {
      struct x86_reg mmx0 = x86_make_reg(file_MMX, 0);
      struct x86_reg mmx1 = x86_make_reg(file_MMX, 1);
      sse_cvtps2pi(&p->func, mmx0, temp);
      sse_movhlps(&p->func, temp, temp);
      sse_cvtps2pi(&p->func, mmx1, temp);
      mmx_packssdw(&p->func, mmx0, mmx1);
      mmx_packuswb(&p->func, mmx0, mmx0);
      mmx_movd(&p->func, dest, mmx0);
   }
}
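
/* Both paths implement float -> unsigned byte with saturation: convert
 * the four scaled floats to dwords, pack dwords to signed words, then
 * words to unsigned bytes, and store the resulting 4 bytes.  The MMX
 * fallback converts two floats at a time via cvtps2pi, which is why the
 * generated code has to finish with emms (see need_emms below).
 */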

static GLint get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}
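
/* Byte offset of b relative to a - in effect a relative offsetof(), so
 * the generated code can address a struct member off whatever register
 * holds the base pointer, e.g.:
 *
 *    x86_make_disp(vtxESI, get_offset(vtx, &a->inputptr))
 *
 * addresses a->inputptr relative to the register holding vtx.
 */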

/* Not much happens here.  Eventually use this function to try and
 * avoid saving/reloading the source pointers each vertex (if some of
 * them can fit in registers).
 */
static void get_src_ptr( struct x86_program *p,
			 struct x86_reg srcREG,
			 struct x86_reg vtxREG,
			 struct tnl_clipspace_attr *a )
{
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
   struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr));

   /* Load current a[j].inputptr
    */
   x86_mov(&p->func, srcREG, ptr_to_src);
}

static void update_src_ptr( struct x86_program *p,
			 struct x86_reg srcREG,
			 struct x86_reg vtxREG,
			 struct tnl_clipspace_attr *a )
{
   if (a->inputstride) {
      struct tnl_clipspace *vtx = GET_VERTEX_STATE(p->ctx);
      struct x86_reg ptr_to_src = x86_make_disp(vtxREG, get_offset(vtx, &a->inputptr));

      /* add a[j].inputstride (hardcoded value - could just as easily
       * pull the stride value from memory each time).
       */
      x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));

      /* save new value of a[j].inputptr
       */
      x86_mov(&p->func, ptr_to_src, srcREG);
   }
}
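
/* lea adds the immediate displacement without touching the flags
 * register, which likely motivated its use as the pointer increment.
 * Since the stride is baked into the generated code, a change of stride
 * requires regenerating the function.
 */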


/* Lots of hardcoding
 *
 * EAX -- pointer to current output vertex
 * ECX -- pointer to current attribute
 *
 */
static GLboolean build_vertex_emit( struct x86_program *p )
{
   struct gl_context *ctx = p->ctx;
   TNLcontext *tnl = TNL_CONTEXT(ctx);
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
   GLuint j = 0;

   struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
   struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
   struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
   struct x86_reg vtxESI = x86_make_reg(file_REG32, reg_SI);
   struct x86_reg temp = x86_make_reg(file_XMM, 0);
   struct x86_reg vp0 = x86_make_reg(file_XMM, 1);
   struct x86_reg vp1 = x86_make_reg(file_XMM, 2);
   struct x86_reg temp2 = x86_make_reg(file_XMM, 3);
   GLubyte *fixup, *label;
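
   /* The generated code is called as a tnl_emit_func (see the cast near
    * the end of this function), so x86_fn_arg(.., 1) should be ctx,
    * arg 2 the vertex count, and arg 3 the output vertex buffer.
    */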

   /* Push a few regs?
    */
   x86_push(&p->func, countEBP);
   x86_push(&p->func, vtxESI);


   /* Get vertex count, compare to zero
    */
   x86_xor(&p->func, srcECX, srcECX);
   x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
   x86_cmp(&p->func, countEBP, srcECX);
   fixup = x86_jcc_forward(&p->func, cc_E);

   /* Initialize destination register.
    */
   x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));

   /* Dereference ctx to get tnl, then vtx:
    */
   x86_mov(&p->func, vtxESI, x86_fn_arg(&p->func, 1));
   x86_mov(&p->func, vtxESI, x86_make_disp(vtxESI, get_offset(ctx, &ctx->swtnl_context)));
   vtxESI = x86_make_disp(vtxESI, get_offset(tnl, &tnl->clipspace));


   /* Possibly load vp0, vp1 for viewport calcs:
    */
   if (vtx->need_viewport) {
      sse_movups(&p->func, vp0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_scale[0])));
      sse_movups(&p->func, vp1, x86_make_disp(vtxESI, get_offset(vtx, &vtx->vp_xlate[0])));
   }

   /* always load, needed or not:
    */
   sse_movups(&p->func, p->chan0, x86_make_disp(vtxESI, get_offset(vtx, &vtx->chan_scale[0])));
   sse_movups(&p->func, p->identity, x86_make_disp(vtxESI, get_offset(vtx, &vtx->identity[0])));

   /* Note address for loop jump */
   label = x86_get_label(&p->func);

   /* Emit code for each of the attributes.  Currently routes
    * everything through SSE registers, even when it might be more
    * efficient to stick with regular old x86.  No optimization or
    * other tricks - enough new ground to cover here just getting
    * things working.
    */
   while (j < vtx->attr_count) {
      struct tnl_clipspace_attr *a = &vtx->attr[j];
      struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);

      /* Now, load an XMM reg from src, perhaps transform, then save.
       * Could be shortcircuited in specific cases:
       */
      switch (a->format) {
      case EMIT_1F:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
	 emit_store(p, dest, 1, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_2F:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
	 emit_store(p, dest, 2, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_3F:
	 /* Potentially the worst case - hardcode 2+1 copying:
	  */
	 if (0) {
	    /* Disabled: the hardcoded 2+1 path below is used instead. */
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
	    emit_store(p, dest, 3, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 else {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
	    emit_store(p, dest, 2, temp);
	    if (a->inputsize > 2) {
	       emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
	       emit_store(p, x86_make_disp(dest,8), 1, temp);
	    }
	    else {
	       sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
	    }
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 break;
      case EMIT_4F:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 emit_store(p, dest, 4, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_2F_VIEWPORT:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
	 sse_mulps(&p->func, temp, vp0);
	 sse_addps(&p->func, temp, vp1);
	 emit_store(p, dest, 2, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_3F_VIEWPORT:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
	 sse_mulps(&p->func, temp, vp0);
	 sse_addps(&p->func, temp, vp1);
	 emit_store(p, dest, 3, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4F_VIEWPORT:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 sse_mulps(&p->func, temp, vp0);
	 sse_addps(&p->func, temp, vp1);
	 emit_store(p, dest, 4, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_3F_XYW:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
	 emit_store(p, dest, 3, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;

      case EMIT_1UB_1F:
	 /* Test for PAD3 + 1UB:
	  */
	 if (j > 0 &&
	     a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
	 {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
	    sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
	    emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 else {
	    printf("Can't emit 1ub %x %x %d\n", a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
	    return GL_FALSE;
	 }
	 break;
      case EMIT_3UB_3F_RGB:
      case EMIT_3UB_3F_BGR:
	 /* Test for 3UB + PAD1:
	  */
	 if (j == vtx->attr_count - 1 ||
	     a[1].vertoffset >= a->vertoffset + 4) {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
	    if (a->format == EMIT_3UB_3F_BGR)
	       sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
	    emit_pack_store_4ub(p, dest, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	 }
	 /* Test for 3UB + 1UB:
	  */
	 else if (j < vtx->attr_count - 1 &&
		  a[1].format == EMIT_1UB_1F &&
		  a[1].vertoffset == a->vertoffset + 3) {
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
	    update_src_ptr(p, srcECX, vtxESI, a);

	    /* Make room for incoming value:
	     */
	    sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));

	    get_src_ptr(p, srcECX, vtxESI, &a[1]);
	    emit_load(p, temp2, 1, x86_deref(srcECX), a[1].inputsize);
	    sse_movss(&p->func, temp, temp2);
	    update_src_ptr(p, srcECX, vtxESI, &a[1]);

	    /* Rearrange and possibly do BGR conversion:
	     */
	    if (a->format == EMIT_3UB_3F_BGR)
	       sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
	    else
	       sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));

	    emit_pack_store_4ub(p, dest, temp);
	    j++;		/* NOTE: two attrs consumed */
	 }
	 else {
	    printf("Can't emit 3ub\n");
	    return GL_FALSE;	/* add this later */
	 }
	 break;

      case EMIT_4UB_4F_RGBA:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4UB_4F_BGRA:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4UB_4F_ARGB:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4UB_4F_ABGR:
	 get_src_ptr(p, srcECX, vtxESI, a);
	 emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	 sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
	 emit_pack_store_4ub(p, dest, temp);
	 update_src_ptr(p, srcECX, vtxESI, a);
	 break;
      case EMIT_4CHAN_4F_RGBA:
	 switch (CHAN_TYPE) {
	 case GL_UNSIGNED_BYTE:
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	    emit_pack_store_4ub(p, dest, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	    break;
	 case GL_FLOAT:
	    get_src_ptr(p, srcECX, vtxESI, a);
	    emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
	    emit_store(p, dest, 4, temp);
	    update_src_ptr(p, srcECX, vtxESI, a);
	    break;
	 case GL_UNSIGNED_SHORT:
	 default:
	    printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
	    return GL_FALSE;
	 }
	 break;
      default:
	 printf("unknown a[%d].format %d\n", j, a->format);
	 return GL_FALSE;	/* catch any new opcodes */
      }

      /* Increment j by at least 1 - may have been incremented above also:
       */
      j++;
   }

   /* Next vertex:
    */
   x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vtx->vertex_size));

   /* decr count, loop if not zero
    */
   x86_dec(&p->func, countEBP);
   x86_test(&p->func, countEBP, countEBP);
   x86_jcc(&p->func, cc_NZ, label);
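
   /* Note: this is a do-while loop.  The count == 0 case was filtered
    * out by the forward jump emitted before the loop body, so at least
    * one iteration is guaranteed here.
    */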

   /* Exit mmx state?
    */
   if (p->func.need_emms)
      mmx_emms(&p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(&p->func, fixup);

   /* Pop regs and return
    */
   x86_pop(&p->func, x86_get_base_reg(vtxESI));
   x86_pop(&p->func, countEBP);
   x86_ret(&p->func);

   assert(!vtx->emit);
   vtx->emit = (tnl_emit_func)x86_get_func(&p->func);

   assert( (char *) p->func.csr - (char *) p->func.store <= MAX_SSE_CODE_SIZE );
   return GL_TRUE;
}



void _tnl_generate_sse_emit( struct gl_context *ctx )
{
   struct tnl_clipspace *vtx = GET_VERTEX_STATE(ctx);
   struct x86_program p;

   if (!cpu_has_xmm) {
      vtx->codegen_emit = NULL;
      return;
   }
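
   /* cpu_has_xmm / cpu_has_xmm2 come from the runtime CPUID detection
    * exposed by common_x86_asm.h: plain SSE is required up front, while
    * SSE2 merely selects the faster packing path in
    * emit_pack_store_4ub().
    */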

   memset(&p, 0, sizeof(p));

   p.ctx = ctx;
   p.inputs_safe = 0;		/* for now */
   p.outputs_safe = 0;		/* for now */
   p.have_sse2 = cpu_has_xmm2;
   p.identity = x86_make_reg(file_XMM, 6);
   p.chan0 = x86_make_reg(file_XMM, 7);
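
   /* XMM6/XMM7 are deliberately outside the temp/vp0/vp1/temp2 set
    * (XMM0-XMM3) used by build_vertex_emit(), so the two constants
    * survive the whole generated loop.
    */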

   if (!x86_init_func_size(&p.func, MAX_SSE_CODE_SIZE)) {
      vtx->emit = NULL;
      return;
   }

   if (build_vertex_emit(&p)) {
      _tnl_register_fastpath( vtx, GL_TRUE );
   }
   else {
      /* Note the failure so that we don't keep trying to codegen an
       * impossible state:
       */
      _tnl_register_fastpath( vtx, GL_FALSE );
      x86_release_func(&p.func);
   }
}

#else

void _tnl_generate_sse_emit( struct gl_context *ctx )
{
   /* Dummy version for when USE_SSE_ASM not defined */
}

#endif