/*
 * Copyright 2003 Tungsten Graphics, inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * on the rights to use, copy, modify, merge, publish, distribute, sub
 * license, and/or sell copies of the Software, and to permit persons to whom
 * the Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.  IN NO EVENT SHALL
 * TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 * Authors:
 *    Keith Whitwell
 */


#include "pipe/p_config.h"
#include "pipe/p_compiler.h"
#include "util/u_memory.h"
#include "util/u_math.h"
#include "util/u_format.h"

#include "translate.h"


#if (defined(PIPE_ARCH_X86) || (defined(PIPE_ARCH_X86_64) && !defined(__MINGW32__))) && !defined(PIPE_SUBSYSTEM_EMBEDDED)

#include "rtasm/rtasm_cpu.h"
#include "rtasm/rtasm_x86sse.h"


#define X    0
#define Y    1
#define Z    2
#define W    3


struct translate_buffer {
   const void *base_ptr;
   uintptr_t stride;
   unsigned max_index;
};

struct translate_buffer_variant {
   unsigned buffer_index;
   unsigned instance_divisor;
   void *ptr;                    /* updated either per vertex or per instance */
};


#define ELEMENT_BUFFER_INSTANCE_ID  1001

#define NUM_CONSTS 7

enum
{
   CONST_IDENTITY,
   CONST_INV_127,
   CONST_INV_255,
   CONST_INV_32767,
   CONST_INV_65535,
   CONST_INV_2147483647,
   CONST_255
};

#define C(v) {(float)(v), (float)(v), (float)(v), (float)(v)}
static float consts[NUM_CONSTS][4] = {
      {0, 0, 0, 1},
      C(1.0 / 127.0),
      C(1.0 / 255.0),
      C(1.0 / 32767.0),
      C(1.0 / 65535.0),
      C(1.0 / 2147483647.0),
      C(255.0)
};
#undef C
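
/*
 * The table above holds every XMM constant the generated code needs:
 * CONST_IDENTITY (0,0,0,1) pads missing channels, the CONST_INV_*
 * entries rescale normalized integers to floats (e.g. a unorm8 value
 * is multiplied by 1/255), and CONST_255 converts floats back to
 * unorm8. get_const() below keeps them cached in XMM registers.
 */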

struct translate_sse {
   struct translate translate;

   struct x86_function linear_func;
   struct x86_function elt_func;
   struct x86_function elt16_func;
   struct x86_function elt8_func;
   struct x86_function *func;

   PIPE_ALIGN_VAR(16) float consts[NUM_CONSTS][4];
   int8_t reg_to_const[16];
   int8_t const_to_reg[NUM_CONSTS];

   struct translate_buffer buffer[PIPE_MAX_ATTRIBS];
   unsigned nr_buffers;

   /* Multiple buffer variants can map to a single buffer. */
   struct translate_buffer_variant buffer_variant[PIPE_MAX_ATTRIBS];
   unsigned nr_buffer_variants;

   /* Multiple elements can map to a single buffer variant. */
   unsigned element_to_buffer_variant[PIPE_MAX_ATTRIBS];

   boolean use_instancing;
   unsigned instance_id;
   unsigned start_instance;

   /* these are actually known values, but putting them in a struct
    * like this is helpful to keep them in sync across the file.
    */
   struct x86_reg tmp_EAX;
   struct x86_reg tmp2_EDX;
   struct x86_reg src_ECX;
   struct x86_reg idx_ESI;     /* either start+i or &elt[i] */
   struct x86_reg machine_EDI;
   struct x86_reg outbuf_EBX;
   struct x86_reg count_EBP;    /* decrements to zero */
};
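
/*
 * At run time EDI (RDI on x86-64) holds a pointer to this struct, so
 * all runtime state -- buffer pointers, strides, the cached constants
 * above -- is addressed as an EDI-relative displacement computed at
 * code-generation time by get_offset() below.
 */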

static int get_offset( const void *a, const void *b )
{
   return (const char *)b - (const char *)a;
}

static struct x86_reg get_const( struct translate_sse *p, unsigned id)
{
   struct x86_reg reg;
   unsigned i;

   if(p->const_to_reg[id] >= 0)
      return x86_make_reg(file_XMM, p->const_to_reg[id]);

   for(i = 2; i < 8; ++i)
   {
      if(p->reg_to_const[i] < 0)
         break;
   }

   /* TODO: be smarter here */
   if(i == 8)
      --i;

   reg = x86_make_reg(file_XMM, i);

   if(p->reg_to_const[i] >= 0)
      p->const_to_reg[p->reg_to_const[i]] = -1;

   p->reg_to_const[i] = id;
   p->const_to_reg[id] = i;

   /* TODO: this should happen outside the loop, if possible */
   sse_movaps(p->func, reg,
         x86_make_disp(p->machine_EDI,
               get_offset(p, &p->consts[id][0])));

   return reg;
}
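
/*
 * Constants are cached in XMM2-XMM7 (XMM0/XMM1 stay free as scratch
 * for the conversion code). A miss takes the first unused register or
 * evicts whatever sits in XMM7; e.g. the first get_const(p,
 * CONST_INV_255) emits one movaps into XMM2 and later uses of that
 * constant are register-direct.
 */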

/* load the data in a SSE2 register, padding with zeros */
static boolean emit_load_sse2( struct translate_sse *p,
                               struct x86_reg data,
                               struct x86_reg src,
                               unsigned size)
{
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
   struct x86_reg tmp = p->tmp_EAX;
   switch(size)
   {
   case 1:
      x86_movzx8(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 2:
      x86_movzx16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 3:
      x86_movzx8(p->func, tmp, x86_make_disp(src, 2));
      x86_shl_imm(p->func, tmp, 16);
      x86_mov16(p->func, tmp, src);
      sse2_movd(p->func, data, tmp);
      break;
   case 4:
      sse2_movd(p->func, data, src);
      break;
   case 6:
      sse2_movd(p->func, data, src);
      x86_movzx16(p->func, tmp, x86_make_disp(src, 4));
      sse2_movd(p->func, tmpXMM, tmp);
      sse2_punpckldq(p->func, data, tmpXMM);
      break;
   case 8:
      sse2_movq(p->func, data, src);
      break;
   case 12:
      sse2_movq(p->func, data, src);
      sse2_movd(p->func, tmpXMM, x86_make_disp(src, 8));
      sse2_punpcklqdq(p->func, data, tmpXMM);
      break;
   case 16:
      sse2_movdqu(p->func, data, src);
      break;
   default:
      return FALSE;
   }
   return TRUE;
}
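
/*
 * Odd sizes are assembled in a GPR so the load never reads past the
 * end of the attribute: for size == 3 the code above builds
 * EAX = src[2] << 16 | src[1] << 8 | src[0] (movzx8 of the top byte,
 * shift left 16, then a 16-bit load of the low bytes) and movd's the
 * result into the XMM register.
 */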
217
 
218
/* this value can be passed for the out_chans argument */
219
#define CHANNELS_0001 5
220
 
221
/* this function will load #chans float values, and will
222
 * pad the register with zeroes at least up to out_chans.
223
 *
224
 * If out_chans is set to CHANNELS_0001, then the fourth
225
 * value will be padded with 1. Only pass this value if
226
 * chans < 4 or results are undefined.
227
 */
228
static void emit_load_float32( struct translate_sse *p,
229
                                       struct x86_reg data,
230
                                       struct x86_reg arg0,
231
                                       unsigned out_chans,
232
                                       unsigned chans)
233
{
234
   switch(chans)
235
   {
236
   case 1:
237
      /* a 0 0 0
238
       * a 0 0 1
239
       */
240
      sse_movss(p->func, data, arg0);
241
      if(out_chans == CHANNELS_0001)
242
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
243
      break;
244
   case 2:
245
      /* 0 0 0 1
246
       * a b 0 1
247
       */
248
      if(out_chans == CHANNELS_0001)
249
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
250
      else if(out_chans > 2)
251
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
252
      sse_movlps(p->func, data, arg0);
253
      break;
254
   case 3:
255
      /* Have to jump through some hoops:
256
       *
257
       * c 0 0 0
258
       * c 0 0 1 if out_chans == CHANNELS_0001
259
       * 0 0 c 0/1
260
       * a b c 0/1
261
       */
262
      sse_movss(p->func, data, x86_make_disp(arg0, 8));
263
      if(out_chans == CHANNELS_0001)
264
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X,Y,Z,W) );
265
      sse_shufps(p->func, data, data, SHUF(Y,Z,X,W) );
266
      sse_movlps(p->func, data, arg0);
267
      break;
268
   case 4:
269
      sse_movups(p->func, data, arg0);
270
      break;
271
   }
272
}
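
/*
 * Example: emit_load_float32(p, dataXMM, src, CHANNELS_0001, 3) leaves
 * dataXMM = (a, b, c, 1), exactly what expanding a three-channel
 * position into a four-channel output requires.
 */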
273
 
274
/* this function behaves like emit_load_float32, but loads
275
   64-bit floating point numbers, converting them to 32-bit
276
  ones */
277
static void emit_load_float64to32( struct translate_sse *p,
278
                                       struct x86_reg data,
279
                                       struct x86_reg arg0,
280
                                       unsigned out_chans,
281
                                       unsigned chans)
282
{
283
   struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
284
   switch(chans)
285
   {
286
   case 1:
287
      sse2_movsd(p->func, data, arg0);
288
      if(out_chans > 1)
289
         sse2_cvtpd2ps(p->func, data, data);
290
      else
291
         sse2_cvtsd2ss(p->func, data, data);
292
      if(out_chans == CHANNELS_0001)
293
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W)  );
294
      break;
295
   case 2:
296
      sse2_movupd(p->func, data, arg0);
297
      sse2_cvtpd2ps(p->func, data, data);
298
      if(out_chans == CHANNELS_0001)
299
         sse_shufps(p->func, data, get_const(p, CONST_IDENTITY), SHUF(X, Y, Z, W) );
300
      else if(out_chans > 2)
301
         sse_movlhps(p->func, data, get_const(p, CONST_IDENTITY) );
302
       break;
303
   case 3:
304
      sse2_movupd(p->func, data, arg0);
305
      sse2_cvtpd2ps(p->func, data, data);
306
      sse2_movsd(p->func, tmpXMM, x86_make_disp(arg0, 16));
307
      if(out_chans > 3)
308
         sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
309
      else
310
         sse2_cvtsd2ss(p->func, tmpXMM, tmpXMM);
311
      sse_movlhps(p->func, data, tmpXMM);
312
      if(out_chans == CHANNELS_0001)
313
         sse_orps(p->func, data, get_const(p, CONST_IDENTITY) );
314
      break;
315
   case 4:
316
      sse2_movupd(p->func, data, arg0);
317
      sse2_cvtpd2ps(p->func, data, data);
318
      sse2_movupd(p->func, tmpXMM, x86_make_disp(arg0, 16));
319
      sse2_cvtpd2ps(p->func, tmpXMM, tmpXMM);
320
      sse_movlhps(p->func, data, tmpXMM);
321
      break;
322
   }
323
}
324
 
325
static void emit_mov64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src_gpr,  struct x86_reg src_xmm)
326
{
327
   if(x86_target(p->func) != X86_32)
328
      x64_mov64(p->func, dst_gpr, src_gpr);
329
   else
330
   {
331
      /* TODO: when/on which CPUs is SSE2 actually better than SSE? */
332
      if(x86_target_caps(p->func) & X86_SSE2)
333
         sse2_movq(p->func, dst_xmm, src_xmm);
334
      else
335
         sse_movlps(p->func, dst_xmm, src_xmm);
336
   }
337
}
338
 
339
static void emit_load64(struct translate_sse *p, struct x86_reg dst_gpr, struct x86_reg dst_xmm, struct x86_reg src)
340
{
341
   emit_mov64(p, dst_gpr, dst_xmm, src, src);
342
}
343
 
344
static void emit_store64(struct translate_sse *p, struct x86_reg dst, struct x86_reg src_gpr, struct x86_reg src_xmm)
345
{
346
   emit_mov64(p, dst, dst, src_gpr, src_xmm);
347
}
348
 
349
static void emit_mov128(struct translate_sse *p, struct x86_reg dst, struct x86_reg src)
350
{
351
   if(x86_target_caps(p->func) & X86_SSE2)
352
      sse2_movdqu(p->func, dst, src);
353
   else
354
      sse_movups(p->func, dst, src);
355
}
356
 
357
/* TODO: this uses unaligned accesses liberally, which is great on Nehalem,
358
 * but may or may not be good on older processors
359
 * TODO: may perhaps want to use non-temporal stores here if possible
360
 */
361
static void emit_memcpy(struct translate_sse *p, struct x86_reg dst, struct x86_reg src, unsigned size)
362
{
363
   struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
364
   struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
365
   struct x86_reg dataGPR = p->tmp_EAX;
366
   struct x86_reg dataGPR2 = p->tmp2_EDX;
367
 
368
   if(size < 8)
369
   {
370
      switch (size)
371
      {
372
      case 1:
373
         x86_mov8(p->func, dataGPR, src);
374
         x86_mov8(p->func, dst, dataGPR);
375
         break;
376
      case 2:
377
         x86_mov16(p->func, dataGPR, src);
378
         x86_mov16(p->func, dst, dataGPR);
379
         break;
380
      case 3:
381
         x86_mov16(p->func, dataGPR, src);
382
         x86_mov8(p->func, dataGPR2, x86_make_disp(src, 2));
383
         x86_mov16(p->func, dst, dataGPR);
384
         x86_mov8(p->func, x86_make_disp(dst, 2), dataGPR2);
385
         break;
386
      case 4:
387
         x86_mov(p->func, dataGPR, src);
388
         x86_mov(p->func, dst, dataGPR);
389
         break;
390
      case 6:
391
         x86_mov(p->func, dataGPR, src);
392
         x86_mov16(p->func, dataGPR2, x86_make_disp(src, 4));
393
         x86_mov(p->func, dst, dataGPR);
394
         x86_mov16(p->func, x86_make_disp(dst, 4), dataGPR2);
395
         break;
396
      }
397
   }
398
   else if(!(x86_target_caps(p->func) & X86_SSE))
399
   {
400
      unsigned i = 0;
401
      assert((size & 3) == 0);
402
      for(i = 0; i < size; i += 4)
403
      {
404
         x86_mov(p->func, dataGPR, x86_make_disp(src, i));
405
         x86_mov(p->func, x86_make_disp(dst, i), dataGPR);
406
      }
407
   }
408
   else
409
   {
410
      switch(size)
411
      {
412
      case 8:
413
         emit_load64(p, dataGPR, dataXMM, src);
414
         emit_store64(p, dst, dataGPR, dataXMM);
415
         break;
416
      case 12:
417
         emit_load64(p, dataGPR2, dataXMM, src);
418
         x86_mov(p->func, dataGPR, x86_make_disp(src, 8));
419
         emit_store64(p, dst, dataGPR2, dataXMM);
420
         x86_mov(p->func, x86_make_disp(dst, 8), dataGPR);
421
         break;
422
      case 16:
423
         emit_mov128(p, dataXMM, src);
424
         emit_mov128(p, dst, dataXMM);
425
         break;
426
      case 24:
427
         emit_mov128(p, dataXMM, src);
428
         emit_load64(p, dataGPR, dataXMM2, x86_make_disp(src, 16));
429
         emit_mov128(p, dst, dataXMM);
430
         emit_store64(p, x86_make_disp(dst, 16), dataGPR, dataXMM2);
431
         break;
432
      case 32:
433
         emit_mov128(p, dataXMM, src);
434
         emit_mov128(p, dataXMM2, x86_make_disp(src, 16));
435
         emit_mov128(p, dst, dataXMM);
436
         emit_mov128(p, x86_make_disp(dst, 16), dataXMM2);
437
         break;
438
      default:
439
         assert(0);
440
      }
441
   }
442
}
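
/*
 * The copy size here is always a compile-time constant derived from
 * the vertex format, so emit_memcpy expands to a fixed move sequence:
 * GPR moves below 8 bytes, a plain 4-byte load/store loop when SSE is
 * unavailable, and 64/128-bit SSE moves otherwise.
 */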

static boolean translate_attr_convert( struct translate_sse *p,
                               const struct translate_element *a,
                               struct x86_reg src,
                               struct x86_reg dst)
{
   const struct util_format_description* input_desc = util_format_description(a->input_format);
   const struct util_format_description* output_desc = util_format_description(a->output_format);
   unsigned i;
   boolean id_swizzle = TRUE;
   unsigned swizzle[4] = {UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE, UTIL_FORMAT_SWIZZLE_NONE};
   unsigned needed_chans = 0;
   unsigned imms[2] = {0, 0x3f800000};

   if(a->output_format == PIPE_FORMAT_NONE || a->input_format == PIPE_FORMAT_NONE)
      return FALSE;

   if(input_desc->channel[0].size & 7)
      return FALSE;

   if(input_desc->colorspace != output_desc->colorspace)
      return FALSE;

   for(i = 1; i < input_desc->nr_channels; ++i)
   {
      if(memcmp(&input_desc->channel[i], &input_desc->channel[0], sizeof(input_desc->channel[0])))
         return FALSE;
   }

   for(i = 1; i < output_desc->nr_channels; ++i)
   {
      if(memcmp(&output_desc->channel[i], &output_desc->channel[0], sizeof(output_desc->channel[0])))
         return FALSE;
   }

   for(i = 0; i < output_desc->nr_channels; ++i)
   {
      if(output_desc->swizzle[i] < 4)
         swizzle[output_desc->swizzle[i]] = input_desc->swizzle[i];
   }

   if((x86_target_caps(p->func) & X86_SSE) && (0
         || a->output_format == PIPE_FORMAT_R32_FLOAT
         || a->output_format == PIPE_FORMAT_R32G32_FLOAT
         || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
         || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT))
   {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
            swizzle[i] = i;
      }

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         if(swizzle[i] < 4)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = FALSE;
      }

      if(needed_chans > 0)
      {
         switch(input_desc->channel[0].type)
         {
         case UTIL_FORMAT_TYPE_UNSIGNED:
            if(!(x86_target_caps(p->func) & X86_SSE2))
               return FALSE;
            emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);

            /* TODO: add support for SSE4.1 pmovzx */
            switch(input_desc->channel[0].size)
            {
            case 8:
               /* TODO: this may be inefficient due to the CONST_IDENTITY constant being used both as a float and an integer register */
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               break;
            case 16:
               sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
               break;
            case 32: /* we lose precision here */
               sse2_psrld_imm(p->func, dataXMM, 1);
               break;
            default:
               return FALSE;
            }
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
            if(input_desc->channel[0].normalized)
            {
               struct x86_reg factor;
               switch(input_desc->channel[0].size)
               {
               case 8:
                  factor = get_const(p, CONST_INV_255);
                  break;
               case 16:
                  factor = get_const(p, CONST_INV_65535);
                  break;
               case 32:
                  factor = get_const(p, CONST_INV_2147483647);
                  break;
               default:
                  assert(0);
                  factor.disp = 0;
                  factor.file = 0;
                  factor.idx = 0;
                  factor.mod = 0;
                  break;
               }
               sse_mulps(p->func, dataXMM, factor);
            }
            else if(input_desc->channel[0].size == 32)
               sse_addps(p->func, dataXMM, dataXMM); /* compensate for the bit we threw away to fit u32 into s32 */
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if(!(x86_target_caps(p->func) & X86_SSE2))
               return FALSE;
            emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);

            /* TODO: add support for SSE4.1 pmovsx */
            switch(input_desc->channel[0].size)
            {
            case 8:
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_psrad_imm(p->func, dataXMM, 24);
               break;
            case 16:
               sse2_punpcklwd(p->func, dataXMM, dataXMM);
               sse2_psrad_imm(p->func, dataXMM, 16);
               break;
            case 32: /* we lose precision here */
               break;
            default:
               return FALSE;
            }
            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
            if(input_desc->channel[0].normalized)
            {
               struct x86_reg factor;
               switch(input_desc->channel[0].size)
               {
               case 8:
                  factor = get_const(p, CONST_INV_127);
                  break;
               case 16:
                  factor = get_const(p, CONST_INV_32767);
                  break;
               case 32:
                  factor = get_const(p, CONST_INV_2147483647);
                  break;
               default:
                  assert(0);
                  factor.disp = 0;
                  factor.file = 0;
                  factor.idx = 0;
                  factor.mod = 0;
                  break;
               }
               sse_mulps(p->func, dataXMM, factor);
607
            }
608
            break;
609
 
610
            break;
611
         case UTIL_FORMAT_TYPE_FLOAT:
            if(input_desc->channel[0].size != 32 && input_desc->channel[0].size != 64)
               return FALSE;
            if(swizzle[3] == UTIL_FORMAT_SWIZZLE_1 && input_desc->nr_channels <= 3)
            {
               swizzle[3] = UTIL_FORMAT_SWIZZLE_W;
               needed_chans = CHANNELS_0001;
            }
            switch(input_desc->channel[0].size)
            {
            case 32:
               emit_load_float32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
               break;
            case 64: /* we lose precision here */
               if(!(x86_target_caps(p->func) & X86_SSE2))
                  return FALSE;
               emit_load_float64to32(p, dataXMM, src, needed_chans, input_desc->nr_channels);
               break;
            default:
               return FALSE;
            }
            break;
         default:
            return FALSE;
         }

         if(!id_swizzle)
            sse_shufps(p->func, dataXMM, dataXMM, SHUF(swizzle[0], swizzle[1], swizzle[2], swizzle[3]) );
      }

      if(output_desc->nr_channels >= 4
            && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
            )
         sse_movups(p->func, dst, dataXMM);
      else
      {
         if(output_desc->nr_channels >= 2
               && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
               && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
            sse_movlps(p->func, dst, dataXMM);
         else
         {
            if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
               sse_movss(p->func, dst, dataXMM);
            else
               x86_mov_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);

            if(output_desc->nr_channels >= 2)
            {
               if(swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
               {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(1, 1, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else
                  x86_mov_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
            }
         }

         if(output_desc->nr_channels >= 3)
         {
            if(output_desc->nr_channels >= 4
                  && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
                  && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
               sse_movhps(p->func, x86_make_disp(dst, 8), dataXMM);
            else
            {
               if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
               {
                  sse_shufps(p->func, dataXMM, dataXMM, SHUF(2, 2, 2, 3));
                  sse_movss(p->func, x86_make_disp(dst, 8), dataXMM);
               }
               else
                  x86_mov_imm(p->func, x86_make_disp(dst, 8), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);

               if(output_desc->nr_channels >= 4)
               {
                  if(swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
                  {
                     sse_shufps(p->func, dataXMM, dataXMM, SHUF(3, 3, 3, 3));
                     sse_movss(p->func, x86_make_disp(dst, 12), dataXMM);
                  }
                  else
                     x86_mov_imm(p->func, x86_make_disp(dst, 12), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
               }
            }
         }
      }
      return TRUE;
   }
   else if((x86_target_caps(p->func) & X86_SSE2) && input_desc->channel[0].size == 8 && output_desc->channel[0].size == 16
         && output_desc->channel[0].normalized == input_desc->channel[0].normalized
         && (0
               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED)
               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
               || (input_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED && output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
               ))
   {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
      struct x86_reg tmpXMM = x86_make_reg(file_XMM, 1);
      struct x86_reg tmp = p->tmp_EAX;
      unsigned imms[2] = {0, 1};

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0 && i >= input_desc->nr_channels)
            swizzle[i] = i;
      }

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         if(swizzle[i] < 4)
            needed_chans = MAX2(needed_chans, swizzle[i] + 1);
         if(swizzle[i] < UTIL_FORMAT_SWIZZLE_0 && swizzle[i] != i)
            id_swizzle = FALSE;
      }

      if(needed_chans > 0)
      {
         emit_load_sse2(p, dataXMM, src, input_desc->channel[0].size * input_desc->nr_channels >> 3);

         switch(input_desc->channel[0].type)
         {
         case UTIL_FORMAT_TYPE_UNSIGNED:
            if(input_desc->channel[0].normalized)
            {
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               if(output_desc->channel[0].type == UTIL_FORMAT_TYPE_SIGNED)
                  sse2_psrlw_imm(p->func, dataXMM, 1);
            }
            else
               sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
            break;
         case UTIL_FORMAT_TYPE_SIGNED:
            if(input_desc->channel[0].normalized)
            {
               sse2_movq(p->func, tmpXMM, get_const(p, CONST_IDENTITY));
               sse2_punpcklbw(p->func, tmpXMM, dataXMM);
               sse2_psllw_imm(p->func, dataXMM, 9);
               sse2_psrlw_imm(p->func, dataXMM, 8);
               sse2_por(p->func, tmpXMM, dataXMM);
               sse2_psrlw_imm(p->func, dataXMM, 7);
               sse2_por(p->func, tmpXMM, dataXMM);
               {
                  struct x86_reg t = dataXMM;
                  dataXMM = tmpXMM;
                  tmpXMM = t;
               }
            }
            else
            {
               sse2_punpcklbw(p->func, dataXMM, dataXMM);
               sse2_psraw_imm(p->func, dataXMM, 8);
            }
            break;
         default:
            assert(0);
         }

         if(output_desc->channel[0].normalized)
            imms[1] = (output_desc->channel[0].type == UTIL_FORMAT_TYPE_UNSIGNED) ? 0xffff : 0x7fff;

         if(!id_swizzle)
            sse2_pshuflw(p->func, dataXMM, dataXMM, (swizzle[0] & 3) | ((swizzle[1] & 3) << 2) | ((swizzle[2] & 3) << 4) | ((swizzle[3] & 3) << 6));
      }

      if(output_desc->nr_channels >= 4
            && swizzle[0] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[1] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[2] < UTIL_FORMAT_SWIZZLE_0
            && swizzle[3] < UTIL_FORMAT_SWIZZLE_0
            )
         sse2_movq(p->func, dst, dataXMM);
      else
      {
         if(swizzle[0] < UTIL_FORMAT_SWIZZLE_0)
         {
            if(output_desc->nr_channels >= 2 && swizzle[1] < UTIL_FORMAT_SWIZZLE_0)
               sse2_movd(p->func, dst, dataXMM);
            else
            {
               sse2_movd(p->func, tmp, dataXMM);
               x86_mov16(p->func, dst, tmp);
               if(output_desc->nr_channels >= 2)
                  x86_mov16_imm(p->func, x86_make_disp(dst, 2), imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0]);
            }
         }
         else
         {
            if(output_desc->nr_channels >= 2 && swizzle[1] >= UTIL_FORMAT_SWIZZLE_0)
               x86_mov_imm(p->func, dst, (imms[swizzle[1] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
            else
            {
               x86_mov16_imm(p->func, dst, imms[swizzle[0] - UTIL_FORMAT_SWIZZLE_0]);
               if(output_desc->nr_channels >= 2)
               {
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_shr_imm(p->func, tmp, 16);
                  x86_mov16(p->func, x86_make_disp(dst, 2), tmp);
               }
            }
         }

         if(output_desc->nr_channels >= 3)
         {
            if(swizzle[2] < UTIL_FORMAT_SWIZZLE_0)
            {
               if(output_desc->nr_channels >= 4 && swizzle[3] < UTIL_FORMAT_SWIZZLE_0)
               {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, x86_make_disp(dst, 4), dataXMM);
               }
               else
               {
                  sse2_psrlq_imm(p->func, dataXMM, 32);
                  sse2_movd(p->func, tmp, dataXMM);
                  x86_mov16(p->func, x86_make_disp(dst, 4), tmp);
                  if(output_desc->nr_channels >= 4)
                  {
                     x86_mov16_imm(p->func, x86_make_disp(dst, 6), imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0]);
                  }
               }
            }
            else
            {
               if(output_desc->nr_channels >= 4 && swizzle[3] >= UTIL_FORMAT_SWIZZLE_0)
                  x86_mov_imm(p->func, x86_make_disp(dst, 4), (imms[swizzle[3] - UTIL_FORMAT_SWIZZLE_0] << 16) | imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);
               else
               {
                  x86_mov16_imm(p->func, x86_make_disp(dst, 4), imms[swizzle[2] - UTIL_FORMAT_SWIZZLE_0]);

                  if(output_desc->nr_channels >= 4)
                  {
                     sse2_psrlq_imm(p->func, dataXMM, 48);
                     sse2_movd(p->func, tmp, dataXMM);
                     x86_mov16(p->func, x86_make_disp(dst, 6), tmp);
                  }
               }
            }
         }
      }
      return TRUE;
   }
   else if(!memcmp(&output_desc->channel[0], &input_desc->channel[0], sizeof(output_desc->channel[0])))
   {
      struct x86_reg tmp = p->tmp_EAX;
      unsigned i;
      if(input_desc->channel[0].size == 8 && input_desc->nr_channels == 4 && output_desc->nr_channels == 4
                     && swizzle[0] == UTIL_FORMAT_SWIZZLE_W
                     && swizzle[1] == UTIL_FORMAT_SWIZZLE_Z
                     && swizzle[2] == UTIL_FORMAT_SWIZZLE_Y
                     && swizzle[3] == UTIL_FORMAT_SWIZZLE_X)
      {
         /* TODO: support movbe */
         x86_mov(p->func, tmp, src);
         x86_bswap(p->func, tmp);
         x86_mov(p->func, dst, tmp);
         return TRUE;
      }

      for(i = 0; i < output_desc->nr_channels; ++i)
      {
         switch(output_desc->channel[0].size)
         {
         case 8:
            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
            {
               unsigned v = 0;
               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
               {
                  switch(output_desc->channel[0].type)
                  {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[0].normalized ? 0xff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[0].normalized ? 0x7f : 1;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov8_imm(p->func, x86_make_disp(dst, i * 1), v);
            }
            else
            {
               x86_mov8(p->func, tmp, x86_make_disp(src, swizzle[i] * 1));
               x86_mov8(p->func, x86_make_disp(dst, i * 1), tmp);
            }
            break;
         case 16:
            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
            {
               unsigned v = 0;
               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
               {
                  switch(output_desc->channel[1].type)
                  {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[1].normalized ? 0xffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[1].normalized ? 0x7fff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3c00;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), v);
            }
            else if(swizzle[i] == UTIL_FORMAT_SWIZZLE_0)
               x86_mov16_imm(p->func, x86_make_disp(dst, i * 2), 0);
            else
            {
               x86_mov16(p->func, tmp, x86_make_disp(src, swizzle[i] * 2));
               x86_mov16(p->func, x86_make_disp(dst, i * 2), tmp);
            }
            break;
         case 32:
            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
            {
               unsigned v = 0;
               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
               {
                  switch(output_desc->channel[1].type)
                  {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     v = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     v = output_desc->channel[1].normalized ? 0x7fffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     v = 0x3f800000;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 4), v);
            }
            else
            {
               x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 4));
               x86_mov(p->func, x86_make_disp(dst, i * 4), tmp);
            }
            break;
         case 64:
            if(swizzle[i] >= UTIL_FORMAT_SWIZZLE_0)
            {
               unsigned l = 0;
               unsigned h = 0;
               if(swizzle[i] == UTIL_FORMAT_SWIZZLE_1)
               {
                  switch(output_desc->channel[1].type)
                  {
                  case UTIL_FORMAT_TYPE_UNSIGNED:
                     h = output_desc->channel[1].normalized ? 0xffffffff : 0;
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_SIGNED:
                     h = output_desc->channel[1].normalized ? 0x7fffffff : 0;
                     l = output_desc->channel[1].normalized ? 0xffffffff : 1;
                     break;
                  case UTIL_FORMAT_TYPE_FLOAT:
                     h = 0x3ff00000;
                     l = 0;
                     break;
                  default:
                     return FALSE;
                  }
               }
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8), l);
               x86_mov_imm(p->func, x86_make_disp(dst, i * 8 + 4), h);
            }
            else
            {
               if(x86_target_caps(p->func) & X86_SSE)
               {
                  struct x86_reg tmpXMM = x86_make_reg(file_XMM, 0);
                  emit_load64(p, tmp, tmpXMM, x86_make_disp(src, swizzle[i] * 8));
                  emit_store64(p, x86_make_disp(dst, i * 8), tmp, tmpXMM);
               }
               else
               {
                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8));
                  x86_mov(p->func, x86_make_disp(dst, i * 8), tmp);
                  x86_mov(p->func, tmp, x86_make_disp(src, swizzle[i] * 8 + 4));
                  x86_mov(p->func, x86_make_disp(dst, i * 8 + 4), tmp);
               }
            }
            break;
         default:
            return FALSE;
         }
      }
      return TRUE;
   }
   /* special case for draw's EMIT_4UB (RGBA) and EMIT_4UB_BGRA */
   else if((x86_target_caps(p->func) & X86_SSE2) &&
         a->input_format == PIPE_FORMAT_R32G32B32A32_FLOAT && (0
               || a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM
               || a->output_format == PIPE_FORMAT_R8G8B8A8_UNORM
         ))
   {
      struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);

      /* load */
      sse_movups(p->func, dataXMM, src);

      if (a->output_format == PIPE_FORMAT_B8G8R8A8_UNORM)
         sse_shufps(p->func, dataXMM, dataXMM, SHUF(2,1,0,3));

      /* scale by 255.0 */
      sse_mulps(p->func, dataXMM, get_const(p, CONST_255));

      /* pack and emit */
      sse2_cvtps2dq(p->func, dataXMM, dataXMM);
      sse2_packssdw(p->func, dataXMM, dataXMM);
      sse2_packuswb(p->func, dataXMM, dataXMM);
      sse2_movd(p->func, dst, dataXMM);

      return TRUE;
   }

   return FALSE;
}
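
/*
 * Fast paths handled above: integer and 32/64-bit float inputs
 * converted to float32 outputs, 8-bit to 16-bit integer widening, pure
 * swizzles between identical channel layouts (with a bswap shortcut
 * for reversing an 8888 format), and the float32x4 -> unorm8x4 pack
 * used by draw's EMIT_4UB paths. Anything else returns FALSE, which
 * ultimately makes translate_sse2_create() fail so a non-SSE translate
 * implementation can be used instead.
 */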

static boolean translate_attr( struct translate_sse *p,
                               const struct translate_element *a,
                               struct x86_reg src,
                               struct x86_reg dst)
{
   if(a->input_format == a->output_format)
   {
      emit_memcpy(p, dst, src, util_format_get_stride(a->input_format, 1));
      return TRUE;
   }

   return translate_attr_convert(p, a, src, dst);
}
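
/*
 * util_format_get_stride(format, 1) is the size of a single element,
 * so e.g. an attribute that is PIPE_FORMAT_R32G32B32_FLOAT on both
 * sides becomes a straight 12-byte copy.
 */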

static boolean init_inputs( struct translate_sse *p,
                            unsigned index_size )
{
   unsigned i;
   struct x86_reg instance_id = x86_make_disp(p->machine_EDI,
                                              get_offset(p, &p->instance_id));
   struct x86_reg start_instance = x86_make_disp(p->machine_EDI,
                                                 get_offset(p, &p->start_instance));

   for (i = 0; i < p->nr_buffer_variants; i++) {
      struct translate_buffer_variant *variant = &p->buffer_variant[i];
      struct translate_buffer *buffer = &p->buffer[variant->buffer_index];

      if (!index_size || variant->instance_divisor) {
         struct x86_reg buf_max_index = x86_make_disp(p->machine_EDI,
                                                      get_offset(p, &buffer->max_index));
         struct x86_reg buf_stride   = x86_make_disp(p->machine_EDI,
                                                     get_offset(p, &buffer->stride));
         struct x86_reg buf_ptr      = x86_make_disp(p->machine_EDI,
                                                     get_offset(p, &variant->ptr));
         struct x86_reg buf_base_ptr = x86_make_disp(p->machine_EDI,
                                                     get_offset(p, &buffer->base_ptr));
         struct x86_reg elt = p->idx_ESI;
         struct x86_reg tmp_EAX = p->tmp_EAX;

         /* Calculate pointer to first attrib:
          *   base_ptr + stride * index, where index depends on instance divisor
          */
         if (variant->instance_divisor) {
            /* Start with instance = instance_id
             * which is true if divisor is 1.
             */
            x86_mov(p->func, tmp_EAX, instance_id);

            if (variant->instance_divisor != 1) {
               struct x86_reg tmp_EDX = p->tmp2_EDX;
               struct x86_reg tmp_ECX = p->src_ECX;

               /* instance_num = instance_id - start_instance */
               x86_mov(p->func, tmp_EDX, start_instance);
               x86_sub(p->func, tmp_EAX, tmp_EDX);

               /* TODO: Add x86_shr() to rtasm and use it whenever
                *       instance divisor is power of two.
                */
               x86_xor(p->func, tmp_EDX, tmp_EDX);
               x86_mov_reg_imm(p->func, tmp_ECX, variant->instance_divisor);
               x86_div(p->func, tmp_ECX);    /* EAX = EDX:EAX / ECX */

               /* instance = (instance_id - start_instance) / divisor +
                *             start_instance
                */
               x86_mov(p->func, tmp_EDX, start_instance);
               x86_add(p->func, tmp_EAX, tmp_EDX);
            }

            /* XXX we need to clamp the index here too, but to a
             * per-array max value, not the draw->pt.max_index value
             * that's being given to us via translate->set_buffer().
             */
         } else {
            x86_mov(p->func, tmp_EAX, elt);

            /* Clamp to max_index
             */
            x86_cmp(p->func, tmp_EAX, buf_max_index);
            x86_cmovcc(p->func, tmp_EAX, buf_max_index, cc_AE);
         }

         x86_imul(p->func, tmp_EAX, buf_stride);
         x64_rexw(p->func);
         x86_add(p->func, tmp_EAX, buf_base_ptr);

         x86_cmp(p->func, p->count_EBP, p->tmp_EAX);

         /* In the linear case, keep the buffer pointer instead of the
          * index number.
          */
         if (!index_size && p->nr_buffer_variants == 1)
         {
            x64_rexw(p->func);
            x86_mov(p->func, elt, tmp_EAX);
         }
         else
         {
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, tmp_EAX);
         }
      }
   }

   return TRUE;
}
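
/*
 * Worked example of the divisor math above: with instance_divisor = 4
 * and start_instance = 8, instance_ids 12 through 15 all yield
 * (id - 8) / 4 + 8 = 9, so those four instances fetch element 9 of the
 * instanced buffer.
 */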


static struct x86_reg get_buffer_ptr( struct translate_sse *p,
                                      unsigned index_size,
                                      unsigned var_idx,
                                      struct x86_reg elt )
{
   if (var_idx == ELEMENT_BUFFER_INSTANCE_ID) {
      return x86_make_disp(p->machine_EDI,
                           get_offset(p, &p->instance_id));
   }
   if (!index_size && p->nr_buffer_variants == 1) {
      return p->idx_ESI;
   }
   else if (!index_size || p->buffer_variant[var_idx].instance_divisor) {
      struct x86_reg ptr = p->src_ECX;
      struct x86_reg buf_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer_variant[var_idx].ptr));

      x64_rexw(p->func);
      x86_mov(p->func, ptr, buf_ptr);
      return ptr;
   }
   else {
      struct x86_reg ptr = p->src_ECX;
      const struct translate_buffer_variant *variant = &p->buffer_variant[var_idx];

      struct x86_reg buf_stride =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].stride));

      struct x86_reg buf_base_ptr =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].base_ptr));

      struct x86_reg buf_max_index =
         x86_make_disp(p->machine_EDI,
                       get_offset(p, &p->buffer[variant->buffer_index].max_index));

      /* Calculate pointer to current attrib:
       */
      switch(index_size)
      {
      case 1:
         x86_movzx8(p->func, ptr, elt);
         break;
      case 2:
         x86_movzx16(p->func, ptr, elt);
         break;
      case 4:
         x86_mov(p->func, ptr, elt);
         break;
      }

      /* Clamp to max_index
       */
      x86_cmp(p->func, ptr, buf_max_index);
      x86_cmovcc(p->func, ptr, buf_max_index, cc_AE);

      x86_imul(p->func, ptr, buf_stride);
      x64_rexw(p->func);
      x86_add(p->func, ptr, buf_base_ptr);
      return ptr;
   }
}
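
/*
 * Four cases, cheapest first: the instance-id pseudo buffer is just a
 * displacement off the machine pointer; a single linear buffer keeps
 * its running pointer directly in ESI; other linear or instanced
 * variants reload the per-variant pointer maintained by init_inputs()
 * and incr_inputs(); and indexed draws load the element, clamp it to
 * max_index, and compute base_ptr + index * stride for every vertex.
 */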


static boolean incr_inputs( struct translate_sse *p,
                            unsigned index_size )
{
   if (!index_size && p->nr_buffer_variants == 1) {
      struct x86_reg stride = x86_make_disp(p->machine_EDI,
                                            get_offset(p, &p->buffer[0].stride));

      if (p->buffer_variant[0].instance_divisor == 0) {
         x64_rexw(p->func);
         x86_add(p->func, p->idx_ESI, stride);
         sse_prefetchnta(p->func, x86_make_disp(p->idx_ESI, 192));
      }
   }
   else if (!index_size) {
      unsigned i;

      /* Is this worthwhile??
       */
      for (i = 0; i < p->nr_buffer_variants; i++) {
         struct translate_buffer_variant *variant = &p->buffer_variant[i];
         struct x86_reg buf_ptr = x86_make_disp(p->machine_EDI,
                                                get_offset(p, &variant->ptr));
         struct x86_reg buf_stride = x86_make_disp(p->machine_EDI,
                                                   get_offset(p, &p->buffer[variant->buffer_index].stride));

         if (variant->instance_divisor == 0) {
            x86_mov(p->func, p->tmp_EAX, buf_stride);
            x64_rexw(p->func);
            x86_add(p->func, p->tmp_EAX, buf_ptr);
            if (i == 0) sse_prefetchnta(p->func, x86_make_disp(p->tmp_EAX, 192));
            x64_rexw(p->func);
            x86_mov(p->func, buf_ptr, p->tmp_EAX);
         }
      }
   }
   else {
      x64_rexw(p->func);
      x86_lea(p->func, p->idx_ESI, x86_make_disp(p->idx_ESI, index_size));
   }

   return TRUE;
}
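
/*
 * Linear walks advance the source pointer(s) by the stride each vertex
 * and issue a non-temporal prefetch 192 bytes ahead; indexed walks
 * only step ESI by the index size and leave the address math to
 * get_buffer_ptr().
 */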


/* Build run( struct translate *machine,
 *            unsigned start,
 *            unsigned count,
 *            unsigned start_instance,
 *            unsigned instance_id,
 *            void *output_buffer )
 * or
 *  run_elts( struct translate *machine,
 *            unsigned *elts,
 *            unsigned count,
 *            unsigned start_instance,
 *            unsigned instance_id,
 *            void *output_buffer )
 *
 *  Lots of hardcoding
 *
 * EBX -- pointer to current output vertex
 * ECX -- pointer to current attribute (source)
 *
 */
static boolean build_vertex_emit( struct translate_sse *p,
                                  struct x86_function *func,
                                  unsigned index_size )
{
   int fixup, label;
   unsigned j;

   memset(p->reg_to_const, 0xff, sizeof(p->reg_to_const));
   memset(p->const_to_reg, 0xff, sizeof(p->const_to_reg));

   p->tmp_EAX       = x86_make_reg(file_REG32, reg_AX);
   p->idx_ESI       = x86_make_reg(file_REG32, reg_SI);
   p->outbuf_EBX    = x86_make_reg(file_REG32, reg_BX);
   p->machine_EDI   = x86_make_reg(file_REG32, reg_DI);
   p->count_EBP     = x86_make_reg(file_REG32, reg_BP);
   p->tmp2_EDX      = x86_make_reg(file_REG32, reg_DX);
   p->src_ECX       = x86_make_reg(file_REG32, reg_CX);

   p->func = func;

   x86_init_func(p->func);

   if(x86_target(p->func) == X86_64_WIN64_ABI)
   {
      /* the ABI guarantees a 16-byte aligned 32-byte "shadow space" above the return address */
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8), x86_make_reg(file_XMM, 6));
      sse2_movdqa(p->func, x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24), x86_make_reg(file_XMM, 7));
   }

   x86_push(p->func, p->outbuf_EBX);
   x86_push(p->func, p->count_EBP);

   /* on non-Win64 x86-64, these are already in the right registers */
   if(x86_target(p->func) != X86_64_STD_ABI)
   {
      x86_push(p->func, p->machine_EDI);
      x86_push(p->func, p->idx_ESI);

      x86_mov(p->func, p->machine_EDI, x86_fn_arg(p->func, 1));
      x86_mov(p->func, p->idx_ESI, x86_fn_arg(p->func, 2));
   }

   x86_mov(p->func, p->count_EBP, x86_fn_arg(p->func, 3));

   if(x86_target(p->func) != X86_32)
      x64_mov64(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));
   else
      x86_mov(p->func, p->outbuf_EBX, x86_fn_arg(p->func, 6));

   /* Load instance ID.
    */
   if (p->use_instancing) {
      x86_mov(p->func,
              p->tmp2_EDX,
              x86_fn_arg(p->func, 4));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->start_instance)),
              p->tmp2_EDX);

      x86_mov(p->func,
              p->tmp_EAX,
              x86_fn_arg(p->func, 5));
      x86_mov(p->func,
              x86_make_disp(p->machine_EDI, get_offset(p, &p->instance_id)),
              p->tmp_EAX);
   }

   /* Get vertex count, compare to zero
    */
   x86_xor(p->func, p->tmp_EAX, p->tmp_EAX);
   x86_cmp(p->func, p->count_EBP, p->tmp_EAX);
   fixup = x86_jcc_forward(p->func, cc_E);

   /* always load, needed or not:
    */
   init_inputs(p, index_size);

   /* Note address for loop jump
    */
   label = x86_get_label(p->func);
   {
      struct x86_reg elt = !index_size ? p->idx_ESI : x86_deref(p->idx_ESI);
      int last_variant = -1;
      struct x86_reg vb;

      for (j = 0; j < p->translate.key.nr_elements; j++) {
         const struct translate_element *a = &p->translate.key.element[j];
         unsigned variant = p->element_to_buffer_variant[j];

         /* Figure out source pointer address:
          */
         if (variant != last_variant) {
            last_variant = variant;
            vb = get_buffer_ptr(p, index_size, variant, elt);
         }

         if (!translate_attr( p, a,
                              x86_make_disp(vb, a->input_offset),
                              x86_make_disp(p->outbuf_EBX, a->output_offset)))
            return FALSE;
      }

      /* Next output vertex:
       */
      x64_rexw(p->func);
      x86_lea(p->func,
              p->outbuf_EBX,
              x86_make_disp(p->outbuf_EBX,
                            p->translate.key.output_stride));

      /* Incr index
       */
      incr_inputs( p, index_size );
   }

   /* decr count, loop if not zero
    */
   x86_dec(p->func, p->count_EBP);
   x86_jcc(p->func, cc_NZ, label);

   /* Exit mmx state?
    */
   if (p->func->need_emms)
      mmx_emms(p->func);

   /* Land forward jump here:
    */
   x86_fixup_fwd_jump(p->func, fixup);

   /* Pop regs and return
    */

   if(x86_target(p->func) != X86_64_STD_ABI)
   {
      x86_pop(p->func, p->idx_ESI);
      x86_pop(p->func, p->machine_EDI);
   }

   x86_pop(p->func, p->count_EBP);
   x86_pop(p->func, p->outbuf_EBX);

   if(x86_target(p->func) == X86_64_WIN64_ABI)
   {
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 6), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 8));
      sse2_movdqa(p->func, x86_make_reg(file_XMM, 7), x86_make_disp(x86_make_reg(file_REG32, reg_SP), 24));
   }
   x86_ret(p->func);

   return TRUE;
}
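
/*
 * Argument layout assumed by the code above: arg 1 is the machine
 * pointer (kept in EDI), arg 2 the start index or element pointer
 * (ESI), arg 3 the vertex count (EBP, decremented to zero), args 4 and
 * 5 the start instance and instance id, and arg 6 the output buffer
 * (EBX, advanced by output_stride per vertex).
 */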


static void translate_sse_set_buffer( struct translate *translate,
                                      unsigned buf,
                                      const void *ptr,
                                      unsigned stride,
                                      unsigned max_index )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   if (buf < p->nr_buffers) {
      p->buffer[buf].base_ptr = (char *)ptr;
      p->buffer[buf].stride = stride;
      p->buffer[buf].max_index = max_index;
   }

   if (0) debug_printf("%s %d/%d: %p %d\n",
                       __FUNCTION__, buf,
                       p->nr_buffers,
                       ptr, stride);
}


static void translate_sse_release( struct translate *translate )
{
   struct translate_sse *p = (struct translate_sse *)translate;

   x86_release_func( &p->elt8_func );
   x86_release_func( &p->elt16_func );
   x86_release_func( &p->elt_func );
   x86_release_func( &p->linear_func );

   os_free_aligned(p);
}


struct translate *translate_sse2_create( const struct translate_key *key )
{
   struct translate_sse *p = NULL;
   unsigned i;

   /* this is misnamed, it actually refers to whether rtasm is enabled or not */
   if (!rtasm_cpu_has_sse())
      goto fail;

   p = os_malloc_aligned(sizeof(struct translate_sse), 16);
   if (p == NULL)
      goto fail;
   memset(p, 0, sizeof(*p));
   memcpy(p->consts, consts, sizeof(consts));

   p->translate.key = *key;
   p->translate.release = translate_sse_release;
   p->translate.set_buffer = translate_sse_set_buffer;

   for (i = 0; i < key->nr_elements; i++) {
      if (key->element[i].type == TRANSLATE_ELEMENT_NORMAL) {
         unsigned j;

         p->nr_buffers = MAX2(p->nr_buffers, key->element[i].input_buffer + 1);

         if (key->element[i].instance_divisor) {
            p->use_instancing = TRUE;
         }

         /*
          * Map vertex element to vertex buffer variant.
          */
         for (j = 0; j < p->nr_buffer_variants; j++) {
            if (p->buffer_variant[j].buffer_index == key->element[i].input_buffer &&
                p->buffer_variant[j].instance_divisor == key->element[i].instance_divisor) {
               break;
            }
         }
         if (j == p->nr_buffer_variants) {
            p->buffer_variant[j].buffer_index = key->element[i].input_buffer;
            p->buffer_variant[j].instance_divisor = key->element[i].instance_divisor;
            p->nr_buffer_variants++;
         }
         p->element_to_buffer_variant[i] = j;
      } else {
         assert(key->element[i].type == TRANSLATE_ELEMENT_INSTANCE_ID);

         p->element_to_buffer_variant[i] = ELEMENT_BUFFER_INSTANCE_ID;
      }
   }

   if (0) debug_printf("nr_buffers: %d\n", p->nr_buffers);

   if (!build_vertex_emit(p, &p->linear_func, 0))
      goto fail;

   if (!build_vertex_emit(p, &p->elt_func, 4))
      goto fail;

   if (!build_vertex_emit(p, &p->elt16_func, 2))
      goto fail;

   if (!build_vertex_emit(p, &p->elt8_func, 1))
      goto fail;

   p->translate.run = (run_func) x86_get_func(&p->linear_func);
   if (p->translate.run == NULL)
      goto fail;

   p->translate.run_elts = (run_elts_func) x86_get_func(&p->elt_func);
   if (p->translate.run_elts == NULL)
      goto fail;

   p->translate.run_elts16 = (run_elts16_func) x86_get_func(&p->elt16_func);
   if (p->translate.run_elts16 == NULL)
      goto fail;

   p->translate.run_elts8 = (run_elts8_func) x86_get_func(&p->elt8_func);
   if (p->translate.run_elts8 == NULL)
      goto fail;

   return &p->translate;

 fail:
   if (p)
      translate_sse_release( &p->translate );

   return NULL;
}
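
/*
 * Typical use from a driver, as a minimal sketch (the key fields and
 * formats below are illustrative, and the six-argument run() signature
 * is the one implied by build_vertex_emit(), not declared here):
 *
 *    struct translate_key key;
 *    struct translate *t;
 *
 *    memset(&key, 0, sizeof(key));        // offsets/buffer index = 0
 *    key.nr_elements = 1;
 *    key.output_stride = 16;
 *    key.element[0].type = TRANSLATE_ELEMENT_NORMAL;
 *    key.element[0].input_format = PIPE_FORMAT_R8G8B8A8_UNORM;
 *    key.element[0].output_format = PIPE_FORMAT_R32G32B32A32_FLOAT;
 *
 *    t = translate_sse2_create(&key);
 *    if (t) {
 *       t->set_buffer(t, 0, verts, stride, max_index);
 *       t->run(t, 0, count, 0, 0, out);   // linear, non-instanced
 *       t->release(t);
 *    }
 */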



#else

struct translate *translate_sse2_create( const struct translate_key *key )
{
   return NULL;
}

#endif