Subversion Repositories Kolibri OS

Rev

Rev 4358 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
4358 Serge 1
/*
2
 * Copyright © 2010 Intel Corporation
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining a
5
 * copy of this software and associated documentation files (the "Software"),
6
 * to deal in the Software without restriction, including without limitation
7
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
 * and/or sell copies of the Software, and to permit persons to whom the
9
 * Software is furnished to do so, subject to the following conditions:
10
 *
11
 * The above copyright notice and this permission notice (including the next
12
 * paragraph) shall be included in all copies or substantial portions of the
13
 * Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
 * IN THE SOFTWARE.
22
 */
23
 
24
/** @file brw_fs_emit.cpp
25
 *
26
 * This file supports emitting code from the FS LIR to the actual
27
 * native instructions.
28
 */
29
 
30
extern "C" {
31
#include "main/macros.h"
32
#include "brw_context.h"
33
#include "brw_eu.h"
34
} /* extern "C" */
35
 
36
#include "brw_fs.h"
37
#include "brw_cfg.h"
38
 
39
/**
 * Sets up a generator for one fragment shader compile.
 *
 * The constructor arguments are stored as-is.  The GL context pointer is
 * taken from \p brw, the linked fragment shader is looked up from \p prog
 * (NULL when no shader program was supplied), and the brw_compile state is
 * allocated with rzalloc() using the compile struct \p c as memory context.
 */
fs_generator::fs_generator(struct brw_context *brw,
                           struct brw_wm_compile *c,
                           struct gl_shader_program *prog,
                           struct gl_fragment_program *fp,
                           bool dual_source_output)
   : brw(brw), c(c), prog(prog), fp(fp), dual_source_output(dual_source_output)
{
   /* Allocations made by this generator hang off the compile struct. */
   mem_ctx = c;
   ctx = &brw->ctx;

   if (prog)
      shader = prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
   else
      shader = NULL;

   p = rzalloc(mem_ctx, struct brw_compile);
   brw_init_compile(brw, p, mem_ctx);
}
56
 
57
/** The destructor performs no work of its own. */
fs_generator::~fs_generator()
{
   /* empty */
}
60
 
61
void
62
fs_generator::patch_discard_jumps_to_fb_writes()
63
{
64
   if (brw->gen < 6 || this->discard_halt_patches.is_empty())
65
      return;
66
 
67
   /* There is a somewhat strange undocumented requirement of using
68
    * HALT, according to the simulator.  If some channel has HALTed to
69
    * a particular UIP, then by the end of the program, every channel
70
    * must have HALTed to that UIP.  Furthermore, the tracking is a
71
    * stack, so you can't do the final halt of a UIP after starting
72
    * halting to a new UIP.
73
    *
74
    * Symptoms of not emitting this instruction on actual hardware
75
    * included GPU hangs and sparkly rendering on the piglit discard
76
    * tests.
77
    */
78
   struct brw_instruction *last_halt = gen6_HALT(p);
79
   last_halt->bits3.break_cont.uip = 2;
80
   last_halt->bits3.break_cont.jip = 2;
81
 
82
   int ip = p->nr_insn;
83
 
84
   foreach_list(node, &this->discard_halt_patches) {
85
      ip_record *patch_ip = (ip_record *)node;
86
      struct brw_instruction *patch = &p->store[patch_ip->ip];
87
 
88
      assert(patch->header.opcode == BRW_OPCODE_HALT);
89
      /* HALT takes a half-instruction distance from the pre-incremented IP. */
90
      patch->bits3.break_cont.uip = (ip - patch_ip->ip) * 2;
91
   }
92
 
93
   this->discard_halt_patches.make_empty();
94
}
95
 
96
void
97
fs_generator::generate_fb_write(fs_inst *inst)
98
{
99
   bool eot = inst->eot;
100
   struct brw_reg implied_header;
101
   uint32_t msg_control;
102
 
103
   /* Header is 2 regs, g0 and g1 are the contents. g0 will be implied
104
    * move, here's g1.
105
    */
106
   brw_push_insn_state(p);
107
   brw_set_mask_control(p, BRW_MASK_DISABLE);
108
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
109
 
4401 Serge 110
   if (fp->UsesKill || c->key.alpha_test_func) {
4358 Serge 111
      struct brw_reg pixel_mask;
112
 
113
      if (brw->gen >= 6)
114
         pixel_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
115
      else
116
         pixel_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
117
 
118
      brw_MOV(p, pixel_mask, brw_flag_reg(0, 1));
119
   }
120
 
121
   if (inst->header_present) {
122
      if (brw->gen >= 6) {
123
	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
124
	 brw_MOV(p,
125
		 retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
126
		 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
127
	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
128
 
129
         if (inst->target > 0 && c->key.replicate_alpha) {
130
            /* Set "Source0 Alpha Present to RenderTarget" bit in message
131
             * header.
132
             */
133
            brw_OR(p,
134
		   vec1(retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD)),
135
		   vec1(retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD)),
136
		   brw_imm_ud(0x1 << 11));
137
         }
138
 
139
	 if (inst->target > 0) {
140
	    /* Set the render target index for choosing BLEND_STATE. */
141
	    brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
142
					   inst->base_mrf, 2),
143
			      BRW_REGISTER_TYPE_UD),
144
		    brw_imm_ud(inst->target));
145
	 }
146
 
147
	 implied_header = brw_null_reg();
148
      } else {
149
	 implied_header = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
150
 
151
	 brw_MOV(p,
152
		 brw_message_reg(inst->base_mrf + 1),
153
		 brw_vec8_grf(1, 0));
154
      }
155
   } else {
156
      implied_header = brw_null_reg();
157
   }
158
 
159
   if (this->dual_source_output)
160
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_DUAL_SOURCE_SUBSPAN01;
161
   else if (dispatch_width == 16)
162
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE;
163
   else
164
      msg_control = BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD8_SINGLE_SOURCE_SUBSPAN01;
165
 
166
   brw_pop_insn_state(p);
167
 
168
   brw_fb_WRITE(p,
169
		dispatch_width,
170
		inst->base_mrf,
171
		implied_header,
172
		msg_control,
173
		inst->target,
174
		inst->mlen,
175
		0,
176
		eot,
177
		inst->header_present);
178
}
179
 
180
/* Computes the integer pixel x,y values from the origin.
181
 *
182
 * This is the basis of gl_FragCoord computation, but is also used
183
 * pre-gen6 for computing the deltas from v0 for computing
184
 * interpolation.
185
 */
186
void
187
fs_generator::generate_pixel_xy(struct brw_reg dst, bool is_x)
188
{
189
   struct brw_reg g1_uw = retype(brw_vec1_grf(1, 0), BRW_REGISTER_TYPE_UW);
190
   struct brw_reg src;
191
   struct brw_reg deltas;
192
 
193
   if (is_x) {
194
      src = stride(suboffset(g1_uw, 4), 2, 4, 0);
195
      deltas = brw_imm_v(0x10101010);
196
   } else {
197
      src = stride(suboffset(g1_uw, 5), 2, 4, 0);
198
      deltas = brw_imm_v(0x11001100);
199
   }
200
 
201
   if (dispatch_width == 16) {
202
      dst = vec16(dst);
203
   }
204
 
205
   /* We do this 8 or 16-wide, but since the destination is UW we
206
    * don't do compression in the 16-wide case.
207
    */
208
   brw_push_insn_state(p);
209
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
210
   brw_ADD(p, dst, src, deltas);
211
   brw_pop_insn_state(p);
212
}
213
 
214
void
215
fs_generator::generate_linterp(fs_inst *inst,
216
			     struct brw_reg dst, struct brw_reg *src)
217
{
218
   struct brw_reg delta_x = src[0];
219
   struct brw_reg delta_y = src[1];
220
   struct brw_reg interp = src[2];
221
 
222
   if (brw->has_pln &&
223
       delta_y.nr == delta_x.nr + 1 &&
224
       (brw->gen >= 6 || (delta_x.nr & 1) == 0)) {
225
      brw_PLN(p, dst, interp, delta_x);
226
   } else {
227
      brw_LINE(p, brw_null_reg(), interp, delta_x);
228
      brw_MAC(p, dst, suboffset(interp, 1), delta_y);
229
   }
230
}
231
 
232
void
233
fs_generator::generate_math1_gen7(fs_inst *inst,
234
			        struct brw_reg dst,
235
			        struct brw_reg src0)
236
{
237
   assert(inst->mlen == 0);
238
   brw_math(p, dst,
239
	    brw_math_function(inst->opcode),
240
	    0, src0,
241
	    BRW_MATH_DATA_VECTOR,
242
	    BRW_MATH_PRECISION_FULL);
243
}
244
 
245
void
246
fs_generator::generate_math2_gen7(fs_inst *inst,
247
			        struct brw_reg dst,
248
			        struct brw_reg src0,
249
			        struct brw_reg src1)
250
{
251
   assert(inst->mlen == 0);
252
   brw_math2(p, dst, brw_math_function(inst->opcode), src0, src1);
253
}
254
 
255
void
256
fs_generator::generate_math1_gen6(fs_inst *inst,
257
			        struct brw_reg dst,
258
			        struct brw_reg src0)
259
{
260
   int op = brw_math_function(inst->opcode);
261
 
262
   assert(inst->mlen == 0);
263
 
264
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
265
   brw_math(p, dst,
266
	    op,
267
	    0, src0,
268
	    BRW_MATH_DATA_VECTOR,
269
	    BRW_MATH_PRECISION_FULL);
270
 
271
   if (dispatch_width == 16) {
272
      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
273
      brw_math(p, sechalf(dst),
274
	       op,
275
	       0, sechalf(src0),
276
	       BRW_MATH_DATA_VECTOR,
277
	       BRW_MATH_PRECISION_FULL);
278
      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
279
   }
280
}
281
 
282
void
283
fs_generator::generate_math2_gen6(fs_inst *inst,
284
			        struct brw_reg dst,
285
			        struct brw_reg src0,
286
			        struct brw_reg src1)
287
{
288
   int op = brw_math_function(inst->opcode);
289
 
290
   assert(inst->mlen == 0);
291
 
292
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
293
   brw_math2(p, dst, op, src0, src1);
294
 
295
   if (dispatch_width == 16) {
296
      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
297
      brw_math2(p, sechalf(dst), op, sechalf(src0), sechalf(src1));
298
      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
299
   }
300
}
301
 
302
void
303
fs_generator::generate_math_gen4(fs_inst *inst,
304
			       struct brw_reg dst,
305
			       struct brw_reg src)
306
{
307
   int op = brw_math_function(inst->opcode);
308
 
309
   assert(inst->mlen >= 1);
310
 
311
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
312
   brw_math(p, dst,
313
	    op,
314
	    inst->base_mrf, src,
315
	    BRW_MATH_DATA_VECTOR,
316
	    BRW_MATH_PRECISION_FULL);
317
 
318
   if (dispatch_width == 16) {
319
      brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
320
      brw_math(p, sechalf(dst),
321
	       op,
322
	       inst->base_mrf + 1, sechalf(src),
323
	       BRW_MATH_DATA_VECTOR,
324
	       BRW_MATH_PRECISION_FULL);
325
 
326
      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
327
   }
328
}
329
 
330
void
331
fs_generator::generate_math_g45(fs_inst *inst,
332
                                struct brw_reg dst,
333
                                struct brw_reg src)
334
{
335
   if (inst->opcode == SHADER_OPCODE_POW ||
336
       inst->opcode == SHADER_OPCODE_INT_QUOTIENT ||
337
       inst->opcode == SHADER_OPCODE_INT_REMAINDER) {
338
      generate_math_gen4(inst, dst, src);
339
      return;
340
   }
341
 
342
   int op = brw_math_function(inst->opcode);
343
 
344
   assert(inst->mlen >= 1);
345
 
346
   brw_math(p, dst,
347
            op,
348
            inst->base_mrf, src,
349
            BRW_MATH_DATA_VECTOR,
350
            BRW_MATH_PRECISION_FULL);
351
}
352
 
353
void
354
fs_generator::generate_tex(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
355
{
356
   int msg_type = -1;
357
   int rlen = 4;
358
   uint32_t simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
359
   uint32_t return_format;
360
 
361
   switch (dst.type) {
362
   case BRW_REGISTER_TYPE_D:
363
      return_format = BRW_SAMPLER_RETURN_FORMAT_SINT32;
364
      break;
365
   case BRW_REGISTER_TYPE_UD:
366
      return_format = BRW_SAMPLER_RETURN_FORMAT_UINT32;
367
      break;
368
   default:
369
      return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
370
      break;
371
   }
372
 
373
   if (dispatch_width == 16)
374
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
375
 
376
   if (brw->gen >= 5) {
377
      switch (inst->opcode) {
378
      case SHADER_OPCODE_TEX:
379
	 if (inst->shadow_compare) {
380
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_COMPARE;
381
	 } else {
382
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE;
383
	 }
384
	 break;
385
      case FS_OPCODE_TXB:
386
	 if (inst->shadow_compare) {
387
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS_COMPARE;
388
	 } else {
389
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_BIAS;
390
	 }
391
	 break;
392
      case SHADER_OPCODE_TXL:
393
	 if (inst->shadow_compare) {
394
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD_COMPARE;
395
	 } else {
396
	    msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LOD;
397
	 }
398
	 break;
399
      case SHADER_OPCODE_TXS:
400
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_RESINFO;
401
	 break;
402
      case SHADER_OPCODE_TXD:
403
         if (inst->shadow_compare) {
404
            /* Gen7.5+.  Otherwise, lowered by brw_lower_texture_gradients(). */
405
            assert(brw->is_haswell);
406
            msg_type = HSW_SAMPLER_MESSAGE_SAMPLE_DERIV_COMPARE;
407
         } else {
408
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_DERIVS;
409
         }
410
	 break;
411
      case SHADER_OPCODE_TXF:
412
	 msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
413
	 break;
414
      case SHADER_OPCODE_TXF_MS:
415
         if (brw->gen >= 7)
416
            msg_type = GEN7_SAMPLER_MESSAGE_SAMPLE_LD2DMS;
417
         else
418
            msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
419
         break;
420
      case SHADER_OPCODE_LOD:
421
         msg_type = GEN5_SAMPLER_MESSAGE_LOD;
422
         break;
423
      default:
424
	 assert(!"not reached");
425
	 break;
426
      }
427
   } else {
428
      switch (inst->opcode) {
429
      case SHADER_OPCODE_TEX:
430
	 /* Note that G45 and older determines shadow compare and dispatch width
431
	  * from message length for most messages.
432
	  */
433
	 assert(dispatch_width == 8);
434
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE;
435
	 if (inst->shadow_compare) {
436
	    assert(inst->mlen == 6);
437
	 } else {
438
	    assert(inst->mlen <= 4);
439
	 }
440
	 break;
441
      case FS_OPCODE_TXB:
442
	 if (inst->shadow_compare) {
443
	    assert(inst->mlen == 6);
444
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_BIAS_COMPARE;
445
	 } else {
446
	    assert(inst->mlen == 9);
447
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_BIAS;
448
	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
449
	 }
450
	 break;
451
      case SHADER_OPCODE_TXL:
452
	 if (inst->shadow_compare) {
453
	    assert(inst->mlen == 6);
454
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_LOD_COMPARE;
455
	 } else {
456
	    assert(inst->mlen == 9);
457
	    msg_type = BRW_SAMPLER_MESSAGE_SIMD16_SAMPLE_LOD;
458
	    simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
459
	 }
460
	 break;
461
      case SHADER_OPCODE_TXD:
462
	 /* There is no sample_d_c message; comparisons are done manually */
463
	 assert(inst->mlen == 7 || inst->mlen == 10);
464
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE_GRADIENTS;
465
	 break;
466
      case SHADER_OPCODE_TXF:
467
	 assert(inst->mlen == 9);
468
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
469
	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
470
	 break;
471
      case SHADER_OPCODE_TXS:
472
	 assert(inst->mlen == 3);
473
	 msg_type = BRW_SAMPLER_MESSAGE_SIMD16_RESINFO;
474
	 simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
475
	 break;
476
      default:
477
	 assert(!"not reached");
478
	 break;
479
      }
480
   }
481
   assert(msg_type != -1);
482
 
483
   if (simd_mode == BRW_SAMPLER_SIMD_MODE_SIMD16) {
484
      rlen = 8;
485
      dst = vec16(dst);
486
   }
487
 
488
   /* Load the message header if present.  If there's a texture offset,
489
    * we need to set it up explicitly and load the offset bitfield.
490
    * Otherwise, we can use an implied move from g0 to the first message reg.
491
    */
492
   if (inst->texture_offset) {
493
      brw_push_insn_state(p);
494
      brw_set_mask_control(p, BRW_MASK_DISABLE);
495
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
496
      /* Explicitly set up the message header by copying g0 to the MRF. */
497
      brw_MOV(p, retype(brw_message_reg(inst->base_mrf), BRW_REGISTER_TYPE_UD),
498
                 retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
499
 
500
      /* Then set the offset bits in DWord 2. */
501
      brw_MOV(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
502
                                     inst->base_mrf, 2), BRW_REGISTER_TYPE_UD),
503
                 brw_imm_ud(inst->texture_offset));
504
      brw_pop_insn_state(p);
505
   } else if (inst->header_present) {
506
      /* Set up an implied move from g0 to the MRF. */
507
      src = retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UW);
508
   }
509
 
510
   brw_SAMPLE(p,
511
	      retype(dst, BRW_REGISTER_TYPE_UW),
512
	      inst->base_mrf,
513
	      src,
514
              SURF_INDEX_TEXTURE(inst->sampler),
515
	      inst->sampler,
516
	      msg_type,
517
	      rlen,
518
	      inst->mlen,
519
	      inst->header_present,
520
	      simd_mode,
521
	      return_format);
522
}
523
 
524
 
525
/* For OPCODE_DDX and OPCODE_DDY, per channel of output we've got input
526
 * looking like:
527
 *
528
 * arg0: ss0.tl ss0.tr ss0.bl ss0.br ss1.tl ss1.tr ss1.bl ss1.br
529
 *
530
 * and we're trying to produce:
531
 *
532
 *           DDX                     DDY
533
 * dst: (ss0.tr - ss0.tl)     (ss0.tl - ss0.bl)
534
 *      (ss0.tr - ss0.tl)     (ss0.tr - ss0.br)
535
 *      (ss0.br - ss0.bl)     (ss0.tl - ss0.bl)
536
 *      (ss0.br - ss0.bl)     (ss0.tr - ss0.br)
537
 *      (ss1.tr - ss1.tl)     (ss1.tl - ss1.bl)
538
 *      (ss1.tr - ss1.tl)     (ss1.tr - ss1.br)
539
 *      (ss1.br - ss1.bl)     (ss1.tl - ss1.bl)
540
 *      (ss1.br - ss1.bl)     (ss1.tr - ss1.br)
541
 *
542
 * and add another set of two more subspans if in 16-pixel dispatch mode.
543
 *
544
 * For DDX, it ends up being easy: width = 2, horiz=0 gets us the same result
545
 * for each pair, and vertstride = 2 jumps us 2 elements after processing a
546
 * pair. But for DDY, it's harder, as we want to produce the pairs swizzled
547
 * between each other.  We could probably do it like ddx and swizzle the right
548
 * order later, but bail for now and just produce
549
 * ((ss0.tl - ss0.bl)x4 (ss1.tl - ss1.bl)x4)
550
 */
551
void
552
fs_generator::generate_ddx(fs_inst *inst, struct brw_reg dst, struct brw_reg src)
553
{
554
   struct brw_reg src0 = brw_reg(src.file, src.nr, 1,
555
				 BRW_REGISTER_TYPE_F,
556
				 BRW_VERTICAL_STRIDE_2,
557
				 BRW_WIDTH_2,
558
				 BRW_HORIZONTAL_STRIDE_0,
559
				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
560
   struct brw_reg src1 = brw_reg(src.file, src.nr, 0,
561
				 BRW_REGISTER_TYPE_F,
562
				 BRW_VERTICAL_STRIDE_2,
563
				 BRW_WIDTH_2,
564
				 BRW_HORIZONTAL_STRIDE_0,
565
				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
566
   brw_ADD(p, dst, src0, negate(src1));
567
}
568
 
569
/* The negate_value boolean is used to negate the derivative computation for
570
 * FBOs, since they place the origin at the upper left instead of the lower
571
 * left.
572
 */
573
void
574
fs_generator::generate_ddy(fs_inst *inst, struct brw_reg dst, struct brw_reg src,
575
                         bool negate_value)
576
{
577
   struct brw_reg src0 = brw_reg(src.file, src.nr, 0,
578
				 BRW_REGISTER_TYPE_F,
579
				 BRW_VERTICAL_STRIDE_4,
580
				 BRW_WIDTH_4,
581
				 BRW_HORIZONTAL_STRIDE_0,
582
				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
583
   struct brw_reg src1 = brw_reg(src.file, src.nr, 2,
584
				 BRW_REGISTER_TYPE_F,
585
				 BRW_VERTICAL_STRIDE_4,
586
				 BRW_WIDTH_4,
587
				 BRW_HORIZONTAL_STRIDE_0,
588
				 BRW_SWIZZLE_XYZW, WRITEMASK_XYZW);
589
   if (negate_value)
590
      brw_ADD(p, dst, src1, negate(src0));
591
   else
592
      brw_ADD(p, dst, src0, negate(src1));
593
}
594
 
595
void
596
fs_generator::generate_discard_jump(fs_inst *inst)
597
{
598
   assert(brw->gen >= 6);
599
 
600
   /* This HALT will be patched up at FB write time to point UIP at the end of
601
    * the program, and at brw_uip_jip() JIP will be set to the end of the
602
    * current block (or the program).
603
    */
604
   this->discard_halt_patches.push_tail(new(mem_ctx) ip_record(p->nr_insn));
605
 
606
   brw_push_insn_state(p);
607
   brw_set_mask_control(p, BRW_MASK_DISABLE);
608
   gen6_HALT(p);
609
   brw_pop_insn_state(p);
610
}
611
 
612
void
613
fs_generator::generate_spill(fs_inst *inst, struct brw_reg src)
614
{
615
   assert(inst->mlen != 0);
616
 
617
   brw_MOV(p,
618
	   retype(brw_message_reg(inst->base_mrf + 1), BRW_REGISTER_TYPE_UD),
619
	   retype(src, BRW_REGISTER_TYPE_UD));
620
   brw_oword_block_write_scratch(p, brw_message_reg(inst->base_mrf), 1,
621
				 inst->offset);
622
}
623
 
624
void
625
fs_generator::generate_unspill(fs_inst *inst, struct brw_reg dst)
626
{
627
   assert(inst->mlen != 0);
628
 
629
   brw_oword_block_read_scratch(p, dst, brw_message_reg(inst->base_mrf), 1,
630
				inst->offset);
631
}
632
 
633
void
634
fs_generator::generate_uniform_pull_constant_load(fs_inst *inst,
635
                                                  struct brw_reg dst,
636
                                                  struct brw_reg index,
637
                                                  struct brw_reg offset)
638
{
639
   assert(inst->mlen != 0);
640
 
641
   assert(index.file == BRW_IMMEDIATE_VALUE &&
642
	  index.type == BRW_REGISTER_TYPE_UD);
643
   uint32_t surf_index = index.dw1.ud;
644
 
645
   assert(offset.file == BRW_IMMEDIATE_VALUE &&
646
	  offset.type == BRW_REGISTER_TYPE_UD);
647
   uint32_t read_offset = offset.dw1.ud;
648
 
649
   brw_oword_block_read(p, dst, brw_message_reg(inst->base_mrf),
650
			read_offset, surf_index);
651
}
652
 
653
void
654
fs_generator::generate_uniform_pull_constant_load_gen7(fs_inst *inst,
655
                                                       struct brw_reg dst,
656
                                                       struct brw_reg index,
657
                                                       struct brw_reg offset)
658
{
659
   assert(inst->mlen == 0);
660
 
661
   assert(index.file == BRW_IMMEDIATE_VALUE &&
662
	  index.type == BRW_REGISTER_TYPE_UD);
663
   uint32_t surf_index = index.dw1.ud;
664
 
665
   assert(offset.file == BRW_GENERAL_REGISTER_FILE);
666
   /* Reference just the dword we need, to avoid angering validate_reg(). */
667
   offset = brw_vec1_grf(offset.nr, 0);
668
 
669
   brw_push_insn_state(p);
670
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
671
   brw_set_mask_control(p, BRW_MASK_DISABLE);
672
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
673
   brw_pop_insn_state(p);
674
 
675
   /* We use the SIMD4x2 mode because we want to end up with 4 components in
676
    * the destination loaded consecutively from the same offset (which appears
677
    * in the first component, and the rest are ignored).
678
    */
679
   dst.width = BRW_WIDTH_4;
680
   brw_set_dest(p, send, dst);
681
   brw_set_src0(p, send, offset);
682
   brw_set_sampler_message(p, send,
683
                           surf_index,
684
                           0, /* LD message ignores sampler unit */
685
                           GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
686
                           1, /* rlen */
687
                           1, /* mlen */
688
                           false, /* no header */
689
                           BRW_SAMPLER_SIMD_MODE_SIMD4X2,
690
                           0);
691
}
692
 
693
void
694
fs_generator::generate_varying_pull_constant_load(fs_inst *inst,
695
                                                  struct brw_reg dst,
696
                                                  struct brw_reg index,
697
                                                  struct brw_reg offset)
698
{
699
   assert(brw->gen < 7); /* Should use the gen7 variant. */
700
   assert(inst->header_present);
701
   assert(inst->mlen);
702
 
703
   assert(index.file == BRW_IMMEDIATE_VALUE &&
704
	  index.type == BRW_REGISTER_TYPE_UD);
705
   uint32_t surf_index = index.dw1.ud;
706
 
707
   uint32_t simd_mode, rlen, msg_type;
708
   if (dispatch_width == 16) {
709
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
710
      rlen = 8;
711
   } else {
712
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
713
      rlen = 4;
714
   }
715
 
716
   if (brw->gen >= 5)
717
      msg_type = GEN5_SAMPLER_MESSAGE_SAMPLE_LD;
718
   else {
719
      /* We always use the SIMD16 message so that we only have to load U, and
720
       * not V or R.
721
       */
722
      msg_type = BRW_SAMPLER_MESSAGE_SIMD16_LD;
723
      assert(inst->mlen == 3);
724
      assert(inst->regs_written == 8);
725
      rlen = 8;
726
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
727
   }
728
 
729
   struct brw_reg offset_mrf = retype(brw_message_reg(inst->base_mrf + 1),
730
                                      BRW_REGISTER_TYPE_D);
731
   brw_MOV(p, offset_mrf, offset);
732
 
733
   struct brw_reg header = brw_vec8_grf(0, 0);
734
   gen6_resolve_implied_move(p, &header, inst->base_mrf);
735
 
736
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
737
   send->header.compression_control = BRW_COMPRESSION_NONE;
738
   brw_set_dest(p, send, dst);
739
   brw_set_src0(p, send, header);
740
   if (brw->gen < 6)
741
      send->header.destreg__conditionalmod = inst->base_mrf;
742
 
743
   /* Our surface is set up as floats, regardless of what actual data is
744
    * stored in it.
745
    */
746
   uint32_t return_format = BRW_SAMPLER_RETURN_FORMAT_FLOAT32;
747
   brw_set_sampler_message(p, send,
748
                           surf_index,
749
                           0, /* sampler (unused) */
750
                           msg_type,
751
                           rlen,
752
                           inst->mlen,
753
                           inst->header_present,
754
                           simd_mode,
755
                           return_format);
756
}
757
 
758
void
759
fs_generator::generate_varying_pull_constant_load_gen7(fs_inst *inst,
760
                                                       struct brw_reg dst,
761
                                                       struct brw_reg index,
762
                                                       struct brw_reg offset)
763
{
764
   assert(brw->gen >= 7);
765
   /* Varying-offset pull constant loads are treated as a normal expression on
766
    * gen7, so the fact that it's a send message is hidden at the IR level.
767
    */
768
   assert(!inst->header_present);
769
   assert(!inst->mlen);
770
 
771
   assert(index.file == BRW_IMMEDIATE_VALUE &&
772
	  index.type == BRW_REGISTER_TYPE_UD);
773
   uint32_t surf_index = index.dw1.ud;
774
 
775
   uint32_t simd_mode, rlen, mlen;
776
   if (dispatch_width == 16) {
777
      mlen = 2;
778
      rlen = 8;
779
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD16;
780
   } else {
781
      mlen = 1;
782
      rlen = 4;
783
      simd_mode = BRW_SAMPLER_SIMD_MODE_SIMD8;
784
   }
785
 
786
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
787
   brw_set_dest(p, send, dst);
788
   brw_set_src0(p, send, offset);
789
   brw_set_sampler_message(p, send,
790
                           surf_index,
791
                           0, /* LD message ignores sampler unit */
792
                           GEN5_SAMPLER_MESSAGE_SAMPLE_LD,
793
                           rlen,
794
                           mlen,
795
                           false, /* no header */
796
                           simd_mode,
797
                           0);
798
}
799
 
800
/**
801
 * Cause the current pixel/sample mask (from R1.7 bits 15:0) to be transferred
802
 * into the flags register (f0.0).
803
 *
804
 * Used only on Gen6 and above.
805
 */
806
void
807
fs_generator::generate_mov_dispatch_to_flags(fs_inst *inst)
808
{
809
   struct brw_reg flags = brw_flag_reg(0, inst->flag_subreg);
810
   struct brw_reg dispatch_mask;
811
 
812
   if (brw->gen >= 6)
813
      dispatch_mask = retype(brw_vec1_grf(1, 7), BRW_REGISTER_TYPE_UW);
814
   else
815
      dispatch_mask = retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UW);
816
 
817
   brw_push_insn_state(p);
818
   brw_set_mask_control(p, BRW_MASK_DISABLE);
819
   brw_MOV(p, flags, dispatch_mask);
820
   brw_pop_insn_state(p);
821
}
822
 
823
 
824
/**
 * Maps the register file of an fs_reg (ARF, GRF, MRF or IMM) to the
 * matching BRW_* register file value.  Any other file trips an assertion
 * and falls back to the general register file.
 */
static uint32_t brw_file_from_reg(fs_reg *reg)
{
   if (reg->file == ARF)
      return BRW_ARCHITECTURE_REGISTER_FILE;
   if (reg->file == GRF)
      return BRW_GENERAL_REGISTER_FILE;
   if (reg->file == MRF)
      return BRW_MESSAGE_REGISTER_FILE;
   if (reg->file == IMM)
      return BRW_IMMEDIATE_VALUE;

   assert(!"not reached");
   return BRW_GENERAL_REGISTER_FILE;
}
840
 
841
static struct brw_reg
842
brw_reg_from_fs_reg(fs_reg *reg)
843
{
844
   struct brw_reg brw_reg;
845
 
846
   switch (reg->file) {
847
   case GRF:
848
   case ARF:
849
   case MRF:
850
      if (reg->smear == -1) {
851
	 brw_reg = brw_vec8_reg(brw_file_from_reg(reg), reg->reg, 0);
852
      } else {
853
	 brw_reg = brw_vec1_reg(brw_file_from_reg(reg), reg->reg, reg->smear);
854
      }
855
      brw_reg = retype(brw_reg, reg->type);
856
      if (reg->sechalf)
857
	 brw_reg = sechalf(brw_reg);
858
      break;
859
   case IMM:
860
      switch (reg->type) {
861
      case BRW_REGISTER_TYPE_F:
862
	 brw_reg = brw_imm_f(reg->imm.f);
863
	 break;
864
      case BRW_REGISTER_TYPE_D:
865
	 brw_reg = brw_imm_d(reg->imm.i);
866
	 break;
867
      case BRW_REGISTER_TYPE_UD:
868
	 brw_reg = brw_imm_ud(reg->imm.u);
869
	 break;
870
      default:
871
	 assert(!"not reached");
872
	 brw_reg = brw_null_reg();
873
	 break;
874
      }
875
      break;
876
   case HW_REG:
877
      brw_reg = reg->fixed_hw_reg;
878
      break;
879
   case BAD_FILE:
880
      /* Probably unused. */
881
      brw_reg = brw_null_reg();
882
      break;
883
   case UNIFORM:
884
      assert(!"not reached");
885
      brw_reg = brw_null_reg();
886
      break;
887
   default:
888
      assert(!"not reached");
889
      brw_reg = brw_null_reg();
890
      break;
891
   }
892
   if (reg->abs)
893
      brw_reg = brw_abs(brw_reg);
894
   if (reg->negate)
895
      brw_reg = negate(brw_reg);
896
 
897
   return brw_reg;
898
}
899
 
900
/**
901
 * Sets the first word of a vgrf for gen7+ simd4x2 uniform pull constant
902
 * sampler LD messages.
903
 *
904
 * We don't want to bake it into the send message's code generation because
905
 * that means we don't get a chance to schedule the instructions.
906
 */
907
void
908
fs_generator::generate_set_simd4x2_offset(fs_inst *inst,
909
                                          struct brw_reg dst,
910
                                          struct brw_reg value)
911
{
912
   assert(value.file == BRW_IMMEDIATE_VALUE);
913
 
914
   brw_push_insn_state(p);
915
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
916
   brw_set_mask_control(p, BRW_MASK_DISABLE);
917
   brw_MOV(p, retype(brw_vec1_reg(dst.file, dst.nr, 0), value.type), value);
918
   brw_pop_insn_state(p);
919
}
920
 
921
/**
922
 * Change the register's data type from UD to W, doubling the strides in order
923
 * to compensate for halving the data type width.
924
 */
925
static struct brw_reg
926
ud_reg_to_w(struct brw_reg r)
927
{
928
   assert(r.type == BRW_REGISTER_TYPE_UD);
929
   r.type = BRW_REGISTER_TYPE_W;
930
 
931
   /* The BRW_*_STRIDE enums are defined so that incrementing the field
932
    * doubles the real stride.
933
    */
934
   if (r.hstride != 0)
935
      ++r.hstride;
936
   if (r.vstride != 0)
937
      ++r.vstride;
938
 
939
   return r;
940
}
941
 
942
void
943
fs_generator::generate_pack_half_2x16_split(fs_inst *inst,
944
                                            struct brw_reg dst,
945
                                            struct brw_reg x,
946
                                            struct brw_reg y)
947
{
948
   assert(brw->gen >= 7);
949
   assert(dst.type == BRW_REGISTER_TYPE_UD);
950
   assert(x.type == BRW_REGISTER_TYPE_F);
951
   assert(y.type == BRW_REGISTER_TYPE_F);
952
 
953
   /* From the Ivybridge PRM, Vol4, Part3, Section 6.27 f32to16:
954
    *
955
    *   Because this instruction does not have a 16-bit floating-point type,
956
    *   the destination data type must be Word (W).
957
    *
958
    *   The destination must be DWord-aligned and specify a horizontal stride
959
    *   (HorzStride) of 2. The 16-bit result is stored in the lower word of
960
    *   each destination channel and the upper word is not modified.
961
    */
962
   struct brw_reg dst_w = ud_reg_to_w(dst);
963
 
964
   /* Give each 32-bit channel of dst the form below , where "." means
965
    * unchanged.
966
    *   0x....hhhh
967
    */
968
   brw_F32TO16(p, dst_w, y);
969
 
970
   /* Now the form:
971
    *   0xhhhh0000
972
    */
973
   brw_SHL(p, dst, dst, brw_imm_ud(16u));
974
 
975
   /* And, finally the form of packHalf2x16's output:
976
    *   0xhhhhllll
977
    */
978
   brw_F32TO16(p, dst_w, x);
979
}
980
 
981
void
982
fs_generator::generate_unpack_half_2x16_split(fs_inst *inst,
983
                                              struct brw_reg dst,
984
                                              struct brw_reg src)
985
{
986
   assert(brw->gen >= 7);
987
   assert(dst.type == BRW_REGISTER_TYPE_F);
988
   assert(src.type == BRW_REGISTER_TYPE_UD);
989
 
990
   /* From the Ivybridge PRM, Vol4, Part3, Section 6.26 f16to32:
991
    *
992
    *   Because this instruction does not have a 16-bit floating-point type,
993
    *   the source data type must be Word (W). The destination type must be
994
    *   F (Float).
995
    */
996
   struct brw_reg src_w = ud_reg_to_w(src);
997
 
998
   /* Each channel of src has the form of unpackHalf2x16's input: 0xhhhhllll.
999
    * For the Y case, we wish to access only the upper word; therefore
1000
    * a 16-bit subregister offset is needed.
1001
    */
1002
   assert(inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X ||
1003
          inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y);
1004
   if (inst->opcode == FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y)
1005
      src_w.subnr += 2;
1006
 
1007
   brw_F16TO32(p, dst, src_w);
1008
}
1009
 
1010
void
1011
fs_generator::generate_shader_time_add(fs_inst *inst,
1012
                                       struct brw_reg payload,
1013
                                       struct brw_reg offset,
1014
                                       struct brw_reg value)
1015
{
1016
   assert(brw->gen >= 7);
1017
   brw_push_insn_state(p);
1018
   brw_set_mask_control(p, true);
1019
 
1020
   assert(payload.file == BRW_GENERAL_REGISTER_FILE);
1021
   struct brw_reg payload_offset = retype(brw_vec1_grf(payload.nr, 0),
1022
                                          offset.type);
1023
   struct brw_reg payload_value = retype(brw_vec1_grf(payload.nr + 1, 0),
1024
                                         value.type);
1025
 
1026
   assert(offset.file == BRW_IMMEDIATE_VALUE);
1027
   if (value.file == BRW_GENERAL_REGISTER_FILE) {
1028
      value.width = BRW_WIDTH_1;
1029
      value.hstride = BRW_HORIZONTAL_STRIDE_0;
1030
      value.vstride = BRW_VERTICAL_STRIDE_0;
1031
   } else {
1032
      assert(value.file == BRW_IMMEDIATE_VALUE);
1033
   }
1034
 
1035
   /* Trying to deal with setup of the params from the IR is crazy in the FS8
1036
    * case, and we don't really care about squeezing every bit of performance
1037
    * out of this path, so we just emit the MOVs from here.
1038
    */
1039
   brw_MOV(p, payload_offset, offset);
1040
   brw_MOV(p, payload_value, value);
1041
   brw_shader_time_add(p, payload, SURF_INDEX_WM_SHADER_TIME);
1042
   brw_pop_insn_state(p);
1043
}
1044
 
1045
void
1046
fs_generator::generate_code(exec_list *instructions)
1047
{
1048
   int last_native_insn_offset = p->next_insn_offset;
1049
   const char *last_annotation_string = NULL;
1050
   const void *last_annotation_ir = NULL;
1051
 
1052
   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1053
      if (shader) {
1054
         printf("Native code for fragment shader %d (%d-wide dispatch):\n",
1055
                prog->Name, dispatch_width);
1056
      } else {
1057
         printf("Native code for fragment program %d (%d-wide dispatch):\n",
1058
                fp->Base.Id, dispatch_width);
1059
      }
1060
   }
1061
 
1062
   cfg_t *cfg = NULL;
1063
   if (unlikely(INTEL_DEBUG & DEBUG_WM))
1064
      cfg = new(mem_ctx) cfg_t(mem_ctx, instructions);
1065
 
1066
   foreach_list(node, instructions) {
1067
      fs_inst *inst = (fs_inst *)node;
1068
      struct brw_reg src[3], dst;
1069
 
1070
      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1071
	 foreach_list(node, &cfg->block_list) {
1072
	    bblock_link *link = (bblock_link *)node;
1073
	    bblock_t *block = link->block;
1074
 
1075
	    if (block->start == inst) {
1076
	       printf("   START B%d", block->block_num);
1077
	       foreach_list(predecessor_node, &block->parents) {
1078
		  bblock_link *predecessor_link =
1079
		     (bblock_link *)predecessor_node;
1080
		  bblock_t *predecessor_block = predecessor_link->block;
1081
		  printf(" <-B%d", predecessor_block->block_num);
1082
	       }
1083
	       printf("\n");
1084
	    }
1085
	 }
1086
 
1087
	 if (last_annotation_ir != inst->ir) {
1088
	    last_annotation_ir = inst->ir;
1089
	    if (last_annotation_ir) {
1090
	       printf("   ");
1091
               if (shader)
1092
                  ((ir_instruction *)inst->ir)->print();
1093
               else {
1094
                  const prog_instruction *fpi;
1095
                  fpi = (const prog_instruction *)inst->ir;
1096
                  printf("%d: ", (int)(fpi - fp->Base.Instructions));
1097
                  _mesa_fprint_instruction_opt(stdout,
1098
                                               fpi,
1099
                                               0, PROG_PRINT_DEBUG, NULL);
1100
               }
1101
	       printf("\n");
1102
	    }
1103
	 }
1104
	 if (last_annotation_string != inst->annotation) {
1105
	    last_annotation_string = inst->annotation;
1106
	    if (last_annotation_string)
1107
	       printf("   %s\n", last_annotation_string);
1108
	 }
1109
      }
1110
 
1111
      for (unsigned int i = 0; i < 3; i++) {
1112
	 src[i] = brw_reg_from_fs_reg(&inst->src[i]);
1113
 
1114
	 /* The accumulator result appears to get used for the
1115
	  * conditional modifier generation.  When negating a UD
1116
	  * value, there is a 33rd bit generated for the sign in the
1117
	  * accumulator value, so now you can't check, for example,
1118
	  * equality with a 32-bit value.  See piglit fs-op-neg-uvec4.
1119
	  */
1120
	 assert(!inst->conditional_mod ||
1121
		inst->src[i].type != BRW_REGISTER_TYPE_UD ||
1122
		!inst->src[i].negate);
1123
      }
1124
      dst = brw_reg_from_fs_reg(&inst->dst);
1125
 
1126
      brw_set_conditionalmod(p, inst->conditional_mod);
1127
      brw_set_predicate_control(p, inst->predicate);
1128
      brw_set_predicate_inverse(p, inst->predicate_inverse);
1129
      brw_set_flag_reg(p, 0, inst->flag_subreg);
1130
      brw_set_saturate(p, inst->saturate);
1131
      brw_set_mask_control(p, inst->force_writemask_all);
1132
 
1133
      if (inst->force_uncompressed || dispatch_width == 8) {
1134
	 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1135
      } else if (inst->force_sechalf) {
1136
	 brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1137
      } else {
1138
	 brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1139
      }
1140
 
1141
      switch (inst->opcode) {
1142
      case BRW_OPCODE_MOV:
1143
	 brw_MOV(p, dst, src[0]);
1144
	 break;
1145
      case BRW_OPCODE_ADD:
1146
	 brw_ADD(p, dst, src[0], src[1]);
1147
	 break;
1148
      case BRW_OPCODE_MUL:
1149
	 brw_MUL(p, dst, src[0], src[1]);
1150
	 break;
1151
      case BRW_OPCODE_MACH:
1152
	 brw_set_acc_write_control(p, 1);
1153
	 brw_MACH(p, dst, src[0], src[1]);
1154
	 brw_set_acc_write_control(p, 0);
1155
	 break;
1156
 
1157
      case BRW_OPCODE_MAD:
1158
	 brw_set_access_mode(p, BRW_ALIGN_16);
1159
	 if (dispatch_width == 16) {
1160
	    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1161
	    brw_MAD(p, dst, src[0], src[1], src[2]);
1162
	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1163
	    brw_MAD(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1164
	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1165
	 } else {
1166
	    brw_MAD(p, dst, src[0], src[1], src[2]);
1167
	 }
1168
	 brw_set_access_mode(p, BRW_ALIGN_1);
1169
	 break;
1170
 
1171
      case BRW_OPCODE_LRP:
1172
	 brw_set_access_mode(p, BRW_ALIGN_16);
1173
	 if (dispatch_width == 16) {
1174
	    brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1175
	    brw_LRP(p, dst, src[0], src[1], src[2]);
1176
	    brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1177
	    brw_LRP(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1178
	    brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1179
	 } else {
1180
	    brw_LRP(p, dst, src[0], src[1], src[2]);
1181
	 }
1182
	 brw_set_access_mode(p, BRW_ALIGN_1);
1183
	 break;
1184
 
1185
      case BRW_OPCODE_FRC:
1186
	 brw_FRC(p, dst, src[0]);
1187
	 break;
1188
      case BRW_OPCODE_RNDD:
1189
	 brw_RNDD(p, dst, src[0]);
1190
	 break;
1191
      case BRW_OPCODE_RNDE:
1192
	 brw_RNDE(p, dst, src[0]);
1193
	 break;
1194
      case BRW_OPCODE_RNDZ:
1195
	 brw_RNDZ(p, dst, src[0]);
1196
	 break;
1197
 
1198
      case BRW_OPCODE_AND:
1199
	 brw_AND(p, dst, src[0], src[1]);
1200
	 break;
1201
      case BRW_OPCODE_OR:
1202
	 brw_OR(p, dst, src[0], src[1]);
1203
	 break;
1204
      case BRW_OPCODE_XOR:
1205
	 brw_XOR(p, dst, src[0], src[1]);
1206
	 break;
1207
      case BRW_OPCODE_NOT:
1208
	 brw_NOT(p, dst, src[0]);
1209
	 break;
1210
      case BRW_OPCODE_ASR:
1211
	 brw_ASR(p, dst, src[0], src[1]);
1212
	 break;
1213
      case BRW_OPCODE_SHR:
1214
	 brw_SHR(p, dst, src[0], src[1]);
1215
	 break;
1216
      case BRW_OPCODE_SHL:
1217
	 brw_SHL(p, dst, src[0], src[1]);
1218
	 break;
1219
      case BRW_OPCODE_F32TO16:
1220
         brw_F32TO16(p, dst, src[0]);
1221
         break;
1222
      case BRW_OPCODE_F16TO32:
1223
         brw_F16TO32(p, dst, src[0]);
1224
         break;
1225
      case BRW_OPCODE_CMP:
1226
	 brw_CMP(p, dst, inst->conditional_mod, src[0], src[1]);
1227
	 break;
1228
      case BRW_OPCODE_SEL:
1229
	 brw_SEL(p, dst, src[0], src[1]);
1230
	 break;
1231
      case BRW_OPCODE_BFREV:
1232
         /* BFREV only supports UD type for src and dst. */
1233
         brw_BFREV(p, retype(dst, BRW_REGISTER_TYPE_UD),
1234
                      retype(src[0], BRW_REGISTER_TYPE_UD));
1235
         break;
1236
      case BRW_OPCODE_FBH:
1237
         /* FBH only supports UD type for dst. */
1238
         brw_FBH(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1239
         break;
1240
      case BRW_OPCODE_FBL:
1241
         /* FBL only supports UD type for dst. */
1242
         brw_FBL(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1243
         break;
1244
      case BRW_OPCODE_CBIT:
1245
         /* CBIT only supports UD type for dst. */
1246
         brw_CBIT(p, retype(dst, BRW_REGISTER_TYPE_UD), src[0]);
1247
         break;
1248
 
1249
      case BRW_OPCODE_BFE:
1250
         brw_set_access_mode(p, BRW_ALIGN_16);
1251
         if (dispatch_width == 16) {
1252
            brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1253
            brw_BFE(p, dst, src[0], src[1], src[2]);
1254
            brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1255
            brw_BFE(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1256
            brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1257
         } else {
1258
            brw_BFE(p, dst, src[0], src[1], src[2]);
1259
         }
1260
         brw_set_access_mode(p, BRW_ALIGN_1);
1261
         break;
1262
 
1263
      case BRW_OPCODE_BFI1:
1264
         brw_BFI1(p, dst, src[0], src[1]);
1265
         break;
1266
      case BRW_OPCODE_BFI2:
1267
         brw_set_access_mode(p, BRW_ALIGN_16);
1268
         if (dispatch_width == 16) {
1269
            brw_set_compression_control(p, BRW_COMPRESSION_NONE);
1270
            brw_BFI2(p, dst, src[0], src[1], src[2]);
1271
            brw_set_compression_control(p, BRW_COMPRESSION_2NDHALF);
1272
            brw_BFI2(p, sechalf(dst), sechalf(src[0]), sechalf(src[1]), sechalf(src[2]));
1273
            brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1274
         } else {
1275
            brw_BFI2(p, dst, src[0], src[1], src[2]);
1276
         }
1277
         brw_set_access_mode(p, BRW_ALIGN_1);
1278
         break;
1279
 
1280
      case BRW_OPCODE_IF:
1281
	 if (inst->src[0].file != BAD_FILE) {
1282
	    /* The instruction has an embedded compare (only allowed on gen6) */
1283
	    assert(brw->gen == 6);
1284
	    gen6_IF(p, inst->conditional_mod, src[0], src[1]);
1285
	 } else {
1286
	    brw_IF(p, dispatch_width == 16 ? BRW_EXECUTE_16 : BRW_EXECUTE_8);
1287
	 }
1288
	 break;
1289
 
1290
      case BRW_OPCODE_ELSE:
1291
	 brw_ELSE(p);
1292
	 break;
1293
      case BRW_OPCODE_ENDIF:
1294
	 brw_ENDIF(p);
1295
	 break;
1296
 
1297
      case BRW_OPCODE_DO:
1298
	 brw_DO(p, BRW_EXECUTE_8);
1299
	 break;
1300
 
1301
      case BRW_OPCODE_BREAK:
1302
	 brw_BREAK(p);
1303
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1304
	 break;
1305
      case BRW_OPCODE_CONTINUE:
1306
	 /* FINISHME: We need to write the loop instruction support still. */
1307
	 if (brw->gen >= 6)
1308
	    gen6_CONT(p);
1309
	 else
1310
	    brw_CONT(p);
1311
	 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
1312
	 break;
1313
 
1314
      case BRW_OPCODE_WHILE:
1315
	 brw_WHILE(p);
1316
	 break;
1317
 
1318
      case SHADER_OPCODE_RCP:
1319
      case SHADER_OPCODE_RSQ:
1320
      case SHADER_OPCODE_SQRT:
1321
      case SHADER_OPCODE_EXP2:
1322
      case SHADER_OPCODE_LOG2:
1323
      case SHADER_OPCODE_SIN:
1324
      case SHADER_OPCODE_COS:
1325
	 if (brw->gen >= 7) {
1326
	    generate_math1_gen7(inst, dst, src[0]);
1327
	 } else if (brw->gen == 6) {
1328
	    generate_math1_gen6(inst, dst, src[0]);
1329
	 } else if (brw->gen == 5 || brw->is_g4x) {
1330
	    generate_math_g45(inst, dst, src[0]);
1331
	 } else {
1332
	    generate_math_gen4(inst, dst, src[0]);
1333
	 }
1334
	 break;
1335
      case SHADER_OPCODE_INT_QUOTIENT:
1336
      case SHADER_OPCODE_INT_REMAINDER:
1337
      case SHADER_OPCODE_POW:
1338
	 if (brw->gen >= 7) {
1339
	    generate_math2_gen7(inst, dst, src[0], src[1]);
1340
	 } else if (brw->gen == 6) {
1341
	    generate_math2_gen6(inst, dst, src[0], src[1]);
1342
	 } else {
1343
	    generate_math_gen4(inst, dst, src[0]);
1344
	 }
1345
	 break;
1346
      case FS_OPCODE_PIXEL_X:
1347
	 generate_pixel_xy(dst, true);
1348
	 break;
1349
      case FS_OPCODE_PIXEL_Y:
1350
	 generate_pixel_xy(dst, false);
1351
	 break;
1352
      case FS_OPCODE_CINTERP:
1353
	 brw_MOV(p, dst, src[0]);
1354
	 break;
1355
      case FS_OPCODE_LINTERP:
1356
	 generate_linterp(inst, dst, src);
1357
	 break;
1358
      case SHADER_OPCODE_TEX:
1359
      case FS_OPCODE_TXB:
1360
      case SHADER_OPCODE_TXD:
1361
      case SHADER_OPCODE_TXF:
1362
      case SHADER_OPCODE_TXF_MS:
1363
      case SHADER_OPCODE_TXL:
1364
      case SHADER_OPCODE_TXS:
1365
      case SHADER_OPCODE_LOD:
1366
	 generate_tex(inst, dst, src[0]);
1367
	 break;
1368
      case FS_OPCODE_DDX:
1369
	 generate_ddx(inst, dst, src[0]);
1370
	 break;
1371
      case FS_OPCODE_DDY:
1372
         /* Make sure fp->UsesDFdy flag got set (otherwise there's no
1373
          * guarantee that c->key.render_to_fbo is set).
1374
          */
1375
         assert(fp->UsesDFdy);
1376
	 generate_ddy(inst, dst, src[0], c->key.render_to_fbo);
1377
	 break;
1378
 
1379
      case FS_OPCODE_SPILL:
1380
	 generate_spill(inst, src[0]);
1381
	 break;
1382
 
1383
      case FS_OPCODE_UNSPILL:
1384
	 generate_unspill(inst, dst);
1385
	 break;
1386
 
1387
      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1388
	 generate_uniform_pull_constant_load(inst, dst, src[0], src[1]);
1389
	 break;
1390
 
1391
      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
1392
	 generate_uniform_pull_constant_load_gen7(inst, dst, src[0], src[1]);
1393
	 break;
1394
 
1395
      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1396
	 generate_varying_pull_constant_load(inst, dst, src[0], src[1]);
1397
	 break;
1398
 
1399
      case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
1400
	 generate_varying_pull_constant_load_gen7(inst, dst, src[0], src[1]);
1401
	 break;
1402
 
1403
      case FS_OPCODE_FB_WRITE:
1404
	 generate_fb_write(inst);
1405
	 break;
1406
 
1407
      case FS_OPCODE_MOV_DISPATCH_TO_FLAGS:
1408
         generate_mov_dispatch_to_flags(inst);
1409
         break;
1410
 
1411
      case FS_OPCODE_DISCARD_JUMP:
1412
         generate_discard_jump(inst);
1413
         break;
1414
 
1415
      case SHADER_OPCODE_SHADER_TIME_ADD:
1416
         generate_shader_time_add(inst, src[0], src[1], src[2]);
1417
         break;
1418
 
1419
      case FS_OPCODE_SET_SIMD4X2_OFFSET:
1420
         generate_set_simd4x2_offset(inst, dst, src[0]);
1421
         break;
1422
 
1423
      case FS_OPCODE_PACK_HALF_2x16_SPLIT:
1424
          generate_pack_half_2x16_split(inst, dst, src[0], src[1]);
1425
          break;
1426
 
1427
      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_X:
1428
      case FS_OPCODE_UNPACK_HALF_2x16_SPLIT_Y:
1429
         generate_unpack_half_2x16_split(inst, dst, src[0]);
1430
         break;
1431
 
1432
      case FS_OPCODE_PLACEHOLDER_HALT:
1433
         /* This is the place where the final HALT needs to be inserted if
1434
          * we've emitted any discards.  If not, this will emit no code.
1435
          */
1436
         patch_discard_jumps_to_fb_writes();
1437
         break;
1438
 
1439
      default:
1440
	 if (inst->opcode < (int) ARRAY_SIZE(opcode_descs)) {
1441
	    _mesa_problem(ctx, "Unsupported opcode `%s' in FS",
1442
			  opcode_descs[inst->opcode].name);
1443
	 } else {
1444
	    _mesa_problem(ctx, "Unsupported opcode %d in FS", inst->opcode);
1445
	 }
1446
	 abort();
1447
      }
1448
 
1449
      if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1450
	 brw_dump_compile(p, stdout,
1451
			  last_native_insn_offset, p->next_insn_offset);
1452
 
1453
	 foreach_list(node, &cfg->block_list) {
1454
	    bblock_link *link = (bblock_link *)node;
1455
	    bblock_t *block = link->block;
1456
 
1457
	    if (block->end == inst) {
1458
	       printf("   END B%d", block->block_num);
1459
	       foreach_list(successor_node, &block->children) {
1460
		  bblock_link *successor_link =
1461
		     (bblock_link *)successor_node;
1462
		  bblock_t *successor_block = successor_link->block;
1463
		  printf(" ->B%d", successor_block->block_num);
1464
	       }
1465
	       printf("\n");
1466
	    }
1467
	 }
1468
      }
1469
 
1470
      last_native_insn_offset = p->next_insn_offset;
1471
   }
1472
 
1473
   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1474
      printf("\n");
1475
   }
1476
 
1477
   brw_set_uip_jip(p);
1478
 
1479
   /* OK, while the INTEL_DEBUG=wm above is very nice for debugging FS
1480
    * emit issues, it doesn't get the jump distances into the output,
1481
    * which is often something we want to debug.  So this is here in
1482
    * case you're doing that.
1483
    */
1484
   if (0) {
1485
      brw_dump_compile(p, stdout, 0, p->next_insn_offset);
1486
   }
1487
}
1488
 
1489
const unsigned *
1490
fs_generator::generate_assembly(exec_list *simd8_instructions,
1491
                                exec_list *simd16_instructions,
1492
                                unsigned *assembly_size)
1493
{
1494
   dispatch_width = 8;
1495
   generate_code(simd8_instructions);
1496
 
1497
   if (simd16_instructions) {
1498
      /* We have to do a compaction pass now, or the one at the end of
1499
       * execution will squash down where our prog_offset start needs
1500
       * to be.
1501
       */
1502
      brw_compact_instructions(p);
1503
 
1504
      /* align to 64 byte boundary. */
1505
      while ((p->nr_insn * sizeof(struct brw_instruction)) % 64) {
1506
         brw_NOP(p);
1507
      }
1508
 
1509
      /* Save off the start of this 16-wide program */
1510
      c->prog_data.prog_offset_16 = p->nr_insn * sizeof(struct brw_instruction);
1511
 
1512
      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
1513
 
1514
      dispatch_width = 16;
1515
      generate_code(simd16_instructions);
1516
   }
1517
 
1518
   return brw_get_program(p, assembly_size);
1519
}