Subversion Repositories Kolibri OS

Rev

Rev 4358 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
4358 Serge 1
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
23
/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */
30
 
31
extern "C" {

#include <sys/types.h>

#include "main/hash_table.h"
#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
51
 
52
void
53
fs_inst::init()
54
{
55
   memset(this, 0, sizeof(*this));
56
   this->opcode = BRW_OPCODE_NOP;
57
   this->conditional_mod = BRW_CONDITIONAL_NONE;
58
 
59
   this->dst = reg_undef;
60
   this->src[0] = reg_undef;
61
   this->src[1] = reg_undef;
62
   this->src[2] = reg_undef;
63
 
64
   /* This will be the case for almost all instructions. */
65
   this->regs_written = 1;
66
}
67
 
68
/** Default constructor: a NOP with undefined operands. */
fs_inst::fs_inst()
{
   this->init();
}
72
 
73
/** Constructor for an instruction with no destination or sources. */
fs_inst::fs_inst(enum opcode opcode)
{
   this->init();
   this->opcode = opcode;
}
78
 
79
/** Constructor for a destination-only instruction. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   this->init();
   this->opcode = opcode;
   this->dst = dst;

   /* Virtual GRF destinations must have a non-negative register offset. */
   assert(dst.file != GRF || dst.reg_offset >= 0);
}
88
 
89
/** Constructor for a one-source instruction. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   this->init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   /* Virtual GRF operands must have non-negative register offsets. */
   assert(dst.file != GRF || dst.reg_offset >= 0);
   assert(src[0].file != GRF || src[0].reg_offset >= 0);
}
101
 
102
/** Constructor for a two-source instruction. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   this->init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   /* Virtual GRF operands must have non-negative register offsets. */
   assert(dst.file != GRF || dst.reg_offset >= 0);
   assert(src[0].file != GRF || src[0].reg_offset >= 0);
   assert(src[1].file != GRF || src[1].reg_offset >= 0);
}
117
 
118
/** Constructor for a three-source instruction. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
		 fs_reg src0, fs_reg src1, fs_reg src2)
{
   this->init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   /* Virtual GRF operands must have non-negative register offsets. */
   assert(dst.file != GRF || dst.reg_offset >= 0);
   for (int i = 0; i < 3; i++)
      assert(src[i].file != GRF || src[i].reg_offset >= 0);
}
137
 
138
/* Helper macros that stamp out one fs_visitor factory method per ALU
 * opcode, for one-, two- and three-source forms.  Each generated method
 * allocates a new fs_inst for BRW_OPCODE_<op> out of mem_ctx and returns
 * it without emitting it into the instruction stream.
 */
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

/* Instantiate the factory methods for every ALU opcode the visitor uses. */
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
182
 
183
/** Gen4 predicated IF. */
184
fs_inst *
185
fs_visitor::IF(uint32_t predicate)
186
{
187
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
188
   inst->predicate = predicate;
189
   return inst;
190
}
191
 
192
/** Gen6+ IF with embedded comparison. */
193
fs_inst *
194
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
195
{
196
   assert(brw->gen >= 6);
197
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
198
                                        reg_null_d, src0, src1);
199
   inst->conditional_mod = condition;
200
   return inst;
201
}
202
 
203
/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 *
 * @param dst       destination register (type may be rewritten on gen4)
 * @param src0      first comparison operand
 * @param src1      second comparison operand
 * @param condition BRW conditional modifier (e.g. BRW_CONDITIONAL_Z)
 * @return the new (not yet emitted) CMP instruction
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null src0 src1
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (brw->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
	 dst.fixed_hw_reg.type = dst.type;
   }

   /* Lower negate modifiers on unsigned sources before comparing —
    * presumably negate is not directly representable on UD operands
    * (TODO confirm against resolve_ud_negate).
    */
   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
239
 
240
/**
 * Builds the instruction sequence for a pull-constant load whose offset
 * is computed at run time (a "varying" offset).
 *
 * @param dst            destination register for the loaded component
 * @param surf_index     surface (binding table) index of the constant buffer
 * @param varying_offset per-channel offset register
 * @param const_offset   compile-time constant part of the offset
 * @return the list of emitted-ready instructions (ADD, load, MOV)
 */
exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
   fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, const_offset & ~3));

   int scale = 1;
   if (brw->gen == 4 && dispatch_width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   enum opcode op;
   if (brw->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
   /* The load returns a full vec4 (scaled for the SIMD16 trick above). */
   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = 4 * scale;
   instructions.push_tail(inst);

   /* Pre-gen7 sends go through MRFs and need an explicit header. */
   if (brw->gen < 7) {
      inst->base_mrf = 13;
      inst->header_present = true;
      if (brw->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   /* Select the requested component out of the loaded vec4. */
   vec4_result.reg_offset += (const_offset & 3) * scale;
   instructions.push_tail(MOV(dst, vec4_result));

   return instructions;
}
297
 
298
/**
299
 * A helper for MOV generation for fixing up broken hardware SEND dependency
300
 * handling.
301
 */
302
fs_inst *
303
fs_visitor::DEP_RESOLVE_MOV(int grf)
304
{
305
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
306
 
307
   inst->ir = NULL;
308
   inst->annotation = "send dependency resolve";
309
 
310
   /* The caller always wants uncompressed to emit the minimal extra
311
    * dependencies, and to avoid having to deal with aligning its regs to 2.
312
    */
313
   inst->force_uncompressed = true;
314
 
315
   return inst;
316
}
317
 
318
bool
319
fs_inst::equals(fs_inst *inst)
320
{
321
   return (opcode == inst->opcode &&
322
           dst.equals(inst->dst) &&
323
           src[0].equals(inst->src[0]) &&
324
           src[1].equals(inst->src[1]) &&
325
           src[2].equals(inst->src[2]) &&
326
           saturate == inst->saturate &&
327
           predicate == inst->predicate &&
328
           conditional_mod == inst->conditional_mod &&
329
           mlen == inst->mlen &&
330
           base_mrf == inst->base_mrf &&
331
           sampler == inst->sampler &&
332
           target == inst->target &&
333
           eot == inst->eot &&
334
           header_present == inst->header_present &&
335
           shadow_compare == inst->shadow_compare &&
336
           offset == inst->offset);
337
}
338
 
339
bool
340
fs_inst::overwrites_reg(const fs_reg ®)
341
{
342
   return (reg.file == dst.file &&
343
           reg.reg == dst.reg &&
344
           reg.reg_offset >= dst.reg_offset  &&
345
           reg.reg_offset < dst.reg_offset + regs_written);
346
}
347
 
348
bool
349
fs_inst::is_send_from_grf()
350
{
351
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
352
           opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
353
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
354
            src[1].file == GRF));
355
}
356
 
357
bool
358
fs_visitor::can_do_source_mods(fs_inst *inst)
359
{
360
   if (brw->gen == 6 && inst->is_math())
361
      return false;
362
 
363
   if (inst->is_send_from_grf())
364
      return false;
365
 
366
   return true;
367
}
368
 
369
void
370
fs_reg::init()
371
{
372
   memset(this, 0, sizeof(*this));
373
   this->smear = -1;
374
}
375
 
376
/** Generic unset register constructor. */
377
fs_reg::fs_reg()
378
{
379
   init();
380
   this->file = BAD_FILE;
381
}
382
 
383
/** Immediate value constructor. */
384
fs_reg::fs_reg(float f)
385
{
386
   init();
387
   this->file = IMM;
388
   this->type = BRW_REGISTER_TYPE_F;
389
   this->imm.f = f;
390
}
391
 
392
/** Immediate value constructor. */
393
fs_reg::fs_reg(int32_t i)
394
{
395
   init();
396
   this->file = IMM;
397
   this->type = BRW_REGISTER_TYPE_D;
398
   this->imm.i = i;
399
}
400
 
401
/** Immediate value constructor. */
402
fs_reg::fs_reg(uint32_t u)
403
{
404
   init();
405
   this->file = IMM;
406
   this->type = BRW_REGISTER_TYPE_UD;
407
   this->imm.u = u;
408
}
409
 
410
/** Fixed brw_reg Immediate value constructor. */
411
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
412
{
413
   init();
414
   this->file = HW_REG;
415
   this->fixed_hw_reg = fixed_hw_reg;
416
   this->type = fixed_hw_reg.type;
417
}
418
 
419
bool
420
fs_reg::equals(const fs_reg &r) const
421
{
422
   return (file == r.file &&
423
           reg == r.reg &&
424
           reg_offset == r.reg_offset &&
425
           type == r.type &&
426
           negate == r.negate &&
427
           abs == r.abs &&
428
           !reladdr && !r.reladdr &&
429
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
430
                  sizeof(fixed_hw_reg)) == 0 &&
431
           smear == r.smear &&
432
           imm.u == r.imm.u);
433
}
434
 
435
bool
436
fs_reg::is_zero() const
437
{
438
   if (file != IMM)
439
      return false;
440
 
441
   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
442
}
443
 
444
bool
445
fs_reg::is_one() const
446
{
447
   if (file != IMM)
448
      return false;
449
 
450
   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
451
}
452
 
453
bool
454
fs_reg::is_valid_3src() const
455
{
456
   return file == GRF || file == UNIFORM;
457
}
458
 
459
/**
 * Returns the number of scalar register components needed to store a
 * value of the given GLSL type (arrays/structs are summed recursively;
 * samplers occupy no space).
 */
int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      /* Scalars/vectors/matrices: one slot per component. */
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
	 size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}
492
 
493
/**
 * Reads the TIMESTAMP architecture register into a fresh virtual GRF and
 * returns it (with smear set to channel 0, the low 32 bits).
 * Only valid on gen7+.
 */
fs_reg
fs_visitor::get_timestamp()
{
   assert(brw->gen >= 7);

   /* The timestamp lives in the ARF; read it as unsigned dwords. */
   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}
526
 
527
/**
 * Records the starting timestamp for the INTEL_DEBUG shader-time
 * instrumentation; paired with emit_shader_time_end().
 */
void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}
533
 
534
/**
 * Closes the shader-time measurement opened by emit_shader_time_begin():
 * reads the end timestamp, guards against timestamp-reset events, and
 * accumulates either (diff, written) or (reset) into the per-type
 * shader-time buffer slots.
 */
void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   /* Pick the bucket triple matching our dispatch width. */
   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   /* diff = end - start, computed as end + (-start). */
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   /* A reset happened mid-shader: record that instead of a bogus diff. */
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}
582
 
583
void
584
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
585
                                   fs_reg value)
586
{
587
   int shader_time_index =
588
      brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
589
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
590
 
591
   fs_reg payload;
592
   if (dispatch_width == 8)
593
      payload = fs_reg(this, glsl_type::uvec2_type);
594
   else
595
      payload = fs_reg(this, glsl_type::uint_type);
596
 
597
   emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
598
                fs_reg(), payload, offset, value));
599
}
600
 
601
/**
 * Marks the compile as failed, recording a printf-formatted reason in
 * fail_msg.  Only the first failure is recorded; later calls are no-ops.
 * The message is also printed to stderr when DEBUG_WM is enabled.
 */
void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s",  msg);
   }
}
623
 
624
/** Emits a no-operand instruction into the stream. */
fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   fs_inst inst(opcode);
   return emit(inst);
}
629
 
630
/** Emits a destination-only instruction into the stream. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   fs_inst inst(opcode, dst);
   return emit(inst);
}
635
 
636
/** Emits a one-source instruction into the stream. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   fs_inst inst(opcode, dst, src0);
   return emit(inst);
}
641
 
642
/** Emits a two-source instruction into the stream. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   fs_inst inst(opcode, dst, src0, src1);
   return emit(inst);
}
647
 
648
/** Emits a three-source instruction into the stream. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   fs_inst inst(opcode, dst, src0, src1, src2);
   return emit(inst);
}
654
 
655
void
656
fs_visitor::push_force_uncompressed()
657
{
658
   force_uncompressed_stack++;
659
}
660
 
661
void
662
fs_visitor::pop_force_uncompressed()
663
{
664
   force_uncompressed_stack--;
665
   assert(force_uncompressed_stack >= 0);
666
}
667
 
668
void
669
fs_visitor::push_force_sechalf()
670
{
671
   force_sechalf_stack++;
672
}
673
 
674
void
675
fs_visitor::pop_force_sechalf()
676
{
677
   force_sechalf_stack--;
678
   assert(force_sechalf_stack >= 0);
679
}
680
 
681
/**
682
 * Returns true if the instruction has a flag that means it won't
683
 * update an entire destination register.
684
 *
685
 * For example, dead code elimination and live variable analysis want to know
686
 * when a write to a variable screens off any preceding values that were in
687
 * it.
688
 */
689
bool
690
fs_inst::is_partial_write()
691
{
692
   return (this->predicate ||
693
           this->force_uncompressed ||
694
           this->force_sechalf);
695
}
696
 
697
/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   /* A message length of zero means nothing is sent, so no MRFs written. */
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   /* Unary math: one payload register per SIMD8 group. */
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   /* Binary math: two payload registers per SIMD8 group. */
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   /* Texturing messages build their payload with explicit MOVs; only the
    * header MRF is implied here.
    */
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_MS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
745
 
746
int
747
fs_visitor::virtual_grf_alloc(int size)
748
{
749
   if (virtual_grf_array_size <= virtual_grf_count) {
750
      if (virtual_grf_array_size == 0)
751
	 virtual_grf_array_size = 16;
752
      else
753
	 virtual_grf_array_size *= 2;
754
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
755
				   virtual_grf_array_size);
756
   }
757
   virtual_grf_sizes[virtual_grf_count] = size;
758
   return virtual_grf_count++;
759
}
760
 
761
/** Fixed HW reg constructor. */
762
fs_reg::fs_reg(enum register_file file, int reg)
763
{
764
   init();
765
   this->file = file;
766
   this->reg = reg;
767
   this->type = BRW_REGISTER_TYPE_F;
768
}
769
 
770
/** Fixed HW reg constructor. */
771
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
772
{
773
   init();
774
   this->file = file;
775
   this->reg = reg;
776
   this->type = type;
777
}
778
 
779
/** Automatic reg constructor. */
780
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
781
{
782
   init();
783
 
784
   this->file = GRF;
785
   this->reg = v->virtual_grf_alloc(v->type_size(type));
786
   this->reg_offset = 0;
787
   this->type = brw_type_for_base_type(type);
788
}
789
 
790
/** Looks up the register backing \p var, or NULL if none is recorded. */
fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   void *entry = hash_table_find(this->variable_ht, var);
   return static_cast<fs_reg *>(entry);
}
795
 
796
void
797
import_uniforms_callback(const void *key,
798
			 void *data,
799
			 void *closure)
800
{
801
   struct hash_table *dst_ht = (struct hash_table *)closure;
802
   const fs_reg *reg = (const fs_reg *)data;
803
 
804
   if (reg->file != UNIFORM)
805
      return;
806
 
807
   hash_table_insert(dst_ht, data, key);
808
}
809
 
810
/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
811
 * This brings in those uniform definitions
812
 */
813
void
814
fs_visitor::import_uniforms(fs_visitor *v)
815
{
816
   hash_table_call_foreach(v->variable_ht,
817
			   import_uniforms_callback,
818
			   variable_ht);
819
   this->params_remap = v->params_remap;
820
   this->nr_params_remap = v->nr_params_remap;
821
}
822
 
823
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      /* Match "name" exactly, or "name.field" / "name[idx]" prefixes. */
      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      /* Point each param at the driver storage for one component. */
      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
   (void)params_before;
}
865
 
866
 
867
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					    (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
	 int swiz = GET_SWZ(slots[i].swizzle, j);
	 /* A repeated swizzle marks the end of the unique components. */
	 if (swiz == last_swiz)
	    break;
	 last_swiz = swiz;

	 c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}
900
 
901
/**
 * Emits the instructions computing gl_FragCoord (x, y, z, w) into a fresh
 * register, honoring the layout qualifiers pixel_center_integer and
 * origin_upper_left, and returns that register.
 */
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   /* Flip Y when the shader's origin convention disagrees with the FBO's. */
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
	 /* y' = (height - 1) - y, folded into negate + offset. */
	 pixel_y.negate = true;
	 offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (brw->gen >= 6) {
      /* Gen6+ delivers source depth in the thread payload. */
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}
948
 
949
/**
 * Emits a LINTERP of \p interp into \p attr using the barycentric
 * coordinate set selected by the interpolation qualifier and centroid
 * flag (gen6+); earlier hardware has only one mode.
 */
fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;

   if (brw->gen >= 6) {
      const bool persp = (interpolation_mode == INTERP_QUALIFIER_SMOOTH);
      if (is_centroid) {
         barycoord_mode = persp ? BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC
                                : BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else {
         barycoord_mode = persp ? BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC
                                : BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }

   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}
978
 
979
/**
 * Emits interpolation (or flat-shaded constant moves) for every element,
 * column and component of the input variable \p ir, returning the fresh
 * register the results land in.  Slots with no incoming URB setup are
 * skipped (their register space is still reserved).
 */
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolate at the scalar base type even for matrices/arrays. */
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
	    /* Constant interpolation (flat shading) case. The SF has
	     * handed us defined values in only the constant offset
	     * field of the setup reg.
	     */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       interp = suboffset(interp, 3);
               interp.type = reg->type;
	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
	       attr.reg_offset++;
	    }
	 } else {
	    /* Smooth/noperspective interpolation case. */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       /* FINISHME: At some point we probably want to push
		* this farther by giving similar treatment to the
		* other potentially constant components of the
		* attribute, as well as making brw_vs_constval.c
		* handle varyings other than gl_TexCoord.
		*/
               struct brw_reg interp = interp_reg(location, k);
               emit_linterp(attr, fs_reg(interp), interpolation_mode,
                            ir->centroid);
               if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
                  fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                               interpolation_mode, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
               }
               if (brw->gen < 6) {
                  /* Pre-gen6 interpolation yields attr/w; multiply the
                   * perspective term back in.
                   */
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
               }
	       attr.reg_offset++;
	    }

	 }
	 location++;
      }
   }

   return reg;
}
1064
 
1065
/**
 * Emit code producing a 0/1 value for gl_FrontFacing.
 *
 * The facing flag arrives as a bit in the thread payload; where it lives
 * (and its polarity) differs between hardware generations.
 */
fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (brw->gen >= 6) {
      /* Arithmetic-shift g0.0 (as signed dword) right by 15 to bring the
       * facing bit to bit 0, then NOT + AND 1 to invert it into a 0/1
       * front-facing value.  (Assumes bit 15 of g0.0 is "back facing" --
       * confirm against the PRM payload description.)
       */
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      /* CMP writes all-ones on pass; mask down to a 0/1 boolean. */
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
1088
 
1089
/**
 * Legalize one operand of a math instruction for the current generation.
 *
 * Returns the source unchanged when the hardware can consume it directly;
 * otherwise emits a MOV into a fresh temporary and returns that.
 */
fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   bool legal_on_gen6 = (brw->gen == 6 &&
                         src.file != UNIFORM && src.file != IMM &&
                         !src.abs && !src.negate);

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   bool legal_on_gen7 = (brw->gen >= 7 && src.file != IMM);

   if (legal_on_gen6 || legal_on_gen7)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}
1115
 
1116
/**
 * Emit a single-source math instruction (RCP/RSQ/SQRT/EXP2/LOG2/SIN/COS),
 * legalizing the operand on gen6+ and configuring the MRF-based math
 * message on earlier hardware.  Returns the emitted instruction, or NULL
 * on a bad opcode (debug builds assert first).
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   /* Reject anything that isn't one of the unary math functions. */
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Gen6+ math executes on registers directly, but can't take scalar
    * strides, immediates, or source modifiers; copy such operands to a
    * temporary first (see fix_math_operand).
    */
   if (brw->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *math_inst = emit(opcode, dst, src);

   if (brw->gen < 6) {
      /* Pre-gen6 math is a message to the extended math unit: the
       * operand travels in MRFs, one register per 8 channels.
       */
      math_inst->base_mrf = 2;
      math_inst->mlen = dispatch_width / 8;
   }

   return math_inst;
}
1153
 
1154
/**
 * Emit a two-source math instruction (POW, INT_QUOTIENT, INT_REMAINDER).
 *
 * On gen6+ the operands are legalized and the instruction executes
 * directly; before gen6 the second operand is delivered via an MRF
 * message payload (and the INT DIV operands are swapped per the PRM).
 * Returns the emitted instruction, or NULL on a bad opcode (debug builds
 * assert first).
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      /* SIMD16 integer division is not supported on gen7+. */
      if (brw->gen >= 7 && dispatch_width == 16)
	 fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (brw->gen >= 6) {
      /* Math can't take immediate operands (and, on gen6, several other
       * operand forms); copy illegal sources into temporaries.
       */
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      /* So for INT DIV, swap the operands to put the denominator first. */
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      /* The second operand goes out in the message payload (MRF); the
       * instruction itself reads op0 and a null second source.
       */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      /* Two operands' worth of payload, one register per 8 channels. */
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}
1200
 
1201
void
1202
fs_visitor::assign_curb_setup()
1203
{
1204
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1205
   if (dispatch_width == 8) {
1206
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
1207
   } else {
1208
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1209
   }
1210
 
1211
   /* Map the offsets in the UNIFORM file to fixed HW regs. */
1212
   foreach_list(node, &this->instructions) {
1213
      fs_inst *inst = (fs_inst *)node;
1214
 
1215
      for (unsigned int i = 0; i < 3; i++) {
1216
	 if (inst->src[i].file == UNIFORM) {
1217
	    int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1218
	    struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1219
						  constant_nr / 8,
1220
						  constant_nr % 8);
1221
 
1222
	    inst->src[i].file = HW_REG;
1223
	    inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1224
	 }
1225
      }
1226
   }
1227
}
1228
 
1229
/**
 * Decide which URB setup slot each incoming varying lands in.
 *
 * Fills urb_setup[] with a slot index per varying slot (-1 when no setup
 * data is delivered for it) and sets prog_data.urb_read_length.
 */
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (brw->gen >= 6) {
      /* Gen6+: attributes are simply packed in InputsRead order. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
	 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
	    urb_setup[i] = urb_next++;
	 }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

	 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
	    /* The back color slot is skipped when the front color is
	     * also written to.  In addition, some slots can be
	     * written in the vertex shader and not read in the
	     * fragment shader.  So the register number must always be
	     * incremented, mapped or not.
	     */
	    if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
	       urb_setup[i] = urb_next;
            urb_next++;
	 }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}
1277
 
1278
void
1279
fs_visitor::assign_urb_setup()
1280
{
1281
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1282
 
1283
   /* Offset all the urb_setup[] index by the actual position of the
1284
    * setup regs, now that the location of the constants has been chosen.
1285
    */
1286
   foreach_list(node, &this->instructions) {
1287
      fs_inst *inst = (fs_inst *)node;
1288
 
1289
      if (inst->opcode == FS_OPCODE_LINTERP) {
1290
	 assert(inst->src[2].file == HW_REG);
1291
	 inst->src[2].fixed_hw_reg.nr += urb_start;
1292
      }
1293
 
1294
      if (inst->opcode == FS_OPCODE_CINTERP) {
1295
	 assert(inst->src[0].file == HW_REG);
1296
	 inst->src[0].fixed_hw_reg.nr += urb_start;
1297
      }
1298
   }
1299
 
1300
   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1301
}
1302
 
1303
/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   /* NOTE: variable-length arrays are a GCC extension in C++. */
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
	 split_grf[i] = true;
      else
	 split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written > 1) {
	 split_grf[inst->dst.reg] = false;
      }

      /* If we're sending from a GRF, don't split it, on the assumption that
       * the send is reading the whole thing.
       */
      if (inst->is_send_from_grf()) {
         for (int i = 0; i < 3; i++) {
            if (inst->src[i].file == GRF) {
               split_grf[inst->src[i].reg] = false;
            }
         }
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
	 /* The original reg keeps reg_offset 0; new_virtual_grf[i] becomes
	  * the reg for offset 1, with offsets 2..size-1 following
	  * contiguously (verified by the assert below).
	  */
	 new_virtual_grf[i] = virtual_grf_alloc(1);
	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
	    int reg = virtual_grf_alloc(1);
	    assert(reg == new_virtual_grf[i] + j - 1);
	    (void) reg;
	 }
	 this->virtual_grf_sizes[i] = 1;
      }
   }

   /* Rewrite every non-zero-offset access of a split reg to use the
    * newly allocated single-register VGRFs.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
	  split_grf[inst->dst.reg] &&
	  inst->dst.reg_offset != 0) {
	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
			  inst->dst.reg_offset - 1);
	 inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
	 if (inst->src[i].file == GRF &&
	     split_grf[inst->src[i].reg] &&
	     inst->src[i].reg_offset != 0) {
	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
				inst->src[i].reg_offset - 1);
	    inst->src[i].reg_offset = 0;
	 }
      }
   }
   this->live_intervals_valid = false;
}
1406
 
1407
/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many.
    * (remap_table[i] == -1 means unused; any other value means used.)
    */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   /* The initializer above hard-codes these sizes; keep it in sync. */
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
	 remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         /* remap_table now maps old index -> new (compacted) index. */
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_start[new_index] = virtual_grf_start[i];
            virtual_grf_end[new_index] = virtual_grf_end[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
	 special[i]->reg = remap_table[special[i]->reg];
   }
}
1491
 
1492
bool
1493
fs_visitor::remove_dead_constants()
1494
{
1495
   if (dispatch_width == 8) {
1496
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1497
      this->nr_params_remap = c->prog_data.nr_params;
1498
 
1499
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1500
	 this->params_remap[i] = -1;
1501
 
1502
      /* Find which params are still in use. */
1503
      foreach_list(node, &this->instructions) {
1504
	 fs_inst *inst = (fs_inst *)node;
1505
 
1506
	 for (int i = 0; i < 3; i++) {
1507
	    int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1508
 
1509
	    if (inst->src[i].file != UNIFORM)
1510
	       continue;
1511
 
1512
	    /* Section 5.11 of the OpenGL 4.3 spec says:
1513
	     *
1514
	     *     "Out-of-bounds reads return undefined values, which include
1515
	     *     values from other variables of the active program or zero."
1516
	     */
1517
	    if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1518
	       constant_nr = 0;
1519
	    }
1520
 
1521
	    /* For now, set this to non-negative.  We'll give it the
1522
	     * actual new number in a moment, in order to keep the
1523
	     * register numbers nicely ordered.
1524
	     */
1525
	    this->params_remap[constant_nr] = 0;
1526
	 }
1527
      }
1528
 
1529
      /* Figure out what the new numbers for the params will be.  At some
1530
       * point when we're doing uniform array access, we're going to want
1531
       * to keep the distinction between .reg and .reg_offset, but for
1532
       * now we don't care.
1533
       */
1534
      unsigned int new_nr_params = 0;
1535
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1536
	 if (this->params_remap[i] != -1) {
1537
	    this->params_remap[i] = new_nr_params++;
1538
	 }
1539
      }
1540
 
1541
      /* Update the list of params to be uploaded to match our new numbering. */
1542
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1543
	 int remapped = this->params_remap[i];
1544
 
1545
	 if (remapped == -1)
1546
	    continue;
1547
 
1548
	 c->prog_data.param[remapped] = c->prog_data.param[i];
1549
      }
1550
 
1551
      c->prog_data.nr_params = new_nr_params;
1552
   } else {
1553
      /* This should have been generated in the 8-wide pass already. */
1554
      assert(this->params_remap);
1555
   }
1556
 
1557
   /* Now do the renumbering of the shader to remove unused params. */
1558
   foreach_list(node, &this->instructions) {
1559
      fs_inst *inst = (fs_inst *)node;
1560
 
1561
      for (int i = 0; i < 3; i++) {
1562
	 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1563
 
1564
	 if (inst->src[i].file != UNIFORM)
1565
	    continue;
1566
 
1567
	 /* as above alias to 0 */
1568
	 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1569
	    constant_nr = 0;
1570
	 }
1571
	 assert(this->params_remap[constant_nr] != -1);
1572
	 inst->src[i].reg = this->params_remap[constant_nr];
1573
	 inst->src[i].reg_offset = 0;
1574
      }
1575
   }
1576
 
1577
   return true;
1578
}
1579
 
1580
/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   /* NOTE: variable-length arrays are a GCC extension in C++. */
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         /* Only relative-addressed (reladdr) uniform reads are moved. */
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            /* Copy the whole array's worth of params to the pull list. */
            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index,
                                                     *inst->src[i].reladdr,
                                                     pull_constant_loc[uniform] +
                                                     inst->src[i].reg_offset);
         inst->insert_before(&list);

         /* Rewrite the source to read the freshly loaded temporary. */
         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}
1652
 
1653
/**
1654
 * Choose accesses from the UNIFORM file to demote to using the pull
1655
 * constant buffer.
1656
 *
1657
 * We allow a fragment shader to have more than the specified minimum
1658
 * maximum number of fragment shader uniform components (64).  If
1659
 * there are too many of these, they'd fill up all of register space.
1660
 * So, this will push some of them out to the pull constant buffer and
1661
 * update the program to load them.
1662
 */
1663
void
1664
fs_visitor::setup_pull_constants()
1665
{
1666
   /* Only allow 16 registers (128 uniform components) as push constants. */
1667
   unsigned int max_uniform_components = 16 * 8;
1668
   if (c->prog_data.nr_params <= max_uniform_components)
1669
      return;
1670
 
1671
   if (dispatch_width == 16) {
1672
      fail("Pull constants not supported in 16-wide\n");
1673
      return;
1674
   }
1675
 
1676
   /* Just demote the end of the list.  We could probably do better
1677
    * here, demoting things that are rarely used in the program first.
1678
    */
1679
   unsigned int pull_uniform_base = max_uniform_components;
1680
 
1681
   int pull_constant_loc[c->prog_data.nr_params];
1682
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1683
      if (i < pull_uniform_base) {
1684
         pull_constant_loc[i] = -1;
1685
      } else {
1686
         pull_constant_loc[i] = -1;
1687
         /* If our constant is already being uploaded for reladdr purposes,
1688
          * reuse it.
1689
          */
1690
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1691
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1692
               pull_constant_loc[i] = j;
1693
               break;
1694
            }
1695
         }
1696
         if (pull_constant_loc[i] == -1) {
1697
            int pull_index = c->prog_data.nr_pull_params++;
1698
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1699
            pull_constant_loc[i] = pull_index;;
1700
         }
1701
      }
1702
   }
1703
   c->prog_data.nr_params = pull_uniform_base;
1704
 
1705
   foreach_list(node, &this->instructions) {
1706
      fs_inst *inst = (fs_inst *)node;
1707
 
1708
      for (int i = 0; i < 3; i++) {
1709
	 if (inst->src[i].file != UNIFORM)
1710
	    continue;
1711
 
1712
         int pull_index = pull_constant_loc[inst->src[i].reg +
1713
                                            inst->src[i].reg_offset];
1714
         if (pull_index == -1)
1715
	    continue;
1716
 
1717
         assert(!inst->src[i].reladdr);
1718
 
1719
	 fs_reg dst = fs_reg(this, glsl_type::float_type);
1720
	 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1721
	 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1722
	 fs_inst *pull =
1723
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1724
                                 dst, index, offset);
1725
	 pull->ir = inst->ir;
1726
	 pull->annotation = inst->annotation;
1727
 
1728
	 inst->insert_before(pull);
1729
 
1730
	 inst->src[i].file = GRF;
1731
	 inst->src[i].reg = dst.reg;
1732
	 inst->src[i].reg_offset = 0;
1733
	 inst->src[i].smear = pull_index & 3;
1734
      }
1735
   }
1736
}
1737
 
1738
bool
1739
fs_visitor::opt_algebraic()
1740
{
1741
   bool progress = false;
1742
 
1743
   foreach_list(node, &this->instructions) {
1744
      fs_inst *inst = (fs_inst *)node;
1745
 
1746
      switch (inst->opcode) {
1747
      case BRW_OPCODE_MUL:
1748
	 if (inst->src[1].file != IMM)
1749
	    continue;
1750
 
1751
	 /* a * 1.0 = a */
1752
	 if (inst->src[1].is_one()) {
1753
	    inst->opcode = BRW_OPCODE_MOV;
1754
	    inst->src[1] = reg_undef;
1755
	    progress = true;
1756
	    break;
1757
	 }
1758
 
1759
         /* a * 0.0 = 0.0 */
1760
         if (inst->src[1].is_zero()) {
1761
            inst->opcode = BRW_OPCODE_MOV;
1762
            inst->src[0] = inst->src[1];
1763
            inst->src[1] = reg_undef;
1764
            progress = true;
1765
            break;
1766
         }
1767
 
1768
	 break;
1769
      case BRW_OPCODE_ADD:
1770
         if (inst->src[1].file != IMM)
1771
            continue;
1772
 
1773
         /* a + 0.0 = a */
1774
         if (inst->src[1].is_zero()) {
1775
            inst->opcode = BRW_OPCODE_MOV;
1776
            inst->src[1] = reg_undef;
1777
            progress = true;
1778
            break;
1779
         }
1780
         break;
1781
      default:
1782
	 break;
1783
      }
1784
   }
1785
 
1786
   return progress;
1787
}
1788
 
1789
/**
1790
 * Removes any instructions writing a VGRF where that VGRF is not used by any
1791
 * later instruction.
1792
 */
1793
bool
1794
fs_visitor::dead_code_eliminate()
1795
{
1796
   bool progress = false;
1797
   int pc = 0;
1798
 
1799
   calculate_live_intervals();
1800
 
1801
   foreach_list_safe(node, &this->instructions) {
1802
      fs_inst *inst = (fs_inst *)node;
1803
 
1804
      if (inst->dst.file == GRF) {
1805
         assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1806
         if (this->virtual_grf_end[inst->dst.reg] == pc) {
1807
            inst->remove();
1808
            progress = true;
1809
         }
1810
      }
1811
 
1812
      pc++;
1813
   }
1814
 
1815
   if (progress)
1816
      live_intervals_valid = false;
1817
 
1818
   return progress;
1819
}
1820
 
1821
/* Key identifying one (virtual GRF, register offset) write tracked by the
 * local dead-code pass's hash table.
 */
struct dead_code_hash_key
{
   int vgrf;
   int reg_offset;
};

/* Hash-table key-equality callback: keys match when both fields match.
 * (Equivalent to the byte-wise comparison of this padding-free struct.)
 */
static bool
dead_code_hash_compare(const void *a, const void *b)
{
   const struct dead_code_hash_key *ka =
      static_cast<const struct dead_code_hash_key *>(a);
   const struct dead_code_hash_key *kb =
      static_cast<const struct dead_code_hash_key *>(b);

   return ka->vgrf == kb->vgrf && ka->reg_offset == kb->reg_offset;
}
1832
 
1833
static void
1834
clear_dead_code_hash(struct hash_table *ht)
1835
{
1836
   struct hash_entry *entry;
1837
 
1838
   hash_table_foreach(ht, entry) {
1839
      _mesa_hash_table_remove(ht, entry);
1840
   }
1841
}
1842
 
1843
static void
1844
insert_dead_code_hash(struct hash_table *ht,
1845
                      int vgrf, int reg_offset, fs_inst *inst)
1846
{
1847
   /* We don't bother freeing keys, because they'll be GCed with the ht. */
1848
   struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1849
 
1850
   key->vgrf = vgrf;
1851
   key->reg_offset = reg_offset;
1852
 
1853
   _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1854
}
1855
 
1856
static struct hash_entry *
1857
get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1858
{
1859
   struct dead_code_hash_key key;
1860
 
1861
   key.vgrf = vgrf;
1862
   key.reg_offset = reg_offset;
1863
 
1864
   return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1865
}
1866
 
1867
static void
1868
remove_dead_code_hash(struct hash_table *ht,
1869
                      int vgrf, int reg_offset)
1870
{
1871
   struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1872
   if (!entry)
1873
      return;
1874
 
1875
   _mesa_hash_table_remove(ht, entry);
1876
}
1877
 
1878
/**
1879
 * Walks basic blocks, removing any regs that are written but not read before
1880
 * being redefined.
1881
 *
1882
 * The dead_code_eliminate() function implements a global dead code
1883
 * elimination, but it only handles the removing the last write to a register
1884
 * if it's never read.  This one can handle intermediate writes, but only
1885
 * within a basic block.
1886
 */
1887
bool
1888
fs_visitor::dead_code_eliminate_local()
1889
{
1890
   struct hash_table *ht;
1891
   bool progress = false;
1892
 
1893
   ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1894
 
1895
   foreach_list_safe(node, &this->instructions) {
1896
      fs_inst *inst = (fs_inst *)node;
1897
 
1898
      /* At a basic block, empty the HT since we don't understand dataflow
1899
       * here.
1900
       */
1901
      if (inst->is_control_flow()) {
1902
         clear_dead_code_hash(ht);
1903
         continue;
1904
      }
1905
 
1906
      /* Clear the HT of any instructions that got read. */
1907
      for (int i = 0; i < 3; i++) {
1908
         fs_reg src = inst->src[i];
1909
         if (src.file != GRF)
1910
            continue;
1911
 
1912
         int read = 1;
1913
         if (inst->is_send_from_grf())
1914
            read = virtual_grf_sizes[src.reg] - src.reg_offset;
1915
 
1916
         for (int reg_offset = src.reg_offset;
1917
              reg_offset < src.reg_offset + read;
1918
              reg_offset++) {
1919
            remove_dead_code_hash(ht, src.reg, reg_offset);
1920
         }
1921
      }
1922
 
1923
      /* Add any update of a GRF to the HT, removing a previous write if it
1924
       * wasn't read.
1925
       */
1926
      if (inst->dst.file == GRF) {
1927
         if (inst->regs_written > 1) {
1928
            /* We don't know how to trim channels from an instruction's
1929
             * writes, so we can't incrementally remove unread channels from
1930
             * it.  Just remove whatever it overwrites from the table
1931
             */
1932
            for (int i = 0; i < inst->regs_written; i++) {
1933
               remove_dead_code_hash(ht,
1934
                                     inst->dst.reg,
1935
                                     inst->dst.reg_offset + i);
1936
            }
1937
         } else {
1938
            struct hash_entry *entry =
1939
               get_dead_code_hash_entry(ht, inst->dst.reg,
1940
                                        inst->dst.reg_offset);
1941
 
1942
            if (inst->is_partial_write()) {
1943
               /* For a partial write, we can't remove any previous dead code
1944
                * candidate, since we're just modifying their result, but we can
1945
                * be dead code eliminiated ourselves.
1946
                */
1947
               if (entry) {
1948
                  entry->data = inst;
1949
               } else {
1950
                  insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1951
                                        inst);
1952
               }
1953
            } else {
1954
               if (entry) {
1955
                  /* We're completely updating a channel, and there was a
1956
                   * previous write to the channel that wasn't read.  Kill it!
1957
                   */
1958
                  fs_inst *inst = (fs_inst *)entry->data;
1959
                  inst->remove();
1960
                  progress = true;
1961
                  _mesa_hash_table_remove(ht, entry);
1962
               }
1963
 
1964
               insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1965
                                     inst);
1966
            }
1967
         }
1968
      }
1969
   }
1970
 
1971
   _mesa_hash_table_destroy(ht, NULL);
1972
 
1973
   if (progress)
1974
      live_intervals_valid = false;
1975
 
1976
   return progress;
1977
}
1978
 
1979
/**
1980
 * Implements a second type of register coalescing: This one checks if
1981
 * the two regs involved in a raw move don't interfere, in which case
1982
 * they can both by stored in the same place and the MOV removed.
1983
 */
1984
bool
1985
fs_visitor::register_coalesce_2()
1986
{
1987
   bool progress = false;
1988
 
1989
   calculate_live_intervals();
1990
 
1991
   foreach_list_safe(node, &this->instructions) {
1992
      fs_inst *inst = (fs_inst *)node;
1993
 
1994
      if (inst->opcode != BRW_OPCODE_MOV ||
1995
	  inst->is_partial_write() ||
1996
	  inst->saturate ||
1997
	  inst->src[0].file != GRF ||
1998
	  inst->src[0].negate ||
1999
	  inst->src[0].abs ||
2000
	  inst->src[0].smear != -1 ||
2001
	  inst->dst.file != GRF ||
2002
	  inst->dst.type != inst->src[0].type ||
2003
	  virtual_grf_sizes[inst->src[0].reg] != 1 ||
2004
	  virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2005
	 continue;
2006
      }
2007
 
2008
      int reg_from = inst->src[0].reg;
2009
      assert(inst->src[0].reg_offset == 0);
2010
      int reg_to = inst->dst.reg;
2011
      int reg_to_offset = inst->dst.reg_offset;
2012
 
2013
      foreach_list(node, &this->instructions) {
2014
	 fs_inst *scan_inst = (fs_inst *)node;
2015
 
2016
	 if (scan_inst->dst.file == GRF &&
2017
	     scan_inst->dst.reg == reg_from) {
2018
	    scan_inst->dst.reg = reg_to;
2019
	    scan_inst->dst.reg_offset = reg_to_offset;
2020
	 }
2021
	 for (int i = 0; i < 3; i++) {
2022
	    if (scan_inst->src[i].file == GRF &&
2023
		scan_inst->src[i].reg == reg_from) {
2024
	       scan_inst->src[i].reg = reg_to;
2025
	       scan_inst->src[i].reg_offset = reg_to_offset;
2026
	    }
2027
	 }
2028
      }
2029
 
2030
      inst->remove();
2031
 
2032
      /* We don't need to recalculate live intervals inside the loop despite
2033
       * flagging live_intervals_valid because we only use live intervals for
2034
       * the interferes test, and we must have had a situation where the
2035
       * intervals were:
2036
       *
2037
       *  from  to
2038
       *  ^
2039
       *  |
2040
       *  v
2041
       *        ^
2042
       *        |
2043
       *        v
2044
       *
2045
       * Some register R that might get coalesced with one of these two could
2046
       * only be referencing "to", otherwise "from"'s range would have been
2047
       * longer.  R's range could also only start at the end of "to" or later,
2048
       * otherwise it will conflict with "to" when we try to coalesce "to"
2049
       * into Rw anyway.
2050
       */
2051
      live_intervals_valid = false;
2052
 
2053
      progress = true;
2054
      continue;
2055
   }
2056
 
2057
   return progress;
2058
}
2059
 
2060
bool
2061
fs_visitor::register_coalesce()
2062
{
2063
   bool progress = false;
2064
   int if_depth = 0;
2065
   int loop_depth = 0;
2066
 
2067
   foreach_list_safe(node, &this->instructions) {
2068
      fs_inst *inst = (fs_inst *)node;
2069
 
2070
      /* Make sure that we dominate the instructions we're going to
2071
       * scan for interfering with our coalescing, or we won't have
2072
       * scanned enough to see if anything interferes with our
2073
       * coalescing.  We don't dominate the following instructions if
2074
       * we're in a loop or an if block.
2075
       */
2076
      switch (inst->opcode) {
2077
      case BRW_OPCODE_DO:
2078
	 loop_depth++;
2079
	 break;
2080
      case BRW_OPCODE_WHILE:
2081
	 loop_depth--;
2082
	 break;
2083
      case BRW_OPCODE_IF:
2084
	 if_depth++;
2085
	 break;
2086
      case BRW_OPCODE_ENDIF:
2087
	 if_depth--;
2088
	 break;
2089
      default:
2090
	 break;
2091
      }
2092
      if (loop_depth || if_depth)
2093
	 continue;
2094
 
2095
      if (inst->opcode != BRW_OPCODE_MOV ||
2096
	  inst->is_partial_write() ||
2097
	  inst->saturate ||
2098
	  inst->dst.file != GRF || (inst->src[0].file != GRF &&
2099
				    inst->src[0].file != UNIFORM)||
2100
	  inst->dst.type != inst->src[0].type)
2101
	 continue;
2102
 
2103
      bool has_source_modifiers = (inst->src[0].abs ||
2104
                                   inst->src[0].negate ||
2105
                                   inst->src[0].smear != -1 ||
2106
                                   inst->src[0].file == UNIFORM);
2107
 
2108
      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
2109
       * them: check for no writes to either one until the exit of the
2110
       * program.
2111
       */
2112
      bool interfered = false;
2113
 
2114
      for (fs_inst *scan_inst = (fs_inst *)inst->next;
2115
	   !scan_inst->is_tail_sentinel();
2116
	   scan_inst = (fs_inst *)scan_inst->next) {
2117
	 if (scan_inst->dst.file == GRF) {
2118
	    if (scan_inst->overwrites_reg(inst->dst) ||
2119
                scan_inst->overwrites_reg(inst->src[0])) {
2120
	       interfered = true;
2121
	       break;
2122
	    }
2123
	 }
2124
 
2125
         if (has_source_modifiers) {
2126
            for (int i = 0; i < 3; i++) {
2127
               if (scan_inst->src[i].file == GRF &&
2128
                   scan_inst->src[i].reg == inst->dst.reg &&
2129
                   scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2130
                   inst->dst.type != scan_inst->src[i].type)
2131
               {
2132
                 interfered = true;
2133
                 break;
2134
               }
2135
            }
2136
         }
2137
 
2138
 
2139
	 /* The gen6 MATH instruction can't handle source modifiers or
2140
	  * unusual register regions, so avoid coalescing those for
2141
	  * now.  We should do something more specific.
2142
	  */
2143
	 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2144
            interfered = true;
2145
	    break;
2146
	 }
2147
 
2148
	 /* The accumulator result appears to get used for the
2149
	  * conditional modifier generation.  When negating a UD
2150
	  * value, there is a 33rd bit generated for the sign in the
2151
	  * accumulator value, so now you can't check, for example,
2152
	  * equality with a 32-bit value.  See piglit fs-op-neg-uint.
2153
	  */
2154
	 if (scan_inst->conditional_mod &&
2155
	     inst->src[0].negate &&
2156
	     inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2157
	    interfered = true;
2158
	    break;
2159
	 }
2160
      }
2161
      if (interfered) {
2162
	 continue;
2163
      }
2164
 
2165
      /* Rewrite the later usage to point at the source of the move to
2166
       * be removed.
2167
       */
2168
      for (fs_inst *scan_inst = inst;
2169
	   !scan_inst->is_tail_sentinel();
2170
	   scan_inst = (fs_inst *)scan_inst->next) {
2171
	 for (int i = 0; i < 3; i++) {
2172
	    if (scan_inst->src[i].file == GRF &&
2173
		scan_inst->src[i].reg == inst->dst.reg &&
2174
		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2175
	       fs_reg new_src = inst->src[0];
2176
               if (scan_inst->src[i].abs) {
2177
                  new_src.negate = 0;
2178
                  new_src.abs = 1;
2179
               }
2180
	       new_src.negate ^= scan_inst->src[i].negate;
2181
	       scan_inst->src[i] = new_src;
2182
	    }
2183
	 }
2184
      }
2185
 
2186
      inst->remove();
2187
      progress = true;
2188
   }
2189
 
2190
   if (progress)
2191
      live_intervals_valid = false;
2192
 
2193
   return progress;
2194
}
2195
 
2196
 
2197
bool
2198
fs_visitor::compute_to_mrf()
2199
{
2200
   bool progress = false;
2201
   int next_ip = 0;
2202
 
2203
   calculate_live_intervals();
2204
 
2205
   foreach_list_safe(node, &this->instructions) {
2206
      fs_inst *inst = (fs_inst *)node;
2207
 
2208
      int ip = next_ip;
2209
      next_ip++;
2210
 
2211
      if (inst->opcode != BRW_OPCODE_MOV ||
2212
	  inst->is_partial_write() ||
2213
	  inst->dst.file != MRF || inst->src[0].file != GRF ||
2214
	  inst->dst.type != inst->src[0].type ||
2215
	  inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2216
	 continue;
2217
 
2218
      /* Work out which hardware MRF registers are written by this
2219
       * instruction.
2220
       */
2221
      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2222
      int mrf_high;
2223
      if (inst->dst.reg & BRW_MRF_COMPR4) {
2224
	 mrf_high = mrf_low + 4;
2225
      } else if (dispatch_width == 16 &&
2226
		 (!inst->force_uncompressed && !inst->force_sechalf)) {
2227
	 mrf_high = mrf_low + 1;
2228
      } else {
2229
	 mrf_high = mrf_low;
2230
      }
2231
 
2232
      /* Can't compute-to-MRF this GRF if someone else was going to
2233
       * read it later.
2234
       */
2235
      if (this->virtual_grf_end[inst->src[0].reg] > ip)
2236
	 continue;
2237
 
2238
      /* Found a move of a GRF to a MRF.  Let's see if we can go
2239
       * rewrite the thing that made this GRF to write into the MRF.
2240
       */
2241
      fs_inst *scan_inst;
2242
      for (scan_inst = (fs_inst *)inst->prev;
2243
	   scan_inst->prev != NULL;
2244
	   scan_inst = (fs_inst *)scan_inst->prev) {
2245
	 if (scan_inst->dst.file == GRF &&
2246
	     scan_inst->dst.reg == inst->src[0].reg) {
2247
	    /* Found the last thing to write our reg we want to turn
2248
	     * into a compute-to-MRF.
2249
	     */
2250
 
2251
	    /* If this one instruction didn't populate all the
2252
	     * channels, bail.  We might be able to rewrite everything
2253
	     * that writes that reg, but it would require smarter
2254
	     * tracking to delay the rewriting until complete success.
2255
	     */
2256
	    if (scan_inst->is_partial_write())
2257
	       break;
2258
 
2259
            /* Things returning more than one register would need us to
2260
             * understand coalescing out more than one MOV at a time.
2261
             */
2262
            if (scan_inst->regs_written > 1)
2263
               break;
2264
 
2265
	    /* SEND instructions can't have MRF as a destination. */
2266
	    if (scan_inst->mlen)
2267
	       break;
2268
 
2269
	    if (brw->gen == 6) {
2270
	       /* gen6 math instructions must have the destination be
2271
		* GRF, so no compute-to-MRF for them.
2272
		*/
2273
	       if (scan_inst->is_math()) {
2274
		  break;
2275
	       }
2276
	    }
2277
 
2278
	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2279
	       /* Found the creator of our MRF's source value. */
2280
	       scan_inst->dst.file = MRF;
2281
	       scan_inst->dst.reg = inst->dst.reg;
2282
	       scan_inst->saturate |= inst->saturate;
2283
	       inst->remove();
2284
	       progress = true;
2285
	    }
2286
	    break;
2287
	 }
2288
 
2289
	 /* We don't handle control flow here.  Most computation of
2290
	  * values that end up in MRFs are shortly before the MRF
2291
	  * write anyway.
2292
	  */
2293
	 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2294
	    break;
2295
 
2296
	 /* You can't read from an MRF, so if someone else reads our
2297
	  * MRF's source GRF that we wanted to rewrite, that stops us.
2298
	  */
2299
	 bool interfered = false;
2300
	 for (int i = 0; i < 3; i++) {
2301
	    if (scan_inst->src[i].file == GRF &&
2302
		scan_inst->src[i].reg == inst->src[0].reg &&
2303
		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2304
	       interfered = true;
2305
	    }
2306
	 }
2307
	 if (interfered)
2308
	    break;
2309
 
2310
	 if (scan_inst->dst.file == MRF) {
2311
	    /* If somebody else writes our MRF here, we can't
2312
	     * compute-to-MRF before that.
2313
	     */
2314
	    int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2315
	    int scan_mrf_high;
2316
 
2317
	    if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2318
	       scan_mrf_high = scan_mrf_low + 4;
2319
	    } else if (dispatch_width == 16 &&
2320
		       (!scan_inst->force_uncompressed &&
2321
			!scan_inst->force_sechalf)) {
2322
	       scan_mrf_high = scan_mrf_low + 1;
2323
	    } else {
2324
	       scan_mrf_high = scan_mrf_low;
2325
	    }
2326
 
2327
	    if (mrf_low == scan_mrf_low ||
2328
		mrf_low == scan_mrf_high ||
2329
		mrf_high == scan_mrf_low ||
2330
		mrf_high == scan_mrf_high) {
2331
	       break;
2332
	    }
2333
	 }
2334
 
2335
	 if (scan_inst->mlen > 0) {
2336
	    /* Found a SEND instruction, which means that there are
2337
	     * live values in MRFs from base_mrf to base_mrf +
2338
	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2339
	     * above it.
2340
	     */
2341
	    if (mrf_low >= scan_inst->base_mrf &&
2342
		mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2343
	       break;
2344
	    }
2345
	    if (mrf_high >= scan_inst->base_mrf &&
2346
		mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2347
	       break;
2348
	    }
2349
	 }
2350
      }
2351
   }
2352
 
2353
   if (progress)
2354
      live_intervals_valid = false;
2355
 
2356
   return progress;
2357
}
2358
 
2359
/**
2360
 * Walks through basic blocks, looking for repeated MRF writes and
2361
 * removing the later ones.
2362
 */
2363
bool
2364
fs_visitor::remove_duplicate_mrf_writes()
2365
{
2366
   fs_inst *last_mrf_move[16];
2367
   bool progress = false;
2368
 
2369
   /* Need to update the MRF tracking for compressed instructions. */
2370
   if (dispatch_width == 16)
2371
      return false;
2372
 
2373
   memset(last_mrf_move, 0, sizeof(last_mrf_move));
2374
 
2375
   foreach_list_safe(node, &this->instructions) {
2376
      fs_inst *inst = (fs_inst *)node;
2377
 
2378
      if (inst->is_control_flow()) {
2379
	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2380
      }
2381
 
2382
      if (inst->opcode == BRW_OPCODE_MOV &&
2383
	  inst->dst.file == MRF) {
2384
	 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2385
	 if (prev_inst && inst->equals(prev_inst)) {
2386
	    inst->remove();
2387
	    progress = true;
2388
	    continue;
2389
	 }
2390
      }
2391
 
2392
      /* Clear out the last-write records for MRFs that were overwritten. */
2393
      if (inst->dst.file == MRF) {
2394
	 last_mrf_move[inst->dst.reg] = NULL;
2395
      }
2396
 
2397
      if (inst->mlen > 0) {
2398
	 /* Found a SEND instruction, which will include two or fewer
2399
	  * implied MRF writes.  We could do better here.
2400
	  */
2401
	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2402
	    last_mrf_move[inst->base_mrf + i] = NULL;
2403
	 }
2404
      }
2405
 
2406
      /* Clear out any MRF move records whose sources got overwritten. */
2407
      if (inst->dst.file == GRF) {
2408
	 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2409
	    if (last_mrf_move[i] &&
2410
		last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2411
	       last_mrf_move[i] = NULL;
2412
	    }
2413
	 }
2414
      }
2415
 
2416
      if (inst->opcode == BRW_OPCODE_MOV &&
2417
	  inst->dst.file == MRF &&
2418
	  inst->src[0].file == GRF &&
2419
	  !inst->is_partial_write()) {
2420
	 last_mrf_move[inst->dst.reg] = inst;
2421
      }
2422
   }
2423
 
2424
   if (progress)
2425
      live_intervals_valid = false;
2426
 
2427
   return progress;
2428
}
2429
 
2430
static void
2431
clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2432
                        int first_grf, int grf_len)
2433
{
2434
   bool inst_16wide = (dispatch_width > 8 &&
2435
                       !inst->force_uncompressed &&
2436
                       !inst->force_sechalf);
2437
 
2438
   /* Clear the flag for registers that actually got read (as expected). */
2439
   for (int i = 0; i < 3; i++) {
2440
      int grf;
2441
      if (inst->src[i].file == GRF) {
2442
         grf = inst->src[i].reg;
2443
      } else if (inst->src[i].file == HW_REG &&
2444
                 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2445
         grf = inst->src[i].fixed_hw_reg.nr;
2446
      } else {
2447
         continue;
2448
      }
2449
 
2450
      if (grf >= first_grf &&
2451
          grf < first_grf + grf_len) {
2452
         deps[grf - first_grf] = false;
2453
         if (inst_16wide)
2454
            deps[grf - first_grf + 1] = false;
2455
      }
2456
   }
2457
}
2458
 
2459
/**
2460
 * Implements this workaround for the original 965:
2461
 *
2462
 *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2463
 *      check for post destination dependencies on this instruction, software
2464
 *      must ensure that there is no destination hazard for the case of ‘write
2465
 *      followed by a posted write’ shown in the following example.
2466
 *
2467
 *      1. mov r3 0
2468
 *      2. send r3.xy 
2469
 *      3. mov r2 r3
2470
 *
2471
 *      Due to no post-destination dependency check on the ‘send’, the above
2472
 *      code sequence could have two instructions (1 and 2) in flight at the
2473
 *      same time that both consider ‘r3’ as the target of their final writes.
2474
 */
2475
void
2476
fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2477
{
2478
   int reg_size = dispatch_width / 8;
2479
   int write_len = inst->regs_written * reg_size;
2480
   int first_write_grf = inst->dst.reg;
2481
   bool needs_dep[BRW_MAX_MRF];
2482
   assert(write_len < (int)sizeof(needs_dep) - 1);
2483
 
2484
   memset(needs_dep, false, sizeof(needs_dep));
2485
   memset(needs_dep, true, write_len);
2486
 
2487
   clear_deps_for_inst_src(inst, dispatch_width,
2488
                           needs_dep, first_write_grf, write_len);
2489
 
2490
   /* Walk backwards looking for writes to registers we're writing which
2491
    * aren't read since being written.  If we hit the start of the program,
2492
    * we assume that there are no outstanding dependencies on entry to the
2493
    * program.
2494
    */
2495
   for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2496
        scan_inst != NULL;
2497
        scan_inst = (fs_inst *)scan_inst->prev) {
2498
 
2499
      /* If we hit control flow, assume that there *are* outstanding
2500
       * dependencies, and force their cleanup before our instruction.
2501
       */
2502
      if (scan_inst->is_control_flow()) {
2503
         for (int i = 0; i < write_len; i++) {
2504
            if (needs_dep[i]) {
2505
               inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2506
            }
2507
         }
2508
         return;
2509
      }
2510
 
2511
      bool scan_inst_16wide = (dispatch_width > 8 &&
2512
                               !scan_inst->force_uncompressed &&
2513
                               !scan_inst->force_sechalf);
2514
 
2515
      /* We insert our reads as late as possible on the assumption that any
2516
       * instruction but a MOV that might have left us an outstanding
2517
       * dependency has more latency than a MOV.
2518
       */
2519
      if (scan_inst->dst.file == GRF) {
2520
         for (int i = 0; i < scan_inst->regs_written; i++) {
2521
            int reg = scan_inst->dst.reg + i * reg_size;
2522
 
2523
            if (reg >= first_write_grf &&
2524
                reg < first_write_grf + write_len &&
2525
                needs_dep[reg - first_write_grf]) {
2526
               inst->insert_before(DEP_RESOLVE_MOV(reg));
2527
               needs_dep[reg - first_write_grf] = false;
2528
               if (scan_inst_16wide)
2529
                  needs_dep[reg - first_write_grf + 1] = false;
2530
            }
2531
         }
2532
      }
2533
 
2534
      /* Clear the flag for registers that actually got read (as expected). */
2535
      clear_deps_for_inst_src(scan_inst, dispatch_width,
2536
                              needs_dep, first_write_grf, write_len);
2537
 
2538
      /* Continue the loop only if we haven't resolved all the dependencies */
2539
      int i;
2540
      for (i = 0; i < write_len; i++) {
2541
         if (needs_dep[i])
2542
            break;
2543
      }
2544
      if (i == write_len)
2545
         return;
2546
   }
2547
}
2548
 
2549
/**
2550
 * Implements this workaround for the original 965:
2551
 *
2552
 *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2553
 *      used as a destination register until after it has been sourced by an
2554
 *      instruction with a different destination register.
2555
 */
2556
void
2557
fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2558
{
2559
   int write_len = inst->regs_written * dispatch_width / 8;
2560
   int first_write_grf = inst->dst.reg;
2561
   bool needs_dep[BRW_MAX_MRF];
2562
   assert(write_len < (int)sizeof(needs_dep) - 1);
2563
 
2564
   memset(needs_dep, false, sizeof(needs_dep));
2565
   memset(needs_dep, true, write_len);
2566
   /* Walk forwards looking for writes to registers we're writing which aren't
2567
    * read before being written.
2568
    */
2569
   for (fs_inst *scan_inst = (fs_inst *)inst->next;
2570
        !scan_inst->is_tail_sentinel();
2571
        scan_inst = (fs_inst *)scan_inst->next) {
2572
      /* If we hit control flow, force resolve all remaining dependencies. */
2573
      if (scan_inst->is_control_flow()) {
2574
         for (int i = 0; i < write_len; i++) {
2575
            if (needs_dep[i])
2576
               scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2577
         }
2578
         return;
2579
      }
2580
 
2581
      /* Clear the flag for registers that actually got read (as expected). */
2582
      clear_deps_for_inst_src(scan_inst, dispatch_width,
2583
                              needs_dep, first_write_grf, write_len);
2584
 
2585
      /* We insert our reads as late as possible since they're reading the
2586
       * result of a SEND, which has massive latency.
2587
       */
2588
      if (scan_inst->dst.file == GRF &&
2589
          scan_inst->dst.reg >= first_write_grf &&
2590
          scan_inst->dst.reg < first_write_grf + write_len &&
2591
          needs_dep[scan_inst->dst.reg - first_write_grf]) {
2592
         scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2593
         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2594
      }
2595
 
2596
      /* Continue the loop only if we haven't resolved all the dependencies */
2597
      int i;
2598
      for (i = 0; i < write_len; i++) {
2599
         if (needs_dep[i])
2600
            break;
2601
      }
2602
      if (i == write_len)
2603
         return;
2604
   }
2605
 
2606
   /* If we hit the end of the program, resolve all remaining dependencies out
2607
    * of paranoia.
2608
    */
2609
   fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2610
   assert(last_inst->eot);
2611
   for (int i = 0; i < write_len; i++) {
2612
      if (needs_dep[i])
2613
         last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2614
   }
2615
}
2616
 
2617
void
2618
fs_visitor::insert_gen4_send_dependency_workarounds()
2619
{
2620
   if (brw->gen != 4 || brw->is_g4x)
2621
      return;
2622
 
2623
   /* Note that we're done with register allocation, so GRF fs_regs always
2624
    * have a .reg_offset of 0.
2625
    */
2626
 
2627
   foreach_list_safe(node, &this->instructions) {
2628
      fs_inst *inst = (fs_inst *)node;
2629
 
2630
      if (inst->mlen != 0 && inst->dst.file == GRF) {
2631
         insert_gen4_pre_send_dependency_workarounds(inst);
2632
         insert_gen4_post_send_dependency_workarounds(inst);
2633
      }
2634
   }
2635
}
2636
 
2637
/**
2638
 * Turns the generic expression-style uniform pull constant load instruction
2639
 * into a hardware-specific series of instructions for loading a pull
2640
 * constant.
2641
 *
2642
 * The expression style allows the CSE pass before this to optimize out
2643
 * repeated loads from the same offset, and gives the pre-register-allocation
2644
 * scheduling full flexibility, while the conversion to native instructions
2645
 * allows the post-register-allocation scheduler the best information
2646
 * possible.
2647
 *
2648
 * Note that execution masking for setting up pull constant loads is special:
2649
 * the channels that need to be written are unrelated to the current execution
2650
 * mask, since a later instruction will use one of the result channels as a
2651
 * source operand for all 8 or 16 of its channels.
2652
 */
2653
void
2654
fs_visitor::lower_uniform_pull_constant_loads()
2655
{
2656
   foreach_list(node, &this->instructions) {
2657
      fs_inst *inst = (fs_inst *)node;
2658
 
2659
      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2660
         continue;
2661
 
2662
      if (brw->gen >= 7) {
2663
         /* The offset arg before was a vec4-aligned byte offset.  We need to
2664
          * turn it into a dword offset.
2665
          */
2666
         fs_reg const_offset_reg = inst->src[1];
2667
         assert(const_offset_reg.file == IMM &&
2668
                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2669
         const_offset_reg.imm.u /= 4;
2670
         fs_reg payload = fs_reg(this, glsl_type::uint_type);
2671
 
2672
         /* This is actually going to be a MOV, but since only the first dword
2673
          * is accessed, we have a special opcode to do just that one.  Note
2674
          * that this needs to be an operation that will be considered a def
2675
          * by live variable analysis, or register allocation will explode.
2676
          */
2677
         fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2678
                                               payload, const_offset_reg);
2679
         setup->force_writemask_all = true;
2680
 
2681
         setup->ir = inst->ir;
2682
         setup->annotation = inst->annotation;
2683
         inst->insert_before(setup);
2684
 
2685
         /* Similarly, this will only populate the first 4 channels of the
2686
          * result register (since we only use smear values from 0-3), but we
2687
          * don't tell the optimizer.
2688
          */
2689
         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2690
         inst->src[1] = payload;
2691
 
2692
         this->live_intervals_valid = false;
2693
      } else {
2694
         /* Before register allocation, we didn't tell the scheduler about the
2695
          * MRF we use.  We know it's safe to use this MRF because nothing
2696
          * else does except for register spill/unspill, which generates and
2697
          * uses its MRF within a single IR instruction.
2698
          */
2699
         inst->base_mrf = 14;
2700
         inst->mlen = 1;
2701
      }
2702
   }
2703
}
2704
 
2705
void
2706
fs_visitor::dump_instruction(backend_instruction *be_inst)
2707
{
2708
   fs_inst *inst = (fs_inst *)be_inst;
2709
 
2710
   if (inst->predicate) {
2711
      printf("(%cf0.%d) ",
2712
             inst->predicate_inverse ? '-' : '+',
2713
             inst->flag_subreg);
2714
   }
2715
 
2716
   printf("%s", brw_instruction_name(inst->opcode));
2717
   if (inst->saturate)
2718
      printf(".sat");
2719
   if (inst->conditional_mod) {
2720
      printf(".cmod");
2721
      if (!inst->predicate &&
2722
          (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2723
                              inst->opcode != BRW_OPCODE_IF &&
2724
                              inst->opcode != BRW_OPCODE_WHILE))) {
2725
         printf(".f0.%d\n", inst->flag_subreg);
2726
      }
2727
   }
2728
   printf(" ");
2729
 
2730
 
2731
   switch (inst->dst.file) {
2732
   case GRF:
2733
      printf("vgrf%d", inst->dst.reg);
2734
      if (inst->dst.reg_offset)
2735
         printf("+%d", inst->dst.reg_offset);
2736
      break;
2737
   case MRF:
2738
      printf("m%d", inst->dst.reg);
2739
      break;
2740
   case BAD_FILE:
2741
      printf("(null)");
2742
      break;
2743
   case UNIFORM:
2744
      printf("***u%d***", inst->dst.reg);
2745
      break;
2746
   default:
2747
      printf("???");
2748
      break;
2749
   }
2750
   printf(", ");
2751
 
2752
   for (int i = 0; i < 3; i++) {
2753
      if (inst->src[i].negate)
2754
         printf("-");
2755
      if (inst->src[i].abs)
2756
         printf("|");
2757
      switch (inst->src[i].file) {
2758
      case GRF:
2759
         printf("vgrf%d", inst->src[i].reg);
2760
         if (inst->src[i].reg_offset)
2761
            printf("+%d", inst->src[i].reg_offset);
2762
         break;
2763
      case MRF:
2764
         printf("***m%d***", inst->src[i].reg);
2765
         break;
2766
      case UNIFORM:
2767
         printf("u%d", inst->src[i].reg);
2768
         if (inst->src[i].reg_offset)
2769
            printf(".%d", inst->src[i].reg_offset);
2770
         break;
2771
      case BAD_FILE:
2772
         printf("(null)");
2773
         break;
2774
      case IMM:
2775
         switch (inst->src[i].type) {
2776
         case BRW_REGISTER_TYPE_F:
2777
            printf("%ff", inst->src[i].imm.f);
2778
            break;
2779
         case BRW_REGISTER_TYPE_D:
2780
            printf("%dd", inst->src[i].imm.i);
2781
            break;
2782
         case BRW_REGISTER_TYPE_UD:
2783
            printf("%uu", inst->src[i].imm.u);
2784
            break;
2785
         default:
2786
            printf("???");
2787
            break;
2788
         }
2789
         break;
2790
      default:
2791
         printf("???");
2792
         break;
2793
      }
2794
      if (inst->src[i].abs)
2795
         printf("|");
2796
 
2797
      if (i < 3)
2798
         printf(", ");
2799
   }
2800
 
2801
   printf(" ");
2802
 
2803
   if (inst->force_uncompressed)
2804
      printf("1sthalf ");
2805
 
2806
   if (inst->force_sechalf)
2807
      printf("2ndhalf ");
2808
 
2809
   printf("\n");
2810
}
2811
 
2812
/**
2813
 * Possibly returns an instruction that set up @param reg.
2814
 *
2815
 * Sometimes we want to take the result of some expression/variable
2816
 * dereference tree and rewrite the instruction generating the result
2817
 * of the tree.  When processing the tree, we know that the
2818
 * instructions generated are all writing temporaries that are dead
2819
 * outside of this tree.  So, if we have some instructions that write
2820
 * a temporary, we're free to point that temp write somewhere else.
2821
 *
2822
 * Note that this doesn't guarantee that the instruction generated
2823
 * only reg -- it might be the size=4 destination of a texture instruction.
2824
 */
2825
fs_inst *
2826
fs_visitor::get_instruction_generating_reg(fs_inst *start,
2827
					   fs_inst *end,
2828
					   fs_reg reg)
2829
{
2830
   if (end == start ||
2831
       end->is_partial_write() ||
2832
       reg.reladdr ||
2833
       !reg.equals(end->dst)) {
2834
      return NULL;
2835
   } else {
2836
      return end;
2837
   }
2838
}
2839
 
2840
void
2841
fs_visitor::setup_payload_gen6()
2842
{
2843
   bool uses_depth =
2844
      (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2845
   unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2846
 
2847
   assert(brw->gen >= 6);
2848
 
2849
   /* R0-1: masks, pixel X/Y coordinates. */
2850
   c->nr_payload_regs = 2;
2851
   /* R2: only for 32-pixel dispatch.*/
2852
 
2853
   /* R3-26: barycentric interpolation coordinates.  These appear in the
2854
    * same order that they appear in the brw_wm_barycentric_interp_mode
2855
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
2856
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
2857
    * appear if they were enabled using the "Barycentric Interpolation
2858
    * Mode" bits in WM_STATE.
2859
    */
2860
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2861
      if (barycentric_interp_modes & (1 << i)) {
2862
         c->barycentric_coord_reg[i] = c->nr_payload_regs;
2863
         c->nr_payload_regs += 2;
2864
         if (dispatch_width == 16) {
2865
            c->nr_payload_regs += 2;
2866
         }
2867
      }
2868
   }
2869
 
2870
   /* R27: interpolated depth if uses source depth */
2871
   if (uses_depth) {
2872
      c->source_depth_reg = c->nr_payload_regs;
2873
      c->nr_payload_regs++;
2874
      if (dispatch_width == 16) {
2875
         /* R28: interpolated depth if not 8-wide. */
2876
         c->nr_payload_regs++;
2877
      }
2878
   }
2879
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2880
   if (uses_depth) {
2881
      c->source_w_reg = c->nr_payload_regs;
2882
      c->nr_payload_regs++;
2883
      if (dispatch_width == 16) {
2884
         /* R30: interpolated W if not 8-wide. */
2885
         c->nr_payload_regs++;
2886
      }
2887
   }
2888
   /* R31: MSAA position offsets. */
2889
   /* R32-: bary for 32-pixel. */
2890
   /* R58-59: interp W for 32-pixel. */
2891
 
2892
   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2893
      c->source_depth_to_render_target = true;
2894
   }
2895
}
2896
 
2897
/**
 * Main driver for a single FS compile at this visitor's dispatch width:
 * payload setup, IR visit, the optimization loop, scheduling and register
 * allocation.  Returns false (with fail_msg set elsewhere) on failure.
 *
 * NOTE(review): the pass ordering below is deliberate and fragile --
 * e.g. lower_uniform_pull_constant_loads() must run after the first
 * schedule_instructions(false), and the Gen4 SEND workaround must run
 * after register allocation.  Do not reorder without care.
 */
bool
fs_visitor::run()
{
   /* Snapshot parameter counts so we can assert nothing was appended
    * behind the uniform storage's back (see the asserts at the bottom).
    */
   sanity_param_count = fp->Base.Parameters->NumParameters;
   uint32_t orig_nr_params = c->prog_data.nr_params;

   if (brw->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   /* "if (0)" is a developer switch: flip to emit a trivial dummy shader
    * instead of compiling the real program.
    */
   if (0) {
      emit_dummy_fs();
   } else {
      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_begin();

      calculate_urb_setup();
      if (brw->gen < 6)
	 emit_interpolation_setup_gen4();
      else
	 emit_interpolation_setup_gen6();

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (fp->UsesKill || c->key.alpha_test_func) {
         fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
         discard_init->flag_subreg = 1;
      }

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      if (shader) {
         /* GLSL path: visit each top-level IR instruction. */
         foreach_list(node, &*shader->ir) {
            ir_instruction *ir = (ir_instruction *)node;
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         /* ARB_fragment_program / fixed-function path. */
         emit_fragment_program_code();
      }
      base_ir = NULL;
      if (failed)
	 return false;

      /* Placeholder where discarded-pixel control flow will jump to. */
      emit(FS_OPCODE_PLACEHOLDER_HALT);

      if (c->key.alpha_test_func)
         emit_alpha_test();

      emit_fb_writes();

      split_virtual_grfs();

      move_uniform_array_access_to_pull_constants();
      setup_pull_constants();

      /* Run the optimization passes to a fixed point: iterate until a
       * full round makes no further progress.
       */
      bool progress;
      do {
	 progress = false;

         compact_virtual_grfs();

	 progress = remove_duplicate_mrf_writes() || progress;

	 progress = opt_algebraic() || progress;
	 progress = opt_cse() || progress;
	 progress = opt_copy_propagate() || progress;
	 progress = dead_code_eliminate() || progress;
	 progress = dead_code_eliminate_local() || progress;
	 progress = register_coalesce() || progress;
	 progress = register_coalesce_2() || progress;
	 progress = compute_to_mrf() || progress;
      } while (progress);

      remove_dead_constants();

      /* Pre-register-allocation scheduling pass. */
      schedule_instructions(false);

      lower_uniform_pull_constant_loads();

      assign_curb_setup();
      assign_urb_setup();

      /* "if (0)" is a developer switch for debugging register spilling:
       * force every virtual GRF to be spilled.
       */
      if (0) {
	 /* Debug of register spilling: Go spill everything. */
	 for (int i = 0; i < virtual_grf_count; i++) {
	    spill_reg(i);
	 }
      }

      if (0)
	 assign_regs_trivial();
      else {
	 /* assign_regs() returns false when it had to spill; retry until
	  * allocation succeeds or the compile fails outright.
	  */
	 while (!assign_regs()) {
	    if (failed)
	       break;
	 }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   /* This must come after all optimization and register allocation, since
    * it inserts dead code that happens to have side effects, and it does
    * so based on the actual physical registers in use.
    */
   insert_gen4_send_dependency_workarounds();

   if (failed)
      return false;

   /* Post-register-allocation scheduling pass. */
   schedule_instructions(true);

   if (dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);

      /* Make sure we didn't try to sneak in an extra uniform */
      assert(orig_nr_params == c->prog_data.nr_params);
      (void) orig_nr_params;
   }

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == fp->Base.Parameters->NumParameters);

   return !failed;
}
3033
 
3034
/**
 * Top-level entry point for compiling a fragment program to native code.
 *
 * Always compiles an 8-wide (SIMD8) version; additionally attempts a
 * 16-wide (SIMD16) version on Gen5+ when there are no pull parameters,
 * falling back silently (with a perf_debug note) if that compile fails.
 *
 * Returns the generated assembly (size in *final_assembly_size), or NULL
 * on compile failure (with prog->InfoLog updated when prog is non-NULL).
 */
const unsigned *
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   bool start_busy = false;
   float start_time = 0;

   /* Record whether the GPU was already busy so we can report compiles
    * that stalled it (see the perf_debug block at the bottom).
    */
   if (unlikely(brw->perf_debug)) {
      start_busy = (brw->batch.last_bo &&
                    drm_intel_bo_busy(brw->batch.last_bo));
      start_time = get_time();
   }

   /* prog is NULL for ARB_fragment_program / fixed-function programs. */
   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (prog) {
         printf("GLSL IR for native fragment shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_fragment_program %d ir for native fragment shader\n",
                fp->Base.Id);
         _mesa_print_program(&fp->Base);
      }
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw, c, prog, fp, 8);
   if (!v.run()) {
      /* The mandatory SIMD8 compile failed: report and bail out. */
      if (prog) {
         prog->LinkStatus = false;
         ralloc_strcat(&prog->InfoLog, v.fail_msg);
      }

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }

   /* Optional SIMD16 compile; its failure is only a performance issue. */
   exec_list *simd16_instructions = NULL;
   fs_visitor v2(brw, c, prog, fp, 16);
   bool no16 = INTEL_DEBUG & DEBUG_NO16;
   if (brw->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
      /* Reuse the SIMD8 visitor's uniform layout so both variants agree. */
      v2.import_uniforms(&v);
      if (!v2.run()) {
         perf_debug("16-wide shader failed to compile, falling back to "
                    "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
      } else {
         simd16_instructions = &v2.instructions;
      }
   }

   c->prog_data.dispatch_width = 8;

   fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
   const unsigned *generated = g.generate_assembly(&v.instructions,
                                                   simd16_instructions,
                                                   final_assembly_size);

   if (unlikely(brw->perf_debug) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, &c->key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return generated;
}
3113
 
3114
/**
 * Precompiles the fragment shader at link time with a guessed program key,
 * so the common state combination is already in the cache at draw time.
 *
 * The key fields below are heuristic defaults (e.g. depth test on, no
 * texture swizzles, one color region); a draw with different state will
 * simply trigger a recompile later.  Returns the do_wm_prog() result,
 * or true when there is no fragment shader to compile.
 */
bool
brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_wm_prog_key key;

   if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
      return true;

   struct gl_fragment_program *fp = (struct gl_fragment_program *)
      prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
   struct brw_fragment_program *bfp = brw_fragment_program(fp);
   bool program_uses_dfdy = fp->UsesDFdy;

   memset(&key, 0, sizeof(key));

   /* Gen4-5 need the interpolation/Z-buffer (iz) lookup bits. */
   if (brw->gen < 6) {
      if (fp->UsesKill)
         key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;

      if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
         key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

      /* Just assume depth testing. */
      key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
      key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
   }

   if (brw->gen < 6)
      key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);

   /* On Gen4-5 only varyings the FS can actually read are valid slots. */
   for (int i = 0; i < VARYING_SLOT_MAX; i++) {
      if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
	 continue;

      if (brw->gen < 6) {
         if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
            key.input_slots_valid |= BITFIELD64_BIT(i);
      }
   }

   key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;

   for (int i = 0; i < MAX_SAMPLERS; i++) {
      if (fp->Base.ShadowSamplers & (1 << i)) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         key.tex.swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         key.tex.swizzles[i] = SWIZZLE_XYZW;
      }
   }

   /* gl_FragCoord (and dFdy) behavior depends on the drawable. */
   if (fp->Base.InputsRead & VARYING_BIT_POS) {
      key.drawable_height = ctx->DrawBuffer->Height;
   }

   if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
   }

   key.nr_color_regions = 1;

   key.program_string_id = bfp->id;

   /* do_wm_prog() updates brw->wm.prog_offset/prog_data as a side effect;
    * save and restore them so this precompile doesn't disturb the bound
    * program state.
    */
   uint32_t old_prog_offset = brw->wm.prog_offset;
   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;

   bool success = do_wm_prog(brw, prog, bfp, &key);

   brw->wm.prog_offset = old_prog_offset;
   brw->wm.prog_data = old_prog_data;

   return success;
}