/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

#include <sys/types.h>

#include "util/hash_table.h"
#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "util/register_allocate.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_dead_control_flow.h"
#include "main/uniforms.h"
#include "brw_fs_live_variables.h"
#include "glsl/glsl_types.h"
#include "program/sampler.h"

void
fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
              const fs_reg *src, unsigned sources)
{
   memset(this, 0, sizeof(*this));

   this->src = new fs_reg[MAX2(sources, 3)];
   for (unsigned i = 0; i < sources; i++)
      this->src[i] = src[i];

   this->opcode = opcode;
   this->dst = dst;
   this->sources = sources;
   this->exec_size = exec_size;

   assert(dst.file != IMM && dst.file != UNIFORM);

   /* If exec_size == 0, try to guess it from the registers.  Since all
    * manner of things may use hardware registers, we first try to guess
    * based on GRF registers.  If this fails, we will go ahead and take the
    * width from the destination register.
    */
   if (this->exec_size == 0) {
      if (dst.file == GRF) {
         this->exec_size = dst.width;
      } else {
         for (unsigned i = 0; i < sources; ++i) {
            if (src[i].file != GRF && src[i].file != ATTR)
               continue;

            if (this->exec_size <= 1)
               this->exec_size = src[i].width;
            assert(src[i].width == 1 || src[i].width == this->exec_size);
         }
      }

      if (this->exec_size == 0 && dst.file != BAD_FILE)
         this->exec_size = dst.width;
   }
   assert(this->exec_size != 0);

   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* This will be the case for almost all instructions. */
   switch (dst.file) {
   case GRF:
   case HW_REG:
   case MRF:
   case ATTR:
      this->regs_written =
         DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
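      /* Illustrative example (not in the original source): a SIMD16 float
       * destination with stride 1 covers 16 * 1 * 4 = 64 bytes, so
       * regs_written = DIV_ROUND_UP(64, 32) = 2 GRFs.
       */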
      break;
   case BAD_FILE:
      this->regs_written = 0;
      break;
   case IMM:
   case UNIFORM:
      unreachable("Invalid destination register file");
   default:
      unreachable("Invalid register file");
   }

   this->writes_accumulator = false;
}

fs_inst::fs_inst()
{
   init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
{
   init(opcode, exec_size, reg_undef, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
{
   init(opcode, 0, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0)
{
   const fs_reg src[1] = { src0 };
   init(opcode, exec_size, dst, src, 1);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
{
   const fs_reg src[1] = { src0 };
   init(opcode, 0, dst, src, 1);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1)
{
   const fs_reg src[2] = { src0, src1 };
   init(opcode, exec_size, dst, src, 2);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1)
{
   const fs_reg src[2] = { src0, src1 };
   init(opcode, 0, dst, src, 2);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
{
   const fs_reg src[3] = { src0, src1, src2 };
   init(opcode, exec_size, dst, src, 3);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1, const fs_reg &src2)
{
   const fs_reg src[3] = { src0, src1, src2 };
   init(opcode, 0, dst, src, 3);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
                 const fs_reg src[], unsigned sources)
{
   init(opcode, 0, dst, src, sources);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
                 const fs_reg src[], unsigned sources)
{
   init(opcode, exec_width, dst, src, sources);
}

fs_inst::fs_inst(const fs_inst &that)
{
   memcpy(this, &that, sizeof(that));

   this->src = new fs_reg[MAX2(that.sources, 3)];

   for (unsigned i = 0; i < that.sources; i++)
      this->src[i] = that.src[i];
}

fs_inst::~fs_inst()
{
   delete[] this->src;
}

void
fs_inst::resize_sources(uint8_t num_sources)
{
   if (this->sources != num_sources) {
      fs_reg *src = new fs_reg[MAX2(num_sources, 3)];

      for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
         src[i] = this->src[i];

      delete[] this->src;
      this->src = src;
      this->sources = num_sources;
   }
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0)                \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1)                                   \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU2_ACC(op)                                                    \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1)                                   \
   {                                                                    \
      fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1, const fs_reg &src2)               \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }
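
/* Illustrative note (not in the original source): ALU2(ADD) below expands to
 *
 *    fs_inst *
 *    fs_visitor::ADD(const fs_reg &dst, const fs_reg &src0,
 *                    const fs_reg &src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */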

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(SEL)
ALU2(MAC)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(enum brw_predicate predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
   inst->predicate = predicate;
   return inst;
}

/** Gen6 IF with embedded comparison. */
fs_inst *
fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
               enum brw_conditional_mod condition)
{
   assert(devinfo->gen == 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
                enum brw_conditional_mod condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null src0 src1
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    *
    * The destination type doesn't matter on newer generations, so we set the
    * type to match src0 so we can compact the instruction.
    */
   dst.type = src0.type;
   if (dst.file == HW_REG)
      dst.fixed_hw_reg.type = dst.type;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
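
/* Illustrative usage (not in the original source):
 *
 *    emit(CMP(reg_null_f, x, fs_reg(0.0f), BRW_CONDITIONAL_GE));
 *
 * compares each channel of x against 0.0 and packs the per-channel results
 * into the flag register for later predication.
 */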

fs_inst *
fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
                         int header_size)
{
   assert(dst.width % 8 == 0);
   fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
                                        dst, src, sources);
   inst->header_size = header_size;

   for (int i = 0; i < header_size; i++)
      assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
   inst->regs_written = header_size;

   for (int i = header_size; i < sources; ++i)
      assert(src[i].file != GRF || src[i].width == dst.width);
   inst->regs_written += (sources - header_size) * (dst.width / 8);
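   /* Illustrative example (not in the original source): a payload with a
    * 1-GRF header and 3 SIMD16 sources has regs_written =
    * 1 + 3 * (16 / 8) = 7 GRFs.
    */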

   return inst;
}

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
                                       const fs_reg &surf_index,
                                       const fs_reg &varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
   fs_reg vec4_offset = vgrf(glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, fs_reg(const_offset & ~3)));

   int scale = 1;
   if (devinfo->gen == 4 && dst.width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   enum opcode op;
   if (devinfo->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;

   assert(dst.width % 8 == 0);
   int regs_written = 4 * (dst.width / 8) * scale;
   fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
                               dst.type, dst.width);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = regs_written;
   instructions.push_tail(inst);

   if (devinfo->gen < 7) {
      inst->base_mrf = 13;
      inst->header_size = 1;
      if (devinfo->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
   instructions.push_tail(MOV(dst, result));

   return instructions;
}
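
/* Illustrative example (not in the original source): for const_offset = 22,
 * the vec4-aligned part (22 & ~3) = 20 is added to varying_offset, and the
 * remaining component (22 & 3) = 2 selects the element within the loaded
 * vec4 via offset(vec4_result, 2 * scale).
 */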

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->exec_size = 8;

   return inst;
}

bool
fs_inst::equals(fs_inst *inst) const
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           target == inst->target &&
           eot == inst->eot &&
           header_size == inst->header_size &&
           shadow_compare == inst->shadow_compare &&
           exec_size == inst->exec_size &&
           offset == inst->offset);
}

bool
fs_inst::overwrites_reg(const fs_reg &reg) const
{
   return reg.in_range(dst, regs_written);
}

bool
fs_inst::is_send_from_grf() const
{
   switch (opcode) {
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case FS_OPCODE_INTERPOLATE_AT_CENTROID:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
      return true;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      return src[1].file == GRF;
   case FS_OPCODE_FB_WRITE:
      return src[0].file == GRF;
   default:
      if (is_tex())
         return src[0].file == GRF;

      return false;
   }
}

bool
fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
{
   if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
      return false;

   fs_reg reg = this->src[0];
   if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
      return false;

   if (grf_alloc.sizes[reg.reg] != this->regs_written)
      return false;

   for (int i = 0; i < this->sources; i++) {
      reg.type = this->src[i].type;
      reg.width = this->src[i].width;
      if (!this->src[i].equals(reg))
         return false;
      reg = ::offset(reg, 1);
   }

   return true;
}

bool
fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
{
   if (devinfo->gen == 6 && is_math())
      return false;

   if (is_send_from_grf())
      return false;

   if (!backend_instruction::can_do_source_mods())
      return false;

   return true;
}

bool
fs_inst::has_side_effects() const
{
   return this->eot || backend_instruction::has_side_effects();
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   stride = 1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->fixed_hw_reg.dw1.f = f;
   this->width = 1;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->fixed_hw_reg.dw1.d = i;
   this->width = 1;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->fixed_hw_reg.dw1.ud = u;
   this->width = 1;
}

/** Vector float immediate value constructor. */
fs_reg::fs_reg(uint8_t vf[4])
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
}

/** Vector float immediate value constructor. */
fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
                               (vf1 <<  8) |
                               (vf2 << 16) |
                               (vf3 << 24);
}

/** Fixed brw_reg. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
   this->width = 1 << fixed_hw_reg.width;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           subreg_offset == r.subreg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
           width == r.width &&
           stride == r.stride);
}

fs_reg &
fs_reg::set_smear(unsigned subreg)
{
   assert(file != HW_REG && file != IMM);
   subreg_offset = subreg * type_sz(type);
   stride = 0;
   return *this;
}

bool
fs_reg::is_contiguous() const
{
   return stride == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_IMAGE:
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
   case GLSL_TYPE_DOUBLE:
      unreachable("not reached");
   }

   return 0;
}

/**
 * Create a MOV to read the timestamp register.
 *
 * The caller is responsible for emitting the MOV.  The return value is
 * the destination of the MOV, with extra parameters set.
 */
fs_reg
fs_visitor::get_timestamp(fs_inst **out_mov)
{
   assert(devinfo->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);

   fs_inst *mov = MOV(dst, ts);
   /* We want to read the 3 fields we care about even if it's not enabled in
    * the dispatch.
    */
   mov->force_writemask_all = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
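   /* Illustrative arithmetic (not in the original source): a 32-bit counter
    * at ~1.2 GHz wraps after 2^32 / 1.2e9 ≈ 3.6 seconds, which is where the
    * "~3 seconds" figure above comes from.
    */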
   dst.set_smear(0);

   *out_mov = mov;
   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   fs_inst *mov;
   shader_start_time = get_timestamp(&mov);
   emit(mov);
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   switch (stage) {
   case MESA_SHADER_VERTEX:
      type = ST_VS;
      written_type = ST_VS_WRITTEN;
      reset_type = ST_VS_RESET;
      break;
   case MESA_SHADER_GEOMETRY:
      type = ST_GS;
      written_type = ST_GS_WRITTEN;
      reset_type = ST_GS_RESET;
      break;
   case MESA_SHADER_FRAGMENT:
      if (dispatch_width == 8) {
         type = ST_FS8;
         written_type = ST_FS8_WRITTEN;
         reset_type = ST_FS8_RESET;
      } else {
         assert(dispatch_width == 16);
         type = ST_FS16;
         written_type = ST_FS16_WRITTEN;
         reset_type = ST_FS16_RESET;
      }
      break;
   case MESA_SHADER_COMPUTE:
      type = ST_CS;
      written_type = ST_CS_WRITTEN;
      reset_type = ST_CS_RESET;
      break;
   default:
      unreachable("fs_visitor::emit_shader_time_end missing code");
   }

   /* Insert our code just before the final SEND with EOT. */
   exec_node *end = this->instructions.get_tail();
   assert(end && ((fs_inst *) end)->eot);

   fs_inst *tm_read;
   fs_reg shader_end_time = get_timestamp(&tm_read);
   end->insert_before(tm_read);

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.set_smear(2);
   fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   test->force_writemask_all = true;
   end->insert_before(test);
   end->insert_before(IF(BRW_PREDICATE_NORMAL));

   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
   diff.set_smear(0);
   fs_inst *add = ADD(diff, start, shader_end_time);
   add->force_writemask_all = true;
   end->insert_before(add);

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   add = ADD(diff, diff, fs_reg(-2u));
   add->force_writemask_all = true;
   end->insert_before(add);

   end->insert_before(SHADER_TIME_ADD(type, diff));
   end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
   end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
   end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
   end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
}

fs_inst *
fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, prog, type);
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = vgrf(glsl_type::uvec2_type);
   else
      payload = vgrf(glsl_type::uint_type);

   return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                               fs_reg(), payload, offset, value);
}

void
fs_visitor::vfail(const char *format, va_list va)
{
   char *msg;

   if (failed)
      return;

   failed = true;

   msg = ralloc_vasprintf(mem_ctx, format, va);
   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);

   this->fail_msg = msg;

   if (debug_enabled) {
      fprintf(stderr, "%s",  msg);
   }
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;

   va_start(va, format);
   vfail(format, va);
   va_end(va);
}

/**
 * Mark this program as impossible to compile in SIMD16 mode.
 *
 * During the SIMD8 compile (which happens first), we can detect and flag
 * things that are unsupported in SIMD16 mode, so the compiler can skip
 * the SIMD16 compile altogether.
 *
 * During a SIMD16 compile (if one happens anyway), this just calls fail().
 */
void
fs_visitor::no16(const char *format, ...)
{
   va_list va;

   va_start(va, format);

   if (dispatch_width == 16) {
      vfail(format, va);
   } else {
      simd16_unsupported = true;

      if (brw->perf_debug) {
         if (no16_msg)
            ralloc_vasprintf_append(&no16_msg, format, va);
         else
            no16_msg = ralloc_vasprintf(mem_ctx, format, va);
      }
   }

   va_end(va);
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1, const fs_reg &src2)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
                 fs_reg src[], int sources)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
}

/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write() const
{
   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
           (this->dst.width * type_sz(this->dst.type)) < 32 ||
           !this->dst.is_contiguous());
}
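
/* Illustrative example (not in the original source): an unpredicated,
 * contiguous SIMD8 float write covers 8 * 4 = 32 bytes (a full GRF), so it
 * is not partial, while a SIMD8 word (UW) write covers only 16 bytes and is.
 */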

int
fs_inst::regs_read(int arg) const
{
   if (is_tex() && arg == 0 && src[0].file == GRF) {
      return mlen;
   } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
      return mlen;
   } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
      return mlen;
   } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
      return exec_size / 4;
   }

   switch (src[arg].file) {
   case BAD_FILE:
   case UNIFORM:
   case IMM:
      return 1;
   case GRF:
   case HW_REG:
      if (src[arg].stride == 0) {
         return 1;
      } else {
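         /* Illustrative example (not in the original source): a SIMD16
          * float source with stride 1 reads 16 * 1 * 4 = 64 bytes, i.e.
          * (64 + 31) / 32 = 2 GRFs.
          */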
         int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
         return (size + 31) / 32;
      }
   case MRF:
      unreachable("MRF registers are not allowed as sources");
   default:
      unreachable("Invalid register file");
   }
}

bool
fs_inst::reads_flag() const
{
   return predicate;
}

bool
fs_inst::writes_flag() const
{
   return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
                               opcode != BRW_OPCODE_IF &&
                               opcode != BRW_OPCODE_WHILE)) ||
          opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   if (inst->base_mrf == -1)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return inst->mlen;
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
   case FS_OPCODE_INTERPOLATE_AT_CENTROID:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      return 0;
   default:
      unreachable("not reached");
   }
}

fs_reg
fs_visitor::vgrf(const glsl_type *const type)
{
   int reg_width = dispatch_width / 8;
   return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
                 brw_type_for_base_type(type), dispatch_width);
}

fs_reg
fs_visitor::vgrf(int num_components)
{
   int reg_width = dispatch_width / 8;
   return fs_reg(GRF, alloc.allocate(num_components * reg_width),
                 BRW_REGISTER_TYPE_F, dispatch_width);
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;

   switch (file) {
   case UNIFORM:
      this->width = 1;
      break;
   default:
      this->width = 8;
   }
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;

   switch (file) {
   case UNIFORM:
      this->width = 1;
      break;
   default:
      this->width = 8;
   }
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
               uint8_t width)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
   this->width = width;
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->push_constant_loc = v->push_constant_loc;
   this->pull_constant_loc = v->pull_constant_loc;
   this->uniforms = v->uniforms;
   this->param_size = v->param_size;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
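   /* Illustrative example (not in the original source): a uniform named "a"
    * matches storage entries "a", "a[2]" and "a.b", but not "ab", because
    * the character after the prefix must be '\0', '.' or '['.
    */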
   unsigned params_before = uniforms;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         stage_prog_data->param[uniforms++] = &storage->storage[i];
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() == uniforms);
   (void)params_before;
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->get_state_slots();
   assert(slots != NULL);

   for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->prog->Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         stage_prog_data->param[uniforms++] =
            &prog->Parameters->ParameterValues[index][swiz];
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
                                         bool origin_upper_left)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
   fs_reg wpos = *reg;
   bool flip = !origin_upper_left ^ key->render_to_fbo;

   /* gl_FragCoord.x */
   if (pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos = offset(wpos, 1);

   /* gl_FragCoord.y */
   if (!flip && pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += key->drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos = offset(wpos, 1);

   /* gl_FragCoord.z */
   if (devinfo->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos = offset(wpos, 1);

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid, bool is_sample)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (devinfo->gen >= 6) {
      if (is_centroid) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else if (is_sample) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
      } else {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_xy[barycoord_mode], interp);
}

void
fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
                                       const glsl_type *type,
                                       glsl_interp_qualifier interpolation_mode,
                                       int location, bool mod_centroid,
                                       bool mod_sample)
{
   attr.type = brw_type_for_base_type(type->get_scalar_type());

   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   unsigned int array_elements;

   if (type->is_array()) {
      array_elements = type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", name);
      }
      type = type->fields.array;
   } else {
      array_elements = 1;
   }

   if (interpolation_mode == INTERP_QUALIFIER_NONE) {
      bool is_gl_Color =
         location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
      if (key->flat_shade && is_gl_Color) {
         interpolation_mode = INTERP_QUALIFIER_FLAT;
      } else {
         interpolation_mode = INTERP_QUALIFIER_SMOOTH;
      }
   }

   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (prog_data->urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr = offset(attr, type->vector_elements);
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = attr.type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr = offset(attr, 1);
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);

                  fs_inst *inst;
                  inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
                                      false, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
                  if (devinfo->has_pln)
                     inst->no_dd_clear = true;

                  inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
                                      mod_centroid && !key->persample_shading,
                                      mod_sample || key->persample_shading);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = false;
                  if (devinfo->has_pln)
                     inst->no_dd_check = true;

               } else {
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               mod_centroid && !key->persample_shading,
                               mod_sample || key->persample_shading);
               }
               if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
               }
               attr = offset(attr, 1);
            }

         }
         location++;
      }
   }
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation()
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));

   if (devinfo->gen >= 6) {
      /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (~0/true or 0/false).
       *
       * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
       * this task in only one instruction:
       *    - a negation source modifier will flip the bit; and
       *    - a W -> D type conversion will sign extend the bit into the high
       *      word of the destination.
       *
       * An ASR 15 fills the low word of the destination.
       */
      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
      g0.negate = true;

      emit(ASR(*reg, g0, fs_reg(15)));
   } else {
      /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (1/true or 0/false).
       *
       * Like in the above case, since the bit is the MSB of g1.6:UD we can use
       * the negation source modifier to flip it. Unfortunately the SHR
       * instruction only operates on UD (or D with an abs source modifier)
       * sources without negation.
       *
       * Instead, use ASR (which will give ~0/true or 0/false).
       */
      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
      g1_6.negate = true;

      emit(ASR(*reg, g1_6, fs_reg(31)));
   }

   return reg;
}

void
fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   assert(dst.type == BRW_REGISTER_TYPE_F);

   if (key->compute_pos_offset) {
      /* Convert int_sample_pos to floating point */
      emit(MOV(dst, int_sample_pos));
      /* Scale to the range [0, 1] */
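      /* (Illustrative note, not in the original source: the payload sample
       * positions are in 1/16-pixel units, so values 0..15 map onto [0, 1).)
       */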
      emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
   }
   else {
      /* From ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SamplePosition will always be
       *  (0.5, 0.5)."
1489
       */
1490
      emit(MOV(dst, fs_reg(0.5f)));
1491
   }
1492
}
1493
 
1494
fs_reg *
1495
fs_visitor::emit_samplepos_setup()
1496
{
1497
   assert(devinfo->gen >= 6);
1498
 
1499
   this->current_annotation = "compute sample position";
1500
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1501
   fs_reg pos = *reg;
1502
   fs_reg int_sample_x = vgrf(glsl_type::int_type);
1503
   fs_reg int_sample_y = vgrf(glsl_type::int_type);
1504
 
1505
   /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1506
    * mode will be enabled.
1507
    *
1508
    * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1509
    * R31.1:0         Position Offset X/Y for Slot[3:0]
1510
    * R31.3:2         Position Offset X/Y for Slot[7:4]
1511
    * .....
1512
    *
1513
    * The X, Y sample positions come in as bytes in  thread payload. So, read
1514
    * the positions using vstride=16, width=8, hstride=2.
1515
    */
1516
   struct brw_reg sample_pos_reg =
1517
      stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1518
                    BRW_REGISTER_TYPE_B), 16, 8, 2);
1519
 
1520
   if (dispatch_width == 8) {
1521
      emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1522
   } else {
1523
      emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1524
      emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1525
         ->force_sechalf = true;
1526
   }
1527
   /* Compute gl_SamplePosition.x */
1528
   compute_sample_position(pos, int_sample_x);
1529
   pos = offset(pos, 1);
1530
   if (dispatch_width == 8) {
1531
      emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1532
   } else {
1533
      emit(MOV(half(int_sample_y, 0),
1534
               fs_reg(suboffset(sample_pos_reg, 1))));
1535
      emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1536
         ->force_sechalf = true;
1537
   }
1538
   /* Compute gl_SamplePosition.y */
1539
   compute_sample_position(pos, int_sample_y);
1540
   return reg;
1541
}
1542
 
1543
fs_reg *
1544
fs_visitor::emit_sampleid_setup()
1545
{
1546
   assert(stage == MESA_SHADER_FRAGMENT);
1547
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1548
   assert(devinfo->gen >= 6);
1549
 
1550
   this->current_annotation = "compute sample id";
1551
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1552
 
1553
   if (key->compute_sample_id) {
1554
      fs_reg t1 = vgrf(glsl_type::int_type);
1555
      fs_reg t2 = vgrf(glsl_type::int_type);
1556
      t2.type = BRW_REGISTER_TYPE_UW;
1557
 
1558
      /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1559
       * 8x multisampling, subspan 0 will represent sample N (where N
1560
       * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1561
       * 7. We can find the value of N by looking at R0.0 bits 7:6
1562
       * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1563
       * (since samples are always delivered in pairs). That is, we
1564
       * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1565
       * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1566
       * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1567
       * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1568
       * populating a temporary variable with the sequence (0, 1, 2, 3),
1569
       * and then reading from it using vstride=1, width=4, hstride=0.
1570
       * These computations hold good for 4x multisampling as well.
1571
       *
1572
       * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1573
       * the first four slots are sample 0 of subspan 0; the next four
1574
       * are sample 1 of subspan 0; the third group is sample 0 of
1575
       * subspan 1, and finally sample 1 of subspan 1.
1576
       */
1577
      fs_inst *inst;
1578
      inst = emit(BRW_OPCODE_AND, t1,
1579
                  fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1580
                  fs_reg(0xc0));
1581
      inst->force_writemask_all = true;
1582
      inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1583
      inst->force_writemask_all = true;
1584
      /* This works for both SIMD8 and SIMD16 */
1585
      inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1586
      inst->force_writemask_all = true;
1587
      /* This special instruction takes care of setting vstride=1,
1588
       * width=4, hstride=0 of t2 during an ADD instruction.
1589
       */
1590
      emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1591
   } else {
1592
      /* As per GL_ARB_sample_shading specification:
1593
       * "When rendering to a non-multisample buffer, or if multisample
1594
       *  rasterization is disabled, gl_SampleID will always be zero."
1595
       */
1596
      emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1597
   }
1598
 
1599
   return reg;
1600
}
1601
 
1602
void
1603
fs_visitor::resolve_source_modifiers(fs_reg *src)
1604
{
1605
   if (!src->abs && !src->negate)
1606
      return;
1607
 
1608
   fs_reg temp = retype(vgrf(1), src->type);
1609
   emit(MOV(temp, *src));
1610
   *src = temp;
1611
}
1612
 
1613
fs_reg
1614
fs_visitor::fix_math_operand(fs_reg src)
1615
{
1616
   /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1617
    * might be able to do better by doing execsize = 1 math and then
1618
    * expanding that result out, but we would need to be careful with
1619
    * masking.
1620
    *
1621
    * The hardware ignores source modifiers (negate and abs) on math
1622
    * instructions, so we also move to a temp to set those up.
1623
    */
1624
   if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1625
       !src.abs && !src.negate)
1626
      return src;
1627
 
1628
   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1629
    * operands to math
1630
    */
1631
   if (devinfo->gen >= 7 && src.file != IMM)
1632
      return src;
1633
 
1634
   fs_reg expanded = vgrf(glsl_type::float_type);
1635
   expanded.type = src.type;
1636
   emit(BRW_OPCODE_MOV, expanded, src);
1637
   return expanded;
1638
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      unreachable("not reached: bad math opcode");
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (devinfo->gen == 6 || devinfo->gen == 7)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (devinfo->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   if (devinfo->gen >= 8) {
      inst = emit(opcode, dst, src0, src1);
   } else if (devinfo->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}
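
/* Note on the Gen4/5 operand swap above (illustrative): for an integer
 * division such as emit_math(SHADER_OPCODE_INT_QUOTIENT, dst, a, b), the
 * message needs the denominator b in Operand0 and the numerator a in
 * Operand1, so b becomes the instruction source while a is MOV'd into
 * base_mrf + 1; POW, by contrast, keeps its sources in natural order.
 */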

void
fs_visitor::emit_discard_jump()
{
   assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);

   /* For performance, after a discard, jump to the end of the
    * shader if all relevant channels have been discarded.
    */
   fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
   discard_jump->flag_subreg = 1;

   discard_jump->predicate = (dispatch_width == 8)
                             ? BRW_PREDICATE_ALIGN1_ANY8H
                             : BRW_PREDICATE_ALIGN1_ANY16H;
   discard_jump->predicate_inverse = true;
}

void
fs_visitor::assign_curb_setup()
{
   if (dispatch_width == 8) {
      prog_data->dispatch_grf_start_reg = payload.num_regs;
   } else {
      if (stage == MESA_SHADER_FRAGMENT) {
         brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
         prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
      } else if (stage == MESA_SHADER_COMPUTE) {
         brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
         prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
      } else {
         unreachable("Unsupported shader type!");
      }
   }

   prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == UNIFORM) {
            int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
            int constant_nr;
            if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
               constant_nr = push_constant_loc[uniform_nr];
            } else {
               /* Section 5.11 of the OpenGL 4.1 spec says:
                * "Out-of-bounds reads return undefined values, which include
                *  values from other variables of the active program or zero."
                * Just return the first push constant.
                */
               constant_nr = 0;
            }

            struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg = byte_offset(
               retype(brw_reg, inst->src[i].type),
               inst->src[i].subreg_offset);
         }
      }
   }
}
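
/* Worked example of the UNIFORM -> GRF mapping above: each GRF holds eight
 * 32-bit push constants, so with payload.num_regs == 2, a constant_nr of 11
 * resolves to brw_vec1_grf(2 + 11 / 8, 11 % 8), i.e. the scalar region
 * g3.3<0,1,0>, the fourth dword of the second CURBE register.
 */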

void
fs_visitor::calculate_urb_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   memset(prog_data->urb_setup, -1,
          sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (devinfo->gen >= 6) {
      if (_mesa_bitcount_64(prog->InputsRead &
                            BRW_FS_VARYING_INPUT_MASK) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
                BITFIELD64_BIT(i)) {
               prog_data->urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */
         struct brw_vue_map prev_stage_vue_map;
         brw_compute_vue_map(devinfo, &prev_stage_vue_map,
                             key->input_slots_valid);
         int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
             * unused.
             */
            if (varying != BRW_VARYING_SLOT_COUNT &&
                (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               prog_data->urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   } else {
      /* FINISHME: The SF doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute. */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (key->input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               prog_data->urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's an FS-only attribute, and we did interpolation for this
       * attribute in the SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   prog_data->num_varying_inputs = urb_next;
}

void
fs_visitor::assign_urb_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;

   int urb_start = payload.num_regs + prog_data->base.curb_read_length;

   /* Offset all the urb_setup[] indices by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[1].file == HW_REG);
         inst->src[1].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   this->first_non_payload_grf =
      urb_start + prog_data->num_varying_inputs * 2;
}

void
fs_visitor::assign_vs_urb_setup()
{
   brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
   int grf, count, slot, channel, attr;

   assert(stage == MESA_SHADER_VERTEX);
   count = _mesa_bitcount_64(vs_prog_data->inputs_read);
   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
      count++;

   /* Each attribute is 4 regs. */
   this->first_non_payload_grf =
      payload.num_regs + prog_data->curb_read_length + count * 4;

   unsigned vue_entries =
      MAX2(count, vs_prog_data->base.vue_map.num_slots);

   vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
   vs_prog_data->base.urb_read_length = (count + 1) / 2;

   assert(vs_prog_data->base.urb_read_length <= 15);

   /* Rewrite all ATTR file references to the hw grf that they land in. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == ATTR) {

            if (inst->src[i].reg == VERT_ATTRIB_MAX) {
               slot = count - 1;
            } else {
               /* Attributes come in a contiguous block, ordered by their
                * gl_vert_attrib value.  That means we can compute the slot
                * number for an attribute by masking out the enabled
                * attributes before it and counting the bits.
                */
               attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
               slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
                                        BITFIELD64_MASK(attr));
            }

            channel = inst->src[i].reg_offset & 3;

            grf = payload.num_regs +
               prog_data->curb_read_length +
               slot * 4 + channel;

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg =
               retype(brw_vec8_grf(grf, 0), inst->src[i].type);
         }
      }
   }
}
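
/* Worked example of the slot computation above: if inputs_read == 0b10011
 * (attributes 0, 1 and 4 enabled), then attribute 4 lands in slot
 * _mesa_bitcount_64(0b10011 & BITFIELD64_MASK(4)) ==
 * _mesa_bitcount_64(0b00011) == 2, the third slot of the contiguous block.
 */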

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->alloc.count;

   /* Count the total number of registers */
   int reg_count = 0;
   int vgrf_to_reg[num_vars];
   for (int i = 0; i < num_vars; i++) {
      vgrf_to_reg[i] = reg_count;
      reg_count += alloc.sizes[i];
   }

   /* An array of "split points".  For each register slot, this indicates
    * if this slot can be separated from the previous slot.  Every time an
    * instruction uses multiple elements of a register (as a source or
    * destination), we mark the used slots as inseparable.  Then we go
    * through and split the registers into the smallest pieces we can.
    */
   bool split_points[reg_count];
   memset(split_points, 0, sizeof(split_points));

   /* Mark all used registers as fully splittable */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF) {
         int reg = vgrf_to_reg[inst->dst.reg];
         for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
            split_points[reg + j] = true;
      }

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
            int reg = vgrf_to_reg[inst->src[i].reg];
            for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
               split_points[reg + j] = true;
         }
      }
   }

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF) {
         int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
         for (int j = 1; j < inst->regs_written; j++)
            split_points[reg + j] = false;
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
            int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
            for (int j = 1; j < inst->regs_read(i); j++)
               split_points[reg + j] = false;
         }
      }
   }

   int new_virtual_grf[reg_count];
   int new_reg_offset[reg_count];

   int reg = 0;
   for (int i = 0; i < num_vars; i++) {
      /* The first one should always be 0 as a quick sanity check. */
      assert(split_points[reg] == false);

      /* j = 0 case */
      new_reg_offset[reg] = 0;
      reg++;
      int offset = 1;

      /* j > 0 case */
      for (unsigned j = 1; j < alloc.sizes[i]; j++) {
         /* If this is a split point, reset the offset to 0 and allocate a
          * new virtual GRF for the previous offset many registers
          */
         if (split_points[reg]) {
            assert(offset <= MAX_VGRF_SIZE);
            int grf = alloc.allocate(offset);
            for (int k = reg - offset; k < reg; k++)
               new_virtual_grf[k] = grf;
            offset = 0;
         }
         new_reg_offset[reg] = offset;
         offset++;
         reg++;
      }

      /* The last one gets the original register number */
      assert(offset <= MAX_VGRF_SIZE);
      alloc.sizes[i] = offset;
      for (int k = reg - offset; k < reg; k++)
         new_virtual_grf[k] = i;
   }
   assert(reg == reg_count);

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF) {
         reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
         inst->dst.reg = new_virtual_grf[reg];
         inst->dst.reg_offset = new_reg_offset[reg];
         assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
            reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
            inst->src[i].reg = new_virtual_grf[reg];
            inst->src[i].reg_offset = new_reg_offset[reg];
            assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
         }
      }
   }
   invalidate_live_intervals();
}
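
/* Small example of how the split points behave (illustrative): for a 4-slot
 * VGRF whose only multi-register access is a two-register write covering
 * slots 1-2, the pass keeps split_points true at slots 1 and 3 but clears
 * it at slot 2, so the VGRF splits into three pieces of sizes 1, 2 and 1,
 * each with its own (shorter) live interval.
 */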

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
bool
fs_visitor::compact_virtual_grfs()
{
   bool progress = false;
   int remap_table[this->alloc.count];
   memset(remap_table, -1, sizeof(remap_table));

   /* Mark which virtual GRFs are used. */
   foreach_block_and_inst(block, const fs_inst, inst, cfg) {
      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (unsigned i = 0; i < this->alloc.count; i++) {
      if (remap_table[i] == -1) {
         /* We just found an unused register.  This means that we are
          * actually going to compact something.
          */
         progress = true;
      } else {
         remap_table[i] = new_index;
         alloc.sizes[new_index] = alloc.sizes[i];
         invalidate_live_intervals();
         ++new_index;
      }
   }

   this->alloc.count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to delta_xy, since they're used in register
    * allocation.  If they're unused, switch them to BAD_FILE so we don't
    * think some random VGRF is delta_xy.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
      if (delta_xy[i].file == GRF) {
         if (remap_table[delta_xy[i].reg] != -1) {
            delta_xy[i].reg = remap_table[delta_xy[i].reg];
         } else {
            delta_xy[i].file = BAD_FILE;
         }
      }
   }

   return progress;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   if (dispatch_width != 8)
      return;

   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
   memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const gl_constant_value **values = &stage_prog_data->param[uniform];

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;

               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
                  values[j];
            }
         }
      }
   }
}

/**
 * Assign UNIFORM file registers to either push constants or pull constants.
 *
 * We allow a fragment shader to have more than the spec-required minimum
 * for the maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of the register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::assign_constant_locations()
{
   /* Only the first compile (SIMD8 mode) gets to decide on locations. */
   if (dispatch_width != 8)
      return;

   /* Find which UNIFORM registers are still in use. */
   bool is_live[uniforms];
   for (unsigned int i = 0; i < uniforms; i++) {
      is_live[i] = false;
   }

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
         if (constant_nr >= 0 && constant_nr < (int) uniforms)
            is_live[constant_nr] = true;
      }
   }

   /* Only allow 16 registers (128 uniform components) as push constants.
    *
    * Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    *
    * If changing this value, note the limitation about total_regs in
    * brw_curbe.c.
    */
   unsigned int max_push_components = 16 * 8;
   unsigned int num_push_constants = 0;

   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);

   for (unsigned int i = 0; i < uniforms; i++) {
      if (!is_live[i] || pull_constant_loc[i] != -1) {
         /* This UNIFORM register is either dead, or has already been demoted
          * to a pull const.  Mark it as no longer living in the param[] array.
          */
         push_constant_loc[i] = -1;
         continue;
      }

      if (num_push_constants < max_push_components) {
         /* Retain as a push constant.  Record the location in the params[]
          * array.
          */
         push_constant_loc[i] = num_push_constants++;
      } else {
         /* Demote to a pull constant. */
         push_constant_loc[i] = -1;

         int pull_index = stage_prog_data->nr_pull_params++;
         stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
         pull_constant_loc[i] = pull_index;
      }
   }

   stage_prog_data->nr_params = num_push_constants;

   /* Up until now, the param[] array has been indexed by reg + reg_offset
    * of UNIFORM registers.  Condense it to only contain the uniforms we
    * chose to upload as push constants.
    */
   for (unsigned int i = 0; i < uniforms; i++) {
      int remapped = push_constant_loc[i];

      if (remapped == -1)
         continue;

      assert(remapped <= (int)i);
      stage_prog_data->param[remapped] = stage_prog_data->param[i];
   }
}
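
/* Worked example of the push/pull split above: max_push_components is
 * 16 * 8 == 128, so a shader with 150 live uniform components keeps the
 * first 128 in the CURBE as push constants and demotes the remaining 22
 * to pull constants, appending them to stage_prog_data->pull_param[].
 */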

/**
 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
 */
void
fs_visitor::demote_pull_constants()
{
   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index;
         unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
         if (location >= uniforms) /* Out of bounds access */
            pull_index = -1;
         else
            pull_index = pull_constant_loc[location];

         if (pull_index == -1)
            continue;

         /* Set up the annotation tracking for newly generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
         fs_reg dst = vgrf(glsl_type::float_type);

         /* Generate a pull load into dst. */
         if (inst->src[i].reladdr) {
            exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
                                                        surf_index,
                                                        *inst->src[i].reladdr,
                                                        pull_index);
            inst->insert_before(block, &list);
            inst->src[i].reladdr = NULL;
         } else {
            fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
            fs_inst *pull =
               new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
                                    dst, surf_index, offset);
            inst->insert_before(block, pull);
            inst->src[i].set_smear(pull_index & 3);
         }

         /* Rewrite the instruction to use the temporary VGRF. */
         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].width = dispatch_width;
      }
   }
   invalidate_live_intervals();
}
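
/* Worked example of the non-reladdr path above: pull constants are fetched
 * as aligned vec4s, so pull_index == 6 yields offset (6 * 4) & ~15 == 16
 * (the second vec4 in the buffer), and set_smear(6 & 3) == set_smear(2)
 * selects the third component of the loaded vector.
 */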

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         if (inst->src[0].file != IMM)
            break;

         if (inst->saturate) {
            if (inst->dst.type != inst->src[0].type)
               assert(!"unimplemented: saturate mixed types");

            if (brw_saturate_immediate(inst->dst.type,
                                       &inst->src[0].fixed_hw_reg)) {
               inst->saturate = false;
               progress = true;
            }
         }
         break;

      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * -1.0 = -a */
         if (inst->src[1].is_negative_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].negate = !inst->src[0].negate;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         if (inst->src[0].file == IMM) {
            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         if (inst->src[0].file == IMM) {
            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_OR:
         if (inst->src[0].equals(inst->src[1])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_LRP:
         if (inst->src[1].equals(inst->src[2])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            inst->src[2] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_CMP:
         if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
             inst->src[0].abs &&
             inst->src[0].negate &&
             inst->src[1].is_zero()) {
            inst->src[0].abs = false;
            inst->src[0].negate = false;
            inst->conditional_mod = BRW_CONDITIONAL_Z;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_SEL:
         if (inst->src[0].equals(inst->src[1])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            inst->predicate = BRW_PREDICATE_NONE;
            inst->predicate_inverse = false;
            progress = true;
         } else if (inst->saturate && inst->src[1].file == IMM) {
            switch (inst->conditional_mod) {
            case BRW_CONDITIONAL_LE:
            case BRW_CONDITIONAL_L:
               switch (inst->src[1].type) {
               case BRW_REGISTER_TYPE_F:
                  if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
                     inst->opcode = BRW_OPCODE_MOV;
                     inst->src[1] = reg_undef;
                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
                     progress = true;
                  }
                  break;
               default:
                  break;
               }
               break;
            case BRW_CONDITIONAL_GE:
            case BRW_CONDITIONAL_G:
               switch (inst->src[1].type) {
               case BRW_REGISTER_TYPE_F:
                  if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
                     inst->opcode = BRW_OPCODE_MOV;
                     inst->src[1] = reg_undef;
                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
                     progress = true;
                  }
                  break;
               default:
                  break;
               }
            default:
               break;
            }
         }
         break;
      case BRW_OPCODE_MAD:
         if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            inst->src[2] = reg_undef;
            progress = true;
         } else if (inst->src[0].is_zero()) {
            inst->opcode = BRW_OPCODE_MUL;
            inst->src[0] = inst->src[2];
            inst->src[2] = reg_undef;
            progress = true;
         } else if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_ADD;
            inst->src[1] = inst->src[2];
            inst->src[2] = reg_undef;
            progress = true;
         } else if (inst->src[2].is_one()) {
            inst->opcode = BRW_OPCODE_ADD;
            inst->src[2] = reg_undef;
            progress = true;
         } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
            inst->opcode = BRW_OPCODE_ADD;
            inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
            inst->src[2] = reg_undef;
            progress = true;
         }
         break;
      case SHADER_OPCODE_RCP: {
         fs_inst *prev = (fs_inst *)inst->prev;
         if (prev->opcode == SHADER_OPCODE_SQRT) {
            if (inst->src[0].equals(prev->dst)) {
               inst->opcode = SHADER_OPCODE_RSQ;
               inst->src[0] = prev->src[0];
               progress = true;
            }
         }
         break;
      }
      case SHADER_OPCODE_BROADCAST:
         if (is_uniform(inst->src[0])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->sources = 1;
            inst->force_writemask_all = true;
            progress = true;
         } else if (inst->src[1].file == IMM) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = component(inst->src[0],
                                     inst->src[1].fixed_hw_reg.dw1.ud);
            inst->sources = 1;
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }

      /* Swap if src[0] is immediate. */
      if (progress && inst->is_commutative()) {
         if (inst->src[0].file == IMM) {
            fs_reg tmp = inst->src[1];
            inst->src[1] = inst->src[0];
            inst->src[0] = tmp;
         }
      }
   }
   return progress;
}

/**
 * Optimize sample messages that have constant zero values for the trailing
 * texture coordinates. We can just reduce the message length for these
 * instructions instead of reserving a register for it. Trailing parameters
 * that aren't sent default to zero anyway. This will cause the dead code
 * eliminator to remove the MOV instruction that would otherwise be emitted to
 * set up the zero value.
 */
bool
fs_visitor::opt_zero_samples()
{
   /* Gen4 infers the texturing opcode based on the message length so we can't
    * change it.
    */
   if (devinfo->gen < 5)
      return false;

   bool progress = false;

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (!inst->is_tex())
         continue;

      fs_inst *load_payload = (fs_inst *) inst->prev;

      if (load_payload->is_head_sentinel() ||
          load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       *     "Parameter 0 is required except for the sampleinfo message, which
       *      has no parameter 0"
       */
      while (inst->mlen > inst->header_size + dispatch_width / 8 &&
             load_payload->src[(inst->mlen - inst->header_size) /
                               (dispatch_width / 8) +
                               inst->header_size - 1].is_zero()) {
         inst->mlen -= dispatch_width / 8;
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
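
/* Worked example of the trimming loop above (assuming a SIMD8 message,
 * where each parameter occupies one register): a sampler message with a
 * one-register header, mlen == 5 and a zero value in the last payload slot
 * shrinks to mlen == 4; the MOV that zeroed that slot becomes dead and is
 * cleaned up by dead code elimination later.
 */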

/**
 * Optimize sample messages which are followed by the final RT write.
 *
 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
 * results sent directly to the framebuffer, bypassing the EU.  Recognize the
 * final texturing results copied to the framebuffer write payload and modify
 * them to write to the framebuffer directly.
 */
bool
fs_visitor::opt_sampler_eot()
{
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   if (stage != MESA_SHADER_FRAGMENT)
      return false;

   if (devinfo->gen < 9 && !devinfo->is_cherryview)
      return false;

   /* FINISHME: It should be possible to implement this optimization when there
    * are multiple drawbuffers.
    */
   if (key->nr_color_regions != 1)
      return false;

   /* Look for a texturing instruction immediately before the final FB_WRITE. */
   fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
   assert(fb_write->eot);
   assert(fb_write->opcode == FS_OPCODE_FB_WRITE);

   fs_inst *tex_inst = (fs_inst *) fb_write->prev;

   /* There wasn't one; nothing to do. */
   if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
      return false;

   /* This optimization doesn't seem to work for textureGather for some
    * reason. I can't find any documentation or known workarounds to indicate
    * that this is expected, but considering that it is probably pretty
    * unlikely that a shader would directly write out the results from
    * textureGather we might as well just disable it.
    */
   if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
       tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
      return false;

   /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
    * It's very likely to be the previous instruction.
    */
   fs_inst *load_payload = (fs_inst *) tex_inst->prev;
   if (load_payload->is_head_sentinel() ||
       load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
      return false;

   assert(!tex_inst->eot); /* We can't get here twice */
   assert((tex_inst->offset & (0xff << 24)) == 0);

   tex_inst->offset |= fb_write->target << 24;
   tex_inst->eot = true;
   tex_inst->dst = reg_null_ud;
   fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);

   /* If a header is present, marking the eot is sufficient. Otherwise, we need
    * to create a new LOAD_PAYLOAD command with the same sources and a space
    * saved for the header. Using a new destination register not only makes sure
    * we have enough space, but it will make sure the dead code eliminator kills
    * the instruction that this will replace.
    */
   if (tex_inst->header_size != 0)
      return true;

   fs_reg send_header = vgrf(load_payload->sources + 1);
   fs_reg *new_sources =
      ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);

   new_sources[0] = fs_reg();
   for (int i = 0; i < load_payload->sources; i++)
      new_sources[i+1] = load_payload->src[i];

   /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
    * requires a lot of information about the sources to appropriately figure
    * out the number of registers needed to be used. Given this stage in our
    * optimization, we may not have the appropriate GRFs required by
    * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
    * manually emit the instruction.
    */
   fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
                                                    load_payload->exec_size,
                                                    send_header,
                                                    new_sources,
                                                    load_payload->sources + 1);

   new_load_payload->regs_written = load_payload->regs_written + 1;
   new_load_payload->header_size = 1;
   tex_inst->mlen++;
   tex_inst->header_size = 1;
   tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
   tex_inst->src[0] = send_header;

   return true;
}

bool
fs_visitor::opt_register_renaming()
{
   bool progress = false;
   int depth = 0;

   int remap[alloc.count];
   memset(remap, -1, sizeof(int) * alloc.count);

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
         depth++;
      } else if (inst->opcode == BRW_OPCODE_ENDIF ||
                 inst->opcode == BRW_OPCODE_WHILE) {
         depth--;
      }

      /* Rewrite instruction sources. */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF &&
             remap[inst->src[i].reg] != -1 &&
             remap[inst->src[i].reg] != inst->src[i].reg) {
            inst->src[i].reg = remap[inst->src[i].reg];
            progress = true;
         }
      }

      const int dst = inst->dst.reg;

      if (depth == 0 &&
          inst->dst.file == GRF &&
          alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
          !inst->is_partial_write()) {
         if (remap[dst] == -1) {
            remap[dst] = dst;
         } else {
            remap[dst] = alloc.allocate(inst->dst.width / 8);
            inst->dst.reg = remap[dst];
            progress = true;
         }
      } else if (inst->dst.file == GRF &&
                 remap[dst] != -1 &&
                 remap[dst] != dst) {
         inst->dst.reg = remap[dst];
         progress = true;
      }
   }

   if (progress) {
      invalidate_live_intervals();

      for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
         if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
            delta_xy[i].reg = remap[delta_xy[i].reg];
         }
      }
   }

   return progress;
}

/**
 * Remove redundant or useless discard jumps.
 *
 * For example, we can eliminate jumps in the following sequence:
 *
 * discard-jump       (redundant with the next jump)
 * discard-jump       (useless; jumps to the next instruction)
 * placeholder-halt
 */
bool
fs_visitor::opt_redundant_discard_jumps()
{
   bool progress = false;

   bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];

   fs_inst *placeholder_halt = NULL;
   foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
      if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
         placeholder_halt = inst;
         break;
      }
   }

   if (!placeholder_halt)
      return false;

   /* Delete any HALTs immediately before the placeholder halt. */
   for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
        !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
        prev = (fs_inst *) placeholder_halt->prev) {
      prev->remove(last_bblock);
      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   /* No MRFs on Gen >= 7. */
   if (devinfo->gen >= 7)
      return false;

   calculate_live_intervals();

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->dst.file != MRF || inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate ||
          !inst->src[0].is_contiguous() ||
          inst->src[0].subreg_offset)
         continue;

      /* Work out which hardware MRF registers are written by this
       * instruction.
       */
      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
      int mrf_high;
      if (inst->dst.reg & BRW_MRF_COMPR4) {
         mrf_high = mrf_low + 4;
      } else if (inst->exec_size == 16) {
         mrf_high = mrf_low + 1;
      } else {
         mrf_high = mrf_low;
      }

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_end[inst->src[0].reg] > ip)
         continue;

      /* Found a move of a GRF to an MRF.  Let's see if we can go
       * rewrite the thing that made this GRF to write into the MRF.
       */
      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg) {
            /* Found the last thing to write our reg we want to turn
             * into a compute-to-MRF.
             */

            /* If this one instruction didn't populate all the
             * channels, bail.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking to delay the rewriting until complete success.
             */
            if (scan_inst->is_partial_write())
               break;

            /* Things returning more than one register would need us to
             * understand coalescing out more than one MOV at a time.
             */
            if (scan_inst->regs_written > scan_inst->dst.width / 8)
               break;

            /* SEND instructions can't have MRF as a destination. */
            if (scan_inst->mlen)
               break;

            if (devinfo->gen == 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               /* Found the creator of our MRF's source value. */
               scan_inst->dst.file = MRF;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->saturate |= inst->saturate;
               inst->remove(block);
               progress = true;
            }
            break;
         }

         /* We don't handle control flow here.  Most computation of
          * values that end up in MRFs are shortly before the MRF
          * write anyway.
          */
         if (block->start() == scan_inst)
            break;

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < scan_inst->sources; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (scan_inst->dst.file == MRF) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
            int scan_mrf_high;

            if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
               scan_mrf_high = scan_mrf_low + 4;
            } else if (scan_inst->exec_size == 16) {
               scan_mrf_high = scan_mrf_low + 1;
            } else {
               scan_mrf_high = scan_mrf_low;
            }

            if (mrf_low == scan_mrf_low ||
                mrf_low == scan_mrf_high ||
                mrf_high == scan_mrf_low ||
                mrf_high == scan_mrf_high) {
               break;
            }
         }

         if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            if (mrf_low >= scan_inst->base_mrf &&
                mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
            if (mrf_high >= scan_inst->base_mrf &&
                mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
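
/* Note on the MRF ranges above (to the best of my understanding of COMPR4
 * addressing): a SIMD16 write to message register m2 with the BRW_MRF_COMPR4
 * bit set stores its halves in m2 and m6, hence mrf_high == mrf_low + 4 in
 * that case, while an ordinary SIMD16 write uses the consecutive pair
 * m2/m3, hence mrf_high == mrf_low + 1.
 */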

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow.  We could probably do better here with some form of divergence
 * analysis.
 */
bool
fs_visitor::eliminate_find_live_channel()
{
   bool progress = false;
   unsigned depth = 0;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case FS_OPCODE_DISCARD_JUMP:
         /* This can potentially make control flow non-uniform until the end
          * of the program.
          */
         return progress;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = fs_reg(0);
            inst->sources = 1;
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   return progress;
}

/**
 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
 * instructions to FS_OPCODE_REP_FB_WRITE.
 */
void
fs_visitor::emit_repclear_shader()
{
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   int base_mrf = 1;
   int color_mrf = base_mrf + 2;

   fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
                           fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
   mov->force_writemask_all = true;

   fs_inst *write;
   if (key->nr_color_regions == 1) {
      write = emit(FS_OPCODE_REP_FB_WRITE);
      write->saturate = key->clamp_fragment_color;
      write->base_mrf = color_mrf;
      write->target = 0;
      write->header_size = 0;
      write->mlen = 1;
   } else {
      assume(key->nr_color_regions > 0);
      for (int i = 0; i < key->nr_color_regions; ++i) {
         write = emit(FS_OPCODE_REP_FB_WRITE);
         write->saturate = key->clamp_fragment_color;
         write->base_mrf = base_mrf;
         write->target = i;
         write->header_size = 2;
         write->mlen = 3;
      }
   }
   write->eot = true;

   calculate_cfg();

   assign_constant_locations();
   assign_curb_setup();

   /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
   assert(mov->src[0].file == HW_REG);
   mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
}

/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->is_control_flow()) {
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove(block);
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->is_partial_write()) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

static void
clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
{
   /* Clear the flag for registers that actually got read (as expected). */
   for (int i = 0; i < inst->sources; i++) {
      int grf;
      if (inst->src[i].file == GRF) {
         grf = inst->src[i].reg;
      } else if (inst->src[i].file == HW_REG &&
                 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         grf = inst->src[i].fixed_hw_reg.nr;
      } else {
         continue;
      }

      if (grf >= first_grf &&
          grf < first_grf + grf_len) {
         deps[grf - first_grf] = false;
         if (inst->exec_size == 16)
            deps[grf - first_grf + 1] = false;
      }
   }
}

/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
 *      check for post destination dependencies on this instruction, software
 *      must ensure that there is no destination hazard for the case of ‘write
 *      followed by a posted write’ shown in the following example.
 *
 *      1. mov r3 0
 *      2. send r3.xy <rest of send instruction>
 *      3. mov r2 r3
 *
 *      Due to no post-destination dependency check on the ‘send’, the above
 *      code sequence could have two instructions (1 and 2) in flight at the
 *      same time that both consider ‘r3’ as the target of their final writes.
 */
void
fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
                                                        fs_inst *inst)
{
   int write_len = inst->regs_written;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);

   clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);

   /* Walk backwards looking for writes to registers we're writing which
    * aren't read since being written.  If we hit the start of the program,
    * we assume that there are no outstanding dependencies on entry to the
    * program.
    */
   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
      /* If we hit control flow, assume that there *are* outstanding
       * dependencies, and force their cleanup before our instruction.
       */
      if (block->start() == scan_inst) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i]) {
               inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
            }
         }
         return;
      }

      /* We insert our reads as late as possible on the assumption that any
       * instruction but a MOV that might have left us an outstanding
       * dependency has more latency than a MOV.
       */
      if (scan_inst->dst.file == GRF) {
         for (int i = 0; i < scan_inst->regs_written; i++) {
            int reg = scan_inst->dst.reg + i;

            if (reg >= first_write_grf &&
                reg < first_write_grf + write_len &&
                needs_dep[reg - first_write_grf]) {
               inst->insert_before(block, DEP_RESOLVE_MOV(reg));
               needs_dep[reg - first_write_grf] = false;
               if (scan_inst->exec_size == 16)
                  needs_dep[reg - first_write_grf + 1] = false;
            }
         }
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}

/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Errata: A destination register from a send can not be
 *      used as a destination register until after it has been sourced by an
 *      instruction with a different destination register.
 */
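/* Illustratively, after
 *
 *      send r3 ...
 *
 * no instruction may make r3 a destination again until r3 has been
 * sourced, so the pass below inserts a dependency-resolving MOV on the
 * hazarded register ahead of any such write.
 */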
void
fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
{
   int write_len = inst->regs_written;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);
   /* Walk forwards looking for writes to registers we're writing which aren't
    * read before being written.
    */
   foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
      /* If we hit control flow, force resolve all remaining dependencies. */
      if (block->end() == scan_inst) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               scan_inst->insert_before(block,
                                        DEP_RESOLVE_MOV(first_write_grf + i));
         }
         return;
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);

      /* We insert our reads as late as possible since they're reading the
       * result of a SEND, which has massive latency.
       */
      if (scan_inst->dst.file == GRF &&
          scan_inst->dst.reg >= first_write_grf &&
          scan_inst->dst.reg < first_write_grf + write_len &&
          needs_dep[scan_inst->dst.reg - first_write_grf]) {
         scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
      }

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}

void
fs_visitor::insert_gen4_send_dependency_workarounds()
{
   if (devinfo->gen != 4 || devinfo->is_g4x)
      return;

   bool progress = false;

   /* Note that we're done with register allocation, so GRF fs_regs always
    * have a .reg_offset of 0.
    */

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->mlen != 0 && inst->dst.file == GRF) {
         insert_gen4_pre_send_dependency_workarounds(block, inst);
         insert_gen4_post_send_dependency_workarounds(block, inst);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();
}

/**
 * Turns the generic expression-style uniform pull constant load instruction
 * into a hardware-specific series of instructions for loading a pull
 * constant.
 *
 * The expression style allows the CSE pass before this to optimize out
 * repeated loads from the same offset, and gives the pre-register-allocation
 * scheduling full flexibility, while the conversion to native instructions
 * allows the post-register-allocation scheduler the best information
 * possible.
 *
 * Note that execution masking for setting up pull constant loads is special:
 * the channels that need to be written are unrelated to the current execution
 * mask, since a later instruction will use one of the result channels as a
 * source operand for all 8 or 16 of its channels.
 */
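/* As a worked example (assuming Gen7+): a vec4-aligned byte offset of 16
 * in src[1] becomes the dword offset 4 (see the division by 4 below)
 * before being placed in the single-register payload of the SIMD4x2
 * message.
 */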
void
fs_visitor::lower_uniform_pull_constant_loads()
{
   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
         continue;

      if (devinfo->gen >= 7) {
         /* The offset arg before was a vec4-aligned byte offset.  We need to
          * turn it into a dword offset.
          */
         fs_reg const_offset_reg = inst->src[1];
         assert(const_offset_reg.file == IMM &&
                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
         const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
         fs_reg payload = fs_reg(GRF, alloc.allocate(1));

         /* We have to use a message header on Skylake to get SIMD4x2 mode.
          * Reserve space for the register.
          */
         if (devinfo->gen >= 9) {
            payload.reg_offset++;
            alloc.sizes[payload.reg] = 2;
         }

         /* This is actually going to be a MOV, but since only the first dword
          * is accessed, we have a special opcode to do just that one.  Note
          * that this needs to be an operation that will be considered a def
          * by live variable analysis, or register allocation will explode.
          */
         fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
                                               8, payload, const_offset_reg);
         setup->force_writemask_all = true;

         setup->ir = inst->ir;
         setup->annotation = inst->annotation;
         inst->insert_before(block, setup);

         /* Similarly, this will only populate the first 4 channels of the
          * result register (since we only use smear values from 0-3), but we
          * don't tell the optimizer.
          */
         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
         inst->src[1] = payload;

         invalidate_live_intervals();
      } else {
         /* Before register allocation, we didn't tell the scheduler about the
          * MRF we use.  We know it's safe to use this MRF because nothing
          * else does except for register spill/unspill, which generates and
          * uses its MRF within a single IR instruction.
          */
         inst->base_mrf = 14;
         inst->mlen = 1;
      }
   }
}

bool
fs_visitor::lower_load_payload()
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      assert(inst->dst.file == MRF || inst->dst.file == GRF);
      assert(inst->saturate == false);

      fs_reg dst = inst->dst;

      /* Get rid of COMPR4.  We'll add it back in if we need it */
      if (dst.file == MRF)
         dst.reg = dst.reg & ~BRW_MRF_COMPR4;

      dst.width = 8;
      for (uint8_t i = 0; i < inst->header_size; i++) {
         if (inst->src[i].file != BAD_FILE) {
            fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
            fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
            mov_src.width = 8;
            fs_inst *mov = MOV(mov_dst, mov_src);
            mov->force_writemask_all = true;
            inst->insert_before(block, mov);
         }
         dst = offset(dst, 1);
      }

      dst.width = inst->exec_size;
      if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
          inst->exec_size > 8) {
         /* In this case, the payload portion of the LOAD_PAYLOAD isn't
          * a straightforward copy.  Instead, the result of the
          * LOAD_PAYLOAD is treated as interleaved and the first four
          * non-header sources are unpacked as:
          *
          * m + 0: r0
          * m + 1: g0
          * m + 2: b0
          * m + 3: a0
          * m + 4: r1
          * m + 5: g1
          * m + 6: b1
          * m + 7: a1
          *
          * This is used for gen <= 5 fb writes.
          */
         assert(inst->exec_size == 16);
         assert(inst->header_size + 4 <= inst->sources);
         for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
            if (inst->src[i].file != BAD_FILE) {
               if (devinfo->has_compr4) {
                  fs_reg compr4_dst = retype(dst, inst->src[i].type);
                  compr4_dst.reg |= BRW_MRF_COMPR4;

                  fs_inst *mov = MOV(compr4_dst, inst->src[i]);
                  mov->force_writemask_all = inst->force_writemask_all;
                  inst->insert_before(block, mov);
               } else {
                  /* Platform doesn't have COMPR4.  We have to fake it */
                  fs_reg mov_dst = retype(dst, inst->src[i].type);
                  mov_dst.width = 8;

                  fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
                  mov->force_writemask_all = inst->force_writemask_all;
                  inst->insert_before(block, mov);

                  mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
                  mov->force_writemask_all = inst->force_writemask_all;
                  mov->force_sechalf = true;
                  inst->insert_before(block, mov);
               }
            }

            dst.reg++;
         }

         /* The loop above only ever incremented us through the first set
          * of 4 registers.  However, thanks to the magic of COMPR4, we
          * actually wrote to the first 8 registers, so we need to take
          * that into account now.
          */
         dst.reg += 4;

         /* The COMPR4 code took care of the first 4 sources.  We'll let
          * the regular path handle any remaining sources.  Yes, we are
          * modifying the instruction but we're about to delete it so
          * this really doesn't hurt anything.
          */
         inst->header_size += 4;
      }

      for (uint8_t i = inst->header_size; i < inst->sources; i++) {
         if (inst->src[i].file != BAD_FILE) {
            fs_inst *mov = MOV(retype(dst, inst->src[i].type),
                               inst->src[i]);
            mov->force_writemask_all = inst->force_writemask_all;
            mov->force_sechalf = inst->force_sechalf;
            inst->insert_before(block, mov);
         }
         dst = offset(dst, 1);
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

bool
fs_visitor::lower_integer_multiplication()
{
   bool progress = false;

   /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
    * directly, but Cherryview cannot.
    */
   if (devinfo->gen >= 8 && !devinfo->is_cherryview)
      return false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      if (inst->opcode != BRW_OPCODE_MUL ||
          inst->dst.is_accumulator() ||
          (inst->dst.type != BRW_REGISTER_TYPE_D &&
           inst->dst.type != BRW_REGISTER_TYPE_UD))
         continue;

#define insert(instr) inst->insert_before(block, instr)

      /* The MUL instruction isn't commutative. On Gen <= 6, only the low
       * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
       * src1 are used.
       *
       * If multiplying by an immediate value that fits in 16 bits, do a
       * single MUL instruction with that value in the proper location.
       */
      if (inst->src[1].file == IMM &&
          inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
         if (devinfo->gen < 7) {
            fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
                       inst->dst.type, dispatch_width);
            insert(MOV(imm, inst->src[1]));
            insert(MUL(inst->dst, imm, inst->src[0]));
         } else {
            insert(MUL(inst->dst, inst->src[0], inst->src[1]));
         }
      } else {
         /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
          * do 32-bit integer multiplication in one instruction, but instead
          * must do a sequence (which actually calculates a 64-bit result):
          *
          *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
          *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
          *    mov(8)  g2<1>D     acc0<8,8,1>D
          *
          * But on Gen > 6, the ability to use the second accumulator
          * register (acc1) for non-float data types was removed, preventing
          * a simple implementation in SIMD16. A 16-channel result can be
          * calculated by executing the three instructions twice in SIMD8,
          * once with quarter control of 1Q for the first eight channels and
          * again with 2Q for the second eight channels.
          *
          * Which accumulator register is implicitly accessed (by AccWrEnable
          * for instance) is determined by the quarter control. Unfortunately
          * Ivybridge (and presumably Baytrail) has a hardware bug in which an
          * implicit accumulator access by an instruction with 2Q will access
          * acc1 regardless of whether the data type is usable in acc1.
          *
          * Specifically, the 2Q mach(8) writes acc1 which does not exist for
          * integer data types.
          *
          * Since we only want the low 32-bits of the result, we can do two
          * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
          * adjust the high result and add them (like the mach is doing):
          *
          *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
          *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
          *    shl(8)  g9<1>D     g8<8,8,1>D      16D
          *    add(8)  g2<1>D     g7<8,8,1>D      g9<8,8,1>D
          *
          * We avoid the shl instruction by realizing that we only want to add
          * the low 16-bits of the "high" result to the high 16-bits of the
          * "low" result and using proper regioning on the add:
          *
          *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
          *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
          *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
          *
          * Since it does not use the (single) accumulator register, we can
          * schedule multi-component multiplications much better.
          */
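         /* Arithmetic sketch of why the low 32 bits come out right: with
          * src1 = b_lo + (b_hi << 16),
          *
          *    a * src1 mod 2^32 = (a * b_lo + ((a * b_hi) << 16)) mod 2^32
          *
          * The shift discards the upper half of a * b_hi, so only its low
          * 16 bits can reach the result -- exactly the contribution the UW
          * regioning on the final add makes to the high 16 bits of the low
          * product.
          */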

         fs_reg low = inst->dst;
         fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
                     inst->dst.type, dispatch_width);

         if (brw->gen >= 7) {
            fs_reg src1_0_w = inst->src[1];
            fs_reg src1_1_w = inst->src[1];

            if (inst->src[1].file == IMM) {
               src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
               src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
            } else {
               src1_0_w.type = BRW_REGISTER_TYPE_UW;
               src1_0_w.stride = 2;

               src1_1_w.type = BRW_REGISTER_TYPE_UW;
               src1_1_w.stride = 2;
               src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
            }
            insert(MUL(low, inst->src[0], src1_0_w));
            insert(MUL(high, inst->src[0], src1_1_w));
         } else {
            fs_reg src0_0_w = inst->src[0];
            fs_reg src0_1_w = inst->src[0];

            src0_0_w.type = BRW_REGISTER_TYPE_UW;
            src0_0_w.stride = 2;

            src0_1_w.type = BRW_REGISTER_TYPE_UW;
            src0_1_w.stride = 2;
            src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);

            insert(MUL(low, src0_0_w, inst->src[1]));
            insert(MUL(high, src0_1_w, inst->src[1]));
         }

         fs_reg dst = inst->dst;
         dst.type = BRW_REGISTER_TYPE_UW;
         dst.subreg_offset = 2;
         dst.stride = 2;

         high.type = BRW_REGISTER_TYPE_UW;
         high.stride = 2;

         low.type = BRW_REGISTER_TYPE_UW;
         low.subreg_offset = 2;
         low.stride = 2;

         insert(ADD(dst, low, high));
      }
#undef insert

      inst->remove(block);
      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

void
fs_visitor::dump_instructions()
{
   dump_instructions(NULL);
}

void
fs_visitor::dump_instructions(const char *name)
{
   FILE *file = stderr;
   if (name && geteuid() != 0) {
      file = fopen(name, "w");
      if (!file)
         file = stderr;
   }

   if (cfg) {
      calculate_register_pressure();
      int ip = 0, max_pressure = 0;
      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
         max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
         fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
         dump_instruction(inst, file);
         ip++;
      }
      fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
   } else {
      int ip = 0;
      foreach_in_list(backend_instruction, inst, &instructions) {
         fprintf(file, "%4d: ", ip++);
         dump_instruction(inst, file);
      }
   }

   if (file != stderr) {
      fclose(file);
   }
}

void
fs_visitor::dump_instruction(backend_instruction *be_inst)
{
   dump_instruction(be_inst, stderr);
}

void
fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
{
   fs_inst *inst = (fs_inst *)be_inst;

   if (inst->predicate) {
      fprintf(file, "(%cf0.%d) ",
             inst->predicate_inverse ? '-' : '+',
             inst->flag_subreg);
   }

   fprintf(file, "%s", brw_instruction_name(inst->opcode));
   if (inst->saturate)
      fprintf(file, ".sat");
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      if (!inst->predicate &&
          (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                              inst->opcode != BRW_OPCODE_IF &&
                              inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f0.%d", inst->flag_subreg);
      }
   }
   fprintf(file, "(%d) ", inst->exec_size);

   switch (inst->dst.file) {
   case GRF:
      fprintf(file, "vgrf%d", inst->dst.reg);
      if (inst->dst.width != dispatch_width)
         fprintf(file, "@%d", inst->dst.width);
      if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
          inst->dst.subreg_offset)
         fprintf(file, "+%d.%d",
                 inst->dst.reg_offset, inst->dst.subreg_offset);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.reg);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   case UNIFORM:
      fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
      break;
   case ATTR:
      fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
      break;
   case HW_REG:
      if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         switch (inst->dst.fixed_hw_reg.nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                             inst->dst.fixed_hw_reg.subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                               inst->dst.fixed_hw_reg.subnr);
            break;
         }
      } else {
         fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
      }
      if (inst->dst.fixed_hw_reg.subnr)
         fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
      break;
   default:
      fprintf(file, "???");
      break;
   }
   fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case GRF:
         fprintf(file, "vgrf%d", inst->src[i].reg);
         if (inst->src[i].width != dispatch_width)
            fprintf(file, "@%d", inst->src[i].width);
         if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
             inst->src[i].subreg_offset)
            fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                    inst->src[i].subreg_offset);
         break;
      case MRF:
         fprintf(file, "***m%d***", inst->src[i].reg);
         break;
      case ATTR:
         fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
         if (inst->src[i].reladdr) {
            fprintf(file, "+reladdr");
         } else if (inst->src[i].subreg_offset) {
            fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                    inst->src[i].subreg_offset);
         }
         break;
      case BAD_FILE:
         fprintf(file, "(null)");
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
            break;
         case BRW_REGISTER_TYPE_W:
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
            break;
         case BRW_REGISTER_TYPE_UW:
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
            break;
         case BRW_REGISTER_TYPE_VF:
            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case HW_REG:
         if (inst->src[i].fixed_hw_reg.negate)
            fprintf(file, "-");
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            switch (inst->src[i].fixed_hw_reg.nr) {
            case BRW_ARF_NULL:
               fprintf(file, "null");
               break;
            case BRW_ARF_ADDRESS:
               fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_ACCUMULATOR:
               fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_FLAG:
               fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                                inst->src[i].fixed_hw_reg.subnr);
               break;
            default:
               fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                                  inst->src[i].fixed_hw_reg.subnr);
               break;
            }
         } else {
            fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
         }
         if (inst->src[i].fixed_hw_reg.subnr)
            fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         break;
      default:
         fprintf(file, "???");
         break;
      }
      if (inst->src[i].abs)
         fprintf(file, "|");

      if (inst->src[i].file != IMM) {
         fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
      }

      if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
         fprintf(file, ", ");
   }

   fprintf(file, " ");

   if (dispatch_width == 16 && inst->exec_size == 8) {
      if (inst->force_sechalf)
         fprintf(file, "2ndhalf ");
      else
         fprintf(file, "1sthalf ");
   }

   fprintf(file, "\n");
}

/**
 * Possibly returns an instruction that set up @param reg.
 *
 * Sometimes we want to take the result of some expression/variable
 * dereference tree and rewrite the instruction generating the result
 * of the tree.  When processing the tree, we know that the
 * instructions generated are all writing temporaries that are dead
 * outside of this tree.  So, if we have some instructions that write
 * a temporary, we're free to point that temp write somewhere else.
 *
 * Note that this doesn't guarantee that the instruction generated
 * only reg -- it might be the size=4 destination of a texture instruction.
 */
fs_inst *
fs_visitor::get_instruction_generating_reg(fs_inst *start,
                                           fs_inst *end,
                                           const fs_reg &reg)
{
   if (end == start ||
       end->is_partial_write() ||
       reg.reladdr ||
       !reg.equals(end->dst)) {
      return NULL;
   } else {
      return end;
   }
}

void
fs_visitor::setup_payload_gen6()
{
   bool uses_depth =
      (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
   unsigned barycentric_interp_modes =
      (stage == MESA_SHADER_FRAGMENT) ?
      ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;

   assert(devinfo->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   payload.num_regs = 2;
   /* R2: only for 32-pixel dispatch. */

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
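   /* For example (illustrative): with two barycentric modes enabled, the
    * two coordinate sets consume 4 payload registers in SIMD8 and 8 in
    * SIMD16.
    */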
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         payload.barycentric_coord_reg[i] = payload.num_regs;
         payload.num_regs += 2;
         if (dispatch_width == 16) {
            payload.num_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      payload.source_depth_reg = payload.num_regs;
      payload.num_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not SIMD8. */
         payload.num_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
   if (uses_depth) {
      payload.source_w_reg = payload.num_regs;
      payload.num_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not SIMD8. */
         payload.num_regs++;
      }
   }

   if (stage == MESA_SHADER_FRAGMENT) {
      brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
      brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
      prog_data->uses_pos_offset = key->compute_pos_offset;
      /* R31: MSAA position offsets. */
      if (prog_data->uses_pos_offset) {
         payload.sample_pos_reg = payload.num_regs;
         payload.num_regs++;
      }
   }

   /* R32: MSAA input coverage mask */
   if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
      assert(devinfo->gen >= 7);
      payload.sample_mask_in_reg = payload.num_regs;
      payload.num_regs++;
      if (dispatch_width == 16) {
         /* R33: input coverage mask if not SIMD8. */
         payload.num_regs++;
      }
   }

   /* R34-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}

void
fs_visitor::setup_vs_payload()
{
   /* R0: thread header, R1: urb handles */
   payload.num_regs = 2;
}

void
fs_visitor::setup_cs_payload()
{
   assert(brw->gen >= 7);

   payload.num_regs = 1;
}

void
fs_visitor::assign_binding_table_offsets()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   uint32_t next_binding_table_offset = 0;

   /* If there are no color regions, we still perform an FB write to a null
    * renderbuffer, which we place at surface index 0.
    */
   prog_data->binding_table.render_target_start = next_binding_table_offset;
   next_binding_table_offset += MAX2(key->nr_color_regions, 1);

   assign_common_binding_table_offsets(next_binding_table_offset);
}

void
fs_visitor::calculate_register_pressure()
{
   invalidate_live_intervals();
   calculate_live_intervals();

   unsigned num_instructions = 0;
   foreach_block(block, cfg)
      num_instructions += block->instructions.length();

   regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);

   for (unsigned reg = 0; reg < alloc.count; reg++) {
      for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
         regs_live_at_ip[ip] += alloc.sizes[reg];
   }
}

void
fs_visitor::optimize()
{
   split_virtual_grfs();

   move_uniform_array_access_to_pull_constants();
   assign_constant_locations();
   demote_pull_constants();

#define OPT(pass, args...) ({                                           \
      pass_num++;                                                       \
      bool this_progress = pass(args);                                  \
                                                                        \
      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
         char filename[64];                                             \
         snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,           \
                  stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
                                                                        \
         backend_visitor::dump_instructions(filename);                  \
      }                                                                 \
                                                                        \
      progress = progress || this_progress;                             \
      this_progress;                                                    \
   })
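   /* Note that OPT() is a GNU statement expression: besides accumulating
    * `progress` and optionally dumping the IR when INTEL_DEBUG=optimizer
    * is set, it evaluates to whether this particular pass made progress,
    * which is why it can appear as a condition (see lower_load_payload()
    * below).
    */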

   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
      char filename[64];
      snprintf(filename, 64, "%s%d-%04d-00-start",
               stage_abbrev, dispatch_width,
               shader_prog ? shader_prog->Name : 0);

      backend_visitor::dump_instructions(filename);
   }

   bool progress;
   int iteration = 0;
   int pass_num = 0;
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(remove_duplicate_mrf_writes);

      OPT(opt_algebraic);
      OPT(opt_cse);
      OPT(opt_copy_propagate);
      OPT(opt_peephole_predicated_break);
      OPT(opt_cmod_propagation);
      OPT(dead_code_eliminate);
      OPT(opt_peephole_sel);
      OPT(dead_control_flow_eliminate, this);
      OPT(opt_register_renaming);
      OPT(opt_redundant_discard_jumps);
      OPT(opt_saturate_propagation);
      OPT(opt_zero_samples);
      OPT(register_coalesce);
      OPT(compute_to_mrf);
      OPT(eliminate_find_live_channel);

      OPT(compact_virtual_grfs);
   } while (progress);

   pass_num = 0;

   OPT(opt_sampler_eot);

   if (OPT(lower_load_payload)) {
      split_virtual_grfs();
      OPT(register_coalesce);
      OPT(compute_to_mrf);
      OPT(dead_code_eliminate);
   }

   OPT(opt_combine_constants);
   OPT(lower_integer_multiplication);

   lower_uniform_pull_constant_loads();
}

/**
 * A three-source instruction must have a GRF/MRF destination register.
 * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
 */
void
fs_visitor::fixup_3src_null_dest()
{
   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->is_3src() && inst->dst.is_null()) {
         inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
                            inst->dst.type);
      }
   }
}

void
fs_visitor::allocate_registers()
{
   bool allocated_without_spills;

   static const enum instruction_scheduler_mode pre_modes[] = {
      SCHEDULE_PRE,
      SCHEDULE_PRE_NON_LIFO,
      SCHEDULE_PRE_LIFO,
   };

   /* Try each scheduling heuristic to see if it can successfully register
    * allocate without spilling.  They should be ordered by decreasing
    * performance but increasing likelihood of allocating.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
      schedule_instructions(pre_modes[i]);

      if (0) {
         assign_regs_trivial();
         allocated_without_spills = true;
      } else {
         allocated_without_spills = assign_regs(false);
      }
      if (allocated_without_spills)
         break;
   }

   if (!allocated_without_spills) {
      /* We assume that any spilling is worse than just dropping back to
       * SIMD8.  There's probably actually some intermediate point where
       * SIMD16 with a couple of spills is still better.
       */
      if (dispatch_width == 16) {
         fail("Failure to register allocate.  Reduce number of "
              "live scalar values to avoid this.");
      } else {
         perf_debug("%s shader triggered register spilling.  "
                    "Try reducing the number of live scalar values to "
                    "improve performance.\n", stage_name);
      }

      /* Since we're out of heuristics, just go spill registers until we
       * get an allocation.
       */
      while (!assign_regs(true)) {
         if (failed)
            break;
      }
   }

   /* This must come after all optimization and register allocation, since
    * it inserts dead code that happens to have side effects, and it does
    * so based on the actual physical registers in use.
    */
   insert_gen4_send_dependency_workarounds();

   if (failed)
      return;

   if (!allocated_without_spills)
      schedule_instructions(SCHEDULE_POST);

   if (last_scratch > 0)
      prog_data->total_scratch = brw_get_scratch_size(last_scratch);
}

bool
fs_visitor::run_vs()
{
   assert(stage == MESA_SHADER_VERTEX);

   assign_common_binding_table_offsets(0);
   setup_vs_payload();

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_begin();

   if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
      emit_nir_code();
   } else {
      foreach_in_list(ir_instruction, ir, shader->base.ir) {
         base_ir = ir;
         this->result = reg_undef;
         ir->accept(this);
      }
      base_ir = NULL;
   }

   if (failed)
      return false;

   emit_urb_writes();

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_end();

   calculate_cfg();

   optimize();

   assign_curb_setup();
   assign_vs_urb_setup();

   fixup_3src_null_dest();
   allocate_registers();

   return !failed;
}

bool
fs_visitor::run_fs()
{
   brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
   brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;

   assert(stage == MESA_SHADER_FRAGMENT);

   sanity_param_count = prog->Parameters->NumParameters;

   assign_binding_table_offsets();

   if (devinfo->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      emit_dummy_fs();
   } else if (brw->use_rep_send && dispatch_width == 16) {
      emit_repclear_shader();
   } else {
      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_begin();

      calculate_urb_setup();
      if (prog->InputsRead > 0) {
         if (devinfo->gen < 6)
            emit_interpolation_setup_gen4();
         else
            emit_interpolation_setup_gen6();
      }

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (wm_prog_data->uses_kill) {
         fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
         discard_init->flag_subreg = 1;
      }

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
         emit_nir_code();
      } else if (shader) {
         foreach_in_list(ir_instruction, ir, shader->base.ir) {
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         emit_fragment_program_code();
      }
      base_ir = NULL;
      if (failed)
         return false;

      if (wm_prog_data->uses_kill)
         emit(FS_OPCODE_PLACEHOLDER_HALT);

      if (wm_key->alpha_test_func)
         emit_alpha_test();

      emit_fb_writes();

      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_end();

      calculate_cfg();

      optimize();

      assign_curb_setup();
      assign_urb_setup();

      fixup_3src_null_dest();
      allocate_registers();

      if (failed)
         return false;
   }

   if (dispatch_width == 8)
      wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
   else
      wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == prog->Parameters->NumParameters);

   return !failed;
}

bool
fs_visitor::run_cs()
{
   assert(stage == MESA_SHADER_COMPUTE);
   assert(shader);

   sanity_param_count = prog->Parameters->NumParameters;

   assign_common_binding_table_offsets(0);

   setup_cs_payload();

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_begin();

   emit_nir_code();

   if (failed)
      return false;

   emit_cs_terminate();

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_end();

   calculate_cfg();

   optimize();

   assign_curb_setup();

   fixup_3src_null_dest();
   allocate_registers();

   if (failed)
      return false;

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == prog->Parameters->NumParameters);

   return !failed;
}

const unsigned *
brw_wm_fs_emit(struct brw_context *brw,
               void *mem_ctx,
               const struct brw_wm_prog_key *key,
               struct brw_wm_prog_data *prog_data,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   bool start_busy = false;
   double start_time = 0;

   if (unlikely(brw->perf_debug)) {
      start_busy = (brw->batch.last_bo &&
                    drm_intel_bo_busy(brw->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM))
      brw_dump_ir("fragment", prog, &shader->base, &fp->Base);

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
                prog, &fp->Base, 8);
   if (!v.run_fs()) {
      if (prog) {
         prog->LinkStatus = false;
         ralloc_strcat(&prog->InfoLog, v.fail_msg);
      }

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }

   cfg_t *simd16_cfg = NULL;
   fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
                 prog, &fp->Base, 16);
   if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
      if (!v.simd16_unsupported) {
         /* Try a SIMD16 compile */
         v2.import_uniforms(&v);
         if (!v2.run_fs()) {
            perf_debug("SIMD16 shader failed to compile, falling back to "
                       "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
         } else {
            simd16_cfg = v2.cfg;
         }
      } else {
         perf_debug("SIMD16 shader unsupported, falling back to "
                    "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
      }
   }

   cfg_t *simd8_cfg;
   int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
   if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
      simd8_cfg = NULL;
      prog_data->no_8 = true;
   } else {
      simd8_cfg = v.cfg;
      prog_data->no_8 = false;
   }

   fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
                  &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      char *name;
      if (prog)
         name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
                                prog->Label ? prog->Label : "unnamed",
                                prog->Name);
      else
         name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);

      g.enable_debug(name);
   }

   if (simd8_cfg)
      g.generate_code(simd8_cfg, 8);
   if (simd16_cfg)
      prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);

   if (unlikely(brw->perf_debug) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return g.get_assembly(final_assembly_size);
}

extern "C" bool
brw_fs_precompile(struct gl_context *ctx,
                  struct gl_shader_program *shader_prog,
                  struct gl_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_wm_prog_key key;

   struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
   struct brw_fragment_program *bfp = brw_fragment_program(fp);
   bool program_uses_dfdy = fp->UsesDFdy;

   memset(&key, 0, sizeof(key));

   if (brw->gen < 6) {
      if (fp->UsesKill)
         key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;

      if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
         key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

      /* Just assume depth testing. */
      key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
      key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
   }

   if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
                                         BRW_FS_VARYING_INPUT_MASK) > 16)
      key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;

   brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);

   if (fp->Base.InputsRead & VARYING_BIT_POS) {
      key.drawable_height = ctx->DrawBuffer->Height;
   }

   key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
         ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
         BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));

   if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
                          key.nr_color_regions > 1;
   }

   key.program_string_id = bfp->id;

   uint32_t old_prog_offset = brw->wm.base.prog_offset;
   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;

   bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);

   brw->wm.base.prog_offset = old_prog_offset;
   brw->wm.prog_data = old_prog_data;

   return success;
}

void
brw_setup_tex_for_precompile(struct brw_context *brw,
                             struct brw_sampler_prog_key_data *tex,
                             struct gl_program *prog)
{
   const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
   unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
   for (unsigned i = 0; i < sampler_count; i++) {
      if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         tex->swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         tex->swizzles[i] = SWIZZLE_XYZW;
      }
   }
}