/*
2
 * Copyright © 2010 Intel Corporation
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining a
5
 * copy of this software and associated documentation files (the "Software"),
6
 * to deal in the Software without restriction, including without limitation
7
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
 * and/or sell copies of the Software, and to permit persons to whom the
9
 * Software is furnished to do so, subject to the following conditions:
10
 *
11
 * The above copyright notice and this permission notice (including the next
12
 * paragraph) shall be included in all copies or substantial portions of the
13
 * Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
 * IN THE SOFTWARE.
22
 *
23
 * Authors:
24
 *    Eric Anholt <eric@anholt.net>
25
 *
26
 */
27
 
28
#include "brw_fs.h"
29
#include "brw_vec4.h"
30
#include "brw_cfg.h"
31
#include "brw_shader.h"
32
#include "glsl/glsl_types.h"
33
#include "glsl/ir_optimization.h"
34
 
35
using namespace brw;
36
 
37
/** @file brw_fs_schedule_instructions.cpp
38
 *
39
 * List scheduling of FS instructions.
40
 *
41
 * The basic model of the list scheduler is to take a basic block,
42
 * compute a DAG of the dependencies (RAW ordering with latency, WAW
43
 * ordering with latency, WAR ordering), and make a list of the DAG heads.
44
 * Heuristically pick a DAG head, then put all the children that are
45
 * now DAG heads into the list of things to schedule.
46
 *
47
 * The heuristic is the important part.  We're trying to be cheap,
48
 * since actually computing the optimal scheduling is NP-complete.
49
 * What we do is track a "current clock".  When we schedule a node, we
50
 * update the earliest-unblocked clock time of its children, and
51
 * increment the clock.  Then, when trying to schedule, we just pick
52
 * the earliest-unblocked instruction to schedule.
53
 *
54
 * Note that often there will be many things which could execute
55
 * immediately, and there are a range of heuristic options to choose
56
 * from in picking among those.
57
 */
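/* Concretely, the flow implemented below is roughly the following (a
 * simplified sketch of run() and schedule_instructions(), not the exact
 * code):
 *
 *    for each basic block:
 *       add_insts_from_block(block);    // move the insts onto a work list
 *       calculate_deps();               // build the RAW/WAW/WAR dependency DAG
 *       for each node n:
 *          compute_delay(n);            // critical-path delay to end of block
 *       schedule_instructions(block);   // repeatedly pick an unblocked DAG
 *                                       // head and advance the clock
 */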
58
 
59
static bool debug = false;
60
 
61
class instruction_scheduler;
62
 
63
class schedule_node : public exec_node
64
{
65
public:
66
   schedule_node(backend_instruction *inst, instruction_scheduler *sched);
67
   void set_latency_gen4();
68
   void set_latency_gen7(bool is_haswell);
69
 
70
   backend_instruction *inst;
71
   schedule_node **children;
72
   int *child_latency;
73
   int child_count;
74
   int parent_count;
75
   int child_array_size;
76
   int unblocked_time;
77
   int latency;
78
 
79
   /**
80
    * Which iteration of pushing groups of children onto the candidates list
81
    * this node was a part of.
82
    */
83
   unsigned cand_generation;
84
 
85
   /**
86
    * This is the sum of the instruction's latency plus the maximum delay of
87
    * its children, or just the issue_time if it's a leaf node.
88
    */
89
   int delay;
90
};
91
 
92
void
93
schedule_node::set_latency_gen4()
94
{
95
   int chans = 8;
96
   int math_latency = 22;
97
 
98
   switch (inst->opcode) {
99
   case SHADER_OPCODE_RCP:
100
      this->latency = 1 * chans * math_latency;
101
      break;
102
   case SHADER_OPCODE_RSQ:
103
      this->latency = 2 * chans * math_latency;
104
      break;
105
   case SHADER_OPCODE_INT_QUOTIENT:
106
   case SHADER_OPCODE_SQRT:
107
   case SHADER_OPCODE_LOG2:
108
      /* full precision log.  partial is 2. */
109
      this->latency = 3 * chans * math_latency;
110
      break;
111
   case SHADER_OPCODE_INT_REMAINDER:
112
   case SHADER_OPCODE_EXP2:
113
      /* full precision.  partial is 3, same throughput. */
114
      this->latency = 4 * chans * math_latency;
115
      break;
116
   case SHADER_OPCODE_POW:
117
      this->latency = 8 * chans * math_latency;
118
      break;
119
   case SHADER_OPCODE_SIN:
120
   case SHADER_OPCODE_COS:
121
      /* minimum latency, max is 12 rounds. */
122
      this->latency = 5 * chans * math_latency;
123
      break;
124
   default:
125
      this->latency = 2;
126
      break;
127
   }
128
}
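
/* As a worked example of the model above: SHADER_OPCODE_POW is treated as 8
 * rounds through the shared math unit, so its modeled latency is
 * 8 * 8 channels * 22 cycles = 1408 cycles, while an ordinary ALU instruction
 * is just 2.
 */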
129
 
130
void
131
schedule_node::set_latency_gen7(bool is_haswell)
132
{
133
   switch (inst->opcode) {
134
   case BRW_OPCODE_MAD:
135
      /* 2 cycles
136
       *  (since the last two src operands are in different register banks):
137
       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
138
       *
139
       * 3 cycles on IVB, 4 on HSW
140
       *  (since the last two src operands are in the same register bank):
141
       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
142
       *
143
       * 18 cycles on IVB, 16 on HSW
144
       *  (since the last two src operands are in different register banks):
145
       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
146
       * mov(8) null   g4<4,5,1>F                     { align16 WE_normal 1Q };
147
       *
148
       * 20 cycles on IVB, 18 on HSW
149
       *  (since the last two src operands are in the same register bank):
150
       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
151
       * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
152
       */
153
 
154
      /* Our register allocator doesn't know about register banks, so use the
155
       * higher latency.
156
       */
157
      latency = is_haswell ? 16 : 18;
158
      break;
159
 
160
   case BRW_OPCODE_LRP:
161
      /* 2 cycles
162
       *  (since the last two src operands are in different register banks):
163
       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
164
       *
165
       * 3 cycles on IVB, 4 on HSW
166
       *  (since the last two src operands are in the same register bank):
167
       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
168
       *
169
       * 16 cycles on IVB, 14 on HSW
170
       *  (since the last two src operands are in different register banks):
171
       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
172
       * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
173
       *
174
       * 16 cycles
175
       *  (since the last two src operands are in the same register bank):
176
       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
177
       * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
178
       */
179
 
180
      /* Our register allocator doesn't know about register banks, so use the
181
       * higher latency.
182
       */
183
      latency = 14;
184
      break;
185
 
186
   case SHADER_OPCODE_RCP:
187
   case SHADER_OPCODE_RSQ:
188
   case SHADER_OPCODE_SQRT:
189
   case SHADER_OPCODE_LOG2:
190
   case SHADER_OPCODE_EXP2:
191
   case SHADER_OPCODE_SIN:
192
   case SHADER_OPCODE_COS:
193
      /* 2 cycles:
194
       * math inv(8) g4<1>F g2<0,1,0>F      null       { align1 WE_normal 1Q };
195
       *
196
       * 18 cycles:
197
       * math inv(8) g4<1>F g2<0,1,0>F      null       { align1 WE_normal 1Q };
198
       * mov(8)      null   g4<8,8,1>F                 { align1 WE_normal 1Q };
199
       *
200
       * Same for exp2, log2, rsq, sqrt, sin, cos.
201
       */
202
      latency = is_haswell ? 14 : 16;
203
      break;
204
 
205
   case SHADER_OPCODE_POW:
206
      /* 2 cycles:
207
       * math pow(8) g4<1>F g2<0,1,0>F   g2.1<0,1,0>F  { align1 WE_normal 1Q };
208
       *
209
       * 26 cycles:
210
       * math pow(8) g4<1>F g2<0,1,0>F   g2.1<0,1,0>F  { align1 WE_normal 1Q };
211
       * mov(8)      null   g4<8,8,1>F                 { align1 WE_normal 1Q };
212
       */
213
      latency = is_haswell ? 22 : 24;
214
      break;
215
 
216
   case SHADER_OPCODE_TEX:
217
   case SHADER_OPCODE_TXD:
218
   case SHADER_OPCODE_TXF:
219
   case SHADER_OPCODE_TXL:
220
      /* 18 cycles:
221
       * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
222
       * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
223
       * send(8) g4<1>UW    g114<8,8,1>F
224
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
225
       *
226
       * 697 +/-49 cycles (min 610, n=26):
227
       * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
228
       * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
229
       * send(8) g4<1>UW    g114<8,8,1>F
230
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
231
       * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
232
       *
233
       * So the latency on our first texture load of the batchbuffer takes
234
       * ~700 cycles, since the caches are cold at that point.
235
       *
236
       * 840 +/- 92 cycles (min 720, n=25):
237
       * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
238
       * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
239
       * send(8) g4<1>UW    g114<8,8,1>F
240
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
241
       * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
242
       * send(8) g4<1>UW    g114<8,8,1>F
243
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
244
       * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
245
       *
246
       * On the second load, it takes just an extra ~140 cycles, and after
247
       * accounting for the 14 cycles of the MOV's latency, that makes ~130.
248
       *
249
       * 683 +/- 49 cycles (min = 602, n=47):
250
       * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
251
       * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
252
       * send(8) g4<1>UW    g114<8,8,1>F
253
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
254
       * send(8) g50<1>UW   g114<8,8,1>F
255
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
256
       * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
257
       *
258
       * The unit appears to be pipelined, since this matches up with the
259
       * cache-cold case, despite there being two loads here.  If you replace
260
       * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
261
       *
262
       * So, take some number between the cache-hot 140 cycles and the
263
       * cache-cold 700 cycles.  No particular tuning was done on this.
264
       *
265
       * I haven't done significant testing of the non-TEX opcodes.  TXL at
266
       * least looked about the same as TEX.
267
       */
268
      latency = 200;
269
      break;
270
 
271
   case SHADER_OPCODE_TXS:
272
      /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
273
       * cycles (n=15):
274
       * mov(8)   g114<1>UD  0D                        { align1 WE_normal 1Q };
275
       * send(8)  g6<1>UW    g114<8,8,1>F
276
       *   sampler (10, 0, 10, 1) mlen 1 rlen 4        { align1 WE_normal 1Q };
277
       * mov(16)  g6<1>F     g6<8,8,1>D                { align1 WE_normal 1Q };
278
       *
279
       *
280
       * Two loads was 535 +/- 30 cycles (n=19):
281
       * mov(16)   g114<1>UD  0D                       { align1 WE_normal 1H };
282
       * send(16)  g6<1>UW    g114<8,8,1>F
283
       *   sampler (10, 0, 10, 2) mlen 2 rlen 8        { align1 WE_normal 1H };
284
       * mov(16)   g114<1>UD  0D                       { align1 WE_normal 1H };
285
       * mov(16)   g6<1>F     g6<8,8,1>D               { align1 WE_normal 1H };
286
       * send(16)  g8<1>UW    g114<8,8,1>F
287
       *   sampler (10, 0, 10, 2) mlen 2 rlen 8        { align1 WE_normal 1H };
288
       * mov(16)   g8<1>F     g8<8,8,1>D               { align1 WE_normal 1H };
289
       * add(16)   g6<1>F     g6<8,8,1>F   g8<8,8,1>F  { align1 WE_normal 1H };
290
       *
291
       * Since the only caches that should matter are just the
292
       * instruction/state cache containing the surface state, assume that we
293
       * always have hot caches.
294
       */
295
      latency = 100;
296
      break;
297
 
298
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
299
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
300
   case VS_OPCODE_PULL_CONSTANT_LOAD:
301
      /* testing using varying-index pull constants:
302
       *
303
       * 16 cycles:
304
       * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
305
       * send(8) g4<1>F  g4<8,8,1>D
306
       *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
307
       *
308
       * ~480 cycles:
309
       * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
310
       * send(8) g4<1>F  g4<8,8,1>D
311
       *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
312
       * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
313
       *
314
       * ~620 cycles:
315
       * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
316
       * send(8) g4<1>F  g4<8,8,1>D
317
       *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
318
       * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
319
       * send(8) g4<1>F  g4<8,8,1>D
320
       *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
321
       * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
322
       *
323
       * So, if it's cache-hot, it's about 140.  If it's cache cold, it's
324
       * about 460.  We expect to mostly be cache hot, so pick something more
325
       * in that direction.
326
       */
327
      latency = 200;
328
      break;
329
 
330
   case SHADER_OPCODE_GEN7_SCRATCH_READ:
331
      /* Testing a load from offset 0, that had been previously written:
332
       *
333
       * send(8) g114<1>UW g0<8,8,1>F data (0, 0, 0) mlen 1 rlen 1 { align1 WE_normal 1Q };
334
       * mov(8)  null      g114<8,8,1>F { align1 WE_normal 1Q };
335
       *
336
       * The cycles spent seemed to be grouped around 40-50 (as low as 38),
337
       * then around 140.  Presumably this is cache hit vs miss.
338
       */
339
      latency = 50;
340
      break;
341
 
342
   case SHADER_OPCODE_UNTYPED_ATOMIC:
343
   case SHADER_OPCODE_TYPED_ATOMIC:
344
      /* Test code:
345
       *   mov(8)    g112<1>ud       0x00000000ud       { align1 WE_all 1Q };
346
       *   mov(1)    g112.7<1>ud     g1.7<0,1,0>ud      { align1 WE_all };
347
       *   mov(8)    g113<1>ud       0x00000000ud       { align1 WE_normal 1Q };
348
       *   send(8)   g4<1>ud         g112<8,8,1>ud
349
       *             data (38, 5, 6) mlen 2 rlen 1      { align1 WE_normal 1Q };
350
       *
351
       * Running it 100 times as fragment shader on a 128x128 quad
352
       * gives an average latency of 13867 cycles per atomic op,
353
       * standard deviation 3%.  Note that this is a rather
354
       * pessimistic estimate, the actual latency in cases with few
355
       * collisions between threads and favorable pipelining has been
356
       * seen to be reduced by a factor of 100.
357
       */
358
      latency = 14000;
359
      break;
360
 
361
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
362
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
363
   case SHADER_OPCODE_TYPED_SURFACE_READ:
364
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
365
      /* Test code:
366
       *   mov(8)    g112<1>UD       0x00000000UD       { align1 WE_all 1Q };
367
       *   mov(1)    g112.7<1>UD     g1.7<0,1,0>UD      { align1 WE_all };
368
       *   mov(8)    g113<1>UD       0x00000000UD       { align1 WE_normal 1Q };
369
       *   send(8)   g4<1>UD         g112<8,8,1>UD
370
       *             data (38, 6, 5) mlen 2 rlen 1      { align1 WE_normal 1Q };
371
       *   .
372
       *   . [repeats 8 times]
373
       *   .
374
       *   mov(8)    g112<1>UD       0x00000000UD       { align1 WE_all 1Q };
375
       *   mov(1)    g112.7<1>UD     g1.7<0,1,0>UD      { align1 WE_all };
376
       *   mov(8)    g113<1>UD       0x00000000UD       { align1 WE_normal 1Q };
377
       *   send(8)   g4<1>UD         g112<8,8,1>UD
378
       *             data (38, 6, 5) mlen 2 rlen 1      { align1 WE_normal 1Q };
379
       *
380
       * Running it 100 times as fragment shader on a 128x128 quad
381
       * gives an average latency of 583 cycles per surface read,
382
       * standard deviation 0.9%.
383
       */
384
      latency = is_haswell ? 300 : 600;
385
      break;
386
 
387
   default:
388
      /* 2 cycles:
389
       * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
390
       *
391
       * 16 cycles:
392
       * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
393
       * mov(8) null   g4<8,8,1>F                      { align1 WE_normal 1Q };
394
       */
395
      latency = 14;
396
      break;
397
   }
398
}
399
 
400
class instruction_scheduler {
401
public:
402
   instruction_scheduler(backend_visitor *v, int grf_count,
403
                         instruction_scheduler_mode mode)
404
   {
405
      this->bv = v;
406
      this->mem_ctx = ralloc_context(NULL);
407
      this->grf_count = grf_count;
408
      this->instructions.make_empty();
409
      this->instructions_to_schedule = 0;
410
      this->post_reg_alloc = (mode == SCHEDULE_POST);
411
      this->mode = mode;
412
      this->time = 0;
413
      if (!post_reg_alloc) {
414
         this->remaining_grf_uses = rzalloc_array(mem_ctx, int, grf_count);
415
         this->grf_active = rzalloc_array(mem_ctx, bool, grf_count);
416
      } else {
417
         this->remaining_grf_uses = NULL;
418
         this->grf_active = NULL;
419
      }
420
   }
421
 
422
   ~instruction_scheduler()
423
   {
424
      ralloc_free(this->mem_ctx);
425
   }
426
   void add_barrier_deps(schedule_node *n);
427
   void add_dep(schedule_node *before, schedule_node *after, int latency);
428
   void add_dep(schedule_node *before, schedule_node *after);
429
 
430
   void run(cfg_t *cfg);
431
   void add_insts_from_block(bblock_t *block);
432
   void compute_delay(schedule_node *node);
433
   virtual void calculate_deps() = 0;
434
   virtual schedule_node *choose_instruction_to_schedule() = 0;
435
 
436
   /**
437
    * Returns how many cycles it takes the instruction to issue.
438
    *
439
    * Instructions in gen hardware are handled one simd4 vector at a time,
440
    * with 1 cycle per vector dispatched.  Thus SIMD8 pixel shaders take 2
441
    * cycles to dispatch and SIMD16 (compressed) instructions take 4.
442
    */
443
   virtual int issue_time(backend_instruction *inst) = 0;
444
 
445
   virtual void count_remaining_grf_uses(backend_instruction *inst) = 0;
446
   virtual void update_register_pressure(backend_instruction *inst) = 0;
447
   virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;
448
 
449
   void schedule_instructions(bblock_t *block);
450
 
451
   void *mem_ctx;
452
 
453
   bool post_reg_alloc;
454
   int instructions_to_schedule;
455
   int grf_count;
456
   int time;
457
   exec_list instructions;
458
   backend_visitor *bv;
459
 
460
   instruction_scheduler_mode mode;
461
 
462
   /**
463
    * Number of instructions left to schedule that reference each vgrf.
464
    *
465
    * Used so that we can prefer scheduling instructions that will end the
466
    * live intervals of multiple variables, to reduce register pressure.
467
    */
468
   int *remaining_grf_uses;
469
 
470
   /**
471
    * Tracks whether each VGRF has had an instruction scheduled that uses it.
472
    *
473
    * This is used to estimate whether scheduling a new instruction will
474
    * increase register pressure.
475
    */
476
   bool *grf_active;
477
};
478
 
479
class fs_instruction_scheduler : public instruction_scheduler
480
{
481
public:
482
   fs_instruction_scheduler(fs_visitor *v, int grf_count,
483
                            instruction_scheduler_mode mode);
484
   void calculate_deps();
485
   bool is_compressed(fs_inst *inst);
486
   schedule_node *choose_instruction_to_schedule();
487
   int issue_time(backend_instruction *inst);
488
   fs_visitor *v;
489
 
490
   void count_remaining_grf_uses(backend_instruction *inst);
491
   void update_register_pressure(backend_instruction *inst);
492
   int get_register_pressure_benefit(backend_instruction *inst);
493
};
494
 
495
fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
496
                                                   int grf_count,
497
                                                   instruction_scheduler_mode mode)
498
   : instruction_scheduler(v, grf_count, mode),
499
     v(v)
500
{
501
}
502
 
503
void
504
fs_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be)
505
{
506
   fs_inst *inst = (fs_inst *)be;
507
 
508
   if (!remaining_grf_uses)
509
      return;
510
 
511
   if (inst->dst.file == GRF)
512
      remaining_grf_uses[inst->dst.reg]++;
513
 
514
   for (int i = 0; i < inst->sources; i++) {
515
      if (inst->src[i].file != GRF)
516
         continue;
517
 
518
      remaining_grf_uses[inst->src[i].reg]++;
519
   }
520
}
521
 
522
void
523
fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
524
{
525
   fs_inst *inst = (fs_inst *)be;
526
 
527
   if (!remaining_grf_uses)
528
      return;
529
 
530
   if (inst->dst.file == GRF) {
531
      remaining_grf_uses[inst->dst.reg]--;
532
      grf_active[inst->dst.reg] = true;
533
   }
534
 
535
   for (int i = 0; i < inst->sources; i++) {
536
      if (inst->src[i].file == GRF) {
537
         remaining_grf_uses[inst->src[i].reg]--;
538
         grf_active[inst->src[i].reg] = true;
539
      }
540
   }
541
}
542
 
543
int
544
fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
545
{
546
   fs_inst *inst = (fs_inst *)be;
547
   int benefit = 0;
548
 
549
   if (inst->dst.file == GRF) {
550
      if (remaining_grf_uses[inst->dst.reg] == 1)
551
         benefit += v->alloc.sizes[inst->dst.reg];
552
      if (!grf_active[inst->dst.reg])
553
         benefit -= v->alloc.sizes[inst->dst.reg];
554
   }
555
 
556
   for (int i = 0; i < inst->sources; i++) {
557
      if (inst->src[i].file != GRF)
558
         continue;
559
 
560
      if (remaining_grf_uses[inst->src[i].reg] == 1)
561
         benefit += v->alloc.sizes[inst->src[i].reg];
562
      if (!grf_active[inst->src[i].reg])
563
         benefit -= v->alloc.sizes[inst->src[i].reg];
564
   }
565
 
566
   return benefit;
567
}
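
/* Worked example of the benefit metric above, with hypothetical sizes: an
 * instruction whose two GRF sources (sizes 2 and 4) are each at their last
 * remaining use, and whose GRF destination (size 4) is not yet active, scores
 * +2 + 4 - 4 = +2, i.e. scheduling it should free two registers on net.
 */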
568
 
569
class vec4_instruction_scheduler : public instruction_scheduler
570
{
571
public:
572
   vec4_instruction_scheduler(vec4_visitor *v, int grf_count);
573
   void calculate_deps();
574
   schedule_node *choose_instruction_to_schedule();
575
   int issue_time(backend_instruction *inst);
576
   vec4_visitor *v;
577
 
578
   void count_remaining_grf_uses(backend_instruction *inst);
579
   void update_register_pressure(backend_instruction *inst);
580
   int get_register_pressure_benefit(backend_instruction *inst);
581
};
582
 
583
vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
584
                                                       int grf_count)
585
   : instruction_scheduler(v, grf_count, SCHEDULE_POST),
586
     v(v)
587
{
588
}
589
 
590
void
591
vec4_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be)
592
{
593
}
594
 
595
void
596
vec4_instruction_scheduler::update_register_pressure(backend_instruction *be)
597
{
598
}
599
 
600
int
601
vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
602
{
603
   return 0;
604
}
605
 
606
schedule_node::schedule_node(backend_instruction *inst,
607
                             instruction_scheduler *sched)
608
{
609
   const struct brw_device_info *devinfo = sched->bv->devinfo;
610
 
611
   this->inst = inst;
612
   this->child_array_size = 0;
613
   this->children = NULL;
614
   this->child_latency = NULL;
615
   this->child_count = 0;
616
   this->parent_count = 0;
617
   this->unblocked_time = 0;
618
   this->cand_generation = 0;
619
   this->delay = 0;
620
 
621
   /* We can't measure Gen6 timings directly but expect them to be much
622
    * closer to Gen7 than Gen4.
623
    */
624
   if (!sched->post_reg_alloc)
625
      this->latency = 1;
626
   else if (devinfo->gen >= 6)
627
      set_latency_gen7(devinfo->is_haswell);
628
   else
629
      set_latency_gen4();
630
}
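
/* Note that before register allocation every node's latency is forced to 1
 * above, so the pre-RA scheduling modes effectively ignore the latency model
 * and optimize for register pressure (with dependency-chain depth as a
 * tie-breaker) instead.
 */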
631
 
632
void
633
instruction_scheduler::add_insts_from_block(bblock_t *block)
634
{
635
   /* Removing the last instruction from a basic block removes the block as
636
    * well, so put a NOP at the end to keep it alive.
637
    */
638
   if (!block->end()->is_control_flow()) {
639
      backend_instruction *nop = new(mem_ctx) backend_instruction();
640
      nop->opcode = BRW_OPCODE_NOP;
641
      block->end()->insert_after(block, nop);
642
   }
643
 
644
   foreach_inst_in_block_safe(backend_instruction, inst, block) {
645
      if (inst->opcode == BRW_OPCODE_NOP || inst->is_control_flow())
646
         continue;
647
 
648
      schedule_node *n = new(mem_ctx) schedule_node(inst, this);
649
 
650
      this->instructions_to_schedule++;
651
 
652
      inst->remove(block);
653
      instructions.push_tail(n);
654
   }
655
}
656
 
657
/** Recursive computation of the delay member of a node. */
658
void
659
instruction_scheduler::compute_delay(schedule_node *n)
660
{
661
   if (!n->child_count) {
662
      n->delay = issue_time(n->inst);
663
   } else {
664
      for (int i = 0; i < n->child_count; i++) {
665
         if (!n->children[i]->delay)
666
            compute_delay(n->children[i]);
667
         n->delay = MAX2(n->delay, n->latency + n->children[i]->delay);
668
      }
669
   }
670
}
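
/* The delay computed here is the critical-path length, in modeled cycles,
 * from this node to the end of its dependency chain within the block.
 * choose_instruction_to_schedule() uses it as a tie-breaker so that long
 * dependency chains get started early.
 */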
671
 
672
/**
673
 * Add a dependency between two instruction nodes.
674
 *
675
 * The @after node will be scheduled after @before.  We will try to
676
 * schedule it @latency cycles after @before, but no guarantees there.
677
 */
678
void
679
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
680
                               int latency)
681
{
682
   if (!before || !after)
683
      return;
684
 
685
   assert(before != after);
686
 
687
   for (int i = 0; i < before->child_count; i++) {
688
      if (before->children[i] == after) {
689
         before->child_latency[i] = MAX2(before->child_latency[i], latency);
690
         return;
691
      }
692
   }
693
 
694
   if (before->child_array_size <= before->child_count) {
695
      if (before->child_array_size < 16)
696
         before->child_array_size = 16;
697
      else
698
         before->child_array_size *= 2;
699
 
700
      before->children = reralloc(mem_ctx, before->children,
701
                                  schedule_node *,
702
                                  before->child_array_size);
703
      before->child_latency = reralloc(mem_ctx, before->child_latency,
704
                                       int, before->child_array_size);
705
   }
706
 
707
   before->children[before->child_count] = after;
708
   before->child_latency[before->child_count] = latency;
709
   before->child_count++;
710
   after->parent_count++;
711
}
712
 
713
void
714
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
715
{
716
   if (!before)
717
      return;
718
 
719
   add_dep(before, after, before->latency);
720
}
721
 
722
/**
723
 * Sometimes we really want this node to execute after everything that
724
 * was before it and before everything that followed it.  This adds
725
 * the deps to do so.
726
 */
727
void
728
instruction_scheduler::add_barrier_deps(schedule_node *n)
729
{
730
   schedule_node *prev = (schedule_node *)n->prev;
731
   schedule_node *next = (schedule_node *)n->next;
732
 
733
   if (prev) {
734
      while (!prev->is_head_sentinel()) {
735
         add_dep(prev, n, 0);
736
         prev = (schedule_node *)prev->prev;
737
      }
738
   }
739
 
740
   if (next) {
741
      while (!next->is_tail_sentinel()) {
742
         add_dep(n, next, 0);
743
         next = (schedule_node *)next->next;
744
      }
745
   }
746
}
747
 
748
/* instruction scheduling needs to be aware of when an MRF write
749
 * actually writes 2 MRFs.
750
 */
751
bool
752
fs_instruction_scheduler::is_compressed(fs_inst *inst)
753
{
754
   return inst->exec_size == 16;
755
}
756
 
757
void
758
fs_instruction_scheduler::calculate_deps()
759
{
760
   /* Pre-register-allocation, this tracks the last write per VGRF offset.
761
    * After register allocation, reg_offsets are gone and we track individual
762
    * GRF registers.
763
    */
764
   schedule_node *last_grf_write[grf_count * 16];
765
   schedule_node *last_mrf_write[BRW_MAX_MRF];
766
   schedule_node *last_conditional_mod[2] = { NULL, NULL };
767
   schedule_node *last_accumulator_write = NULL;
768
   /* Fixed HW registers are assumed to be separate from the virtual
769
    * GRFs, so they can be tracked separately.  We don't really write
770
    * to fixed GRFs much, so don't bother tracking them on a more
771
    * granular level.
772
    */
773
   schedule_node *last_fixed_grf_write = NULL;
774
   int reg_width = v->dispatch_width / 8;
775
 
776
   /* The last instruction always needs to still be the last
777
    * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
778
    * WHILE) and scheduling other things after it would disturb the
779
    * basic block, or it's FB_WRITE and we should do a better job at
780
    * dead code elimination anyway.
781
    */
782
   schedule_node *last = (schedule_node *)instructions.get_tail();
783
   add_barrier_deps(last);
784
 
785
   memset(last_grf_write, 0, sizeof(last_grf_write));
786
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
787
 
788
   /* top-to-bottom dependencies: RAW and WAW. */
789
   foreach_in_list(schedule_node, n, &instructions) {
790
      fs_inst *inst = (fs_inst *)n->inst;
791
 
792
      if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
793
         inst->has_side_effects())
794
         add_barrier_deps(n);
795
 
796
      /* read-after-write deps. */
797
      for (int i = 0; i < inst->sources; i++) {
798
         if (inst->src[i].file == GRF) {
799
            if (post_reg_alloc) {
800
               for (int r = 0; r < inst->regs_read(i); r++)
801
                  add_dep(last_grf_write[inst->src[i].reg + r], n);
802
            } else {
803
               for (int r = 0; r < inst->regs_read(i); r++) {
804
                  add_dep(last_grf_write[inst->src[i].reg * 16 + inst->src[i].reg_offset + r], n);
805
               }
806
            }
807
         } else if (inst->src[i].file == HW_REG &&
808
                    (inst->src[i].fixed_hw_reg.file ==
809
                     BRW_GENERAL_REGISTER_FILE)) {
810
            if (post_reg_alloc) {
811
               int size = reg_width;
812
               if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
813
                  size = 1;
814
               for (int r = 0; r < size; r++)
815
                  add_dep(last_grf_write[inst->src[i].fixed_hw_reg.nr + r], n);
816
            } else {
817
               add_dep(last_fixed_grf_write, n);
818
            }
819
         } else if (inst->src[i].is_accumulator()) {
820
            add_dep(last_accumulator_write, n);
821
         } else if (inst->src[i].file != BAD_FILE &&
822
                    inst->src[i].file != IMM &&
823
                    inst->src[i].file != UNIFORM &&
824
                    (inst->src[i].file != HW_REG ||
825
                     inst->src[i].fixed_hw_reg.file != IMM)) {
826
            assert(inst->src[i].file != MRF);
827
            add_barrier_deps(n);
828
         }
829
      }
830
 
831
      if (inst->base_mrf != -1) {
832
         for (int i = 0; i < inst->mlen; i++) {
833
            /* It looks like the MRF regs are released in the send
834
             * instruction once it's sent, not when the result comes
835
             * back.
836
             */
837
            add_dep(last_mrf_write[inst->base_mrf + i], n);
838
         }
839
      }
840
 
841
      if (inst->reads_flag()) {
842
         add_dep(last_conditional_mod[inst->flag_subreg], n);
843
      }
844
 
845
      if (inst->reads_accumulator_implicitly()) {
846
         add_dep(last_accumulator_write, n);
847
      }
848
 
849
      /* write-after-write deps. */
850
      if (inst->dst.file == GRF) {
851
         if (post_reg_alloc) {
852
            for (int r = 0; r < inst->regs_written; r++) {
853
               add_dep(last_grf_write[inst->dst.reg + r], n);
854
               last_grf_write[inst->dst.reg + r] = n;
855
            }
856
         } else {
857
            for (int r = 0; r < inst->regs_written; r++) {
858
               add_dep(last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r], n);
859
               last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r] = n;
860
            }
861
         }
862
      } else if (inst->dst.file == MRF) {
863
         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
864
 
865
         add_dep(last_mrf_write[reg], n);
866
         last_mrf_write[reg] = n;
867
         if (is_compressed(inst)) {
868
            if (inst->dst.reg & BRW_MRF_COMPR4)
869
               reg += 4;
870
            else
871
               reg++;
872
            add_dep(last_mrf_write[reg], n);
873
            last_mrf_write[reg] = n;
874
         }
875
      } else if (inst->dst.file == HW_REG &&
876
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
877
         if (post_reg_alloc) {
878
            for (int r = 0; r < reg_width; r++)
879
               last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
880
         } else {
881
            last_fixed_grf_write = n;
882
         }
883
      } else if (inst->dst.is_accumulator()) {
884
         add_dep(last_accumulator_write, n);
885
         last_accumulator_write = n;
886
      } else if (inst->dst.file != BAD_FILE &&
887
                 !inst->dst.is_null()) {
888
         add_barrier_deps(n);
889
      }
890
 
891
      if (inst->mlen > 0 && inst->base_mrf != -1) {
892
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
893
            add_dep(last_mrf_write[inst->base_mrf + i], n);
894
            last_mrf_write[inst->base_mrf + i] = n;
895
         }
896
      }
897
 
898
      if (inst->writes_flag()) {
899
         add_dep(last_conditional_mod[inst->flag_subreg], n, 0);
900
         last_conditional_mod[inst->flag_subreg] = n;
901
      }
902
 
903
      if (inst->writes_accumulator_implicitly(v->devinfo) &&
904
          !inst->dst.is_accumulator()) {
905
         add_dep(last_accumulator_write, n);
906
         last_accumulator_write = n;
907
      }
908
   }
909
 
910
   /* bottom-to-top dependencies: WAR */
911
   memset(last_grf_write, 0, sizeof(last_grf_write));
912
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
913
   memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
914
   last_accumulator_write = NULL;
915
   last_fixed_grf_write = NULL;
916
 
917
   exec_node *node;
918
   exec_node *prev;
919
   for (node = instructions.get_tail(), prev = node->prev;
920
        !node->is_head_sentinel();
921
        node = prev, prev = node->prev) {
922
      schedule_node *n = (schedule_node *)node;
923
      fs_inst *inst = (fs_inst *)n->inst;
924
 
925
      /* write-after-read deps. */
926
      for (int i = 0; i < inst->sources; i++) {
927
         if (inst->src[i].file == GRF) {
928
            if (post_reg_alloc) {
929
               for (int r = 0; r < inst->regs_read(i); r++)
930
                  add_dep(n, last_grf_write[inst->src[i].reg + r]);
931
            } else {
932
               for (int r = 0; r < inst->regs_read(i); r++) {
933
                  add_dep(n, last_grf_write[inst->src[i].reg * 16 + inst->src[i].reg_offset + r]);
934
               }
935
            }
936
         } else if (inst->src[i].file == HW_REG &&
937
                    (inst->src[i].fixed_hw_reg.file ==
938
                     BRW_GENERAL_REGISTER_FILE)) {
939
            if (post_reg_alloc) {
940
               int size = reg_width;
941
               if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
942
                  size = 1;
943
               for (int r = 0; r < size; r++)
944
                  add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r]);
945
            } else {
946
               add_dep(n, last_fixed_grf_write);
947
            }
948
         } else if (inst->src[i].is_accumulator()) {
949
            add_dep(n, last_accumulator_write);
950
         } else if (inst->src[i].file != BAD_FILE &&
951
                    inst->src[i].file != IMM &&
952
                    inst->src[i].file != UNIFORM &&
953
                    (inst->src[i].file != HW_REG ||
954
                     inst->src[i].fixed_hw_reg.file != IMM)) {
955
            assert(inst->src[i].file != MRF);
956
            add_barrier_deps(n);
957
         }
958
      }
959
 
960
      if (inst->base_mrf != -1) {
961
         for (int i = 0; i < inst->mlen; i++) {
962
            /* It looks like the MRF regs are released in the send
963
             * instruction once it's sent, not when the result comes
964
             * back.
965
             */
966
            add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
967
         }
968
      }
969
 
970
      if (inst->reads_flag()) {
971
         add_dep(n, last_conditional_mod[inst->flag_subreg]);
972
      }
973
 
974
      if (inst->reads_accumulator_implicitly()) {
975
         add_dep(n, last_accumulator_write);
976
      }
977
 
978
      /* Update the things this instruction wrote, so earlier reads
979
       * can mark this as WAR dependency.
980
       */
981
      if (inst->dst.file == GRF) {
982
         if (post_reg_alloc) {
983
            for (int r = 0; r < inst->regs_written; r++)
984
               last_grf_write[inst->dst.reg + r] = n;
985
         } else {
986
            for (int r = 0; r < inst->regs_written; r++) {
987
               last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r] = n;
988
            }
989
         }
990
      } else if (inst->dst.file == MRF) {
991
         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
992
 
993
         last_mrf_write[reg] = n;
994
 
995
         if (is_compressed(inst)) {
996
            if (inst->dst.reg & BRW_MRF_COMPR4)
997
               reg += 4;
998
            else
999
               reg++;
1000
 
1001
            last_mrf_write[reg] = n;
1002
         }
1003
      } else if (inst->dst.file == HW_REG &&
1004
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
1005
         if (post_reg_alloc) {
1006
            for (int r = 0; r < reg_width; r++)
1007
               last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
1008
         } else {
1009
            last_fixed_grf_write = n;
1010
         }
1011
      } else if (inst->dst.is_accumulator()) {
1012
         last_accumulator_write = n;
1013
      } else if (inst->dst.file != BAD_FILE &&
1014
                 !inst->dst.is_null()) {
1015
         add_barrier_deps(n);
1016
      }
1017
 
1018
      if (inst->mlen > 0 && inst->base_mrf != -1) {
1019
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
1020
            last_mrf_write[inst->base_mrf + i] = n;
1021
         }
1022
      }
1023
 
1024
      if (inst->writes_flag()) {
1025
         last_conditional_mod[inst->flag_subreg] = n;
1026
      }
1027
 
1028
      if (inst->writes_accumulator_implicitly(v->devinfo)) {
1029
         last_accumulator_write = n;
1030
      }
1031
   }
1032
}
1033
 
1034
void
1035
vec4_instruction_scheduler::calculate_deps()
1036
{
1037
   schedule_node *last_grf_write[grf_count];
1038
   schedule_node *last_mrf_write[BRW_MAX_MRF];
1039
   schedule_node *last_conditional_mod = NULL;
1040
   schedule_node *last_accumulator_write = NULL;
1041
   /* Fixed HW registers are assumed to be separate from the virtual
1042
    * GRFs, so they can be tracked separately.  We don't really write
1043
    * to fixed GRFs much, so don't bother tracking them on a more
1044
    * granular level.
1045
    */
1046
   schedule_node *last_fixed_grf_write = NULL;
1047
 
1048
   /* The last instruction always needs to still be the last instruction.
1049
    * Either it's flow control (IF, ELSE, ENDIF, DO, WHILE) and scheduling
1050
    * other things after it would disturb the basic block, or it's the EOT
1051
    * URB_WRITE and we should do a better job at dead code eliminating
1052
    * anything that could have been scheduled after it.
1053
    */
1054
   schedule_node *last = (schedule_node *)instructions.get_tail();
1055
   add_barrier_deps(last);
1056
 
1057
   memset(last_grf_write, 0, sizeof(last_grf_write));
1058
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
1059
 
1060
   /* top-to-bottom dependencies: RAW and WAW. */
1061
   foreach_in_list(schedule_node, n, &instructions) {
1062
      vec4_instruction *inst = (vec4_instruction *)n->inst;
1063
 
1064
      if (inst->has_side_effects())
1065
         add_barrier_deps(n);
1066
 
1067
      /* read-after-write deps. */
1068
      for (int i = 0; i < 3; i++) {
1069
         if (inst->src[i].file == GRF) {
1070
            for (unsigned j = 0; j < inst->regs_read(i); ++j)
1071
               add_dep(last_grf_write[inst->src[i].reg + j], n);
1072
         } else if (inst->src[i].file == HW_REG &&
1073
                    (inst->src[i].fixed_hw_reg.file ==
1074
                     BRW_GENERAL_REGISTER_FILE)) {
1075
            add_dep(last_fixed_grf_write, n);
1076
         } else if (inst->src[i].is_accumulator()) {
1077
            assert(last_accumulator_write);
1078
            add_dep(last_accumulator_write, n);
1079
         } else if (inst->src[i].file != BAD_FILE &&
1080
                    inst->src[i].file != IMM &&
1081
                    inst->src[i].file != UNIFORM &&
1082
                    (inst->src[i].file != HW_REG ||
1083
                     inst->src[i].fixed_hw_reg.file != IMM)) {
1084
            /* No reads from MRF, and ATTR is already translated away */
1085
            assert(inst->src[i].file != MRF &&
1086
                   inst->src[i].file != ATTR);
1087
            add_barrier_deps(n);
1088
         }
1089
      }
1090
 
1091
      if (!inst->is_send_from_grf()) {
1092
         for (int i = 0; i < inst->mlen; i++) {
1093
            /* It looks like the MRF regs are released in the send
1094
             * instruction once it's sent, not when the result comes
1095
             * back.
1096
             */
1097
            add_dep(last_mrf_write[inst->base_mrf + i], n);
1098
         }
1099
      }
1100
 
1101
      if (inst->reads_flag()) {
1102
         assert(last_conditional_mod);
1103
         add_dep(last_conditional_mod, n);
1104
      }
1105
 
1106
      if (inst->reads_accumulator_implicitly()) {
1107
         assert(last_accumulator_write);
1108
         add_dep(last_accumulator_write, n);
1109
      }
1110
 
1111
      /* write-after-write deps. */
1112
      if (inst->dst.file == GRF) {
1113
         for (unsigned j = 0; j < inst->regs_written; ++j) {
1114
            add_dep(last_grf_write[inst->dst.reg + j], n);
1115
            last_grf_write[inst->dst.reg + j] = n;
1116
         }
1117
      } else if (inst->dst.file == MRF) {
1118
         add_dep(last_mrf_write[inst->dst.reg], n);
1119
         last_mrf_write[inst->dst.reg] = n;
1120
      } else if (inst->dst.file == HW_REG &&
1121
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
1122
         last_fixed_grf_write = n;
1123
      } else if (inst->dst.is_accumulator()) {
1124
         add_dep(last_accumulator_write, n);
1125
         last_accumulator_write = n;
1126
      } else if (inst->dst.file != BAD_FILE &&
1127
                 !inst->dst.is_null()) {
1128
         add_barrier_deps(n);
1129
      }
1130
 
1131
      if (inst->mlen > 0 && !inst->is_send_from_grf()) {
1132
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
1133
            add_dep(last_mrf_write[inst->base_mrf + i], n);
1134
            last_mrf_write[inst->base_mrf + i] = n;
1135
         }
1136
      }
1137
 
1138
      if (inst->writes_flag()) {
1139
         add_dep(last_conditional_mod, n, 0);
1140
         last_conditional_mod = n;
1141
      }
1142
 
1143
      if (inst->writes_accumulator_implicitly(v->devinfo) &&
1144
          !inst->dst.is_accumulator()) {
1145
         add_dep(last_accumulator_write, n);
1146
         last_accumulator_write = n;
1147
      }
1148
   }
1149
 
1150
   /* bottom-to-top dependencies: WAR */
1151
   memset(last_grf_write, 0, sizeof(last_grf_write));
1152
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
1153
   last_conditional_mod = NULL;
1154
   last_accumulator_write = NULL;
1155
   last_fixed_grf_write = NULL;
1156
 
1157
   exec_node *node;
1158
   exec_node *prev;
1159
   for (node = instructions.get_tail(), prev = node->prev;
1160
        !node->is_head_sentinel();
1161
        node = prev, prev = node->prev) {
1162
      schedule_node *n = (schedule_node *)node;
1163
      vec4_instruction *inst = (vec4_instruction *)n->inst;
1164
 
1165
      /* write-after-read deps. */
1166
      for (int i = 0; i < 3; i++) {
1167
         if (inst->src[i].file == GRF) {
1168
            for (unsigned j = 0; j < inst->regs_read(i); ++j)
1169
               add_dep(n, last_grf_write[inst->src[i].reg + j]);
1170
         } else if (inst->src[i].file == HW_REG &&
1171
                    (inst->src[i].fixed_hw_reg.file ==
1172
                     BRW_GENERAL_REGISTER_FILE)) {
1173
            add_dep(n, last_fixed_grf_write);
1174
         } else if (inst->src[i].is_accumulator()) {
1175
            add_dep(n, last_accumulator_write);
1176
         } else if (inst->src[i].file != BAD_FILE &&
1177
                    inst->src[i].file != IMM &&
1178
                    inst->src[i].file != UNIFORM &&
1179
                    (inst->src[i].file != HW_REG ||
1180
                     inst->src[i].fixed_hw_reg.file != IMM)) {
1181
            assert(inst->src[i].file != MRF &&
1182
                   inst->src[i].file != ATTR);
1183
            add_barrier_deps(n);
1184
         }
1185
      }
1186
 
1187
      if (!inst->is_send_from_grf()) {
1188
         for (int i = 0; i < inst->mlen; i++) {
1189
            /* It looks like the MRF regs are released in the send
1190
             * instruction once it's sent, not when the result comes
1191
             * back.
1192
             */
1193
            add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
1194
         }
1195
      }
1196
 
1197
      if (inst->reads_flag()) {
1198
         add_dep(n, last_conditional_mod);
1199
      }
1200
 
1201
      if (inst->reads_accumulator_implicitly()) {
1202
         add_dep(n, last_accumulator_write);
1203
      }
1204
 
1205
      /* Update the things this instruction wrote, so earlier reads
1206
       * can mark this as WAR dependency.
1207
       */
1208
      if (inst->dst.file == GRF) {
1209
         for (unsigned j = 0; j < inst->regs_written; ++j)
1210
            last_grf_write[inst->dst.reg + j] = n;
1211
      } else if (inst->dst.file == MRF) {
1212
         last_mrf_write[inst->dst.reg] = n;
1213
      } else if (inst->dst.file == HW_REG &&
1214
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
1215
         last_fixed_grf_write = n;
1216
      } else if (inst->dst.is_accumulator()) {
1217
         last_accumulator_write = n;
1218
      } else if (inst->dst.file != BAD_FILE &&
1219
                 !inst->dst.is_null()) {
1220
         add_barrier_deps(n);
1221
      }
1222
 
1223
      if (inst->mlen > 0 && !inst->is_send_from_grf()) {
1224
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
1225
            last_mrf_write[inst->base_mrf + i] = n;
1226
         }
1227
      }
1228
 
1229
      if (inst->writes_flag()) {
1230
         last_conditional_mod = n;
1231
      }
1232
 
1233
      if (inst->writes_accumulator_implicitly(v->devinfo)) {
1234
         last_accumulator_write = n;
1235
      }
1236
   }
1237
}
1238
 
1239
schedule_node *
1240
fs_instruction_scheduler::choose_instruction_to_schedule()
1241
{
1242
   schedule_node *chosen = NULL;
1243
 
1244
   if (mode == SCHEDULE_PRE || mode == SCHEDULE_POST) {
1245
      int chosen_time = 0;
1246
 
1247
      /* Of the instructions ready to execute or the closest to
1248
       * being ready, choose the oldest one.
1249
       */
1250
      foreach_in_list(schedule_node, n, &instructions) {
1251
         if (!chosen || n->unblocked_time < chosen_time) {
1252
            chosen = n;
1253
            chosen_time = n->unblocked_time;
1254
         }
1255
      }
1256
   } else {
1257
      /* Before register allocation, we don't care about the latencies of
1258
       * instructions.  All we care about is reducing live intervals of
1259
       * variables so that we can avoid register spilling, or get SIMD16
1260
       * shaders which naturally do a better job of hiding instruction
1261
       * latency.
1262
       */
1263
      foreach_in_list(schedule_node, n, &instructions) {
1264
         fs_inst *inst = (fs_inst *)n->inst;
1265
 
1266
         if (!chosen) {
1267
            chosen = n;
1268
            continue;
1269
         }
1270
 
1271
         /* Most important: If we can definitely reduce register pressure, do
1272
          * so immediately.
1273
          */
1274
         int register_pressure_benefit = get_register_pressure_benefit(n->inst);
1275
         int chosen_register_pressure_benefit =
1276
            get_register_pressure_benefit(chosen->inst);
1277
 
1278
         if (register_pressure_benefit > 0 &&
1279
             register_pressure_benefit > chosen_register_pressure_benefit) {
1280
            chosen = n;
1281
            continue;
1282
         } else if (chosen_register_pressure_benefit > 0 &&
1283
                    (register_pressure_benefit <
1284
                     chosen_register_pressure_benefit)) {
1285
            continue;
1286
         }
1287
 
1288
         if (mode == SCHEDULE_PRE_LIFO) {
1289
            /* Prefer instructions that recently became available for
1290
             * scheduling.  These are the things that are most likely to
1291
             * (eventually) make a variable dead and reduce register pressure.
1292
             * Typical register pressure estimates don't work for us because
1293
             * most of our pressure comes from texturing, where no single
1294
             * instruction to schedule will make a vec4 value dead.
1295
             */
1296
            if (n->cand_generation > chosen->cand_generation) {
1297
               chosen = n;
1298
               continue;
1299
            } else if (n->cand_generation < chosen->cand_generation) {
1300
               continue;
1301
            }
1302
 
1303
            /* On MRF-using chips, prefer non-SEND instructions.  If we don't
1304
             * do this, then because we prefer instructions that just became
1305
             * candidates, we'll end up in a pattern of scheduling a SEND,
1306
             * then the MRFs for the next SEND, then the next SEND, then the
1307
             * MRFs, etc., without ever consuming the results of a send.
1308
             */
1309
            if (v->devinfo->gen < 7) {
1310
               fs_inst *chosen_inst = (fs_inst *)chosen->inst;
1311
 
1312
               /* We use regs_written > 1 as our test for the kind of send
1313
                * instruction to avoid -- only sends generate many regs, and a
1314
                * single-result send is probably actually reducing register
1315
                * pressure.
1316
                */
1317
               if (inst->regs_written <= inst->dst.width / 8 &&
1318
                   chosen_inst->regs_written > chosen_inst->dst.width / 8) {
1319
                  chosen = n;
1320
                  continue;
1321
               } else if (inst->regs_written > chosen_inst->regs_written) {
1322
                  continue;
1323
               }
1324
            }
1325
         }
1326
 
1327
         /* For instructions pushed on the cands list at the same time, prefer
1328
          * the one with the highest delay to the end of the program.  This is
1329
          * most likely to have its values able to be consumed first (such as
1330
          * for a large tree of lowered ubo loads, which appear reversed in
1331
          * the instruction stream with respect to when they can be consumed).
1332
          */
1333
         if (n->delay > chosen->delay) {
1334
            chosen = n;
1335
            continue;
1336
         } else if (n->delay < chosen->delay) {
1337
            continue;
1338
         }
1339
 
1340
         /* If all other metrics are equal, we prefer the first instruction in
1341
          * the list (program execution).
1342
          */
1343
      }
1344
   }
1345
 
1346
   return chosen;
1347
}
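
/* To summarize the pre-RA branch above: candidates are compared by (1) larger
 * register-pressure benefit, then in SCHEDULE_PRE_LIFO by (2) newer
 * cand_generation and, on MRF-using (pre-Gen7) hardware, (3) a preference for
 * non-SEND instructions over multi-register SENDs, then by (4) larger delay to
 * the end of the program, and finally by (5) original program order.
 */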
1348
 
1349
schedule_node *
1350
vec4_instruction_scheduler::choose_instruction_to_schedule()
1351
{
1352
   schedule_node *chosen = NULL;
1353
   int chosen_time = 0;
1354
 
1355
   /* Of the instructions ready to execute or the closest to being ready,
1356
    * choose the oldest one.
1357
    */
1358
   foreach_in_list(schedule_node, n, &instructions) {
1359
      if (!chosen || n->unblocked_time < chosen_time) {
1360
         chosen = n;
1361
         chosen_time = n->unblocked_time;
1362
      }
1363
   }
1364
 
1365
   return chosen;
1366
}
1367
 
1368
int
1369
fs_instruction_scheduler::issue_time(backend_instruction *inst)
1370
{
1371
   if (is_compressed((fs_inst *)inst))
1372
      return 4;
1373
   else
1374
      return 2;
1375
}
1376
 
1377
int
1378
vec4_instruction_scheduler::issue_time(backend_instruction *inst)
1379
{
1380
   /* We always execute as two vec4s in parallel. */
1381
   return 2;
1382
}
1383
 
1384
void
1385
instruction_scheduler::schedule_instructions(bblock_t *block)
1386
{
1387
   const struct brw_device_info *devinfo = bv->devinfo;
1388
   backend_instruction *inst = block->end();
1389
   time = 0;
1390
 
1391
   /* Remove non-DAG heads from the list. */
1392
   foreach_in_list_safe(schedule_node, n, &instructions) {
1393
      if (n->parent_count != 0)
1394
         n->remove();
1395
   }
1396
 
1397
   unsigned cand_generation = 1;
1398
   while (!instructions.is_empty()) {
1399
      schedule_node *chosen = choose_instruction_to_schedule();
1400
 
1401
      /* Schedule this instruction. */
1402
      assert(chosen);
1403
      chosen->remove();
1404
      inst->insert_before(block, chosen->inst);
1405
      instructions_to_schedule--;
1406
      update_register_pressure(chosen->inst);
1407
 
1408
      /* Update the clock for how soon an instruction could start after the
1409
       * chosen one.
1410
       */
1411
      time += issue_time(chosen->inst);
1412
 
1413
      /* If we expected a delay for scheduling, then bump the clock to reflect
1414
       * that as well.  In reality, the hardware will switch to another
1415
       * hyperthread and may not return to dispatching our thread for a while
1416
       * even after we're unblocked.
1417
       */
1418
      time = MAX2(time, chosen->unblocked_time);
1419
 
1420
      if (debug) {
1421
         fprintf(stderr, "clock %4d, scheduled: ", time);
1422
         bv->dump_instruction(chosen->inst);
1423
      }
1424
 
1425
      /* Now that we've scheduled a new instruction, some of its
1426
       * children can be promoted to the list of instructions ready to
1427
       * be scheduled.  Update the children's unblocked time for this
1428
       * DAG edge as we do so.
1429
       */
1430
      for (int i = chosen->child_count - 1; i >= 0; i--) {
1431
         schedule_node *child = chosen->children[i];
1432
 
1433
         child->unblocked_time = MAX2(child->unblocked_time,
1434
                                      time + chosen->child_latency[i]);
1435
 
1436
         if (debug) {
1437
            fprintf(stderr, "\tchild %d, %d parents: ", i, child->parent_count);
1438
            bv->dump_instruction(child->inst);
1439
         }
1440
 
1441
         child->cand_generation = cand_generation;
1442
         child->parent_count--;
1443
         if (child->parent_count == 0) {
1444
            if (debug) {
1445
               fprintf(stderr, "\t\tnow available\n");
1446
            }
1447
            instructions.push_head(child);
1448
         }
1449
      }
1450
      cand_generation++;
1451
 
1452
      /* Shared resource: the mathbox.  There's one mathbox per EU on Gen6+
1453
       * but it's more limited pre-gen6, so if we send something off to it then
1454
       * the next math instruction isn't going to make progress until the first
1455
       * is done.
1456
       */
1457
      if (devinfo->gen < 6 && chosen->inst->is_math()) {
1458
         foreach_in_list(schedule_node, n, &instructions) {
1459
            if (n->inst->is_math())
1460
               n->unblocked_time = MAX2(n->unblocked_time,
1461
                                        time + chosen->latency);
1462
         }
1463
      }
1464
   }
1465
 
1466
   if (block->end()->opcode == BRW_OPCODE_NOP)
1467
      block->end()->remove(block);
1468
   assert(instructions_to_schedule == 0);
1469
}
1470
 
1471
void
1472
instruction_scheduler::run(cfg_t *cfg)
1473
{
1474
   if (debug) {
1475
      fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n",
1476
              post_reg_alloc);
1477
      bv->dump_instructions();
1478
   }
1479
 
1480
   /* Populate the remaining GRF uses array to improve the pre-regalloc
1481
    * scheduling.
1482
    */
1483
   if (remaining_grf_uses) {
1484
      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
1485
         count_remaining_grf_uses(inst);
1486
      }
1487
   }
1488
 
1489
   foreach_block(block, cfg) {
1490
      if (block->end_ip - block->start_ip <= 1)
1491
         continue;
1492
 
1493
      add_insts_from_block(block);
1494
 
1495
      calculate_deps();
1496
 
1497
      foreach_in_list(schedule_node, n, &instructions) {
1498
         compute_delay(n);
1499
      }
1500
 
1501
      schedule_instructions(block);
1502
   }
1503
 
1504
   if (debug) {
1505
      fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n",
1506
              post_reg_alloc);
1507
      bv->dump_instructions();
1508
   }
1509
}
1510
 
1511
void
1512
fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
1513
{
1514
   int grf_count;
1515
   if (mode == SCHEDULE_POST)
1516
      grf_count = grf_used;
1517
   else
1518
      grf_count = alloc.count;
1519
 
1520
   fs_instruction_scheduler sched(this, grf_count, mode);
1521
   sched.run(cfg);
1522
 
1523
   if (unlikely(debug_enabled) && mode == SCHEDULE_POST) {
1524
      fprintf(stderr, "%s%d estimated execution time: %d cycles\n",
1525
              stage_abbrev, dispatch_width, sched.time);
1526
   }
1527
 
1528
   invalidate_live_intervals();
1529
}
1530
 
1531
void
1532
vec4_visitor::opt_schedule_instructions()
1533
{
1534
   vec4_instruction_scheduler sched(this, prog_data->total_grf);
1535
   sched.run(cfg);
1536
 
1537
   if (unlikely(debug_enabled)) {
1538
      fprintf(stderr, "%s estimated execution time: %d cycles\n",
1539
              stage_abbrev, sched.time);
1540
   }
1541
 
1542
   invalidate_live_intervals();
1543
}