  1. /*
  2.  * Copyright © 2010 Intel Corporation
  3.  *
  4.  * Permission is hereby granted, free of charge, to any person obtaining a
  5.  * copy of this software and associated documentation files (the "Software"),
  6.  * to deal in the Software without restriction, including without limitation
  7.  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8.  * and/or sell copies of the Software, and to permit persons to whom the
  9.  * Software is furnished to do so, subject to the following conditions:
  10.  *
  11.  * The above copyright notice and this permission notice (including the next
  12.  * paragraph) shall be included in all copies or substantial portions of the
  13.  * Software.
  14.  *
  15.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16.  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17.  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18.  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19.  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20.  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21.  * IN THE SOFTWARE.
  22.  *
  23.  * Authors:
  24.  *    Eric Anholt <eric@anholt.net>
  25.  *
  26.  */
  27.  
  28. #include "brw_fs.h"
  29. #include "brw_vec4.h"
  30. #include "brw_cfg.h"
  31. #include "brw_shader.h"
  32. #include "glsl/glsl_types.h"
  33. #include "glsl/ir_optimization.h"
  34.  
  35. using namespace brw;
  36.  
  37. /** @file brw_fs_schedule_instructions.cpp
  38.  *
  39.  * List scheduling of FS instructions.
  40.  *
  41.  * The basic model of the list scheduler is to take a basic block,
  42.  * compute a DAG of the dependencies (RAW ordering with latency, WAW
  43.  * ordering with latency, WAR ordering), and make a list of the DAG heads.
  44.  * Heuristically pick a DAG head, then put all the children that are
  45.  * now DAG heads into the list of things to schedule.
  46.  *
  47.  * The heuristic is the important part.  We're trying to be cheap,
  48.  * since actually computing the optimal scheduling is NP complete.
  49.  * What we do is track a "current clock".  When we schedule a node, we
  50.  * update the earliest-unblocked clock time of its children, and
  51.  * increment the clock.  Then, when trying to schedule, we just pick
  52.  * the earliest-unblocked instruction to schedule.
  53.  *
  54.  * Note that often there will be many things which could execute
  55.  * immediately, and there are a range of heuristic options to choose
  56.  * from in picking among those.
  57.  */
  58.  
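/* Illustrative sketch (not part of this file, and using a simplified node
 * type) of the greedy loop described above.  The real scheduler below keeps
 * richer per-node state and picks among DAG heads with several heuristics,
 * but the core clock / unblocked-time bookkeeping looks like this:
 *
 *    #include <algorithm>
 *    #include <vector>
 *
 *    struct toy_node {
 *       int unblocked_time;               // earliest clock it may issue at
 *       int issue_time;                   // cycles spent on the issue port
 *       int latency;                      // cycles until results are ready
 *       std::vector<toy_node *> children; // RAW/WAW/WAR successors
 *       int parent_count;                 // unscheduled predecessors left
 *    };
 *
 *    static void toy_schedule(std::vector<toy_node *> heads)
 *    {
 *       int clock = 0;
 *       while (!heads.empty()) {
 *          // Heuristic: pick the earliest-unblocked DAG head.
 *          size_t best = 0;
 *          for (size_t i = 1; i < heads.size(); i++) {
 *             if (heads[i]->unblocked_time < heads[best]->unblocked_time)
 *                best = i;
 *          }
 *          toy_node *n = heads[best];
 *          heads.erase(heads.begin() + best);
 *
 *          // "Emit" n here, then advance the clock past its issue slot(s).
 *          clock = std::max(clock, n->unblocked_time) + n->issue_time;
 *
 *          // Children whose predecessors are all scheduled become new heads.
 *          for (toy_node *c : n->children) {
 *             c->unblocked_time = std::max(c->unblocked_time,
 *                                          clock + n->latency);
 *             if (--c->parent_count == 0)
 *                heads.push_back(c);
 *          }
 *       }
 *    }
 */
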
  59. static bool debug = false;
  60.  
  61. class instruction_scheduler;
  62.  
  63. class schedule_node : public exec_node
  64. {
  65. public:
  66.    schedule_node(backend_instruction *inst, instruction_scheduler *sched);
  67.    void set_latency_gen4();
  68.    void set_latency_gen7(bool is_haswell);
  69.  
  70.    backend_instruction *inst;
  71.    schedule_node **children;
  72.    int *child_latency;
  73.    int child_count;
  74.    int parent_count;
  75.    int child_array_size;
  76.    int unblocked_time;
  77.    int latency;
  78.  
  79.    /**
  80.     * Which iteration of pushing groups of children onto the candidates list
  81.     * this node was a part of.
  82.     */
  83.    unsigned cand_generation;
  84.  
  85.    /**
  86.     * This is the sum of the instruction's latency plus the maximum delay of
  87.     * its children, or just the issue_time if it's a leaf node.
  88.     */
  89.    int delay;
  90. };
  91.  
  92. void
  93. schedule_node::set_latency_gen4()
  94. {
  95.    int chans = 8;
  96.    int math_latency = 22;
  97.  
  98.    switch (inst->opcode) {
  99.    case SHADER_OPCODE_RCP:
  100.       this->latency = 1 * chans * math_latency;
  101.       break;
  102.    case SHADER_OPCODE_RSQ:
  103.       this->latency = 2 * chans * math_latency;
  104.       break;
  105.    case SHADER_OPCODE_INT_QUOTIENT:
  106.    case SHADER_OPCODE_SQRT:
  107.    case SHADER_OPCODE_LOG2:
  108.       /* full precision log.  partial is 2. */
  109.       this->latency = 3 * chans * math_latency;
  110.       break;
  111.    case SHADER_OPCODE_INT_REMAINDER:
  112.    case SHADER_OPCODE_EXP2:
  113.       /* full precision.  partial is 3, same throughput. */
  114.       this->latency = 4 * chans * math_latency;
  115.       break;
  116.    case SHADER_OPCODE_POW:
  117.       this->latency = 8 * chans * math_latency;
  118.       break;
  119.    case SHADER_OPCODE_SIN:
  120.    case SHADER_OPCODE_COS:
  121.       /* minimum latency, max is 12 rounds. */
  122.       this->latency = 5 * chans * math_latency;
  123.       break;
  124.    default:
  125.       this->latency = 2;
  126.       break;
  127.    }
  128. }
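/* Worked example of the model above: with chans = 8 and math_latency = 22,
 * this function rates a Gen4 SHADER_OPCODE_POW at 8 * 8 * 22 = 1408 cycles
 * and an RCP at 1 * 8 * 22 = 176 cycles, while any non-math instruction
 * falls through to the default of 2.
 */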
  129.  
  130. void
  131. schedule_node::set_latency_gen7(bool is_haswell)
  132. {
  133.    switch (inst->opcode) {
  134.    case BRW_OPCODE_MAD:
  135.       /* 2 cycles
  136.        *  (since the last two src operands are in different register banks):
  137.        * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
  138.        *
  139.        * 3 cycles on IVB, 4 on HSW
  140.        *  (since the last two src operands are in the same register bank):
  141.        * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
  142.        *
  143.        * 18 cycles on IVB, 16 on HSW
  144.        *  (since the last two src operands are in different register banks):
  145.        * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
  146.        * mov(8) null   g4<4,5,1>F                     { align16 WE_normal 1Q };
  147.        *
  148.        * 20 cycles on IVB, 18 on HSW
  149.        *  (since the last two src operands are in the same register bank):
  150.        * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
  151.        * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
  152.        */
  153.  
  154.       /* Our register allocator doesn't know about register banks, so use the
  155.        * higher latency.
  156.        */
  157.       latency = is_haswell ? 16 : 18;
  158.       break;
  159.  
  160.    case BRW_OPCODE_LRP:
  161.       /* 2 cycles
  162.        *  (since the last two src operands are in different register banks):
  163.        * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
  164.        *
  165.        * 3 cycles on IVB, 4 on HSW
  166.        *  (since the last two src operands are in the same register bank):
  167.        * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
  168.        *
  169.        * 16 cycles on IVB, 14 on HSW
  170.        *  (since the last two src operands are in different register banks):
  171.        * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
  172.        * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
  173.        *
  174.        * 16 cycles
  175.        *  (since the last two src operands are in the same register bank):
  176.        * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
  177.        * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
  178.        */
  179.  
  180.       /* Our register allocator doesn't know about register banks, so use the
  181.        * higher latency.
  182.        */
  183.       latency = 14;
  184.       break;
  185.  
  186.    case SHADER_OPCODE_RCP:
  187.    case SHADER_OPCODE_RSQ:
  188.    case SHADER_OPCODE_SQRT:
  189.    case SHADER_OPCODE_LOG2:
  190.    case SHADER_OPCODE_EXP2:
  191.    case SHADER_OPCODE_SIN:
  192.    case SHADER_OPCODE_COS:
  193.       /* 2 cycles:
  194.        * math inv(8) g4<1>F g2<0,1,0>F      null       { align1 WE_normal 1Q };
  195.        *
  196.        * 18 cycles:
  197.        * math inv(8) g4<1>F g2<0,1,0>F      null       { align1 WE_normal 1Q };
  198.        * mov(8)      null   g4<8,8,1>F                 { align1 WE_normal 1Q };
  199.        *
  200.        * Same for exp2, log2, rsq, sqrt, sin, cos.
  201.        */
  202.       latency = is_haswell ? 14 : 16;
  203.       break;
  204.  
  205.    case SHADER_OPCODE_POW:
  206.       /* 2 cycles:
  207.        * math pow(8) g4<1>F g2<0,1,0>F   g2.1<0,1,0>F  { align1 WE_normal 1Q };
  208.        *
  209.        * 26 cycles:
  210.        * math pow(8) g4<1>F g2<0,1,0>F   g2.1<0,1,0>F  { align1 WE_normal 1Q };
  211.        * mov(8)      null   g4<8,8,1>F                 { align1 WE_normal 1Q };
  212.        */
  213.       latency = is_haswell ? 22 : 24;
  214.       break;
  215.  
  216.    case SHADER_OPCODE_TEX:
  217.    case SHADER_OPCODE_TXD:
  218.    case SHADER_OPCODE_TXF:
  219.    case SHADER_OPCODE_TXL:
  220.       /* 18 cycles:
  221.        * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
  222.        * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
  223.        * send(8) g4<1>UW    g114<8,8,1>F
  224.        *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
  225.        *
  226.        * 697 +/-49 cycles (min 610, n=26):
  227.        * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
  228.        * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
  229.        * send(8) g4<1>UW    g114<8,8,1>F
  230.        *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
  231.        * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
  232.        *
  233.        * So the latency on our first texture load of the batchbuffer takes
  234.        * ~700 cycles, since the caches are cold at that point.
  235.        *
  236.        * 840 +/- 92 cycles (min 720, n=25):
  237.        * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
  238.        * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
  239.        * send(8) g4<1>UW    g114<8,8,1>F
  240.        *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
  241.        * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
  242.        * send(8) g4<1>UW    g114<8,8,1>F
  243.        *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
  244.        * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
  245.        *
  246.        * On the second load, it takes just an extra ~140 cycles, and after
  247.        * accounting for the 14 cycles of the MOV's latency, that makes ~130.
  248.        *
  249.        * 683 +/- 49 cycles (min = 602, n=47):
  250.        * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
  251.        * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
  252.        * send(8) g4<1>UW    g114<8,8,1>F
  253.        *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
  254.        * send(8) g50<1>UW   g114<8,8,1>F
  255.        *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
  256.        * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
  257.        *
  258.        * The unit appears to be pipelined, since this matches up with the
  259.        * cache-cold case, despite there being two loads here.  If you replace
  260.        * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
  261.        *
  262.        * So, take some number between the cache-hot 140 cycles and the
  263.        * cache-cold 700 cycles.  No particular tuning was done on this.
  264.        *
  265.        * I haven't done significant testing of the non-TEX opcodes.  TXL at
  266.        * least looked about the same as TEX.
  267.        */
  268.       latency = 200;
  269.       break;
  270.  
  271.    case SHADER_OPCODE_TXS:
  272.       /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
  273.        * cycles (n=15):
  274.        * mov(8)   g114<1>UD  0D                        { align1 WE_normal 1Q };
  275.        * send(8)  g6<1>UW    g114<8,8,1>F
  276.        *   sampler (10, 0, 10, 1) mlen 1 rlen 4        { align1 WE_normal 1Q };
  277.        * mov(16)  g6<1>F     g6<8,8,1>D                { align1 WE_normal 1Q };
  278.        *
  279.        *
  280.        * Two loads was 535 +/- 30 cycles (n=19):
  281.        * mov(16)   g114<1>UD  0D                       { align1 WE_normal 1H };
  282.        * send(16)  g6<1>UW    g114<8,8,1>F
  283.        *   sampler (10, 0, 10, 2) mlen 2 rlen 8        { align1 WE_normal 1H };
  284.        * mov(16)   g114<1>UD  0D                       { align1 WE_normal 1H };
  285.        * mov(16)   g6<1>F     g6<8,8,1>D               { align1 WE_normal 1H };
  286.        * send(16)  g8<1>UW    g114<8,8,1>F
  287.        *   sampler (10, 0, 10, 2) mlen 2 rlen 8        { align1 WE_normal 1H };
  288.        * mov(16)   g8<1>F     g8<8,8,1>D               { align1 WE_normal 1H };
  289.        * add(16)   g6<1>F     g6<8,8,1>F   g8<8,8,1>F  { align1 WE_normal 1H };
  290.        *
  291.        * Since the only caches that should matter are just the
  292.        * instruction/state cache containing the surface state, assume that we
  293.        * always have hot caches.
  294.        */
  295.       latency = 100;
  296.       break;
  297.  
  298.    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
  299.    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
  300.    case VS_OPCODE_PULL_CONSTANT_LOAD:
  301.       /* testing using varying-index pull constants:
  302.        *
  303.        * 16 cycles:
  304.        * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
  305.        * send(8) g4<1>F  g4<8,8,1>D
  306.        *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
  307.        *
  308.        * ~480 cycles:
  309.        * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
  310.        * send(8) g4<1>F  g4<8,8,1>D
  311.        *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
  312.        * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
  313.        *
  314.        * ~620 cycles:
  315.        * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
  316.        * send(8) g4<1>F  g4<8,8,1>D
  317.        *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
  318.        * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
  319.        * send(8) g4<1>F  g4<8,8,1>D
  320.        *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
  321.        * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
  322.        *
  323.        * So, if it's cache-hot, it's about 140.  If it's cache cold, it's
  324.        * about 460.  We expect to mostly be cache hot, so pick something more
  325.        * in that direction.
  326.        */
  327.       latency = 200;
  328.       break;
  329.  
  330.    case SHADER_OPCODE_GEN7_SCRATCH_READ:
  331.       /* Testing a load from offset 0, that had been previously written:
  332.        *
  333.        * send(8) g114<1>UW g0<8,8,1>F data (0, 0, 0) mlen 1 rlen 1 { align1 WE_normal 1Q };
  334.        * mov(8)  null      g114<8,8,1>F { align1 WE_normal 1Q };
  335.        *
  336.        * The cycles spent seemed to be grouped around 40-50 (as low as 38),
  337.        * then around 140.  Presumably this is cache hit vs miss.
  338.        */
  339.       latency = 50;
  340.       break;
  341.  
  342.    case SHADER_OPCODE_UNTYPED_ATOMIC:
  343.    case SHADER_OPCODE_TYPED_ATOMIC:
  344.       /* Test code:
  345.        *   mov(8)    g112<1>ud       0x00000000ud       { align1 WE_all 1Q };
  346.        *   mov(1)    g112.7<1>ud     g1.7<0,1,0>ud      { align1 WE_all };
  347.        *   mov(8)    g113<1>ud       0x00000000ud       { align1 WE_normal 1Q };
  348.        *   send(8)   g4<1>ud         g112<8,8,1>ud
  349.        *             data (38, 5, 6) mlen 2 rlen 1      { align1 WE_normal 1Q };
  350.        *
  351.        * Running it 100 times as fragment shader on a 128x128 quad
  352.        * gives an average latency of 13867 cycles per atomic op,
  353.        * standard deviation 3%.  Note that this is a rather
  354.        * pessimistic estimate, the actual latency in cases with few
  355.        * collisions between threads and favorable pipelining has been
  356.        * seen to be reduced by a factor of 100.
  357.        */
  358.       latency = 14000;
  359.       break;
  360.  
  361.    case SHADER_OPCODE_UNTYPED_SURFACE_READ:
  362.    case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
  363.    case SHADER_OPCODE_TYPED_SURFACE_READ:
  364.    case SHADER_OPCODE_TYPED_SURFACE_WRITE:
  365.       /* Test code:
  366.        *   mov(8)    g112<1>UD       0x00000000UD       { align1 WE_all 1Q };
  367.        *   mov(1)    g112.7<1>UD     g1.7<0,1,0>UD      { align1 WE_all };
  368.        *   mov(8)    g113<1>UD       0x00000000UD       { align1 WE_normal 1Q };
  369.        *   send(8)   g4<1>UD         g112<8,8,1>UD
  370.        *             data (38, 6, 5) mlen 2 rlen 1      { align1 WE_normal 1Q };
  371.        *   .
  372.        *   . [repeats 8 times]
  373.        *   .
  374.        *   mov(8)    g112<1>UD       0x00000000UD       { align1 WE_all 1Q };
  375.        *   mov(1)    g112.7<1>UD     g1.7<0,1,0>UD      { align1 WE_all };
  376.        *   mov(8)    g113<1>UD       0x00000000UD       { align1 WE_normal 1Q };
  377.        *   send(8)   g4<1>UD         g112<8,8,1>UD
  378.        *             data (38, 6, 5) mlen 2 rlen 1      { align1 WE_normal 1Q };
  379.        *
  380.        * Running it 100 times as fragment shader on a 128x128 quad
  381.        * gives an average latency of 583 cycles per surface read,
  382.        * standard deviation 0.9%.
  383.        */
  384.       latency = is_haswell ? 300 : 600;
  385.       break;
  386.  
  387.    default:
  388.       /* 2 cycles:
  389.        * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
  390.        *
  391.        * 16 cycles:
  392.        * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
  393.        * mov(8) null   g4<8,8,1>F                      { align1 WE_normal 1Q };
  394.        */
  395.       latency = 14;
  396.       break;
  397.    }
  398. }
  399.  
  400. class instruction_scheduler {
  401. public:
  402.    instruction_scheduler(backend_visitor *v, int grf_count,
  403.                          instruction_scheduler_mode mode)
  404.    {
  405.       this->bv = v;
  406.       this->mem_ctx = ralloc_context(NULL);
  407.       this->grf_count = grf_count;
  408.       this->instructions.make_empty();
  409.       this->instructions_to_schedule = 0;
  410.       this->post_reg_alloc = (mode == SCHEDULE_POST);
  411.       this->mode = mode;
  412.       this->time = 0;
  413.       if (!post_reg_alloc) {
  414.          this->remaining_grf_uses = rzalloc_array(mem_ctx, int, grf_count);
  415.          this->grf_active = rzalloc_array(mem_ctx, bool, grf_count);
  416.       } else {
  417.          this->remaining_grf_uses = NULL;
  418.          this->grf_active = NULL;
  419.       }
  420.    }
  421.  
  422.    ~instruction_scheduler()
  423.    {
  424.       ralloc_free(this->mem_ctx);
  425.    }
  426.    void add_barrier_deps(schedule_node *n);
  427.    void add_dep(schedule_node *before, schedule_node *after, int latency);
  428.    void add_dep(schedule_node *before, schedule_node *after);
  429.  
  430.    void run(cfg_t *cfg);
  431.    void add_insts_from_block(bblock_t *block);
  432.    void compute_delay(schedule_node *node);
  433.    virtual void calculate_deps() = 0;
  434.    virtual schedule_node *choose_instruction_to_schedule() = 0;
  435.  
  436.    /**
  437.     * Returns how many cycles it takes the instruction to issue.
  438.     *
  439.     * Instructions in gen hardware are handled one simd4 vector at a time,
  440.     * with 1 cycle per vector dispatched.  Thus SIMD8 pixel shaders take 2
  441.     * cycles to dispatch and SIMD16 (compressed) instructions take 4.
  442.     */
  443.    virtual int issue_time(backend_instruction *inst) = 0;
  444.  
  445.    virtual void count_remaining_grf_uses(backend_instruction *inst) = 0;
  446.    virtual void update_register_pressure(backend_instruction *inst) = 0;
  447.    virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;
  448.  
  449.    void schedule_instructions(bblock_t *block);
  450.  
  451.    void *mem_ctx;
  452.  
  453.    bool post_reg_alloc;
  454.    int instructions_to_schedule;
  455.    int grf_count;
  456.    int time;
  457.    exec_list instructions;
  458.    backend_visitor *bv;
  459.  
  460.    instruction_scheduler_mode mode;
  461.  
  462.    /**
  463.     * Number of instructions left to schedule that reference each vgrf.
  464.     *
  465.     * Used so that we can prefer scheduling instructions that will end the
  466.     * live intervals of multiple variables, to reduce register pressure.
  467.     */
  468.    int *remaining_grf_uses;
  469.  
  470.    /**
  471.     * Tracks whether each VGRF has had an instruction scheduled that uses it.
  472.     *
  473.     * This is used to estimate whether scheduling a new instruction will
  474.     * increase register pressure.
  475.     */
  476.    bool *grf_active;
  477. };
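/* A minimal usage sketch, inferred only from the interface above (the real
 * entry points live elsewhere in the backend): a concrete subclass such as
 * the fs_instruction_scheduler defined below is constructed with the
 * visitor, its virtual-GRF count and the desired mode, then run over the
 * shader's control flow graph:
 *
 *    fs_instruction_scheduler sched(v, grf_count, SCHEDULE_PRE);
 *    sched.run(cfg);
 */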
  478.  
  479. class fs_instruction_scheduler : public instruction_scheduler
  480. {
  481. public:
  482.    fs_instruction_scheduler(fs_visitor *v, int grf_count,
  483.                             instruction_scheduler_mode mode);
  484.    void calculate_deps();
  485.    bool is_compressed(fs_inst *inst);
  486.    schedule_node *choose_instruction_to_schedule();
  487.    int issue_time(backend_instruction *inst);
  488.    fs_visitor *v;
  489.  
  490.    void count_remaining_grf_uses(backend_instruction *inst);
  491.    void update_register_pressure(backend_instruction *inst);
  492.    int get_register_pressure_benefit(backend_instruction *inst);
  493. };
  494.  
  495. fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
  496.                                                    int grf_count,
  497.                                                    instruction_scheduler_mode mode)
  498.    : instruction_scheduler(v, grf_count, mode),
  499.      v(v)
  500. {
  501. }
  502.  
  503. void
  504. fs_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be)
  505. {
  506.    fs_inst *inst = (fs_inst *)be;
  507.  
  508.    if (!remaining_grf_uses)
  509.       return;
  510.  
  511.    if (inst->dst.file == GRF)
  512.       remaining_grf_uses[inst->dst.reg]++;
  513.  
  514.    for (int i = 0; i < inst->sources; i++) {
  515.       if (inst->src[i].file != GRF)
  516.          continue;
  517.  
  518.       remaining_grf_uses[inst->src[i].reg]++;
  519.    }
  520. }
  521.  
  522. void
  523. fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
  524. {
  525.    fs_inst *inst = (fs_inst *)be;
  526.  
  527.    if (!remaining_grf_uses)
  528.       return;
  529.  
  530.    if (inst->dst.file == GRF) {
  531.       remaining_grf_uses[inst->dst.reg]--;
  532.       grf_active[inst->dst.reg] = true;
  533.    }
  534.  
  535.    for (int i = 0; i < inst->sources; i++) {
  536.       if (inst->src[i].file == GRF) {
  537.          remaining_grf_uses[inst->src[i].reg]--;
  538.          grf_active[inst->src[i].reg] = true;
  539.       }
  540.    }
  541. }
  542.  
  543. int
  544. fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
  545. {
  546.    fs_inst *inst = (fs_inst *)be;
  547.    int benefit = 0;
  548.  
  549.    if (inst->dst.file == GRF) {
  550.       if (remaining_grf_uses[inst->dst.reg] == 1)
  551.          benefit += v->alloc.sizes[inst->dst.reg];
  552.       if (!grf_active[inst->dst.reg])
  553.          benefit -= v->alloc.sizes[inst->dst.reg];
  554.    }
  555.  
  556.    for (int i = 0; i < inst->sources; i++) {
  557.       if (inst->src[i].file != GRF)
  558.          continue;
  559.  
  560.       if (remaining_grf_uses[inst->src[i].reg] == 1)
  561.          benefit += v->alloc.sizes[inst->src[i].reg];
  562.       if (!grf_active[inst->src[i].reg])
  563.          benefit -= v->alloc.sizes[inst->src[i].reg];
  564.    }
  565.  
  566.    return benefit;
  567. }
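/* Worked example of the benefit metric above (hypothetical register sizes):
 * an instruction that consumes the last remaining use of an already-active
 * 2-register VGRF and writes a 1-register VGRF that has not been touched yet
 * scores +2 - 1 = +1, since scheduling it ends one live interval and opens a
 * smaller one.  A negative score means the instruction is expected to
 * increase register pressure.
 */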
  568.  
  569. class vec4_instruction_scheduler : public instruction_scheduler
  570. {
  571. public:
  572.    vec4_instruction_scheduler(vec4_visitor *v, int grf_count);
  573.    void calculate_deps();
  574.    schedule_node *choose_instruction_to_schedule();
  575.    int issue_time(backend_instruction *inst);
  576.    vec4_visitor *v;
  577.  
  578.    void count_remaining_grf_uses(backend_instruction *inst);
  579.    void update_register_pressure(backend_instruction *inst);
  580.    int get_register_pressure_benefit(backend_instruction *inst);
  581. };
  582.  
  583. vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
  584.                                                        int grf_count)
  585.    : instruction_scheduler(v, grf_count, SCHEDULE_POST),
  586.      v(v)
  587. {
  588. }
  589.  
  590. void
  591. vec4_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be)
  592. {
  593. }
  594.  
  595. void
  596. vec4_instruction_scheduler::update_register_pressure(backend_instruction *be)
  597. {
  598. }
  599.  
  600. int
  601. vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
  602. {
  603.    return 0;
  604. }
  605.  
  606. schedule_node::schedule_node(backend_instruction *inst,
  607.                              instruction_scheduler *sched)
  608. {
  609.    const struct brw_device_info *devinfo = sched->bv->devinfo;
  610.  
  611.    this->inst = inst;
  612.    this->child_array_size = 0;
  613.    this->children = NULL;
  614.    this->child_latency = NULL;
  615.    this->child_count = 0;
  616.    this->parent_count = 0;
  617.    this->unblocked_time = 0;
  618.    this->cand_generation = 0;
  619.    this->delay = 0;
  620.  
  621.    /* We can't measure Gen6 timings directly but expect them to be much
  622.     * closer to Gen7 than Gen4.
  623.     */
  624.    if (!sched->post_reg_alloc)
  625.       this->latency = 1;
  626.    else if (devinfo->gen >= 6)
  627.       set_latency_gen7(devinfo->is_haswell);
  628.    else
  629.       set_latency_gen4();
  630. }
  631.  
  632. void
  633. instruction_scheduler::add_insts_from_block(bblock_t *block)
  634. {
  635.    /* Removing the last instruction from a basic block removes the block as
  636.     * well, so put a NOP at the end to keep it alive.
  637.     */
  638.    if (!block->end()->is_control_flow()) {
  639.       backend_instruction *nop = new(mem_ctx) backend_instruction();
  640.       nop->opcode = BRW_OPCODE_NOP;
  641.       block->end()->insert_after(block, nop);
  642.    }
  643.  
  644.    foreach_inst_in_block_safe(backend_instruction, inst, block) {
  645.       if (inst->opcode == BRW_OPCODE_NOP || inst->is_control_flow())
  646.          continue;
  647.  
  648.       schedule_node *n = new(mem_ctx) schedule_node(inst, this);
  649.  
  650.       this->instructions_to_schedule++;
  651.  
  652.       inst->remove(block);
  653.       instructions.push_tail(n);
  654.    }
  655. }
  656.  
  657. /** Recursive computation of the delay member of a node. */
  658. void
  659. instruction_scheduler::compute_delay(schedule_node *n)
  660. {
  661.    if (!n->child_count) {
  662.       n->delay = issue_time(n->inst);
  663.    } else {
  664.       for (int i = 0; i < n->child_count; i++) {
  665.          if (!n->children[i]->delay)
  666.             compute_delay(n->children[i]);
  667.          n->delay = MAX2(n->delay, n->latency + n->children[i]->delay);
  668.       }
  669.    }
  670. }
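/* Small worked example (hypothetical numbers): for a chain A -> B -> C where
 * A and B have latency 4 and the leaf C has issue_time 2, this computes
 * delay(C) = 2, delay(B) = 4 + 2 = 6 and delay(A) = 4 + 6 = 10, i.e. the
 * critical-path distance from each node to the end of the block.  The
 * pre-register-allocation heuristic in choose_instruction_to_schedule()
 * prefers candidates with the largest delay.
 */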
  671.  
  672. /**
  673.  * Add a dependency between two instruction nodes.
  674.  *
  675.  * The @after node will be scheduled after @before.  We will try to
  676.  * schedule it @latency cycles after @before, but no guarantees there.
  677.  */
  678. void
  679. instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
  680.                                int latency)
  681. {
  682.    if (!before || !after)
  683.       return;
  684.  
  685.    assert(before != after);
  686.  
  687.    for (int i = 0; i < before->child_count; i++) {
  688.       if (before->children[i] == after) {
  689.          before->child_latency[i] = MAX2(before->child_latency[i], latency);
  690.          return;
  691.       }
  692.    }
  693.  
  694.    if (before->child_array_size <= before->child_count) {
  695.       if (before->child_array_size < 16)
  696.          before->child_array_size = 16;
  697.       else
  698.          before->child_array_size *= 2;
  699.  
  700.       before->children = reralloc(mem_ctx, before->children,
  701.                                   schedule_node *,
  702.                                   before->child_array_size);
  703.       before->child_latency = reralloc(mem_ctx, before->child_latency,
  704.                                        int, before->child_array_size);
  705.    }
  706.  
  707.    before->children[before->child_count] = after;
  708.    before->child_latency[before->child_count] = latency;
  709.    before->child_count++;
  710.    after->parent_count++;
  711. }
  712.  
  713. void
  714. instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
  715. {
  716.    if (!before)
  717.       return;
  718.  
  719.    add_dep(before, after, before->latency);
  720. }
  721.  
  722. /**
  723.  * Sometimes we really want this node to execute after everything that
  724.  * was before it and before everything that followed it.  This adds
  725.  * the deps to do so.
  726.  */
  727. void
  728. instruction_scheduler::add_barrier_deps(schedule_node *n)
  729. {
  730.    schedule_node *prev = (schedule_node *)n->prev;
  731.    schedule_node *next = (schedule_node *)n->next;
  732.  
  733.    if (prev) {
  734.       while (!prev->is_head_sentinel()) {
  735.          add_dep(prev, n, 0);
  736.          prev = (schedule_node *)prev->prev;
  737.       }
  738.    }
  739.  
  740.    if (next) {
  741.       while (!next->is_tail_sentinel()) {
  742.          add_dep(n, next, 0);
  743.          next = (schedule_node *)next->next;
  744.       }
  745.    }
  746. }
  747.  
  748. /* instruction scheduling needs to be aware of when an MRF write
  749.  * actually writes 2 MRFs.
  750.  */
  751. bool
  752. fs_instruction_scheduler::is_compressed(fs_inst *inst)
  753. {
  754.    return inst->exec_size == 16;
  755. }
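/* For example, a SIMD16 (exec_size == 16) write to MRF m also clobbers
 * m + 1, or m + 4 when the destination has the BRW_MRF_COMPR4 bit set, which
 * is why calculate_deps() below records two last_mrf_write entries for
 * compressed instructions.
 */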
  756.  
  757. void
  758. fs_instruction_scheduler::calculate_deps()
  759. {
  760.    /* Pre-register-allocation, this tracks the last write per VGRF offset.
  761.     * After register allocation, reg_offsets are gone and we track individual
  762.     * GRF registers.
  763.     */
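   /* For example (pre-register-allocation, illustrative): a write to vgrf 3
    * at reg_offset 2 is tracked in last_grf_write[3 * 16 + 2]; after
    * register allocation the index is simply the hardware GRF number.
    */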
  764.    schedule_node *last_grf_write[grf_count * 16];
  765.    schedule_node *last_mrf_write[BRW_MAX_MRF];
  766.    schedule_node *last_conditional_mod[2] = { NULL, NULL };
  767.    schedule_node *last_accumulator_write = NULL;
  768.    /* Fixed HW registers are assumed to be separate from the virtual
  769.     * GRFs, so they can be tracked separately.  We don't really write
  770.     * to fixed GRFs much, so don't bother tracking them on a more
  771.     * granular level.
  772.     */
  773.    schedule_node *last_fixed_grf_write = NULL;
  774.    int reg_width = v->dispatch_width / 8;
  775.  
  776.    /* The last instruction always needs to still be the last
  777.     * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
  778.     * WHILE) and scheduling other things after it would disturb the
  779.     * basic block, or it's FB_WRITE and we should do a better job at
  780.     * dead code elimination anyway.
  781.     */
  782.    schedule_node *last = (schedule_node *)instructions.get_tail();
  783.    add_barrier_deps(last);
  784.  
  785.    memset(last_grf_write, 0, sizeof(last_grf_write));
  786.    memset(last_mrf_write, 0, sizeof(last_mrf_write));
  787.  
  788.    /* top-to-bottom dependencies: RAW and WAW. */
  789.    foreach_in_list(schedule_node, n, &instructions) {
  790.       fs_inst *inst = (fs_inst *)n->inst;
  791.  
  792.       if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
  793.          inst->has_side_effects())
  794.          add_barrier_deps(n);
  795.  
  796.       /* read-after-write deps. */
  797.       for (int i = 0; i < inst->sources; i++) {
  798.          if (inst->src[i].file == GRF) {
  799.             if (post_reg_alloc) {
  800.                for (int r = 0; r < inst->regs_read(i); r++)
  801.                   add_dep(last_grf_write[inst->src[i].reg + r], n);
  802.             } else {
  803.                for (int r = 0; r < inst->regs_read(i); r++) {
  804.                   add_dep(last_grf_write[inst->src[i].reg * 16 + inst->src[i].reg_offset + r], n);
  805.                }
  806.             }
  807.          } else if (inst->src[i].file == HW_REG &&
  808.                     (inst->src[i].fixed_hw_reg.file ==
  809.                      BRW_GENERAL_REGISTER_FILE)) {
  810.             if (post_reg_alloc) {
  811.                int size = reg_width;
  812.                if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
  813.                   size = 1;
  814.                for (int r = 0; r < size; r++)
  815.                   add_dep(last_grf_write[inst->src[i].fixed_hw_reg.nr + r], n);
  816.             } else {
  817.                add_dep(last_fixed_grf_write, n);
  818.             }
  819.          } else if (inst->src[i].is_accumulator()) {
  820.             add_dep(last_accumulator_write, n);
  821.          } else if (inst->src[i].file != BAD_FILE &&
  822.                     inst->src[i].file != IMM &&
  823.                     inst->src[i].file != UNIFORM &&
  824.                     (inst->src[i].file != HW_REG ||
  825.                      inst->src[i].fixed_hw_reg.file != IMM)) {
  826.             assert(inst->src[i].file != MRF);
  827.             add_barrier_deps(n);
  828.          }
  829.       }
  830.  
  831.       if (inst->base_mrf != -1) {
  832.          for (int i = 0; i < inst->mlen; i++) {
  833.             /* It looks like the MRF regs are released in the send
  834.              * instruction once it's sent, not when the result comes
  835.              * back.
  836.              */
  837.             add_dep(last_mrf_write[inst->base_mrf + i], n);
  838.          }
  839.       }
  840.  
  841.       if (inst->reads_flag()) {
  842.          add_dep(last_conditional_mod[inst->flag_subreg], n);
  843.       }
  844.  
  845.       if (inst->reads_accumulator_implicitly()) {
  846.          add_dep(last_accumulator_write, n);
  847.       }
  848.  
  849.       /* write-after-write deps. */
  850.       if (inst->dst.file == GRF) {
  851.          if (post_reg_alloc) {
  852.             for (int r = 0; r < inst->regs_written; r++) {
  853.                add_dep(last_grf_write[inst->dst.reg + r], n);
  854.                last_grf_write[inst->dst.reg + r] = n;
  855.             }
  856.          } else {
  857.             for (int r = 0; r < inst->regs_written; r++) {
  858.                add_dep(last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r], n);
  859.                last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r] = n;
  860.             }
  861.          }
  862.       } else if (inst->dst.file == MRF) {
  863.          int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
  864.  
  865.          add_dep(last_mrf_write[reg], n);
  866.          last_mrf_write[reg] = n;
  867.          if (is_compressed(inst)) {
  868.             if (inst->dst.reg & BRW_MRF_COMPR4)
  869.                reg += 4;
  870.             else
  871.                reg++;
  872.             add_dep(last_mrf_write[reg], n);
  873.             last_mrf_write[reg] = n;
  874.          }
  875.       } else if (inst->dst.file == HW_REG &&
  876.                  inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
  877.          if (post_reg_alloc) {
  878.             for (int r = 0; r < reg_width; r++)
  879.                last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
  880.          } else {
  881.             last_fixed_grf_write = n;
  882.          }
  883.       } else if (inst->dst.is_accumulator()) {
  884.          add_dep(last_accumulator_write, n);
  885.          last_accumulator_write = n;
  886.       } else if (inst->dst.file != BAD_FILE &&
  887.                  !inst->dst.is_null()) {
  888.          add_barrier_deps(n);
  889.       }
  890.  
  891.       if (inst->mlen > 0 && inst->base_mrf != -1) {
  892.          for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
  893.             add_dep(last_mrf_write[inst->base_mrf + i], n);
  894.             last_mrf_write[inst->base_mrf + i] = n;
  895.          }
  896.       }
  897.  
  898.       if (inst->writes_flag()) {
  899.          add_dep(last_conditional_mod[inst->flag_subreg], n, 0);
  900.          last_conditional_mod[inst->flag_subreg] = n;
  901.       }
  902.  
  903.       if (inst->writes_accumulator_implicitly(v->devinfo) &&
  904.           !inst->dst.is_accumulator()) {
  905.          add_dep(last_accumulator_write, n);
  906.          last_accumulator_write = n;
  907.       }
  908.    }
  909.  
  910.    /* bottom-to-top dependencies: WAR */
  911.    memset(last_grf_write, 0, sizeof(last_grf_write));
  912.    memset(last_mrf_write, 0, sizeof(last_mrf_write));
  913.    memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
  914.    last_accumulator_write = NULL;
  915.    last_fixed_grf_write = NULL;
  916.  
  917.    exec_node *node;
  918.    exec_node *prev;
  919.    for (node = instructions.get_tail(), prev = node->prev;
  920.         !node->is_head_sentinel();
  921.         node = prev, prev = node->prev) {
  922.       schedule_node *n = (schedule_node *)node;
  923.       fs_inst *inst = (fs_inst *)n->inst;
  924.  
  925.       /* write-after-read deps. */
  926.       for (int i = 0; i < inst->sources; i++) {
  927.          if (inst->src[i].file == GRF) {
  928.             if (post_reg_alloc) {
  929.                for (int r = 0; r < inst->regs_read(i); r++)
  930.                   add_dep(n, last_grf_write[inst->src[i].reg + r]);
  931.             } else {
  932.                for (int r = 0; r < inst->regs_read(i); r++) {
  933.                   add_dep(n, last_grf_write[inst->src[i].reg * 16 + inst->src[i].reg_offset + r]);
  934.                }
  935.             }
  936.          } else if (inst->src[i].file == HW_REG &&
  937.                     (inst->src[i].fixed_hw_reg.file ==
  938.                      BRW_GENERAL_REGISTER_FILE)) {
  939.             if (post_reg_alloc) {
  940.                int size = reg_width;
  941.                if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
  942.                   size = 1;
  943.                for (int r = 0; r < size; r++)
  944.                   add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r]);
  945.             } else {
  946.                add_dep(n, last_fixed_grf_write);
  947.             }
  948.          } else if (inst->src[i].is_accumulator()) {
  949.             add_dep(n, last_accumulator_write);
  950.          } else if (inst->src[i].file != BAD_FILE &&
  951.                     inst->src[i].file != IMM &&
  952.                     inst->src[i].file != UNIFORM &&
  953.                     (inst->src[i].file != HW_REG ||
  954.                      inst->src[i].fixed_hw_reg.file != IMM)) {
  955.             assert(inst->src[i].file != MRF);
  956.             add_barrier_deps(n);
  957.          }
  958.       }
  959.  
  960.       if (inst->base_mrf != -1) {
  961.          for (int i = 0; i < inst->mlen; i++) {
  962.             /* It looks like the MRF regs are released in the send
  963.              * instruction once it's sent, not when the result comes
  964.              * back.
  965.              */
  966.             add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
  967.          }
  968.       }
  969.  
  970.       if (inst->reads_flag()) {
  971.          add_dep(n, last_conditional_mod[inst->flag_subreg]);
  972.       }
  973.  
  974.       if (inst->reads_accumulator_implicitly()) {
  975.          add_dep(n, last_accumulator_write);
  976.       }
  977.  
  978.       /* Update the things this instruction wrote, so earlier reads
  979.        * can mark this as a WAR dependency.
  980.        */
  981.       if (inst->dst.file == GRF) {
  982.          if (post_reg_alloc) {
  983.             for (int r = 0; r < inst->regs_written; r++)
  984.                last_grf_write[inst->dst.reg + r] = n;
  985.          } else {
  986.             for (int r = 0; r < inst->regs_written; r++) {
  987.                last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r] = n;
  988.             }
  989.          }
  990.       } else if (inst->dst.file == MRF) {
  991.          int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
  992.  
  993.          last_mrf_write[reg] = n;
  994.  
  995.          if (is_compressed(inst)) {
  996.             if (inst->dst.reg & BRW_MRF_COMPR4)
  997.                reg += 4;
  998.             else
  999.                reg++;
  1000.  
  1001.             last_mrf_write[reg] = n;
  1002.          }
  1003.       } else if (inst->dst.file == HW_REG &&
  1004.                  inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
  1005.          if (post_reg_alloc) {
  1006.             for (int r = 0; r < reg_width; r++)
  1007.                last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
  1008.          } else {
  1009.             last_fixed_grf_write = n;
  1010.          }
  1011.       } else if (inst->dst.is_accumulator()) {
  1012.          last_accumulator_write = n;
  1013.       } else if (inst->dst.file != BAD_FILE &&
  1014.                  !inst->dst.is_null()) {
  1015.          add_barrier_deps(n);
  1016.       }
  1017.  
  1018.       if (inst->mlen > 0 && inst->base_mrf != -1) {
  1019.          for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
  1020.             last_mrf_write[inst->base_mrf + i] = n;
  1021.          }
  1022.       }
  1023.  
  1024.       if (inst->writes_flag()) {
  1025.          last_conditional_mod[inst->flag_subreg] = n;
  1026.       }
  1027.  
  1028.       if (inst->writes_accumulator_implicitly(v->devinfo)) {
  1029.          last_accumulator_write = n;
  1030.       }
  1031.    }
  1032. }
  1033.  
  1034. void
  1035. vec4_instruction_scheduler::calculate_deps()
  1036. {
  1037.    schedule_node *last_grf_write[grf_count];
  1038.    schedule_node *last_mrf_write[BRW_MAX_MRF];
  1039.    schedule_node *last_conditional_mod = NULL;
  1040.    schedule_node *last_accumulator_write = NULL;
  1041.    /* Fixed HW registers are assumed to be separate from the virtual
  1042.     * GRFs, so they can be tracked separately.  We don't really write
  1043.     * to fixed GRFs much, so don't bother tracking them on a more
  1044.     * granular level.
  1045.     */
  1046.    schedule_node *last_fixed_grf_write = NULL;
  1047.  
  1048.    /* The last instruction always needs to still be the last instruction.
  1049.     * Either it's flow control (IF, ELSE, ENDIF, DO, WHILE) and scheduling
  1050.     * other things after it would disturb the basic block, or it's the EOT
  1051.     * URB_WRITE and we should do a better job at dead code eliminating
  1052.     * anything that could have been scheduled after it.
  1053.     */
  1054.    schedule_node *last = (schedule_node *)instructions.get_tail();
  1055.    add_barrier_deps(last);
  1056.  
  1057.    memset(last_grf_write, 0, sizeof(last_grf_write));
  1058.    memset(last_mrf_write, 0, sizeof(last_mrf_write));
  1059.  
  1060.    /* top-to-bottom dependencies: RAW and WAW. */
  1061.    foreach_in_list(schedule_node, n, &instructions) {
  1062.       vec4_instruction *inst = (vec4_instruction *)n->inst;
  1063.  
  1064.       if (inst->has_side_effects())
  1065.          add_barrier_deps(n);
  1066.  
  1067.       /* read-after-write deps. */
  1068.       for (int i = 0; i < 3; i++) {
  1069.          if (inst->src[i].file == GRF) {
  1070.             for (unsigned j = 0; j < inst->regs_read(i); ++j)
  1071.                add_dep(last_grf_write[inst->src[i].reg + j], n);
  1072.          } else if (inst->src[i].file == HW_REG &&
  1073.                     (inst->src[i].fixed_hw_reg.file ==
  1074.                      BRW_GENERAL_REGISTER_FILE)) {
  1075.             add_dep(last_fixed_grf_write, n);
  1076.          } else if (inst->src[i].is_accumulator()) {
  1077.             assert(last_accumulator_write);
  1078.             add_dep(last_accumulator_write, n);
  1079.          } else if (inst->src[i].file != BAD_FILE &&
  1080.                     inst->src[i].file != IMM &&
  1081.                     inst->src[i].file != UNIFORM &&
  1082.                     (inst->src[i].file != HW_REG ||
  1083.                      inst->src[i].fixed_hw_reg.file != IMM)) {
  1084.             /* No reads from MRF, and ATTR is already translated away */
  1085.             assert(inst->src[i].file != MRF &&
  1086.                    inst->src[i].file != ATTR);
  1087.             add_barrier_deps(n);
  1088.          }
  1089.       }
  1090.  
  1091.       if (!inst->is_send_from_grf()) {
  1092.          for (int i = 0; i < inst->mlen; i++) {
  1093.             /* It looks like the MRF regs are released in the send
  1094.              * instruction once it's sent, not when the result comes
  1095.              * back.
  1096.              */
  1097.             add_dep(last_mrf_write[inst->base_mrf + i], n);
  1098.          }
  1099.       }
  1100.  
  1101.       if (inst->reads_flag()) {
  1102.          assert(last_conditional_mod);
  1103.          add_dep(last_conditional_mod, n);
  1104.       }
  1105.  
  1106.       if (inst->reads_accumulator_implicitly()) {
  1107.          assert(last_accumulator_write);
  1108.          add_dep(last_accumulator_write, n);
  1109.       }
  1110.  
  1111.       /* write-after-write deps. */
  1112.       if (inst->dst.file == GRF) {
  1113.          for (unsigned j = 0; j < inst->regs_written; ++j) {
  1114.             add_dep(last_grf_write[inst->dst.reg + j], n);
  1115.             last_grf_write[inst->dst.reg + j] = n;
  1116.          }
  1117.       } else if (inst->dst.file == MRF) {
  1118.          add_dep(last_mrf_write[inst->dst.reg], n);
  1119.          last_mrf_write[inst->dst.reg] = n;
  1120.       } else if (inst->dst.file == HW_REG &&
  1121.                  inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
  1122.          last_fixed_grf_write = n;
  1123.       } else if (inst->dst.is_accumulator()) {
  1124.          add_dep(last_accumulator_write, n);
  1125.          last_accumulator_write = n;
  1126.       } else if (inst->dst.file != BAD_FILE &&
  1127.                  !inst->dst.is_null()) {
  1128.          add_barrier_deps(n);
  1129.       }
  1130.  
  1131.       if (inst->mlen > 0 && !inst->is_send_from_grf()) {
  1132.          for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
  1133.             add_dep(last_mrf_write[inst->base_mrf + i], n);
  1134.             last_mrf_write[inst->base_mrf + i] = n;
  1135.          }
  1136.       }
  1137.  
  1138.       if (inst->writes_flag()) {
  1139.          add_dep(last_conditional_mod, n, 0);
  1140.          last_conditional_mod = n;
  1141.       }
  1142.  
  1143.       if (inst->writes_accumulator_implicitly(v->devinfo) &&
  1144.           !inst->dst.is_accumulator()) {
  1145.          add_dep(last_accumulator_write, n);
  1146.          last_accumulator_write = n;
  1147.       }
  1148.    }
  1149.  
  1150.    /* bottom-to-top dependencies: WAR */
  1151.    memset(last_grf_write, 0, sizeof(last_grf_write));
  1152.    memset(last_mrf_write, 0, sizeof(last_mrf_write));
  1153.    last_conditional_mod = NULL;
  1154.    last_accumulator_write = NULL;
  1155.    last_fixed_grf_write = NULL;
  1156.  
  1157.    exec_node *node;
  1158.    exec_node *prev;
  1159.    for (node = instructions.get_tail(), prev = node->prev;
  1160.         !node->is_head_sentinel();
  1161.         node = prev, prev = node->prev) {
  1162.       schedule_node *n = (schedule_node *)node;
  1163.       vec4_instruction *inst = (vec4_instruction *)n->inst;
  1164.  
  1165.       /* write-after-read deps. */
  1166.       for (int i = 0; i < 3; i++) {
  1167.          if (inst->src[i].file == GRF) {
  1168.             for (unsigned j = 0; j < inst->regs_read(i); ++j)
  1169.                add_dep(n, last_grf_write[inst->src[i].reg + j]);
  1170.          } else if (inst->src[i].file == HW_REG &&
  1171.                     (inst->src[i].fixed_hw_reg.file ==
  1172.                      BRW_GENERAL_REGISTER_FILE)) {
  1173.             add_dep(n, last_fixed_grf_write);
  1174.          } else if (inst->src[i].is_accumulator()) {
  1175.             add_dep(n, last_accumulator_write);
  1176.          } else if (inst->src[i].file != BAD_FILE &&
  1177.                     inst->src[i].file != IMM &&
  1178.                     inst->src[i].file != UNIFORM &&
  1179.                     (inst->src[i].file != HW_REG ||
  1180.                      inst->src[i].fixed_hw_reg.file != IMM)) {
  1181.             assert(inst->src[i].file != MRF &&
  1182.                    inst->src[i].file != ATTR);
  1183.             add_barrier_deps(n);
  1184.          }
  1185.       }
  1186.  
  1187.       if (!inst->is_send_from_grf()) {
  1188.          for (int i = 0; i < inst->mlen; i++) {
  1189.             /* It looks like the MRF regs are released in the send
  1190.              * instruction once it's sent, not when the result comes
  1191.              * back.
  1192.              */
  1193.             add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
  1194.          }
  1195.       }
  1196.  
  1197.       if (inst->reads_flag()) {
  1198.          add_dep(n, last_conditional_mod);
  1199.       }
  1200.  
  1201.       if (inst->reads_accumulator_implicitly()) {
  1202.          add_dep(n, last_accumulator_write);
  1203.       }
  1204.  
  1205.       /* Update the things this instruction wrote, so earlier reads
  1206.        * can mark this as a WAR dependency.
  1207.        */
  1208.       if (inst->dst.file == GRF) {
  1209.          for (unsigned j = 0; j < inst->regs_written; ++j)
  1210.             last_grf_write[inst->dst.reg + j] = n;
  1211.       } else if (inst->dst.file == MRF) {
  1212.          last_mrf_write[inst->dst.reg] = n;
  1213.       } else if (inst->dst.file == HW_REG &&
  1214.                  inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
  1215.          last_fixed_grf_write = n;
  1216.       } else if (inst->dst.is_accumulator()) {
  1217.          last_accumulator_write = n;
  1218.       } else if (inst->dst.file != BAD_FILE &&
  1219.                  !inst->dst.is_null()) {
  1220.          add_barrier_deps(n);
  1221.       }
  1222.  
  1223.       if (inst->mlen > 0 && !inst->is_send_from_grf()) {
  1224.          for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
  1225.             last_mrf_write[inst->base_mrf + i] = n;
  1226.          }
  1227.       }
  1228.  
  1229.       if (inst->writes_flag()) {
  1230.          last_conditional_mod = n;
  1231.       }
  1232.  
  1233.       if (inst->writes_accumulator_implicitly(v->devinfo)) {
  1234.          last_accumulator_write = n;
  1235.       }
  1236.    }
  1237. }
  1238.  
  1239. schedule_node *
  1240. fs_instruction_scheduler::choose_instruction_to_schedule()
  1241. {
  1242.    schedule_node *chosen = NULL;
  1243.  
  1244.    if (mode == SCHEDULE_PRE || mode == SCHEDULE_POST) {
  1245.       int chosen_time = 0;
  1246.  
  1247.       /* Of the instructions ready to execute or the closest to
  1248.        * being ready, choose the oldest one.
  1249.        */
  1250.       foreach_in_list(schedule_node, n, &instructions) {
  1251.          if (!chosen || n->unblocked_time < chosen_time) {
  1252.             chosen = n;
  1253.             chosen_time = n->unblocked_time;
  1254.          }
  1255.       }
  1256.    } else {
  1257.       /* Before register allocation, we don't care about the latencies of
  1258.        * instructions.  All we care about is reducing live intervals of
  1259.        * variables so that we can avoid register spilling, or get SIMD16
  1260.        * shaders which naturally do a better job of hiding instruction
  1261.        * latency.
  1262.        */
  1263.       foreach_in_list(schedule_node, n, &instructions) {
  1264.          fs_inst *inst = (fs_inst *)n->inst;
  1265.  
  1266.          if (!chosen) {
  1267.             chosen = n;
  1268.             continue;
  1269.          }
  1270.  
  1271.          /* Most important: If we can definitely reduce register pressure, do
  1272.           * so immediately.
  1273.           */
  1274.          int register_pressure_benefit = get_register_pressure_benefit(n->inst);
  1275.          int chosen_register_pressure_benefit =
  1276.             get_register_pressure_benefit(chosen->inst);
  1277.  
  1278.          if (register_pressure_benefit > 0 &&
  1279.              register_pressure_benefit > chosen_register_pressure_benefit) {
  1280.             chosen = n;
  1281.             continue;
  1282.          } else if (chosen_register_pressure_benefit > 0 &&
  1283.                     (register_pressure_benefit <
  1284.                      chosen_register_pressure_benefit)) {
  1285.             continue;
  1286.          }
  1287.  
  1288.          if (mode == SCHEDULE_PRE_LIFO) {
  1289.             /* Prefer instructions that recently became available for
  1290.              * scheduling.  These are the things that are most likely to
  1291.              * (eventually) make a variable dead and reduce register pressure.
  1292.              * Typical register pressure estimates don't work for us because
  1293.              * most of our pressure comes from texturing, where no single
  1294.              * instruction to schedule will make a vec4 value dead.
  1295.              */
  1296.             if (n->cand_generation > chosen->cand_generation) {
  1297.                chosen = n;
  1298.                continue;
  1299.             } else if (n->cand_generation < chosen->cand_generation) {
  1300.                continue;
  1301.             }
  1302.  
  1303.             /* On MRF-using chips, prefer non-SEND instructions.  If we don't
  1304.              * do this, then because we prefer instructions that just became
  1305.              * candidates, we'll end up in a pattern of scheduling a SEND,
  1306.              * then the MRFs for the next SEND, then the next SEND, then the
  1307.              * MRFs, etc., without ever consuming the results of a send.
  1308.              */
  1309.             if (v->devinfo->gen < 7) {
  1310.                fs_inst *chosen_inst = (fs_inst *)chosen->inst;
  1311.  
  1312.                /* We use regs_written > 1 as our test for the kind of send
  1313.                 * instruction to avoid -- only sends generate many regs, and a
  1314.                 * single-result send is probably actually reducing register
  1315.                 * pressure.
  1316.                 */
  1317.                if (inst->regs_written <= inst->dst.width / 8 &&
  1318.                    chosen_inst->regs_written > chosen_inst->dst.width / 8) {
  1319.                   chosen = n;
  1320.                   continue;
  1321.                } else if (inst->regs_written > chosen_inst->regs_written) {
  1322.                   continue;
  1323.                }
  1324.             }
  1325.          }
  1326.  
  1327.          /* For instructions pushed onto the candidate list at the same time,
  1328.           * prefer the one with the longest delay to the end of the program.
  1329.           * Its results are the ones most likely to be consumable soonest (for
  1330.           * example, a large tree of lowered UBO loads, which appears reversed
  1331.           * in the instruction stream with respect to when it can be consumed).
  1332.           */
  1333.          if (n->delay > chosen->delay) {
  1334.             chosen = n;
  1335.             continue;
  1336.          } else if (n->delay < chosen->delay) {
  1337.             continue;
  1338.          }
  1339.  
  1340.          /* If all other metrics are equal, we prefer the first instruction in
  1341.           * the list (program execution order).
  1342.           */
  1343.       }
  1344.    }
  1345.  
  1346.    return chosen;
  1347. }
  1348.  
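/* Editorial sketch, not part of the upstream driver: a condensed restatement
 * of the pre-regalloc tie-break cascade in the loop above, leaving out the
 * gen < 7 SEND/MRF special case.  The register-pressure benefits are assumed
 * to have been computed by the caller (get_register_pressure_benefit() is a
 * member of the scheduler, so a free function cannot call it directly).
 * Returns true if candidate n should replace the current choice.
 */
static inline bool
sketch_prefer_candidate(const schedule_node *n, const schedule_node *chosen,
                        int n_benefit, int chosen_benefit, bool lifo)
{
   /* A definite register-pressure win is taken immediately; a definite loss
    * is rejected immediately.
    */
   if (n_benefit > 0 && n_benefit > chosen_benefit)
      return true;
   if (chosen_benefit > 0 && n_benefit < chosen_benefit)
      return false;

   /* SCHEDULE_PRE_LIFO: prefer the candidate that was unblocked most
    * recently, since it is the most likely to make a value dead soon.
    */
   if (lifo && n->cand_generation != chosen->cand_generation)
      return n->cand_generation > chosen->cand_generation;

   /* Otherwise prefer the longer critical path to the end of the program;
    * on a tie, keep the current choice (program order).
    */
   return n->delay > chosen->delay;
}
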
  1349. schedule_node *
  1350. vec4_instruction_scheduler::choose_instruction_to_schedule()
  1351. {
  1352.    schedule_node *chosen = NULL;
  1353.    int chosen_time = 0;
  1354.  
  1355.    /* Of the instructions ready to execute or the closest to being ready,
  1356.     * choose the oldest one.
  1357.     */
  1358.    foreach_in_list(schedule_node, n, &instructions) {
  1359.       if (!chosen || n->unblocked_time < chosen_time) {
  1360.          chosen = n;
  1361.          chosen_time = n->unblocked_time;
  1362.       }
  1363.    }
  1364.  
  1365.    return chosen;
  1366. }
  1367.  
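/* Editorial note, not part of the upstream source: unlike the FS scheduler
 * above, the vec4 scheduler has no pre-regalloc pressure heuristics.  It is a
 * plain greedy list scheduler that always takes the ready node with the
 * smallest unblocked_time, keeping the first such candidate it finds on ties.
 */
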
  1368. int
  1369. fs_instruction_scheduler::issue_time(backend_instruction *inst)
  1370. {
  1371.    if (is_compressed((fs_inst *)inst))
  1372.       return 4;
  1373.    else
  1374.       return 2;
  1375. }
  1376.  
  1377. int
  1378. vec4_instruction_scheduler::issue_time(backend_instruction *inst)
  1379. {
  1380.    /* We always execute as two vec4s in parallel. */
  1381.    return 2;
  1382. }
  1383.  
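/* Editorial note, not part of the upstream source: with the issue times
 * above, a compressed (SIMD16) FS instruction occupies the issue pipe for
 * twice as long as an uncompressed one.  For example, one compressed MAD
 * followed by two uncompressed ADDs contributes 4 + 2 + 2 = 8 to the clock
 * in schedule_instructions() below, before any unblocked_time bumps.
 */
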
  1384. void
  1385. instruction_scheduler::schedule_instructions(bblock_t *block)
  1386. {
  1387.    const struct brw_device_info *devinfo = bv->devinfo;
  1388.    backend_instruction *inst = block->end();
  1389.    time = 0;
  1390.  
  1391.    /* Remove non-DAG heads from the list. */
  1392.    foreach_in_list_safe(schedule_node, n, &instructions) {
  1393.       if (n->parent_count != 0)
  1394.          n->remove();
  1395.    }
  1396.  
  1397.    unsigned cand_generation = 1;
  1398.    while (!instructions.is_empty()) {
  1399.       schedule_node *chosen = choose_instruction_to_schedule();
  1400.  
  1401.       /* Schedule this instruction. */
  1402.       assert(chosen);
  1403.       chosen->remove();
  1404.       inst->insert_before(block, chosen->inst);
  1405.       instructions_to_schedule--;
  1406.       update_register_pressure(chosen->inst);
  1407.  
  1408.       /* Update the clock for how soon an instruction could start after the
  1409.        * chosen one.
  1410.        */
  1411.       time += issue_time(chosen->inst);
  1412.  
  1413.       /* If we expected a delay for scheduling, then bump the clock to reflect
  1414.        * that as well.  In reality, the hardware will switch to another
  1415.        * hyperthread and may not return to dispatching our thread for a while
  1416.        * even after we're unblocked.
  1417.        */
  1418.       time = MAX2(time, chosen->unblocked_time);
  1419.  
  1420.       if (debug) {
  1421.          fprintf(stderr, "clock %4d, scheduled: ", time);
  1422.          bv->dump_instruction(chosen->inst);
  1423.       }
  1424.  
  1425.       /* Now that we've scheduled a new instruction, some of its
  1426.        * children can be promoted to the list of instructions ready to
  1427.        * be scheduled.  Update the children's unblocked time for this
  1428.        * DAG edge as we do so.
  1429.        */
  1430.       for (int i = chosen->child_count - 1; i >= 0; i--) {
  1431.          schedule_node *child = chosen->children[i];
  1432.  
  1433.          child->unblocked_time = MAX2(child->unblocked_time,
  1434.                                       time + chosen->child_latency[i]);
  1435.  
  1436.          if (debug) {
  1437.             fprintf(stderr, "\tchild %d, %d parents: ", i, child->parent_count);
  1438.             bv->dump_instruction(child->inst);
  1439.          }
  1440.  
  1441.          child->cand_generation = cand_generation;
  1442.          child->parent_count--;
  1443.          if (child->parent_count == 0) {
  1444.             if (debug) {
  1445.                fprintf(stderr, "\t\tnow available\n");
  1446.             }
  1447.             instructions.push_head(child);
  1448.          }
  1449.       }
  1450.       cand_generation++;
  1451.  
  1452.       /* Shared resource: the mathbox.  There's one mathbox per EU on Gen6+,
  1453.        * but it's more limited pre-Gen6, so if we send something off to it,
  1454.        * the next math instruction isn't going to make progress until the
  1455.        * first one is done.
  1456.        */
  1457.       if (devinfo->gen < 6 && chosen->inst->is_math()) {
  1458.          foreach_in_list(schedule_node, n, &instructions) {
  1459.             if (n->inst->is_math())
  1460.                n->unblocked_time = MAX2(n->unblocked_time,
  1461.                                         time + chosen->latency);
  1462.          }
  1463.       }
  1464.    }
  1465.  
  1466.    if (block->end()->opcode == BRW_OPCODE_NOP)
  1467.       block->end()->remove(block);
  1468.    assert(instructions_to_schedule == 0);
  1469. }
  1470.  
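/* Editorial sketch, not part of the upstream source: a worked example of the
 * clock bookkeeping in schedule_instructions() above, using made-up numbers.
 * Suppose an uncompressed instruction (issue_time() == 2) is chosen at
 * time == 10, its own unblocked_time is 8, and the edge to one child has
 * child_latency == 14:
 *
 *    time = 10 + 2 = 12;                              // issue_time()
 *    time = MAX2(12, 8) = 12;                         // no extra stall
 *    child->unblocked_time = MAX2(child->unblocked_time, 12 + 14);
 *                                                     // 26 if it was smaller
 *
 * so in the post-regalloc mode the child does not look attractive until the
 * clock reaches 26, which is what lets independent work be scheduled into the
 * latency shadow of the chosen instruction.
 */
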
  1471. void
  1472. instruction_scheduler::run(cfg_t *cfg)
  1473. {
  1474.    if (debug) {
  1475.       fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n",
  1476.               post_reg_alloc);
  1477.       bv->dump_instructions();
  1478.    }
  1479.  
  1480.    /* Populate the remaining GRF uses array to improve the pre-regalloc
  1481.     * scheduling.
  1482.     */
  1483.    if (remaining_grf_uses) {
  1484.       foreach_block_and_inst(block, backend_instruction, inst, cfg) {
  1485.          count_remaining_grf_uses(inst);
  1486.       }
  1487.    }
  1488.  
  1489.    foreach_block(block, cfg) {
  1490.       if (block->end_ip - block->start_ip <= 1)
  1491.          continue;
  1492.  
  1493.       add_insts_from_block(block);
  1494.  
  1495.       calculate_deps();
  1496.  
  1497.       foreach_in_list(schedule_node, n, &instructions) {
  1498.          compute_delay(n);
  1499.       }
  1500.  
  1501.       schedule_instructions(block);
  1502.    }
  1503.  
  1504.    if (debug) {
  1505.       fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n",
  1506.               post_reg_alloc);
  1507.       bv->dump_instructions();
  1508.    }
  1509. }
  1510.  
  1511. void
  1512. fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
  1513. {
  1514.    int grf_count;
  1515.    if (mode == SCHEDULE_POST)
  1516.       grf_count = grf_used;
  1517.    else
  1518.       grf_count = alloc.count;
  1519.  
  1520.    fs_instruction_scheduler sched(this, grf_count, mode);
  1521.    sched.run(cfg);
  1522.  
  1523.    if (unlikely(debug_enabled) && mode == SCHEDULE_POST) {
  1524.       fprintf(stderr, "%s%d estimated execution time: %d cycles\n",
  1525.               stage_abbrev, dispatch_width, sched.time);
  1526.    }
  1527.  
  1528.    invalidate_live_intervals();
  1529. }
  1530.  
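/* Editorial usage sketch (assumptions, not upstream code): callers in the
 * backend are expected to run this pass twice -- once before register
 * allocation with one of the SCHEDULE_PRE* modes (e.g. SCHEDULE_PRE_LIFO) to
 * keep register pressure down, and once after allocation with SCHEDULE_POST
 * to hide latency using the real latency model:
 *
 *    v->schedule_instructions(SCHEDULE_PRE_LIFO);   // before allocation
 *    ...register allocation...
 *    v->schedule_instructions(SCHEDULE_POST);       // after allocation
 *
 * Here `v` stands in for an fs_visitor; the actual call sites live outside
 * this file.
 */
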
  1531. void
  1532. vec4_visitor::opt_schedule_instructions()
  1533. {
  1534.    vec4_instruction_scheduler sched(this, prog_data->total_grf);
  1535.    sched.run(cfg);
  1536.  
  1537.    if (unlikely(debug_enabled)) {
  1538.       fprintf(stderr, "%s estimated execution time: %d cycles\n",
  1539.               stage_abbrev, sched.time);
  1540.    }
  1541.  
  1542.    invalidate_live_intervals();
  1543. }
  1544.