  1. /*
  2.  * Copyright © 2010 Intel Corporation
  3.  *
  4.  * Permission is hereby granted, free of charge, to any person obtaining a
  5.  * copy of this software and associated documentation files (the "Software"),
  6.  * to deal in the Software without restriction, including without limitation
  7.  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8.  * and/or sell copies of the Software, and to permit persons to whom the
  9.  * Software is furnished to do so, subject to the following conditions:
  10.  *
  11.  * The above copyright notice and this permission notice (including the next
  12.  * paragraph) shall be included in all copies or substantial portions of the
  13.  * Software.
  14.  *
  15.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16.  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17.  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18.  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19.  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20.  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21.  * IN THE SOFTWARE.
  22.  *
  23.  * Authors:
  24.  *    Eric Anholt <eric@anholt.net>
  25.  *
  26.  */
  27.  
  28. #include "brw_fs.h"
  29. #include "brw_vec4.h"
  30. #include "glsl/glsl_types.h"
  31. #include "glsl/ir_optimization.h"
  32.  
  33. using namespace brw;
  34.  
  35. /** @file brw_fs_schedule_instructions.cpp
  36.  *
  37.  * List scheduling of FS instructions.
  38.  *
  39.  * The basic model of the list scheduler is to take a basic block,
  40.  * compute a DAG of the dependencies (RAW ordering with latency, WAW
  41.  * ordering with latency, WAR ordering), and make a list of the DAG heads.
  42.  * Heuristically pick a DAG head, then put all the children that are
  43.  * now DAG heads into the list of things to schedule.
  44.  *
  45.  * The heuristic is the important part.  We're trying to be cheap,
  46.  * since actually computing the optimal scheduling is NP-complete.
  47.  * What we do is track a "current clock".  When we schedule a node, we
  48.  * update the earliest-unblocked clock time of its children, and
  49.  * increment the clock.  Then, when trying to schedule, we just pick
  50.  * the earliest-unblocked instruction to schedule.
  51.  *
  52.  * Note that often there will be many things which could execute
  53.  * immediately, and there are a range of heuristic options to choose
  54.  * from in picking among those.
  55.  */
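
/* Illustrative sketch -- not part of the driver, and kept out of the build
 * with #if 0: a minimal greedy list scheduler over a toy dependency DAG,
 * showing the "current clock" heuristic described above.  All names here
 * (toy_node, toy_list_schedule, ...) are hypothetical; the real logic lives
 * in instruction_scheduler::schedule_instructions() further down.
 */
#if 0
#include <cstdio>
#include <vector>

struct toy_node {
   int id;                         /* stands in for an instruction */
   int unblocked_time;             /* earliest clock at which it may issue */
   int unscheduled_parents;        /* outstanding dependencies */
   std::vector<int> children;      /* indices of dependent nodes */
   std::vector<int> child_latency; /* latency on each DAG edge */
};

static void
toy_list_schedule(std::vector<toy_node> &dag)
{
   int clock = 0;
   int remaining = (int)dag.size();

   while (remaining) {
      /* Pick the ready node (no unscheduled parents) with the earliest
       * unblocked time -- the heuristic the comment above describes.
       */
      int best = -1;
      for (int i = 0; i < (int)dag.size(); i++) {
         if (dag[i].id >= 0 && dag[i].unscheduled_parents == 0 &&
             (best < 0 || dag[i].unblocked_time < dag[best].unblocked_time))
            best = i;
      }
      if (best < 0)
         break;   /* only possible if the graph weren't actually a DAG */

      /* "Schedule" it: advance the clock by its issue time, but never to a
       * point earlier than when it actually became unblocked.
       */
      int issued = clock + 1;
      clock = issued > dag[best].unblocked_time ?
              issued : dag[best].unblocked_time;
      printf("clock %4d, scheduled: node %d\n", clock, dag[best].id);
      dag[best].id = -1;   /* mark as scheduled */
      remaining--;

      /* Release the children: each becomes a DAG head once its last parent
       * is scheduled, and may not start before clock + edge latency.
       */
      for (int c = 0; c < (int)dag[best].children.size(); c++) {
         toy_node &child = dag[dag[best].children[c]];
         int ready = clock + dag[best].child_latency[c];
         if (ready > child.unblocked_time)
            child.unblocked_time = ready;
         child.unscheduled_parents--;
      }
   }
}
#endif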
  56.  
  57. static bool debug = false;
  58.  
  59. class schedule_node : public exec_node
  60. {
  61. public:
  62.    schedule_node(backend_instruction *inst, const struct brw_context *brw)
  63.    {
  64.       this->inst = inst;
  65.       this->child_array_size = 0;
  66.       this->children = NULL;
  67.       this->child_latency = NULL;
  68.       this->child_count = 0;
  69.       this->parent_count = 0;
  70.       this->unblocked_time = 0;
  71.  
  72.       /* We can't measure Gen6 timings directly but expect them to be much
  73.        * closer to Gen7 than Gen4.
  74.        */
  75.       if (brw->gen >= 6)
  76.          set_latency_gen7(brw->is_haswell);
  77.       else
  78.          set_latency_gen4();
  79.    }
  80.  
  81.    void set_latency_gen4();
  82.    void set_latency_gen7(bool is_haswell);
  83.  
  84.    backend_instruction *inst;
  85.    schedule_node **children;
  86.    int *child_latency;
  87.    int child_count;
  88.    int parent_count;
  89.    int child_array_size;
  90.    int unblocked_time;
  91.    int latency;
  92. };
  93.  
  94. void
  95. schedule_node::set_latency_gen4()
  96. {
  97.    int chans = 8;
  98.    int math_latency = 22;
  99.  
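   /* Each math case below scales a per-opcode round count by the channel
    * count and the per-round math latency, i.e. latency = rounds * chans *
    * math_latency.  SHADER_OPCODE_POW, for example, works out to
    * 8 * 8 * 22 = 1408 cycles; non-math opcodes fall through to a flat 2.
    */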
  100.    switch (inst->opcode) {
  101.    case SHADER_OPCODE_RCP:
  102.       this->latency = 1 * chans * math_latency;
  103.       break;
  104.    case SHADER_OPCODE_RSQ:
  105.       this->latency = 2 * chans * math_latency;
  106.       break;
  107.    case SHADER_OPCODE_INT_QUOTIENT:
  108.    case SHADER_OPCODE_SQRT:
  109.    case SHADER_OPCODE_LOG2:
  110.       /* full precision log.  partial is 2. */
  111.       this->latency = 3 * chans * math_latency;
  112.       break;
  113.    case SHADER_OPCODE_INT_REMAINDER:
  114.    case SHADER_OPCODE_EXP2:
  115.       /* full precision.  partial is 3, same throughput. */
  116.       this->latency = 4 * chans * math_latency;
  117.       break;
  118.    case SHADER_OPCODE_POW:
  119.       this->latency = 8 * chans * math_latency;
  120.       break;
  121.    case SHADER_OPCODE_SIN:
  122.    case SHADER_OPCODE_COS:
  123.       /* minimum latency, max is 12 rounds. */
  124.       this->latency = 5 * chans * math_latency;
  125.       break;
  126.    default:
  127.       this->latency = 2;
  128.       break;
  129.    }
  130. }
  131.  
  132. void
  133. schedule_node::set_latency_gen7(bool is_haswell)
  134. {
  135.    switch (inst->opcode) {
  136.    case BRW_OPCODE_MAD:
  137.       /* 2 cycles
  138.        *  (since the last two src operands are in different register banks):
  139.        * mad(8) g4<1>F g2.2<4,1,1>F.x  g2<4,1,1>F.x g3.1<4,1,1>F.x { align16 WE_normal 1Q };
  140.        *
  141.        * 3 cycles on IVB, 4 on HSW
  142.        *  (since the last two src operands are in the same register bank):
  143.        * mad(8) g4<1>F g2.2<4,1,1>F.x  g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
  144.        *
  145.        * 18 cycles on IVB, 16 on HSW
  146.        *  (since the last two src operands are in different register banks):
  147.        * mad(8) g4<1>F g2.2<4,1,1>F.x  g2<4,1,1>F.x g3.1<4,1,1>F.x { align16 WE_normal 1Q };
  148.        * mov(8) null   g4<4,5,1>F                     { align16 WE_normal 1Q };
  149.        *
  150.        * 20 cycles on IVB, 18 on HSW
  151.        *  (since the last two src operands are in the same register bank):
  152.        * mad(8) g4<1>F g2.2<4,1,1>F.x  g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
  153.        * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
  154.        */
  155.  
  156.       /* Our register allocator doesn't know about register banks, so use the
  157.        * higher latency.
  158.        */
  159.       latency = is_haswell ? 16 : 18;
  160.       break;
  161.  
  162.    case BRW_OPCODE_LRP:
  163.       /* 2 cycles
  164.        *  (since the last two src operands are in different register banks):
  165.        * lrp(8) g4<1>F g2.2<4,1,1>F.x  g2<4,1,1>F.x g3.1<4,1,1>F.x { align16 WE_normal 1Q };
  166.        *
  167.        * 3 cycles on IVB, 4 on HSW
  168.        *  (since the last two src operands are in the same register bank):
  169.        * lrp(8) g4<1>F g2.2<4,1,1>F.x  g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
  170.        *
  171.        * 16 cycles on IVB, 14 on HSW
  172.        *  (since the last two src operands are in different register banks):
  173.        * lrp(8) g4<1>F g2.2<4,1,1>F.x  g2<4,1,1>F.x g3.1<4,1,1>F.x { align16 WE_normal 1Q };
  174.        * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
  175.        *
  176.        * 16 cycles
  177.        *  (since the last two src operands are in the same register bank):
  178.        * lrp(8) g4<1>F g2.2<4,1,1>F.x  g2<4,1,1>F.x g2.1<4,1,1>F.x { align16 WE_normal 1Q };
  179.        * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
  180.        */
  181.  
  182.       /* Our register allocator doesn't know about register banks, so use the
  183.        * higher latency.
  184.        */
  185.       latency = 14;
  186.       break;
  187.  
  188.    case SHADER_OPCODE_RCP:
  189.    case SHADER_OPCODE_RSQ:
  190.    case SHADER_OPCODE_SQRT:
  191.    case SHADER_OPCODE_LOG2:
  192.    case SHADER_OPCODE_EXP2:
  193.    case SHADER_OPCODE_SIN:
  194.    case SHADER_OPCODE_COS:
  195.       /* 2 cycles:
  196.        * math inv(8) g4<1>F g2<0,1,0>F      null       { align1 WE_normal 1Q };
  197.        *
  198.        * 18 cycles:
  199.        * math inv(8) g4<1>F g2<0,1,0>F      null       { align1 WE_normal 1Q };
  200.        * mov(8)      null   g4<8,8,1>F                 { align1 WE_normal 1Q };
  201.        *
  202.        * Same for exp2, log2, rsq, sqrt, sin, cos.
  203.        */
  204.       latency = is_haswell ? 14 : 16;
  205.       break;
  206.  
  207.    case SHADER_OPCODE_POW:
  208.       /* 2 cycles:
  209.        * math pow(8) g4<1>F g2<0,1,0>F   g2.1<0,1,0>F  { align1 WE_normal 1Q };
  210.        *
  211.        * 26 cycles:
  212.        * math pow(8) g4<1>F g2<0,1,0>F   g2.1<0,1,0>F  { align1 WE_normal 1Q };
  213.        * mov(8)      null   g4<8,8,1>F                 { align1 WE_normal 1Q };
  214.        */
  215.       latency = is_haswell ? 22 : 24;
  216.       break;
  217.  
  218.    case SHADER_OPCODE_TEX:
  219.    case SHADER_OPCODE_TXD:
  220.    case SHADER_OPCODE_TXF:
  221.    case SHADER_OPCODE_TXL:
  222.       /* 18 cycles:
  223.        * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
  224.        * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
  225.        * send(8) g4<1>UW    g114<8,8,1>F
  226.        *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
  227.        *
  228.        * 697 +/-49 cycles (min 610, n=26):
  229.        * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
  230.        * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
  231.        * send(8) g4<1>UW    g114<8,8,1>F
  232.        *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
  233.        * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
  234.        *
  235.        * So the latency on our first texture load of the batchbuffer takes
  236.        * ~700 cycles, since the caches are cold at that point.
  237.        *
  238.        * 840 +/- 92 cycles (min 720, n=25):
  239.        * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
  240.        * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
  241.        * send(8) g4<1>UW    g114<8,8,1>F
  242.        *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
  243.        * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
  244.        * send(8) g4<1>UW    g114<8,8,1>F
  245.        *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
  246.        * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
  247.        *
  248.        * On the second load, it takes just an extra ~140 cycles, and after
  249.        * accounting for the 14 cycles of the MOV's latency, that makes ~130.
  250.        *
  251.        * 683 +/- 49 cycles (min = 602, n=47):
  252.        * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
  253.        * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
  254.        * send(8) g4<1>UW    g114<8,8,1>F
  255.        *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
  256.        * send(8) g50<1>UW   g114<8,8,1>F
  257.        *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
  258.        * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
  259.        *
  260.        * The unit appears to be pipelined, since this matches up with the
  261.        * cache-cold case, despite there being two loads here.  If you replace
  262.        * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
  263.        *
  264.        * So, take some number between the cache-hot 140 cycles and the
  265.        * cache-cold 700 cycles.  No particular tuning was done on this.
  266.        *
  267.        * I haven't done significant testing of the non-TEX opcodes.  TXL at
  268.        * least looked about the same as TEX.
  269.        */
  270.       latency = 200;
  271.       break;
  272.  
  273.    case SHADER_OPCODE_TXS:
  274.       /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
  275.        * cycles (n=15):
  276.        * mov(8)   g114<1>UD  0D                        { align1 WE_normal 1Q };
  277.        * send(8)  g6<1>UW    g114<8,8,1>F
  278.        *   sampler (10, 0, 10, 1) mlen 1 rlen 4        { align1 WE_normal 1Q };
  279.        * mov(16)  g6<1>F     g6<8,8,1>D                { align1 WE_normal 1Q };
  280.        *
  281.        *
  282.        * Two loads was 535 +/- 30 cycles (n=19):
  283.        * mov(16)   g114<1>UD  0D                       { align1 WE_normal 1H };
  284.        * send(16)  g6<1>UW    g114<8,8,1>F
  285.        *   sampler (10, 0, 10, 2) mlen 2 rlen 8        { align1 WE_normal 1H };
  286.        * mov(16)   g114<1>UD  0D                       { align1 WE_normal 1H };
  287.        * mov(16)   g6<1>F     g6<8,8,1>D               { align1 WE_normal 1H };
  288.        * send(16)  g8<1>UW    g114<8,8,1>F
  289.        *   sampler (10, 0, 10, 2) mlen 2 rlen 8        { align1 WE_normal 1H };
  290.        * mov(16)   g8<1>F     g8<8,8,1>D               { align1 WE_normal 1H };
  291.        * add(16)   g6<1>F     g6<8,8,1>F   g8<8,8,1>F  { align1 WE_normal 1H };
  292.        *
  293.        * Since the only caches that should matter are just the
  294.        * instruction/state cache containing the surface state, assume that we
  295.        * always have hot caches.
  296.        */
  297.       latency = 100;
  298.       break;
  299.  
  300.    case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
  301.    case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
  302.    case VS_OPCODE_PULL_CONSTANT_LOAD:
  303.       /* testing using varying-index pull constants:
  304.        *
  305.        * 16 cycles:
  306.        * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
  307.        * send(8) g4<1>F  g4<8,8,1>D
  308.        *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
  309.        *
  310.        * ~480 cycles:
  311.        * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
  312.        * send(8) g4<1>F  g4<8,8,1>D
  313.        *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
  314.        * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
  315.        *
  316.        * ~620 cycles:
  317.        * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
  318.        * send(8) g4<1>F  g4<8,8,1>D
  319.        *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
  320.        * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
  321.        * send(8) g4<1>F  g4<8,8,1>D
  322.        *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
  323.        * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
  324.        *
  325.        * So, if it's cache-hot, it's about 140.  If it's cache cold, it's
  326.        * about 460.  We expect to mostly be cache hot, so pick something more
  327.        * in that direction.
  328.        */
  329.       latency = 200;
  330.       break;
  331.  
  332.    default:
  333.       /* 2 cycles:
  334.        * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
  335.        *
  336.        * 16 cycles:
  337.        * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
  338.        * mov(8) null   g4<8,8,1>F                      { align1 WE_normal 1Q };
  339.        */
  340.       latency = 14;
  341.       break;
  342.    }
  343. }
  344.  
  345. class instruction_scheduler {
  346. public:
  347.    instruction_scheduler(backend_visitor *v, int grf_count, bool post_reg_alloc)
  348.    {
  349.       this->bv = v;
  350.       this->mem_ctx = ralloc_context(v->mem_ctx);
  351.       this->grf_count = grf_count;
  352.       this->instructions.make_empty();
  353.       this->instructions_to_schedule = 0;
  354.       this->post_reg_alloc = post_reg_alloc;
  355.       this->time = 0;
  356.    }
  357.  
  358.    ~instruction_scheduler()
  359.    {
  360.       ralloc_free(this->mem_ctx);
  361.    }
  362.    void add_barrier_deps(schedule_node *n);
  363.    void add_dep(schedule_node *before, schedule_node *after, int latency);
  364.    void add_dep(schedule_node *before, schedule_node *after);
  365.  
  366.    void run(exec_list *instructions);
  367.    void add_inst(backend_instruction *inst);
  368.    virtual void calculate_deps() = 0;
  369.    virtual schedule_node *choose_instruction_to_schedule() = 0;
  370.  
  371.    /**
  372.     * Returns how many cycles it takes the instruction to issue.
  373.     *
  374.     * Instructions in gen hardware are handled one simd4 vector at a time,
  375.     * with 1 cycle per vector dispatched.  Thus 8-wide pixel shaders take 2
  376.     * cycles to dispatch and 16-wide (compressed) instructions take 4.
  377.     */
  378.    virtual int issue_time(backend_instruction *inst) = 0;
  379.  
  380.    void schedule_instructions(backend_instruction *next_block_header);
  381.  
  382.    void *mem_ctx;
  383.  
  384.    bool post_reg_alloc;
  385.    int instructions_to_schedule;
  386.    int grf_count;
  387.    int time;
  388.    exec_list instructions;
  389.    backend_visitor *bv;
  390. };
  391.  
  392. class fs_instruction_scheduler : public instruction_scheduler
  393. {
  394. public:
  395.    fs_instruction_scheduler(fs_visitor *v, int grf_count, bool post_reg_alloc);
  396.    void calculate_deps();
  397.    bool is_compressed(fs_inst *inst);
  398.    schedule_node *choose_instruction_to_schedule();
  399.    int issue_time(backend_instruction *inst);
  400.    fs_visitor *v;
  401. };
  402.  
  403. fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
  404.                                                    int grf_count,
  405.                                                    bool post_reg_alloc)
  406.    : instruction_scheduler(v, grf_count, post_reg_alloc),
  407.      v(v)
  408. {
  409. }
  410.  
  411. class vec4_instruction_scheduler : public instruction_scheduler
  412. {
  413. public:
  414.    vec4_instruction_scheduler(vec4_visitor *v, int grf_count);
  415.    void calculate_deps();
  416.    schedule_node *choose_instruction_to_schedule();
  417.    int issue_time(backend_instruction *inst);
  418.    vec4_visitor *v;
  419. };
  420.  
  421. vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
  422.                                                        int grf_count)
  423.    : instruction_scheduler(v, grf_count, true),
  424.      v(v)
  425. {
  426. }
  427.  
  428. void
  429. instruction_scheduler::add_inst(backend_instruction *inst)
  430. {
  431.    schedule_node *n = new(mem_ctx) schedule_node(inst, bv->brw);
  432.  
  433.    assert(!inst->is_head_sentinel());
  434.    assert(!inst->is_tail_sentinel());
  435.  
  436.    this->instructions_to_schedule++;
  437.  
  438.    inst->remove();
  439.    instructions.push_tail(n);
  440. }
  441.  
  442. /**
  443.  * Add a dependency between two instruction nodes.
  444.  *
  445.  * The @after node will be scheduled after @before.  We will try to
  446.  * schedule it @latency cycles after @before, but no guarantees there.
  447.  */
  448. void
  449. instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
  450.                                int latency)
  451. {
  452.    if (!before || !after)
  453.       return;
  454.  
  455.    assert(before != after);
  456.  
  457.    for (int i = 0; i < before->child_count; i++) {
  458.       if (before->children[i] == after) {
  459.          before->child_latency[i] = MAX2(before->child_latency[i], latency);
  460.          return;
  461.       }
  462.    }
  463.  
  464.    if (before->child_array_size <= before->child_count) {
  465.       if (before->child_array_size < 16)
  466.          before->child_array_size = 16;
  467.       else
  468.          before->child_array_size *= 2;
  469.  
  470.       before->children = reralloc(mem_ctx, before->children,
  471.                                   schedule_node *,
  472.                                   before->child_array_size);
  473.       before->child_latency = reralloc(mem_ctx, before->child_latency,
  474.                                        int, before->child_array_size);
  475.    }
  476.  
  477.    before->children[before->child_count] = after;
  478.    before->child_latency[before->child_count] = latency;
  479.    before->child_count++;
  480.    after->parent_count++;
  481. }
  482.  
  483. void
  484. instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
  485. {
  486.    if (!before)
  487.       return;
  488.  
  489.    add_dep(before, after, before->latency);
  490. }
  491.  
  492. /**
  493.  * Sometimes we really want this node to execute after everything that
  494.  * was before it and before everything that followed it.  This adds
  495.  * the deps to do so.
  496.  */
  497. void
  498. instruction_scheduler::add_barrier_deps(schedule_node *n)
  499. {
  500.    schedule_node *prev = (schedule_node *)n->prev;
  501.    schedule_node *next = (schedule_node *)n->next;
  502.  
  503.    if (prev) {
  504.       while (!prev->is_head_sentinel()) {
  505.          add_dep(prev, n, 0);
  506.          prev = (schedule_node *)prev->prev;
  507.       }
  508.    }
  509.  
  510.    if (next) {
  511.       while (!next->is_tail_sentinel()) {
  512.          add_dep(n, next, 0);
  513.          next = (schedule_node *)next->next;
  514.       }
  515.    }
  516. }
  517.  
  518. /* instruction scheduling needs to be aware of when an MRF write
  519.  * actually writes 2 MRFs.
  520.  */
  521. bool
  522. fs_instruction_scheduler::is_compressed(fs_inst *inst)
  523. {
  524.    return (v->dispatch_width == 16 &&
  525.            !inst->force_uncompressed &&
  526.            !inst->force_sechalf);
  527. }
  528.  
  529. void
  530. fs_instruction_scheduler::calculate_deps()
  531. {
  532.    /* Pre-register-allocation, this tracks the last write per VGRF (so
  533.     * different reg_offsets within it can interfere when they shouldn't).
  534.     * After register allocation, reg_offsets are gone and we track individual
  535.     * GRF registers.
  536.     */
  537.    schedule_node *last_grf_write[grf_count];
  538.    schedule_node *last_mrf_write[BRW_MAX_MRF];
  539.    schedule_node *last_conditional_mod[2] = { NULL, NULL };
  540.    /* Fixed HW registers are assumed to be separate from the virtual
  541.     * GRFs, so they can be tracked separately.  We don't really write
  542.     * to fixed GRFs much, so don't bother tracking them on a more
  543.     * granular level.
  544.     */
  545.    schedule_node *last_fixed_grf_write = NULL;
  546.    int reg_width = v->dispatch_width / 8;
  547.  
  548.    /* The last instruction always needs to still be the last
  549.     * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
  550.     * WHILE) and scheduling other things after it would disturb the
  551.     * basic block, or it's FB_WRITE and we should do a better job at
  552.     * dead code elimination anyway.
  553.     */
  554.    schedule_node *last = (schedule_node *)instructions.get_tail();
  555.    add_barrier_deps(last);
  556.  
  557.    memset(last_grf_write, 0, sizeof(last_grf_write));
  558.    memset(last_mrf_write, 0, sizeof(last_mrf_write));
  559.  
  560.    /* top-to-bottom dependencies: RAW and WAW. */
  561.    foreach_list(node, &instructions) {
  562.       schedule_node *n = (schedule_node *)node;
  563.       fs_inst *inst = (fs_inst *)n->inst;
  564.  
  565.       if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT)
  566.          add_barrier_deps(n);
  567.  
  568.       /* read-after-write deps. */
  569.       for (int i = 0; i < 3; i++) {
  570.          if (inst->src[i].file == GRF) {
  571.             if (post_reg_alloc) {
  572.                for (int r = 0; r < reg_width; r++)
  573.                   add_dep(last_grf_write[inst->src[i].reg + r], n);
  574.             } else {
  575.                add_dep(last_grf_write[inst->src[i].reg], n);
  576.             }
  577.          } else if (inst->src[i].file == HW_REG &&
  578.                     (inst->src[i].fixed_hw_reg.file ==
  579.                      BRW_GENERAL_REGISTER_FILE)) {
  580.             if (post_reg_alloc) {
  581.                int size = reg_width;
  582.                if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
  583.                   size = 1;
  584.                for (int r = 0; r < size; r++)
  585.                   add_dep(last_grf_write[inst->src[i].fixed_hw_reg.nr + r], n);
  586.             } else {
  587.                add_dep(last_fixed_grf_write, n);
  588.             }
  589.          } else if (inst->src[i].file != BAD_FILE &&
  590.                     inst->src[i].file != IMM &&
  591.                     inst->src[i].file != UNIFORM) {
  592.             assert(inst->src[i].file != MRF);
  593.             add_barrier_deps(n);
  594.          }
  595.       }
  596.  
  597.       for (int i = 0; i < inst->mlen; i++) {
  598.          /* It looks like the MRF regs are released in the send
  599.           * instruction once it's sent, not when the result comes
  600.           * back.
  601.           */
  602.          add_dep(last_mrf_write[inst->base_mrf + i], n);
  603.       }
  604.  
  605.       if (inst->predicate) {
  606.          add_dep(last_conditional_mod[inst->flag_subreg], n);
  607.       }
  608.  
  609.       /* write-after-write deps. */
  610.       if (inst->dst.file == GRF) {
  611.          if (post_reg_alloc) {
  612.             for (int r = 0; r < inst->regs_written * reg_width; r++) {
  613.                add_dep(last_grf_write[inst->dst.reg + r], n);
  614.                last_grf_write[inst->dst.reg + r] = n;
  615.             }
  616.          } else {
  617.             add_dep(last_grf_write[inst->dst.reg], n);
  618.             last_grf_write[inst->dst.reg] = n;
  619.          }
  620.       } else if (inst->dst.file == MRF) {
  621.          int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
  622.  
  623.          add_dep(last_mrf_write[reg], n);
  624.          last_mrf_write[reg] = n;
  625.          if (is_compressed(inst)) {
  626.             if (inst->dst.reg & BRW_MRF_COMPR4)
  627.                reg += 4;
  628.             else
  629.                reg++;
  630.             add_dep(last_mrf_write[reg], n);
  631.             last_mrf_write[reg] = n;
  632.          }
  633.       } else if (inst->dst.file == HW_REG &&
  634.                  inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
  635.          if (post_reg_alloc) {
  636.             for (int r = 0; r < reg_width; r++)
  637.                last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
  638.          } else {
  639.             last_fixed_grf_write = n;
  640.          }
  641.       } else if (inst->dst.file != BAD_FILE) {
  642.          add_barrier_deps(n);
  643.       }
  644.  
  645.       if (inst->mlen > 0) {
  646.          for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
  647.             add_dep(last_mrf_write[inst->base_mrf + i], n);
  648.             last_mrf_write[inst->base_mrf + i] = n;
  649.          }
  650.       }
  651.  
  652.       /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
  653.        * conditional_mod, because it sets the flag register.
  654.        */
  655.       if (inst->conditional_mod ||
  656.           inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
  657.          add_dep(last_conditional_mod[inst->flag_subreg], n, 0);
  658.          last_conditional_mod[inst->flag_subreg] = n;
  659.       }
  660.    }
  661.  
  662.    /* bottom-to-top dependencies: WAR */
  663.    memset(last_grf_write, 0, sizeof(last_grf_write));
  664.    memset(last_mrf_write, 0, sizeof(last_mrf_write));
  665.    memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
  666.    last_fixed_grf_write = NULL;
  667.  
  668.    exec_node *node;
  669.    exec_node *prev;
  670.    for (node = instructions.get_tail(), prev = node->prev;
  671.         !node->is_head_sentinel();
  672.         node = prev, prev = node->prev) {
  673.       schedule_node *n = (schedule_node *)node;
  674.       fs_inst *inst = (fs_inst *)n->inst;
  675.  
  676.       /* write-after-read deps. */
  677.       for (int i = 0; i < 3; i++) {
  678.          if (inst->src[i].file == GRF) {
  679.             if (post_reg_alloc) {
  680.                for (int r = 0; r < reg_width; r++)
  681.                   add_dep(n, last_grf_write[inst->src[i].reg + r]);
  682.             } else {
  683.                add_dep(n, last_grf_write[inst->src[i].reg]);
  684.             }
  685.          } else if (inst->src[i].file == HW_REG &&
  686.                     (inst->src[i].fixed_hw_reg.file ==
  687.                      BRW_GENERAL_REGISTER_FILE)) {
  688.             if (post_reg_alloc) {
  689.                int size = reg_width;
  690.                if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
  691.                   size = 1;
  692.                for (int r = 0; r < size; r++)
  693.                   add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r]);
  694.             } else {
  695.                add_dep(n, last_fixed_grf_write);
  696.             }
  697.          } else if (inst->src[i].file != BAD_FILE &&
  698.                     inst->src[i].file != IMM &&
  699.                     inst->src[i].file != UNIFORM) {
  700.             assert(inst->src[i].file != MRF);
  701.             add_barrier_deps(n);
  702.          }
  703.       }
  704.  
  705.       for (int i = 0; i < inst->mlen; i++) {
  706.          /* It looks like the MRF regs are released in the send
  707.           * instruction once it's sent, not when the result comes
  708.           * back.
  709.           */
  710.          add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
  711.       }
  712.  
  713.       if (inst->predicate) {
  714.          add_dep(n, last_conditional_mod[inst->flag_subreg]);
  715.       }
  716.  
  717.       /* Update the things this instruction wrote, so earlier reads
  718.        * can mark this as a WAR dependency.
  719.        */
  720.       if (inst->dst.file == GRF) {
  721.          if (post_reg_alloc) {
  722.             for (int r = 0; r < inst->regs_written * reg_width; r++)
  723.                last_grf_write[inst->dst.reg + r] = n;
  724.          } else {
  725.             last_grf_write[inst->dst.reg] = n;
  726.          }
  727.       } else if (inst->dst.file == MRF) {
  728.          int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
  729.  
  730.          last_mrf_write[reg] = n;
  731.  
  732.          if (is_compressed(inst)) {
  733.             if (inst->dst.reg & BRW_MRF_COMPR4)
  734.                reg += 4;
  735.             else
  736.                reg++;
  737.  
  738.             last_mrf_write[reg] = n;
  739.          }
  740.       } else if (inst->dst.file == HW_REG &&
  741.                  inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
  742.          if (post_reg_alloc) {
  743.             for (int r = 0; r < reg_width; r++)
  744.                last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
  745.          } else {
  746.             last_fixed_grf_write = n;
  747.          }
  748.       } else if (inst->dst.file != BAD_FILE) {
  749.          add_barrier_deps(n);
  750.       }
  751.  
  752.       if (inst->mlen > 0) {
  753.          for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
  754.             last_mrf_write[inst->base_mrf + i] = n;
  755.          }
  756.       }
  757.  
  758.       /* Treat FS_OPCODE_MOV_DISPATCH_TO_FLAGS as though it had a
  759.        * conditional_mod, because it sets the flag register.
  760.        */
  761.       if (inst->conditional_mod ||
  762.           inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
  763.          last_conditional_mod[inst->flag_subreg] = n;
  764.       }
  765.    }
  766. }
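
/* Illustrative sketch (hypothetical names, excluded from the build with
 * #if 0) of the last-writer bookkeeping that calculate_deps() uses above:
 * a forward pass adds RAW and WAW edges against the most recent writer of
 * each register, and a backward pass adds WAR edges against the nearest
 * later writer.
 */
#if 0
#include <cstdio>
#include <vector>

struct toy_inst {
   int dst;      /* register written, or -1 for none */
   int src[2];   /* registers read, or -1 for none */
};

static void
toy_calculate_deps(const std::vector<toy_inst> &insts)
{
   enum { TOY_NUM_REGS = 8 };
   int last_write[TOY_NUM_REGS];

   /* Top-to-bottom: RAW (a read depends on the last write of that register)
    * and WAW (a write depends on the last write of the same register).
    */
   for (int r = 0; r < TOY_NUM_REGS; r++)
      last_write[r] = -1;
   for (int i = 0; i < (int)insts.size(); i++) {
      for (int s = 0; s < 2; s++) {
         if (insts[i].src[s] >= 0 && last_write[insts[i].src[s]] >= 0)
            printf("RAW: %d -> %d\n", last_write[insts[i].src[s]], i);
      }
      if (insts[i].dst >= 0) {
         if (last_write[insts[i].dst] >= 0)
            printf("WAW: %d -> %d\n", last_write[insts[i].dst], i);
         last_write[insts[i].dst] = i;
      }
   }

   /* Bottom-to-top: WAR (a later write of a register must not be scheduled
    * ahead of an earlier read of it).
    */
   for (int r = 0; r < TOY_NUM_REGS; r++)
      last_write[r] = -1;
   for (int i = (int)insts.size() - 1; i >= 0; i--) {
      for (int s = 0; s < 2; s++) {
         if (insts[i].src[s] >= 0 && last_write[insts[i].src[s]] >= 0)
            printf("WAR: %d -> %d\n", i, last_write[insts[i].src[s]]);
      }
      if (insts[i].dst >= 0)
         last_write[insts[i].dst] = i;
   }
}
#endif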
  767.  
  768. void
  769. vec4_instruction_scheduler::calculate_deps()
  770. {
  771.    schedule_node *last_grf_write[grf_count];
  772.    schedule_node *last_mrf_write[BRW_MAX_MRF];
  773.    schedule_node *last_conditional_mod = NULL;
  774.    /* Fixed HW registers are assumed to be separate from the virtual
  775.     * GRFs, so they can be tracked separately.  We don't really write
  776.     * to fixed GRFs much, so don't bother tracking them on a more
  777.     * granular level.
  778.     */
  779.    schedule_node *last_fixed_grf_write = NULL;
  780.  
  781.    /* The last instruction always needs to still be the last instruction.
  782.     * Either it's flow control (IF, ELSE, ENDIF, DO, WHILE) and scheduling
  783.     * other things after it would disturb the basic block, or it's the EOT
  784.     * URB_WRITE and we should do a better job at dead code eliminating
  785.     * anything that could have been scheduled after it.
  786.     */
  787.    schedule_node *last = (schedule_node *)instructions.get_tail();
  788.    add_barrier_deps(last);
  789.  
  790.    memset(last_grf_write, 0, sizeof(last_grf_write));
  791.    memset(last_mrf_write, 0, sizeof(last_mrf_write));
  792.  
  793.    /* top-to-bottom dependencies: RAW and WAW. */
  794.    foreach_list(node, &instructions) {
  795.       schedule_node *n = (schedule_node *)node;
  796.       vec4_instruction *inst = (vec4_instruction *)n->inst;
  797.  
  798.       /* read-after-write deps. */
  799.       for (int i = 0; i < 3; i++) {
  800.          if (inst->src[i].file == GRF) {
  801.             add_dep(last_grf_write[inst->src[i].reg], n);
  802.          } else if (inst->src[i].file == HW_REG &&
  803.                     (inst->src[i].fixed_hw_reg.file ==
  804.                      BRW_GENERAL_REGISTER_FILE)) {
  805.             add_dep(last_fixed_grf_write, n);
  806.          } else if (inst->src[i].file != BAD_FILE &&
  807.                     inst->src[i].file != IMM &&
  808.                     inst->src[i].file != UNIFORM) {
  809.             /* No reads from MRF, and ATTR is already translated away */
  810.             assert(inst->src[i].file != MRF &&
  811.                    inst->src[i].file != ATTR);
  812.             add_barrier_deps(n);
  813.          }
  814.       }
  815.  
  816.       for (int i = 0; i < inst->mlen; i++) {
  817.          /* It looks like the MRF regs are released in the send
  818.           * instruction once it's sent, not when the result comes
  819.           * back.
  820.           */
  821.          add_dep(last_mrf_write[inst->base_mrf + i], n);
  822.       }
  823.  
  824.       if (inst->predicate) {
  825.          assert(last_conditional_mod);
  826.          add_dep(last_conditional_mod, n);
  827.       }
  828.  
  829.       /* write-after-write deps. */
  830.       if (inst->dst.file == GRF) {
  831.          add_dep(last_grf_write[inst->dst.reg], n);
  832.          last_grf_write[inst->dst.reg] = n;
  833.       } else if (inst->dst.file == MRF) {
  834.          add_dep(last_mrf_write[inst->dst.reg], n);
  835.          last_mrf_write[inst->dst.reg] = n;
  836.       } else if (inst->dst.file == HW_REG &&
  837.                  inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
  838.          last_fixed_grf_write = n;
  839.       } else if (inst->dst.file != BAD_FILE) {
  840.          add_barrier_deps(n);
  841.       }
  842.  
  843.       if (inst->mlen > 0) {
  844.          for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
  845.             add_dep(last_mrf_write[inst->base_mrf + i], n);
  846.             last_mrf_write[inst->base_mrf + i] = n;
  847.          }
  848.       }
  849.  
  850.       if (inst->conditional_mod) {
  851.          add_dep(last_conditional_mod, n, 0);
  852.          last_conditional_mod = n;
  853.       }
  854.    }
  855.  
  856.    /* bottom-to-top dependencies: WAR */
  857.    memset(last_grf_write, 0, sizeof(last_grf_write));
  858.    memset(last_mrf_write, 0, sizeof(last_mrf_write));
  859.    last_conditional_mod = NULL;
  860.    last_fixed_grf_write = NULL;
  861.  
  862.    exec_node *node;
  863.    exec_node *prev;
  864.    for (node = instructions.get_tail(), prev = node->prev;
  865.         !node->is_head_sentinel();
  866.         node = prev, prev = node->prev) {
  867.       schedule_node *n = (schedule_node *)node;
  868.       vec4_instruction *inst = (vec4_instruction *)n->inst;
  869.  
  870.       /* write-after-read deps. */
  871.       for (int i = 0; i < 3; i++) {
  872.          if (inst->src[i].file == GRF) {
  873.             add_dep(n, last_grf_write[inst->src[i].reg]);
  874.          } else if (inst->src[i].file == HW_REG &&
  875.                     (inst->src[i].fixed_hw_reg.file ==
  876.                      BRW_GENERAL_REGISTER_FILE)) {
  877.             add_dep(n, last_fixed_grf_write);
  878.          } else if (inst->src[i].file != BAD_FILE &&
  879.                     inst->src[i].file != IMM &&
  880.                     inst->src[i].file != UNIFORM) {
  881.             assert(inst->src[i].file != MRF &&
  882.                    inst->src[i].file != ATTR);
  883.             add_barrier_deps(n);
  884.          }
  885.       }
  886.  
  887.       for (int i = 0; i < inst->mlen; i++) {
  888.          /* It looks like the MRF regs are released in the send
  889.           * instruction once it's sent, not when the result comes
  890.           * back.
  891.           */
  892.          add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
  893.       }
  894.  
  895.       if (inst->predicate) {
  896.          add_dep(n, last_conditional_mod);
  897.       }
  898.  
  899.       /* Update the things this instruction wrote, so earlier reads
  900.        * can mark this as a WAR dependency.
  901.        */
  902.       if (inst->dst.file == GRF) {
  903.          last_grf_write[inst->dst.reg] = n;
  904.       } else if (inst->dst.file == MRF) {
  905.          last_mrf_write[inst->dst.reg] = n;
  906.       } else if (inst->dst.file == HW_REG &&
  907.                  inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
  908.          last_fixed_grf_write = n;
  909.       } else if (inst->dst.file != BAD_FILE) {
  910.          add_barrier_deps(n);
  911.       }
  912.  
  913.       if (inst->mlen > 0) {
  914.          for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
  915.             last_mrf_write[inst->base_mrf + i] = n;
  916.          }
  917.       }
  918.  
  919.       if (inst->conditional_mod) {
  920.          last_conditional_mod = n;
  921.       }
  922.    }
  923. }
  924.  
  925. schedule_node *
  926. fs_instruction_scheduler::choose_instruction_to_schedule()
  927. {
  928.    schedule_node *chosen = NULL;
  929.  
  930.    if (post_reg_alloc) {
  931.       int chosen_time = 0;
  932.  
  933.       /* Of the instructions ready to execute or the closest to being
  934.        * ready, choose the oldest one.
  935.        */
  936.       foreach_list(node, &instructions) {
  937.          schedule_node *n = (schedule_node *)node;
  938.  
  939.          if (!chosen || n->unblocked_time < chosen_time) {
  940.             chosen = n;
  941.             chosen_time = n->unblocked_time;
  942.          }
  943.       }
  944.    } else {
  945.       /* Before register allocation, we don't care about the latencies of
  946.        * instructions.  All we care about is reducing live intervals of
  947.        * variables so that we can avoid register spilling, or get 16-wide
  948.        * shaders which naturally do a better job of hiding instruction
  949.        * latency.
  950.        *
  951.        * To do so, schedule our instructions in a roughly LIFO/depth-first
  952.        * order: when new instructions become available as a result of
  953.        * scheduling something, choose those first so that our result
  954.        * hopefully is consumed quickly.
  955.        *
  956.        * The exception is messages that generate more than one result
  957.        * register (AKA texturing).  In those cases, the LIFO search would
  958.        * normally tend to choose them quickly (because scheduling the
  959.        * previous message not only unblocked the children using its result,
  960.        * but also the MRF setup for the next sampler message, which in turn
  961.        * unblocks the next sampler message).
  962.        */
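
      /* Walk newest-to-oldest and take the newest instruction whose result
       * fits in a single register; if every available instruction writes
       * more than one register, the walk falls through to the oldest one.
       */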
  963.       for (schedule_node *node = (schedule_node *)instructions.get_tail();
  964.            node != instructions.get_head()->prev;
  965.            node = (schedule_node *)node->prev) {
  966.          schedule_node *n = (schedule_node *)node;
  967.          fs_inst *inst = (fs_inst *)n->inst;
  968.  
  969.          chosen = n;
  970.          if (inst->regs_written <= 1)
  971.             break;
  972.       }
  973.    }
  974.  
  975.    return chosen;
  976. }
  977.  
  978. schedule_node *
  979. vec4_instruction_scheduler::choose_instruction_to_schedule()
  980. {
  981.    schedule_node *chosen = NULL;
  982.    int chosen_time = 0;
  983.  
  984.    /* Of the instructions ready to execute or the closest to being ready,
  985.     * choose the oldest one.
  986.     */
  987.    foreach_list(node, &instructions) {
  988.       schedule_node *n = (schedule_node *)node;
  989.  
  990.       if (!chosen || n->unblocked_time < chosen_time) {
  991.          chosen = n;
  992.          chosen_time = n->unblocked_time;
  993.       }
  994.    }
  995.  
  996.    return chosen;
  997. }
  998.  
  999. int
  1000. fs_instruction_scheduler::issue_time(backend_instruction *inst)
  1001. {
  1002.    if (is_compressed((fs_inst *)inst))
  1003.       return 4;
  1004.    else
  1005.       return 2;
  1006. }
  1007.  
  1008. int
  1009. vec4_instruction_scheduler::issue_time(backend_instruction *inst)
  1010. {
  1011.    /* We always execute as two vec4s in parallel. */
  1012.    return 2;
  1013. }
  1014.  
  1015. void
  1016. instruction_scheduler::schedule_instructions(backend_instruction *next_block_header)
  1017. {
  1018.    time = 0;
  1019.  
  1020.    /* Remove non-DAG heads from the list. */
  1021.    foreach_list_safe(node, &instructions) {
  1022.       schedule_node *n = (schedule_node *)node;
  1023.       if (n->parent_count != 0)
  1024.          n->remove();
  1025.    }
  1026.  
  1027.    while (!instructions.is_empty()) {
  1028.       schedule_node *chosen = choose_instruction_to_schedule();
  1029.  
  1030.       /* Schedule this instruction. */
  1031.       assert(chosen);
  1032.       chosen->remove();
  1033.       next_block_header->insert_before(chosen->inst);
  1034.       instructions_to_schedule--;
  1035.  
  1036.       /* Update the clock for how soon an instruction could start after the
  1037.        * chosen one.
  1038.        */
  1039.       time += issue_time(chosen->inst);
  1040.  
  1041.       /* If we expected a delay for scheduling, then bump the clock to reflect
  1042.        * that as well.  In reality, the hardware will switch to another
  1043.        * hyperthread and may not return to dispatching our thread for a while
  1044.        * even after we're unblocked.
  1045.        */
  1046.       time = MAX2(time, chosen->unblocked_time);
  1047.  
  1048.       if (debug) {
  1049.          printf("clock %4d, scheduled: ", time);
  1050.          bv->dump_instruction(chosen->inst);
  1051.       }
  1052.  
  1053.       /* Now that we've scheduled a new instruction, some of its
  1054.        * children can be promoted to the list of instructions ready to
  1055.        * be scheduled.  Update the children's unblocked time for this
  1056.        * DAG edge as we do so.
  1057.        */
  1058.       for (int i = 0; i < chosen->child_count; i++) {
  1059.          schedule_node *child = chosen->children[i];
  1060.  
  1061.          child->unblocked_time = MAX2(child->unblocked_time,
  1062.                                       time + chosen->child_latency[i]);
  1063.  
  1064.          child->parent_count--;
  1065.          if (child->parent_count == 0) {
  1066.             if (debug) {
  1067.                printf("now available: ");
  1068.                bv->dump_instruction(child->inst);
  1069.             }
  1070.             instructions.push_tail(child);
  1071.          }
  1072.       }
  1073.  
  1074.       /* Shared resource: the mathbox.  There's one mathbox per EU on Gen6+
  1075.        * but it's more limited pre-gen6, so if we send something off to it then
  1076.        * the next math instruction isn't going to make progress until the first
  1077.        * is done.
  1078.        */
  1079.       if (chosen->inst->is_math()) {
  1080.          foreach_list(node, &instructions) {
  1081.             schedule_node *n = (schedule_node *)node;
  1082.  
  1083.             if (n->inst->is_math())
  1084.                n->unblocked_time = MAX2(n->unblocked_time,
  1085.                                         time + chosen->latency);
  1086.          }
  1087.       }
  1088.    }
  1089.  
  1090.    assert(instructions_to_schedule == 0);
  1091. }
  1092.  
  1093. void
  1094. instruction_scheduler::run(exec_list *all_instructions)
  1095. {
  1096.    backend_instruction *next_block_header =
  1097.       (backend_instruction *)all_instructions->head;
  1098.  
  1099.    if (debug) {
  1100.       printf("\nInstructions before scheduling (reg_alloc %d)\n", post_reg_alloc);
  1101.       bv->dump_instructions();
  1102.    }
  1103.  
  1104.    while (!next_block_header->is_tail_sentinel()) {
  1105.       /* Add things to be scheduled until we get to a new BB. */
  1106.       while (!next_block_header->is_tail_sentinel()) {
  1107.          backend_instruction *inst = next_block_header;
  1108.          next_block_header = (backend_instruction *)next_block_header->next;
  1109.  
  1110.          add_inst(inst);
  1111.          if (inst->is_control_flow())
  1112.             break;
  1113.       }
  1114.       calculate_deps();
  1115.       schedule_instructions(next_block_header);
  1116.    }
  1117.  
  1118.    if (debug) {
  1119.       printf("\nInstructions after scheduling (reg_alloc %d)\n", post_reg_alloc);
  1120.       bv->dump_instructions();
  1121.    }
  1122. }
  1123.  
  1124. void
  1125. fs_visitor::schedule_instructions(bool post_reg_alloc)
  1126. {
  1127.    int grf_count;
  1128.    if (post_reg_alloc)
  1129.       grf_count = grf_used;
  1130.    else
  1131.       grf_count = virtual_grf_count;
  1132.  
  1133.    fs_instruction_scheduler sched(this, grf_count, post_reg_alloc);
  1134.    sched.run(&instructions);
  1135.  
  1136.    if (unlikely(INTEL_DEBUG & DEBUG_WM) && post_reg_alloc) {
  1137.       printf("fs%d estimated execution time: %d cycles\n",
  1138.              dispatch_width, sched.time);
  1139.    }
  1140.  
  1141.    this->live_intervals_valid = false;
  1142. }
  1143.  
  1144. void
  1145. vec4_visitor::opt_schedule_instructions()
  1146. {
  1147.    vec4_instruction_scheduler sched(this, prog_data->total_grf);
  1148.    sched.run(&instructions);
  1149.  
  1150.    if (unlikely(debug_flag)) {
  1151.       printf("vec4 estimated execution time: %d cycles\n", sched.time);
  1152.    }
  1153.  
  1154.    this->live_intervals_valid = false;
  1155. }
  1156.