Subversion Repositories Kolibri OS

Rev

Blame | Last modification | View Log | RSS feed

  1. /*
  2.  * Copyright © 2012 Intel Corporation
  3.  *
  4.  * Permission is hereby granted, free of charge, to any person obtaining a
  5.  * copy of this software and associated documentation files (the "Software"),
  6.  * to deal in the Software without restriction, including without limitation
  7.  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  8.  * and/or sell copies of the Software, and to permit persons to whom the
  9.  * Software is furnished to do so, subject to the following conditions:
  10.  *
  11.  * The above copyright notice and this permission notice (including the next
  12.  * paragraph) shall be included in all copies or substantial portions of the
  13.  * Software.
  14.  *
  15.  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  16.  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  17.  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  18.  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  19.  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
  20.  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
  21.  * IN THE SOFTWARE.
  22.  */
  23.  
  24. /** @file brw_fs_copy_propagation.cpp
  25.  *
  26.  * Support for global copy propagation in two passes: A local pass that does
  27.  * intra-block copy (and constant) propagation, and a global pass that uses
  28.  * dataflow analysis on the copies available at the end of each block to re-do
  29.  * local copy propagation with more copies available.
  30.  *
  31.  * See Muchnick's Advanced Compiler Design and Implementation, section
  32.  * 12.5 (p356).
  33.  */
  34.  
  35. #define ACP_HASH_SIZE 16
  36.  
  37. #include "util/bitset.h"
  38. #include "brw_fs.h"
  39. #include "brw_cfg.h"
  40.  
  41. namespace { /* avoid conflict with opt_copy_propagation_elements */
  42. struct acp_entry : public exec_node {
  43.    fs_reg dst;
  44.    fs_reg src;
  45.    uint8_t regs_written;
  46.    enum opcode opcode;
  47.    bool saturate;
  48. };
  49.  
  50. struct block_data {
  51.    /**
  52.     * Which entries in the fs_copy_prop_dataflow acp table are live at the
  53.     * start of this block.  This is the useful output of the analysis, since
  54.     * it lets us plug those into the local copy propagation on the second
  55.     * pass.
  56.     */
  57.    BITSET_WORD *livein;
  58.  
  59.    /**
  60.     * Which entries in the fs_copy_prop_dataflow acp table are live at the end
  61.     * of this block.  This is done in initial setup from the per-block acps
  62.     * returned by the first local copy prop pass.
  63.     */
  64.    BITSET_WORD *liveout;
  65.  
  66.    /**
  67.     * Which entries in the fs_copy_prop_dataflow acp table are generated by
  68.     * instructions in this block which reach the end of the block without
  69.     * being killed.
  70.     */
  71.    BITSET_WORD *copy;
  72.  
  73.    /**
  74.     * Which entries in the fs_copy_prop_dataflow acp table are killed over the
  75.     * course of this block.
  76.     */
  77.    BITSET_WORD *kill;
  78. };
  79.  
  80. class fs_copy_prop_dataflow
  81. {
  82. public:
  83.    fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
  84.                          exec_list *out_acp[ACP_HASH_SIZE]);
  85.  
  86.    void setup_initial_values();
  87.    void run();
  88.  
  89.    void dump_block_data() const;
  90.  
  91.    void *mem_ctx;
  92.    cfg_t *cfg;
  93.  
  94.    acp_entry **acp;
  95.    int num_acp;
  96.    int bitset_words;
  97.  
  98.   struct block_data *bd;
  99. };
  100. } /* anonymous namespace */
  101.  
  102. fs_copy_prop_dataflow::fs_copy_prop_dataflow(void *mem_ctx, cfg_t *cfg,
  103.                                              exec_list *out_acp[ACP_HASH_SIZE])
  104.    : mem_ctx(mem_ctx), cfg(cfg)
  105. {
  106.    bd = rzalloc_array(mem_ctx, struct block_data, cfg->num_blocks);
  107.  
  108.    num_acp = 0;
  109.    foreach_block (block, cfg) {
  110.       for (int i = 0; i < ACP_HASH_SIZE; i++) {
  111.          num_acp += out_acp[block->num][i].length();
  112.       }
  113.    }
  114.  
  115.    acp = rzalloc_array(mem_ctx, struct acp_entry *, num_acp);
  116.  
  117.    bitset_words = BITSET_WORDS(num_acp);
  118.  
  119.    int next_acp = 0;
  120.    foreach_block (block, cfg) {
  121.       bd[block->num].livein = rzalloc_array(bd, BITSET_WORD, bitset_words);
  122.       bd[block->num].liveout = rzalloc_array(bd, BITSET_WORD, bitset_words);
  123.       bd[block->num].copy = rzalloc_array(bd, BITSET_WORD, bitset_words);
  124.       bd[block->num].kill = rzalloc_array(bd, BITSET_WORD, bitset_words);
  125.  
  126.       for (int i = 0; i < ACP_HASH_SIZE; i++) {
  127.          foreach_in_list(acp_entry, entry, &out_acp[block->num][i]) {
  128.             acp[next_acp] = entry;
  129.  
  130.             /* opt_copy_propagate_local populates out_acp with copies created
  131.              * in a block which are still live at the end of the block.  This
  132.              * is exactly what we want in the COPY set.
  133.              */
  134.             BITSET_SET(bd[block->num].copy, next_acp);
  135.  
  136.             next_acp++;
  137.          }
  138.       }
  139.    }
  140.  
  141.    assert(next_acp == num_acp);
  142.  
  143.    setup_initial_values();
  144.    run();
  145. }
  146.  
  147. /**
  148.  * Set up initial values for each of the data flow sets, prior to running
  149.  * the fixed-point algorithm.
  150.  */
  151. void
  152. fs_copy_prop_dataflow::setup_initial_values()
  153. {
  154.    /* Initialize the COPY and KILL sets. */
  155.    foreach_block (block, cfg) {
  156.       foreach_inst_in_block(fs_inst, inst, block) {
  157.          if (inst->dst.file != GRF)
  158.             continue;
  159.  
  160.          /* Mark ACP entries which are killed by this instruction. */
  161.          for (int i = 0; i < num_acp; i++) {
  162.             if (inst->overwrites_reg(acp[i]->dst) ||
  163.                 inst->overwrites_reg(acp[i]->src)) {
  164.                BITSET_SET(bd[block->num].kill, i);
  165.             }
  166.          }
  167.       }
  168.    }
  169.  
  170.    /* Populate the initial values for the livein and liveout sets.  For the
  171.     * block at the start of the program, livein = 0 and liveout = copy.
  172.     * For the others, set liveout to 0 (the empty set) and livein to ~0
  173.     * (the universal set).
  174.     */
  175.    foreach_block (block, cfg) {
  176.       if (block->parents.is_empty()) {
  177.          for (int i = 0; i < bitset_words; i++) {
  178.             bd[block->num].livein[i] = 0u;
  179.             bd[block->num].liveout[i] = bd[block->num].copy[i];
  180.          }
  181.       } else {
  182.          for (int i = 0; i < bitset_words; i++) {
  183.             bd[block->num].liveout[i] = 0u;
  184.             bd[block->num].livein[i] = ~0u;
  185.          }
  186.       }
  187.    }
  188. }
  189.  
  190. /**
  191.  * Walk the set of instructions in the block, marking which entries in the acp
  192.  * are killed by the block.
  193.  */
  194. void
  195. fs_copy_prop_dataflow::run()
  196. {
  197.    bool progress;
  198.  
  199.    do {
  200.       progress = false;
  201.  
  202.       /* Update liveout for all blocks. */
  203.       foreach_block (block, cfg) {
  204.          if (block->parents.is_empty())
  205.             continue;
  206.  
  207.          for (int i = 0; i < bitset_words; i++) {
  208.             const BITSET_WORD old_liveout = bd[block->num].liveout[i];
  209.  
  210.             bd[block->num].liveout[i] =
  211.                bd[block->num].copy[i] | (bd[block->num].livein[i] &
  212.                                          ~bd[block->num].kill[i]);
  213.  
  214.             if (old_liveout != bd[block->num].liveout[i])
  215.                progress = true;
  216.          }
  217.       }
  218.  
  219.       /* Update livein for all blocks.  If a copy is live out of all parent
  220.        * blocks, it's live coming in to this block.
  221.        */
  222.       foreach_block (block, cfg) {
  223.          if (block->parents.is_empty())
  224.             continue;
  225.  
  226.          for (int i = 0; i < bitset_words; i++) {
  227.             const BITSET_WORD old_livein = bd[block->num].livein[i];
  228.  
  229.             bd[block->num].livein[i] = ~0u;
  230.             foreach_list_typed(bblock_link, parent_link, link, &block->parents) {
  231.                bblock_t *parent = parent_link->block;
  232.                bd[block->num].livein[i] &= bd[parent->num].liveout[i];
  233.             }
  234.  
  235.             if (old_livein != bd[block->num].livein[i])
  236.                progress = true;
  237.          }
  238.       }
  239.    } while (progress);
  240. }
  241.  
  242. void
  243. fs_copy_prop_dataflow::dump_block_data() const
  244. {
  245.    foreach_block (block, cfg) {
  246.       fprintf(stderr, "Block %d [%d, %d] (parents ", block->num,
  247.              block->start_ip, block->end_ip);
  248.       foreach_list_typed(bblock_link, link, link, &block->parents) {
  249.          bblock_t *parent = link->block;
  250.          fprintf(stderr, "%d ", parent->num);
  251.       }
  252.       fprintf(stderr, "):\n");
  253.       fprintf(stderr, "       livein = 0x");
  254.       for (int i = 0; i < bitset_words; i++)
  255.          fprintf(stderr, "%08x", bd[block->num].livein[i]);
  256.       fprintf(stderr, ", liveout = 0x");
  257.       for (int i = 0; i < bitset_words; i++)
  258.          fprintf(stderr, "%08x", bd[block->num].liveout[i]);
  259.       fprintf(stderr, ",\n       copy   = 0x");
  260.       for (int i = 0; i < bitset_words; i++)
  261.          fprintf(stderr, "%08x", bd[block->num].copy[i]);
  262.       fprintf(stderr, ", kill    = 0x");
  263.       for (int i = 0; i < bitset_words; i++)
  264.          fprintf(stderr, "%08x", bd[block->num].kill[i]);
  265.       fprintf(stderr, "\n");
  266.    }
  267. }
  268.  
  269. static bool
  270. is_logic_op(enum opcode opcode)
  271. {
  272.    return (opcode == BRW_OPCODE_AND ||
  273.            opcode == BRW_OPCODE_OR  ||
  274.            opcode == BRW_OPCODE_XOR ||
  275.            opcode == BRW_OPCODE_NOT);
  276. }
  277.  
  278. static bool
  279. can_change_source_types(fs_inst *inst)
  280. {
  281.    return !inst->src[0].abs && !inst->src[0].negate &&
  282.           (inst->opcode == BRW_OPCODE_MOV ||
  283.            (inst->opcode == BRW_OPCODE_SEL &&
  284.             inst->predicate != BRW_PREDICATE_NONE &&
  285.             !inst->src[1].abs && !inst->src[1].negate));
  286. }
  287.  
  288. bool
  289. fs_visitor::try_copy_propagate(fs_inst *inst, int arg, acp_entry *entry)
  290. {
  291.    if (inst->src[arg].file != GRF)
  292.       return false;
  293.  
  294.    if (entry->src.file == IMM)
  295.       return false;
  296.    assert(entry->src.file == GRF || entry->src.file == UNIFORM ||
  297.           entry->src.file == ATTR);
  298.  
  299.    if (entry->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
  300.        inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD)
  301.       return false;
  302.  
  303.    assert(entry->dst.file == GRF);
  304.    if (inst->src[arg].reg != entry->dst.reg)
  305.       return false;
  306.  
  307.    /* Bail if inst is reading a range that isn't contained in the range
  308.     * that entry is writing.
  309.     */
  310.    if (inst->src[arg].reg_offset < entry->dst.reg_offset ||
  311.        (inst->src[arg].reg_offset * 32 + inst->src[arg].subreg_offset +
  312.         inst->regs_read(arg) * inst->src[arg].stride * 32) >
  313.        (entry->dst.reg_offset + entry->regs_written) * 32)
  314.       return false;
  315.  
  316.    /* we can't generally copy-propagate UD negations because we
  317.     * can end up accessing the resulting values as signed integers
  318.     * instead. See also resolve_ud_negate() and comment in
  319.     * fs_generator::generate_code.
  320.     */
  321.    if (entry->src.type == BRW_REGISTER_TYPE_UD &&
  322.        entry->src.negate)
  323.       return false;
  324.  
  325.    bool has_source_modifiers = entry->src.abs || entry->src.negate;
  326.  
  327.    if ((has_source_modifiers || entry->src.file == UNIFORM ||
  328.         !entry->src.is_contiguous()) &&
  329.        !inst->can_do_source_mods(devinfo))
  330.       return false;
  331.  
  332.    if (has_source_modifiers &&
  333.        inst->opcode == SHADER_OPCODE_GEN4_SCRATCH_WRITE)
  334.       return false;
  335.  
  336.    /* Bail if the result of composing both strides would exceed the
  337.     * hardware limit.
  338.     */
  339.    if (entry->src.stride * inst->src[arg].stride > 4)
  340.       return false;
  341.  
  342.    /* Bail if the result of composing both strides cannot be expressed
  343.     * as another stride. This avoids, for example, trying to transform
  344.     * this:
  345.     *
  346.     *     MOV (8) rX<1>UD rY<0;1,0>UD
  347.     *     FOO (8) ...     rX<8;8,1>UW
  348.     *
  349.     * into this:
  350.     *
  351.     *     FOO (8) ...     rY<0;1,0>UW
  352.     *
  353.     * Which would have different semantics.
  354.     */
  355.    if (entry->src.stride != 1 &&
  356.        (inst->src[arg].stride *
  357.         type_sz(inst->src[arg].type)) % type_sz(entry->src.type) != 0)
  358.       return false;
  359.  
  360.    if (has_source_modifiers &&
  361.        entry->dst.type != inst->src[arg].type &&
  362.        !can_change_source_types(inst))
  363.       return false;
  364.  
  365.    if (devinfo->gen >= 8 && (entry->src.negate || entry->src.abs) &&
  366.        is_logic_op(inst->opcode)) {
  367.       return false;
  368.    }
  369.  
  370.    if (entry->saturate) {
  371.       switch(inst->opcode) {
  372.       case BRW_OPCODE_SEL:
  373.          if (inst->src[1].file != IMM ||
  374.              inst->src[1].fixed_hw_reg.dw1.f < 0.0 ||
  375.              inst->src[1].fixed_hw_reg.dw1.f > 1.0) {
  376.             return false;
  377.          }
  378.          break;
  379.       default:
  380.          return false;
  381.       }
  382.    }
  383.  
  384.    inst->src[arg].file = entry->src.file;
  385.    inst->src[arg].reg = entry->src.reg;
  386.    inst->src[arg].stride *= entry->src.stride;
  387.    inst->saturate = inst->saturate || entry->saturate;
  388.  
  389.    switch (entry->src.file) {
  390.    case UNIFORM:
  391.       assert(entry->src.width == 1);
  392.    case BAD_FILE:
  393.    case HW_REG:
  394.       inst->src[arg].width = entry->src.width;
  395.       inst->src[arg].reg_offset = entry->src.reg_offset;
  396.       inst->src[arg].subreg_offset = entry->src.subreg_offset;
  397.       break;
  398.    case ATTR:
  399.    case GRF:
  400.       {
  401.          assert(entry->src.width % inst->src[arg].width == 0);
  402.          /* In this case, we'll just leave the width alone.  The source
  403.           * register could have different widths depending on how it is
  404.           * being used.  For instance, if only half of the register was
  405.           * used then we want to preserve that and continue to only use
  406.           * half.
  407.           *
  408.           * Also, we have to deal with mapping parts of vgrfs to other
  409.           * parts of vgrfs so we have to do some reg_offset magic.
  410.           */
  411.  
  412.          /* Compute the offset of inst->src[arg] relative to inst->dst */
  413.          assert(entry->dst.subreg_offset == 0);
  414.          int rel_offset = inst->src[arg].reg_offset - entry->dst.reg_offset;
  415.          int rel_suboffset = inst->src[arg].subreg_offset;
  416.  
  417.          /* Compute the final register offset (in bytes) */
  418.          int offset = entry->src.reg_offset * 32 + entry->src.subreg_offset;
  419.          offset += rel_offset * 32 + rel_suboffset;
  420.          inst->src[arg].reg_offset = offset / 32;
  421.          inst->src[arg].subreg_offset = offset % 32;
  422.       }
  423.       break;
  424.    default:
  425.       unreachable("Invalid register file");
  426.       break;
  427.    }
  428.  
  429.    if (has_source_modifiers) {
  430.       if (entry->dst.type != inst->src[arg].type) {
  431.          /* We are propagating source modifiers from a MOV with a different
  432.           * type.  If we got here, then we can just change the source and
  433.           * destination types of the instruction and keep going.
  434.           */
  435.          assert(can_change_source_types(inst));
  436.          for (int i = 0; i < inst->sources; i++) {
  437.             inst->src[i].type = entry->dst.type;
  438.          }
  439.          inst->dst.type = entry->dst.type;
  440.       }
  441.  
  442.       if (!inst->src[arg].abs) {
  443.          inst->src[arg].abs = entry->src.abs;
  444.          inst->src[arg].negate ^= entry->src.negate;
  445.       }
  446.    }
  447.  
  448.    return true;
  449. }
  450.  
  451.  
  452. bool
  453. fs_visitor::try_constant_propagate(fs_inst *inst, acp_entry *entry)
  454. {
  455.    bool progress = false;
  456.  
  457.    if (entry->src.file != IMM)
  458.       return false;
  459.    if (entry->saturate)
  460.       return false;
  461.  
  462.    for (int i = inst->sources - 1; i >= 0; i--) {
  463.       if (inst->src[i].file != GRF)
  464.          continue;
  465.  
  466.       assert(entry->dst.file == GRF);
  467.       if (inst->src[i].reg != entry->dst.reg)
  468.          continue;
  469.  
  470.       /* Bail if inst is reading a range that isn't contained in the range
  471.        * that entry is writing.
  472.        */
  473.       if (inst->src[i].reg_offset < entry->dst.reg_offset ||
  474.           (inst->src[i].reg_offset * 32 + inst->src[i].subreg_offset +
  475.            inst->regs_read(i) * inst->src[i].stride * 32) >
  476.           (entry->dst.reg_offset + entry->regs_written) * 32)
  477.          continue;
  478.  
  479.       fs_reg val = entry->src;
  480.       val.type = inst->src[i].type;
  481.  
  482.       if (inst->src[i].abs) {
  483.          if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
  484.              !brw_abs_immediate(val.type, &val.fixed_hw_reg)) {
  485.             continue;
  486.          }
  487.       }
  488.  
  489.       if (inst->src[i].negate) {
  490.          if ((devinfo->gen >= 8 && is_logic_op(inst->opcode)) ||
  491.              !brw_negate_immediate(val.type, &val.fixed_hw_reg)) {
  492.             continue;
  493.          }
  494.       }
  495.  
  496.       switch (inst->opcode) {
  497.       case BRW_OPCODE_MOV:
  498.       case SHADER_OPCODE_LOAD_PAYLOAD:
  499.          inst->src[i] = val;
  500.          progress = true;
  501.          break;
  502.  
  503.       case SHADER_OPCODE_INT_QUOTIENT:
  504.       case SHADER_OPCODE_INT_REMAINDER:
  505.          /* FINISHME: Promote non-float constants and remove this. */
  506.          if (devinfo->gen < 8)
  507.             break;
  508.          /* fallthrough */
  509.       case SHADER_OPCODE_POW:
  510.          /* Allow constant propagation into src1 (except on Gen 6), and let
  511.           * constant combining promote the constant on Gen < 8.
  512.           *
  513.           * While Gen 6 MATH can take a scalar source, its source and
  514.           * destination offsets must be equal and we cannot ensure that.
  515.           */
  516.          if (devinfo->gen == 6)
  517.             break;
  518.          /* fallthrough */
  519.       case BRW_OPCODE_BFI1:
  520.       case BRW_OPCODE_ASR:
  521.       case BRW_OPCODE_SHL:
  522.       case BRW_OPCODE_SHR:
  523.       case BRW_OPCODE_SUBB:
  524.          if (i == 1) {
  525.             inst->src[i] = val;
  526.             progress = true;
  527.          }
  528.          break;
  529.  
  530.       case BRW_OPCODE_MACH:
  531.       case BRW_OPCODE_MUL:
  532.       case BRW_OPCODE_ADD:
  533.       case BRW_OPCODE_OR:
  534.       case BRW_OPCODE_AND:
  535.       case BRW_OPCODE_XOR:
  536.       case BRW_OPCODE_ADDC:
  537.          if (i == 1) {
  538.             inst->src[i] = val;
  539.             progress = true;
  540.          } else if (i == 0 && inst->src[1].file != IMM) {
  541.             /* Fit this constant in by commuting the operands.
  542.              * Exception: we can't do this for 32-bit integer MUL/MACH
  543.              * because it's asymmetric.
  544.              *
  545.              * The BSpec says for Broadwell that
  546.              *
  547.              *    "When multiplying DW x DW, the dst cannot be accumulator."
  548.              *
  549.              * Integer MUL with a non-accumulator destination will be lowered
  550.              * by lower_integer_multiplication(), so don't restrict it.
  551.              */
  552.             if (((inst->opcode == BRW_OPCODE_MUL &&
  553.                   inst->dst.is_accumulator()) ||
  554.                  inst->opcode == BRW_OPCODE_MACH) &&
  555.                 (inst->src[1].type == BRW_REGISTER_TYPE_D ||
  556.                  inst->src[1].type == BRW_REGISTER_TYPE_UD))
  557.                break;
  558.             inst->src[0] = inst->src[1];
  559.             inst->src[1] = val;
  560.             progress = true;
  561.          }
  562.          break;
  563.  
  564.       case BRW_OPCODE_CMP:
  565.       case BRW_OPCODE_IF:
  566.          if (i == 1) {
  567.             inst->src[i] = val;
  568.             progress = true;
  569.          } else if (i == 0 && inst->src[1].file != IMM) {
  570.             enum brw_conditional_mod new_cmod;
  571.  
  572.             new_cmod = brw_swap_cmod(inst->conditional_mod);
  573.             if (new_cmod != BRW_CONDITIONAL_NONE) {
  574.                /* Fit this constant in by swapping the operands and
  575.                 * flipping the test
  576.                 */
  577.                inst->src[0] = inst->src[1];
  578.                inst->src[1] = val;
  579.                inst->conditional_mod = new_cmod;
  580.                progress = true;
  581.             }
  582.          }
  583.          break;
  584.  
  585.       case BRW_OPCODE_SEL:
  586.          if (i == 1) {
  587.             inst->src[i] = val;
  588.             progress = true;
  589.          } else if (i == 0 && inst->src[1].file != IMM) {
  590.             inst->src[0] = inst->src[1];
  591.             inst->src[1] = val;
  592.  
  593.             /* If this was predicated, flipping operands means
  594.              * we also need to flip the predicate.
  595.              */
  596.             if (inst->conditional_mod == BRW_CONDITIONAL_NONE) {
  597.                inst->predicate_inverse =
  598.                   !inst->predicate_inverse;
  599.             }
  600.             progress = true;
  601.          }
  602.          break;
  603.  
  604.       case SHADER_OPCODE_RCP:
  605.          /* The hardware doesn't do math on immediate values
  606.           * (because why are you doing that, seriously?), but
  607.           * the correct answer is to just constant fold it
  608.           * anyway.
  609.           */
  610.          assert(i == 0);
  611.          if (inst->src[0].fixed_hw_reg.dw1.f != 0.0f) {
  612.             inst->opcode = BRW_OPCODE_MOV;
  613.             inst->src[0] = val;
  614.             inst->src[0].fixed_hw_reg.dw1.f = 1.0f / inst->src[0].fixed_hw_reg.dw1.f;
  615.             progress = true;
  616.          }
  617.          break;
  618.  
  619.       case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
  620.       case SHADER_OPCODE_BROADCAST:
  621.          inst->src[i] = val;
  622.          progress = true;
  623.          break;
  624.  
  625.       case BRW_OPCODE_MAD:
  626.       case BRW_OPCODE_LRP:
  627.          inst->src[i] = val;
  628.          progress = true;
  629.          break;
  630.  
  631.       default:
  632.          break;
  633.       }
  634.    }
  635.  
  636.    return progress;
  637. }
  638.  
  639. static bool
  640. can_propagate_from(fs_inst *inst)
  641. {
  642.    return (inst->opcode == BRW_OPCODE_MOV &&
  643.            inst->dst.file == GRF &&
  644.            ((inst->src[0].file == GRF &&
  645.              (inst->src[0].reg != inst->dst.reg ||
  646.               inst->src[0].reg_offset != inst->dst.reg_offset)) ||
  647.             inst->src[0].file == ATTR ||
  648.             inst->src[0].file == UNIFORM ||
  649.             inst->src[0].file == IMM) &&
  650.            inst->src[0].type == inst->dst.type &&
  651.            !inst->is_partial_write());
  652. }
  653.  
  654. /* Walks a basic block and does copy propagation on it using the acp
  655.  * list.
  656.  */
  657. bool
  658. fs_visitor::opt_copy_propagate_local(void *copy_prop_ctx, bblock_t *block,
  659.                                      exec_list *acp)
  660. {
  661.    bool progress = false;
  662.  
  663.    foreach_inst_in_block(fs_inst, inst, block) {
  664.       /* Try propagating into this instruction. */
  665.       for (int i = 0; i < inst->sources; i++) {
  666.          if (inst->src[i].file != GRF)
  667.             continue;
  668.  
  669.          foreach_in_list(acp_entry, entry, &acp[inst->src[i].reg % ACP_HASH_SIZE]) {
  670.             if (try_constant_propagate(inst, entry))
  671.                progress = true;
  672.  
  673.             if (try_copy_propagate(inst, i, entry))
  674.                progress = true;
  675.          }
  676.       }
  677.  
  678.       /* kill the destination from the ACP */
  679.       if (inst->dst.file == GRF) {
  680.          foreach_in_list_safe(acp_entry, entry, &acp[inst->dst.reg % ACP_HASH_SIZE]) {
  681.             if (inst->overwrites_reg(entry->dst)) {
  682.                entry->remove();
  683.             }
  684.          }
  685.  
  686.          /* Oops, we only have the chaining hash based on the destination, not
  687.           * the source, so walk across the entire table.
  688.           */
  689.          for (int i = 0; i < ACP_HASH_SIZE; i++) {
  690.             foreach_in_list_safe(acp_entry, entry, &acp[i]) {
  691.                if (inst->overwrites_reg(entry->src))
  692.                   entry->remove();
  693.             }
  694.          }
  695.       }
  696.  
  697.       /* If this instruction's source could potentially be folded into the
  698.        * operand of another instruction, add it to the ACP.
  699.        */
  700.       if (can_propagate_from(inst)) {
  701.          acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
  702.          entry->dst = inst->dst;
  703.          entry->src = inst->src[0];
  704.          entry->regs_written = inst->regs_written;
  705.          entry->opcode = inst->opcode;
  706.          entry->saturate = inst->saturate;
  707.          acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry);
  708.       } else if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD &&
  709.                  inst->dst.file == GRF) {
  710.          int offset = 0;
  711.          for (int i = 0; i < inst->sources; i++) {
  712.             int effective_width = i < inst->header_size ? 8 : inst->exec_size;
  713.             int regs_written = effective_width / 8;
  714.             if (inst->src[i].file == GRF) {
  715.                acp_entry *entry = ralloc(copy_prop_ctx, acp_entry);
  716.                entry->dst = inst->dst;
  717.                entry->dst.reg_offset = offset;
  718.                entry->dst.width = effective_width;
  719.                entry->src = inst->src[i];
  720.                entry->regs_written = regs_written;
  721.                entry->opcode = inst->opcode;
  722.                if (!entry->dst.equals(inst->src[i])) {
  723.                   acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry);
  724.                } else {
  725.                   ralloc_free(entry);
  726.                }
  727.             }
  728.             offset += regs_written;
  729.          }
  730.       }
  731.    }
  732.  
  733.    return progress;
  734. }
  735.  
  736. bool
  737. fs_visitor::opt_copy_propagate()
  738. {
  739.    bool progress = false;
  740.    void *copy_prop_ctx = ralloc_context(NULL);
  741.    exec_list *out_acp[cfg->num_blocks];
  742.  
  743.    for (int i = 0; i < cfg->num_blocks; i++)
  744.       out_acp[i] = new exec_list [ACP_HASH_SIZE];
  745.  
  746.    /* First, walk through each block doing local copy propagation and getting
  747.     * the set of copies available at the end of the block.
  748.     */
  749.    foreach_block (block, cfg) {
  750.       progress = opt_copy_propagate_local(copy_prop_ctx, block,
  751.                                           out_acp[block->num]) || progress;
  752.    }
  753.  
  754.    /* Do dataflow analysis for those available copies. */
  755.    fs_copy_prop_dataflow dataflow(copy_prop_ctx, cfg, out_acp);
  756.  
  757.    /* Next, re-run local copy propagation, this time with the set of copies
  758.     * provided by the dataflow analysis available at the start of a block.
  759.     */
  760.    foreach_block (block, cfg) {
  761.       exec_list in_acp[ACP_HASH_SIZE];
  762.  
  763.       for (int i = 0; i < dataflow.num_acp; i++) {
  764.          if (BITSET_TEST(dataflow.bd[block->num].livein, i)) {
  765.             struct acp_entry *entry = dataflow.acp[i];
  766.             in_acp[entry->dst.reg % ACP_HASH_SIZE].push_tail(entry);
  767.          }
  768.       }
  769.  
  770.       progress = opt_copy_propagate_local(copy_prop_ctx, block, in_acp) || progress;
  771.    }
  772.  
  773.    for (int i = 0; i < cfg->num_blocks; i++)
  774.       delete [] out_acp[i];
  775.    ralloc_free(copy_prop_ctx);
  776.  
  777.    if (progress)
  778.       invalidate_live_intervals();
  779.  
  780.    return progress;
  781. }
  782.