/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Jason Ekstrand (jason@jlekstrand.net)
 *
 */

#include "nir.h"

/*
 * Implements Global Code Motion.  A description of GCM can be found in
 * "Global Code Motion / Global Value Numbering" by Cliff Click.
 * Unfortunately, the algorithm presented in the paper is broken in a
 * number of ways.  The algorithm used here differs substantially from the
 * one in the paper but it is, in my opinion, much easier to read and to
 * verify for correctness.
 */
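
/*
 * As implemented here, the pass runs in four phases over each function:
 *
 *  1. Pin instructions that must not move (gcm_pin_instructions_block) and
 *     pull every unpinned instruction out of its block into a side list.
 *  2. Schedule early (gcm_schedule_early_instr): assign each unpinned
 *     instruction the highest block in the dominance tree where all of its
 *     sources are available.
 *  3. Schedule late (gcm_schedule_late_instr): push each instruction back
 *     down toward the LCA of its uses, stopping early to stay out of loops.
 *  4. Place (gcm_place_instr): re-insert the unpinned instructions into
 *     their chosen blocks in an order that respects their dependencies.
 */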

struct gcm_block_info {
   /* Number of loops this block is inside */
   unsigned loop_depth;

   /* The last instruction inserted into this block.  This is used as we
    * traverse the instructions and insert them back into the program to
    * put them in the right order.
    */
   nir_instr *last_instr;
};

/* Flags used in the instr->pass_flags field for various instruction states */
enum {
   GCM_INSTR_PINNED =            (1 << 0),
   GCM_INSTR_SCHEDULED_EARLY =   (1 << 1),
   GCM_INSTR_SCHEDULED_LATE =    (1 << 2),
   GCM_INSTR_PLACED =            (1 << 3),
};

struct gcm_state {
   nir_function_impl *impl;
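
   /* The instruction currently being scheduled.  gcm_schedule_early_src
    * uses this to know which instruction's block it is updating as the
    * sources are walked.
    */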
   nir_instr *instr;

   /* The list of non-pinned instructions.  As we pin instructions, we pull
    * non-pinned instructions out of their blocks and place them in this
    * list.  This saves us from having linked-list problems when we go to
    * put instructions back in their blocks.
    */
   struct exec_list instrs;

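   /* Per-block information, indexed by block->index */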
   struct gcm_block_info *blocks;
};

/* Recursively walks the CFG and builds the block_info structure */
static void
gcm_build_block_info(struct exec_list *cf_list, struct gcm_state *state,
                     unsigned loop_depth)
{
   foreach_list_typed(nir_cf_node, node, node, cf_list) {
      switch (node->type) {
      case nir_cf_node_block: {
         nir_block *block = nir_cf_node_as_block(node);
         state->blocks[block->index].loop_depth = loop_depth;
         break;
      }
      case nir_cf_node_if: {
         nir_if *if_stmt = nir_cf_node_as_if(node);
         gcm_build_block_info(&if_stmt->then_list, state, loop_depth);
         gcm_build_block_info(&if_stmt->else_list, state, loop_depth);
         break;
      }
      case nir_cf_node_loop: {
         nir_loop *loop = nir_cf_node_as_loop(node);
         gcm_build_block_info(&loop->body, state, loop_depth + 1);
         break;
      }
      default:
         unreachable("Invalid CF node type");
      }
   }
}

/* Walks the instruction list and marks immovable instructions as pinned
 *
 * This function also serves to initialize the instr->pass_flags field.
 * After this is completed, all instructions' pass_flags fields will be set
 * to either GCM_INSTR_PINNED or 0.
 */
static bool
gcm_pin_instructions_block(nir_block *block, void *void_state)
{
   struct gcm_state *state = void_state;

   nir_foreach_instr_safe(block, instr) {
      switch (instr->type) {
      case nir_instr_type_alu:
         switch (nir_instr_as_alu(instr)->op) {
         case nir_op_fddx:
         case nir_op_fddy:
         case nir_op_fddx_fine:
         case nir_op_fddy_fine:
         case nir_op_fddx_coarse:
         case nir_op_fddy_coarse:
            /* These can only go in uniform control flow; pin them for now */
            instr->pass_flags = GCM_INSTR_PINNED;
            break;

         default:
            instr->pass_flags = 0;
            break;
         }
         break;

      case nir_instr_type_tex:
         switch (nir_instr_as_tex(instr)->op) {
         case nir_texop_tex:
         case nir_texop_txb:
         case nir_texop_lod:
            /* These take implicit derivatives so they need to be pinned */
            instr->pass_flags = GCM_INSTR_PINNED;
            break;

         default:
            instr->pass_flags = 0;
            break;
         }
         break;

      case nir_instr_type_load_const:
         instr->pass_flags = 0;
         break;

      case nir_instr_type_intrinsic: {
         const nir_intrinsic_info *info =
            &nir_intrinsic_infos[nir_instr_as_intrinsic(instr)->intrinsic];

         if ((info->flags & NIR_INTRINSIC_CAN_ELIMINATE) &&
             (info->flags & NIR_INTRINSIC_CAN_REORDER)) {
            instr->pass_flags = 0;
         } else {
            instr->pass_flags = GCM_INSTR_PINNED;
         }
         break;
      }

      case nir_instr_type_jump:
      case nir_instr_type_ssa_undef:
      case nir_instr_type_phi:
         instr->pass_flags = GCM_INSTR_PINNED;
         break;

      default:
         unreachable("Invalid instruction type in GCM");
      }

      if (!(instr->pass_flags & GCM_INSTR_PINNED)) {
         /* If this is an unpinned instruction, go ahead and pull it out of
          * the program and put it on the instrs list.  This has a couple
          * of benefits.  First, it makes the scheduling algorithm more
          * efficient because we can avoid walking over basic blocks and
          * pinned instructions.  Second, it keeps us from causing linked
          * list confusion when we're trying to put everything in its
          * proper place at the end of the pass.
          *
          * Note that we don't use nir_instr_remove here because that also
          * cleans up uses and defs and we want to keep that information.
          */
         exec_node_remove(&instr->node);
         exec_list_push_tail(&state->instrs, &instr->node);
      }
   }

   return true;
}

static void
gcm_schedule_early_instr(nir_instr *instr, struct gcm_state *state);

/** Updates an instruction's schedule for the given source
 *
 * This function is called iteratively as we walk the sources of an
 * instruction.  It ensures that the given source instruction has been
 * scheduled and then updates this instruction's block if the source
 * instruction is lower down the tree.
 */
static bool
gcm_schedule_early_src(nir_src *src, void *void_state)
{
   struct gcm_state *state = void_state;
   nir_instr *instr = state->instr;

   assert(src->is_ssa);

   gcm_schedule_early_instr(src->ssa->parent_instr, void_state);

   /* While the index isn't a proper dominance depth, it does have the
    * property that if A dominates B then A->index <= B->index.  Since we
    * know that this instruction must have been dominated by all of its
    * sources at some point (even if it's gone through value-numbering),
    * all of the sources must lie on the same branch of the dominance tree.
    * Therefore, we can just go ahead and compare indices.
    */
   if (instr->block->index < src->ssa->parent_instr->block->index)
      instr->block = src->ssa->parent_instr->block;

   /* We need to restore the state instruction because it may have been
    * changed through the gcm_schedule_early_instr call above.  We may
    * still be iterating through sources, and future calls to
    * gcm_schedule_early_src for the same instruction will still need it.
    */
   state->instr = instr;

   return true;
}

/** Schedules an instruction early
 *
 * This function performs a recursive depth-first search starting at the
 * given instruction and proceeding through the sources to schedule
 * instructions as early as they can possibly go in the dominance tree.
 * The instructions are "scheduled" by updating their instr->block field.
 */
static void
gcm_schedule_early_instr(nir_instr *instr, struct gcm_state *state)
{
   if (instr->pass_flags & GCM_INSTR_SCHEDULED_EARLY)
      return;

   instr->pass_flags |= GCM_INSTR_SCHEDULED_EARLY;

   /* Pinned instructions are already scheduled so we don't need to do
    * anything.  Also, bailing here keeps us from ever following the
    * sources of phi nodes which can be back-edges.
    */
   if (instr->pass_flags & GCM_INSTR_PINNED)
      return;

   /* Start with the instruction at the top.  As we iterate over the
    * sources, it will get moved down as needed.
    */
   instr->block = state->impl->start_block;
   state->instr = instr;

   nir_foreach_src(instr, gcm_schedule_early_src, state);
}

static void
gcm_schedule_late_instr(nir_instr *instr, struct gcm_state *state);

/** Schedules the instruction associated with the given SSA def late
 *
 * This function works by first walking all of the uses of the given SSA
 * definition, ensuring that they are scheduled, and then computing the LCA
 * (least common ancestor) of its uses.  It then schedules this instruction
 * as close to the LCA as possible while trying to stay out of loops.
 */
static bool
gcm_schedule_late_def(nir_ssa_def *def, void *void_state)
{
   struct gcm_state *state = void_state;

   nir_block *lca = NULL;

   nir_foreach_use(def, use_src) {
      nir_instr *use_instr = use_src->parent_instr;

      gcm_schedule_late_instr(use_instr, state);

      /* Phi instructions are a bit special.  SSA definitions don't have to
       * dominate the sources of the phi nodes that use them; instead, they
       * have to dominate the predecessor block corresponding to the phi
       * source.  We handle this by looking through the sources, finding
       * any that use this SSA def, and using those blocks instead of the
       * one the phi lives in.
       */
      if (use_instr->type == nir_instr_type_phi) {
         nir_phi_instr *phi = nir_instr_as_phi(use_instr);

         nir_foreach_phi_src(phi, phi_src) {
            if (phi_src->src.ssa == def)
               lca = nir_dominance_lca(lca, phi_src->pred);
         }
      } else {
         lca = nir_dominance_lca(lca, use_instr->block);
      }
   }

   nir_foreach_if_use(def, use_src) {
      nir_if *if_stmt = use_src->parent_if;

      /* For if statements, we consider the block to be the one immediately
       * preceding the if CF node.
       */
      nir_block *pred_block =
         nir_cf_node_as_block(nir_cf_node_prev(&if_stmt->cf_node));

      lca = nir_dominance_lca(lca, pred_block);
   }

   /* Some instructions may never be used.  We'll just leave them scheduled
    * early and let dead code clean them up.
    */
   if (lca == NULL)
      return true;

   /* We now have the LCA of all of the uses.  If our invariants hold,
    * this is dominated by the block that we chose when scheduling early.
    * We now walk up the dominance tree and pick the lowest block that is
    * as far outside loops as we can get.
    */
   nir_block *best = lca;
   while (lca != def->parent_instr->block) {
      assert(lca);
      if (state->blocks[lca->index].loop_depth <
          state->blocks[best->index].loop_depth)
         best = lca;
      lca = lca->imm_dom;
   }
   def->parent_instr->block = best;

   return true;
}

/** Schedules an instruction late
 *
 * This function performs a depth-first search starting at the given
 * instruction and proceeding through its uses to schedule instructions as
 * late as they can reasonably go in the dominance tree.  The instructions
 * are "scheduled" by updating their instr->block field.
 *
 * The name of this function is actually a bit of a misnomer as it doesn't
 * schedule them "as late as possible" as the paper implies.  Instead, it
 * first finds the latest possible place it can schedule the instruction and
 * then possibly schedules it earlier than that.  The actual location is as
 * far down the tree as we can go while trying to stay out of loops.
 */
static void
gcm_schedule_late_instr(nir_instr *instr, struct gcm_state *state)
{
   if (instr->pass_flags & GCM_INSTR_SCHEDULED_LATE)
      return;

   instr->pass_flags |= GCM_INSTR_SCHEDULED_LATE;

   /* Pinned instructions are already scheduled so we don't need to do
    * anything.  Also, bailing here keeps us from ever following phi nodes
    * which can be back-edges.
    */
   if (instr->pass_flags & GCM_INSTR_PINNED)
      return;

   nir_foreach_ssa_def(instr, gcm_schedule_late_def, state);
}

static void
gcm_place_instr(nir_instr *instr, struct gcm_state *state);

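/* Ensures that every instruction that uses the given def has been placed.
 * gcm_place_instr calls this on each of an instruction's defs before
 * inserting the instruction itself so that, within a block, the definition
 * always ends up ahead of its already-placed users.
 */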
static bool
gcm_place_instr_def(nir_ssa_def *def, void *state)
{
   nir_foreach_use(def, use_src)
      gcm_place_instr(use_src->parent_instr, state);

   return false;
}

/** Places an instruction back into the program
 *
 * The earlier passes of GCM simply choose blocks for each instruction and
 * otherwise leave them alone.  This pass actually places the instructions
 * into their chosen blocks.
 *
 * To do so, we use a standard post-order depth-first search linearization
 * algorithm.  We walk over the uses of the given instruction and ensure
 * that they are placed and then place this instruction.  Because we are
 * working on multiple blocks at a time, we keep track of the last inserted
 * instruction per-block in the state structure's block_info array.  When
 * we insert an instruction in a block we insert it before the last
 * instruction inserted in that block rather than the last instruction
 * inserted globally.
 */
static void
gcm_place_instr(nir_instr *instr, struct gcm_state *state)
{
   if (instr->pass_flags & GCM_INSTR_PLACED)
      return;

   instr->pass_flags |= GCM_INSTR_PLACED;

   /* Phi nodes are our one and only source of back-edges.  Since right now
    * we are only doing scheduling within blocks, we don't need to worry
    * about them since they are always at the top.  Just skip them completely.
    */
   if (instr->type == nir_instr_type_phi) {
      assert(instr->pass_flags & GCM_INSTR_PINNED);
      return;
   }

   nir_foreach_ssa_def(instr, gcm_place_instr_def, state);

   if (instr->pass_flags & GCM_INSTR_PINNED) {
      /* Pinned instructions have an implicit dependence on the pinned
       * instructions that come after them in the block.  Since the pinned
       * instructions will naturally "chain" together, we only need to
       * explicitly visit one of them.
       */
      for (nir_instr *after = nir_instr_next(instr);
           after;
           after = nir_instr_next(after)) {
         if (after->pass_flags & GCM_INSTR_PINNED) {
            gcm_place_instr(after, state);
            break;
         }
      }
   }

   struct gcm_block_info *block_info = &state->blocks[instr->block->index];
   if (!(instr->pass_flags & GCM_INSTR_PINNED)) {
      exec_node_remove(&instr->node);

      if (block_info->last_instr) {
         exec_node_insert_node_before(&block_info->last_instr->node,
                                      &instr->node);
      } else {
         /* Schedule it at the end of the block */
         nir_instr *jump_instr = nir_block_last_instr(instr->block);
         if (jump_instr && jump_instr->type == nir_instr_type_jump) {
            exec_node_insert_node_before(&jump_instr->node, &instr->node);
         } else {
            exec_list_push_tail(&instr->block->instr_list, &instr->node);
         }
      }
   }

   block_info->last_instr = instr;
}

static void
opt_gcm_impl(nir_function_impl *impl)
{
   struct gcm_state state;

   state.impl = impl;
   state.instr = NULL;
   exec_list_make_empty(&state.instrs);
   state.blocks = rzalloc_array(NULL, struct gcm_block_info, impl->num_blocks);

   nir_metadata_require(impl, nir_metadata_block_index |
                              nir_metadata_dominance);

   gcm_build_block_info(&impl->body, &state, 0);
   nir_foreach_block(impl, gcm_pin_instructions_block, &state);

   foreach_list_typed(nir_instr, instr, node, &state.instrs)
      gcm_schedule_early_instr(instr, &state);

   foreach_list_typed(nir_instr, instr, node, &state.instrs)
      gcm_schedule_late_instr(instr, &state);

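   /* Place the scheduled instructions back into the program.  We pull
    * instructions off the tail of the list; gcm_place_instr recursively
    * places all of an instruction's users first and then inserts the
    * instruction ahead of the last one placed in its block, so the order
    * within each block respects dependencies.
    */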
   while (!exec_list_is_empty(&state.instrs)) {
      nir_instr *instr = exec_node_data(nir_instr,
                                        state.instrs.tail_pred, node);
      gcm_place_instr(instr, &state);
   }

   ralloc_free(state.blocks);
}

void
nir_opt_gcm(nir_shader *shader)
{
   nir_foreach_overload(shader, overload) {
      if (overload->impl)
         opt_gcm_impl(overload->impl);
   }
}
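
/* A minimal usage sketch (hypothetical driver code, not part of this file):
 * GCM leaves never-used instructions scheduled early, so it is typically
 * followed by dead-code elimination in a driver's NIR optimization loop:
 *
 *    nir_opt_gcm(shader);
 *    nir_opt_dce(shader);
 */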