/*
 * Copyright © 2014 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Jason Ekstrand (jason@jlekstrand.net)
 *
 */

#include "nir.h"
#include "nir_vla.h"

/*
 * This file implements an out-of-SSA pass as described in "Revisiting
 * Out-of-SSA Translation for Correctness, Code Quality, and Efficiency" by
 * Boissinot et al.
 */

struct from_ssa_state {
   void *mem_ctx;
   void *dead_ctx;
   struct hash_table *merge_node_table;
   nir_instr *instr;
   nir_function_impl *impl;
};

/* Returns true if a dominates b */
static bool
ssa_def_dominates(nir_ssa_def *a, nir_ssa_def *b)
{
   if (a->live_index == 0) {
      /* SSA undefs always dominate */
      return true;
   } else if (b->live_index < a->live_index) {
      return false;
   } else if (a->parent_instr->block == b->parent_instr->block) {
      return a->live_index <= b->live_index;
   } else {
      return nir_block_dominates(a->parent_instr->block,
                                 b->parent_instr->block);
   }
}


/* The following data structure, which I have named merge_set, is a way of
 * representing a set of non-interfering registers.  It is based on the
 * concept of a "dominance forest" presented in "Fast Copy Coalescing and
 * Live-Range Identification" by Budimlic et al., but the implementation
 * concept is taken from "Revisiting Out-of-SSA Translation for
 * Correctness, Code Quality, and Efficiency" by Boissinot et al.
 *
 * Each SSA definition is associated with a merge_node, and the association
 * is represented by a combination of a hash table and the "def" field in
 * the merge_node structure.  The merge_set stores a linked list of
 * merge_nodes in dominance order of the SSA definitions.  (Since the
 * liveness analysis pass indexes the SSA values in dominance order for us,
 * this is an easy invariant to maintain.)  It is assumed that no pair of
 * nodes in a given set interferes.  Merging two sets or checking for
 * interference can be done in a single linear-time merge-sort walk of the
 * two lists of nodes.
 */
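
/* As a purely illustrative example (the SSA names below are made up, not
 * taken from any real shader): suppose a merge set contains ssa_1, ssa_4,
 * and ssa_7, where ssa_1 dominates both of the others and ssa_4 dominates
 * ssa_7.  Stored in dominance (live_index) order, the list is
 *
 *    ssa_1 -> ssa_4 -> ssa_7
 *
 * and the dominance-forest walk in merge_sets_interfere() below only ever
 * compares each node against the innermost dominating node on a small
 * stack, rather than against every other member of the set.
 */
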
struct merge_set;

typedef struct {
   struct exec_node node;
   struct merge_set *set;
   nir_ssa_def *def;
} merge_node;

typedef struct merge_set {
   struct exec_list nodes;
   unsigned size;
   nir_register *reg;
} merge_set;

#if 0
static void
merge_set_dump(merge_set *set, FILE *fp)
{
   nir_ssa_def *dom[set->size];
   int dom_idx = -1;

   foreach_list_typed(merge_node, node, node, &set->nodes) {
      while (dom_idx >= 0 && !ssa_def_dominates(dom[dom_idx], node->def))
         dom_idx--;

      for (int i = 0; i <= dom_idx; i++)
         fprintf(fp, "  ");

      if (node->def->name)
         fprintf(fp, "ssa_%d /* %s */\n", node->def->index, node->def->name);
      else
         fprintf(fp, "ssa_%d\n", node->def->index);

      dom[++dom_idx] = node->def;
   }
}
#endif

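/* Returns the merge node for the given SSA def, creating a fresh
 * singleton merge set for it if the def has not been seen before.
 */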
static merge_node *
get_merge_node(nir_ssa_def *def, struct from_ssa_state *state)
{
   struct hash_entry *entry =
      _mesa_hash_table_search(state->merge_node_table, def);
   if (entry)
      return entry->data;

   merge_set *set = ralloc(state->dead_ctx, merge_set);
   exec_list_make_empty(&set->nodes);
   set->size = 1;
   set->reg = NULL;

   merge_node *node = ralloc(state->dead_ctx, merge_node);
   node->set = set;
   node->def = def;
   exec_list_push_head(&set->nodes, &node->node);

   _mesa_hash_table_insert(state->merge_node_table, def, node);

   return node;
}

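/* Two merge nodes interfere if the live ranges of their SSA defs overlap */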
static bool
merge_nodes_interfere(merge_node *a, merge_node *b)
{
   return nir_ssa_defs_interfere(a->def, b->def);
}

/* Merges b into a, emptying b.  The two node lists are each kept sorted by
 * live_index (i.e. dominance order), so this is a single merge-sort style
 * splice of the two lists.
 */
static merge_set *
merge_merge_sets(merge_set *a, merge_set *b)
{
   struct exec_node *an = exec_list_get_head(&a->nodes);
   struct exec_node *bn = exec_list_get_head(&b->nodes);
   while (!exec_node_is_tail_sentinel(bn)) {
      merge_node *a_node = exec_node_data(merge_node, an, node);
      merge_node *b_node = exec_node_data(merge_node, bn, node);

      if (exec_node_is_tail_sentinel(an) ||
          a_node->def->live_index > b_node->def->live_index) {
         struct exec_node *next = bn->next;
         exec_node_remove(bn);
         exec_node_insert_node_before(an, bn);
         exec_node_data(merge_node, bn, node)->set = a;
         bn = next;
      } else {
         an = an->next;
      }
   }

   a->size += b->size;
   b->size = 0;

   return a;
}

/* Checks for any interference between two merge sets
 *
 * This is an implementation of Algorithm 2 in "Revisiting Out-of-SSA
 * Translation for Correctness, Code Quality, and Efficiency" by
 * Boissinot et al.
 */
static bool
merge_sets_interfere(merge_set *a, merge_set *b)
{
   NIR_VLA(merge_node *, dom, a->size + b->size);
   int dom_idx = -1;

   struct exec_node *an = exec_list_get_head(&a->nodes);
   struct exec_node *bn = exec_list_get_head(&b->nodes);
   while (!exec_node_is_tail_sentinel(an) ||
          !exec_node_is_tail_sentinel(bn)) {

      merge_node *current;
      if (exec_node_is_tail_sentinel(an)) {
         current = exec_node_data(merge_node, bn, node);
         bn = bn->next;
      } else if (exec_node_is_tail_sentinel(bn)) {
         current = exec_node_data(merge_node, an, node);
         an = an->next;
      } else {
         merge_node *a_node = exec_node_data(merge_node, an, node);
         merge_node *b_node = exec_node_data(merge_node, bn, node);

         if (a_node->def->live_index <= b_node->def->live_index) {
            current = a_node;
            an = an->next;
         } else {
            current = b_node;
            bn = bn->next;
         }
      }

      while (dom_idx >= 0 &&
             !ssa_def_dominates(dom[dom_idx]->def, current->def))
         dom_idx--;

      if (dom_idx >= 0 && merge_nodes_interfere(current, dom[dom_idx]))
         return true;

      dom[++dom_idx] = current;
   }

   return false;
}

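/* Block-walk callback that adds an (initially empty) parallel copy at the
 * end of any block with a successor that starts with a phi node.  The copy
 * goes before the block's final jump instruction, if it has one.
 */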
static bool
add_parallel_copy_to_end_of_block(nir_block *block, void *void_state)
{
   struct from_ssa_state *state = void_state;

   bool need_end_copy = false;
   if (block->successors[0]) {
      nir_instr *instr = nir_block_first_instr(block->successors[0]);
      if (instr && instr->type == nir_instr_type_phi)
         need_end_copy = true;
   }

   if (block->successors[1]) {
      nir_instr *instr = nir_block_first_instr(block->successors[1]);
      if (instr && instr->type == nir_instr_type_phi)
         need_end_copy = true;
   }

   if (need_end_copy) {
      /* If one of our successors has at least one phi node, we need to
       * create a parallel copy at the end of the block but before the jump
       * (if there is one).
       */
      nir_parallel_copy_instr *pcopy =
         nir_parallel_copy_instr_create(state->dead_ctx);

      nir_instr *last_instr = nir_block_last_instr(block);
      if (last_instr && last_instr->type == nir_instr_type_jump) {
         nir_instr_insert_before(last_instr, &pcopy->instr);
      } else {
         nir_instr_insert_after_block(block, &pcopy->instr);
      }
   }

   return true;
}

static nir_parallel_copy_instr *
get_parallel_copy_at_end_of_block(nir_block *block)
{
   nir_instr *last_instr = nir_block_last_instr(block);
   if (last_instr == NULL)
      return NULL;

   /* The last instruction may be a jump in which case the parallel copy is
    * right before it.
    */
   if (last_instr->type == nir_instr_type_jump)
      last_instr = nir_instr_prev(last_instr);

   if (last_instr && last_instr->type == nir_instr_type_parallel_copy)
      return nir_instr_as_parallel_copy(last_instr);
   else
      return NULL;
}

/** Isolate phi nodes with parallel copies
 *
 * In order to solve the dependency problems with the sources and
 * destinations of phi nodes, we first isolate them by adding parallel
 * copies to the beginnings and ends of basic blocks.  For every block with
 * phi nodes, we add a parallel copy immediately following the last phi
 * node that copies the destinations of all of the phi nodes to new SSA
 * values.  We also add a parallel copy to the end of every block that has
 * a successor with phi nodes; for each phi node in each successor, it
 * copies the corresponding source of the phi node, and we adjust the phi
 * to use the destination of the parallel copy.
 *
 * In SSA form, each value has exactly one definition.  What this does is
 * ensure that each value used in a phi also has exactly one use.  The
 * destinations of phis are only used by the parallel copy immediately
 * following the phi nodes and, thanks to the parallel copy at the end of
 * the predecessor block, the sources of phi nodes are the only use of
 * that value.  This allows us to immediately assign all the sources and
 * destinations of any given phi node to the same register without worrying
 * about interference at all.  We do coalescing to get rid of the parallel
 * copies where possible.
 *
 * Before this pass can be run, we have to iterate over the blocks with
 * add_parallel_copy_to_end_of_block to ensure that the parallel copies at
 * the ends of blocks exist.  We can create the ones at the beginnings as
 * we go, but the ones at the ends of blocks need to be created ahead of
 * time because of potential back-edges in the CFG.
 */
static bool
isolate_phi_nodes_block(nir_block *block, void *void_state)
{
   struct from_ssa_state *state = void_state;

   nir_instr *last_phi_instr = NULL;
   nir_foreach_instr(block, instr) {
      /* Phi nodes only ever come at the start of a block */
      if (instr->type != nir_instr_type_phi)
         break;

      last_phi_instr = instr;
   }

   /* If we don't have any phis, then there's nothing for us to do. */
   if (last_phi_instr == NULL)
      return true;

   /* If we have phi nodes, we need to create a parallel copy at the
    * start of this block but after the phi nodes.
    */
   nir_parallel_copy_instr *block_pcopy =
      nir_parallel_copy_instr_create(state->dead_ctx);
   nir_instr_insert_after(last_phi_instr, &block_pcopy->instr);

   nir_foreach_instr(block, instr) {
      /* Phi nodes only ever come at the start of a block */
      if (instr->type != nir_instr_type_phi)
         break;

      nir_phi_instr *phi = nir_instr_as_phi(instr);
      assert(phi->dest.is_ssa);
      nir_foreach_phi_src(phi, src) {
         nir_parallel_copy_instr *pcopy =
            get_parallel_copy_at_end_of_block(src->pred);
         assert(pcopy);

         nir_parallel_copy_entry *entry = rzalloc(state->dead_ctx,
                                                  nir_parallel_copy_entry);
         nir_ssa_dest_init(&pcopy->instr, &entry->dest,
                           phi->dest.ssa.num_components, src->src.ssa->name);
         exec_list_push_tail(&pcopy->entries, &entry->node);

         assert(src->src.is_ssa);
         nir_instr_rewrite_src(&pcopy->instr, &entry->src, src->src);

         nir_instr_rewrite_src(&phi->instr, &src->src,
                               nir_src_for_ssa(&entry->dest.ssa));
      }

      nir_parallel_copy_entry *entry = rzalloc(state->dead_ctx,
                                               nir_parallel_copy_entry);
      nir_ssa_dest_init(&block_pcopy->instr, &entry->dest,
                        phi->dest.ssa.num_components, phi->dest.ssa.name);
      exec_list_push_tail(&block_pcopy->entries, &entry->node);

      nir_ssa_def_rewrite_uses(&phi->dest.ssa,
                               nir_src_for_ssa(&entry->dest.ssa),
                               state->mem_ctx);

      nir_instr_rewrite_src(&block_pcopy->instr, &entry->src,
                            nir_src_for_ssa(&phi->dest.ssa));
   }

   return true;
}

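/* Puts the destination and all of the sources of each phi node into a
 * single merge set (the phi web).  No interference check is needed here;
 * the parallel copies added by isolate_phi_nodes_block guarantee that the
 * sources and destination of a phi do not interfere.
 */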
static bool
coalesce_phi_nodes_block(nir_block *block, void *void_state)
{
   struct from_ssa_state *state = void_state;

   nir_foreach_instr(block, instr) {
      /* Phi nodes only ever come at the start of a block */
      if (instr->type != nir_instr_type_phi)
         break;

      nir_phi_instr *phi = nir_instr_as_phi(instr);

      assert(phi->dest.is_ssa);
      merge_node *dest_node = get_merge_node(&phi->dest.ssa, state);

      nir_foreach_phi_src(phi, src) {
         assert(src->src.is_ssa);
         merge_node *src_node = get_merge_node(src->src.ssa, state);
         if (src_node->set != dest_node->set)
            merge_merge_sets(dest_node->set, src_node->set);
      }
   }

   return true;
}

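/* Tries to coalesce the source and destination of each entry in a parallel
 * copy into a single merge set, as long as the two sets do not interfere.
 */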
static void
aggressive_coalesce_parallel_copy(nir_parallel_copy_instr *pcopy,
                                  struct from_ssa_state *state)
{
   nir_foreach_parallel_copy_entry(pcopy, entry) {
      if (!entry->src.is_ssa)
         continue;

      /* Since load_const instructions are SSA only, we can't replace their
       * destinations with registers and, therefore, can't coalesce them.
       */
      if (entry->src.ssa->parent_instr->type == nir_instr_type_load_const)
         continue;

      /* Don't try and coalesce these */
      if (entry->dest.ssa.num_components != entry->src.ssa->num_components)
         continue;

      merge_node *src_node = get_merge_node(entry->src.ssa, state);
      merge_node *dest_node = get_merge_node(&entry->dest.ssa, state);

      if (src_node->set == dest_node->set)
         continue;

      if (!merge_sets_interfere(src_node->set, dest_node->set))
         merge_merge_sets(src_node->set, dest_node->set);
   }
}

static bool
aggressive_coalesce_block(nir_block *block, void *void_state)
{
   struct from_ssa_state *state = void_state;

   nir_parallel_copy_instr *start_pcopy = NULL;
   nir_foreach_instr(block, instr) {
      /* Phi nodes only ever come at the start of a block */
      if (instr->type != nir_instr_type_phi) {
         if (instr->type != nir_instr_type_parallel_copy)
            break; /* The parallel copy must be right after the phis */

         start_pcopy = nir_instr_as_parallel_copy(instr);

         aggressive_coalesce_parallel_copy(start_pcopy, state);

         break;
      }
   }

   nir_parallel_copy_instr *end_pcopy =
      get_parallel_copy_at_end_of_block(block);

   if (end_pcopy && end_pcopy != start_pcopy)
      aggressive_coalesce_parallel_copy(end_pcopy, state);

   return true;
}

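/* Rewrites a single SSA def to a register: either the register shared by
 * its phi web (if it has a merge node) or a freshly created one.  Called
 * for every def in the impl via nir_foreach_ssa_def.
 */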
static bool
rewrite_ssa_def(nir_ssa_def *def, void *void_state)
{
   struct from_ssa_state *state = void_state;
   nir_register *reg;

   struct hash_entry *entry =
      _mesa_hash_table_search(state->merge_node_table, def);
   if (entry) {
      /* In this case, we're part of a phi web.  Use the web's register. */
      merge_node *node = (merge_node *)entry->data;

      /* If it doesn't have a register yet, create one.  Note that all of
       * the things in the merge set should be the same so it doesn't
       * matter which node's definition we use.
       */
      if (node->set->reg == NULL) {
         node->set->reg = nir_local_reg_create(state->impl);
         node->set->reg->name = def->name;
         node->set->reg->num_components = def->num_components;
         node->set->reg->num_array_elems = 0;
      }

      reg = node->set->reg;
   } else {
      /* We leave load_const SSA values alone.  They act as immediates to
       * the backend.  If it got coalesced into a phi, that's ok.
       */
      if (def->parent_instr->type == nir_instr_type_load_const)
         return true;

      reg = nir_local_reg_create(state->impl);
      reg->name = def->name;
      reg->num_components = def->num_components;
      reg->num_array_elems = 0;

      /* This register comes from an SSA definition that is defined and not
       * part of a phi-web.  Therefore, we know it has a single unique
       * definition that dominates all of its uses; we can copy the
       * parent_instr from the SSA def safely.
       */
      if (def->parent_instr->type != nir_instr_type_ssa_undef)
         reg->parent_instr = def->parent_instr;
   }

   nir_ssa_def_rewrite_uses(def, nir_src_for_reg(reg), state->mem_ctx);
   assert(list_empty(&def->uses) && list_empty(&def->if_uses));

   if (def->parent_instr->type == nir_instr_type_ssa_undef)
      return true;

   assert(def->parent_instr->type != nir_instr_type_load_const);

   /* At this point we know a priori that this SSA def is part of a
    * nir_dest.  We can use exec_node_data to get the dest pointer.
    */
   nir_dest *dest = exec_node_data(nir_dest, def, ssa);

   *dest = nir_dest_for_reg(reg);
   dest->reg.parent_instr = state->instr;
   list_addtail(&dest->reg.def_link, &reg->defs);

   return true;
}

/* Resolves ssa definitions to registers.  While we're at it, we also
 * remove phi nodes and ssa_undef instructions
 */
static bool
resolve_registers_block(nir_block *block, void *void_state)
{
   struct from_ssa_state *state = void_state;

   nir_foreach_instr_safe(block, instr) {
      state->instr = instr;
      nir_foreach_ssa_def(instr, rewrite_ssa_def, state);

      if (instr->type == nir_instr_type_ssa_undef ||
          instr->type == nir_instr_type_phi) {
         nir_instr_remove(instr);
         ralloc_steal(state->dead_ctx, instr);
      }
   }
   state->instr = NULL;

   return true;
}

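/* Emits a mov from src into the register referenced by dest_src, placed
 * immediately before the parallel copy instruction itself.
 */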
static void
emit_copy(nir_parallel_copy_instr *pcopy, nir_src src, nir_src dest_src,
          void *mem_ctx)
{
   assert(!dest_src.is_ssa &&
          dest_src.reg.indirect == NULL &&
          dest_src.reg.base_offset == 0);

   if (src.is_ssa)
      assert(src.ssa->num_components >= dest_src.reg.reg->num_components);
   else
      assert(src.reg.reg->num_components >= dest_src.reg.reg->num_components);

   nir_alu_instr *mov = nir_alu_instr_create(mem_ctx, nir_op_imov);
   nir_src_copy(&mov->src[0].src, &src, mem_ctx);
   mov->dest.dest = nir_dest_for_reg(dest_src.reg.reg);
   mov->dest.write_mask = (1 << dest_src.reg.reg->num_components) - 1;

   nir_instr_insert_before(&pcopy->instr, &mov->instr);
}

/* Resolves a single parallel copy operation into a sequence of movs
 *
 * This is based on Algorithm 1 from "Revisiting Out-of-SSA Translation for
 * Correctness, Code Quality, and Efficiency" by Boissinot et al.
 * However, I never got the algorithm to work as written, so this version
 * is slightly modified.
 *
 * The algorithm works by playing this little shell game with the values.
 * We start by recording where every source value is and which source value
 * each destination value should receive.  We then grab any copy whose
 * destination is "empty", i.e. not used as a source, and do the following:
 *  - Find where its source value currently lives
 *  - Emit the move instruction
 *  - Set the location of the source value to the destination
 *  - Mark the location containing the source value
 *  - Mark the destination as no longer needing to be copied
 *
 * When we run out of "empty" destinations, we have a cycle and so we
 * create a temporary register, copy to that register, and mark the value
 * we copied as living in that temporary.  Now, the cycle is broken, so we
 * can continue with the above steps.
 */
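/* As a purely illustrative example (the registers are hypothetical, not
 * produced by any particular shader): the parallel copy { r0 = r1, r1 = r0 }
 * is a swap, so neither destination starts out "empty".  The loop below
 * breaks the cycle with a temporary register and ends up emitting a
 * sequence along the lines of
 *
 *    copy_temp = r1;
 *    r1 = r0;
 *    r0 = copy_temp;
 */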
static void
resolve_parallel_copy(nir_parallel_copy_instr *pcopy,
                      struct from_ssa_state *state)
{
   unsigned num_copies = 0;
   nir_foreach_parallel_copy_entry(pcopy, entry) {
      /* Sources may be SSA */
      if (!entry->src.is_ssa && entry->src.reg.reg == entry->dest.reg.reg)
         continue;

      num_copies++;
   }

   if (num_copies == 0) {
      /* Hooray, we don't need any copies! */
      nir_instr_remove(&pcopy->instr);
      return;
   }

   /* The register/source corresponding to the given index */
   NIR_VLA_ZERO(nir_src, values, num_copies * 2);

   /* The current location of a given piece of data.  We will use -1 for "null" */
   NIR_VLA_FILL(int, loc, num_copies * 2, -1);

   /* The piece of data that the given piece of data is to be copied from.  We will use -1 for "null" */
   NIR_VLA_FILL(int, pred, num_copies * 2, -1);

   /* The destinations we have yet to properly fill */
   NIR_VLA(int, to_do, num_copies * 2);
   int to_do_idx = -1;

   /* Now we set everything up:
    *  - All values get assigned a temporary index
    *  - Current locations are set from sources
    *  - Predecessors are recorded from sources and destinations
    */
   int num_vals = 0;
   nir_foreach_parallel_copy_entry(pcopy, entry) {
      /* Sources may be SSA */
      if (!entry->src.is_ssa && entry->src.reg.reg == entry->dest.reg.reg)
         continue;

      int src_idx = -1;
      for (int i = 0; i < num_vals; ++i) {
         if (nir_srcs_equal(values[i], entry->src))
            src_idx = i;
      }
      if (src_idx < 0) {
         src_idx = num_vals++;
         values[src_idx] = entry->src;
      }

      nir_src dest_src = nir_src_for_reg(entry->dest.reg.reg);

      int dest_idx = -1;
      for (int i = 0; i < num_vals; ++i) {
         if (nir_srcs_equal(values[i], dest_src)) {
            /* Each destination of a parallel copy instruction should be
             * unique.  A destination may get used as a source, so we still
             * have to walk the list.  However, the predecessor should not,
             * at this point, be set yet, so we should have -1 here.
             */
            assert(pred[i] == -1);
            dest_idx = i;
         }
      }
      if (dest_idx < 0) {
         dest_idx = num_vals++;
         values[dest_idx] = dest_src;
      }

      loc[src_idx] = src_idx;
      pred[dest_idx] = src_idx;

      to_do[++to_do_idx] = dest_idx;
   }

   /* Currently empty destinations we can go ahead and fill */
   NIR_VLA(int, ready, num_copies * 2);
   int ready_idx = -1;

   /* Mark the ones that are ready for copying.  We know an index is a
    * destination if it has a predecessor and it's ready for copying if
    * it's not marked as containing data.
    */
   for (int i = 0; i < num_vals; i++) {
      if (pred[i] != -1 && loc[i] == -1)
         ready[++ready_idx] = i;
   }

   while (to_do_idx >= 0) {
      while (ready_idx >= 0) {
         int b = ready[ready_idx--];
         int a = pred[b];
         emit_copy(pcopy, values[loc[a]], values[b], state->mem_ctx);

         /* If any other copies want a, they can find it at b */
         loc[a] = b;

         /* b has been filled, mark it as not needing to be copied */
         pred[b] = -1;

         /* If a needs to be filled, it's ready for copying now */
         if (pred[a] != -1)
            ready[++ready_idx] = a;
      }
      int b = to_do[to_do_idx--];
      if (pred[b] == -1)
         continue;

      /* If we got here, then we don't have any more trivial copies that we
       * can do.  We have to break a cycle, so we create a new temporary
       * register for that purpose.  Normally, if going out of SSA after
       * register allocation, you would want to avoid creating temporary
       * registers.  However, we are going out of SSA before register
       * allocation, so we would rather not create extra register
       * dependencies for the backend to deal with.  If it wants, the
       * backend can coalesce the (possibly multiple) temporaries.
       */
      assert(num_vals < num_copies * 2);
      nir_register *reg = nir_local_reg_create(state->impl);
      reg->name = "copy_temp";
      reg->num_array_elems = 0;
      if (values[b].is_ssa)
         reg->num_components = values[b].ssa->num_components;
      else
         reg->num_components = values[b].reg.reg->num_components;
      values[num_vals].is_ssa = false;
      values[num_vals].reg.reg = reg;

      emit_copy(pcopy, values[b], values[num_vals], state->mem_ctx);
      loc[b] = num_vals;
      ready[++ready_idx] = b;
      num_vals++;
   }

   nir_instr_remove(&pcopy->instr);
}

/* Resolves the parallel copies in a block.  Each block can have at most
 * two: one at the beginning, right after all the phi nodes, and one at
 * the end (or right before the final jump if it exists).
 */
static bool
resolve_parallel_copies_block(nir_block *block, void *void_state)
{
   struct from_ssa_state *state = void_state;

   /* At this point, we have removed all of the phi nodes.  If a parallel
    * copy existed right after the phi nodes in this block, it is now the
    * first instruction.
    */
   nir_instr *first_instr = nir_block_first_instr(block);
   if (first_instr == NULL)
      return true; /* Empty, nothing to do. */

   if (first_instr->type == nir_instr_type_parallel_copy) {
      nir_parallel_copy_instr *pcopy = nir_instr_as_parallel_copy(first_instr);

      resolve_parallel_copy(pcopy, state);
   }

   /* It's possible that the above code already cleaned up the end parallel
    * copy.  However, doing so removed it from the instruction list so we
    * won't find it here.  Therefore, it's safe to go ahead and just look
    * for one and clean it up if it exists.
    */
   nir_parallel_copy_instr *end_pcopy =
      get_parallel_copy_at_end_of_block(block);
   if (end_pcopy)
      resolve_parallel_copy(end_pcopy, state);

   return true;
}

static void
nir_convert_from_ssa_impl(nir_function_impl *impl)
{
   struct from_ssa_state state;

   state.mem_ctx = ralloc_parent(impl);
   state.dead_ctx = ralloc_context(NULL);
   state.impl = impl;
   state.merge_node_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
                                                    _mesa_key_pointer_equal);

   nir_foreach_block(impl, add_parallel_copy_to_end_of_block, &state);
   nir_foreach_block(impl, isolate_phi_nodes_block, &state);

   /* Mark metadata as dirty before we ask for liveness analysis */
   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance);

   nir_metadata_require(impl, nir_metadata_live_variables |
                              nir_metadata_dominance);

   nir_foreach_block(impl, coalesce_phi_nodes_block, &state);
   nir_foreach_block(impl, aggressive_coalesce_block, &state);

   nir_foreach_block(impl, resolve_registers_block, &state);

   nir_foreach_block(impl, resolve_parallel_copies_block, &state);

   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance);

   /* Clean up dead instructions and the hash table */
   _mesa_hash_table_destroy(state.merge_node_table, NULL);
   ralloc_free(state.dead_ctx);
}

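/* Entry point for the pass.  As a usage sketch (the backend function named
 * below is hypothetical, not part of NIR): a driver that consumes registers
 * rather than SSA values would typically run this at the very end of its
 * NIR pipeline, e.g.
 *
 *    nir_convert_from_ssa(shader);
 *    my_backend_emit_code(shader);
 *
 * Afterwards, phi and ssa_undef instructions are gone and SSA values (other
 * than load_const results) have been rewritten to nir_registers.
 */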
void
nir_convert_from_ssa(nir_shader *shader)
{
   nir_foreach_overload(shader, overload) {
      if (overload->impl)
         nir_convert_from_ssa_impl(overload->impl);
   }
}