Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
5564 | serge | 1 | /* |
2 | * Copyright © 2014 Intel Corporation |
||
3 | * |
||
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
||
5 | * copy of this software and associated documentation files (the "Software"), |
||
6 | * to deal in the Software without restriction, including without limitation |
||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
||
8 | * and/or sell copies of the Software, and to permit persons to whom the |
||
9 | * Software is furnished to do so, subject to the following conditions: |
||
10 | * |
||
11 | * The above copyright notice and this permission notice (including the next |
||
12 | * paragraph) shall be included in all copies or substantial portions of the |
||
13 | * Software. |
||
14 | * |
||
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||
16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||
17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
||
18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||
19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
||
20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
||
21 | * IN THE SOFTWARE. |
||
22 | * |
||
23 | * Authors: |
||
24 | * Jason Ekstrand (jason@jlekstrand.net) |
||
25 | * |
||
26 | */ |
||
27 | |||
28 | #include "nir.h" |
||
29 | #include "nir_vla.h" |
||
30 | |||
31 | /* |
||
32 | * This file implements an out-of-SSA pass as described in "Revisiting |
||
33 | * Out-of-SSA Translation for Correctness, Code Quality, and Efficiency" by |
||
34 | * Boissinot et. al. |
||
35 | */ |
||
36 | |||
/* Pass-wide state threaded through every per-block callback. */
struct from_ssa_state {
   /* ralloc context of the shader (ralloc_parent(impl)); used for
    * allocations that must outlive this pass.
    */
   void *mem_ctx;
   /* Scratch context; everything on it is freed at the end of the pass. */
   void *dead_ctx;
   /* Maps nir_ssa_def * -> merge_node * (see get_merge_node). */
   struct hash_table *merge_node_table;
   /* Instruction currently being rewritten; set by
    * resolve_registers_block, read by rewrite_ssa_def.
    */
   nir_instr *instr;
   /* The function implementation being converted out of SSA. */
   nir_function_impl *impl;
};
||
44 | |||
45 | /* Returns true if a dominates b */ |
||
46 | static bool |
||
47 | ssa_def_dominates(nir_ssa_def *a, nir_ssa_def *b) |
||
48 | { |
||
49 | if (a->live_index == 0) { |
||
50 | /* SSA undefs always dominate */ |
||
51 | return true; |
||
52 | } else if (b->live_index < a->live_index) { |
||
53 | return false; |
||
54 | } else if (a->parent_instr->block == b->parent_instr->block) { |
||
55 | return a->live_index <= b->live_index; |
||
56 | } else { |
||
57 | return nir_block_dominates(a->parent_instr->block, |
||
58 | b->parent_instr->block); |
||
59 | } |
||
60 | } |
||
61 | |||
62 | |||
/* The following data structure, which I have named merge_set is a way of
 * representing a set of non-interfering registers.  This is based on the
 * concept of a "dominance forest" presented in "Fast Copy Coalescing and
 * Live-Range Identification" by Budimlic et. al. but the implementation
 * concept is taken from "Revisiting Out-of-SSA Translation for
 * Correctness, Code Quality, and Efficiency" by Boissinot et. al..
 *
 * Each SSA definition is associated with a merge_node and the association
 * is represented by a combination of a hash table and the "def" parameter
 * in the merge_node structure.  The merge_set stores a linked list of
 * merge_node's in dominance order of the ssa definitions.  (Since the
 * liveness analysis pass indexes the SSA values in dominance order for us,
 * this is an easy thing to keep up.)  It is assumed that no pair of the
 * nodes in a given set interfere.  Merging two sets or checking for
 * interference can be done in a single linear-time merge-sort walk of the
 * two lists of nodes.
 */
struct merge_set;

typedef struct {
   /* Link in merge_set::nodes; list is kept in dominance order. */
   struct exec_node node;
   /* The set this node currently belongs to (updated on merge). */
   struct merge_set *set;
   /* The SSA definition this node represents. */
   nir_ssa_def *def;
} merge_node;

typedef struct merge_set {
   /* merge_nodes sorted by def->live_index (dominance order). */
   struct exec_list nodes;
   /* Number of nodes in the set. */
   unsigned size;
   /* Register assigned to the whole set, or NULL if none yet
    * (see rewrite_ssa_def).
    */
   nir_register *reg;
} merge_set;
||
93 | |||
#if 0
/* Debug helper: prints a merge set as an indented dominance tree to fp,
 * one ssa def per line, indented by the number of entries currently on
 * the dominator stack.  Compiled out; kept around for debugging.
 */
static void
merge_set_dump(merge_set *set, FILE *fp)
{
   /* Stack of defs dominating the current node; list order guarantees a
    * parent is always seen before its children.
    */
   nir_ssa_def *dom[set->size];
   int dom_idx = -1;

   foreach_list_typed(merge_node, node, node, &set->nodes) {
      /* Pop defs that do not dominate this node. */
      while (dom_idx >= 0 && !ssa_def_dominates(dom[dom_idx], node->def))
         dom_idx--;

      for (int i = 0; i <= dom_idx; i++)
         fprintf(fp, " ");

      if (node->def->name)
         fprintf(fp, "ssa_%d /* %s */\n", node->def->index, node->def->name);
      else
         fprintf(fp, "ssa_%d\n", node->def->index);

      dom[++dom_idx] = node->def;
   }
}
#endif
||
117 | |||
118 | static merge_node * |
||
119 | get_merge_node(nir_ssa_def *def, struct from_ssa_state *state) |
||
120 | { |
||
121 | struct hash_entry *entry = |
||
122 | _mesa_hash_table_search(state->merge_node_table, def); |
||
123 | if (entry) |
||
124 | return entry->data; |
||
125 | |||
126 | merge_set *set = ralloc(state->dead_ctx, merge_set); |
||
127 | exec_list_make_empty(&set->nodes); |
||
128 | set->size = 1; |
||
129 | set->reg = NULL; |
||
130 | |||
131 | merge_node *node = ralloc(state->dead_ctx, merge_node); |
||
132 | node->set = set; |
||
133 | node->def = def; |
||
134 | exec_list_push_head(&set->nodes, &node->node); |
||
135 | |||
136 | _mesa_hash_table_insert(state->merge_node_table, def, node); |
||
137 | |||
138 | return node; |
||
139 | } |
||
140 | |||
141 | static bool |
||
142 | merge_nodes_interfere(merge_node *a, merge_node *b) |
||
143 | { |
||
144 | return nir_ssa_defs_interfere(a->def, b->def); |
||
145 | } |
||
146 | |||
/* Merges b into a
 *
 * Performs a merge-sort-style walk of the two dominance-ordered node
 * lists, splicing every node of b into a's list at the position that
 * keeps the combined list sorted by live_index.  On return b's list is
 * empty, b->size is zero, and every node that was in b points at a.
 */
static merge_set *
merge_merge_sets(merge_set *a, merge_set *b)
{
   struct exec_node *an = exec_list_get_head(&a->nodes);
   struct exec_node *bn = exec_list_get_head(&b->nodes);
   while (!exec_node_is_tail_sentinel(bn)) {
      merge_node *a_node = exec_node_data(merge_node, an, node);
      merge_node *b_node = exec_node_data(merge_node, bn, node);

      /* b's head sorts before a's current node (or we're past the end of
       * a's list): move it into a's list right here.  Note a_node is only
       * dereferenced after the sentinel check short-circuits.
       */
      if (exec_node_is_tail_sentinel(an) ||
          a_node->def->live_index > b_node->def->live_index) {
         struct exec_node *next = bn->next;
         exec_node_remove(bn);
         exec_node_insert_node_before(an, bn);
         /* The moved node now belongs to a. */
         exec_node_data(merge_node, bn, node)->set = a;
         bn = next;
      } else {
         an = an->next;
      }
   }

   a->size += b->size;
   b->size = 0;

   return a;
}
||
174 | |||
/* Checks for any interference between two merge sets
 *
 * This is an implementation of Algorithm 2 in "Revisiting Out-of-SSA
 * Translation for Correctness, Code Quality, and Efficiency" by
 * Boissinot et. al.
 *
 * Both lists are walked as one merged sequence in dominance order while a
 * stack of dominating nodes is maintained; each visited node only needs
 * an interference check against its nearest dominator on the stack.
 */
static bool
merge_sets_interfere(merge_set *a, merge_set *b)
{
   /* Dominator stack; at most every node of both sets can be on it. */
   NIR_VLA(merge_node *, dom, a->size + b->size);
   int dom_idx = -1;

   struct exec_node *an = exec_list_get_head(&a->nodes);
   struct exec_node *bn = exec_list_get_head(&b->nodes);
   while (!exec_node_is_tail_sentinel(an) ||
          !exec_node_is_tail_sentinel(bn)) {

      /* Take whichever list head has the smaller live_index so nodes are
       * visited in global dominance order (merge-sort step).
       */
      merge_node *current;
      if (exec_node_is_tail_sentinel(an)) {
         current = exec_node_data(merge_node, bn, node);
         bn = bn->next;
      } else if (exec_node_is_tail_sentinel(bn)) {
         current = exec_node_data(merge_node, an, node);
         an = an->next;
      } else {
         merge_node *a_node = exec_node_data(merge_node, an, node);
         merge_node *b_node = exec_node_data(merge_node, bn, node);

         if (a_node->def->live_index <= b_node->def->live_index) {
            current = a_node;
            an = an->next;
         } else {
            current = b_node;
            bn = bn->next;
         }
      }

      /* Pop stack entries that do not dominate the current node. */
      while (dom_idx >= 0 &&
             !ssa_def_dominates(dom[dom_idx]->def, current->def))
         dom_idx--;

      /* Checking against the nearest dominator suffices (Algorithm 2). */
      if (dom_idx >= 0 && merge_nodes_interfere(current, dom[dom_idx]))
         return true;

      dom[++dom_idx] = current;
   }

   return false;
}
||
224 | |||
225 | static bool |
||
226 | add_parallel_copy_to_end_of_block(nir_block *block, void *void_state) |
||
227 | { |
||
228 | struct from_ssa_state *state = void_state; |
||
229 | |||
230 | bool need_end_copy = false; |
||
231 | if (block->successors[0]) { |
||
232 | nir_instr *instr = nir_block_first_instr(block->successors[0]); |
||
233 | if (instr && instr->type == nir_instr_type_phi) |
||
234 | need_end_copy = true; |
||
235 | } |
||
236 | |||
237 | if (block->successors[1]) { |
||
238 | nir_instr *instr = nir_block_first_instr(block->successors[1]); |
||
239 | if (instr && instr->type == nir_instr_type_phi) |
||
240 | need_end_copy = true; |
||
241 | } |
||
242 | |||
243 | if (need_end_copy) { |
||
244 | /* If one of our successors has at least one phi node, we need to |
||
245 | * create a parallel copy at the end of the block but before the jump |
||
246 | * (if there is one). |
||
247 | */ |
||
248 | nir_parallel_copy_instr *pcopy = |
||
249 | nir_parallel_copy_instr_create(state->dead_ctx); |
||
250 | |||
251 | nir_instr *last_instr = nir_block_last_instr(block); |
||
252 | if (last_instr && last_instr->type == nir_instr_type_jump) { |
||
253 | nir_instr_insert_before(last_instr, &pcopy->instr); |
||
254 | } else { |
||
255 | nir_instr_insert_after_block(block, &pcopy->instr); |
||
256 | } |
||
257 | } |
||
258 | |||
259 | return true; |
||
260 | } |
||
261 | |||
262 | static nir_parallel_copy_instr * |
||
263 | get_parallel_copy_at_end_of_block(nir_block *block) |
||
264 | { |
||
265 | nir_instr *last_instr = nir_block_last_instr(block); |
||
266 | if (last_instr == NULL) |
||
267 | return NULL; |
||
268 | |||
269 | /* The last instruction may be a jump in which case the parallel copy is |
||
270 | * right before it. |
||
271 | */ |
||
272 | if (last_instr->type == nir_instr_type_jump) |
||
273 | last_instr = nir_instr_prev(last_instr); |
||
274 | |||
275 | if (last_instr && last_instr->type == nir_instr_type_parallel_copy) |
||
276 | return nir_instr_as_parallel_copy(last_instr); |
||
277 | else |
||
278 | return NULL; |
||
279 | } |
||
280 | |||
/** Isolate phi nodes with parallel copies
 *
 * In order to solve the dependency problems with the sources and
 * destinations of phi nodes, we first isolate them by adding parallel
 * copies to the beginnings and ends of basic blocks.  For every block with
 * phi nodes, we add a parallel copy immediately following the last phi
 * node that copies the destinations of all of the phi nodes to new SSA
 * values.  We also add a parallel copy to the end of every block that has
 * a successor with phi nodes that, for each phi node in each successor,
 * copies the corresponding source of the phi node and adjusts the phi to
 * use the destination of the parallel copy.
 *
 * In SSA form, each value has exactly one definition.  What this does is
 * ensure that each value used in a phi also has exactly one use.  The
 * destinations of phis are only used by the parallel copy immediately
 * following the phi nodes.  Thanks to the parallel copy at the end of
 * the predecessor block, the sources of phi nodes are the only use of
 * that value.  This allows us to immediately assign all the sources and
 * destinations of any given phi node to the same register without worrying
 * about interference at all.  We do coalescing to get rid of the parallel
 * copies where possible.
 *
 * Before this pass can be run, we have to iterate over the blocks with
 * add_parallel_copy_to_end_of_block to ensure that the parallel copies at
 * the ends of blocks exist.  We can create the ones at the beginnings as
 * we go, but the ones at the ends of blocks need to be created ahead of
 * time because of potential back-edges in the CFG.
 */
static bool
isolate_phi_nodes_block(nir_block *block, void *void_state)
{
   struct from_ssa_state *state = void_state;

   /* Find the last phi in the block (phis are always at the top). */
   nir_instr *last_phi_instr = NULL;
   nir_foreach_instr(block, instr) {
      /* Phi nodes only ever come at the start of a block */
      if (instr->type != nir_instr_type_phi)
         break;

      last_phi_instr = instr;
   }

   /* If we don't have any phi's, then there's nothing for us to do. */
   if (last_phi_instr == NULL)
      return true;

   /* If we have phi nodes, we need to create a parallel copy at the
    * start of this block but after the phi nodes.
    */
   nir_parallel_copy_instr *block_pcopy =
      nir_parallel_copy_instr_create(state->dead_ctx);
   nir_instr_insert_after(last_phi_instr, &block_pcopy->instr);

   nir_foreach_instr(block, instr) {
      /* Phi nodes only ever come at the start of a block */
      if (instr->type != nir_instr_type_phi)
         break;

      nir_phi_instr *phi = nir_instr_as_phi(instr);
      assert(phi->dest.is_ssa);
      nir_foreach_phi_src(phi, src) {
         /* For each source, add an entry to the predecessor's
          * end-of-block copy and point the phi source at that entry's
          * destination instead of the original value.
          */
         nir_parallel_copy_instr *pcopy =
            get_parallel_copy_at_end_of_block(src->pred);
         assert(pcopy);

         nir_parallel_copy_entry *entry = rzalloc(state->dead_ctx,
                                                  nir_parallel_copy_entry);
         nir_ssa_dest_init(&pcopy->instr, &entry->dest,
                           phi->dest.ssa.num_components, src->src.ssa->name);
         exec_list_push_tail(&pcopy->entries, &entry->node);

         assert(src->src.is_ssa);
         nir_instr_rewrite_src(&pcopy->instr, &entry->src, src->src);

         nir_instr_rewrite_src(&phi->instr, &src->src,
                               nir_src_for_ssa(&entry->dest.ssa));
      }

      /* Route the phi destination through the start-of-block copy: all
       * former users of the phi now read the copy's destination, and the
       * copy's source is the phi itself.
       */
      nir_parallel_copy_entry *entry = rzalloc(state->dead_ctx,
                                               nir_parallel_copy_entry);
      nir_ssa_dest_init(&block_pcopy->instr, &entry->dest,
                        phi->dest.ssa.num_components, phi->dest.ssa.name);
      exec_list_push_tail(&block_pcopy->entries, &entry->node);

      nir_ssa_def_rewrite_uses(&phi->dest.ssa,
                               nir_src_for_ssa(&entry->dest.ssa),
                               state->mem_ctx);

      nir_instr_rewrite_src(&block_pcopy->instr, &entry->src,
                            nir_src_for_ssa(&phi->dest.ssa));
   }

   return true;
}
||
375 | |||
376 | static bool |
||
377 | coalesce_phi_nodes_block(nir_block *block, void *void_state) |
||
378 | { |
||
379 | struct from_ssa_state *state = void_state; |
||
380 | |||
381 | nir_foreach_instr(block, instr) { |
||
382 | /* Phi nodes only ever come at the start of a block */ |
||
383 | if (instr->type != nir_instr_type_phi) |
||
384 | break; |
||
385 | |||
386 | nir_phi_instr *phi = nir_instr_as_phi(instr); |
||
387 | |||
388 | assert(phi->dest.is_ssa); |
||
389 | merge_node *dest_node = get_merge_node(&phi->dest.ssa, state); |
||
390 | |||
391 | nir_foreach_phi_src(phi, src) { |
||
392 | assert(src->src.is_ssa); |
||
393 | merge_node *src_node = get_merge_node(src->src.ssa, state); |
||
394 | if (src_node->set != dest_node->set) |
||
395 | merge_merge_sets(dest_node->set, src_node->set); |
||
396 | } |
||
397 | } |
||
398 | |||
399 | return true; |
||
400 | } |
||
401 | |||
402 | static void |
||
403 | aggressive_coalesce_parallel_copy(nir_parallel_copy_instr *pcopy, |
||
404 | struct from_ssa_state *state) |
||
405 | { |
||
406 | nir_foreach_parallel_copy_entry(pcopy, entry) { |
||
407 | if (!entry->src.is_ssa) |
||
408 | continue; |
||
409 | |||
410 | /* Since load_const instructions are SSA only, we can't replace their |
||
411 | * destinations with registers and, therefore, can't coalesce them. |
||
412 | */ |
||
413 | if (entry->src.ssa->parent_instr->type == nir_instr_type_load_const) |
||
414 | continue; |
||
415 | |||
416 | /* Don't try and coalesce these */ |
||
417 | if (entry->dest.ssa.num_components != entry->src.ssa->num_components) |
||
418 | continue; |
||
419 | |||
420 | merge_node *src_node = get_merge_node(entry->src.ssa, state); |
||
421 | merge_node *dest_node = get_merge_node(&entry->dest.ssa, state); |
||
422 | |||
423 | if (src_node->set == dest_node->set) |
||
424 | continue; |
||
425 | |||
426 | if (!merge_sets_interfere(src_node->set, dest_node->set)) |
||
427 | merge_merge_sets(src_node->set, dest_node->set); |
||
428 | } |
||
429 | } |
||
430 | |||
431 | static bool |
||
432 | aggressive_coalesce_block(nir_block *block, void *void_state) |
||
433 | { |
||
434 | struct from_ssa_state *state = void_state; |
||
435 | |||
436 | nir_parallel_copy_instr *start_pcopy = NULL; |
||
437 | nir_foreach_instr(block, instr) { |
||
438 | /* Phi nodes only ever come at the start of a block */ |
||
439 | if (instr->type != nir_instr_type_phi) { |
||
440 | if (instr->type != nir_instr_type_parallel_copy) |
||
441 | break; /* The parallel copy must be right after the phis */ |
||
442 | |||
443 | start_pcopy = nir_instr_as_parallel_copy(instr); |
||
444 | |||
445 | aggressive_coalesce_parallel_copy(start_pcopy, state); |
||
446 | |||
447 | break; |
||
448 | } |
||
449 | } |
||
450 | |||
451 | nir_parallel_copy_instr *end_pcopy = |
||
452 | get_parallel_copy_at_end_of_block(block); |
||
453 | |||
454 | if (end_pcopy && end_pcopy != start_pcopy) |
||
455 | aggressive_coalesce_parallel_copy(end_pcopy, state); |
||
456 | |||
457 | return true; |
||
458 | } |
||
459 | |||
/* Rewrites a single SSA def (and the nir_dest wrapping it) to a register.
 *
 * Invoked via nir_foreach_ssa_def for every def of state->instr.  Defs in
 * a phi web use the web's shared register; load_const defs are left in
 * SSA form; everything else gets a fresh local register.
 */
static bool
rewrite_ssa_def(nir_ssa_def *def, void *void_state)
{
   struct from_ssa_state *state = void_state;
   nir_register *reg;

   struct hash_entry *entry =
      _mesa_hash_table_search(state->merge_node_table, def);
   if (entry) {
      /* In this case, we're part of a phi web.  Use the web's register. */
      merge_node *node = (merge_node *)entry->data;

      /* If it doesn't have a register yet, create one.  Note that all of
       * the things in the merge set should be the same so it doesn't
       * matter which node's definition we use.
       */
      if (node->set->reg == NULL) {
         node->set->reg = nir_local_reg_create(state->impl);
         node->set->reg->name = def->name;
         node->set->reg->num_components = def->num_components;
         node->set->reg->num_array_elems = 0;
      }

      reg = node->set->reg;
   } else {
      /* We leave load_const SSA values alone.  They act as immediates to
       * the backend.  If it got coalesced into a phi, that's ok.
       */
      if (def->parent_instr->type == nir_instr_type_load_const)
         return true;

      reg = nir_local_reg_create(state->impl);
      reg->name = def->name;
      reg->num_components = def->num_components;
      reg->num_array_elems = 0;

      /* This register comes from an SSA definition that is defined and not
       * part of a phi-web.  Therefore, we know it has a single unique
       * definition that dominates all of its uses; we can copy the
       * parent_instr from the SSA def safely.
       */
      if (def->parent_instr->type != nir_instr_type_ssa_undef)
         reg->parent_instr = def->parent_instr;
   }

   /* Point every use of the def at the register instead. */
   nir_ssa_def_rewrite_uses(def, nir_src_for_reg(reg), state->mem_ctx);
   assert(list_empty(&def->uses) && list_empty(&def->if_uses));

   /* Undefs have no real destination to rewrite; the instruction itself
    * is removed by resolve_registers_block.
    */
   if (def->parent_instr->type == nir_instr_type_ssa_undef)
      return true;

   assert(def->parent_instr->type != nir_instr_type_load_const);

   /* At this point we know a priori that this SSA def is part of a
    * nir_dest.  We can use exec_node_data to get the dest pointer.
    */
   nir_dest *dest = exec_node_data(nir_dest, def, ssa);

   *dest = nir_dest_for_reg(reg);
   dest->reg.parent_instr = state->instr;
   list_addtail(&dest->reg.def_link, &reg->defs);

   return true;
}
||
524 | |||
525 | /* Resolves ssa definitions to registers. While we're at it, we also |
||
526 | * remove phi nodes and ssa_undef instructions |
||
527 | */ |
||
528 | static bool |
||
529 | resolve_registers_block(nir_block *block, void *void_state) |
||
530 | { |
||
531 | struct from_ssa_state *state = void_state; |
||
532 | |||
533 | nir_foreach_instr_safe(block, instr) { |
||
534 | state->instr = instr; |
||
535 | nir_foreach_ssa_def(instr, rewrite_ssa_def, state); |
||
536 | |||
537 | if (instr->type == nir_instr_type_ssa_undef || |
||
538 | instr->type == nir_instr_type_phi) { |
||
539 | nir_instr_remove(instr); |
||
540 | ralloc_steal(state->dead_ctx, instr); |
||
541 | } |
||
542 | } |
||
543 | state->instr = NULL; |
||
544 | |||
545 | return true; |
||
546 | } |
||
547 | |||
548 | static void |
||
549 | emit_copy(nir_parallel_copy_instr *pcopy, nir_src src, nir_src dest_src, |
||
550 | void *mem_ctx) |
||
551 | { |
||
552 | assert(!dest_src.is_ssa && |
||
553 | dest_src.reg.indirect == NULL && |
||
554 | dest_src.reg.base_offset == 0); |
||
555 | |||
556 | if (src.is_ssa) |
||
557 | assert(src.ssa->num_components >= dest_src.reg.reg->num_components); |
||
558 | else |
||
559 | assert(src.reg.reg->num_components >= dest_src.reg.reg->num_components); |
||
560 | |||
561 | nir_alu_instr *mov = nir_alu_instr_create(mem_ctx, nir_op_imov); |
||
562 | nir_src_copy(&mov->src[0].src, &src, mem_ctx); |
||
563 | mov->dest.dest = nir_dest_for_reg(dest_src.reg.reg); |
||
564 | mov->dest.write_mask = (1 << dest_src.reg.reg->num_components) - 1; |
||
565 | |||
566 | nir_instr_insert_before(&pcopy->instr, &mov->instr); |
||
567 | } |
||
568 | |||
/* Resolves a single parallel copy operation into a sequence of mov's
 *
 * This is based on Algorithm 1 from "Revisiting Out-of-SSA Translation for
 * Correctness, Code Quality, and Efficiency" by Boissinot et. al..
 * However, I never got the algorithm to work as written, so this version
 * is slightly modified.
 *
 * The algorithm works by playing this little shell game with the values.
 * We start by recording where every source value is and which source value
 * each destination value should receive.  We then grab any copy whose
 * destination is "empty", i.e. not used as a source, and do the following:
 *  - Find where its source value currently lives
 *  - Emit the move instruction
 *  - Set the location of the source value to the destination
 *  - Mark the location containing the source value
 *  - Mark the destination as no longer needing to be copied
 *
 * When we run out of "empty" destinations, we have a cycle and so we
 * create a temporary register, copy to that register, and mark the value
 * we copied as living in that temporary.  Now, the cycle is broken, so we
 * can continue with the above steps.
 */
static void
resolve_parallel_copy(nir_parallel_copy_instr *pcopy,
                      struct from_ssa_state *state)
{
   /* Count the entries that actually move data anywhere. */
   unsigned num_copies = 0;
   nir_foreach_parallel_copy_entry(pcopy, entry) {
      /* Sources may be SSA */
      if (!entry->src.is_ssa && entry->src.reg.reg == entry->dest.reg.reg)
         continue;

      num_copies++;
   }

   if (num_copies == 0) {
      /* Hooray, we don't need any copies! */
      nir_instr_remove(&pcopy->instr);
      return;
   }

   /* The register/source corresponding to the given index */
   NIR_VLA_ZERO(nir_src, values, num_copies * 2);

   /* The current location of a given piece of data.  We will use -1 for "null" */
   NIR_VLA_FILL(int, loc, num_copies * 2, -1);

   /* The piece of data that the given piece of data is to be copied from.  We will use -1 for "null" */
   NIR_VLA_FILL(int, pred, num_copies * 2, -1);

   /* The destinations we have yet to properly fill */
   NIR_VLA(int, to_do, num_copies * 2);
   int to_do_idx = -1;

   /* Now we set everything up:
    *  - All values get assigned a temporary index
    *  - Current locations are set from sources
    *  - Predecessors are recorded from sources and destinations
    */
   int num_vals = 0;
   nir_foreach_parallel_copy_entry(pcopy, entry) {
      /* Sources may be SSA */
      if (!entry->src.is_ssa && entry->src.reg.reg == entry->dest.reg.reg)
         continue;

      /* Find or allocate the index of this entry's source value. */
      int src_idx = -1;
      for (int i = 0; i < num_vals; ++i) {
         if (nir_srcs_equal(values[i], entry->src))
            src_idx = i;
      }
      if (src_idx < 0) {
         src_idx = num_vals++;
         values[src_idx] = entry->src;
      }

      nir_src dest_src = nir_src_for_reg(entry->dest.reg.reg);

      /* Find or allocate the index of this entry's destination. */
      int dest_idx = -1;
      for (int i = 0; i < num_vals; ++i) {
         if (nir_srcs_equal(values[i], dest_src)) {
            /* Each destination of a parallel copy instruction should be
             * unique.  A destination may get used as a source, so we still
             * have to walk the list.  However, the predecessor should not,
             * at this point, be set yet, so we should have -1 here.
             */
            assert(pred[i] == -1);
            dest_idx = i;
         }
      }
      if (dest_idx < 0) {
         dest_idx = num_vals++;
         values[dest_idx] = dest_src;
      }

      /* A source initially lives in its own slot. */
      loc[src_idx] = src_idx;
      pred[dest_idx] = src_idx;

      to_do[++to_do_idx] = dest_idx;
   }

   /* Currently empty destinations we can go ahead and fill */
   NIR_VLA(int, ready, num_copies * 2);
   int ready_idx = -1;

   /* Mark the ones that are ready for copying.  We know an index is a
    * destination if it has a predecessor and it's ready for copying if
    * it's not marked as containing data.
    */
   for (int i = 0; i < num_vals; i++) {
      if (pred[i] != -1 && loc[i] == -1)
         ready[++ready_idx] = i;
   }

   while (to_do_idx >= 0) {
      /* Drain all the trivial (acyclic) copies first. */
      while (ready_idx >= 0) {
         int b = ready[ready_idx--];
         int a = pred[b];
         emit_copy(pcopy, values[loc[a]], values[b], state->mem_ctx);

         /* If any other copies want a, they can find it at b */
         loc[a] = b;

         /* b has been filled, mark it as not needing to be copied */
         pred[b] = -1;

         /* If a needs to be filled, it's ready for copying now */
         if (pred[a] != -1)
            ready[++ready_idx] = a;
      }
      int b = to_do[to_do_idx--];
      if (pred[b] == -1)
         continue;

      /* If we got here, then we don't have any more trivial copies that we
       * can do.  We have to break a cycle, so we create a new temporary
       * register for that purpose.  Normally, if going out of SSA after
       * register allocation, you would want to avoid creating temporary
       * registers.  However, we are going out of SSA before register
       * allocation, so we would rather not create extra register
       * dependencies for the backend to deal with.  If it wants, the
       * backend can coalesce the (possibly multiple) temporaries.
       */
      assert(num_vals < num_copies * 2);
      nir_register *reg = nir_local_reg_create(state->impl);
      reg->name = "copy_temp";
      reg->num_array_elems = 0;
      if (values[b].is_ssa)
         reg->num_components = values[b].ssa->num_components;
      else
         reg->num_components = values[b].reg.reg->num_components;
      values[num_vals].is_ssa = false;
      values[num_vals].reg.reg = reg;

      /* Stash b's current value in the temporary; b is now empty and can
       * be filled on the next pass over the ready stack.
       */
      emit_copy(pcopy, values[b], values[num_vals], state->mem_ctx);
      loc[b] = num_vals;
      ready[++ready_idx] = b;
      num_vals++;
   }

   nir_instr_remove(&pcopy->instr);
}
||
730 | |||
731 | /* Resolves the parallel copies in a block. Each block can have at most |
||
732 | * two: One at the beginning, right after all the phi noces, and one at |
||
733 | * the end (or right before the final jump if it exists). |
||
734 | */ |
||
735 | static bool |
||
736 | resolve_parallel_copies_block(nir_block *block, void *void_state) |
||
737 | { |
||
738 | struct from_ssa_state *state = void_state; |
||
739 | |||
740 | /* At this point, we have removed all of the phi nodes. If a parallel |
||
741 | * copy existed right after the phi nodes in this block, it is now the |
||
742 | * first instruction. |
||
743 | */ |
||
744 | nir_instr *first_instr = nir_block_first_instr(block); |
||
745 | if (first_instr == NULL) |
||
746 | return true; /* Empty, nothing to do. */ |
||
747 | |||
748 | if (first_instr->type == nir_instr_type_parallel_copy) { |
||
749 | nir_parallel_copy_instr *pcopy = nir_instr_as_parallel_copy(first_instr); |
||
750 | |||
751 | resolve_parallel_copy(pcopy, state); |
||
752 | } |
||
753 | |||
754 | /* It's possible that the above code already cleaned up the end parallel |
||
755 | * copy. However, doing so removed it form the instructions list so we |
||
756 | * won't find it here. Therefore, it's safe to go ahead and just look |
||
757 | * for one and clean it up if it exists. |
||
758 | */ |
||
759 | nir_parallel_copy_instr *end_pcopy = |
||
760 | get_parallel_copy_at_end_of_block(block); |
||
761 | if (end_pcopy) |
||
762 | resolve_parallel_copy(end_pcopy, state); |
||
763 | |||
764 | return true; |
||
765 | } |
||
766 | |||
/* Converts one function implementation out of SSA form.
 *
 * Pipeline: isolate phis with parallel copies, coalesce phi webs and
 * copies, rewrite SSA defs to registers, then lower the remaining
 * parallel copies to movs.
 */
static void
nir_convert_from_ssa_impl(nir_function_impl *impl)
{
   struct from_ssa_state state;

   /* Long-lived allocations go on the shader's context; scratch data goes
    * on dead_ctx, which is freed at the bottom of this function.
    */
   state.mem_ctx = ralloc_parent(impl);
   state.dead_ctx = ralloc_context(NULL);
   state.impl = impl;
   state.merge_node_table = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
                                                    _mesa_key_pointer_equal);

   /* End-of-block copies must exist before isolate_phi_nodes_block runs;
    * see the comment on that function.
    */
   nir_foreach_block(impl, add_parallel_copy_to_end_of_block, &state);
   nir_foreach_block(impl, isolate_phi_nodes_block, &state);

   /* Mark metadata as dirty before we ask for liveness analysis */
   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance);

   /* Coalescing below relies on live_index ordering and dominance. */
   nir_metadata_require(impl, nir_metadata_live_variables |
                              nir_metadata_dominance);

   nir_foreach_block(impl, coalesce_phi_nodes_block, &state);
   nir_foreach_block(impl, aggressive_coalesce_block, &state);

   /* Replace SSA defs with registers and drop phis/undefs... */
   nir_foreach_block(impl, resolve_registers_block, &state);

   /* ...then lower each surviving parallel copy to a sequence of movs. */
   nir_foreach_block(impl, resolve_parallel_copies_block, &state);

   nir_metadata_preserve(impl, nir_metadata_block_index |
                               nir_metadata_dominance);

   /* Clean up dead instructions and the hash tables */
   _mesa_hash_table_destroy(state.merge_node_table, NULL);
   ralloc_free(state.dead_ctx);
}
||
802 | |||
803 | void |
||
804 | nir_convert_from_ssa(nir_shader *shader) |
||
805 | { |
||
806 | nir_foreach_overload(shader, overload) { |
||
807 | if (overload->impl) |
||
808 | nir_convert_from_ssa_impl(overload->impl); |
||
809 | } |
||
810 | }>>>>>>><>=>=>=>> |