Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
5564 | serge | 1 | /* |
2 | * Copyright © 2010 Intel Corporation |
||
3 | * |
||
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
||
5 | * copy of this software and associated documentation files (the "Software"), |
||
6 | * to deal in the Software without restriction, including without limitation |
||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
||
8 | * and/or sell copies of the Software, and to permit persons to whom the |
||
9 | * Software is furnished to do so, subject to the following conditions: |
||
10 | * |
||
11 | * The above copyright notice and this permission notice (including the next |
||
12 | * paragraph) shall be included in all copies or substantial portions of the |
||
13 | * Software. |
||
14 | * |
||
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||
16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||
17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
||
18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||
19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
||
20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
||
21 | * IN THE SOFTWARE. |
||
22 | * |
||
23 | * Authors: |
||
24 | * Eric Anholt |
||
25 | * |
||
26 | */ |
||
27 | |||
28 | #include "brw_fs.h" |
||
29 | #include "brw_cfg.h" |
||
30 | #include "glsl/glsl_types.h" |
||
31 | #include "glsl/ir_optimization.h" |
||
32 | |||
33 | static void |
||
34 | assign_reg(unsigned *reg_hw_locations, fs_reg *reg) |
||
35 | { |
||
36 | if (reg->file == GRF) { |
||
37 | reg->reg = reg_hw_locations[reg->reg] + reg->reg_offset; |
||
38 | reg->reg_offset = 0; |
||
39 | } |
||
40 | } |
||
41 | |||
/* Trivially assign each virtual GRF to the next free hardware register,
 * with no liveness analysis or reuse: VGRF i starts right after VGRF i-1
 * ends.  Used when real graph-coloring allocation isn't wanted/needed.
 */
void
fs_visitor::assign_regs_trivial()
{
   /* hw_reg_mapping[i] is the hardware register where VGRF i begins; the
    * extra final entry holds the total number of GRFs consumed.
    */
   unsigned hw_reg_mapping[this->alloc.count + 1];
   unsigned i;
   int reg_width = dispatch_width / 8;

   /* Note that compressed instructions require alignment to 2 registers. */
   hw_reg_mapping[0] = ALIGN(this->first_non_payload_grf, reg_width);
   /* Prefix-sum of VGRF sizes gives each VGRF's base register. */
   for (i = 1; i <= this->alloc.count; i++) {
      hw_reg_mapping[i] = (hw_reg_mapping[i - 1] +
                           this->alloc.sizes[i - 1]);
   }
   this->grf_used = hw_reg_mapping[this->alloc.count];

   /* Patch every instruction's dst/src GRF references to hardware regs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      assign_reg(hw_reg_mapping, &inst->dst);
      for (i = 0; i < inst->sources; i++) {
         assign_reg(hw_reg_mapping, &inst->src[i]);
      }
   }

   if (this->grf_used >= max_grf) {
      /* No spilling path here: just record failure for the caller. */
      fail("Ran out of regs on trivial allocator (%d/%d)\n",
           this->grf_used, max_grf);
   } else {
      this->alloc.count = this->grf_used;
   }

}
||
72 | |||
/* Build the ra_regs register set (classes, conflicts and q values) for one
 * register width (1 = SIMD8, 2 = SIMD16) and store it in
 * compiler->fs_reg_sets[reg_width - 1].  Done once at compiler creation.
 */
static void
brw_alloc_reg_set(struct brw_compiler *compiler, int reg_width)
{
   const struct brw_device_info *devinfo = compiler->devinfo;
   int base_reg_count = BRW_MAX_GRF;
   int index = reg_width - 1;

   /* The registers used to make up almost all values handled in the compiler
    * are a scalar value occupying a single register (or 2 registers in the
    * case of SIMD16, which is handled by dividing base_reg_count by 2 and
    * multiplying allocated register numbers by 2).  Things that were
    * aggregates of scalar values at the GLSL level were split to scalar
    * values by split_virtual_grfs().
    *
    * However, texture SEND messages return a series of contiguous registers
    * to write into.  We currently always ask for 4 registers, but we may
    * convert that to use less some day.
    *
    * Additionally, on gen5 we need aligned pairs of registers for the PLN
    * instruction, and on gen4 we need 8 contiguous regs for workaround simd16
    * texturing.
    *
    * So we have a need for classes for 1, 2, 4, and 8 registers currently,
    * and we add in '3' to make indexing the array easier for the common case
    * (since we'll probably want it for texturing later).
    *
    * And, on gen7 and newer, we do texturing SEND messages from GRFs, which
    * means that we may need any size up to the sampler message size limit (11
    * regs).
    */
   int class_count;
   int class_sizes[MAX_VGRF_SIZE];

   if (devinfo->gen >= 7) {
      /* Gen7+: one class per possible VGRF size (1..MAX_VGRF_SIZE). */
      for (class_count = 0; class_count < MAX_VGRF_SIZE; class_count++)
         class_sizes[class_count] = class_count + 1;
   } else {
      /* Pre-gen7: sizes 1-4, plus 8 for the SIMD16 texturing workaround. */
      for (class_count = 0; class_count < 4; class_count++)
         class_sizes[class_count] = class_count + 1;
      class_sizes[class_count++] = 8;
   }

   memset(compiler->fs_reg_sets[index].class_to_ra_reg_range, 0,
          sizeof(compiler->fs_reg_sets[index].class_to_ra_reg_range));
   int *class_to_ra_reg_range = compiler->fs_reg_sets[index].class_to_ra_reg_range;

   /* Compute the total number of registers across all classes. */
   int ra_reg_count = 0;
   for (int i = 0; i < class_count; i++) {
      if (devinfo->gen <= 5 && reg_width == 2) {
         /* From the G45 PRM:
          *
          * In order to reduce the hardware complexity, the following
          * rules and restrictions apply to the compressed instruction:
          * ...
          * * Operand Alignment Rule: With the exceptions listed below, a
          * source/destination operand in general should be aligned to
          * even 256-bit physical register with a region size equal to
          * two 256-bit physical register
          */
         ra_reg_count += (base_reg_count - (class_sizes[i] - 1)) / 2;
      } else {
         ra_reg_count += base_reg_count - (class_sizes[i] - 1);
      }
      /* Mark the last register.  We'll fill in the beginnings later. */
      class_to_ra_reg_range[class_sizes[i]] = ra_reg_count;
   }

   /* Fill out the rest of the range markers */
   for (int i = 1; i < 17; ++i) {
      if (class_to_ra_reg_range[i] == 0)
         class_to_ra_reg_range[i] = class_to_ra_reg_range[i-1];
   }

   uint8_t *ra_reg_to_grf = ralloc_array(compiler, uint8_t, ra_reg_count);
   struct ra_regs *regs = ra_alloc_reg_set(compiler, ra_reg_count);
   if (devinfo->gen >= 6)
      ra_set_allocate_round_robin(regs);
   int *classes = ralloc_array(compiler, int, class_count);
   int aligned_pairs_class = -1;

   /* Allocate space for q values.  We allocate class_count + 1 because we
    * want to leave room for the aligned pairs class if we have it. */
   unsigned int **q_values = ralloc_array(compiler, unsigned int *,
                                          class_count + 1);
   for (int i = 0; i < class_count + 1; ++i)
      q_values[i] = ralloc_array(q_values, unsigned int, class_count + 1);

   /* Now, add the registers to their classes, and add the conflicts
    * between them and the base GRF registers (and also each other).
    */
   int reg = 0;
   int pairs_base_reg = 0;
   int pairs_reg_count = 0;
   for (int i = 0; i < class_count; i++) {
      int class_reg_count;
      if (devinfo->gen <= 5 && reg_width == 2) {
         class_reg_count = (base_reg_count - (class_sizes[i] - 1)) / 2;

         /* See comment below.  The only difference here is that we are
          * dealing with pairs of registers instead of single registers.
          * Registers of odd sizes simply get rounded up. */
         for (int j = 0; j < class_count; j++)
            q_values[i][j] = (class_sizes[i] + 1) / 2 +
                             (class_sizes[j] + 1) / 2 - 1;
      } else {
         class_reg_count = base_reg_count - (class_sizes[i] - 1);

         /* From register_allocate.c:
          *
          * q(B,C) (indexed by C, B is this register class) in
          * Runeson/Nyström paper.  This is "how many registers of B could
          * the worst choice register from C conflict with".
          *
          * If we just let the register allocation algorithm compute these
          * values, is extremely expensive.  However, since all of our
          * registers are laid out, we can very easily compute them
          * ourselves.  View the register from C as fixed starting at GRF n
          * somewhere in the middle, and the register from B as sliding back
          * and forth.  Then the first register to conflict from B is the
          * one starting at n - class_size[B] + 1 and the last register to
          * conflict will start at n + class_size[B] - 1.  Therefore, the
          * number of conflicts from B is class_size[B] + class_size[C] - 1.
          *
          *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
          * B | | | | | |n| --> | | | | | | |
          *   +-+-+-+-+-+-+     +-+-+-+-+-+-+
          *             +-+-+-+-+-+
          *           C |n| | | | |
          *             +-+-+-+-+-+
          */
         for (int j = 0; j < class_count; j++)
            q_values[i][j] = class_sizes[i] + class_sizes[j] - 1;
      }
      classes[i] = ra_alloc_reg_class(regs);

      /* Save this off for the aligned pair class at the end. */
      if (class_sizes[i] == 2) {
         pairs_base_reg = reg;
         pairs_reg_count = class_reg_count;
      }

      if (devinfo->gen <= 5 && reg_width == 2) {
         for (int j = 0; j < class_reg_count; j++) {
            ra_class_add_reg(regs, classes[i], reg);

            /* Pre-gen6 SIMD16 uses only even GRF numbers (pairs). */
            ra_reg_to_grf[reg] = j * 2;

            for (int base_reg = j;
                 base_reg < j + (class_sizes[i] + 1) / 2;
                 base_reg++) {
               ra_add_transitive_reg_conflict(regs, base_reg, reg);
            }

            reg++;
         }
      } else {
         for (int j = 0; j < class_reg_count; j++) {
            ra_class_add_reg(regs, classes[i], reg);

            ra_reg_to_grf[reg] = j;

            /* Conflict with every base GRF this sized register covers. */
            for (int base_reg = j;
                 base_reg < j + class_sizes[i];
                 base_reg++) {
               ra_add_transitive_reg_conflict(regs, base_reg, reg);
            }

            reg++;
         }
      }
   }
   assert(reg == ra_reg_count);

   /* Add a special class for aligned pairs, which we'll put delta_xy
    * in on Gen <= 6 so that we can do PLN.
    */
   if (devinfo->has_pln && reg_width == 1 && devinfo->gen <= 6) {
      aligned_pairs_class = ra_alloc_reg_class(regs);

      for (int i = 0; i < pairs_reg_count; i++) {
         if ((ra_reg_to_grf[pairs_base_reg + i] & 1) == 0) {
            ra_class_add_reg(regs, aligned_pairs_class, pairs_base_reg + i);
         }
      }

      for (int i = 0; i < class_count; i++) {
         /* These are a little counter-intuitive because the pair registers
          * are required to be aligned while the register they are
          * potentially interfering with are not.  In the case where the
          * size is even, the worst-case is that the register is
          * odd-aligned.  In the odd-size case, it doesn't matter.
          */
         q_values[class_count][i] = class_sizes[i] / 2 + 1;
         q_values[i][class_count] = class_sizes[i] + 1;
      }
      q_values[class_count][class_count] = 1;
   }

   ra_set_finalize(regs, q_values);

   ralloc_free(q_values);

   /* Publish the finished set; classes[] is indexed by size - 1, with -1
    * marking sizes that have no class.
    */
   compiler->fs_reg_sets[index].regs = regs;
   for (unsigned i = 0; i < ARRAY_SIZE(compiler->fs_reg_sets[index].classes); i++)
      compiler->fs_reg_sets[index].classes[i] = -1;
   for (int i = 0; i < class_count; i++)
      compiler->fs_reg_sets[index].classes[class_sizes[i] - 1] = classes[i];
   compiler->fs_reg_sets[index].ra_reg_to_grf = ra_reg_to_grf;
   compiler->fs_reg_sets[index].aligned_pairs_class = aligned_pairs_class;
}
||
284 | |||
285 | void |
||
286 | brw_fs_alloc_reg_sets(struct brw_compiler *compiler) |
||
287 | { |
||
288 | brw_alloc_reg_set(compiler, 1); |
||
289 | brw_alloc_reg_set(compiler, 2); |
||
290 | } |
||
291 | |||
292 | static int |
||
293 | count_to_loop_end(const bblock_t *block) |
||
294 | { |
||
295 | if (block->end()->opcode == BRW_OPCODE_WHILE) |
||
296 | return block->end_ip; |
||
297 | |||
298 | int depth = 1; |
||
299 | /* Skip the first block, since we don't want to count the do the calling |
||
300 | * function found. |
||
301 | */ |
||
302 | for (block = block->next(); |
||
303 | depth > 0; |
||
304 | block = block->next()) { |
||
305 | if (block->start()->opcode == BRW_OPCODE_DO) |
||
306 | depth++; |
||
307 | if (block->end()->opcode == BRW_OPCODE_WHILE) { |
||
308 | depth--; |
||
309 | if (depth == 0) |
||
310 | return block->end_ip; |
||
311 | } |
||
312 | } |
||
313 | unreachable("not reached"); |
||
314 | } |
||
315 | |||
/**
 * Sets up interference between thread payload registers and the virtual GRFs
 * to be allocated for program temporaries.
 *
 * We want to be able to reallocate the payload for our virtual GRFs, notably
 * because the setup coefficients for a full set of 16 FS inputs takes up 8 of
 * our 128 registers.
 *
 * The layout of the payload registers is:
 *
 * 0..payload.num_regs-1: fixed function setup (including bary coordinates).
 * payload.num_regs..payload.num_regs+curb_read_length-1: uniform data
 * payload.num_regs+curb_read_length..first_non_payload_grf-1: setup coefficients.
 *
 * And we have payload_node_count nodes covering these registers in order
 * (note that in SIMD16, a node is two registers).
 */
void
fs_visitor::setup_payload_interference(struct ra_graph *g,
                                       int payload_node_count,
                                       int first_payload_node)
{
   int loop_depth = 0;
   int loop_end_ip = 0;

   /* IP of the last read of each payload register; since payload regs are
    * written only before the shader starts, this ends their live interval.
    */
   int payload_last_use_ip[payload_node_count];
   memset(payload_last_use_ip, 0, sizeof(payload_last_use_ip));
   int ip = 0;
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;

         /* Since payload regs are defined only at the start of the shader
          * execution, any uses of the payload within a loop mean the live
          * interval extends to the end of the outermost loop.  Find the ip of
          * the end now.
          */
         if (loop_depth == 1)
            loop_end_ip = count_to_loop_end(block);
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      default:
         break;
      }

      /* Inside a loop, credit the use to the end of the outermost loop. */
      int use_ip;
      if (loop_depth > 0)
         use_ip = loop_end_ip;
      else
         use_ip = ip;

      /* Note that UNIFORM args have been turned into FIXED_HW_REG by
       * assign_curbe_setup(), and interpolation uses fixed hardware regs from
       * the start (see interp_reg()).
       */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == HW_REG &&
             inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
            int node_nr = inst->src[i].fixed_hw_reg.nr;
            if (node_nr >= payload_node_count)
               continue;

            payload_last_use_ip[node_nr] = use_ip;
         }
      }

      /* Special case instructions which have extra implied registers used. */
      switch (inst->opcode) {
      case FS_OPCODE_LINTERP:
         /* On gen6+ in SIMD16, there are 4 adjacent registers used by
          * PLN's sourcing of the deltas, while we list only the first one
          * in the arguments.  Pre-gen6, the deltas are computed in normal
          * VGRFs.
          */
         if (devinfo->gen >= 6) {
            int delta_x_arg = 0;
            if (inst->src[delta_x_arg].file == HW_REG &&
                inst->src[delta_x_arg].fixed_hw_reg.file ==
                BRW_GENERAL_REGISTER_FILE) {
               for (int i = 1; i < 4; ++i) {
                  int node = inst->src[delta_x_arg].fixed_hw_reg.nr + i;
                  assert(node < payload_node_count);
                  payload_last_use_ip[node] = use_ip;
               }
            }
         }
         break;

      case CS_OPCODE_CS_TERMINATE:
         /* Termination implicitly sources g0. */
         payload_last_use_ip[0] = use_ip;
         break;

      default:
         if (inst->eot) {
            /* We could omit this for the !inst->header_present case, except
             * that the simulator apparently incorrectly reads from g0/g1
             * instead of sideband.  It also really freaks out driver
             * developers to see g0 used in unusual places, so just always
             * reserve it.
             */
            payload_last_use_ip[0] = use_ip;
            payload_last_use_ip[1] = use_ip;
         }
         break;
      }

      ip++;
   }

   for (int i = 0; i < payload_node_count; i++) {
      /* Mark the payload node as interfering with any virtual grf that is
       * live between the start of the program and our last use of the payload
       * node.
       */
      for (unsigned j = 0; j < this->alloc.count; j++) {
         /* Note that we use a <= comparison, unlike virtual_grf_interferes(),
          * in order to not have to worry about the uniform issue described in
          * calculate_live_intervals().
          */
         if (this->virtual_grf_start[j] <= payload_last_use_ip[i]) {
            ra_add_node_interference(g, first_payload_node + i, j);
         }
      }
   }

   for (int i = 0; i < payload_node_count; i++) {
      /* Mark each payload node as being allocated to its physical register.
       *
       * The alternative would be to have per-physical-register classes, which
       * would just be silly.
       */
      if (devinfo->gen <= 5 && dispatch_width == 16) {
         /* We have to divide by 2 here because we only have even numbered
          * registers.  Some of the payload registers will be odd, but
          * that's ok because their physical register numbers have already
          * been assigned.  The only thing this is used for is interference.
          */
         ra_set_node_reg(g, first_payload_node + i, i / 2);
      } else {
         ra_set_node_reg(g, first_payload_node + i, i);
      }
   }
}
||
462 | |||
463 | /** |
||
464 | * Sets the mrf_used array to indicate which MRFs are used by the shader IR |
||
465 | * |
||
466 | * This is used in assign_regs() to decide which of the GRFs that we use as |
||
467 | * MRFs on gen7 get normally register allocated, and in register spilling to |
||
468 | * see if we can actually use MRFs to do spills without overwriting normal MRF |
||
469 | * contents. |
||
470 | */ |
||
471 | void |
||
472 | fs_visitor::get_used_mrfs(bool *mrf_used) |
||
473 | { |
||
474 | int reg_width = dispatch_width / 8; |
||
475 | |||
476 | memset(mrf_used, 0, BRW_MAX_MRF * sizeof(bool)); |
||
477 | |||
478 | foreach_block_and_inst(block, fs_inst, inst, cfg) { |
||
479 | if (inst->dst.file == MRF) { |
||
480 | int reg = inst->dst.reg & ~BRW_MRF_COMPR4; |
||
481 | mrf_used[reg] = true; |
||
482 | if (reg_width == 2) { |
||
483 | if (inst->dst.reg & BRW_MRF_COMPR4) { |
||
484 | mrf_used[reg + 4] = true; |
||
485 | } else { |
||
486 | mrf_used[reg + 1] = true; |
||
487 | } |
||
488 | } |
||
489 | } |
||
490 | |||
491 | if (inst->mlen > 0) { |
||
492 | for (int i = 0; i < implied_mrf_writes(inst); i++) { |
||
493 | mrf_used[inst->base_mrf + i] = true; |
||
494 | } |
||
495 | } |
||
496 | } |
||
497 | } |
||
498 | |||
499 | /** |
||
500 | * Sets interference between virtual GRFs and usage of the high GRFs for SEND |
||
501 | * messages (treated as MRFs in code generation). |
||
502 | */ |
||
503 | void |
||
504 | fs_visitor::setup_mrf_hack_interference(struct ra_graph *g, int first_mrf_node) |
||
505 | { |
||
506 | bool mrf_used[BRW_MAX_MRF]; |
||
507 | get_used_mrfs(mrf_used); |
||
508 | |||
509 | for (int i = 0; i < BRW_MAX_MRF; i++) { |
||
510 | /* Mark each MRF reg node as being allocated to its physical register. |
||
511 | * |
||
512 | * The alternative would be to have per-physical-register classes, which |
||
513 | * would just be silly. |
||
514 | */ |
||
515 | ra_set_node_reg(g, first_mrf_node + i, GEN7_MRF_HACK_START + i); |
||
516 | |||
517 | /* Since we don't have any live/dead analysis on the MRFs, just mark all |
||
518 | * that are used as conflicting with all virtual GRFs. |
||
519 | */ |
||
520 | if (mrf_used[i]) { |
||
521 | for (unsigned j = 0; j < this->alloc.count; j++) { |
||
522 | ra_add_node_interference(g, first_mrf_node + i, j); |
||
523 | } |
||
524 | } |
||
525 | } |
||
526 | } |
||
527 | |||
/* Run graph-coloring register allocation over the shader's virtual GRFs.
 * Returns true on success; returns false after spilling a register (or
 * after an unrecoverable failure recorded via fail()), in which case the
 * caller retries allocation.
 */
bool
fs_visitor::assign_regs(bool allow_spilling)
{
   struct brw_compiler *compiler = brw->intelScreen->compiler;
   /* Most of this allocation was written for a reg_width of 1
    * (dispatch_width == 8).  In extending to SIMD16, the code was
    * left in place and it was converted to have the hardware
    * registers it's allocating be contiguous physical pairs of regs
    * for reg_width == 2.
    */
   int reg_width = dispatch_width / 8;
   unsigned hw_reg_mapping[this->alloc.count];
   int payload_node_count = ALIGN(this->first_non_payload_grf, reg_width);
   int rsi = reg_width - 1; /* Which compiler->fs_reg_sets[] to use */
   calculate_live_intervals();

   /* Graph nodes: one per VGRF, then the payload regs, then (gen7+) the
    * high GRFs standing in for MRFs.
    */
   int node_count = this->alloc.count;
   int first_payload_node = node_count;
   node_count += payload_node_count;
   int first_mrf_hack_node = node_count;
   if (devinfo->gen >= 7)
      node_count += BRW_MAX_GRF - GEN7_MRF_HACK_START;
   struct ra_graph *g =
      ra_alloc_interference_graph(compiler->fs_reg_sets[rsi].regs, node_count);

   for (unsigned i = 0; i < this->alloc.count; i++) {
      unsigned size = this->alloc.sizes[i];
      int c;

      assert(size <= ARRAY_SIZE(compiler->fs_reg_sets[rsi].classes) &&
             "Register allocation relies on split_virtual_grfs()");
      c = compiler->fs_reg_sets[rsi].classes[size - 1];

      /* Special case: on pre-GEN6 hardware that supports PLN, the
       * second operand of a PLN instruction needs to be an
       * even-numbered register, so we have a special register class
       * wm_aligned_pairs_class to handle this case.  pre-GEN6 always
       * uses this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC] as the
       * second operand of a PLN instruction (since it doesn't support
       * any other interpolation modes).  So all we need to do is find
       * that register and set it to the appropriate class.
       */
      if (compiler->fs_reg_sets[rsi].aligned_pairs_class >= 0 &&
          this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF &&
          this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg == i) {
         c = compiler->fs_reg_sets[rsi].aligned_pairs_class;
      }

      ra_set_node_class(g, i, c);

      /* Add interference edges against every earlier VGRF that overlaps. */
      for (unsigned j = 0; j < i; j++) {
         if (virtual_grf_interferes(i, j)) {
            ra_add_node_interference(g, i, j);
         }
      }
   }

   setup_payload_interference(g, payload_node_count, first_payload_node);
   if (devinfo->gen >= 7) {
      setup_mrf_hack_interference(g, first_mrf_hack_node);

      foreach_block_and_inst(block, fs_inst, inst, cfg) {
         /* When we do send-from-GRF for FB writes, we need to ensure that
          * the last write instruction sends from a high register.  This is
          * because the vertex fetcher wants to start filling the low
          * payload registers while the pixel data port is still working on
          * writing out the memory.  If we don't do this, we get rendering
          * artifacts.
          *
          * We could just do "something high".  Instead, we just pick the
          * highest register that works.
          */
         if (inst->eot) {
            int size = alloc.sizes[inst->src[0].reg];
            int reg = compiler->fs_reg_sets[rsi].class_to_ra_reg_range[size] - 1;
            ra_set_node_reg(g, inst->src[0].reg, reg);
            break;
         }
      }
   }

   if (dispatch_width > 8) {
      /* In 16-wide dispatch we have an issue where a compressed
       * instruction is actually two instructions executed simultaneously.
       * It's actually ok to have the source and destination registers be
       * the same.  In this case, each instruction over-writes its own
       * source and there's no problem.  The real problem here is if the
       * source and destination registers are off by one.  Then you can end
       * up in a scenario where the first instruction over-writes the
       * source of the second instruction.  Since the compiler doesn't know
       * about this level of granularity, we simply make the source and
       * destination interfere.
       */
      foreach_block_and_inst(block, fs_inst, inst, cfg) {
         if (inst->dst.file != GRF)
            continue;

         for (int i = 0; i < inst->sources; ++i) {
            if (inst->src[i].file == GRF) {
               ra_add_node_interference(g, inst->dst.reg, inst->src[i].reg);
            }
         }
      }
   }

   /* Debug of register spilling: Go spill everything. */
   if (unlikely(INTEL_DEBUG & DEBUG_SPILL)) {
      int reg = choose_spill_reg(g);

      if (reg != -1) {
         spill_reg(reg);
         ralloc_free(g);
         return false;
      }
   }

   if (!ra_allocate(g)) {
      /* Failed to allocate registers.  Spill a reg, and the caller will
       * loop back into here to try again.
       */
      int reg = choose_spill_reg(g);

      if (reg == -1) {
         fail("no register to spill:\n");
         dump_instructions(NULL);
      } else if (allow_spilling) {
         spill_reg(reg);
      }

      ralloc_free(g);

      return false;
   }

   /* Get the chosen virtual registers for each node, and map virtual
    * regs in the register classes back down to real hardware reg
    * numbers.
    */
   this->grf_used = payload_node_count;
   for (unsigned i = 0; i < this->alloc.count; i++) {
      int reg = ra_get_node_reg(g, i);

      hw_reg_mapping[i] = compiler->fs_reg_sets[rsi].ra_reg_to_grf[reg];
      this->grf_used = MAX2(this->grf_used,
                            hw_reg_mapping[i] + this->alloc.sizes[i]);
   }

   /* Patch all dst/src GRF references to the assigned hardware regs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      assign_reg(hw_reg_mapping, &inst->dst);
      for (int i = 0; i < inst->sources; i++) {
         assign_reg(hw_reg_mapping, &inst->src[i]);
      }
   }

   this->alloc.count = this->grf_used;

   ralloc_free(g);

   return true;
}
||
688 | |||
/* Emit scratch-read instruction(s) immediately before `inst` to reload
 * `count` registers of spilled data starting at `spill_offset` into `dst`.
 */
void
fs_visitor::emit_unspill(bblock_t *block, fs_inst *inst, fs_reg dst,
                         uint32_t spill_offset, int count)
{
   /* In SIMD16, an even register count lets us read two registers per
    * message instead of one.
    */
   int reg_size = 1;
   if (dispatch_width == 16 && count % 2 == 0) {
      reg_size = 2;
      dst.width = 16;
   }

   for (int i = 0; i < count / reg_size; i++) {
      /* The gen7 descriptor-based offset is 12 bits of HWORD units. */
      bool gen7_read = devinfo->gen >= 7 && spill_offset < (1 << 12) * REG_SIZE;

      fs_inst *unspill_inst =
         new(mem_ctx) fs_inst(gen7_read ?
                              SHADER_OPCODE_GEN7_SCRATCH_READ :
                              SHADER_OPCODE_GEN4_SCRATCH_READ,
                              dst);
      unspill_inst->offset = spill_offset;
      unspill_inst->ir = inst->ir;
      unspill_inst->annotation = inst->annotation;
      unspill_inst->regs_written = reg_size;

      if (!gen7_read) {
         /* Gen4-6 scratch reads go through the MRF message path. */
         unspill_inst->base_mrf = 14;
         unspill_inst->mlen = 1; /* header contains offset */
      }
      inst->insert_before(block, unspill_inst);

      /* Advance to the next chunk of the destination and scratch space. */
      dst.reg_offset += reg_size;
      spill_offset += reg_size * REG_SIZE;
   }
}
||
723 | |||
/* Emit scratch-write instruction(s) immediately after `inst` to store
 * `count` registers of `src` to scratch space starting at `spill_offset`.
 */
void
fs_visitor::emit_spill(bblock_t *block, fs_inst *inst, fs_reg src,
                       uint32_t spill_offset, int count)
{
   /* In SIMD16 with an even register count we can write two registers per
    * message; the larger payload means starting the message at m13 instead
    * of m14.
    */
   int reg_size = 1;
   int spill_base_mrf = 14;
   if (dispatch_width == 16 && count % 2 == 0) {
      spill_base_mrf = 13;
      reg_size = 2;
   }

   for (int i = 0; i < count / reg_size; i++) {
      fs_inst *spill_inst =
         new(mem_ctx) fs_inst(SHADER_OPCODE_GEN4_SCRATCH_WRITE,
                              reg_size * 8, reg_null_f, src);
      src.reg_offset += reg_size;
      spill_inst->offset = spill_offset + i * reg_size * REG_SIZE;
      spill_inst->ir = inst->ir;
      spill_inst->annotation = inst->annotation;
      spill_inst->mlen = 1 + reg_size; /* header, value */
      spill_inst->base_mrf = spill_base_mrf;
      inst->insert_after(block, spill_inst);
   }
}
||
748 | |||
/* Estimate a spill cost for every virtual GRF, feed the costs to the
 * allocator, and return its choice of best node to spill (or -1 if
 * nothing is spillable).
 */
int
fs_visitor::choose_spill_reg(struct ra_graph *g)
{
   float loop_scale = 1.0;
   float spill_costs[this->alloc.count];
   bool no_spill[this->alloc.count];

   for (unsigned i = 0; i < this->alloc.count; i++) {
      spill_costs[i] = 0.0;
      no_spill[i] = false;
   }

   /* Calculate costs for spilling nodes.  Call it a cost of 1 per
    * spill/unspill we'll have to do, and guess that the insides of
    * loops run 10 times.
    */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
            spill_costs[inst->src[i].reg] += loop_scale;

            /* Register spilling logic assumes full-width registers; smeared
             * registers have a width of 1 so if we try to spill them we'll
             * generate invalid assembly.  This shouldn't be a problem because
             * smeared registers are only used as short-term temporaries when
             * loading pull constants, so spilling them is unlikely to reduce
             * register pressure anyhow.
             */
            if (!inst->src[i].is_contiguous()) {
               no_spill[inst->src[i].reg] = true;
            }
         }
      }

      if (inst->dst.file == GRF) {
         /* Writes cost one spill message per register written. */
         spill_costs[inst->dst.reg] += inst->regs_written * loop_scale;

         if (!inst->dst.is_contiguous()) {
            no_spill[inst->dst.reg] = true;
         }
      }

      switch (inst->opcode) {

      case BRW_OPCODE_DO:
         loop_scale *= 10;
         break;

      case BRW_OPCODE_WHILE:
         loop_scale /= 10;
         break;

      /* Never spill registers that are already part of spill/unspill code,
       * or we'd recurse into spilling the same values forever.
       */
      case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
         if (inst->src[0].file == GRF)
            no_spill[inst->src[0].reg] = true;
         break;

      case SHADER_OPCODE_GEN4_SCRATCH_READ:
      case SHADER_OPCODE_GEN7_SCRATCH_READ:
         if (inst->dst.file == GRF)
            no_spill[inst->dst.reg] = true;
         break;

      default:
         break;
      }
   }

   for (unsigned i = 0; i < this->alloc.count; i++) {
      if (!no_spill[i])
         ra_set_node_spill_cost(g, i, spill_costs[i]);
   }

   return ra_get_best_spill_node(g);
}
||
824 | |||
825 | void |
||
826 | fs_visitor::spill_reg(int spill_reg) |
||
827 | { |
||
828 | int size = alloc.sizes[spill_reg]; |
||
829 | unsigned int spill_offset = last_scratch; |
||
830 | assert(ALIGN(spill_offset, 16) == spill_offset); /* oword read/write req. */ |
||
831 | int spill_base_mrf = dispatch_width > 8 ? 13 : 14; |
||
832 | |||
833 | /* Spills may use MRFs 13-15 in the SIMD16 case. Our texturing is done |
||
834 | * using up to 11 MRFs starting from either m1 or m2, and fb writes can use |
||
835 | * up to m13 (gen6+ simd16: 2 header + 8 color + 2 src0alpha + 2 omask) or |
||
836 | * m15 (gen4-5 simd16: 2 header + 8 color + 1 aads + 2 src depth + 2 dst |
||
837 | * depth), starting from m1. In summary: We may not be able to spill in |
||
838 | * SIMD16 mode, because we'd stomp the FB writes. |
||
839 | */ |
||
840 | if (!spilled_any_registers) { |
||
841 | bool mrf_used[BRW_MAX_MRF]; |
||
842 | get_used_mrfs(mrf_used); |
||
843 | |||
844 | for (int i = spill_base_mrf; i < BRW_MAX_MRF; i++) { |
||
845 | if (mrf_used[i]) { |
||
846 | fail("Register spilling not supported with m%d used", i); |
||
847 | return; |
||
848 | } |
||
849 | } |
||
850 | |||
851 | spilled_any_registers = true; |
||
852 | } |
||
853 | |||
854 | last_scratch += size * REG_SIZE; |
||
855 | |||
856 | /* Generate spill/unspill instructions for the objects being |
||
857 | * spilled. Right now, we spill or unspill the whole thing to a |
||
858 | * virtual grf of the same size. For most instructions, though, we |
||
859 | * could just spill/unspill the GRF being accessed. |
||
860 | */ |
||
861 | foreach_block_and_inst (block, fs_inst, inst, cfg) { |
||
862 | for (unsigned int i = 0; i < inst->sources; i++) { |
||
863 | if (inst->src[i].file == GRF && |
||
864 | inst->src[i].reg == spill_reg) { |
||
865 | int regs_read = inst->regs_read(i); |
||
866 | int subset_spill_offset = (spill_offset + |
||
867 | REG_SIZE * inst->src[i].reg_offset); |
||
868 | fs_reg unspill_dst(GRF, alloc.allocate(regs_read)); |
||
869 | |||
870 | inst->src[i].reg = unspill_dst.reg; |
||
871 | inst->src[i].reg_offset = 0; |
||
872 | |||
873 | emit_unspill(block, inst, unspill_dst, subset_spill_offset, |
||
874 | regs_read); |
||
875 | } |
||
876 | } |
||
877 | |||
878 | if (inst->dst.file == GRF && |
||
879 | inst->dst.reg == spill_reg) { |
||
880 | int subset_spill_offset = (spill_offset + |
||
881 | REG_SIZE * inst->dst.reg_offset); |
||
882 | fs_reg spill_src(GRF, alloc.allocate(inst->regs_written)); |
||
883 | |||
884 | inst->dst.reg = spill_src.reg; |
||
885 | inst->dst.reg_offset = 0; |
||
886 | |||
887 | /* If we're immediately spilling the register, we should not use |
||
888 | * destination dependency hints. Doing so will cause the GPU do |
||
889 | * try to read and write the register at the same time and may |
||
890 | * hang the GPU. |
||
891 | */ |
||
892 | inst->no_dd_clear = false; |
||
893 | inst->no_dd_check = false; |
||
894 | |||
895 | /* If our write is going to affect just part of the |
||
896 | * inst->regs_written(), then we need to unspill the destination |
||
897 | * since we write back out all of the regs_written(). |
||
898 | */ |
||
899 | if (inst->is_partial_write()) |
||
900 | emit_unspill(block, inst, spill_src, subset_spill_offset, |
||
901 | inst->regs_written); |
||
902 | |||
903 | emit_spill(block, inst, spill_src, subset_spill_offset, |
||
904 | inst->regs_written); |
||
905 | } |
||
906 | } |
||
907 | |||
908 | invalidate_live_intervals(); |
||
909 | }>>>>>>><>>>>>>>=>>>>>=>>=>=>>>>>>>>>>=>=>>>>>=>>>=>>>>=>>>>>=> |