/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *    Eric Anholt
 *
 */

#include "brw_fs.h"
#include "brw_vec4.h"
#include "brw_cfg.h"
#include "brw_shader.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_optimization.h"

using namespace brw;

/** @file brw_fs_schedule_instructions.cpp
 *
 * List scheduling of FS instructions.
 *
 * The basic model of the list scheduler is to take a basic block,
 * compute a DAG of the dependencies (RAW ordering with latency, WAW
 * ordering with latency, WAR ordering), and make a list of the DAG heads.
 * Heuristically pick a DAG head, then put all the children that are
 * now DAG heads into the list of things to schedule.
 *
 * The heuristic is the important part.  We're trying to be cheap,
 * since actually computing the optimal scheduling is NP complete.
 * What we do is track a "current clock".  When we schedule a node, we
 * update the earliest-unblocked clock time of its children, and
 * increment the clock.  Then, when trying to schedule, we just pick
 * the earliest-unblocked instruction to schedule.
 *
 * Note that often there will be many things which could execute
 * immediately, and there are a range of heuristic options to choose
 * from in picking among those.
 */
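
/* A rough sketch of the flow implemented below (see run() and
 * schedule_instructions(); this is illustrative pseudocode, not the exact
 * control flow):
 *
 *    add_insts_from_block(block);          // pull the block's instructions
 *    calculate_deps();                     // build the RAW/WAW/WAR DAG
 *    for each node n: compute_delay(n);    // critical-path estimates
 *    while (!instructions.is_empty()) {
 *       chosen = choose_instruction_to_schedule();
 *       re-insert chosen into the block, advance the clock,
 *       and update each child's unblocked_time;
 *    }
 */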

static bool debug = false;

class instruction_scheduler;

class schedule_node : public exec_node
{
public:
   schedule_node(backend_instruction *inst, instruction_scheduler *sched);
   void set_latency_gen4();
   void set_latency_gen7(bool is_haswell);

   backend_instruction *inst;
   schedule_node **children;
   int *child_latency;
   int child_count;
   int parent_count;
   int child_array_size;
   int unblocked_time;
   int latency;

   /**
    * Which iteration of pushing groups of children onto the candidates list
    * this node was a part of.
    */
   unsigned cand_generation;

   /**
    * This is the sum of the instruction's latency plus the maximum delay of
    * its children, or just the issue_time if it's a leaf node.
    */
   int delay;
};

void
schedule_node::set_latency_gen4()
{
   int chans = 8;
   int math_latency = 22;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
      this->latency = 1 * chans * math_latency;
      break;
   case SHADER_OPCODE_RSQ:
      this->latency = 2 * chans * math_latency;
      break;
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_LOG2:
      /* full precision log.  partial is 2. */
      this->latency = 3 * chans * math_latency;
      break;
   case SHADER_OPCODE_INT_REMAINDER:
   case SHADER_OPCODE_EXP2:
      /* full precision.  partial is 3, same throughput. */
      this->latency = 4 * chans * math_latency;
      break;
   case SHADER_OPCODE_POW:
      this->latency = 8 * chans * math_latency;
      break;
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      /* minimum latency, max is 12 rounds. */
      this->latency = 5 * chans * math_latency;
      break;
   default:
      this->latency = 2;
      break;
   }
}
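
/* For a sense of scale under this model: a Gen4 POW comes out at
 * 8 * 8 * 22 = 1408 cycles, while any non-math instruction defaults to 2.
 */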

void
schedule_node::set_latency_gen7(bool is_haswell)
{
   switch (inst->opcode) {
   case BRW_OPCODE_MAD:
      /* 2 cycles
       *  (since the last two src operands are in different register banks):
       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 3 cycles on IVB, 4 on HSW
       *  (since the last two src operands are in the same register bank):
       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 18 cycles on IVB, 16 on HSW
       *  (since the last two src operands are in different register banks):
       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null   g4<4,5,1>F                    { align16 WE_normal 1Q };
       *
       * 20 cycles on IVB, 18 on HSW
       *  (since the last two src operands are in the same register bank):
       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null   g4<4,4,1>F                    { align16 WE_normal 1Q };
       */

      /* Our register allocator doesn't know about register banks, so use the
       * higher latency.
       */
      latency = is_haswell ? 16 : 18;
      break;

   case BRW_OPCODE_LRP:
      /* 2 cycles
       *  (since the last two src operands are in different register banks):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 3 cycles on IVB, 4 on HSW
       *  (since the last two src operands are in the same register bank):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       *
       * 16 cycles on IVB, 14 on HSW
       *  (since the last two src operands are in different register banks):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null   g4<4,4,1>F                    { align16 WE_normal 1Q };
       *
       * 16 cycles
       *  (since the last two src operands are in the same register bank):
       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
       * mov(8) null   g4<4,4,1>F                    { align16 WE_normal 1Q };
       */

      /* Our register allocator doesn't know about register banks, so use the
       * higher latency.
       */
      latency = 14;
      break;

   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      /* 2 cycles:
       * math inv(8) g4<1>F g2<0,1,0>F  null  { align1 WE_normal 1Q };
       *
       * 18 cycles:
       * math inv(8) g4<1>F g2<0,1,0>F  null  { align1 WE_normal 1Q };
       * mov(8)      null   g4<8,8,1>F        { align1 WE_normal 1Q };
       *
       * Same for exp2, log2, rsq, sqrt, sin, cos.
       */
      latency = is_haswell ? 14 : 16;
      break;

   case SHADER_OPCODE_POW:
      /* 2 cycles:
       * math pow(8) g4<1>F g2<0,1,0>F  g2.1<0,1,0>F  { align1 WE_normal 1Q };
       *
       * 26 cycles:
       * math pow(8) g4<1>F g2<0,1,0>F  g2.1<0,1,0>F  { align1 WE_normal 1Q };
       * mov(8)      null   g4<8,8,1>F                { align1 WE_normal 1Q };
       */
      latency = is_haswell ? 22 : 24;
      break;

   case SHADER_OPCODE_TEX:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
      /* 18 cycles:
       * mov(8)  g115<1>F  0F  { align1 WE_normal 1Q };
       * mov(8)  g114<1>F  0F  { align1 WE_normal 1Q };
       * send(8) g4<1>UW   g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4  { align1 WE_normal 1Q };
       *
       * 697 +/-49 cycles (min 610, n=26):
       * mov(8)  g115<1>F  0F  { align1 WE_normal 1Q };
       * mov(8)  g114<1>F  0F  { align1 WE_normal 1Q };
       * send(8) g4<1>UW   g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4  { align1 WE_normal 1Q };
       * mov(8)  null      g4<8,8,1>F  { align1 WE_normal 1Q };
       *
       * So the latency on our first texture load of the batchbuffer takes
       * ~700 cycles, since the caches are cold at that point.
       *
       * 840 +/- 92 cycles (min 720, n=25):
       * mov(8)  g115<1>F  0F  { align1 WE_normal 1Q };
       * mov(8)  g114<1>F  0F  { align1 WE_normal 1Q };
       * send(8) g4<1>UW   g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4  { align1 WE_normal 1Q };
       * mov(8)  null      g4<8,8,1>F  { align1 WE_normal 1Q };
       * send(8) g4<1>UW   g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4  { align1 WE_normal 1Q };
       * mov(8)  null      g4<8,8,1>F  { align1 WE_normal 1Q };
       *
       * On the second load, it takes just an extra ~140 cycles, and after
       * accounting for the 14 cycles of the MOV's latency, that makes ~130.
       *
       * 683 +/- 49 cycles (min = 602, n=47):
       * mov(8)  g115<1>F  0F  { align1 WE_normal 1Q };
       * mov(8)  g114<1>F  0F  { align1 WE_normal 1Q };
       * send(8) g4<1>UW   g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4  { align1 WE_normal 1Q };
       * send(8) g50<1>UW  g114<8,8,1>F
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4  { align1 WE_normal 1Q };
       * mov(8)  null      g4<8,8,1>F  { align1 WE_normal 1Q };
       *
       * The unit appears to be pipelined, since this matches up with the
       * cache-cold case, despite there being two loads here.  If you replace
       * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
       *
       * So, take some number between the cache-hot 140 cycles and the
       * cache-cold 700 cycles.  No particular tuning was done on this.
       *
       * I haven't done significant testing of the non-TEX opcodes.  TXL at
       * least looked about the same as TEX.
       */
      latency = 200;
      break;

   case SHADER_OPCODE_TXS:
      /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
       * cycles (n=15):
       * mov(8)   g114<1>UD  0D  { align1 WE_normal 1Q };
       * send(8)  g6<1>UW    g114<8,8,1>F
       *   sampler (10, 0, 10, 1) mlen 1 rlen 4  { align1 WE_normal 1Q };
       * mov(16)  g6<1>F     g6<8,8,1>D  { align1 WE_normal 1Q };
       *
       *
       * Two loads was 535 +/- 30 cycles (n=19):
       * mov(16)  g114<1>UD  0D  { align1 WE_normal 1H };
       * send(16) g6<1>UW    g114<8,8,1>F
       *   sampler (10, 0, 10, 2) mlen 2 rlen 8  { align1 WE_normal 1H };
       * mov(16)  g114<1>UD  0D  { align1 WE_normal 1H };
       * mov(16)  g6<1>F     g6<8,8,1>D  { align1 WE_normal 1H };
       * send(16) g8<1>UW    g114<8,8,1>F
       *   sampler (10, 0, 10, 2) mlen 2 rlen 8  { align1 WE_normal 1H };
       * mov(16)  g8<1>F     g8<8,8,1>D  { align1 WE_normal 1H };
       * add(16)  g6<1>F     g6<8,8,1>F  g8<8,8,1>F  { align1 WE_normal 1H };
       *
       * Since the only caches that should matter are just the
       * instruction/state cache containing the surface state, assume that we
       * always have hot caches.
       */
      latency = 100;
      break;

   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case VS_OPCODE_PULL_CONSTANT_LOAD:
      /* testing using varying-index pull constants:
       *
       * 16 cycles:
       * mov(8)  g4<1>D  g2.1<0,1,0>F  { align1 WE_normal 1Q };
       * send(8) g4<1>F  g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1  { align1 WE_normal 1Q };
       *
       * ~480 cycles:
       * mov(8)  g4<1>D  g2.1<0,1,0>F  { align1 WE_normal 1Q };
       * send(8) g4<1>F  g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1  { align1 WE_normal 1Q };
       * mov(8)  null    g4<8,8,1>F  { align1 WE_normal 1Q };
       *
       * ~620 cycles:
       * mov(8)  g4<1>D  g2.1<0,1,0>F  { align1 WE_normal 1Q };
       * send(8) g4<1>F  g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1  { align1 WE_normal 1Q };
       * mov(8)  null    g4<8,8,1>F  { align1 WE_normal 1Q };
       * send(8) g4<1>F  g4<8,8,1>D
       *   data (9, 2, 3) mlen 1 rlen 1  { align1 WE_normal 1Q };
       * mov(8)  null    g4<8,8,1>F  { align1 WE_normal 1Q };
       *
       * So, if it's cache-hot, it's about 140.  If it's cache cold, it's
       * about 460.  We expect to mostly be cache hot, so pick something more
       * in that direction.
       */
      latency = 200;
      break;

   case SHADER_OPCODE_GEN7_SCRATCH_READ:
      /* Testing a load from offset 0, that had been previously written:
       *
       * send(8) g114<1>UW g0<8,8,1>F data (0, 0, 0) mlen 1 rlen 1 { align1 WE_normal 1Q };
       * mov(8)  null      g114<8,8,1>F { align1 WE_normal 1Q };
       *
       * The cycles spent seemed to be grouped around 40-50 (as low as 38),
       * then around 140.  Presumably this is cache hit vs miss.
       */
      latency = 50;
      break;

   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_ATOMIC:
      /* Test code:
       *   mov(8)  g112<1>ud    0x00000000ud   { align1 WE_all 1Q };
       *   mov(1)  g112.7<1>ud  g1.7<0,1,0>ud  { align1 WE_all };
       *   mov(8)  g113<1>ud    0x00000000ud   { align1 WE_normal 1Q };
       *   send(8) g4<1>ud      g112<8,8,1>ud
       *           data (38, 5, 6) mlen 2 rlen 1  { align1 WE_normal 1Q };
       *
       * Running it 100 times as fragment shader on a 128x128 quad
       * gives an average latency of 13867 cycles per atomic op,
       * standard deviation 3%.  Note that this is a rather
       * pessimistic estimate, the actual latency in cases with few
       * collisions between threads and favorable pipelining has been
       * seen to be reduced by a factor of 100.
       */
      latency = 14000;
      break;

   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
      /* Test code:
       *   mov(8)  g112<1>UD    0x00000000UD   { align1 WE_all 1Q };
       *   mov(1)  g112.7<1>UD  g1.7<0,1,0>UD  { align1 WE_all };
       *   mov(8)  g113<1>UD    0x00000000UD   { align1 WE_normal 1Q };
       *   send(8) g4<1>UD      g112<8,8,1>UD
       *           data (38, 6, 5) mlen 2 rlen 1  { align1 WE_normal 1Q };
       *   .
       *   . [repeats 8 times]
       *   .
       *   mov(8)  g112<1>UD    0x00000000UD   { align1 WE_all 1Q };
       *   mov(1)  g112.7<1>UD  g1.7<0,1,0>UD  { align1 WE_all };
       *   mov(8)  g113<1>UD    0x00000000UD   { align1 WE_normal 1Q };
       *   send(8) g4<1>UD      g112<8,8,1>UD
       *           data (38, 6, 5) mlen 2 rlen 1  { align1 WE_normal 1Q };
       *
       * Running it 100 times as fragment shader on a 128x128 quad
       * gives an average latency of 583 cycles per surface read,
       * standard deviation 0.9%.
       */
      latency = is_haswell ? 300 : 600;
      break;

   default:
      /* 2 cycles:
       * mul(8) g4<1>F g2<0,1,0>F  0.5F  { align1 WE_normal 1Q };
       *
       * 16 cycles:
       * mul(8) g4<1>F g2<0,1,0>F  0.5F  { align1 WE_normal 1Q };
       * mov(8) null   g4<8,8,1>F        { align1 WE_normal 1Q };
       */
      latency = 14;
      break;
   }
}
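
/* These latencies feed the scheduler's clock model below: once a node is
 * scheduled, each dependent child's unblocked_time is pushed out to at
 * least current time + edge latency, so e.g. a TEX (latency 200) leaves a
 * long window into which independent instructions can be hoisted.
 */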

class instruction_scheduler {
public:
   instruction_scheduler(backend_visitor *v, int grf_count,
                         instruction_scheduler_mode mode)
   {
      this->bv = v;
      this->mem_ctx = ralloc_context(NULL);
      this->grf_count = grf_count;
      this->instructions.make_empty();
      this->instructions_to_schedule = 0;
      this->post_reg_alloc = (mode == SCHEDULE_POST);
      this->mode = mode;
      this->time = 0;
      if (!post_reg_alloc) {
         this->remaining_grf_uses = rzalloc_array(mem_ctx, int, grf_count);
         this->grf_active = rzalloc_array(mem_ctx, bool, grf_count);
      } else {
         this->remaining_grf_uses = NULL;
         this->grf_active = NULL;
      }
   }

   ~instruction_scheduler()
   {
      ralloc_free(this->mem_ctx);
   }
   void add_barrier_deps(schedule_node *n);
   void add_dep(schedule_node *before, schedule_node *after, int latency);
   void add_dep(schedule_node *before, schedule_node *after);

   void run(cfg_t *cfg);
   void add_insts_from_block(bblock_t *block);
   void compute_delay(schedule_node *node);
   virtual void calculate_deps() = 0;
   virtual schedule_node *choose_instruction_to_schedule() = 0;

   /**
    * Returns how many cycles it takes the instruction to issue.
    *
    * Instructions in gen hardware are handled one simd4 vector at a time,
    * with 1 cycle per vector dispatched.  Thus SIMD8 pixel shaders take 2
    * cycles to dispatch and SIMD16 (compressed) instructions take 4.
    */
   virtual int issue_time(backend_instruction *inst) = 0;

   virtual void count_remaining_grf_uses(backend_instruction *inst) = 0;
   virtual void update_register_pressure(backend_instruction *inst) = 0;
   virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;

   void schedule_instructions(bblock_t *block);

   void *mem_ctx;

   bool post_reg_alloc;
   int instructions_to_schedule;
   int grf_count;
   int time;
   exec_list instructions;
   backend_visitor *bv;

   instruction_scheduler_mode mode;

   /**
    * Number of instructions left to schedule that reference each vgrf.
    *
    * Used so that we can prefer scheduling instructions that will end the
    * live intervals of multiple variables, to reduce register pressure.
    */
   int *remaining_grf_uses;

   /**
    * Tracks whether each VGRF has had an instruction scheduled that uses it.
    *
    * This is used to estimate whether scheduling a new instruction will
    * increase register pressure.
    */
   bool *grf_active;
};
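
/* The scalar (FS) and vec4 backends share the machinery above; the two
 * subclasses below specialize dependency calculation, issue timing and
 * the scheduling heuristics for each register layout.
 */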

class fs_instruction_scheduler : public instruction_scheduler
{
public:
   fs_instruction_scheduler(fs_visitor *v, int grf_count,
                            instruction_scheduler_mode mode);
   void calculate_deps();
   bool is_compressed(fs_inst *inst);
   schedule_node *choose_instruction_to_schedule();
   int issue_time(backend_instruction *inst);
   fs_visitor *v;

   void count_remaining_grf_uses(backend_instruction *inst);
   void update_register_pressure(backend_instruction *inst);
   int get_register_pressure_benefit(backend_instruction *inst);
};

fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
                                                   int grf_count,
                                                   instruction_scheduler_mode mode)
   : instruction_scheduler(v, grf_count, mode),
     v(v)
{
}

void
fs_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be)
{
   fs_inst *inst = (fs_inst *)be;

   if (!remaining_grf_uses)
      return;

   if (inst->dst.file == GRF)
      remaining_grf_uses[inst->dst.reg]++;

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].file != GRF)
         continue;

      remaining_grf_uses[inst->src[i].reg]++;
   }
}

void
fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
{
   fs_inst *inst = (fs_inst *)be;

   if (!remaining_grf_uses)
      return;

   if (inst->dst.file == GRF) {
      remaining_grf_uses[inst->dst.reg]--;
      grf_active[inst->dst.reg] = true;
   }

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].file == GRF) {
         remaining_grf_uses[inst->src[i].reg]--;
         grf_active[inst->src[i].reg] = true;
      }
   }
}

int
fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
{
   fs_inst *inst = (fs_inst *)be;
   int benefit = 0;

   if (inst->dst.file == GRF) {
      if (remaining_grf_uses[inst->dst.reg] == 1)
         benefit += v->alloc.sizes[inst->dst.reg];
      if (!grf_active[inst->dst.reg])
         benefit -= v->alloc.sizes[inst->dst.reg];
   }

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].file != GRF)
         continue;

      if (remaining_grf_uses[inst->src[i].reg] == 1)
         benefit += v->alloc.sizes[inst->src[i].reg];
      if (!grf_active[inst->src[i].reg])
         benefit -= v->alloc.sizes[inst->src[i].reg];
   }

   return benefit;
}
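
/* Worked example: an instruction whose source is the last remaining use of
 * a two-register VGRF (+2) and whose destination is a fresh one-register
 * VGRF (-1) scores +1, so scheduling it now should shrink pressure by one
 * register.
 */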

class vec4_instruction_scheduler : public instruction_scheduler
{
public:
   vec4_instruction_scheduler(vec4_visitor *v, int grf_count);
   void calculate_deps();
   schedule_node *choose_instruction_to_schedule();
   int issue_time(backend_instruction *inst);
   vec4_visitor *v;

   void count_remaining_grf_uses(backend_instruction *inst);
   void update_register_pressure(backend_instruction *inst);
   int get_register_pressure_benefit(backend_instruction *inst);
};

vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
                                                       int grf_count)
   : instruction_scheduler(v, grf_count, SCHEDULE_POST),
     v(v)
{
}

void
vec4_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be)
{
}

void
vec4_instruction_scheduler::update_register_pressure(backend_instruction *be)
{
}

int
vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
{
   return 0;
}

schedule_node::schedule_node(backend_instruction *inst,
                             instruction_scheduler *sched)
{
   const struct brw_device_info *devinfo = sched->bv->devinfo;

   this->inst = inst;
   this->child_array_size = 0;
   this->children = NULL;
   this->child_latency = NULL;
   this->child_count = 0;
   this->parent_count = 0;
   this->unblocked_time = 0;
   this->cand_generation = 0;
   this->delay = 0;

   /* We can't measure Gen6 timings directly but expect them to be much
    * closer to Gen7 than Gen4.
    */
   if (!sched->post_reg_alloc)
      this->latency = 1;
   else if (devinfo->gen >= 6)
      set_latency_gen7(devinfo->is_haswell);
   else
      set_latency_gen4();
}
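
/* Note the pre-register-allocation case above: with a uniform latency of 1,
 * the SCHEDULE_PRE heuristics effectively optimize for register pressure
 * alone rather than for latency hiding.
 */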

void
instruction_scheduler::add_insts_from_block(bblock_t *block)
{
   /* Removing the last instruction from a basic block removes the block as
    * well, so put a NOP at the end to keep it alive.
    */
   if (!block->end()->is_control_flow()) {
      backend_instruction *nop = new(mem_ctx) backend_instruction();
      nop->opcode = BRW_OPCODE_NOP;
      block->end()->insert_after(block, nop);
   }

   foreach_inst_in_block_safe(backend_instruction, inst, block) {
      if (inst->opcode == BRW_OPCODE_NOP || inst->is_control_flow())
         continue;

      schedule_node *n = new(mem_ctx) schedule_node(inst, this);

      this->instructions_to_schedule++;

      inst->remove(block);
      instructions.push_tail(n);
   }
}

/** Recursive computation of the delay member of a node. */
void
instruction_scheduler::compute_delay(schedule_node *n)
{
   if (!n->child_count) {
      n->delay = issue_time(n->inst);
   } else {
      for (int i = 0; i < n->child_count; i++) {
         if (!n->children[i]->delay)
            compute_delay(n->children[i]);
         n->delay = MAX2(n->delay, n->latency + n->children[i]->delay);
      }
   }
}
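
/* For example, a chain a -> b -> c where a and b have latency 2 and the
 * leaf c has issue_time 2 gives delay(c) = 2, delay(b) = 4, delay(a) = 6:
 * the critical-path distance to the end of the block, which the pre-RA
 * heuristic uses as a tie-breaker.
 */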

/**
 * Add a dependency between two instruction nodes.
 *
 * The @after node will be scheduled after @before.  We will try to
 * schedule it @latency cycles after @before, but no guarantees there.
 */
void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
                               int latency)
{
   if (!before || !after)
      return;

   assert(before != after);

   for (int i = 0; i < before->child_count; i++) {
      if (before->children[i] == after) {
         before->child_latency[i] = MAX2(before->child_latency[i], latency);
         return;
      }
   }

   if (before->child_array_size <= before->child_count) {
      if (before->child_array_size < 16)
         before->child_array_size = 16;
      else
         before->child_array_size *= 2;

      before->children = reralloc(mem_ctx, before->children,
                                  schedule_node *,
                                  before->child_array_size);
      before->child_latency = reralloc(mem_ctx, before->child_latency,
                                       int, before->child_array_size);
   }

   before->children[before->child_count] = after;
   before->child_latency[before->child_count] = latency;
   before->child_count++;
   after->parent_count++;
}
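
/* A duplicate edge just raises the recorded latency to the maximum of the
 * two requests, so callers can re-add dependencies freely; the child array
 * starts at 16 entries and doubles as it fills.
 */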

void
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
{
   if (!before)
      return;

   add_dep(before, after, before->latency);
}

/**
 * Sometimes we really want this node to execute after everything that
 * was before it and before everything that followed it.  This adds
 * the deps to do so.
 */
void
instruction_scheduler::add_barrier_deps(schedule_node *n)
{
   schedule_node *prev = (schedule_node *)n->prev;
   schedule_node *next = (schedule_node *)n->next;

   if (prev) {
      while (!prev->is_head_sentinel()) {
         add_dep(prev, n, 0);
         prev = (schedule_node *)prev->prev;
      }
   }

   if (next) {
      while (!next->is_tail_sentinel()) {
         add_dep(n, next, 0);
         next = (schedule_node *)next->next;
      }
   }
}

/* Instruction scheduling needs to be aware of when an MRF write
 * actually writes 2 MRFs.
 */
bool
fs_instruction_scheduler::is_compressed(fs_inst *inst)
{
   return inst->exec_size == 16;
}

void
fs_instruction_scheduler::calculate_deps()
{
   /* Pre-register-allocation, this tracks the last write per VGRF offset.
    * After register allocation, reg_offsets are gone and we track individual
    * GRF registers.
    */
   schedule_node *last_grf_write[grf_count * 16];
   schedule_node *last_mrf_write[BRW_MAX_MRF];
   schedule_node *last_conditional_mod[2] = { NULL, NULL };
   schedule_node *last_accumulator_write = NULL;
   /* Fixed HW registers are assumed to be separate from the virtual
    * GRFs, so they can be tracked separately.  We don't really write
    * to fixed GRFs much, so don't bother tracking them on a more
    * granular level.
    */
   schedule_node *last_fixed_grf_write = NULL;
   int reg_width = v->dispatch_width / 8;

   /* The last instruction always needs to still be the last
    * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
    * WHILE) and scheduling other things after it would disturb the
    * basic block, or it's FB_WRITE and we should do a better job at
    * dead code elimination anyway.
    */
   schedule_node *last = (schedule_node *)instructions.get_tail();
   add_barrier_deps(last);

   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));

   /* top-to-bottom dependencies: RAW and WAW. */
   foreach_in_list(schedule_node, n, &instructions) {
      fs_inst *inst = (fs_inst *)n->inst;

      if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
          inst->has_side_effects())
         add_barrier_deps(n);

      /* read-after-write deps. */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
            if (post_reg_alloc) {
               for (int r = 0; r < inst->regs_read(i); r++)
                  add_dep(last_grf_write[inst->src[i].reg + r], n);
            } else {
               for (int r = 0; r < inst->regs_read(i); r++) {
                  add_dep(last_grf_write[inst->src[i].reg * 16 + inst->src[i].reg_offset + r], n);
               }
            }
         } else if (inst->src[i].file == HW_REG &&
                    (inst->src[i].fixed_hw_reg.file ==
                     BRW_GENERAL_REGISTER_FILE)) {
            if (post_reg_alloc) {
               int size = reg_width;
               if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
                  size = 1;
               for (int r = 0; r < size; r++)
                  add_dep(last_grf_write[inst->src[i].fixed_hw_reg.nr + r], n);
            } else {
               add_dep(last_fixed_grf_write, n);
            }
         } else if (inst->src[i].is_accumulator()) {
            add_dep(last_accumulator_write, n);
         } else if (inst->src[i].file != BAD_FILE &&
                    inst->src[i].file != IMM &&
                    inst->src[i].file != UNIFORM &&
                    (inst->src[i].file != HW_REG ||
                     inst->src[i].fixed_hw_reg.file != IMM)) {
            assert(inst->src[i].file != MRF);
            add_barrier_deps(n);
         }
      }

      if (inst->base_mrf != -1) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(last_mrf_write[inst->base_mrf + i], n);
         }
      }

      if (inst->reads_flag()) {
         add_dep(last_conditional_mod[inst->flag_subreg], n);
      }

      if (inst->reads_accumulator_implicitly()) {
         add_dep(last_accumulator_write, n);
      }

      /* write-after-write deps. */
      if (inst->dst.file == GRF) {
         if (post_reg_alloc) {
            for (int r = 0; r < inst->regs_written; r++) {
               add_dep(last_grf_write[inst->dst.reg + r], n);
               last_grf_write[inst->dst.reg + r] = n;
            }
         } else {
            for (int r = 0; r < inst->regs_written; r++) {
               add_dep(last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r], n);
               last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r] = n;
            }
         }
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;

         add_dep(last_mrf_write[reg], n);
         last_mrf_write[reg] = n;
         if (is_compressed(inst)) {
            if (inst->dst.reg & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;
            add_dep(last_mrf_write[reg], n);
            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == HW_REG &&
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         if (post_reg_alloc) {
            for (int r = 0; r < reg_width; r++)
               last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
         } else {
            last_fixed_grf_write = n;
         }
      } else if (inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      } else if (inst->dst.file != BAD_FILE &&
                 !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            add_dep(last_mrf_write[inst->base_mrf + i], n);
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (inst->writes_flag()) {
         add_dep(last_conditional_mod[inst->flag_subreg], n, 0);
         last_conditional_mod[inst->flag_subreg] = n;
      }

      if (inst->writes_accumulator_implicitly(v->devinfo) &&
          !inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      }
   }

   /* bottom-to-top dependencies: WAR */
   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
   memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
   last_accumulator_write = NULL;
   last_fixed_grf_write = NULL;

   exec_node *node;
   exec_node *prev;
   for (node = instructions.get_tail(), prev = node->prev;
        !node->is_head_sentinel();
        node = prev, prev = node->prev) {
      schedule_node *n = (schedule_node *)node;
      fs_inst *inst = (fs_inst *)n->inst;

      /* write-after-read deps. */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
            if (post_reg_alloc) {
               for (int r = 0; r < inst->regs_read(i); r++)
                  add_dep(n, last_grf_write[inst->src[i].reg + r]);
            } else {
               for (int r = 0; r < inst->regs_read(i); r++) {
                  add_dep(n, last_grf_write[inst->src[i].reg * 16 + inst->src[i].reg_offset + r]);
               }
            }
         } else if (inst->src[i].file == HW_REG &&
                    (inst->src[i].fixed_hw_reg.file ==
                     BRW_GENERAL_REGISTER_FILE)) {
            if (post_reg_alloc) {
               int size = reg_width;
               if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
                  size = 1;
               for (int r = 0; r < size; r++)
                  add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r]);
            } else {
               add_dep(n, last_fixed_grf_write);
            }
         } else if (inst->src[i].is_accumulator()) {
            add_dep(n, last_accumulator_write);
         } else if (inst->src[i].file != BAD_FILE &&
                    inst->src[i].file != IMM &&
                    inst->src[i].file != UNIFORM &&
                    (inst->src[i].file != HW_REG ||
                     inst->src[i].fixed_hw_reg.file != IMM)) {
            assert(inst->src[i].file != MRF);
            add_barrier_deps(n);
         }
      }

      if (inst->base_mrf != -1) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
         }
      }

      if (inst->reads_flag()) {
         add_dep(n, last_conditional_mod[inst->flag_subreg]);
      }

      if (inst->reads_accumulator_implicitly()) {
         add_dep(n, last_accumulator_write);
      }

      /* Update the things this instruction wrote, so earlier reads
       * can mark this as a WAR dependency.
       */
      if (inst->dst.file == GRF) {
         if (post_reg_alloc) {
            for (int r = 0; r < inst->regs_written; r++)
               last_grf_write[inst->dst.reg + r] = n;
         } else {
            for (int r = 0; r < inst->regs_written; r++) {
               last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r] = n;
            }
         }
      } else if (inst->dst.file == MRF) {
         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;

         last_mrf_write[reg] = n;

         if (is_compressed(inst)) {
            if (inst->dst.reg & BRW_MRF_COMPR4)
               reg += 4;
            else
               reg++;

            last_mrf_write[reg] = n;
         }
      } else if (inst->dst.file == HW_REG &&
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         if (post_reg_alloc) {
            for (int r = 0; r < reg_width; r++)
               last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
         } else {
            last_fixed_grf_write = n;
         }
      } else if (inst->dst.is_accumulator()) {
         last_accumulator_write = n;
      } else if (inst->dst.file != BAD_FILE &&
                 !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (inst->writes_flag()) {
         last_conditional_mod[inst->flag_subreg] = n;
      }

      if (inst->writes_accumulator_implicitly(v->devinfo)) {
         last_accumulator_write = n;
      }
   }
}
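
/* The vec4 variant below follows the same two-pass structure: a forward
 * walk for RAW/WAW dependencies, then a backward walk for WAR, with the
 * tracking tables reset in between.
 */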

void
vec4_instruction_scheduler::calculate_deps()
{
   schedule_node *last_grf_write[grf_count];
   schedule_node *last_mrf_write[BRW_MAX_MRF];
   schedule_node *last_conditional_mod = NULL;
   schedule_node *last_accumulator_write = NULL;
   /* Fixed HW registers are assumed to be separate from the virtual
    * GRFs, so they can be tracked separately.  We don't really write
    * to fixed GRFs much, so don't bother tracking them on a more
    * granular level.
    */
   schedule_node *last_fixed_grf_write = NULL;

   /* The last instruction always needs to still be the last instruction.
    * Either it's flow control (IF, ELSE, ENDIF, DO, WHILE) and scheduling
    * other things after it would disturb the basic block, or it's the EOT
    * URB_WRITE and we should do a better job at dead code eliminating
    * anything that could have been scheduled after it.
    */
   schedule_node *last = (schedule_node *)instructions.get_tail();
   add_barrier_deps(last);

   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));

   /* top-to-bottom dependencies: RAW and WAW. */
   foreach_in_list(schedule_node, n, &instructions) {
      vec4_instruction *inst = (vec4_instruction *)n->inst;

      if (inst->has_side_effects())
         add_barrier_deps(n);

      /* read-after-write deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF) {
            for (unsigned j = 0; j < inst->regs_read(i); ++j)
               add_dep(last_grf_write[inst->src[i].reg + j], n);
         } else if (inst->src[i].file == HW_REG &&
                    (inst->src[i].fixed_hw_reg.file ==
                     BRW_GENERAL_REGISTER_FILE)) {
            add_dep(last_fixed_grf_write, n);
         } else if (inst->src[i].is_accumulator()) {
            assert(last_accumulator_write);
            add_dep(last_accumulator_write, n);
         } else if (inst->src[i].file != BAD_FILE &&
                    inst->src[i].file != IMM &&
                    inst->src[i].file != UNIFORM &&
                    (inst->src[i].file != HW_REG ||
                     inst->src[i].fixed_hw_reg.file != IMM)) {
            /* No reads from MRF, and ATTR is already translated away */
            assert(inst->src[i].file != MRF &&
                   inst->src[i].file != ATTR);
            add_barrier_deps(n);
         }
      }

      if (!inst->is_send_from_grf()) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(last_mrf_write[inst->base_mrf + i], n);
         }
      }

      if (inst->reads_flag()) {
         assert(last_conditional_mod);
         add_dep(last_conditional_mod, n);
      }

      if (inst->reads_accumulator_implicitly()) {
         assert(last_accumulator_write);
         add_dep(last_accumulator_write, n);
      }

      /* write-after-write deps. */
      if (inst->dst.file == GRF) {
         for (unsigned j = 0; j < inst->regs_written; ++j) {
            add_dep(last_grf_write[inst->dst.reg + j], n);
            last_grf_write[inst->dst.reg + j] = n;
         }
      } else if (inst->dst.file == MRF) {
         add_dep(last_mrf_write[inst->dst.reg], n);
         last_mrf_write[inst->dst.reg] = n;
      } else if (inst->dst.file == HW_REG &&
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         last_fixed_grf_write = n;
      } else if (inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      } else if (inst->dst.file != BAD_FILE &&
                 !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && !inst->is_send_from_grf()) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            add_dep(last_mrf_write[inst->base_mrf + i], n);
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (inst->writes_flag()) {
         add_dep(last_conditional_mod, n, 0);
         last_conditional_mod = n;
      }

      if (inst->writes_accumulator_implicitly(v->devinfo) &&
          !inst->dst.is_accumulator()) {
         add_dep(last_accumulator_write, n);
         last_accumulator_write = n;
      }
   }

   /* bottom-to-top dependencies: WAR */
   memset(last_grf_write, 0, sizeof(last_grf_write));
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
   last_conditional_mod = NULL;
   last_accumulator_write = NULL;
   last_fixed_grf_write = NULL;

   exec_node *node;
   exec_node *prev;
   for (node = instructions.get_tail(), prev = node->prev;
        !node->is_head_sentinel();
        node = prev, prev = node->prev) {
      schedule_node *n = (schedule_node *)node;
      vec4_instruction *inst = (vec4_instruction *)n->inst;

      /* write-after-read deps. */
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF) {
            for (unsigned j = 0; j < inst->regs_read(i); ++j)
               add_dep(n, last_grf_write[inst->src[i].reg + j]);
         } else if (inst->src[i].file == HW_REG &&
                    (inst->src[i].fixed_hw_reg.file ==
                     BRW_GENERAL_REGISTER_FILE)) {
            add_dep(n, last_fixed_grf_write);
         } else if (inst->src[i].is_accumulator()) {
            add_dep(n, last_accumulator_write);
         } else if (inst->src[i].file != BAD_FILE &&
                    inst->src[i].file != IMM &&
                    inst->src[i].file != UNIFORM &&
                    (inst->src[i].file != HW_REG ||
                     inst->src[i].fixed_hw_reg.file != IMM)) {
            assert(inst->src[i].file != MRF &&
                   inst->src[i].file != ATTR);
            add_barrier_deps(n);
         }
      }

      if (!inst->is_send_from_grf()) {
         for (int i = 0; i < inst->mlen; i++) {
            /* It looks like the MRF regs are released in the send
             * instruction once it's sent, not when the result comes
             * back.
             */
            add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
         }
      }

      if (inst->reads_flag()) {
         add_dep(n, last_conditional_mod);
      }

      if (inst->reads_accumulator_implicitly()) {
         add_dep(n, last_accumulator_write);
      }

      /* Update the things this instruction wrote, so earlier reads
       * can mark this as a WAR dependency.
       */
      if (inst->dst.file == GRF) {
         for (unsigned j = 0; j < inst->regs_written; ++j)
            last_grf_write[inst->dst.reg + j] = n;
      } else if (inst->dst.file == MRF) {
         last_mrf_write[inst->dst.reg] = n;
      } else if (inst->dst.file == HW_REG &&
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         last_fixed_grf_write = n;
      } else if (inst->dst.is_accumulator()) {
         last_accumulator_write = n;
      } else if (inst->dst.file != BAD_FILE &&
                 !inst->dst.is_null()) {
         add_barrier_deps(n);
      }

      if (inst->mlen > 0 && !inst->is_send_from_grf()) {
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
            last_mrf_write[inst->base_mrf + i] = n;
         }
      }

      if (inst->writes_flag()) {
         last_conditional_mod = n;
      }

      if (inst->writes_accumulator_implicitly(v->devinfo)) {
         last_accumulator_write = n;
      }
   }
}

schedule_node *
fs_instruction_scheduler::choose_instruction_to_schedule()
{
   schedule_node *chosen = NULL;

   if (mode == SCHEDULE_PRE || mode == SCHEDULE_POST) {
      int chosen_time = 0;

      /* Of the instructions ready to execute or the closest to
       * being ready, choose the oldest one.
       */
      foreach_in_list(schedule_node, n, &instructions) {
         if (!chosen || n->unblocked_time < chosen_time) {
            chosen = n;
            chosen_time = n->unblocked_time;
         }
      }
   } else {
      /* Before register allocation, we don't care about the latencies of
       * instructions.  All we care about is reducing live intervals of
       * variables so that we can avoid register spilling, or get SIMD16
       * shaders which naturally do a better job of hiding instruction
       * latency.
       */
      foreach_in_list(schedule_node, n, &instructions) {
         fs_inst *inst = (fs_inst *)n->inst;

         if (!chosen) {
            chosen = n;
            continue;
         }

         /* Most important: If we can definitely reduce register pressure, do
          * so immediately.
          */
         int register_pressure_benefit = get_register_pressure_benefit(n->inst);
         int chosen_register_pressure_benefit =
            get_register_pressure_benefit(chosen->inst);

         if (register_pressure_benefit > 0 &&
             register_pressure_benefit > chosen_register_pressure_benefit) {
            chosen = n;
            continue;
         } else if (chosen_register_pressure_benefit > 0 &&
                    (register_pressure_benefit <
                     chosen_register_pressure_benefit)) {
            continue;
         }

         if (mode == SCHEDULE_PRE_LIFO) {
            /* Prefer instructions that recently became available for
             * scheduling.  These are the things that are most likely to
             * (eventually) make a variable dead and reduce register pressure.
             * Typical register pressure estimates don't work for us because
             * most of our pressure comes from texturing, where no single
             * instruction to schedule will make a vec4 value dead.
             */
            if (n->cand_generation > chosen->cand_generation) {
               chosen = n;
               continue;
            } else if (n->cand_generation < chosen->cand_generation) {
               continue;
            }

            /* On MRF-using chips, prefer non-SEND instructions.  If we don't
             * do this, then because we prefer instructions that just became
             * candidates, we'll end up in a pattern of scheduling a SEND,
             * then the MRFs for the next SEND, then the next SEND, then the
             * MRFs, etc., without ever consuming the results of a send.
             */
            if (v->devinfo->gen < 7) {
               fs_inst *chosen_inst = (fs_inst *)chosen->inst;

               /* We use regs_written > 1 as our test for the kind of send
                * instruction to avoid -- only sends generate many regs, and a
                * single-result send is probably actually reducing register
                * pressure.
                */
               if (inst->regs_written <= inst->dst.width / 8 &&
                   chosen_inst->regs_written > chosen_inst->dst.width / 8) {
                  chosen = n;
                  continue;
               } else if (inst->regs_written > chosen_inst->regs_written) {
                  continue;
               }
            }
         }

         /* For instructions pushed on the cands list at the same time, prefer
          * the one with the highest delay to the end of the program.  This is
          * most likely to have its values able to be consumed first (such as
          * for a large tree of lowered ubo loads, which appear reversed in
          * the instruction stream with respect to when they can be consumed).
          */
         if (n->delay > chosen->delay) {
            chosen = n;
            continue;
         } else if (n->delay < chosen->delay) {
            continue;
         }

         /* If all other metrics are equal, we prefer the first instruction in
          * the list (program execution).
          */
      }
   }

   return chosen;
}

schedule_node *
vec4_instruction_scheduler::choose_instruction_to_schedule()
{
   schedule_node *chosen = NULL;
   int chosen_time = 0;

   /* Of the instructions ready to execute or the closest to being ready,
    * choose the oldest one.
    */
   foreach_in_list(schedule_node, n, &instructions) {
      if (!chosen || n->unblocked_time < chosen_time) {
         chosen = n;
         chosen_time = n->unblocked_time;
      }
   }

   return chosen;
}
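
/* Note that the vec4 scheduler's constructor hardwires SCHEDULE_POST, so
 * this simple earliest-unblocked choice is the only heuristic the vec4
 * backend uses; the pressure-aware pre-RA modes above are FS-only.
 */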

int
fs_instruction_scheduler::issue_time(backend_instruction *inst)
{
   if (is_compressed((fs_inst *)inst))
      return 4;
   else
      return 2;
}
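
/* This matches the simd4-per-cycle dispatch model described on issue_time()
 * above: a SIMD8 instruction occupies the front end for 2 cycles and a
 * compressed SIMD16 instruction for 4.
 */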

int
vec4_instruction_scheduler::issue_time(backend_instruction *inst)
{
   /* We always execute as two vec4s in parallel. */
   return 2;
}

void
instruction_scheduler::schedule_instructions(bblock_t *block)
{
   const struct brw_device_info *devinfo = bv->devinfo;
   backend_instruction *inst = block->end();
   time = 0;

   /* Remove non-DAG heads from the list. */
   foreach_in_list_safe(schedule_node, n, &instructions) {
      if (n->parent_count != 0)
         n->remove();
   }

   unsigned cand_generation = 1;
   while (!instructions.is_empty()) {
      schedule_node *chosen = choose_instruction_to_schedule();

      /* Schedule this instruction. */
      assert(chosen);
      chosen->remove();
      inst->insert_before(block, chosen->inst);
      instructions_to_schedule--;
      update_register_pressure(chosen->inst);

      /* Update the clock for how soon an instruction could start after the
       * chosen one.
       */
      time += issue_time(chosen->inst);

      /* If we expected a delay for scheduling, then bump the clock to reflect
       * that as well.  In reality, the hardware will switch to another
       * hyperthread and may not return to dispatching our thread for a while
       * even after we're unblocked.
       */
      time = MAX2(time, chosen->unblocked_time);

      if (debug) {
         fprintf(stderr, "clock %4d, scheduled: ", time);
         bv->dump_instruction(chosen->inst);
      }

      /* Now that we've scheduled a new instruction, some of its
       * children can be promoted to the list of instructions ready to
       * be scheduled.  Update the children's unblocked time for this
       * DAG edge as we do so.
       */
      for (int i = chosen->child_count - 1; i >= 0; i--) {
         schedule_node *child = chosen->children[i];

         child->unblocked_time = MAX2(child->unblocked_time,
                                      time + chosen->child_latency[i]);

         if (debug) {
            fprintf(stderr, "\tchild %d, %d parents: ", i, child->parent_count);
            bv->dump_instruction(child->inst);
         }

         child->cand_generation = cand_generation;
         child->parent_count--;
         if (child->parent_count == 0) {
            if (debug) {
               fprintf(stderr, "\t\tnow available\n");
            }
            instructions.push_head(child);
         }
      }
      cand_generation++;

      /* Shared resource: the mathbox.  There's one mathbox per EU on Gen6+
       * but it's more limited pre-gen6, so if we send something off to it then
       * the next math instruction isn't going to make progress until the first
       * is done.
       */
      if (devinfo->gen < 6 && chosen->inst->is_math()) {
         foreach_in_list(schedule_node, n, &instructions) {
            if (n->inst->is_math())
               n->unblocked_time = MAX2(n->unblocked_time,
                                        time + chosen->latency);
         }
      }
   }

   if (block->end()->opcode == BRW_OPCODE_NOP)
      block->end()->remove(block);
   assert(instructions_to_schedule == 0);
}

void
instruction_scheduler::run(cfg_t *cfg)
{
   if (debug) {
      fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n",
              post_reg_alloc);
      bv->dump_instructions();
   }

   /* Populate the remaining GRF uses array to improve the pre-regalloc
    * scheduling.
    */
   if (remaining_grf_uses) {
      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
         count_remaining_grf_uses(inst);
      }
   }

   foreach_block(block, cfg) {
      if (block->end_ip - block->start_ip <= 1)
         continue;

      add_insts_from_block(block);

      calculate_deps();

      foreach_in_list(schedule_node, n, &instructions) {
         compute_delay(n);
      }

      schedule_instructions(block);
   }

   if (debug) {
      fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n",
              post_reg_alloc);
      bv->dump_instructions();
   }
}

void
fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
{
   int grf_count;
   if (mode == SCHEDULE_POST)
      grf_count = grf_used;
   else
      grf_count = alloc.count;

   fs_instruction_scheduler sched(this, grf_count, mode);
   sched.run(cfg);

   if (unlikely(debug_enabled) && mode == SCHEDULE_POST) {
      fprintf(stderr, "%s%d estimated execution time: %d cycles\n",
              stage_abbrev, dispatch_width, sched.time);
   }

   invalidate_live_intervals();
}

void
vec4_visitor::opt_schedule_instructions()
{
   vec4_instruction_scheduler sched(this, prog_data->total_grf);
   sched.run(cfg);

   if (unlikely(debug_enabled)) {
      fprintf(stderr, "%s estimated execution time: %d cycles\n",
              stage_abbrev, sched.time);
   }

   invalidate_live_intervals();
}