Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
5564 | serge | 1 | /* |
2 | * Copyright © 2010 Intel Corporation |
||
3 | * |
||
4 | * Permission is hereby granted, free of charge, to any person obtaining a |
||
5 | * copy of this software and associated documentation files (the "Software"), |
||
6 | * to deal in the Software without restriction, including without limitation |
||
7 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
||
8 | * and/or sell copies of the Software, and to permit persons to whom the |
||
9 | * Software is furnished to do so, subject to the following conditions: |
||
10 | * |
||
11 | * The above copyright notice and this permission notice (including the next |
||
12 | * paragraph) shall be included in all copies or substantial portions of the |
||
13 | * Software. |
||
14 | * |
||
15 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||
16 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||
17 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
||
18 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||
19 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
||
20 | * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
||
21 | * IN THE SOFTWARE. |
||
22 | */ |
||
23 | |||
24 | /** @file brw_fs.cpp |
||
25 | * |
||
26 | * This file drives the GLSL IR -> LIR translation, contains the |
||
27 | * optimizations on the LIR, and drives the generation of native code |
||
28 | * from the LIR. |
||
29 | */ |
||
30 | |||
#include <sys/types.h>

#include "util/hash_table.h"
#include "util/register_allocate.h"
#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/fbobject.h"
#include "main/uniforms.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/hash_table.h"
#include "program/sampler.h"
#include "glsl/glsl_types.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_dead_control_flow.h"
#include "brw_fs_live_variables.h"
51 | |||
52 | void |
||
53 | fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, |
||
54 | const fs_reg *src, unsigned sources) |
||
55 | { |
||
56 | memset(this, 0, sizeof(*this)); |
||
57 | |||
58 | this->src = new fs_reg[MAX2(sources, 3)]; |
||
59 | for (unsigned i = 0; i < sources; i++) |
||
60 | this->src[i] = src[i]; |
||
61 | |||
62 | this->opcode = opcode; |
||
63 | this->dst = dst; |
||
64 | this->sources = sources; |
||
65 | this->exec_size = exec_size; |
||
66 | |||
67 | assert(dst.file != IMM && dst.file != UNIFORM); |
||
68 | |||
69 | /* If exec_size == 0, try to guess it from the registers. Since all |
||
70 | * manner of things may use hardware registers, we first try to guess |
||
71 | * based on GRF registers. If this fails, we will go ahead and take the |
||
72 | * width from the destination register. |
||
73 | */ |
||
74 | if (this->exec_size == 0) { |
||
75 | if (dst.file == GRF) { |
||
76 | this->exec_size = dst.width; |
||
77 | } else { |
||
78 | for (unsigned i = 0; i < sources; ++i) { |
||
79 | if (src[i].file != GRF && src[i].file != ATTR) |
||
80 | continue; |
||
81 | |||
82 | if (this->exec_size <= 1) |
||
83 | this->exec_size = src[i].width; |
||
84 | assert(src[i].width == 1 || src[i].width == this->exec_size); |
||
85 | } |
||
86 | } |
||
87 | |||
88 | if (this->exec_size == 0 && dst.file != BAD_FILE) |
||
89 | this->exec_size = dst.width; |
||
90 | } |
||
91 | assert(this->exec_size != 0); |
||
92 | |||
93 | this->conditional_mod = BRW_CONDITIONAL_NONE; |
||
94 | |||
95 | /* This will be the case for almost all instructions. */ |
||
96 | switch (dst.file) { |
||
97 | case GRF: |
||
98 | case HW_REG: |
||
99 | case MRF: |
||
100 | case ATTR: |
||
101 | this->regs_written = |
||
102 | DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32); |
||
103 | break; |
||
104 | case BAD_FILE: |
||
105 | this->regs_written = 0; |
||
106 | break; |
||
107 | case IMM: |
||
108 | case UNIFORM: |
||
109 | unreachable("Invalid destination register file"); |
||
110 | default: |
||
111 | unreachable("Invalid register file"); |
||
112 | } |
||
113 | |||
114 | this->writes_accumulator = false; |
||
115 | } |
||
116 | |||
117 | fs_inst::fs_inst() |
||
118 | { |
||
119 | init(BRW_OPCODE_NOP, 8, dst, NULL, 0); |
||
120 | } |
||
121 | |||
122 | fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size) |
||
123 | { |
||
124 | init(opcode, exec_size, reg_undef, NULL, 0); |
||
125 | } |
||
126 | |||
127 | fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst) |
||
128 | { |
||
129 | init(opcode, 0, dst, NULL, 0); |
||
130 | } |
||
131 | |||
132 | fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, |
||
133 | const fs_reg &src0) |
||
134 | { |
||
135 | const fs_reg src[1] = { src0 }; |
||
136 | init(opcode, exec_size, dst, src, 1); |
||
137 | } |
||
138 | |||
139 | fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0) |
||
140 | { |
||
141 | const fs_reg src[1] = { src0 }; |
||
142 | init(opcode, 0, dst, src, 1); |
||
143 | } |
||
144 | |||
145 | fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, |
||
146 | const fs_reg &src0, const fs_reg &src1) |
||
147 | { |
||
148 | const fs_reg src[2] = { src0, src1 }; |
||
149 | init(opcode, exec_size, dst, src, 2); |
||
150 | } |
||
151 | |||
152 | fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0, |
||
153 | const fs_reg &src1) |
||
154 | { |
||
155 | const fs_reg src[2] = { src0, src1 }; |
||
156 | init(opcode, 0, dst, src, 2); |
||
157 | } |
||
158 | |||
159 | fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst, |
||
160 | const fs_reg &src0, const fs_reg &src1, const fs_reg &src2) |
||
161 | { |
||
162 | const fs_reg src[3] = { src0, src1, src2 }; |
||
163 | init(opcode, exec_size, dst, src, 3); |
||
164 | } |
||
165 | |||
166 | fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0, |
||
167 | const fs_reg &src1, const fs_reg &src2) |
||
168 | { |
||
169 | const fs_reg src[3] = { src0, src1, src2 }; |
||
170 | init(opcode, 0, dst, src, 3); |
||
171 | } |
||
172 | |||
173 | fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, |
||
174 | const fs_reg src[], unsigned sources) |
||
175 | { |
||
176 | init(opcode, 0, dst, src, sources); |
||
177 | } |
||
178 | |||
179 | fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst, |
||
180 | const fs_reg src[], unsigned sources) |
||
181 | { |
||
182 | init(opcode, exec_width, dst, src, sources); |
||
183 | } |
||
184 | |||
185 | fs_inst::fs_inst(const fs_inst &that) |
||
186 | { |
||
187 | memcpy(this, &that, sizeof(that)); |
||
188 | |||
189 | this->src = new fs_reg[MAX2(that.sources, 3)]; |
||
190 | |||
191 | for (unsigned i = 0; i < that.sources; i++) |
||
192 | this->src[i] = that.src[i]; |
||
193 | } |
||
194 | |||
195 | fs_inst::~fs_inst() |
||
196 | { |
||
197 | delete[] this->src; |
||
198 | } |
||
199 | |||
200 | void |
||
201 | fs_inst::resize_sources(uint8_t num_sources) |
||
202 | { |
||
203 | if (this->sources != num_sources) { |
||
204 | fs_reg *src = new fs_reg[MAX2(num_sources, 3)]; |
||
205 | |||
206 | for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i) |
||
207 | src[i] = this->src[i]; |
||
208 | |||
209 | delete[] this->src; |
||
210 | this->src = src; |
||
211 | this->sources = num_sources; |
||
212 | } |
||
213 | } |
||
214 | |||
215 | #define ALU1(op) \ |
||
216 | fs_inst * \ |
||
217 | fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \ |
||
218 | { \ |
||
219 | return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \ |
||
220 | } |
||
221 | |||
222 | #define ALU2(op) \ |
||
223 | fs_inst * \ |
||
224 | fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \ |
||
225 | const fs_reg &src1) \ |
||
226 | { \ |
||
227 | return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \ |
||
228 | } |
||
229 | |||
230 | #define ALU2_ACC(op) \ |
||
231 | fs_inst * \ |
||
232 | fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \ |
||
233 | const fs_reg &src1) \ |
||
234 | { \ |
||
235 | fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\ |
||
236 | inst->writes_accumulator = true; \ |
||
237 | return inst; \ |
||
238 | } |
||
239 | |||
240 | #define ALU3(op) \ |
||
241 | fs_inst * \ |
||
242 | fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \ |
||
243 | const fs_reg &src1, const fs_reg &src2) \ |
||
244 | { \ |
||
245 | return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\ |
||
246 | } |
||
247 | |||
248 | ALU1(NOT) |
||
249 | ALU1(MOV) |
||
250 | ALU1(FRC) |
||
251 | ALU1(RNDD) |
||
252 | ALU1(RNDE) |
||
253 | ALU1(RNDZ) |
||
254 | ALU2(ADD) |
||
255 | ALU2(MUL) |
||
256 | ALU2_ACC(MACH) |
||
257 | ALU2(AND) |
||
258 | ALU2(OR) |
||
259 | ALU2(XOR) |
||
260 | ALU2(SHL) |
||
261 | ALU2(SHR) |
||
262 | ALU2(ASR) |
||
263 | ALU3(LRP) |
||
264 | ALU1(BFREV) |
||
265 | ALU3(BFE) |
||
266 | ALU2(BFI1) |
||
267 | ALU3(BFI2) |
||
268 | ALU1(FBH) |
||
269 | ALU1(FBL) |
||
270 | ALU1(CBIT) |
||
271 | ALU3(MAD) |
||
272 | ALU2_ACC(ADDC) |
||
273 | ALU2_ACC(SUBB) |
||
274 | ALU2(SEL) |
||
275 | ALU2(MAC) |
||
276 | |||
277 | /** Gen4 predicated IF. */ |
||
278 | fs_inst * |
||
279 | fs_visitor::IF(enum brw_predicate predicate) |
||
280 | { |
||
281 | fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width); |
||
282 | inst->predicate = predicate; |
||
283 | return inst; |
||
284 | } |
||
285 | |||
286 | /** Gen6 IF with embedded comparison. */ |
||
287 | fs_inst * |
||
288 | fs_visitor::IF(const fs_reg &src0, const fs_reg &src1, |
||
289 | enum brw_conditional_mod condition) |
||
290 | { |
||
291 | assert(devinfo->gen == 6); |
||
292 | fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width, |
||
293 | reg_null_d, src0, src1); |
||
294 | inst->conditional_mod = condition; |
||
295 | return inst; |
||
296 | } |
||
297 | |||
298 | /** |
||
299 | * CMP: Sets the low bit of the destination channels with the result |
||
300 | * of the comparison, while the upper bits are undefined, and updates |
||
301 | * the flag register with the packed 16 bits of the result. |
||
302 | */ |
||
303 | fs_inst * |
||
304 | fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, |
||
305 | enum brw_conditional_mod condition) |
||
306 | { |
||
307 | fs_inst *inst; |
||
308 | |||
309 | /* Take the instruction: |
||
310 | * |
||
311 | * CMP null |
||
312 | * |
||
313 | * Original gen4 does type conversion to the destination type before |
||
314 | * comparison, producing garbage results for floating point comparisons. |
||
315 | * |
||
316 | * The destination type doesn't matter on newer generations, so we set the |
||
317 | * type to match src0 so we can compact the instruction. |
||
318 | */ |
||
319 | dst.type = src0.type; |
||
320 | if (dst.file == HW_REG) |
||
321 | dst.fixed_hw_reg.type = dst.type; |
||
322 | |||
323 | resolve_ud_negate(&src0); |
||
324 | resolve_ud_negate(&src1); |
||
325 | |||
326 | inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1); |
||
327 | inst->conditional_mod = condition; |
||
328 | |||
329 | return inst; |
||
330 | } |
||
331 | |||
332 | fs_inst * |
||
333 | fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources, |
||
334 | int header_size) |
||
335 | { |
||
336 | assert(dst.width % 8 == 0); |
||
337 | fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width, |
||
338 | dst, src, sources); |
||
339 | inst->header_size = header_size; |
||
340 | |||
341 | for (int i = 0; i < header_size; i++) |
||
342 | assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32); |
||
343 | inst->regs_written = header_size; |
||
344 | |||
345 | for (int i = header_size; i < sources; ++i) |
||
346 | assert(src[i].file != GRF || src[i].width == dst.width); |
||
347 | inst->regs_written += (sources - header_size) * (dst.width / 8); |
||
348 | |||
349 | return inst; |
||
350 | } |
||
351 | |||
352 | exec_list |
||
353 | fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst, |
||
354 | const fs_reg &surf_index, |
||
355 | const fs_reg &varying_offset, |
||
356 | uint32_t const_offset) |
||
357 | { |
||
358 | exec_list instructions; |
||
359 | fs_inst *inst; |
||
360 | |||
361 | /* We have our constant surface use a pitch of 4 bytes, so our index can |
||
362 | * be any component of a vector, and then we load 4 contiguous |
||
363 | * components starting from that. |
||
364 | * |
||
365 | * We break down the const_offset to a portion added to the variable |
||
366 | * offset and a portion done using reg_offset, which means that if you |
||
367 | * have GLSL using something like "uniform vec4 a[20]; gl_FragColor = |
||
368 | * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and |
||
369 | * CSE can later notice that those loads are all the same and eliminate |
||
370 | * the redundant ones. |
||
371 | */ |
||
372 | fs_reg vec4_offset = vgrf(glsl_type::int_type); |
||
373 | instructions.push_tail(ADD(vec4_offset, |
||
374 | varying_offset, fs_reg(const_offset & ~3))); |
||
375 | |||
376 | int scale = 1; |
||
377 | if (devinfo->gen == 4 && dst.width == 8) { |
||
378 | /* Pre-gen5, we can either use a SIMD8 message that requires (header, |
||
379 | * u, v, r) as parameters, or we can just use the SIMD16 message |
||
380 | * consisting of (header, u). We choose the second, at the cost of a |
||
381 | * longer return length. |
||
382 | */ |
||
383 | scale = 2; |
||
384 | } |
||
385 | |||
386 | enum opcode op; |
||
387 | if (devinfo->gen >= 7) |
||
388 | op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7; |
||
389 | else |
||
390 | op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD; |
||
391 | |||
392 | assert(dst.width % 8 == 0); |
||
393 | int regs_written = 4 * (dst.width / 8) * scale; |
||
394 | fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written), |
||
395 | dst.type, dst.width); |
||
396 | inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset); |
||
397 | inst->regs_written = regs_written; |
||
398 | instructions.push_tail(inst); |
||
399 | |||
400 | if (devinfo->gen < 7) { |
||
401 | inst->base_mrf = 13; |
||
402 | inst->header_size = 1; |
||
403 | if (devinfo->gen == 4) |
||
404 | inst->mlen = 3; |
||
405 | else |
||
406 | inst->mlen = 1 + dispatch_width / 8; |
||
407 | } |
||
408 | |||
409 | fs_reg result = offset(vec4_result, (const_offset & 3) * scale); |
||
410 | instructions.push_tail(MOV(dst, result)); |
||
411 | |||
412 | return instructions; |
||
413 | } |
||
414 | |||
415 | /** |
||
416 | * A helper for MOV generation for fixing up broken hardware SEND dependency |
||
417 | * handling. |
||
418 | */ |
||
419 | fs_inst * |
||
420 | fs_visitor::DEP_RESOLVE_MOV(int grf) |
||
421 | { |
||
422 | fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F)); |
||
423 | |||
424 | inst->ir = NULL; |
||
425 | inst->annotation = "send dependency resolve"; |
||
426 | |||
427 | /* The caller always wants uncompressed to emit the minimal extra |
||
428 | * dependencies, and to avoid having to deal with aligning its regs to 2. |
||
429 | */ |
||
430 | inst->exec_size = 8; |
||
431 | |||
432 | return inst; |
||
433 | } |
||
434 | |||
435 | bool |
||
436 | fs_inst::equals(fs_inst *inst) const |
||
437 | { |
||
438 | return (opcode == inst->opcode && |
||
439 | dst.equals(inst->dst) && |
||
440 | src[0].equals(inst->src[0]) && |
||
441 | src[1].equals(inst->src[1]) && |
||
442 | src[2].equals(inst->src[2]) && |
||
443 | saturate == inst->saturate && |
||
444 | predicate == inst->predicate && |
||
445 | conditional_mod == inst->conditional_mod && |
||
446 | mlen == inst->mlen && |
||
447 | base_mrf == inst->base_mrf && |
||
448 | target == inst->target && |
||
449 | eot == inst->eot && |
||
450 | header_size == inst->header_size && |
||
451 | shadow_compare == inst->shadow_compare && |
||
452 | exec_size == inst->exec_size && |
||
453 | offset == inst->offset); |
||
454 | } |
||
455 | |||
456 | bool |
||
457 | fs_inst::overwrites_reg(const fs_reg ®) const |
||
458 | { |
||
459 | return reg.in_range(dst, regs_written); |
||
460 | } |
||
461 | |||
462 | bool |
||
463 | fs_inst::is_send_from_grf() const |
||
464 | { |
||
465 | switch (opcode) { |
||
466 | case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7: |
||
467 | case SHADER_OPCODE_SHADER_TIME_ADD: |
||
468 | case FS_OPCODE_INTERPOLATE_AT_CENTROID: |
||
469 | case FS_OPCODE_INTERPOLATE_AT_SAMPLE: |
||
470 | case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: |
||
471 | case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: |
||
472 | case SHADER_OPCODE_UNTYPED_ATOMIC: |
||
473 | case SHADER_OPCODE_UNTYPED_SURFACE_READ: |
||
474 | case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: |
||
475 | case SHADER_OPCODE_TYPED_ATOMIC: |
||
476 | case SHADER_OPCODE_TYPED_SURFACE_READ: |
||
477 | case SHADER_OPCODE_TYPED_SURFACE_WRITE: |
||
478 | case SHADER_OPCODE_URB_WRITE_SIMD8: |
||
479 | return true; |
||
480 | case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: |
||
481 | return src[1].file == GRF; |
||
482 | case FS_OPCODE_FB_WRITE: |
||
483 | return src[0].file == GRF; |
||
484 | default: |
||
485 | if (is_tex()) |
||
486 | return src[0].file == GRF; |
||
487 | |||
488 | return false; |
||
489 | } |
||
490 | } |
||
491 | |||
492 | bool |
||
493 | fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const |
||
494 | { |
||
495 | if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD) |
||
496 | return false; |
||
497 | |||
498 | fs_reg reg = this->src[0]; |
||
499 | if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0) |
||
500 | return false; |
||
501 | |||
502 | if (grf_alloc.sizes[reg.reg] != this->regs_written) |
||
503 | return false; |
||
504 | |||
505 | for (int i = 0; i < this->sources; i++) { |
||
506 | reg.type = this->src[i].type; |
||
507 | reg.width = this->src[i].width; |
||
508 | if (!this->src[i].equals(reg)) |
||
509 | return false; |
||
510 | reg = ::offset(reg, 1); |
||
511 | } |
||
512 | |||
513 | return true; |
||
514 | } |
||
515 | |||
516 | bool |
||
517 | fs_inst::can_do_source_mods(const struct brw_device_info *devinfo) |
||
518 | { |
||
519 | if (devinfo->gen == 6 && is_math()) |
||
520 | return false; |
||
521 | |||
522 | if (is_send_from_grf()) |
||
523 | return false; |
||
524 | |||
525 | if (!backend_instruction::can_do_source_mods()) |
||
526 | return false; |
||
527 | |||
528 | return true; |
||
529 | } |
||
530 | |||
531 | bool |
||
532 | fs_inst::has_side_effects() const |
||
533 | { |
||
534 | return this->eot || backend_instruction::has_side_effects(); |
||
535 | } |
||
536 | |||
537 | void |
||
538 | fs_reg::init() |
||
539 | { |
||
540 | memset(this, 0, sizeof(*this)); |
||
541 | stride = 1; |
||
542 | } |
||
543 | |||
544 | /** Generic unset register constructor. */ |
||
545 | fs_reg::fs_reg() |
||
546 | { |
||
547 | init(); |
||
548 | this->file = BAD_FILE; |
||
549 | } |
||
550 | |||
551 | /** Immediate value constructor. */ |
||
552 | fs_reg::fs_reg(float f) |
||
553 | { |
||
554 | init(); |
||
555 | this->file = IMM; |
||
556 | this->type = BRW_REGISTER_TYPE_F; |
||
557 | this->fixed_hw_reg.dw1.f = f; |
||
558 | this->width = 1; |
||
559 | } |
||
560 | |||
561 | /** Immediate value constructor. */ |
||
562 | fs_reg::fs_reg(int32_t i) |
||
563 | { |
||
564 | init(); |
||
565 | this->file = IMM; |
||
566 | this->type = BRW_REGISTER_TYPE_D; |
||
567 | this->fixed_hw_reg.dw1.d = i; |
||
568 | this->width = 1; |
||
569 | } |
||
570 | |||
571 | /** Immediate value constructor. */ |
||
572 | fs_reg::fs_reg(uint32_t u) |
||
573 | { |
||
574 | init(); |
||
575 | this->file = IMM; |
||
576 | this->type = BRW_REGISTER_TYPE_UD; |
||
577 | this->fixed_hw_reg.dw1.ud = u; |
||
578 | this->width = 1; |
||
579 | } |
||
580 | |||
581 | /** Vector float immediate value constructor. */ |
||
582 | fs_reg::fs_reg(uint8_t vf[4]) |
||
583 | { |
||
584 | init(); |
||
585 | this->file = IMM; |
||
586 | this->type = BRW_REGISTER_TYPE_VF; |
||
587 | memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned)); |
||
588 | } |
||
589 | |||
590 | /** Vector float immediate value constructor. */ |
||
591 | fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3) |
||
592 | { |
||
593 | init(); |
||
594 | this->file = IMM; |
||
595 | this->type = BRW_REGISTER_TYPE_VF; |
||
596 | this->fixed_hw_reg.dw1.ud = (vf0 << 0) | |
||
597 | (vf1 << 8) | |
||
598 | (vf2 << 16) | |
||
599 | (vf3 << 24); |
||
600 | } |
||
601 | |||
602 | /** Fixed brw_reg. */ |
||
603 | fs_reg::fs_reg(struct brw_reg fixed_hw_reg) |
||
604 | { |
||
605 | init(); |
||
606 | this->file = HW_REG; |
||
607 | this->fixed_hw_reg = fixed_hw_reg; |
||
608 | this->type = fixed_hw_reg.type; |
||
609 | this->width = 1 << fixed_hw_reg.width; |
||
610 | } |
||
611 | |||
612 | bool |
||
613 | fs_reg::equals(const fs_reg &r) const |
||
614 | { |
||
615 | return (file == r.file && |
||
616 | reg == r.reg && |
||
617 | reg_offset == r.reg_offset && |
||
618 | subreg_offset == r.subreg_offset && |
||
619 | type == r.type && |
||
620 | negate == r.negate && |
||
621 | abs == r.abs && |
||
622 | !reladdr && !r.reladdr && |
||
623 | memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 && |
||
624 | width == r.width && |
||
625 | stride == r.stride); |
||
626 | } |
||
627 | |||
628 | fs_reg & |
||
629 | fs_reg::set_smear(unsigned subreg) |
||
630 | { |
||
631 | assert(file != HW_REG && file != IMM); |
||
632 | subreg_offset = subreg * type_sz(type); |
||
633 | stride = 0; |
||
634 | return *this; |
||
635 | } |
||
636 | |||
637 | bool |
||
638 | fs_reg::is_contiguous() const |
||
639 | { |
||
640 | return stride == 1; |
||
641 | } |
||
642 | |||
643 | int |
||
644 | fs_visitor::type_size(const struct glsl_type *type) |
||
645 | { |
||
646 | unsigned int size, i; |
||
647 | |||
648 | switch (type->base_type) { |
||
649 | case GLSL_TYPE_UINT: |
||
650 | case GLSL_TYPE_INT: |
||
651 | case GLSL_TYPE_FLOAT: |
||
652 | case GLSL_TYPE_BOOL: |
||
653 | return type->components(); |
||
654 | case GLSL_TYPE_ARRAY: |
||
655 | return type_size(type->fields.array) * type->length; |
||
656 | case GLSL_TYPE_STRUCT: |
||
657 | size = 0; |
||
658 | for (i = 0; i < type->length; i++) { |
||
659 | size += type_size(type->fields.structure[i].type); |
||
660 | } |
||
661 | return size; |
||
662 | case GLSL_TYPE_SAMPLER: |
||
663 | /* Samplers take up no register space, since they're baked in at |
||
664 | * link time. |
||
665 | */ |
||
666 | return 0; |
||
667 | case GLSL_TYPE_ATOMIC_UINT: |
||
668 | return 0; |
||
669 | case GLSL_TYPE_IMAGE: |
||
670 | case GLSL_TYPE_VOID: |
||
671 | case GLSL_TYPE_ERROR: |
||
672 | case GLSL_TYPE_INTERFACE: |
||
673 | case GLSL_TYPE_DOUBLE: |
||
674 | unreachable("not reached"); |
||
675 | } |
||
676 | |||
677 | return 0; |
||
678 | } |
||
679 | |||
680 | /** |
||
681 | * Create a MOV to read the timestamp register. |
||
682 | * |
||
683 | * The caller is responsible for emitting the MOV. The return value is |
||
684 | * the destination of the MOV, with extra parameters set. |
||
685 | */ |
||
686 | fs_reg |
||
687 | fs_visitor::get_timestamp(fs_inst **out_mov) |
||
688 | { |
||
689 | assert(devinfo->gen >= 7); |
||
690 | |||
691 | fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE, |
||
692 | BRW_ARF_TIMESTAMP, |
||
693 | 0), |
||
694 | BRW_REGISTER_TYPE_UD)); |
||
695 | |||
696 | fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4); |
||
697 | |||
698 | fs_inst *mov = MOV(dst, ts); |
||
699 | /* We want to read the 3 fields we care about even if it's not enabled in |
||
700 | * the dispatch. |
||
701 | */ |
||
702 | mov->force_writemask_all = true; |
||
703 | |||
704 | /* The caller wants the low 32 bits of the timestamp. Since it's running |
||
705 | * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds, |
||
706 | * which is plenty of time for our purposes. It is identical across the |
||
707 | * EUs, but since it's tracking GPU core speed it will increment at a |
||
708 | * varying rate as render P-states change. |
||
709 | * |
||
710 | * The caller could also check if render P-states have changed (or anything |
||
711 | * else that might disrupt timing) by setting smear to 2 and checking if |
||
712 | * that field is != 0. |
||
713 | */ |
||
714 | dst.set_smear(0); |
||
715 | |||
716 | *out_mov = mov; |
||
717 | return dst; |
||
718 | } |
||
719 | |||
720 | void |
||
721 | fs_visitor::emit_shader_time_begin() |
||
722 | { |
||
723 | current_annotation = "shader time start"; |
||
724 | fs_inst *mov; |
||
725 | shader_start_time = get_timestamp(&mov); |
||
726 | emit(mov); |
||
727 | } |
||
728 | |||
729 | void |
||
730 | fs_visitor::emit_shader_time_end() |
||
731 | { |
||
732 | current_annotation = "shader time end"; |
||
733 | |||
734 | enum shader_time_shader_type type, written_type, reset_type; |
||
735 | switch (stage) { |
||
736 | case MESA_SHADER_VERTEX: |
||
737 | type = ST_VS; |
||
738 | written_type = ST_VS_WRITTEN; |
||
739 | reset_type = ST_VS_RESET; |
||
740 | break; |
||
741 | case MESA_SHADER_GEOMETRY: |
||
742 | type = ST_GS; |
||
743 | written_type = ST_GS_WRITTEN; |
||
744 | reset_type = ST_GS_RESET; |
||
745 | break; |
||
746 | case MESA_SHADER_FRAGMENT: |
||
747 | if (dispatch_width == 8) { |
||
748 | type = ST_FS8; |
||
749 | written_type = ST_FS8_WRITTEN; |
||
750 | reset_type = ST_FS8_RESET; |
||
751 | } else { |
||
752 | assert(dispatch_width == 16); |
||
753 | type = ST_FS16; |
||
754 | written_type = ST_FS16_WRITTEN; |
||
755 | reset_type = ST_FS16_RESET; |
||
756 | } |
||
757 | break; |
||
758 | case MESA_SHADER_COMPUTE: |
||
759 | type = ST_CS; |
||
760 | written_type = ST_CS_WRITTEN; |
||
761 | reset_type = ST_CS_RESET; |
||
762 | break; |
||
763 | default: |
||
764 | unreachable("fs_visitor::emit_shader_time_end missing code"); |
||
765 | } |
||
766 | |||
767 | /* Insert our code just before the final SEND with EOT. */ |
||
768 | exec_node *end = this->instructions.get_tail(); |
||
769 | assert(end && ((fs_inst *) end)->eot); |
||
770 | |||
771 | fs_inst *tm_read; |
||
772 | fs_reg shader_end_time = get_timestamp(&tm_read); |
||
773 | end->insert_before(tm_read); |
||
774 | |||
775 | /* Check that there weren't any timestamp reset events (assuming these |
||
776 | * were the only two timestamp reads that happened). |
||
777 | */ |
||
778 | fs_reg reset = shader_end_time; |
||
779 | reset.set_smear(2); |
||
780 | fs_inst *test = AND(reg_null_d, reset, fs_reg(1u)); |
||
781 | test->conditional_mod = BRW_CONDITIONAL_Z; |
||
782 | test->force_writemask_all = true; |
||
783 | end->insert_before(test); |
||
784 | end->insert_before(IF(BRW_PREDICATE_NORMAL)); |
||
785 | |||
786 | fs_reg start = shader_start_time; |
||
787 | start.negate = true; |
||
788 | fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1); |
||
789 | diff.set_smear(0); |
||
790 | fs_inst *add = ADD(diff, start, shader_end_time); |
||
791 | add->force_writemask_all = true; |
||
792 | end->insert_before(add); |
||
793 | |||
794 | /* If there were no instructions between the two timestamp gets, the diff |
||
795 | * is 2 cycles. Remove that overhead, so I can forget about that when |
||
796 | * trying to determine the time taken for single instructions. |
||
797 | */ |
||
798 | add = ADD(diff, diff, fs_reg(-2u)); |
||
799 | add->force_writemask_all = true; |
||
800 | end->insert_before(add); |
||
801 | |||
802 | end->insert_before(SHADER_TIME_ADD(type, diff)); |
||
803 | end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u))); |
||
804 | end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width)); |
||
805 | end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u))); |
||
806 | end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width)); |
||
807 | } |
||
808 | |||
809 | fs_inst * |
||
810 | fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value) |
||
811 | { |
||
812 | int shader_time_index = |
||
813 | brw_get_shader_time_index(brw, shader_prog, prog, type); |
||
814 | fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE); |
||
815 | |||
816 | fs_reg payload; |
||
817 | if (dispatch_width == 8) |
||
818 | payload = vgrf(glsl_type::uvec2_type); |
||
819 | else |
||
820 | payload = vgrf(glsl_type::uint_type); |
||
821 | |||
822 | return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD, |
||
823 | fs_reg(), payload, offset, value); |
||
824 | } |
||
825 | |||
826 | void |
||
827 | fs_visitor::vfail(const char *format, va_list va) |
||
828 | { |
||
829 | char *msg; |
||
830 | |||
831 | if (failed) |
||
832 | return; |
||
833 | |||
834 | failed = true; |
||
835 | |||
836 | msg = ralloc_vasprintf(mem_ctx, format, va); |
||
837 | msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg); |
||
838 | |||
839 | this->fail_msg = msg; |
||
840 | |||
841 | if (debug_enabled) { |
||
842 | fprintf(stderr, "%s", msg); |
||
843 | } |
||
844 | } |
||
845 | |||
846 | void |
||
847 | fs_visitor::fail(const char *format, ...) |
||
848 | { |
||
849 | va_list va; |
||
850 | |||
851 | va_start(va, format); |
||
852 | vfail(format, va); |
||
853 | va_end(va); |
||
854 | } |
||
855 | |||
856 | /** |
||
857 | * Mark this program as impossible to compile in SIMD16 mode. |
||
858 | * |
||
859 | * During the SIMD8 compile (which happens first), we can detect and flag |
||
860 | * things that are unsupported in SIMD16 mode, so the compiler can skip |
||
861 | * the SIMD16 compile altogether. |
||
862 | * |
||
863 | * During a SIMD16 compile (if one happens anyway), this just calls fail(). |
||
864 | */ |
||
865 | void |
||
866 | fs_visitor::no16(const char *format, ...) |
||
867 | { |
||
868 | va_list va; |
||
869 | |||
870 | va_start(va, format); |
||
871 | |||
872 | if (dispatch_width == 16) { |
||
873 | vfail(format, va); |
||
874 | } else { |
||
875 | simd16_unsupported = true; |
||
876 | |||
877 | if (brw->perf_debug) { |
||
878 | if (no16_msg) |
||
879 | ralloc_vasprintf_append(&no16_msg, format, va); |
||
880 | else |
||
881 | no16_msg = ralloc_vasprintf(mem_ctx, format, va); |
||
882 | } |
||
883 | } |
||
884 | |||
885 | va_end(va); |
||
886 | } |
||
887 | |||
888 | fs_inst * |
||
889 | fs_visitor::emit(enum opcode opcode) |
||
890 | { |
||
891 | return emit(new(mem_ctx) fs_inst(opcode, dispatch_width)); |
||
892 | } |
||
893 | |||
894 | fs_inst * |
||
895 | fs_visitor::emit(enum opcode opcode, const fs_reg &dst) |
||
896 | { |
||
897 | return emit(new(mem_ctx) fs_inst(opcode, dst)); |
||
898 | } |
||
899 | |||
900 | fs_inst * |
||
901 | fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0) |
||
902 | { |
||
903 | return emit(new(mem_ctx) fs_inst(opcode, dst, src0)); |
||
904 | } |
||
905 | |||
906 | fs_inst * |
||
907 | fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0, |
||
908 | const fs_reg &src1) |
||
909 | { |
||
910 | return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1)); |
||
911 | } |
||
912 | |||
913 | fs_inst * |
||
914 | fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0, |
||
915 | const fs_reg &src1, const fs_reg &src2) |
||
916 | { |
||
917 | return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2)); |
||
918 | } |
||
919 | |||
920 | fs_inst * |
||
921 | fs_visitor::emit(enum opcode opcode, const fs_reg &dst, |
||
922 | fs_reg src[], int sources) |
||
923 | { |
||
924 | return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources)); |
||
925 | } |
||
926 | |||
927 | /** |
||
928 | * Returns true if the instruction has a flag that means it won't |
||
929 | * update an entire destination register. |
||
930 | * |
||
931 | * For example, dead code elimination and live variable analysis want to know |
||
932 | * when a write to a variable screens off any preceding values that were in |
||
933 | * it. |
||
934 | */ |
||
935 | bool |
||
936 | fs_inst::is_partial_write() const |
||
937 | { |
||
938 | return ((this->predicate && this->opcode != BRW_OPCODE_SEL) || |
||
939 | (this->dst.width * type_sz(this->dst.type)) < 32 || |
||
940 | !this->dst.is_contiguous()); |
||
941 | } |
||
942 | |||
943 | int |
||
944 | fs_inst::regs_read(int arg) const |
||
945 | { |
||
946 | if (is_tex() && arg == 0 && src[0].file == GRF) { |
||
947 | return mlen; |
||
948 | } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) { |
||
949 | return mlen; |
||
950 | } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) { |
||
951 | return mlen; |
||
952 | } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) { |
||
953 | return mlen; |
||
954 | } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) { |
||
955 | return mlen; |
||
956 | } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) { |
||
957 | return mlen; |
||
958 | } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) { |
||
959 | return mlen; |
||
960 | } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) { |
||
961 | return mlen; |
||
962 | } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) { |
||
963 | return mlen; |
||
964 | } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) { |
||
965 | return mlen; |
||
966 | } else if (opcode == FS_OPCODE_LINTERP && arg == 0) { |
||
967 | return exec_size / 4; |
||
968 | } |
||
969 | |||
970 | switch (src[arg].file) { |
||
971 | case BAD_FILE: |
||
972 | case UNIFORM: |
||
973 | case IMM: |
||
974 | return 1; |
||
975 | case GRF: |
||
976 | case HW_REG: |
||
977 | if (src[arg].stride == 0) { |
||
978 | return 1; |
||
979 | } else { |
||
980 | int size = src[arg].width * src[arg].stride * type_sz(src[arg].type); |
||
981 | return (size + 31) / 32; |
||
982 | } |
||
983 | case MRF: |
||
984 | unreachable("MRF registers are not allowed as sources"); |
||
985 | default: |
||
986 | unreachable("Invalid register file"); |
||
987 | } |
||
988 | } |
||
989 | |||
990 | bool |
||
991 | fs_inst::reads_flag() const |
||
992 | { |
||
993 | return predicate; |
||
994 | } |
||
995 | |||
996 | bool |
||
997 | fs_inst::writes_flag() const |
||
998 | { |
||
999 | return (conditional_mod && (opcode != BRW_OPCODE_SEL && |
||
1000 | opcode != BRW_OPCODE_IF && |
||
1001 | opcode != BRW_OPCODE_WHILE)) || |
||
1002 | opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS; |
||
1003 | } |
||
1004 | |||
1005 | /** |
||
1006 | * Returns how many MRFs an FS opcode will write over. |
||
1007 | * |
||
1008 | * Note that this is not the 0 or 1 implied writes in an actual gen |
||
1009 | * instruction -- the FS opcodes often generate MOVs in addition. |
||
1010 | */ |
||
1011 | int |
||
1012 | fs_visitor::implied_mrf_writes(fs_inst *inst) |
||
1013 | { |
||
1014 | if (inst->mlen == 0) |
||
1015 | return 0; |
||
1016 | |||
1017 | if (inst->base_mrf == -1) |
||
1018 | return 0; |
||
1019 | |||
1020 | switch (inst->opcode) { |
||
1021 | case SHADER_OPCODE_RCP: |
||
1022 | case SHADER_OPCODE_RSQ: |
||
1023 | case SHADER_OPCODE_SQRT: |
||
1024 | case SHADER_OPCODE_EXP2: |
||
1025 | case SHADER_OPCODE_LOG2: |
||
1026 | case SHADER_OPCODE_SIN: |
||
1027 | case SHADER_OPCODE_COS: |
||
1028 | return 1 * dispatch_width / 8; |
||
1029 | case SHADER_OPCODE_POW: |
||
1030 | case SHADER_OPCODE_INT_QUOTIENT: |
||
1031 | case SHADER_OPCODE_INT_REMAINDER: |
||
1032 | return 2 * dispatch_width / 8; |
||
1033 | case SHADER_OPCODE_TEX: |
||
1034 | case FS_OPCODE_TXB: |
||
1035 | case SHADER_OPCODE_TXD: |
||
1036 | case SHADER_OPCODE_TXF: |
||
1037 | case SHADER_OPCODE_TXF_CMS: |
||
1038 | case SHADER_OPCODE_TXF_MCS: |
||
1039 | case SHADER_OPCODE_TG4: |
||
1040 | case SHADER_OPCODE_TG4_OFFSET: |
||
1041 | case SHADER_OPCODE_TXL: |
||
1042 | case SHADER_OPCODE_TXS: |
||
1043 | case SHADER_OPCODE_LOD: |
||
1044 | return 1; |
||
1045 | case FS_OPCODE_FB_WRITE: |
||
1046 | return 2; |
||
1047 | case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD: |
||
1048 | case SHADER_OPCODE_GEN4_SCRATCH_READ: |
||
1049 | return 1; |
||
1050 | case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD: |
||
1051 | return inst->mlen; |
||
1052 | case SHADER_OPCODE_GEN4_SCRATCH_WRITE: |
||
1053 | return inst->mlen; |
||
1054 | case SHADER_OPCODE_UNTYPED_ATOMIC: |
||
1055 | case SHADER_OPCODE_UNTYPED_SURFACE_READ: |
||
1056 | case SHADER_OPCODE_UNTYPED_SURFACE_WRITE: |
||
1057 | case SHADER_OPCODE_TYPED_ATOMIC: |
||
1058 | case SHADER_OPCODE_TYPED_SURFACE_READ: |
||
1059 | case SHADER_OPCODE_TYPED_SURFACE_WRITE: |
||
1060 | case SHADER_OPCODE_URB_WRITE_SIMD8: |
||
1061 | case FS_OPCODE_INTERPOLATE_AT_CENTROID: |
||
1062 | case FS_OPCODE_INTERPOLATE_AT_SAMPLE: |
||
1063 | case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET: |
||
1064 | case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET: |
||
1065 | return 0; |
||
1066 | default: |
||
1067 | unreachable("not reached"); |
||
1068 | } |
||
1069 | } |
||
1070 | |||
1071 | fs_reg |
||
1072 | fs_visitor::vgrf(const glsl_type *const type) |
||
1073 | { |
||
1074 | int reg_width = dispatch_width / 8; |
||
1075 | return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width), |
||
1076 | brw_type_for_base_type(type), dispatch_width); |
||
1077 | } |
||
1078 | |||
1079 | fs_reg |
||
1080 | fs_visitor::vgrf(int num_components) |
||
1081 | { |
||
1082 | int reg_width = dispatch_width / 8; |
||
1083 | return fs_reg(GRF, alloc.allocate(num_components * reg_width), |
||
1084 | BRW_REGISTER_TYPE_F, dispatch_width); |
||
1085 | } |
||
1086 | |||
1087 | /** Fixed HW reg constructor. */ |
||
1088 | fs_reg::fs_reg(enum register_file file, int reg) |
||
1089 | { |
||
1090 | init(); |
||
1091 | this->file = file; |
||
1092 | this->reg = reg; |
||
1093 | this->type = BRW_REGISTER_TYPE_F; |
||
1094 | |||
1095 | switch (file) { |
||
1096 | case UNIFORM: |
||
1097 | this->width = 1; |
||
1098 | break; |
||
1099 | default: |
||
1100 | this->width = 8; |
||
1101 | } |
||
1102 | } |
||
1103 | |||
1104 | /** Fixed HW reg constructor. */ |
||
1105 | fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type) |
||
1106 | { |
||
1107 | init(); |
||
1108 | this->file = file; |
||
1109 | this->reg = reg; |
||
1110 | this->type = type; |
||
1111 | |||
1112 | switch (file) { |
||
1113 | case UNIFORM: |
||
1114 | this->width = 1; |
||
1115 | break; |
||
1116 | default: |
||
1117 | this->width = 8; |
||
1118 | } |
||
1119 | } |
||
1120 | |||
1121 | /** Fixed HW reg constructor. */ |
||
1122 | fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type, |
||
1123 | uint8_t width) |
||
1124 | { |
||
1125 | init(); |
||
1126 | this->file = file; |
||
1127 | this->reg = reg; |
||
1128 | this->type = type; |
||
1129 | this->width = width; |
||
1130 | } |
||
1131 | |||
1132 | fs_reg * |
||
1133 | fs_visitor::variable_storage(ir_variable *var) |
||
1134 | { |
||
1135 | return (fs_reg *)hash_table_find(this->variable_ht, var); |
||
1136 | } |
||
1137 | |||
1138 | void |
||
1139 | import_uniforms_callback(const void *key, |
||
1140 | void *data, |
||
1141 | void *closure) |
||
1142 | { |
||
1143 | struct hash_table *dst_ht = (struct hash_table *)closure; |
||
1144 | const fs_reg *reg = (const fs_reg *)data; |
||
1145 | |||
1146 | if (reg->file != UNIFORM) |
||
1147 | return; |
||
1148 | |||
1149 | hash_table_insert(dst_ht, data, key); |
||
1150 | } |
||
1151 | |||
1152 | /* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch. |
||
1153 | * This brings in those uniform definitions |
||
1154 | */ |
||
1155 | void |
||
1156 | fs_visitor::import_uniforms(fs_visitor *v) |
||
1157 | { |
||
1158 | hash_table_call_foreach(v->variable_ht, |
||
1159 | import_uniforms_callback, |
||
1160 | variable_ht); |
||
1161 | this->push_constant_loc = v->push_constant_loc; |
||
1162 | this->pull_constant_loc = v->pull_constant_loc; |
||
1163 | this->uniforms = v->uniforms; |
||
1164 | this->param_size = v->param_size; |
||
1165 | } |
||
1166 | |||
1167 | /* Our support for uniforms is piggy-backed on the struct |
||
1168 | * gl_fragment_program, because that's where the values actually |
||
1169 | * get stored, rather than in some global gl_shader_program uniform |
||
1170 | * store. |
||
1171 | */ |
||
1172 | void |
||
1173 | fs_visitor::setup_uniform_values(ir_variable *ir) |
||
1174 | { |
||
1175 | int namelen = strlen(ir->name); |
||
1176 | |||
1177 | /* The data for our (non-builtin) uniforms is stored in a series of |
||
1178 | * gl_uniform_driver_storage structs for each subcomponent that |
||
1179 | * glGetUniformLocation() could name. We know it's been set up in the same |
||
1180 | * order we'd walk the type, so walk the list of storage and find anything |
||
1181 | * with our name, or the prefix of a component that starts with our name. |
||
1182 | */ |
||
1183 | unsigned params_before = uniforms; |
||
1184 | for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) { |
||
1185 | struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u]; |
||
1186 | |||
1187 | if (strncmp(ir->name, storage->name, namelen) != 0 || |
||
1188 | (storage->name[namelen] != 0 && |
||
1189 | storage->name[namelen] != '.' && |
||
1190 | storage->name[namelen] != '[')) { |
||
1191 | continue; |
||
1192 | } |
||
1193 | |||
1194 | unsigned slots = storage->type->component_slots(); |
||
1195 | if (storage->array_elements) |
||
1196 | slots *= storage->array_elements; |
||
1197 | |||
1198 | for (unsigned i = 0; i < slots; i++) { |
||
1199 | stage_prog_data->param[uniforms++] = &storage->storage[i]; |
||
1200 | } |
||
1201 | } |
||
1202 | |||
1203 | /* Make sure we actually initialized the right amount of stuff here. */ |
||
1204 | assert(params_before + ir->type->component_slots() == uniforms); |
||
1205 | (void)params_before; |
||
1206 | } |
||
1207 | |||
1208 | |||
1209 | /* Our support for builtin uniforms is even scarier than non-builtin. |
||
1210 | * It sits on top of the PROG_STATE_VAR parameters that are |
||
1211 | * automatically updated from GL context state. |
||
1212 | */ |
||
1213 | void |
||
1214 | fs_visitor::setup_builtin_uniform_values(ir_variable *ir) |
||
1215 | { |
||
1216 | const ir_state_slot *const slots = ir->get_state_slots(); |
||
1217 | assert(slots != NULL); |
||
1218 | |||
1219 | for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) { |
||
1220 | /* This state reference has already been setup by ir_to_mesa, but we'll |
||
1221 | * get the same index back here. |
||
1222 | */ |
||
1223 | int index = _mesa_add_state_reference(this->prog->Parameters, |
||
1224 | (gl_state_index *)slots[i].tokens); |
||
1225 | |||
1226 | /* Add each of the unique swizzles of the element as a parameter. |
||
1227 | * This'll end up matching the expected layout of the |
||
1228 | * array/matrix/structure we're trying to fill in. |
||
1229 | */ |
||
1230 | int last_swiz = -1; |
||
1231 | for (unsigned int j = 0; j < 4; j++) { |
||
1232 | int swiz = GET_SWZ(slots[i].swizzle, j); |
||
1233 | if (swiz == last_swiz) |
||
1234 | break; |
||
1235 | last_swiz = swiz; |
||
1236 | |||
1237 | stage_prog_data->param[uniforms++] = |
||
1238 | &prog->Parameters->ParameterValues[index][swiz]; |
||
1239 | } |
||
1240 | } |
||
1241 | } |
||
1242 | |||
1243 | fs_reg * |
||
1244 | fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer, |
||
1245 | bool origin_upper_left) |
||
1246 | { |
||
1247 | assert(stage == MESA_SHADER_FRAGMENT); |
||
1248 | brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; |
||
1249 | fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type)); |
||
1250 | fs_reg wpos = *reg; |
||
1251 | bool flip = !origin_upper_left ^ key->render_to_fbo; |
||
1252 | |||
1253 | /* gl_FragCoord.x */ |
||
1254 | if (pixel_center_integer) { |
||
1255 | emit(MOV(wpos, this->pixel_x)); |
||
1256 | } else { |
||
1257 | emit(ADD(wpos, this->pixel_x, fs_reg(0.5f))); |
||
1258 | } |
||
1259 | wpos = offset(wpos, 1); |
||
1260 | |||
1261 | /* gl_FragCoord.y */ |
||
1262 | if (!flip && pixel_center_integer) { |
||
1263 | emit(MOV(wpos, this->pixel_y)); |
||
1264 | } else { |
||
1265 | fs_reg pixel_y = this->pixel_y; |
||
1266 | float offset = (pixel_center_integer ? 0.0 : 0.5); |
||
1267 | |||
1268 | if (flip) { |
||
1269 | pixel_y.negate = true; |
||
1270 | offset += key->drawable_height - 1.0; |
||
1271 | } |
||
1272 | |||
1273 | emit(ADD(wpos, pixel_y, fs_reg(offset))); |
||
1274 | } |
||
1275 | wpos = offset(wpos, 1); |
||
1276 | |||
1277 | /* gl_FragCoord.z */ |
||
1278 | if (devinfo->gen >= 6) { |
||
1279 | emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0)))); |
||
1280 | } else { |
||
1281 | emit(FS_OPCODE_LINTERP, wpos, |
||
1282 | this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC], |
||
1283 | interp_reg(VARYING_SLOT_POS, 2)); |
||
1284 | } |
||
1285 | wpos = offset(wpos, 1); |
||
1286 | |||
1287 | /* gl_FragCoord.w: Already set up in emit_interpolation */ |
||
1288 | emit(BRW_OPCODE_MOV, wpos, this->wpos_w); |
||
1289 | |||
1290 | return reg; |
||
1291 | } |
||
1292 | |||
1293 | fs_inst * |
||
1294 | fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp, |
||
1295 | glsl_interp_qualifier interpolation_mode, |
||
1296 | bool is_centroid, bool is_sample) |
||
1297 | { |
||
1298 | brw_wm_barycentric_interp_mode barycoord_mode; |
||
1299 | if (devinfo->gen >= 6) { |
||
1300 | if (is_centroid) { |
||
1301 | if (interpolation_mode == INTERP_QUALIFIER_SMOOTH) |
||
1302 | barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC; |
||
1303 | else |
||
1304 | barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC; |
||
1305 | } else if (is_sample) { |
||
1306 | if (interpolation_mode == INTERP_QUALIFIER_SMOOTH) |
||
1307 | barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC; |
||
1308 | else |
||
1309 | barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC; |
||
1310 | } else { |
||
1311 | if (interpolation_mode == INTERP_QUALIFIER_SMOOTH) |
||
1312 | barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC; |
||
1313 | else |
||
1314 | barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC; |
||
1315 | } |
||
1316 | } else { |
||
1317 | /* On Ironlake and below, there is only one interpolation mode. |
||
1318 | * Centroid interpolation doesn't mean anything on this hardware -- |
||
1319 | * there is no multisampling. |
||
1320 | */ |
||
1321 | barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC; |
||
1322 | } |
||
1323 | return emit(FS_OPCODE_LINTERP, attr, |
||
1324 | this->delta_xy[barycoord_mode], interp); |
||
1325 | } |
||
1326 | |||
/**
 * Emits interpolation instructions for a varying of the given \p type at
 * input slot \p location, writing successive components into \p attr.
 *
 * Walks array elements, matrix columns, and vector components in order,
 * choosing flat (CINTERP) vs. smooth/noperspective (LINTERP) interpolation.
 * Slots with no incoming URB setup data are skipped.  \p mod_centroid and
 * \p mod_sample are the variable's centroid/sample layout qualifiers;
 * \p name is only used for error reporting.
 */
void
fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
                                       const glsl_type *type,
                                       glsl_interp_qualifier interpolation_mode,
                                       int location, bool mod_centroid,
                                       bool mod_sample)
{
   /* Interpolation operates on the scalar base type of the varying. */
   attr.type = brw_type_for_base_type(type->get_scalar_type());

   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   unsigned int array_elements;

   if (type->is_array()) {
      array_elements = type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", name);
      }
      /* Interpolate per-element using the element type below. */
      type = type->fields.array;
   } else {
      array_elements = 1;
   }

   if (interpolation_mode == INTERP_QUALIFIER_NONE) {
      /* No explicit qualifier: gl_Color/gl_SecondaryColor follow the GL
       * shade model; everything else defaults to smooth.
       */
      bool is_gl_Color =
         location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
      if (key->flat_shade && is_gl_Color) {
         interpolation_mode = INTERP_QUALIFIER_FLAT;
      } else {
         interpolation_mode = INTERP_QUALIFIER_SMOOTH;
      }
   }

   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (prog_data->urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr = offset(attr, type->vector_elements);
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               /* The provoking-vertex constant lives at suboffset 3. */
               interp = suboffset(interp, 3);
               interp.type = attr.type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr = offset(attr, 1);
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit. Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);

                  /* First: non-centroid interpolation into the UNLIT
                   * channels (inverted predicate).
                   */
                  fs_inst *inst;
                  inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
                                      false, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
                  if (devinfo->has_pln)
                     inst->no_dd_clear = true;

                  /* Then: the requested interpolation into the lit
                   * channels.  no_dd_clear/no_dd_check pair these two
                   * PLN writes for dependency-control purposes.
                   */
                  inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
                                      mod_centroid && !key->persample_shading,
                                      mod_sample || key->persample_shading);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = false;
                  if (devinfo->has_pln)
                     inst->no_dd_check = true;

               } else {
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               mod_centroid && !key->persample_shading,
                               mod_sample || key->persample_shading);
               }
               if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                  /* Pre-gen6 PLN yields attr/w; multiply by w to undo the
                   * perspective division done during setup.
                   */
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
               }
               attr = offset(attr, 1);
            }

         }
         location++;
      }
   }
}
||
1429 | |||
1430 | fs_reg * |
||
1431 | fs_visitor::emit_frontfacing_interpolation() |
||
1432 | { |
||
1433 | fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type)); |
||
1434 | |||
1435 | if (devinfo->gen >= 6) { |
||
1436 | /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create |
||
1437 | * a boolean result from this (~0/true or 0/false). |
||
1438 | * |
||
1439 | * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish |
||
1440 | * this task in only one instruction: |
||
1441 | * - a negation source modifier will flip the bit; and |
||
1442 | * - a W -> D type conversion will sign extend the bit into the high |
||
1443 | * word of the destination. |
||
1444 | * |
||
1445 | * An ASR 15 fills the low word of the destination. |
||
1446 | */ |
||
1447 | fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W)); |
||
1448 | g0.negate = true; |
||
1449 | |||
1450 | emit(ASR(*reg, g0, fs_reg(15))); |
||
1451 | } else { |
||
1452 | /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create |
||
1453 | * a boolean result from this (1/true or 0/false). |
||
1454 | * |
||
1455 | * Like in the above case, since the bit is the MSB of g1.6:UD we can use |
||
1456 | * the negation source modifier to flip it. Unfortunately the SHR |
||
1457 | * instruction only operates on UD (or D with an abs source modifier) |
||
1458 | * sources without negation. |
||
1459 | * |
||
1460 | * Instead, use ASR (which will give ~0/true or 0/false). |
||
1461 | */ |
||
1462 | fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D)); |
||
1463 | g1_6.negate = true; |
||
1464 | |||
1465 | emit(ASR(*reg, g1_6, fs_reg(31))); |
||
1466 | } |
||
1467 | |||
1468 | return reg; |
||
1469 | } |
||
1470 | |||
1471 | void |
||
1472 | fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos) |
||
1473 | { |
||
1474 | assert(stage == MESA_SHADER_FRAGMENT); |
||
1475 | brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; |
||
1476 | assert(dst.type == BRW_REGISTER_TYPE_F); |
||
1477 | |||
1478 | if (key->compute_pos_offset) { |
||
1479 | /* Convert int_sample_pos to floating point */ |
||
1480 | emit(MOV(dst, int_sample_pos)); |
||
1481 | /* Scale to the range [0, 1] */ |
||
1482 | emit(MUL(dst, dst, fs_reg(1 / 16.0f))); |
||
1483 | } |
||
1484 | else { |
||
1485 | /* From ARB_sample_shading specification: |
||
1486 | * "When rendering to a non-multisample buffer, or if multisample |
||
1487 | * rasterization is disabled, gl_SamplePosition will always be |
||
1488 | * (0.5, 0.5). |
||
1489 | */ |
||
1490 | emit(MOV(dst, fs_reg(0.5f))); |
||
1491 | } |
||
1492 | } |
||
1493 | |||
1494 | fs_reg * |
||
1495 | fs_visitor::emit_samplepos_setup() |
||
1496 | { |
||
1497 | assert(devinfo->gen >= 6); |
||
1498 | |||
1499 | this->current_annotation = "compute sample position"; |
||
1500 | fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type)); |
||
1501 | fs_reg pos = *reg; |
||
1502 | fs_reg int_sample_x = vgrf(glsl_type::int_type); |
||
1503 | fs_reg int_sample_y = vgrf(glsl_type::int_type); |
||
1504 | |||
1505 | /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16 |
||
1506 | * mode will be enabled. |
||
1507 | * |
||
1508 | * From the Ivy Bridge PRM, volume 2 part 1, page 344: |
||
1509 | * R31.1:0 Position Offset X/Y for Slot[3:0] |
||
1510 | * R31.3:2 Position Offset X/Y for Slot[7:4] |
||
1511 | * ..... |
||
1512 | * |
||
1513 | * The X, Y sample positions come in as bytes in thread payload. So, read |
||
1514 | * the positions using vstride=16, width=8, hstride=2. |
||
1515 | */ |
||
1516 | struct brw_reg sample_pos_reg = |
||
1517 | stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0), |
||
1518 | BRW_REGISTER_TYPE_B), 16, 8, 2); |
||
1519 | |||
1520 | if (dispatch_width == 8) { |
||
1521 | emit(MOV(int_sample_x, fs_reg(sample_pos_reg))); |
||
1522 | } else { |
||
1523 | emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg))); |
||
1524 | emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16)))) |
||
1525 | ->force_sechalf = true; |
||
1526 | } |
||
1527 | /* Compute gl_SamplePosition.x */ |
||
1528 | compute_sample_position(pos, int_sample_x); |
||
1529 | pos = offset(pos, 1); |
||
1530 | if (dispatch_width == 8) { |
||
1531 | emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1)))); |
||
1532 | } else { |
||
1533 | emit(MOV(half(int_sample_y, 0), |
||
1534 | fs_reg(suboffset(sample_pos_reg, 1)))); |
||
1535 | emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17)))) |
||
1536 | ->force_sechalf = true; |
||
1537 | } |
||
1538 | /* Compute gl_SamplePosition.y */ |
||
1539 | compute_sample_position(pos, int_sample_y); |
||
1540 | return reg; |
||
1541 | } |
||
1542 | |||
1543 | fs_reg * |
||
1544 | fs_visitor::emit_sampleid_setup() |
||
1545 | { |
||
1546 | assert(stage == MESA_SHADER_FRAGMENT); |
||
1547 | brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; |
||
1548 | assert(devinfo->gen >= 6); |
||
1549 | |||
1550 | this->current_annotation = "compute sample id"; |
||
1551 | fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type)); |
||
1552 | |||
1553 | if (key->compute_sample_id) { |
||
1554 | fs_reg t1 = vgrf(glsl_type::int_type); |
||
1555 | fs_reg t2 = vgrf(glsl_type::int_type); |
||
1556 | t2.type = BRW_REGISTER_TYPE_UW; |
||
1557 | |||
1558 | /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with |
||
1559 | * 8x multisampling, subspan 0 will represent sample N (where N |
||
1560 | * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or |
||
1561 | * 7. We can find the value of N by looking at R0.0 bits 7:6 |
||
1562 | * ("Starting Sample Pair Index (SSPI)") and multiplying by two |
||
1563 | * (since samples are always delivered in pairs). That is, we |
||
1564 | * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then |
||
1565 | * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in |
||
1566 | * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, |
||
1567 | * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by |
||
1568 | * populating a temporary variable with the sequence (0, 1, 2, 3), |
||
1569 | * and then reading from it using vstride=1, width=4, hstride=0. |
||
1570 | * These computations hold good for 4x multisampling as well. |
||
1571 | * |
||
1572 | * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1): |
||
1573 | * the first four slots are sample 0 of subspan 0; the next four |
||
1574 | * are sample 1 of subspan 0; the third group is sample 0 of |
||
1575 | * subspan 1, and finally sample 1 of subspan 1. |
||
1576 | */ |
||
1577 | fs_inst *inst; |
||
1578 | inst = emit(BRW_OPCODE_AND, t1, |
||
1579 | fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)), |
||
1580 | fs_reg(0xc0)); |
||
1581 | inst->force_writemask_all = true; |
||
1582 | inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5)); |
||
1583 | inst->force_writemask_all = true; |
||
1584 | /* This works for both SIMD8 and SIMD16 */ |
||
1585 | inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210))); |
||
1586 | inst->force_writemask_all = true; |
||
1587 | /* This special instruction takes care of setting vstride=1, |
||
1588 | * width=4, hstride=0 of t2 during an ADD instruction. |
||
1589 | */ |
||
1590 | emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2); |
||
1591 | } else { |
||
1592 | /* As per GL_ARB_sample_shading specification: |
||
1593 | * "When rendering to a non-multisample buffer, or if multisample |
||
1594 | * rasterization is disabled, gl_SampleID will always be zero." |
||
1595 | */ |
||
1596 | emit(BRW_OPCODE_MOV, *reg, fs_reg(0)); |
||
1597 | } |
||
1598 | |||
1599 | return reg; |
||
1600 | } |
||
1601 | |||
1602 | void |
||
1603 | fs_visitor::resolve_source_modifiers(fs_reg *src) |
||
1604 | { |
||
1605 | if (!src->abs && !src->negate) |
||
1606 | return; |
||
1607 | |||
1608 | fs_reg temp = retype(vgrf(1), src->type); |
||
1609 | emit(MOV(temp, *src)); |
||
1610 | *src = temp; |
||
1611 | } |
||
1612 | |||
1613 | fs_reg |
||
1614 | fs_visitor::fix_math_operand(fs_reg src) |
||
1615 | { |
||
1616 | /* Can't do hstride == 0 args on gen6 math, so expand it out. We |
||
1617 | * might be able to do better by doing execsize = 1 math and then |
||
1618 | * expanding that result out, but we would need to be careful with |
||
1619 | * masking. |
||
1620 | * |
||
1621 | * The hardware ignores source modifiers (negate and abs) on math |
||
1622 | * instructions, so we also move to a temp to set those up. |
||
1623 | */ |
||
1624 | if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM && |
||
1625 | !src.abs && !src.negate) |
||
1626 | return src; |
||
1627 | |||
1628 | /* Gen7 relaxes most of the above restrictions, but still can't use IMM |
||
1629 | * operands to math |
||
1630 | */ |
||
1631 | if (devinfo->gen >= 7 && src.file != IMM) |
||
1632 | return src; |
||
1633 | |||
1634 | fs_reg expanded = vgrf(glsl_type::float_type); |
||
1635 | expanded.type = src.type; |
||
1636 | emit(BRW_OPCODE_MOV, expanded, src); |
||
1637 | return expanded; |
||
1638 | } |
||
1639 | |||
1640 | fs_inst * |
||
1641 | fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src) |
||
1642 | { |
||
1643 | switch (opcode) { |
||
1644 | case SHADER_OPCODE_RCP: |
||
1645 | case SHADER_OPCODE_RSQ: |
||
1646 | case SHADER_OPCODE_SQRT: |
||
1647 | case SHADER_OPCODE_EXP2: |
||
1648 | case SHADER_OPCODE_LOG2: |
||
1649 | case SHADER_OPCODE_SIN: |
||
1650 | case SHADER_OPCODE_COS: |
||
1651 | break; |
||
1652 | default: |
||
1653 | unreachable("not reached: bad math opcode"); |
||
1654 | } |
||
1655 | |||
1656 | /* Can't do hstride == 0 args to gen6 math, so expand it out. We |
||
1657 | * might be able to do better by doing execsize = 1 math and then |
||
1658 | * expanding that result out, but we would need to be careful with |
||
1659 | * masking. |
||
1660 | * |
||
1661 | * Gen 6 hardware ignores source modifiers (negate and abs) on math |
||
1662 | * instructions, so we also move to a temp to set those up. |
||
1663 | */ |
||
1664 | if (devinfo->gen == 6 || devinfo->gen == 7) |
||
1665 | src = fix_math_operand(src); |
||
1666 | |||
1667 | fs_inst *inst = emit(opcode, dst, src); |
||
1668 | |||
1669 | if (devinfo->gen < 6) { |
||
1670 | inst->base_mrf = 2; |
||
1671 | inst->mlen = dispatch_width / 8; |
||
1672 | } |
||
1673 | |||
1674 | return inst; |
||
1675 | } |
||
1676 | |||
1677 | fs_inst * |
||
1678 | fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1) |
||
1679 | { |
||
1680 | int base_mrf = 2; |
||
1681 | fs_inst *inst; |
||
1682 | |||
1683 | if (devinfo->gen >= 8) { |
||
1684 | inst = emit(opcode, dst, src0, src1); |
||
1685 | } else if (devinfo->gen >= 6) { |
||
1686 | src0 = fix_math_operand(src0); |
||
1687 | src1 = fix_math_operand(src1); |
||
1688 | |||
1689 | inst = emit(opcode, dst, src0, src1); |
||
1690 | } else { |
||
1691 | /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13 |
||
1692 | * "Message Payload": |
||
1693 | * |
||
1694 | * "Operand0[7]. For the INT DIV functions, this operand is the |
||
1695 | * denominator." |
||
1696 | * ... |
||
1697 | * "Operand1[7]. For the INT DIV functions, this operand is the |
||
1698 | * numerator." |
||
1699 | */ |
||
1700 | bool is_int_div = opcode != SHADER_OPCODE_POW; |
||
1701 | fs_reg &op0 = is_int_div ? src1 : src0; |
||
1702 | fs_reg &op1 = is_int_div ? src0 : src1; |
||
1703 | |||
1704 | emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1)); |
||
1705 | inst = emit(opcode, dst, op0, reg_null_f); |
||
1706 | |||
1707 | inst->base_mrf = base_mrf; |
||
1708 | inst->mlen = 2 * dispatch_width / 8; |
||
1709 | } |
||
1710 | return inst; |
||
1711 | } |
||
1712 | |||
1713 | void |
||
1714 | fs_visitor::emit_discard_jump() |
||
1715 | { |
||
1716 | assert(((brw_wm_prog_data*) this->prog_data)->uses_kill); |
||
1717 | |||
1718 | /* For performance, after a discard, jump to the end of the |
||
1719 | * shader if all relevant channels have been discarded. |
||
1720 | */ |
||
1721 | fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP); |
||
1722 | discard_jump->flag_subreg = 1; |
||
1723 | |||
1724 | discard_jump->predicate = (dispatch_width == 8) |
||
1725 | ? BRW_PREDICATE_ALIGN1_ANY8H |
||
1726 | : BRW_PREDICATE_ALIGN1_ANY16H; |
||
1727 | discard_jump->predicate_inverse = true; |
||
1728 | } |
||
1729 | |||
/**
 * Record where the push constants (CURBE) land in the register file and
 * rewrite every UNIFORM-file source to the fixed hardware register that
 * now holds its value.
 *
 * Relies on push_constant_loc[] having been filled in by
 * assign_constant_locations().
 */
void
fs_visitor::assign_curb_setup()
{
   /* Record the first dispatched (non-payload) GRF.  The SIMD16 variant
    * of this field lives in the stage-specific prog_data structure, so it
    * needs a per-stage cast.
    */
   if (dispatch_width == 8) {
      prog_data->dispatch_grf_start_reg = payload.num_regs;
   } else {
      if (stage == MESA_SHADER_FRAGMENT) {
         brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
         prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
      } else if (stage == MESA_SHADER_COMPUTE) {
         brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
         prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
      } else {
         unreachable("Unsupported shader type!");
      }
   }

   /* Push constants are uploaded eight scalar params (one register) at a
    * time, so round the count up to whole registers.
    */
   prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == UNIFORM) {
            int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
            int constant_nr;
            if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
               /* In range: use the push slot chosen by
                * assign_constant_locations().
                */
               constant_nr = push_constant_loc[uniform_nr];
            } else {
               /* Section 5.11 of the OpenGL 4.1 spec says:
                * "Out-of-bounds reads return undefined values, which include
                * values from other variables of the active program or zero."
                * Just return the first push constant.
                */
               constant_nr = 0;
            }

            /* Eight constants per GRF: constant_nr / 8 selects the register
             * past the payload, constant_nr % 8 the channel within it.
             */
            struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg = byte_offset(
               retype(brw_reg, inst->src[i].type),
               inst->src[i].subreg_offset);
         }
      }
   }
}
||
1778 | |||
/**
 * Decide which URB slot each fragment-shader input varying is delivered
 * in, filling prog_data->urb_setup[] (slot per varying, -1 when unused)
 * and prog_data->num_varying_inputs.
 */
void
fs_visitor::calculate_urb_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   /* Start with every varying marked unused (-1). */
   memset(prog_data->urb_setup, -1,
          sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (devinfo->gen >= 6) {
      if (_mesa_bitcount_64(prog->InputsRead &
                            BRW_FS_VARYING_INPUT_MASK) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
                BITFIELD64_BIT(i)) {
               prog_data->urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */
         struct brw_vue_map prev_stage_vue_map;
         brw_compute_vue_map(devinfo, &prev_stage_vue_map,
                             key->input_slots_valid);
         /* The URB read offset is in pairs of slots, hence the factor 2. */
         int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
             * unused.
             */
            if (varying != BRW_VARYING_SLOT_COUNT &&
                (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               prog_data->urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (key->input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               prog_data->urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   prog_data->num_varying_inputs = urb_next;
}
||
1866 | |||
1867 | void |
||
1868 | fs_visitor::assign_urb_setup() |
||
1869 | { |
||
1870 | assert(stage == MESA_SHADER_FRAGMENT); |
||
1871 | brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; |
||
1872 | |||
1873 | int urb_start = payload.num_regs + prog_data->base.curb_read_length; |
||
1874 | |||
1875 | /* Offset all the urb_setup[] index by the actual position of the |
||
1876 | * setup regs, now that the location of the constants has been chosen. |
||
1877 | */ |
||
1878 | foreach_block_and_inst(block, fs_inst, inst, cfg) { |
||
1879 | if (inst->opcode == FS_OPCODE_LINTERP) { |
||
1880 | assert(inst->src[1].file == HW_REG); |
||
1881 | inst->src[1].fixed_hw_reg.nr += urb_start; |
||
1882 | } |
||
1883 | |||
1884 | if (inst->opcode == FS_OPCODE_CINTERP) { |
||
1885 | assert(inst->src[0].file == HW_REG); |
||
1886 | inst->src[0].fixed_hw_reg.nr += urb_start; |
||
1887 | } |
||
1888 | } |
||
1889 | |||
1890 | /* Each attribute is 4 setup channels, each of which is half a reg. */ |
||
1891 | this->first_non_payload_grf = |
||
1892 | urb_start + prog_data->num_varying_inputs * 2; |
||
1893 | } |
||
1894 | |||
1895 | void |
||
1896 | fs_visitor::assign_vs_urb_setup() |
||
1897 | { |
||
1898 | brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data; |
||
1899 | int grf, count, slot, channel, attr; |
||
1900 | |||
1901 | assert(stage == MESA_SHADER_VERTEX); |
||
1902 | count = _mesa_bitcount_64(vs_prog_data->inputs_read); |
||
1903 | if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid) |
||
1904 | count++; |
||
1905 | |||
1906 | /* Each attribute is 4 regs. */ |
||
1907 | this->first_non_payload_grf = |
||
1908 | payload.num_regs + prog_data->curb_read_length + count * 4; |
||
1909 | |||
1910 | unsigned vue_entries = |
||
1911 | MAX2(count, vs_prog_data->base.vue_map.num_slots); |
||
1912 | |||
1913 | vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4; |
||
1914 | vs_prog_data->base.urb_read_length = (count + 1) / 2; |
||
1915 | |||
1916 | assert(vs_prog_data->base.urb_read_length <= 15); |
||
1917 | |||
1918 | /* Rewrite all ATTR file references to the hw grf that they land in. */ |
||
1919 | foreach_block_and_inst(block, fs_inst, inst, cfg) { |
||
1920 | for (int i = 0; i < inst->sources; i++) { |
||
1921 | if (inst->src[i].file == ATTR) { |
||
1922 | |||
1923 | if (inst->src[i].reg == VERT_ATTRIB_MAX) { |
||
1924 | slot = count - 1; |
||
1925 | } else { |
||
1926 | /* Attributes come in in a contiguous block, ordered by their |
||
1927 | * gl_vert_attrib value. That means we can compute the slot |
||
1928 | * number for an attribute by masking out the enabled |
||
1929 | * attributes before it and counting the bits. |
||
1930 | */ |
||
1931 | attr = inst->src[i].reg + inst->src[i].reg_offset / 4; |
||
1932 | slot = _mesa_bitcount_64(vs_prog_data->inputs_read & |
||
1933 | BITFIELD64_MASK(attr)); |
||
1934 | } |
||
1935 | |||
1936 | channel = inst->src[i].reg_offset & 3; |
||
1937 | |||
1938 | grf = payload.num_regs + |
||
1939 | prog_data->curb_read_length + |
||
1940 | slot * 4 + channel; |
||
1941 | |||
1942 | inst->src[i].file = HW_REG; |
||
1943 | inst->src[i].fixed_hw_reg = |
||
1944 | retype(brw_vec8_grf(grf, 0), inst->src[i].type); |
||
1945 | } |
||
1946 | } |
||
1947 | } |
||
1948 | } |
||
1949 | |||
1950 | /** |
||
1951 | * Split large virtual GRFs into separate components if we can. |
||
1952 | * |
||
1953 | * This is mostly duplicated with what brw_fs_vector_splitting does, |
||
1954 | * but that's really conservative because it's afraid of doing |
||
1955 | * splitting that doesn't result in real progress after the rest of |
||
1956 | * the optimization phases, which would cause infinite looping in |
||
1957 | * optimization. We can do it once here, safely. This also has the |
||
1958 | * opportunity to split interpolated values, or maybe even uniforms, |
||
1959 | * which we don't have at the IR level. |
||
1960 | * |
||
1961 | * We want to split, because virtual GRFs are what we register |
||
1962 | * allocate and spill (due to contiguousness requirements for some |
||
1963 | * instructions), and they're what we naturally generate in the |
||
1964 | * codegen process, but most virtual GRFs don't actually need to be |
||
1965 | * contiguous sets of GRFs. If we split, we'll end up with reduced |
||
1966 | * live intervals and better dead code elimination and coalescing. |
||
1967 | */ |
||
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->alloc.count;

   /* Count the total number of registers */
   int reg_count = 0;
   int vgrf_to_reg[num_vars];   /* first flat slot index of each VGRF */
   for (int i = 0; i < num_vars; i++) {
      vgrf_to_reg[i] = reg_count;
      reg_count += alloc.sizes[i];
   }

   /* An array of "split points".  For each register slot, this indicates
    * if this slot can be separated from the previous slot.  Every time an
    * instruction uses multiple elements of a register (as a source or
    * destination), we mark the used slots as inseparable.  Then we go
    * through and split the registers into the smallest pieces we can.
    */
   bool split_points[reg_count];
   memset(split_points, 0, sizeof(split_points));

   /* Mark all used registers as fully splittable */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF) {
         int reg = vgrf_to_reg[inst->dst.reg];
         for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
            split_points[reg + j] = true;
      }

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
            int reg = vgrf_to_reg[inst->src[i].reg];
            for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
               split_points[reg + j] = true;
         }
      }
   }

   /* Second pass: any multi-register read or write must stay contiguous,
    * so clear the split points interior to each such access.
    */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF) {
         int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
         for (int j = 1; j < inst->regs_written; j++)
            split_points[reg + j] = false;
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
            int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
            for (int j = 1; j < inst->regs_read(i); j++)
               split_points[reg + j] = false;
         }
      }
   }

   /* Per flat slot: which (possibly new) VGRF it lands in, and at what
    * offset within that VGRF.
    */
   int new_virtual_grf[reg_count];
   int new_reg_offset[reg_count];

   int reg = 0;
   for (int i = 0; i < num_vars; i++) {
      /* The first one should always be 0 as a quick sanity check. */
      assert(split_points[reg] == false);

      /* j = 0 case */
      new_reg_offset[reg] = 0;
      reg++;
      int offset = 1;

      /* j > 0 case */
      for (unsigned j = 1; j < alloc.sizes[i]; j++) {
         /* If this is a split point, reset the offset to 0 and allocate a
          * new virtual GRF for the previous offset many registers
          */
         if (split_points[reg]) {
            assert(offset <= MAX_VGRF_SIZE);
            int grf = alloc.allocate(offset);
            for (int k = reg - offset; k < reg; k++)
               new_virtual_grf[k] = grf;
            offset = 0;
         }
         new_reg_offset[reg] = offset;
         offset++;
         reg++;
      }

      /* The last one gets the original register number */
      assert(offset <= MAX_VGRF_SIZE);
      alloc.sizes[i] = offset;
      for (int k = reg - offset; k < reg; k++)
         new_virtual_grf[k] = i;
   }
   assert(reg == reg_count);

   /* Rewrite every GRF reference through the slot -> (VGRF, offset) maps
    * built above.
    */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF) {
         reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
         inst->dst.reg = new_virtual_grf[reg];
         inst->dst.reg_offset = new_reg_offset[reg];
         assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
            reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
            inst->src[i].reg = new_virtual_grf[reg];
            inst->src[i].reg_offset = new_reg_offset[reg];
            assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
         }
      }
   }
   /* Register numbering changed, so cached liveness data is stale. */
   invalidate_live_intervals();
}
||
2078 | |||
2079 | /** |
||
2080 | * Remove unused virtual GRFs and compact the virtual_grf_* arrays. |
||
2081 | * |
||
2082 | * During code generation, we create tons of temporary variables, many of |
||
2083 | * which get immediately killed and are never used again. Yet, in later |
||
2084 | * optimization and analysis passes, such as compute_live_intervals, we need |
||
2085 | * to loop over all the virtual GRFs. Compacting them can save a lot of |
||
2086 | * overhead. |
||
2087 | */ |
||
2088 | bool |
||
2089 | fs_visitor::compact_virtual_grfs() |
||
2090 | { |
||
2091 | bool progress = false; |
||
2092 | int remap_table[this->alloc.count]; |
||
2093 | memset(remap_table, -1, sizeof(remap_table)); |
||
2094 | |||
2095 | /* Mark which virtual GRFs are used. */ |
||
2096 | foreach_block_and_inst(block, const fs_inst, inst, cfg) { |
||
2097 | if (inst->dst.file == GRF) |
||
2098 | remap_table[inst->dst.reg] = 0; |
||
2099 | |||
2100 | for (int i = 0; i < inst->sources; i++) { |
||
2101 | if (inst->src[i].file == GRF) |
||
2102 | remap_table[inst->src[i].reg] = 0; |
||
2103 | } |
||
2104 | } |
||
2105 | |||
2106 | /* Compact the GRF arrays. */ |
||
2107 | int new_index = 0; |
||
2108 | for (unsigned i = 0; i < this->alloc.count; i++) { |
||
2109 | if (remap_table[i] == -1) { |
||
2110 | /* We just found an unused register. This means that we are |
||
2111 | * actually going to compact something. |
||
2112 | */ |
||
2113 | progress = true; |
||
2114 | } else { |
||
2115 | remap_table[i] = new_index; |
||
2116 | alloc.sizes[new_index] = alloc.sizes[i]; |
||
2117 | invalidate_live_intervals(); |
||
2118 | ++new_index; |
||
2119 | } |
||
2120 | } |
||
2121 | |||
2122 | this->alloc.count = new_index; |
||
2123 | |||
2124 | /* Patch all the instructions to use the newly renumbered registers */ |
||
2125 | foreach_block_and_inst(block, fs_inst, inst, cfg) { |
||
2126 | if (inst->dst.file == GRF) |
||
2127 | inst->dst.reg = remap_table[inst->dst.reg]; |
||
2128 | |||
2129 | for (int i = 0; i < inst->sources; i++) { |
||
2130 | if (inst->src[i].file == GRF) |
||
2131 | inst->src[i].reg = remap_table[inst->src[i].reg]; |
||
2132 | } |
||
2133 | } |
||
2134 | |||
2135 | /* Patch all the references to delta_xy, since they're used in register |
||
2136 | * allocation. If they're unused, switch them to BAD_FILE so we don't |
||
2137 | * think some random VGRF is delta_xy. |
||
2138 | */ |
||
2139 | for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) { |
||
2140 | if (delta_xy[i].file == GRF) { |
||
2141 | if (remap_table[delta_xy[i].reg] != -1) { |
||
2142 | delta_xy[i].reg = remap_table[delta_xy[i].reg]; |
||
2143 | } else { |
||
2144 | delta_xy[i].file = BAD_FILE; |
||
2145 | } |
||
2146 | } |
||
2147 | } |
||
2148 | |||
2149 | return progress; |
||
2150 | } |
||
2151 | |||
2152 | /* |
||
2153 | * Implements array access of uniforms by inserting a |
||
2154 | * PULL_CONSTANT_LOAD instruction. |
||
2155 | * |
||
2156 | * Unlike temporary GRF array access (where we don't support it due to |
||
2157 | * the difficulty of doing relative addressing on instruction |
||
2158 | * destinations), we could potentially do array access of uniforms |
||
2159 | * that were loaded in GRF space as push constants. In real-world |
||
2160 | * usage we've seen, though, the arrays being used are always larger |
||
2161 | * than we could load as push constants, so just always move all |
||
2162 | * uniform array access out to a pull constant buffer. |
||
2163 | */ |
||
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   /* Only the SIMD8 compile makes layout decisions; the SIMD16 compile
    * reuses the locations chosen here (see assign_constant_locations()).
    */
   if (dispatch_width != 8)
      return;

   /* Start with every uniform marked "not in the pull buffer" (-1). */
   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
   memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      for (int i = 0 ; i < inst->sources; i++) {
         /* reladdr set means an indirect (variable-index) uniform read. */
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const gl_constant_value **values = &stage_prog_data->param[uniform];

            assert(param_size[uniform]);

            /* Copy the whole array so any runtime index stays in bounds of
             * the pull buffer; record each element's pull location.
             */
            for (int j = 0; j < param_size[uniform]; j++) {
               pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;

               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
                  values[j];
            }
         }
      }
   }
}
||
2204 | |||
2205 | /** |
||
2206 | * Assign UNIFORM file registers to either push constants or pull constants. |
||
2207 | * |
||
2208 | * We allow a fragment shader to have more than the specified minimum |
||
2209 | * maximum number of fragment shader uniform components (64). If |
||
2210 | * there are too many of these, they'd fill up all of register space. |
||
2211 | * So, this will push some of them out to the pull constant buffer and |
||
2212 | * update the program to load them. |
||
2213 | */ |
||
void
fs_visitor::assign_constant_locations()
{
   /* Only the first compile (SIMD8 mode) gets to decide on locations. */
   if (dispatch_width != 8)
      return;

   /* Find which UNIFORM registers are still in use. */
   bool is_live[uniforms];
   for (unsigned int i = 0; i < uniforms; i++) {
      is_live[i] = false;
   }

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
         /* Out-of-range reads are simply ignored here; assign_curb_setup()
          * redirects them to push constant 0.
          */
         if (constant_nr >= 0 && constant_nr < (int) uniforms)
            is_live[constant_nr] = true;
      }
   }

   /* Only allow 16 registers (128 uniform components) as push constants.
    *
    * Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    *
    * If changing this value, note the limitation about total_regs in
    * brw_curbe.c.
    */
   unsigned int max_push_components = 16 * 8;
   unsigned int num_push_constants = 0;

   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);

   for (unsigned int i = 0; i < uniforms; i++) {
      if (!is_live[i] || pull_constant_loc[i] != -1) {
         /* This UNIFORM register is either dead, or has already been demoted
          * to a pull const.  Mark it as no longer living in the param[] array.
          */
         push_constant_loc[i] = -1;
         continue;
      }

      if (num_push_constants < max_push_components) {
         /* Retain as a push constant.  Record the location in the params[]
          * array.
          */
         push_constant_loc[i] = num_push_constants++;
      } else {
         /* Demote to a pull constant. */
         push_constant_loc[i] = -1;

         int pull_index = stage_prog_data->nr_pull_params++;
         stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
         pull_constant_loc[i] = pull_index;
      }
   }

   stage_prog_data->nr_params = num_push_constants;

   /* Up until now, the param[] array has been indexed by reg + reg_offset
    * of UNIFORM registers.  Condense it to only contain the uniforms we
    * chose to upload as push constants.
    */
   for (unsigned int i = 0; i < uniforms; i++) {
      int remapped = push_constant_loc[i];

      if (remapped == -1)
         continue;

      /* remapped <= i guarantees the in-place compaction below never
       * overwrites an entry that hasn't been consumed yet.
       */
      assert(remapped <= (int)i);
      stage_prog_data->param[remapped] = stage_prog_data->param[i];
   }
}
||
2291 | |||
2292 | /** |
||
2293 | * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD |
||
2294 | * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs. |
||
2295 | */ |
||
void
fs_visitor::demote_pull_constants()
{
   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index;
         unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
         if (location >= uniforms) /* Out of bounds access */
            pull_index = -1;
         else
            pull_index = pull_constant_loc[location];

         /* -1 means this uniform stayed a push constant; nothing to do. */
         if (pull_index == -1)
            continue;

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
         fs_reg dst = vgrf(glsl_type::float_type);

         /* Generate a pull load into dst. */
         if (inst->src[i].reladdr) {
            /* Indirect (variable-index) access: emit a varying-offset
             * pull load sequence.
             */
            exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
                                                        surf_index,
                                                        *inst->src[i].reladdr,
                                                        pull_index);
            inst->insert_before(block, &list);
            inst->src[i].reladdr = NULL;
         } else {
            /* Direct access: load the 16-byte-aligned vec4 containing the
             * constant, then smear the wanted component across the reg.
             */
            fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
            fs_inst *pull =
               new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
                                    dst, surf_index, offset);
            inst->insert_before(block, pull);
            inst->src[i].set_smear(pull_index & 3);
         }

         /* Rewrite the instruction to use the temporary VGRF. */
         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].width = dispatch_width;
      }
   }
   /* New instructions were inserted; cached liveness is stale. */
   invalidate_live_intervals();
}
||
2347 | |||
2348 | bool |
||
2349 | fs_visitor::opt_algebraic() |
||
2350 | { |
||
2351 | bool progress = false; |
||
2352 | |||
2353 | foreach_block_and_inst(block, fs_inst, inst, cfg) { |
||
2354 | switch (inst->opcode) { |
||
2355 | case BRW_OPCODE_MOV: |
||
2356 | if (inst->src[0].file != IMM) |
||
2357 | break; |
||
2358 | |||
2359 | if (inst->saturate) { |
||
2360 | if (inst->dst.type != inst->src[0].type) |
||
2361 | assert(!"unimplemented: saturate mixed types"); |
||
2362 | |||
2363 | if (brw_saturate_immediate(inst->dst.type, |
||
2364 | &inst->src[0].fixed_hw_reg)) { |
||
2365 | inst->saturate = false; |
||
2366 | progress = true; |
||
2367 | } |
||
2368 | } |
||
2369 | break; |
||
2370 | |||
2371 | case BRW_OPCODE_MUL: |
||
2372 | if (inst->src[1].file != IMM) |
||
2373 | continue; |
||
2374 | |||
2375 | /* a * 1.0 = a */ |
||
2376 | if (inst->src[1].is_one()) { |
||
2377 | inst->opcode = BRW_OPCODE_MOV; |
||
2378 | inst->src[1] = reg_undef; |
||
2379 | progress = true; |
||
2380 | break; |
||
2381 | } |
||
2382 | |||
2383 | /* a * -1.0 = -a */ |
||
2384 | if (inst->src[1].is_negative_one()) { |
||
2385 | inst->opcode = BRW_OPCODE_MOV; |
||
2386 | inst->src[0].negate = !inst->src[0].negate; |
||
2387 | inst->src[1] = reg_undef; |
||
2388 | progress = true; |
||
2389 | break; |
||
2390 | } |
||
2391 | |||
2392 | /* a * 0.0 = 0.0 */ |
||
2393 | if (inst->src[1].is_zero()) { |
||
2394 | inst->opcode = BRW_OPCODE_MOV; |
||
2395 | inst->src[0] = inst->src[1]; |
||
2396 | inst->src[1] = reg_undef; |
||
2397 | progress = true; |
||
2398 | break; |
||
2399 | } |
||
2400 | |||
2401 | if (inst->src[0].file == IMM) { |
||
2402 | assert(inst->src[0].type == BRW_REGISTER_TYPE_F); |
||
2403 | inst->opcode = BRW_OPCODE_MOV; |
||
2404 | inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f; |
||
2405 | inst->src[1] = reg_undef; |
||
2406 | progress = true; |
||
2407 | break; |
||
2408 | } |
||
2409 | break; |
||
2410 | case BRW_OPCODE_ADD: |
||
2411 | if (inst->src[1].file != IMM) |
||
2412 | continue; |
||
2413 | |||
2414 | /* a + 0.0 = a */ |
||
2415 | if (inst->src[1].is_zero()) { |
||
2416 | inst->opcode = BRW_OPCODE_MOV; |
||
2417 | inst->src[1] = reg_undef; |
||
2418 | progress = true; |
||
2419 | break; |
||
2420 | } |
||
2421 | |||
2422 | if (inst->src[0].file == IMM) { |
||
2423 | assert(inst->src[0].type == BRW_REGISTER_TYPE_F); |
||
2424 | inst->opcode = BRW_OPCODE_MOV; |
||
2425 | inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f; |
||
2426 | inst->src[1] = reg_undef; |
||
2427 | progress = true; |
||
2428 | break; |
||
2429 | } |
||
2430 | break; |
||
2431 | case BRW_OPCODE_OR: |
||
2432 | if (inst->src[0].equals(inst->src[1])) { |
||
2433 | inst->opcode = BRW_OPCODE_MOV; |
||
2434 | inst->src[1] = reg_undef; |
||
2435 | progress = true; |
||
2436 | break; |
||
2437 | } |
||
2438 | break; |
||
2439 | case BRW_OPCODE_LRP: |
||
2440 | if (inst->src[1].equals(inst->src[2])) { |
||
2441 | inst->opcode = BRW_OPCODE_MOV; |
||
2442 | inst->src[0] = inst->src[1]; |
||
2443 | inst->src[1] = reg_undef; |
||
2444 | inst->src[2] = reg_undef; |
||
2445 | progress = true; |
||
2446 | break; |
||
2447 | } |
||
2448 | break; |
||
2449 | case BRW_OPCODE_CMP: |
||
2450 | if (inst->conditional_mod == BRW_CONDITIONAL_GE && |
||
2451 | inst->src[0].abs && |
||
2452 | inst->src[0].negate && |
||
2453 | inst->src[1].is_zero()) { |
||
2454 | inst->src[0].abs = false; |
||
2455 | inst->src[0].negate = false; |
||
2456 | inst->conditional_mod = BRW_CONDITIONAL_Z; |
||
2457 | progress = true; |
||
2458 | break; |
||
2459 | } |
||
2460 | break; |
||
2461 | case BRW_OPCODE_SEL: |
||
2462 | if (inst->src[0].equals(inst->src[1])) { |
||
2463 | inst->opcode = BRW_OPCODE_MOV; |
||
2464 | inst->src[1] = reg_undef; |
||
2465 | inst->predicate = BRW_PREDICATE_NONE; |
||
2466 | inst->predicate_inverse = false; |
||
2467 | progress = true; |
||
2468 | } else if (inst->saturate && inst->src[1].file == IMM) { |
||
2469 | switch (inst->conditional_mod) { |
||
2470 | case BRW_CONDITIONAL_LE: |
||
2471 | case BRW_CONDITIONAL_L: |
||
2472 | switch (inst->src[1].type) { |
||
2473 | case BRW_REGISTER_TYPE_F: |
||
2474 | if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) { |
||
2475 | inst->opcode = BRW_OPCODE_MOV; |
||
2476 | inst->src[1] = reg_undef; |
||
2477 | inst->conditional_mod = BRW_CONDITIONAL_NONE; |
||
2478 | progress = true; |
||
2479 | } |
||
2480 | break; |
||
2481 | default: |
||
2482 | break; |
||
2483 | } |
||
2484 | break; |
||
2485 | case BRW_CONDITIONAL_GE: |
||
2486 | case BRW_CONDITIONAL_G: |
||
2487 | switch (inst->src[1].type) { |
||
2488 | case BRW_REGISTER_TYPE_F: |
||
2489 | if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) { |
||
2490 | inst->opcode = BRW_OPCODE_MOV; |
||
2491 | inst->src[1] = reg_undef; |
||
2492 | inst->conditional_mod = BRW_CONDITIONAL_NONE; |
||
2493 | progress = true; |
||
2494 | } |
||
2495 | break; |
||
2496 | default: |
||
2497 | break; |
||
2498 | } |
||
2499 | default: |
||
2500 | break; |
||
2501 | } |
||
2502 | } |
||
2503 | break; |
||
2504 | case BRW_OPCODE_MAD: |
||
2505 | if (inst->src[1].is_zero() || inst->src[2].is_zero()) { |
||
2506 | inst->opcode = BRW_OPCODE_MOV; |
||
2507 | inst->src[1] = reg_undef; |
||
2508 | inst->src[2] = reg_undef; |
||
2509 | progress = true; |
||
2510 | } else if (inst->src[0].is_zero()) { |
||
2511 | inst->opcode = BRW_OPCODE_MUL; |
||
2512 | inst->src[0] = inst->src[2]; |
||
2513 | inst->src[2] = reg_undef; |
||
2514 | progress = true; |
||
2515 | } else if (inst->src[1].is_one()) { |
||
2516 | inst->opcode = BRW_OPCODE_ADD; |
||
2517 | inst->src[1] = inst->src[2]; |
||
2518 | inst->src[2] = reg_undef; |
||
2519 | progress = true; |
||
2520 | } else if (inst->src[2].is_one()) { |
||
2521 | inst->opcode = BRW_OPCODE_ADD; |
||
2522 | inst->src[2] = reg_undef; |
||
2523 | progress = true; |
||
2524 | } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) { |
||
2525 | inst->opcode = BRW_OPCODE_ADD; |
||
2526 | inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f; |
||
2527 | inst->src[2] = reg_undef; |
||
2528 | progress = true; |
||
2529 | } |
||
2530 | break; |
||
2531 | case SHADER_OPCODE_RCP: { |
||
2532 | fs_inst *prev = (fs_inst *)inst->prev; |
||
2533 | if (prev->opcode == SHADER_OPCODE_SQRT) { |
||
2534 | if (inst->src[0].equals(prev->dst)) { |
||
2535 | inst->opcode = SHADER_OPCODE_RSQ; |
||
2536 | inst->src[0] = prev->src[0]; |
||
2537 | progress = true; |
||
2538 | } |
||
2539 | } |
||
2540 | break; |
||
2541 | } |
||
2542 | case SHADER_OPCODE_BROADCAST: |
||
2543 | if (is_uniform(inst->src[0])) { |
||
2544 | inst->opcode = BRW_OPCODE_MOV; |
||
2545 | inst->sources = 1; |
||
2546 | inst->force_writemask_all = true; |
||
2547 | progress = true; |
||
2548 | } else if (inst->src[1].file == IMM) { |
||
2549 | inst->opcode = BRW_OPCODE_MOV; |
||
2550 | inst->src[0] = component(inst->src[0], |
||
2551 | inst->src[1].fixed_hw_reg.dw1.ud); |
||
2552 | inst->sources = 1; |
||
2553 | inst->force_writemask_all = true; |
||
2554 | progress = true; |
||
2555 | } |
||
2556 | break; |
||
2557 | |||
2558 | default: |
||
2559 | break; |
||
2560 | } |
||
2561 | |||
2562 | /* Swap if src[0] is immediate. */ |
||
2563 | if (progress && inst->is_commutative()) { |
||
2564 | if (inst->src[0].file == IMM) { |
||
2565 | fs_reg tmp = inst->src[1]; |
||
2566 | inst->src[1] = inst->src[0]; |
||
2567 | inst->src[0] = tmp; |
||
2568 | } |
||
2569 | } |
||
2570 | } |
||
2571 | return progress; |
||
2572 | } |
||
2573 | |||
/**
 * Optimize sample messages that have constant zero values for the trailing
 * texture coordinates. We can just reduce the message length for these
 * instructions instead of reserving a register for it. Trailing parameters
 * that aren't sent default to zero anyway. This will cause the dead code
 * eliminator to remove the MOV instruction that would otherwise be emitted to
 * set up the zero value.
 *
 * Returns true if any message length was shrunk (live intervals are
 * invalidated in that case).
 */
bool
fs_visitor::opt_zero_samples()
{
   /* Gen4 infers the texturing opcode based on the message length so we can't
    * change it.
    */
   if (devinfo->gen < 5)
      return false;

   bool progress = false;

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (!inst->is_tex())
         continue;

      /* The payload setup is expected to be the instruction immediately
       * preceding the texturing SEND; if it isn't a LOAD_PAYLOAD we can't
       * reason about the individual parameters, so skip this instruction.
       */
      fs_inst *load_payload = (fs_inst *) inst->prev;

      if (load_payload->is_head_sentinel() ||
          load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       *     "Parameter 0 is required except for the sampleinfo message, which
       *      has no parameter 0"
       */
      /* Each trailing parameter occupies dispatch_width / 8 message registers.
       * The index expression maps the last parameter still included in the
       * message (mlen is in registers; header_size registers come first) back
       * to the corresponding LOAD_PAYLOAD source, and we keep trimming while
       * that source is a constant zero.
       */
      while (inst->mlen > inst->header_size + dispatch_width / 8 &&
             load_payload->src[(inst->mlen - inst->header_size) /
                               (dispatch_width / 8) +
                               inst->header_size - 1].is_zero()) {
         inst->mlen -= dispatch_width / 8;
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
||
2624 | |||
/**
 * Optimize sample messages which are followed by the final RT write.
 *
 * CHV, and GEN9+ can mark a texturing SEND instruction with EOT to have its
 * results sent directly to the framebuffer, bypassing the EU.  Recognize the
 * final texturing results copied to the framebuffer write payload and modify
 * them to write to the framebuffer directly.
 *
 * Returns true if the transformation was performed.
 */
bool
fs_visitor::opt_sampler_eot()
{
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   if (stage != MESA_SHADER_FRAGMENT)
      return false;

   /* Only Cherryview and Gen9+ hardware supports sampler EOT. */
   if (devinfo->gen < 9 && !devinfo->is_cherryview)
      return false;

   /* FINISHME: It should be possible to implement this optimization when there
    * are multiple drawbuffers.
    */
   if (key->nr_color_regions != 1)
      return false;

   /* Look for a texturing instruction immediately before the final FB_WRITE. */
   fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
   assert(fb_write->eot);
   assert(fb_write->opcode == FS_OPCODE_FB_WRITE);

   fs_inst *tex_inst = (fs_inst *) fb_write->prev;

   /* There wasn't one; nothing to do. */
   if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
      return false;

   /* This optimisation doesn't seem to work for textureGather for some
    * reason. I can't find any documentation or known workarounds to indicate
    * that this is expected, but considering that it is probably pretty
    * unlikely that a shader would directly write out the results from
    * textureGather we might as well just disable it.
    */
   if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
       tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
      return false;

   /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
    * It's very likely to be the previous instruction.
    */
   fs_inst *load_payload = (fs_inst *) tex_inst->prev;
   if (load_payload->is_head_sentinel() ||
       load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
      return false;

   assert(!tex_inst->eot); /* We can't get here twice */
   assert((tex_inst->offset & (0xff << 24)) == 0);

   /* Fold the render target index into the message offset, mark the texturing
    * SEND as end-of-thread, and discard its (now unused) destination.  The
    * FB_WRITE itself becomes dead and is removed from the final block.
    */
   tex_inst->offset |= fb_write->target << 24;
   tex_inst->eot = true;
   tex_inst->dst = reg_null_ud;
   fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);

   /* If a header is present, marking the eot is sufficient. Otherwise, we need
    * to create a new LOAD_PAYLOAD command with the same sources and a space
    * saved for the header. Using a new destination register not only makes sure
    * we have enough space, but it will make sure the dead code eliminator kills
    * the instruction that this will replace.
    */
   if (tex_inst->header_size != 0)
      return true;

   /* Allocate a payload one register larger than the original, with source 0
    * left as BAD_FILE to reserve room for the header.
    */
   fs_reg send_header = vgrf(load_payload->sources + 1);
   fs_reg *new_sources =
      ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);

   new_sources[0] = fs_reg();
   for (int i = 0; i < load_payload->sources; i++)
      new_sources[i+1] = load_payload->src[i];

   /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
    * requires a lot of information about the sources to appropriately figure
    * out the number of registers needed to be used. Given this stage in our
    * optimization, we may not have the appropriate GRFs required by
    * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
    * manually emit the instruction.
    */
   fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
                                                    load_payload->exec_size,
                                                    send_header,
                                                    new_sources,
                                                    load_payload->sources + 1);

   /* Account for the extra header register in both the payload and the
    * texturing message length.
    */
   new_load_payload->regs_written = load_payload->regs_written + 1;
   new_load_payload->header_size = 1;
   tex_inst->mlen++;
   tex_inst->header_size = 1;
   tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
   tex_inst->src[0] = send_header;

   return true;
}
||
2726 | |||
2727 | bool |
||
2728 | fs_visitor::opt_register_renaming() |
||
2729 | { |
||
2730 | bool progress = false; |
||
2731 | int depth = 0; |
||
2732 | |||
2733 | int remap[alloc.count]; |
||
2734 | memset(remap, -1, sizeof(int) * alloc.count); |
||
2735 | |||
2736 | foreach_block_and_inst(block, fs_inst, inst, cfg) { |
||
2737 | if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) { |
||
2738 | depth++; |
||
2739 | } else if (inst->opcode == BRW_OPCODE_ENDIF || |
||
2740 | inst->opcode == BRW_OPCODE_WHILE) { |
||
2741 | depth--; |
||
2742 | } |
||
2743 | |||
2744 | /* Rewrite instruction sources. */ |
||
2745 | for (int i = 0; i < inst->sources; i++) { |
||
2746 | if (inst->src[i].file == GRF && |
||
2747 | remap[inst->src[i].reg] != -1 && |
||
2748 | remap[inst->src[i].reg] != inst->src[i].reg) { |
||
2749 | inst->src[i].reg = remap[inst->src[i].reg]; |
||
2750 | progress = true; |
||
2751 | } |
||
2752 | } |
||
2753 | |||
2754 | const int dst = inst->dst.reg; |
||
2755 | |||
2756 | if (depth == 0 && |
||
2757 | inst->dst.file == GRF && |
||
2758 | alloc.sizes[inst->dst.reg] == inst->dst.width / 8 && |
||
2759 | !inst->is_partial_write()) { |
||
2760 | if (remap[dst] == -1) { |
||
2761 | remap[dst] = dst; |
||
2762 | } else { |
||
2763 | remap[dst] = alloc.allocate(inst->dst.width / 8); |
||
2764 | inst->dst.reg = remap[dst]; |
||
2765 | progress = true; |
||
2766 | } |
||
2767 | } else if (inst->dst.file == GRF && |
||
2768 | remap[dst] != -1 && |
||
2769 | remap[dst] != dst) { |
||
2770 | inst->dst.reg = remap[dst]; |
||
2771 | progress = true; |
||
2772 | } |
||
2773 | } |
||
2774 | |||
2775 | if (progress) { |
||
2776 | invalidate_live_intervals(); |
||
2777 | |||
2778 | for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) { |
||
2779 | if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) { |
||
2780 | delta_xy[i].reg = remap[delta_xy[i].reg]; |
||
2781 | } |
||
2782 | } |
||
2783 | } |
||
2784 | |||
2785 | return progress; |
||
2786 | } |
||
2787 | |||
2788 | /** |
||
2789 | * Remove redundant or useless discard jumps. |
||
2790 | * |
||
2791 | * For example, we can eliminate jumps in the following sequence: |
||
2792 | * |
||
2793 | * discard-jump (redundant with the next jump) |
||
2794 | * discard-jump (useless; jumps to the next instruction) |
||
2795 | * placeholder-halt |
||
2796 | */ |
||
2797 | bool |
||
2798 | fs_visitor::opt_redundant_discard_jumps() |
||
2799 | { |
||
2800 | bool progress = false; |
||
2801 | |||
2802 | bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1]; |
||
2803 | |||
2804 | fs_inst *placeholder_halt = NULL; |
||
2805 | foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) { |
||
2806 | if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) { |
||
2807 | placeholder_halt = inst; |
||
2808 | break; |
||
2809 | } |
||
2810 | } |
||
2811 | |||
2812 | if (!placeholder_halt) |
||
2813 | return false; |
||
2814 | |||
2815 | /* Delete any HALTs immediately before the placeholder halt. */ |
||
2816 | for (fs_inst *prev = (fs_inst *) placeholder_halt->prev; |
||
2817 | !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP; |
||
2818 | prev = (fs_inst *) placeholder_halt->prev) { |
||
2819 | prev->remove(last_bblock); |
||
2820 | progress = true; |
||
2821 | } |
||
2822 | |||
2823 | if (progress) |
||
2824 | invalidate_live_intervals(); |
||
2825 | |||
2826 | return progress; |
||
2827 | } |
||
2828 | |||
/**
 * Try to turn MOVs from a GRF into an MRF into direct computation into the
 * MRF: when the instruction that produced the GRF value can legally target
 * an MRF, retarget it and delete the copy.  Only relevant on Gen < 7, which
 * still has message registers.
 *
 * Returns true if any MOV was coalesced away.
 */
bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   /* No MRFs on Gen >= 7. */
   if (devinfo->gen >= 7)
      return false;

   calculate_live_intervals();

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      /* ip numbers instructions in walk order, matching the numbering used
       * by virtual_grf_end below.
       */
      int ip = next_ip;
      next_ip++;

      /* Only a raw, full-width GRF-to-MRF MOV with no type conversion or
       * source modifiers can be coalesced this way.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->dst.file != MRF || inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate ||
          !inst->src[0].is_contiguous() ||
          inst->src[0].subreg_offset)
         continue;

      /* Work out which hardware MRF registers are written by this
       * instruction.
       */
      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
      int mrf_high;
      if (inst->dst.reg & BRW_MRF_COMPR4) {
         /* COMPR4 writes registers m and m+4 — track the whole span. */
         mrf_high = mrf_low + 4;
      } else if (inst->exec_size == 16) {
         /* A SIMD16 write covers a register pair. */
         mrf_high = mrf_low + 1;
      } else {
         mrf_high = mrf_low;
      }

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_end[inst->src[0].reg] > ip)
         continue;

      /* Found a move of a GRF to a MRF.  Let's see if we can go
       * rewrite the thing that made this GRF to write into the MRF.
       */
      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg) {
            /* Found the last thing to write our reg we want to turn
             * into a compute-to-MRF.
             */

            /* If this one instruction didn't populate all the
             * channels, bail.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking to delay the rewriting until complete success.
             */
            if (scan_inst->is_partial_write())
               break;

            /* Things returning more than one register would need us to
             * understand coalescing out more than one MOV at a time.
             */
            if (scan_inst->regs_written > scan_inst->dst.width / 8)
               break;

            /* SEND instructions can't have MRF as a destination. */
            if (scan_inst->mlen)
               break;

            if (devinfo->gen == 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               /* Found the creator of our MRF's source value.  Retarget it
                * at the MRF, fold in the MOV's saturate, and drop the MOV.
                */
               scan_inst->dst.file = MRF;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->saturate |= inst->saturate;
               inst->remove(block);
               progress = true;
            }
            /* Whether we rewrote it or not, this was the defining write, so
             * the backward scan is done.
             */
            break;
         }

         /* We don't handle control flow here.  Most computation of
          * values that end up in MRFs are shortly before the MRF
          * write anyway.
          */
         if (block->start() == scan_inst)
            break;

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < scan_inst->sources; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (scan_inst->dst.file == MRF) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
            int scan_mrf_high;

            if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
               scan_mrf_high = scan_mrf_low + 4;
            } else if (scan_inst->exec_size == 16) {
               scan_mrf_high = scan_mrf_low + 1;
            } else {
               scan_mrf_high = scan_mrf_low;
            }

            if (mrf_low == scan_mrf_low ||
                mrf_low == scan_mrf_high ||
                mrf_high == scan_mrf_low ||
                mrf_high == scan_mrf_high) {
               break;
            }
         }

         if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            if (mrf_low >= scan_inst->base_mrf &&
                mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
            if (mrf_high >= scan_inst->base_mrf &&
                mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
||
2988 | |||
2989 | /** |
||
2990 | * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control |
||
2991 | * flow. We could probably do better here with some form of divergence |
||
2992 | * analysis. |
||
2993 | */ |
||
2994 | bool |
||
2995 | fs_visitor::eliminate_find_live_channel() |
||
2996 | { |
||
2997 | bool progress = false; |
||
2998 | unsigned depth = 0; |
||
2999 | |||
3000 | foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { |
||
3001 | switch (inst->opcode) { |
||
3002 | case BRW_OPCODE_IF: |
||
3003 | case BRW_OPCODE_DO: |
||
3004 | depth++; |
||
3005 | break; |
||
3006 | |||
3007 | case BRW_OPCODE_ENDIF: |
||
3008 | case BRW_OPCODE_WHILE: |
||
3009 | depth--; |
||
3010 | break; |
||
3011 | |||
3012 | case FS_OPCODE_DISCARD_JUMP: |
||
3013 | /* This can potentially make control flow non-uniform until the end |
||
3014 | * of the program. |
||
3015 | */ |
||
3016 | return progress; |
||
3017 | |||
3018 | case SHADER_OPCODE_FIND_LIVE_CHANNEL: |
||
3019 | if (depth == 0) { |
||
3020 | inst->opcode = BRW_OPCODE_MOV; |
||
3021 | inst->src[0] = fs_reg(0); |
||
3022 | inst->sources = 1; |
||
3023 | inst->force_writemask_all = true; |
||
3024 | progress = true; |
||
3025 | } |
||
3026 | break; |
||
3027 | |||
3028 | default: |
||
3029 | break; |
||
3030 | } |
||
3031 | } |
||
3032 | |||
3033 | return progress; |
||
3034 | } |
||
3035 | |||
/**
 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
 * instructions to FS_OPCODE_REP_FB_WRITE.
 */
void
fs_visitor::emit_repclear_shader()
{
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   /* MRF layout: base_mrf holds the (optional) message header, the clear
    * color goes two registers above it.
    */
   int base_mrf = 1;
   int color_mrf = base_mrf + 2;

   /* Copy the clear color (uniform 0) into the message payload.  The source
    * is patched to a fixed GRF after uniform assignment below.
    */
   fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
                           fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
   mov->force_writemask_all = true;

   fs_inst *write;
   if (key->nr_color_regions == 1) {
      /* Single render target: headerless replicated-data write, payload is
       * just the one color register.
       */
      write = emit(FS_OPCODE_REP_FB_WRITE);
      write->saturate = key->clamp_fragment_color;
      write->base_mrf = color_mrf;
      write->target = 0;
      write->header_size = 0;
      write->mlen = 1;
   } else {
      assume(key->nr_color_regions > 0);
      /* One write per render target; each needs a 2-register header before
       * the color, hence header_size 2 and mlen 3 starting at base_mrf.
       */
      for (int i = 0; i < key->nr_color_regions; ++i) {
         write = emit(FS_OPCODE_REP_FB_WRITE);
         write->saturate = key->clamp_fragment_color;
         write->base_mrf = base_mrf;
         write->target = i;
         write->header_size = 2;
         write->mlen = 3;
      }
   }
   /* Only the last write terminates the thread. */
   write->eot = true;

   calculate_cfg();

   assign_constant_locations();
   assign_curb_setup();

   /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
   assert(mov->src[0].file == HW_REG);
   mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
}
||
3081 | |||
/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 *
 * Returns true if any duplicate MOV-to-MRF was deleted.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   /* Tracks, per MRF, the last full MOV from a GRF that wrote it (NULL when
    * unknown or invalidated).
    */
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      /* Control flow invalidates everything: the tracked writes may not
       * dominate the instructions that follow.
       */
      if (inst->is_control_flow()) {
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
      }

      /* If this MOV writes the exact same value to the same MRF as the
       * tracked previous write, it is redundant — delete it.
       */
      if (inst->opcode == BRW_OPCODE_MOV &&
	  inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
	 if (prev_inst && inst->equals(prev_inst)) {
	    inst->remove(block);
	    progress = true;
	    continue;
	 }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
	 /* Found a SEND instruction, which will include two or fewer
	  * implied MRF writes.  We could do better here.
	  */
	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
	    last_mrf_move[inst->base_mrf + i] = NULL;
	 }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
	 for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
	    if (last_mrf_move[i] &&
		last_mrf_move[i]->src[0].reg == inst->dst.reg) {
	       last_mrf_move[i] = NULL;
	    }
	 }
      }

      /* Record this write as a candidate: only full GRF-to-MRF MOVs are
       * eligible for deduplication.
       */
      if (inst->opcode == BRW_OPCODE_MOV &&
	  inst->dst.file == MRF &&
	  inst->src[0].file == GRF &&
	  !inst->is_partial_write()) {
	 last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
||
3150 | |||
3151 | static void |
||
3152 | clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len) |
||
3153 | { |
||
3154 | /* Clear the flag for registers that actually got read (as expected). */ |
||
3155 | for (int i = 0; i < inst->sources; i++) { |
||
3156 | int grf; |
||
3157 | if (inst->src[i].file == GRF) { |
||
3158 | grf = inst->src[i].reg; |
||
3159 | } else if (inst->src[i].file == HW_REG && |
||
3160 | inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) { |
||
3161 | grf = inst->src[i].fixed_hw_reg.nr; |
||
3162 | } else { |
||
3163 | continue; |
||
3164 | } |
||
3165 | |||
3166 | if (grf >= first_grf && |
||
3167 | grf < first_grf + grf_len) { |
||
3168 | deps[grf - first_grf] = false; |
||
3169 | if (inst->exec_size == 16) |
||
3170 | deps[grf - first_grf + 1] = false; |
||
3171 | } |
||
3172 | } |
||
3173 | } |
||
3174 | |||
/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
 *      check for post destination dependencies on this instruction, software
 *      must ensure that there is no destination hazard for the case of ‘write
 *      followed by a posted write’ shown in the following example.
 *
 *      1. mov r3 0
 *      2. send r3.xy <rest of send instruction>
 *      3. mov r2 r3
 *
 *      Due to no post-destination dependency check on the ‘send’, the above
 *      code sequence could have two instructions (1 and 2) in flight at the
 *      same time that both consider ‘r3’ as the target of their final writes.
 */
void
fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
                                                        fs_inst *inst)
{
   int write_len = inst->regs_written;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   /* Start with every destination register flagged as a potential hazard. */
   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);

   clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);

   /* Walk backwards looking for writes to registers we're writing which
    * aren't read since being written.  If we hit the start of the program,
    * we assume that there are no outstanding dependencies on entry to the
    * program.
    */
   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
      /* If we hit control flow, assume that there *are* outstanding
       * dependencies, and force their cleanup before our instruction.
       */
      if (block->start() == scan_inst) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i]) {
               inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
            }
         }
         return;
      }

      /* We insert our reads as late as possible on the assumption that any
       * instruction but a MOV that might have left us an outstanding
       * dependency has more latency than a MOV.
       */
      if (scan_inst->dst.file == GRF) {
         for (int i = 0; i < scan_inst->regs_written; i++) {
            int reg = scan_inst->dst.reg + i;

            /* A prior write to one of our destination registers that hasn't
             * been read since: insert a dependency-resolving MOV read of it
             * just before our instruction.
             */
            if (reg >= first_write_grf &&
                reg < first_write_grf + write_len &&
                needs_dep[reg - first_write_grf]) {
               inst->insert_before(block, DEP_RESOLVE_MOV(reg));
               needs_dep[reg - first_write_grf] = false;
               /* A SIMD16 write covers the next register too. */
               if (scan_inst->exec_size == 16)
                  needs_dep[reg - first_write_grf + 1] = false;
            }
         }
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}
||
3255 | |||
/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Errata: A destination register from a send can not be
 *      used as a destination register until after it has been sourced by an
 *      instruction with a different destination register.
 */
void
fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
{
   int write_len = inst->regs_written;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   /* Start with every SEND destination register flagged as needing to be
    * sourced before it may be overwritten.
    */
   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);
   /* Walk forwards looking for writes to registers we're writing which aren't
    * read before being written.
    */
   foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
      /* If we hit control flow, force resolve all remaining dependencies. */
      if (block->end() == scan_inst) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               scan_inst->insert_before(block,
                                        DEP_RESOLVE_MOV(first_write_grf + i));
         }
         return;
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);

      /* We insert our reads as late as possible since they're reading the
       * result of a SEND, which has massive latency.
       */
      if (scan_inst->dst.file == GRF &&
          scan_inst->dst.reg >= first_write_grf &&
          scan_inst->dst.reg < first_write_grf + write_len &&
          needs_dep[scan_inst->dst.reg - first_write_grf]) {
         /* About to overwrite a SEND destination that was never sourced:
          * insert a resolving MOV read of it first.
          */
         scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
      }

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}
||
3311 | |||
3312 | void |
||
3313 | fs_visitor::insert_gen4_send_dependency_workarounds() |
||
3314 | { |
||
3315 | if (devinfo->gen != 4 || devinfo->is_g4x) |
||
3316 | return; |
||
3317 | |||
3318 | bool progress = false; |
||
3319 | |||
3320 | /* Note that we're done with register allocation, so GRF fs_regs always |
||
3321 | * have a .reg_offset of 0. |
||
3322 | */ |
||
3323 | |||
3324 | foreach_block_and_inst(block, fs_inst, inst, cfg) { |
||
3325 | if (inst->mlen != 0 && inst->dst.file == GRF) { |
||
3326 | insert_gen4_pre_send_dependency_workarounds(block, inst); |
||
3327 | insert_gen4_post_send_dependency_workarounds(block, inst); |
||
3328 | progress = true; |
||
3329 | } |
||
3330 | } |
||
3331 | |||
3332 | if (progress) |
||
3333 | invalidate_live_intervals(); |
||
3334 | } |
||
3335 | |||
/**
 * Turns the generic expression-style uniform pull constant load instruction
 * into a hardware-specific series of instructions for loading a pull
 * constant.
 *
 * The expression style allows the CSE pass before this to optimize out
 * repeated loads from the same offset, and gives the pre-register-allocation
 * scheduling full flexibility, while the conversion to native instructions
 * allows the post-register-allocation scheduler the best information
 * possible.
 *
 * Note that execution masking for setting up pull constant loads is special:
 * the channels that need to be written are unrelated to the current execution
 * mask, since a later instruction will use one of the result channels as a
 * source operand for all 8 or 16 of its channels.
 */
void
fs_visitor::lower_uniform_pull_constant_loads()
{
   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
         continue;

      if (devinfo->gen >= 7) {
         /* The offset arg before was a vec4-aligned byte offset. We need to
          * turn it into a dword offset.
          */
         fs_reg const_offset_reg = inst->src[1];
         assert(const_offset_reg.file == IMM &&
                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
         const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
         /* Allocate a fresh register to hold the message payload. */
         fs_reg payload = fs_reg(GRF, alloc.allocate(1));

         /* We have to use a message header on Skylake to get SIMD4x2 mode.
          * Reserve space for the register.
          */
         if (devinfo->gen >= 9) {
            payload.reg_offset++;
            alloc.sizes[payload.reg] = 2;
         }

         /* This is actually going to be a MOV, but since only the first dword
          * is accessed, we have a special opcode to do just that one. Note
          * that this needs to be an operation that will be considered a def
          * by live variable analysis, or register allocation will explode.
          */
         fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
                                               8, payload, const_offset_reg);
         setup->force_writemask_all = true;

         /* Keep debug annotations pointing at the original IR. */
         setup->ir = inst->ir;
         setup->annotation = inst->annotation;
         inst->insert_before(block, setup);

         /* Similarly, this will only populate the first 4 channels of the
          * result register (since we only use smear values from 0-3), but we
          * don't tell the optimizer.
          */
         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
         inst->src[1] = payload;

         /* A new register was allocated and a new instruction inserted, so
          * previously computed live intervals are stale.
          */
         invalidate_live_intervals();
      } else {
         /* Before register allocation, we didn't tell the scheduler about the
          * MRF we use.  We know it's safe to use this MRF because nothing
          * else does except for register spill/unspill, which generates and
          * uses its MRF within a single IR instruction.
          */
         inst->base_mrf = 14;
         inst->mlen = 1;
      }
   }
}
||
3409 | |||
/**
 * Lower SHADER_OPCODE_LOAD_PAYLOAD into a sequence of plain MOVs that copy
 * each source into consecutive destination registers, then delete the
 * LOAD_PAYLOAD.  Header sources are copied SIMD8 with the writemask forced;
 * COMPR4 MRF destinations on gen <= 5 get special interleaved handling.
 */
bool
fs_visitor::lower_load_payload()
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      assert(inst->dst.file == MRF || inst->dst.file == GRF);
      assert(inst->saturate == false);

      fs_reg dst = inst->dst;

      /* Get rid of COMPR4.  We'll add it back in if we need it */
      if (dst.file == MRF)
         dst.reg = dst.reg & ~BRW_MRF_COMPR4;

      /* Header sources are always copied as a single SIMD8 register with
       * force_writemask_all, regardless of the instruction's exec size.
       */
      dst.width = 8;
      for (uint8_t i = 0; i < inst->header_size; i++) {
         if (inst->src[i].file != BAD_FILE) {
            fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
            fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
            mov_src.width = 8;
            fs_inst *mov = MOV(mov_dst, mov_src);
            mov->force_writemask_all = true;
            inst->insert_before(block, mov);
         }
         dst = offset(dst, 1);
      }

      dst.width = inst->exec_size;
      if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
          inst->exec_size > 8) {
         /* In this case, the payload portion of the LOAD_PAYLOAD isn't
          * a straightforward copy.  Instead, the result of the
          * LOAD_PAYLOAD is treated as interleaved and the first four
          * non-header sources are unpacked as:
          *
          * m + 0: r0
          * m + 1: g0
          * m + 2: b0
          * m + 3: a0
          * m + 4: r1
          * m + 5: g1
          * m + 6: b1
          * m + 7: a1
          *
          * This is used for gen <= 5 fb writes.
          */
         assert(inst->exec_size == 16);
         assert(inst->header_size + 4 <= inst->sources);
         for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
            if (inst->src[i].file != BAD_FILE) {
               if (devinfo->has_compr4) {
                  /* One compressed MOV with the COMPR4 bit set writes both
                   * halves of the interleaved pair at once.
                   */
                  fs_reg compr4_dst = retype(dst, inst->src[i].type);
                  compr4_dst.reg |= BRW_MRF_COMPR4;

                  fs_inst *mov = MOV(compr4_dst, inst->src[i]);
                  mov->force_writemask_all = inst->force_writemask_all;
                  inst->insert_before(block, mov);
               } else {
                  /* Platform doesn't have COMPR4.  We have to fake it */
                  fs_reg mov_dst = retype(dst, inst->src[i].type);
                  mov_dst.width = 8;

                  fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
                  mov->force_writemask_all = inst->force_writemask_all;
                  inst->insert_before(block, mov);

                  mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
                  mov->force_writemask_all = inst->force_writemask_all;
                  mov->force_sechalf = true;
                  inst->insert_before(block, mov);
               }
            }

            dst.reg++;
         }

         /* The loop above only ever incremented us through the first set
          * of 4 registers.  However, thanks to the magic of COMPR4, we
          * actually wrote to the first 8 registers, so we need to take
          * that into account now.
          */
         dst.reg += 4;

         /* The COMPR4 code took care of the first 4 sources.  We'll let
          * the regular path handle any remaining sources.  Yes, we are
          * modifying the instruction but we're about to delete it so
          * this really doesn't hurt anything.
          */
         inst->header_size += 4;
      }

      /* Straightforward path: one full-width MOV per remaining source. */
      for (uint8_t i = inst->header_size; i < inst->sources; i++) {
         if (inst->src[i].file != BAD_FILE) {
            fs_inst *mov = MOV(retype(dst, inst->src[i].type),
                               inst->src[i]);
            mov->force_writemask_all = inst->force_writemask_all;
            mov->force_sechalf = inst->force_sechalf;
            inst->insert_before(block, mov);
         }
         dst = offset(dst, 1);
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
||
3525 | |||
3526 | bool |
||
3527 | fs_visitor::lower_integer_multiplication() |
||
3528 | { |
||
3529 | bool progress = false; |
||
3530 | |||
3531 | /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation |
||
3532 | * directly, but Cherryview cannot. |
||
3533 | */ |
||
3534 | if (devinfo->gen >= 8 && !devinfo->is_cherryview) |
||
3535 | return false; |
||
3536 | |||
3537 | foreach_block_and_inst_safe(block, fs_inst, inst, cfg) { |
||
3538 | if (inst->opcode != BRW_OPCODE_MUL || |
||
3539 | inst->dst.is_accumulator() || |
||
3540 | (inst->dst.type != BRW_REGISTER_TYPE_D && |
||
3541 | inst->dst.type != BRW_REGISTER_TYPE_UD)) |
||
3542 | continue; |
||
3543 | |||
3544 | #define insert(instr) inst->insert_before(block, instr) |
||
3545 | |||
3546 | /* The MUL instruction isn't commutative. On Gen <= 6, only the low |
||
3547 | * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of |
||
3548 | * src1 are used. |
||
3549 | * |
||
3550 | * If multiplying by an immediate value that fits in 16-bits, do a |
||
3551 | * single MUL instruction with that value in the proper location. |
||
3552 | */ |
||
3553 | if (inst->src[1].file == IMM && |
||
3554 | inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) { |
||
3555 | if (devinfo->gen < 7) { |
||
3556 | fs_reg imm(GRF, alloc.allocate(dispatch_width / 8), |
||
3557 | inst->dst.type, dispatch_width); |
||
3558 | insert(MOV(imm, inst->src[1])); |
||
3559 | insert(MUL(inst->dst, imm, inst->src[0])); |
||
3560 | } else { |
||
3561 | insert(MUL(inst->dst, inst->src[0], inst->src[1])); |
||
3562 | } |
||
3563 | } else { |
||
3564 | /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot |
||
3565 | * do 32-bit integer multiplication in one instruction, but instead |
||
3566 | * must do a sequence (which actually calculates a 64-bit result): |
||
3567 | * |
||
3568 | * mul(8) acc0<1>D g3<8,8,1>D g4<8,8,1>D |
||
3569 | * mach(8) null g3<8,8,1>D g4<8,8,1>D |
||
3570 | * mov(8) g2<1>D acc0<8,8,1>D |
||
3571 | * |
||
3572 | * But on Gen > 6, the ability to use second accumulator register |
||
3573 | * (acc1) for non-float data types was removed, preventing a simple |
||
3574 | * implementation in SIMD16. A 16-channel result can be calculated by |
||
3575 | * executing the three instructions twice in SIMD8, once with quarter |
||
3576 | * control of 1Q for the first eight channels and again with 2Q for |
||
3577 | * the second eight channels. |
||
3578 | * |
||
3579 | * Which accumulator register is implicitly accessed (by AccWrEnable |
||
3580 | * for instance) is determined by the quarter control. Unfortunately |
||
3581 | * Ivybridge (and presumably Baytrail) has a hardware bug in which an |
||
3582 | * implicit accumulator access by an instruction with 2Q will access |
||
3583 | * acc1 regardless of whether the data type is usable in acc1. |
||
3584 | * |
||
3585 | * Specifically, the 2Q mach(8) writes acc1 which does not exist for |
||
3586 | * integer data types. |
||
3587 | * |
||
3588 | * Since we only want the low 32-bits of the result, we can do two |
||
3589 | * 32-bit x 16-bit multiplies (like the mul and mach are doing), and |
||
3590 | * adjust the high result and add them (like the mach is doing): |
||
3591 | * |
||
3592 | * mul(8) g7<1>D g3<8,8,1>D g4.0<8,8,1>UW |
||
3593 | * mul(8) g8<1>D g3<8,8,1>D g4.1<8,8,1>UW |
||
3594 | * shl(8) g9<1>D g8<8,8,1>D 16D |
||
3595 | * add(8) g2<1>D g7<8,8,1>D g8<8,8,1>D |
||
3596 | * |
||
3597 | * We avoid the shl instruction by realizing that we only want to add |
||
3598 | * the low 16-bits of the "high" result to the high 16-bits of the |
||
3599 | * "low" result and using proper regioning on the add: |
||
3600 | * |
||
3601 | * mul(8) g7<1>D g3<8,8,1>D g4.0<16,8,2>UW |
||
3602 | * mul(8) g8<1>D g3<8,8,1>D g4.1<16,8,2>UW |
||
3603 | * add(8) g7.1<2>UW g7.1<16,8,2>UW g8<16,8,2>UW |
||
3604 | * |
||
3605 | * Since it does not use the (single) accumulator register, we can |
||
3606 | * schedule multi-component multiplications much better. |
||
3607 | */ |
||
3608 | |||
3609 | fs_reg low = inst->dst; |
||
3610 | fs_reg high(GRF, alloc.allocate(dispatch_width / 8), |
||
3611 | inst->dst.type, dispatch_width); |
||
3612 | |||
3613 | if (brw->gen >= 7) { |
||
3614 | fs_reg src1_0_w = inst->src[1]; |
||
3615 | fs_reg src1_1_w = inst->src[1]; |
||
3616 | |||
3617 | if (inst->src[1].file == IMM) { |
||
3618 | src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff; |
||
3619 | src1_1_w.fixed_hw_reg.dw1.ud >>= 16; |
||
3620 | } else { |
||
3621 | src1_0_w.type = BRW_REGISTER_TYPE_UW; |
||
3622 | src1_0_w.stride = 2; |
||
3623 | |||
3624 | src1_1_w.type = BRW_REGISTER_TYPE_UW; |
||
3625 | src1_1_w.stride = 2; |
||
3626 | src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW); |
||
3627 | } |
||
3628 | insert(MUL(low, inst->src[0], src1_0_w)); |
||
3629 | insert(MUL(high, inst->src[0], src1_1_w)); |
||
3630 | } else { |
||
3631 | fs_reg src0_0_w = inst->src[0]; |
||
3632 | fs_reg src0_1_w = inst->src[0]; |
||
3633 | |||
3634 | src0_0_w.type = BRW_REGISTER_TYPE_UW; |
||
3635 | src0_0_w.stride = 2; |
||
3636 | |||
3637 | src0_1_w.type = BRW_REGISTER_TYPE_UW; |
||
3638 | src0_1_w.stride = 2; |
||
3639 | src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW); |
||
3640 | |||
3641 | insert(MUL(low, src0_0_w, inst->src[1])); |
||
3642 | insert(MUL(high, src0_1_w, inst->src[1])); |
||
3643 | } |
||
3644 | |||
3645 | fs_reg dst = inst->dst; |
||
3646 | dst.type = BRW_REGISTER_TYPE_UW; |
||
3647 | dst.subreg_offset = 2; |
||
3648 | dst.stride = 2; |
||
3649 | |||
3650 | high.type = BRW_REGISTER_TYPE_UW; |
||
3651 | high.stride = 2; |
||
3652 | |||
3653 | low.type = BRW_REGISTER_TYPE_UW; |
||
3654 | low.subreg_offset = 2; |
||
3655 | low.stride = 2; |
||
3656 | |||
3657 | insert(ADD(dst, low, high)); |
||
3658 | } |
||
3659 | #undef insert |
||
3660 | |||
3661 | inst->remove(block); |
||
3662 | progress = true; |
||
3663 | } |
||
3664 | |||
3665 | if (progress) |
||
3666 | invalidate_live_intervals(); |
||
3667 | |||
3668 | return progress; |
||
3669 | } |
||
3670 | |||
/** Dump all instructions to stderr (convenience overload with no file). */
void
fs_visitor::dump_instructions()
{
   dump_instructions(NULL);
}
||
3676 | |||
3677 | void |
||
3678 | fs_visitor::dump_instructions(const char *name) |
||
3679 | { |
||
3680 | FILE *file = stderr; |
||
3681 | if (name && geteuid() != 0) { |
||
3682 | file = fopen(name, "w"); |
||
3683 | if (!file) |
||
3684 | file = stderr; |
||
3685 | } |
||
3686 | |||
3687 | if (cfg) { |
||
3688 | calculate_register_pressure(); |
||
3689 | int ip = 0, max_pressure = 0; |
||
3690 | foreach_block_and_inst(block, backend_instruction, inst, cfg) { |
||
3691 | max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]); |
||
3692 | fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip); |
||
3693 | dump_instruction(inst, file); |
||
3694 | ip++; |
||
3695 | } |
||
3696 | fprintf(file, "Maximum %3d registers live at once.\n", max_pressure); |
||
3697 | } else { |
||
3698 | int ip = 0; |
||
3699 | foreach_in_list(backend_instruction, inst, &instructions) { |
||
3700 | fprintf(file, "%4d: ", ip++); |
||
3701 | dump_instruction(inst, file); |
||
3702 | } |
||
3703 | } |
||
3704 | |||
3705 | if (file != stderr) { |
||
3706 | fclose(file); |
||
3707 | } |
||
3708 | } |
||
3709 | |||
/** Dump a single instruction to stderr (convenience overload). */
void
fs_visitor::dump_instruction(backend_instruction *be_inst)
{
   dump_instruction(be_inst, stderr);
}
||
3715 | |||
/**
 * Print one instruction in human-readable form: predicate, opcode with
 * saturate/conditional-mod decorations, exec size, destination, then each
 * source with its register file, modifiers, and type.
 */
void
fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
{
   fs_inst *inst = (fs_inst *)be_inst;

   /* Predication prefix, e.g. "(+f0.1) " or "(-f0.0) ". */
   if (inst->predicate) {
      fprintf(file, "(%cf0.%d) ",
              inst->predicate_inverse ? '-' : '+',
              inst->flag_subreg);
   }

   fprintf(file, "%s", brw_instruction_name(inst->opcode));
   if (inst->saturate)
      fprintf(file, ".sat");
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      /* Only show the flag register when it's not implied by predication or
       * by the SEL/IF/WHILE encodings on gen >= 5.
       */
      if (!inst->predicate &&
          (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                                inst->opcode != BRW_OPCODE_IF &&
                                inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f0.%d", inst->flag_subreg);
      }
   }
   fprintf(file, "(%d) ", inst->exec_size);


   /* Destination operand. */
   switch (inst->dst.file) {
   case GRF:
      fprintf(file, "vgrf%d", inst->dst.reg);
      if (inst->dst.width != dispatch_width)
         fprintf(file, "@%d", inst->dst.width);
      if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
          inst->dst.subreg_offset)
         fprintf(file, "+%d.%d",
                 inst->dst.reg_offset, inst->dst.subreg_offset);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.reg);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   case UNIFORM:
      /* Uniforms/attrs are invalid as destinations; flag them loudly. */
      fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
      break;
   case ATTR:
      fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
      break;
   case HW_REG:
      if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         switch (inst->dst.fixed_hw_reg.nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                    inst->dst.fixed_hw_reg.subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                    inst->dst.fixed_hw_reg.subnr);
            break;
         }
      } else {
         fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
      }
      if (inst->dst.fixed_hw_reg.subnr)
         fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
      break;
   default:
      fprintf(file, "???");
      break;
   }
   fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));

   /* Source operands. */
   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case GRF:
         fprintf(file, "vgrf%d", inst->src[i].reg);
         if (inst->src[i].width != dispatch_width)
            fprintf(file, "@%d", inst->src[i].width);
         if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
             inst->src[i].subreg_offset)
            fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                    inst->src[i].subreg_offset);
         break;
      case MRF:
         /* MRFs are write-only; an MRF source is a bug worth highlighting. */
         fprintf(file, "***m%d***", inst->src[i].reg);
         break;
      case ATTR:
         fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
         if (inst->src[i].reladdr) {
            fprintf(file, "+reladdr");
         } else if (inst->src[i].subreg_offset) {
            fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                    inst->src[i].subreg_offset);
         }
         break;
      case BAD_FILE:
         fprintf(file, "(null)");
         break;
      case IMM:
         /* Immediates are printed by value according to their type. */
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
            break;
         case BRW_REGISTER_TYPE_W:
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
            break;
         case BRW_REGISTER_TYPE_UW:
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
            break;
         case BRW_REGISTER_TYPE_VF:
            /* A VF immediate packs four 8-bit restricted floats. */
            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case HW_REG:
         if (inst->src[i].fixed_hw_reg.negate)
            fprintf(file, "-");
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            switch (inst->src[i].fixed_hw_reg.nr) {
            case BRW_ARF_NULL:
               fprintf(file, "null");
               break;
            case BRW_ARF_ADDRESS:
               fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_ACCUMULATOR:
               fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_FLAG:
               fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                       inst->src[i].fixed_hw_reg.subnr);
               break;
            default:
               fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                       inst->src[i].fixed_hw_reg.subnr);
               break;
            }
         } else {
            fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
         }
         if (inst->src[i].fixed_hw_reg.subnr)
            fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         break;
      default:
         fprintf(file, "???");
         break;
      }
      /* Closing bar for the abs modifier opened above. */
      if (inst->src[i].abs)
         fprintf(file, "|");

      if (inst->src[i].file != IMM) {
         fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
      }

      if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
         fprintf(file, ", ");
   }

   fprintf(file, " ");

   /* In SIMD16 dispatch, a SIMD8 instruction acts on one half only. */
   if (dispatch_width == 16 && inst->exec_size == 8) {
      if (inst->force_sechalf)
         fprintf(file, "2ndhalf ");
      else
         fprintf(file, "1sthalf ");
   }

   fprintf(file, "\n");
}
||
3914 | |||
3915 | /** |
||
3916 | * Possibly returns an instruction that set up @param reg. |
||
3917 | * |
||
3918 | * Sometimes we want to take the result of some expression/variable |
||
3919 | * dereference tree and rewrite the instruction generating the result |
||
3920 | * of the tree. When processing the tree, we know that the |
||
3921 | * instructions generated are all writing temporaries that are dead |
||
3922 | * outside of this tree. So, if we have some instructions that write |
||
3923 | * a temporary, we're free to point that temp write somewhere else. |
||
3924 | * |
||
3925 | * Note that this doesn't guarantee that the instruction generated |
||
3926 | * only reg -- it might be the size=4 destination of a texture instruction. |
||
3927 | */ |
||
3928 | fs_inst * |
||
3929 | fs_visitor::get_instruction_generating_reg(fs_inst *start, |
||
3930 | fs_inst *end, |
||
3931 | const fs_reg ®) |
||
3932 | { |
||
3933 | if (end == start || |
||
3934 | end->is_partial_write() || |
||
3935 | reg.reladdr || |
||
3936 | !reg.equals(end->dst)) { |
||
3937 | return NULL; |
||
3938 | } else { |
||
3939 | return end; |
||
3940 | } |
||
3941 | } |
||
3942 | |||
/**
 * Lay out the gen6+ fragment shader thread payload: walk the fixed register
 * layout in order, recording the starting register of each payload section
 * and accumulating the total in payload.num_regs.  The increments below are
 * strictly order-dependent — they mirror the hardware's payload layout.
 */
void
fs_visitor::setup_payload_gen6()
{
   /* Reading gl_FragCoord implies both source depth and source W below. */
   bool uses_depth =
      (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
   unsigned barycentric_interp_modes =
      (stage == MESA_SHADER_FRAGMENT) ?
      ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;

   assert(devinfo->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   payload.num_regs = 2;
   /* R2: only for 32-pixel dispatch.*/

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         payload.barycentric_coord_reg[i] = payload.num_regs;
         payload.num_regs += 2;
         if (dispatch_width == 16) {
            payload.num_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      payload.source_depth_reg = payload.num_regs;
      payload.num_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not SIMD8. */
         payload.num_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
   if (uses_depth) {
      payload.source_w_reg = payload.num_regs;
      payload.num_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not SIMD8. */
         payload.num_regs++;
      }
   }

   if (stage == MESA_SHADER_FRAGMENT) {
      brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
      brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
      prog_data->uses_pos_offset = key->compute_pos_offset;
      /* R31: MSAA position offsets. */
      if (prog_data->uses_pos_offset) {
         payload.sample_pos_reg = payload.num_regs;
         payload.num_regs++;
      }
   }

   /* R32: MSAA input coverage mask */
   if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
      assert(devinfo->gen >= 7);
      payload.sample_mask_in_reg = payload.num_regs;
      payload.num_regs++;
      if (dispatch_width == 16) {
         /* R33: input coverage mask if not SIMD8. */
         payload.num_regs++;
      }
   }

   /* R34-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}
||
4023 | |||
/** Set up the vertex shader thread payload. */
void
fs_visitor::setup_vs_payload()
{
   /* R0: thread header, R1: urb handles */
   payload.num_regs = 2;
}
||
4030 | |||
4031 | void |
||
4032 | fs_visitor::setup_cs_payload() |
||
4033 | { |
||
4034 | assert(brw->gen >= 7); |
||
4035 | |||
4036 | payload.num_regs = 1; |
||
4037 | } |
||
4038 | |||
4039 | void |
||
4040 | fs_visitor::assign_binding_table_offsets() |
||
4041 | { |
||
4042 | assert(stage == MESA_SHADER_FRAGMENT); |
||
4043 | brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data; |
||
4044 | brw_wm_prog_key *key = (brw_wm_prog_key*) this->key; |
||
4045 | uint32_t next_binding_table_offset = 0; |
||
4046 | |||
4047 | /* If there are no color regions, we still perform an FB write to a null |
||
4048 | * renderbuffer, which we place at surface index 0. |
||
4049 | */ |
||
4050 | prog_data->binding_table.render_target_start = next_binding_table_offset; |
||
4051 | next_binding_table_offset += MAX2(key->nr_color_regions, 1); |
||
4052 | |||
4053 | assign_common_binding_table_offsets(next_binding_table_offset); |
||
4054 | } |
||
4055 | |||
4056 | void |
||
4057 | fs_visitor::calculate_register_pressure() |
||
4058 | { |
||
4059 | invalidate_live_intervals(); |
||
4060 | calculate_live_intervals(); |
||
4061 | |||
4062 | unsigned num_instructions = 0; |
||
4063 | foreach_block(block, cfg) |
||
4064 | num_instructions += block->instructions.length(); |
||
4065 | |||
4066 | regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions); |
||
4067 | |||
4068 | for (unsigned reg = 0; reg < alloc.count; reg++) { |
||
4069 | for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++) |
||
4070 | regs_live_at_ip[ip] += alloc.sizes[reg]; |
||
4071 | } |
||
4072 | } |
||
4073 | |||
/**
 * Run the LIR optimization pipeline: iterate the main pass list to a fixed
 * point, then run the one-shot lowering passes.  Pass order here is
 * load-bearing — later passes depend on the forms earlier ones produce.
 */
void
fs_visitor::optimize()
{
   split_virtual_grfs();

   move_uniform_array_access_to_pull_constants();
   assign_constant_locations();
   demote_pull_constants();

/* OPT() runs a pass, records whether it made progress, and (with
 * INTEL_DEBUG=optimizer) dumps the IR after any pass that changed it,
 * named by stage/width/program/iteration/pass for easy diffing.
 */
#define OPT(pass, args...) ({                                           \
      pass_num++;                                                       \
      bool this_progress = pass(args);                                  \
                                                                        \
      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
         char filename[64];                                             \
         snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,           \
                  stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
                                                                        \
         backend_visitor::dump_instructions(filename);                  \
      }                                                                 \
                                                                        \
      progress = progress || this_progress;                             \
      this_progress;                                                    \
   })

   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
      char filename[64];
      snprintf(filename, 64, "%s%d-%04d-00-start",
               stage_abbrev, dispatch_width,
               shader_prog ? shader_prog->Name : 0);

      backend_visitor::dump_instructions(filename);
   }

   bool progress;
   int iteration = 0;
   int pass_num = 0;
   /* Iterate the main pass list until no pass makes progress. */
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(remove_duplicate_mrf_writes);

      OPT(opt_algebraic);
      OPT(opt_cse);
      OPT(opt_copy_propagate);
      OPT(opt_peephole_predicated_break);
      OPT(opt_cmod_propagation);
      OPT(dead_code_eliminate);
      OPT(opt_peephole_sel);
      OPT(dead_control_flow_eliminate, this);
      OPT(opt_register_renaming);
      OPT(opt_redundant_discard_jumps);
      OPT(opt_saturate_propagation);
      OPT(opt_zero_samples);
      OPT(register_coalesce);
      OPT(compute_to_mrf);
      OPT(eliminate_find_live_channel);

      OPT(compact_virtual_grfs);
   } while (progress);

   pass_num = 0;

   OPT(opt_sampler_eot);

   /* Lowering LOAD_PAYLOAD creates new MOVs; clean up after it. */
   if (OPT(lower_load_payload)) {
      split_virtual_grfs();
      OPT(register_coalesce);
      OPT(compute_to_mrf);
      OPT(dead_code_eliminate);
   }

   OPT(opt_combine_constants);
   OPT(lower_integer_multiplication);

   lower_uniform_pull_constant_loads();
}
||
4153 | |||
4154 | /** |
||
4155 | * Three source instruction must have a GRF/MRF destination register. |
||
4156 | * ARF NULL is not allowed. Fix that up by allocating a temporary GRF. |
||
4157 | */ |
||
4158 | void |
||
4159 | fs_visitor::fixup_3src_null_dest() |
||
4160 | { |
||
4161 | foreach_block_and_inst_safe (block, fs_inst, inst, cfg) { |
||
4162 | if (inst->is_3src() && inst->dst.is_null()) { |
||
4163 | inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8), |
||
4164 | inst->dst.type); |
||
4165 | } |
||
4166 | } |
||
4167 | } |
||
4168 | |||
4169 | void |
||
4170 | fs_visitor::allocate_registers() |
||
4171 | { |
||
4172 | bool allocated_without_spills; |
||
4173 | |||
4174 | static const enum instruction_scheduler_mode pre_modes[] = { |
||
4175 | SCHEDULE_PRE, |
||
4176 | SCHEDULE_PRE_NON_LIFO, |
||
4177 | SCHEDULE_PRE_LIFO, |
||
4178 | }; |
||
4179 | |||
4180 | /* Try each scheduling heuristic to see if it can successfully register |
||
4181 | * allocate without spilling. They should be ordered by decreasing |
||
4182 | * performance but increasing likelihood of allocating. |
||
4183 | */ |
||
4184 | for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) { |
||
4185 | schedule_instructions(pre_modes[i]); |
||
4186 | |||
4187 | if (0) { |
||
4188 | assign_regs_trivial(); |
||
4189 | allocated_without_spills = true; |
||
4190 | } else { |
||
4191 | allocated_without_spills = assign_regs(false); |
||
4192 | } |
||
4193 | if (allocated_without_spills) |
||
4194 | break; |
||
4195 | } |
||
4196 | |||
4197 | if (!allocated_without_spills) { |
||
4198 | /* We assume that any spilling is worse than just dropping back to |
||
4199 | * SIMD8. There's probably actually some intermediate point where |
||
4200 | * SIMD16 with a couple of spills is still better. |
||
4201 | */ |
||
4202 | if (dispatch_width == 16) { |
||
4203 | fail("Failure to register allocate. Reduce number of " |
||
4204 | "live scalar values to avoid this."); |
||
4205 | } else { |
||
4206 | perf_debug("%s shader triggered register spilling. " |
||
4207 | "Try reducing the number of live scalar values to " |
||
4208 | "improve performance.\n", stage_name); |
||
4209 | } |
||
4210 | |||
4211 | /* Since we're out of heuristics, just go spill registers until we |
||
4212 | * get an allocation. |
||
4213 | */ |
||
4214 | while (!assign_regs(true)) { |
||
4215 | if (failed) |
||
4216 | break; |
||
4217 | } |
||
4218 | } |
||
4219 | |||
4220 | /* This must come after all optimization and register allocation, since |
||
4221 | * it inserts dead code that happens to have side effects, and it does |
||
4222 | * so based on the actual physical registers in use. |
||
4223 | */ |
||
4224 | insert_gen4_send_dependency_workarounds(); |
||
4225 | |||
4226 | if (failed) |
||
4227 | return; |
||
4228 | |||
4229 | if (!allocated_without_spills) |
||
4230 | schedule_instructions(SCHEDULE_POST); |
||
4231 | |||
4232 | if (last_scratch > 0) |
||
4233 | prog_data->total_scratch = brw_get_scratch_size(last_scratch); |
||
4234 | } |
||
4235 | |||
/**
 * Compile a vertex shader: emit IR (NIR or GLSL IR path), write URB outputs,
 * then optimize and register-allocate.  Returns false on failure.
 */
bool
fs_visitor::run_vs()
{
   assert(stage == MESA_SHADER_VERTEX);

   assign_common_binding_table_offsets(0);
   setup_vs_payload();

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_begin();

   /* Emit code either from NIR or by visiting the GLSL IR directly,
    * depending on the driver configuration for this stage.
    */
   if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
      emit_nir_code();
   } else {
      foreach_in_list(ir_instruction, ir, shader->base.ir) {
         base_ir = ir;
         this->result = reg_undef;
         ir->accept(this);
      }
      base_ir = NULL;
   }

   if (failed)
      return false;

   /* Write the shader outputs to the URB for the next pipeline stage. */
   emit_urb_writes();

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_end();

   calculate_cfg();

   optimize();

   assign_curb_setup();
   assign_vs_urb_setup();

   fixup_3src_null_dest();
   allocate_registers();

   return !failed;
}
||
4278 | |||
/**
 * Drive compilation of a fragment shader: payload setup, IR translation,
 * optimization, and register allocation.
 *
 * Returns false if compilation failed (fail_msg holds the reason).
 */
bool
fs_visitor::run_fs()
{
   /* Stage-specific views of the generic prog_data/key pointers. */
   brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
   brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;

   assert(stage == MESA_SHADER_FRAGMENT);

   /* Snapshot the parameter count; the assert at the end verifies that
    * compilation didn't append state parameters (see comment there).
    */
   sanity_param_count = prog->Parameters->NumParameters;

   assign_binding_table_offsets();

   if (devinfo->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      /* Debug aid: compile a trivial shader instead of the real one. */
      emit_dummy_fs();
   } else if (brw->use_rep_send && dispatch_width == 16) {
      /* Replicated-data fast-clear shader (SIMD16 path only). */
      emit_repclear_shader();
   } else {
      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_begin();

      calculate_urb_setup();
      if (prog->InputsRead > 0) {
         if (devinfo->gen < 6)
            emit_interpolation_setup_gen4();
         else
            emit_interpolation_setup_gen6();
      }

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (wm_prog_data->uses_kill) {
         fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
         discard_init->flag_subreg = 1;
      }

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
         /* NIR translation path. */
         emit_nir_code();
      } else if (shader) {
         /* GLSL path: visit the linked shader's IR. */
         foreach_in_list(ir_instruction, ir, shader->base.ir) {
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         /* Fixed-function / ARB fragment program path. */
         emit_fragment_program_code();
      }
      base_ir = NULL;
      if (failed)
         return false;

      /* Placeholder for the jump target HALT lowering will patch in later. */
      if (wm_prog_data->uses_kill)
         emit(FS_OPCODE_PLACEHOLDER_HALT);

      if (wm_key->alpha_test_func)
         emit_alpha_test();

      emit_fb_writes();

      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_end();

      calculate_cfg();

      optimize();

      assign_curb_setup();
      assign_urb_setup();

      fixup_3src_null_dest();
      allocate_registers();

      if (failed)
         return false;
   }

   /* Record register usage for the dispatch width we compiled at. */
   if (dispatch_width == 8)
      wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
   else
      wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == prog->Parameters->NumParameters);

   return !failed;
}
||
4377 | |||
4378 | bool |
||
4379 | fs_visitor::run_cs() |
||
4380 | { |
||
4381 | assert(stage == MESA_SHADER_COMPUTE); |
||
4382 | assert(shader); |
||
4383 | |||
4384 | sanity_param_count = prog->Parameters->NumParameters; |
||
4385 | |||
4386 | assign_common_binding_table_offsets(0); |
||
4387 | |||
4388 | setup_cs_payload(); |
||
4389 | |||
4390 | if (INTEL_DEBUG & DEBUG_SHADER_TIME) |
||
4391 | emit_shader_time_begin(); |
||
4392 | |||
4393 | emit_nir_code(); |
||
4394 | |||
4395 | if (failed) |
||
4396 | return false; |
||
4397 | |||
4398 | emit_cs_terminate(); |
||
4399 | |||
4400 | if (INTEL_DEBUG & DEBUG_SHADER_TIME) |
||
4401 | emit_shader_time_end(); |
||
4402 | |||
4403 | calculate_cfg(); |
||
4404 | |||
4405 | optimize(); |
||
4406 | |||
4407 | assign_curb_setup(); |
||
4408 | |||
4409 | fixup_3src_null_dest(); |
||
4410 | allocate_registers(); |
||
4411 | |||
4412 | if (failed) |
||
4413 | return false; |
||
4414 | |||
4415 | /* If any state parameters were appended, then ParameterValues could have |
||
4416 | * been realloced, in which case the driver uniform storage set up by |
||
4417 | * _mesa_associate_uniform_storage() would point to freed memory. Make |
||
4418 | * sure that didn't happen. |
||
4419 | */ |
||
4420 | assert(sanity_param_count == prog->Parameters->NumParameters); |
||
4421 | |||
4422 | return !failed; |
||
4423 | } |
||
4424 | |||
4425 | const unsigned * |
||
4426 | brw_wm_fs_emit(struct brw_context *brw, |
||
4427 | void *mem_ctx, |
||
4428 | const struct brw_wm_prog_key *key, |
||
4429 | struct brw_wm_prog_data *prog_data, |
||
4430 | struct gl_fragment_program *fp, |
||
4431 | struct gl_shader_program *prog, |
||
4432 | unsigned *final_assembly_size) |
||
4433 | { |
||
4434 | bool start_busy = false; |
||
4435 | double start_time = 0; |
||
4436 | |||
4437 | if (unlikely(brw->perf_debug)) { |
||
4438 | start_busy = (brw->batch.last_bo && |
||
4439 | drm_intel_bo_busy(brw->batch.last_bo)); |
||
4440 | start_time = get_time(); |
||
4441 | } |
||
4442 | |||
4443 | struct brw_shader *shader = NULL; |
||
4444 | if (prog) |
||
4445 | shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT]; |
||
4446 | |||
4447 | if (unlikely(INTEL_DEBUG & DEBUG_WM)) |
||
4448 | brw_dump_ir("fragment", prog, &shader->base, &fp->Base); |
||
4449 | |||
4450 | /* Now the main event: Visit the shader IR and generate our FS IR for it. |
||
4451 | */ |
||
4452 | fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base, |
||
4453 | prog, &fp->Base, 8); |
||
4454 | if (!v.run_fs()) { |
||
4455 | if (prog) { |
||
4456 | prog->LinkStatus = false; |
||
4457 | ralloc_strcat(&prog->InfoLog, v.fail_msg); |
||
4458 | } |
||
4459 | |||
4460 | _mesa_problem(NULL, "Failed to compile fragment shader: %s\n", |
||
4461 | v.fail_msg); |
||
4462 | |||
4463 | return NULL; |
||
4464 | } |
||
4465 | |||
4466 | cfg_t *simd16_cfg = NULL; |
||
4467 | fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base, |
||
4468 | prog, &fp->Base, 16); |
||
4469 | if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) { |
||
4470 | if (!v.simd16_unsupported) { |
||
4471 | /* Try a SIMD16 compile */ |
||
4472 | v2.import_uniforms(&v); |
||
4473 | if (!v2.run_fs()) { |
||
4474 | perf_debug("SIMD16 shader failed to compile, falling back to " |
||
4475 | "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg); |
||
4476 | } else { |
||
4477 | simd16_cfg = v2.cfg; |
||
4478 | } |
||
4479 | } else { |
||
4480 | perf_debug("SIMD16 shader unsupported, falling back to " |
||
4481 | "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg); |
||
4482 | } |
||
4483 | } |
||
4484 | |||
4485 | cfg_t *simd8_cfg; |
||
4486 | int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8; |
||
4487 | if ((no_simd8 || brw->gen < 5) && simd16_cfg) { |
||
4488 | simd8_cfg = NULL; |
||
4489 | prog_data->no_8 = true; |
||
4490 | } else { |
||
4491 | simd8_cfg = v.cfg; |
||
4492 | prog_data->no_8 = false; |
||
4493 | } |
||
4494 | |||
4495 | fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base, |
||
4496 | &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS"); |
||
4497 | |||
4498 | if (unlikely(INTEL_DEBUG & DEBUG_WM)) { |
||
4499 | char *name; |
||
4500 | if (prog) |
||
4501 | name = ralloc_asprintf(mem_ctx, "%s fragment shader %d", |
||
4502 | prog->Label ? prog->Label : "unnamed", |
||
4503 | prog->Name); |
||
4504 | else |
||
4505 | name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id); |
||
4506 | |||
4507 | g.enable_debug(name); |
||
4508 | } |
||
4509 | |||
4510 | if (simd8_cfg) |
||
4511 | g.generate_code(simd8_cfg, 8); |
||
4512 | if (simd16_cfg) |
||
4513 | prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16); |
||
4514 | |||
4515 | if (unlikely(brw->perf_debug) && shader) { |
||
4516 | if (shader->compiled_once) |
||
4517 | brw_wm_debug_recompile(brw, prog, key); |
||
4518 | shader->compiled_once = true; |
||
4519 | |||
4520 | if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) { |
||
4521 | perf_debug("FS compile took %.03f ms and stalled the GPU\n", |
||
4522 | (get_time() - start_time) * 1000); |
||
4523 | } |
||
4524 | } |
||
4525 | |||
4526 | return g.get_assembly(final_assembly_size); |
||
4527 | } |
||
4528 | |||
/**
 * Precompile the fragment program at link time with a guessed program key,
 * so the common case doesn't have to compile at first draw.
 *
 * The current WM program binding is saved and restored around the compile
 * so precompilation doesn't disturb driver state.
 */
extern "C" bool
brw_fs_precompile(struct gl_context *ctx,
                  struct gl_shader_program *shader_prog,
                  struct gl_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_wm_prog_key key;

   struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
   struct brw_fragment_program *bfp = brw_fragment_program(fp);
   bool program_uses_dfdy = fp->UsesDFdy;

   memset(&key, 0, sizeof(key));

   /* Pre-gen6, guess the interpolation/depth ("iz") state bits. */
   if (brw->gen < 6) {
      if (fp->UsesKill)
         key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;

      if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
         key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

      /* Just assume depth testing. */
      key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
      key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
   }

   if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
                                         BRW_FS_VARYING_INPUT_MASK) > 16)
      key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;

   brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);

   /* drawable_height only matters when gl_FragCoord is read. */
   if (fp->Base.InputsRead & VARYING_BIT_POS) {
      key.drawable_height = ctx->DrawBuffer->Height;
   }

   /* Count color outputs; depth and sample-mask writes aren't color
    * regions.
    */
   key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
         ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
           BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));

   /* render_to_fbo only affects the result when gl_FragCoord or
    * derivatives are used.
    */
   if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
                          key.nr_color_regions > 1;
   }

   key.program_string_id = bfp->id;

   /* Save the current WM program binding, compile, then restore it. */
   uint32_t old_prog_offset = brw->wm.base.prog_offset;
   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;

   bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);

   brw->wm.base.prog_offset = old_prog_offset;
   brw->wm.prog_data = old_prog_data;

   return success;
}
||
4586 | |||
4587 | void |
||
4588 | brw_setup_tex_for_precompile(struct brw_context *brw, |
||
4589 | struct brw_sampler_prog_key_data *tex, |
||
4590 | struct gl_program *prog) |
||
4591 | { |
||
4592 | const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8; |
||
4593 | unsigned sampler_count = _mesa_fls(prog->SamplersUsed); |
||
4594 | for (unsigned i = 0; i < sampler_count; i++) { |
||
4595 | if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) { |
||
4596 | /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */ |
||
4597 | tex->swizzles[i] = |
||
4598 | MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE); |
||
4599 | } else { |
||
4600 | /* Color sampler: assume no swizzling. */ |
||
4601 | tex->swizzles[i] = SWIZZLE_XYZW; |
||
4602 | } |
||
4603 | } |
||
4604 | }><>>>>>>>=>>><>>><>>>>16,8,2>16,8,2>2>16,8,2>8,8,1>1>16,8,2>8,8,1>1>8,8,1>8,8,1>1>8,8,1>1>8,8,1>8,8,1>1>8,8,1>8,8,1>1>8,8,1>1>8,8,1>8,8,1>8,8,1>8,8,1>1>>>><>>=>>>=>=>>>>>>>>>>>>>>>>>>>>>>><>><>>>=>>=>>>>>>>>>>>>>>>>>=>>=>>>>>>>>>>>=>>>=>>=>>>>>>>>>>>>>>>><>><>><>><>><>>>>>>>=>>> |