Go to most recent revision | Details | Last modification | View Log | RSS feed
Rev | Author | Line No. | Line |
---|---|---|---|
5564 | serge | 1 | /* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */ |
2 | |||
3 | /* |
||
4 | * Copyright (C) 2015 Rob Clark |
||
5 | * |
||
6 | * Permission is hereby granted, free of charge, to any person obtaining a |
||
7 | * copy of this software and associated documentation files (the "Software"), |
||
8 | * to deal in the Software without restriction, including without limitation |
||
9 | * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
||
10 | * and/or sell copies of the Software, and to permit persons to whom the |
||
11 | * Software is furnished to do so, subject to the following conditions: |
||
12 | * |
||
13 | * The above copyright notice and this permission notice (including the next |
||
14 | * paragraph) shall be included in all copies or substantial portions of the |
||
15 | * Software. |
||
16 | * |
||
17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
||
18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
||
19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
||
20 | * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
||
21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
||
22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE |
||
23 | * SOFTWARE. |
||
24 | * |
||
25 | * Authors: |
||
26 | * Rob Clark |
||
27 | */ |
||
28 | |||
#include <stdarg.h>

#include "pipe/p_state.h"
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "tgsi/tgsi_lowering.h"
#include "tgsi/tgsi_strings.h"

#include "nir/tgsi_to_nir.h"
#include "glsl/shader_enums.h"

#include "freedreno_util.h"

#include "ir3_compiler.h"
#include "ir3_shader.h"
#include "ir3_nir.h"

#include "instr-a3xx.h"
#include "ir3.h"
49 | |||
50 | |||
static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);

/* Per-shader-variant context for the NIR -> ir3 frontend.  Allocated
 * (and freed) per compile; everything hashed/allocated off of it is
 * released by compile_free().
 */
struct ir3_compile {
	const struct tgsi_token *tokens;
	struct nir_shader *s;

	struct ir3 *ir;
	struct ir3_shader_variant *so;

	/* bitmask of which samplers are integer: */
	uint16_t integer_s;

	/* block that instructions are currently being emitted into: */
	struct ir3_block *block;

	/* For fragment shaders, from the hw perspective the only
	 * actual input is r0.xy position register passed to bary.f.
	 * But TGSI doesn't know that, it still declares things as
	 * IN[] registers.  So we do all the input tracking normally
	 * and fix things up after compile_instructions()
	 *
	 * NOTE that frag_pos is the hardware position (possibly it
	 * is actually an index or tag or some such.. it is *not*
	 * values that can be directly used for gl_FragCoord..)
	 */
	struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];

	/* For vertex shaders, keep track of the system values sources */
	struct ir3_instruction *vertex_id, *basevertex, *instance_id;

	/* mapping from nir_register to defining instruction: */
	struct hash_table *def_ht;

	/* mapping from nir_variable to ir3_array: */
	struct hash_table *var_ht;
	unsigned num_arrays;

	/* a common pattern for indirect addressing is to request the
	 * same address register multiple times.  To avoid generating
	 * duplicate instruction sequences (which our backend does not
	 * try to clean up, since that should be done as the NIR stage)
	 * we cache the address value generated for a given src value:
	 */
	struct hash_table *addr_ht;

	/* for calculating input/output positions/linkages: */
	unsigned next_inloc;

	/* a4xx (at least patchlevel 0) cannot seem to flat-interpolate
	 * so we need to use ldlv.u32 to load the varying directly:
	 */
	bool flat_bypass;

	/* on a3xx, we need to add one to # of array levels:
	 */
	bool levels_add_one;

	/* for looking up which system value is which */
	unsigned sysval_semantics[8];

	/* list of kill instructions: */
	struct ir3_instruction *kill[16];
	unsigned int kill_count;

	/* set if we encounter something we can't handle yet, so we
	 * can bail cleanly and fallback to TGSI compiler f/e
	 */
	bool error;
};
||
119 | |||
120 | |||
/* Translate TGSI tokens to NIR and run the optimization loop.
 * NOTE: the passes are order dependent; the do/while loop repeats
 * until no pass reports further progress.
 */
static struct nir_shader *to_nir(const struct tgsi_token *tokens)
{
	struct nir_shader_compiler_options options = {
			.lower_fpow = true,
			.lower_fsat = true,
			.lower_scmp = true,
			.lower_flrp = true,
			.native_integers = true,
	};
	bool progress;

	struct nir_shader *s = tgsi_to_nir(tokens, &options);

	/* dump the un-optimized shader when FD_DBG_OPTMSGS is set: */
	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		debug_printf("----------------------\n");
		nir_print_shader(s, stdout);
		debug_printf("----------------------\n");
	}

	nir_opt_global_to_local(s);
	nir_convert_to_ssa(s);
	nir_lower_idiv(s);

	do {
		progress = false;

		nir_lower_vars_to_ssa(s);
		nir_lower_alu_to_scalar(s);

		progress |= nir_copy_prop(s);
		progress |= nir_opt_dce(s);
		progress |= nir_opt_cse(s);
		progress |= ir3_nir_lower_if_else(s);
		progress |= nir_opt_algebraic(s);
		progress |= nir_opt_constant_folding(s);

	} while (progress);

	nir_remove_dead_variables(s);
	nir_validate_shader(s);

	/* ... and the optimized result: */
	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		debug_printf("----------------------\n");
		nir_print_shader(s, stdout);
		debug_printf("----------------------\n");
	}

	return s;
}
||
170 | |||
/* TODO nir doesn't lower everything for us yet, but ideally it would: */
static const struct tgsi_token *
lower_tgsi(const struct tgsi_token *tokens, struct ir3_shader_variant *so)
{
	struct tgsi_shader_info info;
	struct tgsi_lowering_config lconfig = {
			.color_two_side = so->key.color_two_side,
			.lower_FRC = true,
	};

	/* sampler saturate state lives in different halves of the shader
	 * key depending on the shader stage:
	 */
	switch (so->type) {
	case SHADER_FRAGMENT:
	case SHADER_COMPUTE:
		lconfig.saturate_s = so->key.fsaturate_s;
		lconfig.saturate_t = so->key.fsaturate_t;
		lconfig.saturate_r = so->key.fsaturate_r;
		break;
	case SHADER_VERTEX:
		lconfig.saturate_s = so->key.vsaturate_s;
		lconfig.saturate_t = so->key.vsaturate_t;
		lconfig.saturate_r = so->key.vsaturate_r;
		break;
	}

	if (!so->shader) {
		/* hack for standalone compiler which does not have
		 * screen/context:
		 */
	} else if (ir3_shader_gpuid(so->shader) >= 400) {
		/* a4xx seems to have *no* sam.p */
		lconfig.lower_TXP = ~0;  /* lower all txp */
	} else {
		/* a3xx just needs to avoid sam.p for 3d tex */
		lconfig.lower_TXP = (1 << TGSI_TEXTURE_3D);
	}

	/* NOTE: may return NULL when no lowering was needed; the caller
	 * (compile_init) falls back to the original tokens in that case:
	 */
	return tgsi_transform_lowering(&lconfig, tokens, &info);
}
||
209 | |||
/* Allocate and set up the compile context: pick per-generation quirk
 * flags, create the lookup tables, run TGSI lowering + NIR conversion,
 * and lay out the driver-param/immediate const space.
 */
static struct ir3_compile *
compile_init(struct ir3_shader_variant *so,
		const struct tgsi_token *tokens)
{
	struct ir3_compile *ctx = rzalloc(NULL, struct ir3_compile);
	const struct tgsi_token *lowered_tokens;

	if (!so->shader) {
		/* hack for standalone compiler which does not have
		 * screen/context:
		 */
	} else if (ir3_shader_gpuid(so->shader) >= 400) {
		/* need special handling for "flat" */
		ctx->flat_bypass = true;
		ctx->levels_add_one = false;
	} else {
		/* no special handling for "flat" */
		ctx->flat_bypass = false;
		ctx->levels_add_one = true;
	}

	/* integer-sampler bitmask comes from the stage-specific half of
	 * the shader key:
	 */
	switch (so->type) {
	case SHADER_FRAGMENT:
	case SHADER_COMPUTE:
		ctx->integer_s = so->key.finteger_s;
		break;
	case SHADER_VERTEX:
		ctx->integer_s = so->key.vinteger_s;
		break;
	}

	ctx->ir = so->ir;
	ctx->so = so;
	ctx->next_inloc = 8;
	ctx->def_ht = _mesa_hash_table_create(ctx,
			_mesa_hash_pointer, _mesa_key_pointer_equal);
	ctx->var_ht = _mesa_hash_table_create(ctx,
			_mesa_hash_pointer, _mesa_key_pointer_equal);
	ctx->addr_ht = _mesa_hash_table_create(ctx,
			_mesa_hash_pointer, _mesa_key_pointer_equal);

	/* lower_tgsi() returns NULL if nothing needed lowering: */
	lowered_tokens = lower_tgsi(tokens, so);
	if (!lowered_tokens)
		lowered_tokens = tokens;
	ctx->s = to_nir(lowered_tokens);

	if (lowered_tokens != tokens)
		free((void *)lowered_tokens);

	so->first_driver_param = so->first_immediate = ctx->s->num_uniforms;

	/* one (vec4) slot for vertex id base: */
	if (so->type == SHADER_VERTEX)
		so->first_immediate++;

	/* reserve 4 (vec4) slots for ubo base addresses: */
	so->first_immediate += 4;

	return ctx;
}
||
270 | |||
271 | static void |
||
272 | compile_error(struct ir3_compile *ctx, const char *format, ...) |
||
273 | { |
||
274 | va_list ap; |
||
275 | va_start(ap, format); |
||
276 | _debug_vprintf(format, ap); |
||
277 | va_end(ap); |
||
278 | nir_print_shader(ctx->s, stdout); |
||
279 | ctx->error = true; |
||
280 | debug_assert(0); |
||
281 | } |
||
282 | |||
/* like assert(), but raises a compile error (so we can bail cleanly
 * and fall back to the TGSI f/e) instead of only aborting:
 */
#define compile_assert(ctx, cond) do { \
		if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
	} while (0)
||
286 | |||
/* free the compile context and, via ralloc, everything hung off it
 * (hash tables, ir3_array's, etc):
 */
static void
compile_free(struct ir3_compile *ctx)
{
	ralloc_free(ctx);
}
||
292 | |||
293 | |||
/* backing storage for a local array variable: 'length' scalar slots
 * (flexible array member), each holding the current defining
 * instruction for that component.  'aid' is a unique array id.
 * See declare_var()/get_var().
 */
struct ir3_array {
	unsigned length, aid;
	struct ir3_instruction *arr[];
};
||
298 | |||
/* allocate the ir3_array backing a nir_variable and register it in
 * var_ht for later get_var() lookup:
 */
static void
declare_var(struct ir3_compile *ctx, nir_variable *var)
{
	unsigned length = glsl_get_length(var->type) * 4;  /* always vec4, at least with ttn */
	struct ir3_array *arr = ralloc_size(ctx, sizeof(*arr) +
			(length * sizeof(arr->arr[0])));
	arr->length = length;
	arr->aid = ++ctx->num_arrays;
	/* Some shaders end up reading array elements without first writing..
	 * so initialize things to prevent null instr ptrs later:
	 */
	for (unsigned i = 0; i < length; i++)
		arr->arr[i] = create_immed(ctx->block, 0);
	_mesa_hash_table_insert(ctx->var_ht, var, arr);
}
||
314 | |||
315 | static struct ir3_array * |
||
316 | get_var(struct ir3_compile *ctx, nir_variable *var) |
||
317 | { |
||
318 | struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var); |
||
319 | return entry->data; |
||
320 | } |
||
321 | |||
322 | /* allocate a n element value array (to be populated by caller) and |
||
323 | * insert in def_ht |
||
324 | */ |
||
325 | static struct ir3_instruction ** |
||
326 | __get_dst(struct ir3_compile *ctx, void *key, unsigned n) |
||
327 | { |
||
328 | struct ir3_instruction **value = |
||
329 | ralloc_array(ctx->def_ht, struct ir3_instruction *, n); |
||
330 | _mesa_hash_table_insert(ctx->def_ht, key, value); |
||
331 | return value; |
||
332 | } |
||
333 | |||
334 | static struct ir3_instruction ** |
||
335 | get_dst(struct ir3_compile *ctx, nir_dest *dst, unsigned n) |
||
336 | { |
||
337 | if (dst->is_ssa) { |
||
338 | return __get_dst(ctx, &dst->ssa, n); |
||
339 | } else { |
||
340 | return __get_dst(ctx, dst->reg.reg, n); |
||
341 | } |
||
342 | } |
||
343 | |||
/* like get_dst() but keyed directly on an ssa def: */
static struct ir3_instruction **
get_dst_ssa(struct ir3_compile *ctx, nir_ssa_def *dst, unsigned n)
{
	return __get_dst(ctx, dst, n);
}
||
349 | |||
350 | static struct ir3_instruction ** |
||
351 | get_src(struct ir3_compile *ctx, nir_src *src) |
||
352 | { |
||
353 | struct hash_entry *entry; |
||
354 | if (src->is_ssa) { |
||
355 | entry = _mesa_hash_table_search(ctx->def_ht, src->ssa); |
||
356 | } else { |
||
357 | entry = _mesa_hash_table_search(ctx->def_ht, src->reg.reg); |
||
358 | } |
||
359 | compile_assert(ctx, entry); |
||
360 | return entry->data; |
||
361 | } |
||
362 | |||
363 | static struct ir3_instruction * |
||
364 | create_immed(struct ir3_block *block, uint32_t val) |
||
365 | { |
||
366 | struct ir3_instruction *mov; |
||
367 | |||
368 | mov = ir3_instr_create(block, 1, 0); |
||
369 | mov->cat1.src_type = TYPE_U32; |
||
370 | mov->cat1.dst_type = TYPE_U32; |
||
371 | ir3_reg_create(mov, 0, 0); |
||
372 | ir3_reg_create(mov, 0, IR3_REG_IMMED)->uim_val = val; |
||
373 | |||
374 | return mov; |
||
375 | } |
||
376 | |||
/* emit the cov/shl/mova sequence which converts a (u32) index value
 * into an address-register value: convert to s16, shift left by 2,
 * then mov into a half reg flagged IR3_REG_ADDR:
 */
static struct ir3_instruction *
create_addr(struct ir3_block *block, struct ir3_instruction *src)
{
	struct ir3_instruction *instr, *immed;

	/* TODO in at least some cases, the backend could probably be
	 * made clever enough to propagate IR3_REG_HALF..
	 */
	instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
	instr->regs[0]->flags |= IR3_REG_HALF;

	immed = create_immed(block, 2);
	immed->regs[0]->flags |= IR3_REG_HALF;

	instr = ir3_SHL_B(block, instr, 0, immed, 0);
	instr->regs[0]->flags |= IR3_REG_HALF;
	instr->regs[1]->flags |= IR3_REG_HALF;

	instr = ir3_MOV(block, instr, TYPE_S16);
	instr->regs[0]->flags |= IR3_REG_ADDR | IR3_REG_HALF;
	instr->regs[1]->flags |= IR3_REG_HALF;

	return instr;
}
||
401 | |||
/* caches addr values to avoid generating multiple cov/shl/mova
 * sequences for each use of a given NIR level src as address
 */
static struct ir3_instruction *
get_addr(struct ir3_compile *ctx, struct ir3_instruction *src)
{
	struct ir3_instruction *addr;
	struct hash_entry *entry;
	entry = _mesa_hash_table_search(ctx->addr_ht, src);
	if (entry)
		return entry->data;

	/* TODO do we need to cache per block? */
	addr = create_addr(ctx->block, src);
	_mesa_hash_table_insert(ctx->addr_ht, src, addr);

	return addr;
}
||
420 | |||
/* load uniform (const file) component 'n' via a mov from a
 * const-flagged src register:
 */
static struct ir3_instruction *
create_uniform(struct ir3_compile *ctx, unsigned n)
{
	struct ir3_instruction *mov;

	mov = ir3_instr_create(ctx->block, 1, 0);
	/* TODO get types right? */
	mov->cat1.src_type = TYPE_F32;
	mov->cat1.dst_type = TYPE_F32;
	ir3_reg_create(mov, 0, 0);
	ir3_reg_create(mov, n, IR3_REG_CONST);

	return mov;
}
||
435 | |||
/* like create_uniform(), but with the const offset 'n' further
 * offset by the given address register (relative addressing):
 */
static struct ir3_instruction *
create_uniform_indirect(struct ir3_compile *ctx, unsigned n,
		struct ir3_instruction *address)
{
	struct ir3_instruction *mov;

	mov = ir3_instr_create(ctx->block, 1, 0);
	mov->cat1.src_type = TYPE_U32;
	mov->cat1.dst_type = TYPE_U32;
	ir3_reg_create(mov, 0, 0);
	ir3_reg_create(mov, n, IR3_REG_CONST | IR3_REG_RELATIV);
	mov->address = address;

	/* the ir keeps a list of all instructions using an address reg: */
	array_insert(ctx->ir->indirects, mov);

	return mov;
}
||
453 | |||
/* gather 'arrsz' scalar ssa values into one group of consecutive
 * registers via a meta:fi (fan-in) instruction:
 */
static struct ir3_instruction *
create_collect(struct ir3_block *block, struct ir3_instruction **arr,
		unsigned arrsz)
{
	struct ir3_instruction *collect;

	if (arrsz == 0)
		return NULL;

	collect = ir3_instr_create2(block, -1, OPC_META_FI, 1 + arrsz);
	ir3_reg_create(collect, 0, 0);     /* dst */
	for (unsigned i = 0; i < arrsz; i++)
		ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = arr[i];

	return collect;
}
||
470 | |||
/* relative (address-register offset) read at base offset 'n' from
 * the group of values gathered by 'collect':
 */
static struct ir3_instruction *
create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
		struct ir3_instruction *address, struct ir3_instruction *collect)
{
	struct ir3_block *block = ctx->block;
	struct ir3_instruction *mov;
	struct ir3_register *src;

	mov = ir3_instr_create(block, 1, 0);
	mov->cat1.src_type = TYPE_U32;
	mov->cat1.dst_type = TYPE_U32;
	ir3_reg_create(mov, 0, 0);
	src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV);
	src->instr = collect;
	src->size = arrsz;    /* size of the relative window */
	src->offset = n;      /* constant part of the offset */
	mov->address = address;

	/* the ir keeps a list of all instructions using an address reg: */
	array_insert(ctx->ir->indirects, mov);

	return mov;
}
||
493 | |||
/* relative (address-register offset) write of 'src' at base offset
 * 'n' into the group gathered by 'collect' (recorded as mov->fanin):
 */
static struct ir3_instruction *
create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
		struct ir3_instruction *src, struct ir3_instruction *address,
		struct ir3_instruction *collect)
{
	struct ir3_block *block = ctx->block;
	struct ir3_instruction *mov;
	struct ir3_register *dst;

	mov = ir3_instr_create(block, 1, 0);
	mov->cat1.src_type = TYPE_U32;
	mov->cat1.dst_type = TYPE_U32;
	dst = ir3_reg_create(mov, 0, IR3_REG_RELATIV);
	dst->size = arrsz;    /* size of the relative window */
	dst->offset = n;      /* constant part of the offset */
	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
	mov->address = address;
	mov->fanin = collect;

	/* the ir keeps a list of all instructions using an address reg: */
	array_insert(ctx->ir->indirects, mov);

	return mov;
}
||
517 | |||
/* create a meta:input instruction for (scalar) input register 'n';
 * 'instr' (if non-NULL) is recorded as the ssa source:
 */
static struct ir3_instruction *
create_input(struct ir3_block *block, struct ir3_instruction *instr,
		unsigned n)
{
	struct ir3_instruction *in;

	in = ir3_instr_create(block, -1, OPC_META_INPUT);
	in->inout.block = block;
	ir3_reg_create(in, n, 0);
	if (instr)
		ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr;

	return in;
}
||
532 | |||
/* fetch a fragment-shader varying at inloc 'n': either a direct
 * ldlv.u32 load (used when flat interpolation must be bypassed, see
 * flat_bypass) or a normal bary.f interpolation against frag_pos:
 */
static struct ir3_instruction *
create_frag_input(struct ir3_compile *ctx, unsigned n, bool use_ldlv)
{
	struct ir3_block *block = ctx->block;
	struct ir3_instruction *instr;
	struct ir3_instruction *inloc = create_immed(block, n);

	if (use_ldlv) {
		instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
		instr->cat6.type = TYPE_U32;
		instr->cat6.iim_val = 1;
	} else {
		instr = ir3_BARY_F(block, inloc, 0, ctx->frag_pos, 0);
		/* bary.f consumes both components (x,y) of frag_pos: */
		instr->regs[2]->wrmask = 0x3;
	}

	return instr;
}
||
551 | |||
/* create the hw input plus fixup math for one component of
 * gl_FragCoord (only called once per component, see the assert):
 */
static struct ir3_instruction *
create_frag_coord(struct ir3_compile *ctx, unsigned comp)
{
	struct ir3_block *block = ctx->block;
	struct ir3_instruction *instr;

	compile_assert(ctx, !ctx->frag_coord[comp]);

	ctx->frag_coord[comp] = create_input(ctx->block, NULL, 0);

	switch (comp) {
	case 0: /* .x */
	case 1: /* .y */
		/* for frag_coord, we get unsigned values.. we need
		 * to subtract (integer) 8 and divide by 16 (right-
		 * shift by 4) then convert to float:
		 *
		 *    sub.s tmp, src, 8
		 *    shr.b tmp, tmp, 4
		 *    mov.u32f32 dst, tmp
		 *
		 */
		instr = ir3_SUB_S(block, ctx->frag_coord[comp], 0,
				create_immed(block, 8), 0);
		instr = ir3_SHR_B(block, instr, 0,
				create_immed(block, 4), 0);
		instr = ir3_COV(block, instr, TYPE_U32, TYPE_F32);

		return instr;
	case 2: /* .z */
	case 3: /* .w */
	default:
		/* seems that we can use these as-is: */
		return ctx->frag_coord[comp];
	}
}
||
588 | |||
/* create the value for one component of gl_FrontFacing; only .x
 * needs the actual hw input (plus fixup math), .y/.z/.w are
 * constants:
 */
static struct ir3_instruction *
create_frag_face(struct ir3_compile *ctx, unsigned comp)
{
	struct ir3_block *block = ctx->block;
	struct ir3_instruction *instr;

	switch (comp) {
	case 0: /* .x */
		compile_assert(ctx, !ctx->frag_face);

		ctx->frag_face = create_input(block, NULL, 0);

		/* for faceness, we always get -1 or 0 (int).. but TGSI expects
		 * positive vs negative float.. and piglit further seems to
		 * expect -1.0 or 1.0:
		 *
		 *    mul.s tmp, hr0.x, 2
		 *    add.s tmp, tmp, 1
		 *    mov.s32f32, dst, tmp
		 *
		 */
		instr = ir3_MUL_S(block, ctx->frag_face, 0,
				create_immed(block, 2), 0);
		instr = ir3_ADD_S(block, instr, 0,
				create_immed(block, 1), 0);
		instr = ir3_COV(block, instr, TYPE_S32, TYPE_F32);

		return instr;
	case 1: /* .y */
	case 2: /* .z */
		return create_immed(block, fui(0.0));
	default:
	case 3: /* .w */
		return create_immed(block, fui(1.0));
	}
}
||
625 | |||
/* helper for instructions that produce multiple consecutive scalar
 * outputs which need to have a split/fanout meta instruction inserted
 */
static void
split_dest(struct ir3_block *block, struct ir3_instruction **dst,
		struct ir3_instruction *src)
{
	struct ir3_instruction *prev = NULL;
	for (int i = 0, j = 0; i < 4; i++) {
		struct ir3_instruction *split =
				ir3_instr_create(block, -1, OPC_META_FO);
		ir3_reg_create(split, 0, IR3_REG_SSA);
		ir3_reg_create(split, 0, IR3_REG_SSA)->instr = src;
		split->fo.off = i;  /* which component this fanout picks */

		/* doubly-link neighboring splits via the cp left/right
		 * fields:
		 */
		if (prev) {
			split->cp.left = prev;
			split->cp.left_cnt++;
			prev->cp.right = split;
			prev->cp.right_cnt++;
		}
		prev = split;

		/* only components actually written by src appear in dst[]: */
		if (src->regs[0]->wrmask & (1 << i))
			dst[j++] = split;
	}
}
||
653 | |||
/*
 * Adreno uses uint rather than having a dedicated bool type,
 * which (potentially) requires some conversion, in particular
 * when using the output of a bool instr as an int input, or
 * vice versa.
 *
 *         | Adreno  |  NIR  |
 *  -------+---------+-------+-
 *   true  |    1    |  ~0   |
 *   false |    0    |   0   |
 *
 * To convert from an adreno bool (uint) to nir, use:
 *
 *   absneg.s dst, (neg)src
 *
 * To convert back in the other direction:
 *
 *   absneg.s dst, (abs)src
 *
 * The CP step can clean up the absneg.s that cancel each other
 * out, and with a slight bit of extra cleverness (to recognize
 * the instructions which produce either a 0 or 1) can eliminate
 * the absneg.s's completely when an instruction that wants
 * 0/1 consumes the result.  For example, when a nir 'bcsel'
 * consumes the result of 'feq'.  So we should be able to get by
 * without a boolean resolve step, and without incurring any
 * extra penalty in instruction count.
 */
||
682 | |||
/* NIR bool -> native (adreno): */
static struct ir3_instruction *
ir3_b2n(struct ir3_block *block, struct ir3_instruction *instr)
{
	/* ~0/0 -> 1/0 via absneg.s with the (abs) src modifier: */
	return ir3_ABSNEG_S(block, instr, IR3_REG_SABS);
}
||
689 | |||
/* native (adreno) -> NIR bool: */
static struct ir3_instruction *
ir3_n2b(struct ir3_block *block, struct ir3_instruction *instr)
{
	/* 1/0 -> ~0/0 via absneg.s with the (neg) src modifier: */
	return ir3_ABSNEG_S(block, instr, IR3_REG_SNEG);
}
||
696 | |||
697 | /* |
||
698 | * alu/sfu instructions: |
||
699 | */ |
||
700 | |||
701 | static void |
||
702 | emit_alu(struct ir3_compile *ctx, nir_alu_instr *alu) |
||
703 | { |
||
704 | const nir_op_info *info = &nir_op_infos[alu->op]; |
||
705 | struct ir3_instruction **dst, *src[info->num_inputs]; |
||
706 | struct ir3_block *b = ctx->block; |
||
707 | |||
708 | dst = get_dst(ctx, &alu->dest.dest, MAX2(info->output_size, 1)); |
||
709 | |||
710 | /* Vectors are special in that they have non-scalarized writemasks, |
||
711 | * and just take the first swizzle channel for each argument in |
||
712 | * order into each writemask channel. |
||
713 | */ |
||
714 | if ((alu->op == nir_op_vec2) || |
||
715 | (alu->op == nir_op_vec3) || |
||
716 | (alu->op == nir_op_vec4)) { |
||
717 | |||
718 | for (int i = 0; i < info->num_inputs; i++) { |
||
719 | nir_alu_src *asrc = &alu->src[i]; |
||
720 | |||
721 | compile_assert(ctx, !asrc->abs); |
||
722 | compile_assert(ctx, !asrc->negate); |
||
723 | |||
724 | src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[0]]; |
||
725 | if (!src[i]) |
||
726 | src[i] = create_immed(ctx->block, 0); |
||
727 | dst[i] = ir3_MOV(b, src[i], TYPE_U32); |
||
728 | } |
||
729 | |||
730 | return; |
||
731 | } |
||
732 | |||
733 | /* General case: We can just grab the one used channel per src. */ |
||
734 | for (int i = 0; i < info->num_inputs; i++) { |
||
735 | unsigned chan = ffs(alu->dest.write_mask) - 1; |
||
736 | nir_alu_src *asrc = &alu->src[i]; |
||
737 | |||
738 | compile_assert(ctx, !asrc->abs); |
||
739 | compile_assert(ctx, !asrc->negate); |
||
740 | |||
741 | src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[chan]]; |
||
742 | |||
743 | compile_assert(ctx, src[i]); |
||
744 | } |
||
745 | |||
746 | switch (alu->op) { |
||
747 | case nir_op_f2i: |
||
748 | dst[0] = ir3_COV(b, src[0], TYPE_F32, TYPE_S32); |
||
749 | break; |
||
750 | case nir_op_f2u: |
||
751 | dst[0] = ir3_COV(b, src[0], TYPE_F32, TYPE_U32); |
||
752 | break; |
||
753 | case nir_op_i2f: |
||
754 | dst[0] = ir3_COV(b, src[0], TYPE_S32, TYPE_F32); |
||
755 | break; |
||
756 | case nir_op_u2f: |
||
757 | dst[0] = ir3_COV(b, src[0], TYPE_U32, TYPE_F32); |
||
758 | break; |
||
759 | case nir_op_imov: |
||
760 | dst[0] = ir3_MOV(b, src[0], TYPE_S32); |
||
761 | break; |
||
762 | case nir_op_fmov: |
||
763 | dst[0] = ir3_MOV(b, src[0], TYPE_F32); |
||
764 | break; |
||
765 | case nir_op_f2b: |
||
766 | dst[0] = ir3_CMPS_F(b, src[0], 0, create_immed(b, fui(0.0)), 0); |
||
767 | dst[0]->cat2.condition = IR3_COND_NE; |
||
768 | dst[0] = ir3_n2b(b, dst[0]); |
||
769 | break; |
||
770 | case nir_op_b2f: |
||
771 | dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F32); |
||
772 | break; |
||
773 | case nir_op_b2i: |
||
774 | dst[0] = ir3_b2n(b, src[0]); |
||
775 | break; |
||
776 | case nir_op_i2b: |
||
777 | dst[0] = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0); |
||
778 | dst[0]->cat2.condition = IR3_COND_NE; |
||
779 | dst[0] = ir3_n2b(b, dst[0]); |
||
780 | break; |
||
781 | |||
782 | case nir_op_fneg: |
||
783 | dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG); |
||
784 | break; |
||
785 | case nir_op_fabs: |
||
786 | dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS); |
||
787 | break; |
||
788 | case nir_op_fmax: |
||
789 | dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0); |
||
790 | break; |
||
791 | case nir_op_fmin: |
||
792 | dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0); |
||
793 | break; |
||
794 | case nir_op_fmul: |
||
795 | dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0); |
||
796 | break; |
||
797 | case nir_op_fadd: |
||
798 | dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0); |
||
799 | break; |
||
800 | case nir_op_fsub: |
||
801 | dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG); |
||
802 | break; |
||
803 | case nir_op_ffma: |
||
804 | dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0); |
||
805 | break; |
||
806 | case nir_op_fddx: |
||
807 | dst[0] = ir3_DSX(b, src[0], 0); |
||
808 | dst[0]->cat5.type = TYPE_F32; |
||
809 | break; |
||
810 | case nir_op_fddy: |
||
811 | dst[0] = ir3_DSY(b, src[0], 0); |
||
812 | dst[0]->cat5.type = TYPE_F32; |
||
813 | break; |
||
814 | break; |
||
815 | case nir_op_flt: |
||
816 | dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); |
||
817 | dst[0]->cat2.condition = IR3_COND_LT; |
||
818 | dst[0] = ir3_n2b(b, dst[0]); |
||
819 | break; |
||
820 | case nir_op_fge: |
||
821 | dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); |
||
822 | dst[0]->cat2.condition = IR3_COND_GE; |
||
823 | dst[0] = ir3_n2b(b, dst[0]); |
||
824 | break; |
||
825 | case nir_op_feq: |
||
826 | dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); |
||
827 | dst[0]->cat2.condition = IR3_COND_EQ; |
||
828 | dst[0] = ir3_n2b(b, dst[0]); |
||
829 | break; |
||
830 | case nir_op_fne: |
||
831 | dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0); |
||
832 | dst[0]->cat2.condition = IR3_COND_NE; |
||
833 | dst[0] = ir3_n2b(b, dst[0]); |
||
834 | break; |
||
835 | case nir_op_fceil: |
||
836 | dst[0] = ir3_CEIL_F(b, src[0], 0); |
||
837 | break; |
||
838 | case nir_op_ffloor: |
||
839 | dst[0] = ir3_FLOOR_F(b, src[0], 0); |
||
840 | break; |
||
841 | case nir_op_ftrunc: |
||
842 | dst[0] = ir3_TRUNC_F(b, src[0], 0); |
||
843 | break; |
||
844 | case nir_op_fround_even: |
||
845 | dst[0] = ir3_RNDNE_F(b, src[0], 0); |
||
846 | break; |
||
847 | case nir_op_fsign: |
||
848 | dst[0] = ir3_SIGN_F(b, src[0], 0); |
||
849 | break; |
||
850 | |||
851 | case nir_op_fsin: |
||
852 | dst[0] = ir3_SIN(b, src[0], 0); |
||
853 | break; |
||
854 | case nir_op_fcos: |
||
855 | dst[0] = ir3_COS(b, src[0], 0); |
||
856 | break; |
||
857 | case nir_op_frsq: |
||
858 | dst[0] = ir3_RSQ(b, src[0], 0); |
||
859 | break; |
||
860 | case nir_op_frcp: |
||
861 | dst[0] = ir3_RCP(b, src[0], 0); |
||
862 | break; |
||
863 | case nir_op_flog2: |
||
864 | dst[0] = ir3_LOG2(b, src[0], 0); |
||
865 | break; |
||
866 | case nir_op_fexp2: |
||
867 | dst[0] = ir3_EXP2(b, src[0], 0); |
||
868 | break; |
||
869 | case nir_op_fsqrt: |
||
870 | dst[0] = ir3_SQRT(b, src[0], 0); |
||
871 | break; |
||
872 | |||
873 | case nir_op_iabs: |
||
874 | dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS); |
||
875 | break; |
||
876 | case nir_op_iadd: |
||
877 | dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0); |
||
878 | break; |
||
879 | case nir_op_iand: |
||
880 | dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0); |
||
881 | break; |
||
882 | case nir_op_imax: |
||
883 | dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0); |
||
884 | break; |
||
885 | case nir_op_imin: |
||
886 | dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0); |
||
887 | break; |
||
888 | case nir_op_imul: |
||
889 | /* |
||
890 | * dst = (al * bl) + (ah * bl << 16) + (al * bh << 16) |
||
891 | * mull.u tmp0, a, b ; mul low, i.e. al * bl |
||
892 | * madsh.m16 tmp1, a, b, tmp0 ; mul-add shift high mix, i.e. ah * bl << 16 |
||
893 | * madsh.m16 dst, b, a, tmp1 ; i.e. al * bh << 16 |
||
894 | */ |
||
895 | dst[0] = ir3_MADSH_M16(b, src[1], 0, src[0], 0, |
||
896 | ir3_MADSH_M16(b, src[0], 0, src[1], 0, |
||
897 | ir3_MULL_U(b, src[0], 0, src[1], 0), 0), 0); |
||
898 | break; |
||
899 | case nir_op_ineg: |
||
900 | dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG); |
||
901 | break; |
||
902 | case nir_op_inot: |
||
903 | dst[0] = ir3_NOT_B(b, src[0], 0); |
||
904 | break; |
||
905 | case nir_op_ior: |
||
906 | dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0); |
||
907 | break; |
||
908 | case nir_op_ishl: |
||
909 | dst[0] = ir3_SHL_B(b, src[0], 0, src[1], 0); |
||
910 | break; |
||
911 | case nir_op_ishr: |
||
912 | dst[0] = ir3_ASHR_B(b, src[0], 0, src[1], 0); |
||
913 | break; |
||
914 | case nir_op_isign: { |
||
915 | /* maybe this would be sane to lower in nir.. */ |
||
916 | struct ir3_instruction *neg, *pos; |
||
917 | |||
918 | neg = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0); |
||
919 | neg->cat2.condition = IR3_COND_LT; |
||
920 | |||
921 | pos = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0); |
||
922 | pos->cat2.condition = IR3_COND_GT; |
||
923 | |||
924 | dst[0] = ir3_SUB_U(b, pos, 0, neg, 0); |
||
925 | |||
926 | break; |
||
927 | } |
||
928 | case nir_op_isub: |
||
929 | dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0); |
||
930 | break; |
||
931 | case nir_op_ixor: |
||
932 | dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0); |
||
933 | break; |
||
934 | case nir_op_ushr: |
||
935 | dst[0] = ir3_SHR_B(b, src[0], 0, src[1], 0); |
||
936 | break; |
||
937 | case nir_op_ilt: |
||
938 | dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); |
||
939 | dst[0]->cat2.condition = IR3_COND_LT; |
||
940 | dst[0] = ir3_n2b(b, dst[0]); |
||
941 | break; |
||
942 | case nir_op_ige: |
||
943 | dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); |
||
944 | dst[0]->cat2.condition = IR3_COND_GE; |
||
945 | dst[0] = ir3_n2b(b, dst[0]); |
||
946 | break; |
||
947 | case nir_op_ieq: |
||
948 | dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); |
||
949 | dst[0]->cat2.condition = IR3_COND_EQ; |
||
950 | dst[0] = ir3_n2b(b, dst[0]); |
||
951 | break; |
||
952 | case nir_op_ine: |
||
953 | dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0); |
||
954 | dst[0]->cat2.condition = IR3_COND_NE; |
||
955 | dst[0] = ir3_n2b(b, dst[0]); |
||
956 | break; |
||
957 | case nir_op_ult: |
||
958 | dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0); |
||
959 | dst[0]->cat2.condition = IR3_COND_LT; |
||
960 | dst[0] = ir3_n2b(b, dst[0]); |
||
961 | break; |
||
962 | case nir_op_uge: |
||
963 | dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0); |
||
964 | dst[0]->cat2.condition = IR3_COND_GE; |
||
965 | dst[0] = ir3_n2b(b, dst[0]); |
||
966 | break; |
||
967 | |||
968 | case nir_op_bcsel: |
||
969 | dst[0] = ir3_SEL_B32(b, src[1], 0, ir3_b2n(b, src[0]), 0, src[2], 0); |
||
970 | break; |
||
971 | |||
972 | default: |
||
973 | compile_error(ctx, "Unhandled ALU op: %s\n", |
||
974 | nir_op_infos[alu->op].name); |
||
975 | break; |
||
976 | } |
||
977 | } |
||
978 | |||
/* handles direct/indirect UBO reads: */
static void
emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
		struct ir3_instruction **dst)
{
	struct ir3_block *b = ctx->block;
	struct ir3_instruction *addr, *src0, *src1;
	/* UBO addresses are the first driver params: */
	unsigned ubo = regid(ctx->so->first_driver_param, 0);
	unsigned off = intr->const_index[0];

	/* First src is ubo index, which could either be an immed or not: */
	src0 = get_src(ctx, &intr->src[0])[0];
	if (is_same_type_mov(src0) &&
			(src0->regs[1]->flags & IR3_REG_IMMED)) {
		/* constant UBO index: base address comes straight from the
		 * corresponding driver-param uniform:
		 */
		addr = create_uniform(ctx, ubo + src0->regs[1]->iim_val);
	} else {
		/* dynamic UBO index: use address-register relative uniform
		 * access to pick the UBO base pointer:
		 */
		addr = create_uniform_indirect(ctx, ubo, get_addr(ctx, src0));
	}

	if (intr->intrinsic == nir_intrinsic_load_ubo_indirect) {
		/* For load_ubo_indirect, second src is indirect offset: */
		src1 = get_src(ctx, &intr->src[1])[0];

		/* and add offset to addr: */
		addr = ir3_ADD_S(b, addr, 0, src1, 0);
	}

	/* if offset is too large to encode in the ldg, split it out:
	 * (the ldg immediate offset field can encode at most 1024 bytes)
	 */
	if ((off + (intr->num_components * 4)) > 1024) {
		/* split out the minimal amount to improve the odds that
		 * cp can fit the immediate in the add.s instruction:
		 */
		unsigned off2 = off + (intr->num_components * 4) - 1024;
		addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
		off -= off2;
	}

	/* one scalar ldg per component, at consecutive 4-byte offsets: */
	for (int i = 0; i < intr->num_components; i++) {
		struct ir3_instruction *load =
				ir3_LDG(b, addr, 0, create_immed(b, 1), 0);
		load->cat6.type = TYPE_U32;
		load->cat6.offset = off + i * 4;    /* byte offset */
		dst[i] = load;
	}
}
||
1025 | |||
1026 | /* handles array reads: */ |
||
1027 | static void |
||
1028 | emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr, |
||
1029 | struct ir3_instruction **dst) |
||
1030 | { |
||
1031 | nir_deref_var *dvar = intr->variables[0]; |
||
1032 | nir_deref_array *darr = nir_deref_as_array(dvar->deref.child); |
||
1033 | struct ir3_array *arr = get_var(ctx, dvar->var); |
||
1034 | |||
1035 | compile_assert(ctx, dvar->deref.child && |
||
1036 | (dvar->deref.child->deref_type == nir_deref_type_array)); |
||
1037 | |||
1038 | switch (darr->deref_array_type) { |
||
1039 | case nir_deref_array_type_direct: |
||
1040 | /* direct access does not require anything special: */ |
||
1041 | for (int i = 0; i < intr->num_components; i++) { |
||
1042 | unsigned n = darr->base_offset * 4 + i; |
||
1043 | compile_assert(ctx, n < arr->length); |
||
1044 | dst[i] = arr->arr[n]; |
||
1045 | } |
||
1046 | break; |
||
1047 | case nir_deref_array_type_indirect: { |
||
1048 | /* for indirect, we need to collect all the array elements: */ |
||
1049 | struct ir3_instruction *collect = |
||
1050 | create_collect(ctx->block, arr->arr, arr->length); |
||
1051 | struct ir3_instruction *addr = |
||
1052 | get_addr(ctx, get_src(ctx, &darr->indirect)[0]); |
||
1053 | for (int i = 0; i < intr->num_components; i++) { |
||
1054 | unsigned n = darr->base_offset * 4 + i; |
||
1055 | compile_assert(ctx, n < arr->length); |
||
1056 | dst[i] = create_indirect_load(ctx, arr->length, n, addr, collect); |
||
1057 | } |
||
1058 | break; |
||
1059 | } |
||
1060 | default: |
||
1061 | compile_error(ctx, "Unhandled load deref type: %u\n", |
||
1062 | darr->deref_array_type); |
||
1063 | break; |
||
1064 | } |
||
1065 | } |
||
1066 | |||
1067 | /* handles array writes: */ |
||
1068 | static void |
||
1069 | emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr) |
||
1070 | { |
||
1071 | nir_deref_var *dvar = intr->variables[0]; |
||
1072 | nir_deref_array *darr = nir_deref_as_array(dvar->deref.child); |
||
1073 | struct ir3_array *arr = get_var(ctx, dvar->var); |
||
1074 | struct ir3_instruction **src; |
||
1075 | |||
1076 | compile_assert(ctx, dvar->deref.child && |
||
1077 | (dvar->deref.child->deref_type == nir_deref_type_array)); |
||
1078 | |||
1079 | src = get_src(ctx, &intr->src[0]); |
||
1080 | |||
1081 | switch (darr->deref_array_type) { |
||
1082 | case nir_deref_array_type_direct: |
||
1083 | /* direct access does not require anything special: */ |
||
1084 | for (int i = 0; i < intr->num_components; i++) { |
||
1085 | unsigned n = darr->base_offset * 4 + i; |
||
1086 | compile_assert(ctx, n < arr->length); |
||
1087 | arr->arr[n] = src[i]; |
||
1088 | } |
||
1089 | break; |
||
1090 | case nir_deref_array_type_indirect: { |
||
1091 | /* for indirect, create indirect-store and fan that out: */ |
||
1092 | struct ir3_instruction *collect = |
||
1093 | create_collect(ctx->block, arr->arr, arr->length); |
||
1094 | struct ir3_instruction *addr = |
||
1095 | get_addr(ctx, get_src(ctx, &darr->indirect)[0]); |
||
1096 | for (int i = 0; i < intr->num_components; i++) { |
||
1097 | struct ir3_instruction *store; |
||
1098 | unsigned n = darr->base_offset * 4 + i; |
||
1099 | compile_assert(ctx, n < arr->length); |
||
1100 | |||
1101 | store = create_indirect_store(ctx, arr->length, |
||
1102 | n, src[i], addr, collect); |
||
1103 | |||
1104 | store->fanin->fi.aid = arr->aid; |
||
1105 | |||
1106 | /* TODO: probably split this out to be used for |
||
1107 | * store_output_indirect? or move this into |
||
1108 | * create_indirect_store()? |
||
1109 | */ |
||
1110 | for (int j = i; j < arr->length; j += 4) { |
||
1111 | struct ir3_instruction *split; |
||
1112 | |||
1113 | split = ir3_instr_create(ctx->block, -1, OPC_META_FO); |
||
1114 | split->fo.off = j; |
||
1115 | ir3_reg_create(split, 0, 0); |
||
1116 | ir3_reg_create(split, 0, IR3_REG_SSA)->instr = store; |
||
1117 | |||
1118 | arr->arr[j] = split; |
||
1119 | } |
||
1120 | } |
||
1121 | break; |
||
1122 | } |
||
1123 | default: |
||
1124 | compile_error(ctx, "Unhandled store deref type: %u\n", |
||
1125 | darr->deref_array_type); |
||
1126 | break; |
||
1127 | } |
||
1128 | } |
||
1129 | |||
1130 | static void add_sysval_input(struct ir3_compile *ctx, unsigned name, |
||
1131 | struct ir3_instruction *instr) |
||
1132 | { |
||
1133 | struct ir3_shader_variant *so = ctx->so; |
||
1134 | unsigned r = regid(so->inputs_count, 0); |
||
1135 | unsigned n = so->inputs_count++; |
||
1136 | |||
1137 | so->inputs[n].semantic = ir3_semantic_name(name, 0); |
||
1138 | so->inputs[n].compmask = 1; |
||
1139 | so->inputs[n].regid = r; |
||
1140 | so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT; |
||
1141 | so->total_in++; |
||
1142 | |||
1143 | ctx->block->ninputs = MAX2(ctx->block->ninputs, r + 1); |
||
1144 | ctx->block->inputs[r] = instr; |
||
1145 | } |
||
1146 | |||
/* Translate one NIR intrinsic instruction into ir3.  Handles uniform/
 * UBO/input loads, variable loads/stores, output stores, sysval loads
 * and discard.
 */
static void
emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
{
	const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
	struct ir3_instruction **dst, **src;
	struct ir3_block *b = ctx->block;
	unsigned idx = intr->const_index[0];

	/* NOTE: dst is only assigned when the intrinsic has a dest; the
	 * cases below that read it are presumably all has_dest intrinsics
	 * — TODO confirm, otherwise it is read uninitialized.
	 */
	if (info->has_dest) {
		dst = get_dst(ctx, &intr->dest, intr->num_components);
	}

	switch (intr->intrinsic) {
	case nir_intrinsic_load_uniform:
		compile_assert(ctx, intr->const_index[1] == 1);
		for (int i = 0; i < intr->num_components; i++) {
			/* uniforms are vec4-aligned, so scale index by 4: */
			unsigned n = idx * 4 + i;
			dst[i] = create_uniform(ctx, n);
		}
		break;
	case nir_intrinsic_load_uniform_indirect:
		compile_assert(ctx, intr->const_index[1] == 1);
		/* src[0] provides the dynamic offset, routed via addr reg: */
		src = get_src(ctx, &intr->src[0]);
		for (int i = 0; i < intr->num_components; i++) {
			unsigned n = idx * 4 + i;
			dst[i] = create_uniform_indirect(ctx, n,
					get_addr(ctx, src[0]));
		}
		break;
	case nir_intrinsic_load_ubo:
	case nir_intrinsic_load_ubo_indirect:
		emit_intrinsic_load_ubo(ctx, intr, dst);
		break;
	case nir_intrinsic_load_input:
		compile_assert(ctx, intr->const_index[1] == 1);
		for (int i = 0; i < intr->num_components; i++) {
			unsigned n = idx * 4 + i;
			dst[i] = b->inputs[n];
		}
		break;
	case nir_intrinsic_load_input_indirect:
		compile_assert(ctx, intr->const_index[1] == 1);
		src = get_src(ctx, &intr->src[0]);
		/* gather all inputs into one collect so the indirect load
		 * can index into them:
		 */
		struct ir3_instruction *collect =
				create_collect(b, b->inputs, b->ninputs);
		struct ir3_instruction *addr = get_addr(ctx, src[0]);
		for (int i = 0; i < intr->num_components; i++) {
			unsigned n = idx * 4 + i;
			dst[i] = create_indirect_load(ctx, b->ninputs, n, addr, collect);
		}
		break;
	case nir_intrinsic_load_var:
		emit_intrinisic_load_var(ctx, intr, dst);
		break;
	case nir_intrinsic_store_var:
		emit_intrinisic_store_var(ctx, intr);
		break;
	case nir_intrinsic_store_output:
		compile_assert(ctx, intr->const_index[1] == 1);
		src = get_src(ctx, &intr->src[0]);
		for (int i = 0; i < intr->num_components; i++) {
			unsigned n = idx * 4 + i;
			b->outputs[n] = src[i];
		}
		break;
	case nir_intrinsic_load_base_vertex:
		/* lazily create the basevertex uniform + sysval input once,
		 * then reuse it for subsequent loads:
		 */
		if (!ctx->basevertex) {
			/* first four vec4 sysval's reserved for UBOs: */
			unsigned r = regid(ctx->so->first_driver_param + 4, 0);
			ctx->basevertex = create_uniform(ctx, r);
			add_sysval_input(ctx, TGSI_SEMANTIC_BASEVERTEX,
					ctx->basevertex);
		}
		dst[0] = ctx->basevertex;
		break;
	case nir_intrinsic_load_vertex_id_zero_base:
		if (!ctx->vertex_id) {
			ctx->vertex_id = create_input(ctx->block, NULL, 0);
			add_sysval_input(ctx, TGSI_SEMANTIC_VERTEXID_NOBASE,
					ctx->vertex_id);
		}
		dst[0] = ctx->vertex_id;
		break;
	case nir_intrinsic_load_instance_id:
		if (!ctx->instance_id) {
			ctx->instance_id = create_input(ctx->block, NULL, 0);
			add_sysval_input(ctx, TGSI_SEMANTIC_INSTANCEID,
					ctx->instance_id);
		}
		dst[0] = ctx->instance_id;
		break;
	case nir_intrinsic_discard_if:
	case nir_intrinsic_discard: {
		struct ir3_instruction *cond, *kill;

		if (intr->intrinsic == nir_intrinsic_discard_if) {
			/* conditional discard: */
			src = get_src(ctx, &intr->src[0]);
			cond = ir3_b2n(b, src[0]);
		} else {
			/* unconditional discard: */
			cond = create_immed(b, 1);
		}

		/* turn the condition into a "!= 0" compare feeding kill: */
		cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
		cond->cat2.condition = IR3_COND_NE;

		/* condition always goes in predicate register: */
		cond->regs[0]->num = regid(REG_P0, 0);

		kill = ir3_KILL(b, cond, 0);

		/* track kills so they can be appended to the outputs later: */
		ctx->kill[ctx->kill_count++] = kill;
		ctx->so->has_kill = true;

		break;
	}
	default:
		compile_error(ctx, "Unhandled intrinsic type: %s\n",
				nir_intrinsic_infos[intr->intrinsic].name);
		break;
	}
}
||
1270 | |||
1271 | static void |
||
1272 | emit_load_const(struct ir3_compile *ctx, nir_load_const_instr *instr) |
||
1273 | { |
||
1274 | struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def, |
||
1275 | instr->def.num_components); |
||
1276 | for (int i = 0; i < instr->def.num_components; i++) |
||
1277 | dst[i] = create_immed(ctx->block, instr->value.u[i]); |
||
1278 | } |
||
1279 | |||
1280 | static void |
||
1281 | emit_undef(struct ir3_compile *ctx, nir_ssa_undef_instr *undef) |
||
1282 | { |
||
1283 | struct ir3_instruction **dst = get_dst_ssa(ctx, &undef->def, |
||
1284 | undef->def.num_components); |
||
1285 | /* backend doesn't want undefined instructions, so just plug |
||
1286 | * in 0.0.. |
||
1287 | */ |
||
1288 | for (int i = 0; i < undef->def.num_components; i++) |
||
1289 | dst[i] = create_immed(ctx->block, fui(0.0)); |
||
1290 | } |
||
1291 | |||
1292 | /* |
||
1293 | * texture fetch/sample instructions: |
||
1294 | */ |
||
1295 | |||
1296 | static void |
||
1297 | tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp) |
||
1298 | { |
||
1299 | unsigned coords, flags = 0; |
||
1300 | |||
1301 | /* note: would use tex->coord_components.. except txs.. also, |
||
1302 | * since array index goes after shadow ref, we don't want to |
||
1303 | * count it: |
||
1304 | */ |
||
1305 | switch (tex->sampler_dim) { |
||
1306 | case GLSL_SAMPLER_DIM_1D: |
||
1307 | case GLSL_SAMPLER_DIM_BUF: |
||
1308 | coords = 1; |
||
1309 | break; |
||
1310 | case GLSL_SAMPLER_DIM_2D: |
||
1311 | case GLSL_SAMPLER_DIM_RECT: |
||
1312 | case GLSL_SAMPLER_DIM_EXTERNAL: |
||
1313 | case GLSL_SAMPLER_DIM_MS: |
||
1314 | coords = 2; |
||
1315 | break; |
||
1316 | case GLSL_SAMPLER_DIM_3D: |
||
1317 | case GLSL_SAMPLER_DIM_CUBE: |
||
1318 | coords = 3; |
||
1319 | flags |= IR3_INSTR_3D; |
||
1320 | break; |
||
1321 | } |
||
1322 | |||
1323 | if (tex->is_shadow) |
||
1324 | flags |= IR3_INSTR_S; |
||
1325 | |||
1326 | if (tex->is_array) |
||
1327 | flags |= IR3_INSTR_A; |
||
1328 | |||
1329 | *flagsp = flags; |
||
1330 | *coordsp = coords; |
||
1331 | } |
||
1332 | |||
1333 | static void |
||
1334 | emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex) |
||
1335 | { |
||
1336 | struct ir3_block *b = ctx->block; |
||
1337 | struct ir3_instruction **dst, *sam, *src0[12], *src1[4]; |
||
1338 | struct ir3_instruction **coord, *lod, *compare, *proj, **off, **ddx, **ddy; |
||
1339 | bool has_bias = false, has_lod = false, has_proj = false, has_off = false; |
||
1340 | unsigned i, coords, flags; |
||
1341 | unsigned nsrc0 = 0, nsrc1 = 0; |
||
1342 | type_t type; |
||
1343 | opc_t opc; |
||
1344 | |||
1345 | /* TODO: might just be one component for gathers? */ |
||
1346 | dst = get_dst(ctx, &tex->dest, 4); |
||
1347 | |||
1348 | for (unsigned i = 0; i < tex->num_srcs; i++) { |
||
1349 | switch (tex->src[i].src_type) { |
||
1350 | case nir_tex_src_coord: |
||
1351 | coord = get_src(ctx, &tex->src[i].src); |
||
1352 | break; |
||
1353 | case nir_tex_src_bias: |
||
1354 | lod = get_src(ctx, &tex->src[i].src)[0]; |
||
1355 | has_bias = true; |
||
1356 | break; |
||
1357 | case nir_tex_src_lod: |
||
1358 | lod = get_src(ctx, &tex->src[i].src)[0]; |
||
1359 | has_lod = true; |
||
1360 | break; |
||
1361 | case nir_tex_src_comparitor: /* shadow comparator */ |
||
1362 | compare = get_src(ctx, &tex->src[i].src)[0]; |
||
1363 | break; |
||
1364 | case nir_tex_src_projector: |
||
1365 | proj = get_src(ctx, &tex->src[i].src)[0]; |
||
1366 | has_proj = true; |
||
1367 | break; |
||
1368 | case nir_tex_src_offset: |
||
1369 | off = get_src(ctx, &tex->src[i].src); |
||
1370 | has_off = true; |
||
1371 | break; |
||
1372 | case nir_tex_src_ddx: |
||
1373 | ddx = get_src(ctx, &tex->src[i].src); |
||
1374 | break; |
||
1375 | case nir_tex_src_ddy: |
||
1376 | ddy = get_src(ctx, &tex->src[i].src); |
||
1377 | break; |
||
1378 | default: |
||
1379 | compile_error(ctx, "Unhandled NIR tex serc type: %d\n", |
||
1380 | tex->src[i].src_type); |
||
1381 | return; |
||
1382 | } |
||
1383 | } |
||
1384 | |||
1385 | switch (tex->op) { |
||
1386 | case nir_texop_tex: opc = OPC_SAM; break; |
||
1387 | case nir_texop_txb: opc = OPC_SAMB; break; |
||
1388 | case nir_texop_txl: opc = OPC_SAML; break; |
||
1389 | case nir_texop_txd: opc = OPC_SAMGQ; break; |
||
1390 | case nir_texop_txf: opc = OPC_ISAML; break; |
||
1391 | case nir_texop_txf_ms: |
||
1392 | case nir_texop_txs: |
||
1393 | case nir_texop_lod: |
||
1394 | case nir_texop_tg4: |
||
1395 | case nir_texop_query_levels: |
||
1396 | compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op); |
||
1397 | return; |
||
1398 | } |
||
1399 | |||
1400 | tex_info(tex, &flags, &coords); |
||
1401 | |||
1402 | /* scale up integer coords for TXF based on the LOD */ |
||
1403 | if (opc == OPC_ISAML) { |
||
1404 | assert(has_lod); |
||
1405 | for (i = 0; i < coords; i++) |
||
1406 | coord[i] = ir3_SHL_B(b, coord[i], 0, lod, 0); |
||
1407 | } |
||
1408 | /* |
||
1409 | * lay out the first argument in the proper order: |
||
1410 | * - actual coordinates first |
||
1411 | * - shadow reference |
||
1412 | * - array index |
||
1413 | * - projection w |
||
1414 | * - starting at offset 4, dpdx.xy, dpdy.xy |
||
1415 | * |
||
1416 | * bias/lod go into the second arg |
||
1417 | */ |
||
1418 | |||
1419 | /* insert tex coords: */ |
||
1420 | for (i = 0; i < coords; i++) |
||
1421 | src0[nsrc0++] = coord[i]; |
||
1422 | |||
1423 | if (coords == 1) { |
||
1424 | /* hw doesn't do 1d, so we treat it as 2d with |
||
1425 | * height of 1, and patch up the y coord. |
||
1426 | * TODO: y coord should be (int)0 in some cases.. |
||
1427 | */ |
||
1428 | src0[nsrc0++] = create_immed(b, fui(0.5)); |
||
1429 | } |
||
1430 | |||
1431 | if (tex->is_shadow) |
||
1432 | src0[nsrc0++] = compare; |
||
1433 | |||
1434 | if (tex->is_array) |
||
1435 | src0[nsrc0++] = coord[coords]; |
||
1436 | |||
1437 | if (has_proj) { |
||
1438 | src0[nsrc0++] = proj; |
||
1439 | flags |= IR3_INSTR_P; |
||
1440 | } |
||
1441 | |||
1442 | /* pad to 4, then ddx/ddy: */ |
||
1443 | if (tex->op == nir_texop_txd) { |
||
1444 | while (nsrc0 < 4) |
||
1445 | src0[nsrc0++] = create_immed(b, fui(0.0)); |
||
1446 | for (i = 0; i < coords; i++) |
||
1447 | src0[nsrc0++] = ddx[i]; |
||
1448 | if (coords < 2) |
||
1449 | src0[nsrc0++] = create_immed(b, fui(0.0)); |
||
1450 | for (i = 0; i < coords; i++) |
||
1451 | src0[nsrc0++] = ddy[i]; |
||
1452 | if (coords < 2) |
||
1453 | src0[nsrc0++] = create_immed(b, fui(0.0)); |
||
1454 | } |
||
1455 | |||
1456 | /* |
||
1457 | * second argument (if applicable): |
||
1458 | * - offsets |
||
1459 | * - lod |
||
1460 | * - bias |
||
1461 | */ |
||
1462 | if (has_off | has_lod | has_bias) { |
||
1463 | if (has_off) { |
||
1464 | for (i = 0; i < coords; i++) |
||
1465 | src1[nsrc1++] = off[i]; |
||
1466 | if (coords < 2) |
||
1467 | src1[nsrc1++] = create_immed(b, fui(0.0)); |
||
1468 | flags |= IR3_INSTR_O; |
||
1469 | } |
||
1470 | |||
1471 | if (has_lod | has_bias) |
||
1472 | src1[nsrc1++] = lod; |
||
1473 | } |
||
1474 | |||
1475 | switch (tex->dest_type) { |
||
1476 | case nir_type_invalid: |
||
1477 | case nir_type_float: |
||
1478 | type = TYPE_F32; |
||
1479 | break; |
||
1480 | case nir_type_int: |
||
1481 | type = TYPE_S32; |
||
1482 | break; |
||
1483 | case nir_type_unsigned: |
||
1484 | case nir_type_bool: |
||
1485 | type = TYPE_U32; |
||
1486 | break; |
||
1487 | } |
||
1488 | |||
1489 | sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_XYZW, |
||
1490 | flags, tex->sampler_index, tex->sampler_index, |
||
1491 | create_collect(b, src0, nsrc0), |
||
1492 | create_collect(b, src1, nsrc1)); |
||
1493 | |||
1494 | split_dest(b, dst, sam); |
||
1495 | } |
||
1496 | |||
1497 | static void |
||
1498 | emit_tex_query_levels(struct ir3_compile *ctx, nir_tex_instr *tex) |
||
1499 | { |
||
1500 | struct ir3_block *b = ctx->block; |
||
1501 | struct ir3_instruction **dst, *sam; |
||
1502 | |||
1503 | dst = get_dst(ctx, &tex->dest, 1); |
||
1504 | |||
1505 | sam = ir3_SAM(b, OPC_GETINFO, TYPE_U32, TGSI_WRITEMASK_Z, 0, |
||
1506 | tex->sampler_index, tex->sampler_index, NULL, NULL); |
||
1507 | |||
1508 | /* even though there is only one component, since it ends |
||
1509 | * up in .z rather than .x, we need a split_dest() |
||
1510 | */ |
||
1511 | split_dest(b, dst, sam); |
||
1512 | |||
1513 | /* The # of levels comes from getinfo.z. We need to add 1 to it, since |
||
1514 | * the value in TEX_CONST_0 is zero-based. |
||
1515 | */ |
||
1516 | if (ctx->levels_add_one) |
||
1517 | dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0); |
||
1518 | } |
||
1519 | |||
1520 | static void |
||
1521 | emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex) |
||
1522 | { |
||
1523 | struct ir3_block *b = ctx->block; |
||
1524 | struct ir3_instruction **dst, *sam, *lod; |
||
1525 | unsigned flags, coords; |
||
1526 | |||
1527 | tex_info(tex, &flags, &coords); |
||
1528 | |||
1529 | dst = get_dst(ctx, &tex->dest, 4); |
||
1530 | |||
1531 | compile_assert(ctx, tex->num_srcs == 1); |
||
1532 | compile_assert(ctx, tex->src[0].src_type == nir_tex_src_lod); |
||
1533 | |||
1534 | lod = get_src(ctx, &tex->src[0].src)[0]; |
||
1535 | |||
1536 | sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, TGSI_WRITEMASK_XYZW, flags, |
||
1537 | tex->sampler_index, tex->sampler_index, lod, NULL); |
||
1538 | |||
1539 | split_dest(b, dst, sam); |
||
1540 | |||
1541 | /* Array size actually ends up in .w rather than .z. This doesn't |
||
1542 | * matter for miplevel 0, but for higher mips the value in z is |
||
1543 | * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is |
||
1544 | * returned, which means that we have to add 1 to it for arrays. |
||
1545 | */ |
||
1546 | if (tex->is_array) { |
||
1547 | if (ctx->levels_add_one) { |
||
1548 | dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0); |
||
1549 | } else { |
||
1550 | dst[coords] = ir3_MOV(b, dst[3], TYPE_U32); |
||
1551 | } |
||
1552 | } |
||
1553 | } |
||
1554 | |||
1555 | static void |
||
1556 | emit_instr(struct ir3_compile *ctx, nir_instr *instr) |
||
1557 | { |
||
1558 | switch (instr->type) { |
||
1559 | case nir_instr_type_alu: |
||
1560 | emit_alu(ctx, nir_instr_as_alu(instr)); |
||
1561 | break; |
||
1562 | case nir_instr_type_intrinsic: |
||
1563 | emit_intrinisic(ctx, nir_instr_as_intrinsic(instr)); |
||
1564 | break; |
||
1565 | case nir_instr_type_load_const: |
||
1566 | emit_load_const(ctx, nir_instr_as_load_const(instr)); |
||
1567 | break; |
||
1568 | case nir_instr_type_ssa_undef: |
||
1569 | emit_undef(ctx, nir_instr_as_ssa_undef(instr)); |
||
1570 | break; |
||
1571 | case nir_instr_type_tex: { |
||
1572 | nir_tex_instr *tex = nir_instr_as_tex(instr); |
||
1573 | /* couple tex instructions get special-cased: |
||
1574 | */ |
||
1575 | switch (tex->op) { |
||
1576 | case nir_texop_txs: |
||
1577 | emit_tex_txs(ctx, tex); |
||
1578 | break; |
||
1579 | case nir_texop_query_levels: |
||
1580 | emit_tex_query_levels(ctx, tex); |
||
1581 | break; |
||
1582 | default: |
||
1583 | emit_tex(ctx, tex); |
||
1584 | break; |
||
1585 | } |
||
1586 | break; |
||
1587 | } |
||
1588 | case nir_instr_type_call: |
||
1589 | case nir_instr_type_jump: |
||
1590 | case nir_instr_type_phi: |
||
1591 | case nir_instr_type_parallel_copy: |
||
1592 | compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type); |
||
1593 | break; |
||
1594 | } |
||
1595 | } |
||
1596 | |||
1597 | static void |
||
1598 | emit_block(struct ir3_compile *ctx, nir_block *block) |
||
1599 | { |
||
1600 | nir_foreach_instr(block, instr) { |
||
1601 | emit_instr(ctx, instr); |
||
1602 | if (ctx->error) |
||
1603 | return; |
||
1604 | } |
||
1605 | } |
||
1606 | |||
1607 | static void |
||
1608 | emit_function(struct ir3_compile *ctx, nir_function_impl *impl) |
||
1609 | { |
||
1610 | foreach_list_typed(nir_cf_node, node, node, &impl->body) { |
||
1611 | switch (node->type) { |
||
1612 | case nir_cf_node_block: |
||
1613 | emit_block(ctx, nir_cf_node_as_block(node)); |
||
1614 | break; |
||
1615 | case nir_cf_node_if: |
||
1616 | case nir_cf_node_loop: |
||
1617 | case nir_cf_node_function: |
||
1618 | compile_error(ctx, "TODO\n"); |
||
1619 | break; |
||
1620 | } |
||
1621 | if (ctx->error) |
||
1622 | return; |
||
1623 | } |
||
1624 | } |
||
1625 | |||
/* Set up one shader input variable: fill in the ir3_shader_variant
 * input slot and create the per-component input instructions
 * (bary.f / ldlv / frag_coord / frag_face for fragment shaders,
 * plain inputs for vertex shaders).
 */
static void
setup_input(struct ir3_compile *ctx, nir_variable *in)
{
	struct ir3_shader_variant *so = ctx->so;
	unsigned array_len = MAX2(glsl_get_length(in->type), 1);
	unsigned ncomp = glsl_get_components(in->type);
	/* XXX: map loc slots to semantics */
	unsigned semantic_name = in->data.location;
	unsigned semantic_index = in->data.index;
	unsigned n = in->data.driver_location;

	DBG("; in: %u:%u, len=%ux%u, loc=%u\n",
			semantic_name, semantic_index, array_len,
			ncomp, n);

	so->inputs[n].semantic =
			ir3_semantic_name(semantic_name, semantic_index);
	so->inputs[n].compmask = (1 << ncomp) - 1;
	so->inputs[n].inloc = ctx->next_inloc;
	so->inputs[n].interpolate = 0;
	so->inputs_count = MAX2(so->inputs_count, n + 1);

	/* the fdN_program_emit() code expects tgsi consts here, so map
	 * things back to tgsi for now:
	 */
	switch (in->data.interpolation) {
	case INTERP_QUALIFIER_FLAT:
		so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT;
		break;
	case INTERP_QUALIFIER_NOPERSPECTIVE:
		so->inputs[n].interpolate = TGSI_INTERPOLATE_LINEAR;
		break;
	case INTERP_QUALIFIER_SMOOTH:
		so->inputs[n].interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
		break;
	}

	for (int i = 0; i < ncomp; i++) {
		struct ir3_instruction *instr = NULL;
		unsigned idx = (n * 4) + i;

		if (ctx->so->type == SHADER_FRAGMENT) {
			if (semantic_name == TGSI_SEMANTIC_POSITION) {
				/* gl_FragCoord: not a varying, comes from
				 * dedicated frag_coord handling:
				 */
				so->inputs[n].bary = false;
				so->frag_coord = true;
				instr = create_frag_coord(ctx, i);
			} else if (semantic_name == TGSI_SEMANTIC_FACE) {
				/* gl_FrontFacing: likewise not interpolated: */
				so->inputs[n].bary = false;
				so->frag_face = true;
				instr = create_frag_face(ctx, i);
			} else {
				bool use_ldlv = false;

				/* with NIR, we need to infer TGSI_INTERPOLATE_COLOR
				 * from the semantic name:
				 */
				if ((in->data.interpolation == INTERP_QUALIFIER_NONE) &&
						((semantic_name == TGSI_SEMANTIC_COLOR) ||
							(semantic_name == TGSI_SEMANTIC_BCOLOR)))
					so->inputs[n].interpolate = TGSI_INTERPOLATE_COLOR;

				if (ctx->flat_bypass) {
					/* when flat-bypass is enabled, flat/constant
					 * varyings are fetched with ldlv instead of
					 * being interpolated (COLOR only counts as
					 * flat when rasterflat is set in the key):
					 */
					switch (so->inputs[n].interpolate) {
					case TGSI_INTERPOLATE_COLOR:
						if (!ctx->so->key.rasterflat)
							break;
						/* fallthrough */
					case TGSI_INTERPOLATE_CONSTANT:
						use_ldlv = true;
						break;
					}
				}

				so->inputs[n].bary = true;

				instr = create_frag_input(ctx,
						so->inputs[n].inloc + i - 8, use_ldlv);
			}
		} else {
			instr = create_input(ctx->block, NULL, idx);
		}

		ctx->block->inputs[idx] = instr;
	}

	/* frag_coord/frag_face don't occupy a varying slot, so only
	 * advance inloc for interpolated inputs (or vertex shaders):
	 */
	if (so->inputs[n].bary || (ctx->so->type == SHADER_VERTEX)) {
		ctx->next_inloc += ncomp;
		so->total_in += ncomp;
	}
}
||
1719 | |||
/* Set up one shader output variable: record write flags on the
 * variant (writes_pos/writes_psize), fill in the output slot, and
 * pre-seed the block's output instructions with 0.0 immediates.
 */
static void
setup_output(struct ir3_compile *ctx, nir_variable *out)
{
	struct ir3_shader_variant *so = ctx->so;
	unsigned array_len = MAX2(glsl_get_length(out->type), 1);
	unsigned ncomp = glsl_get_components(out->type);
	/* XXX: map loc slots to semantics */
	unsigned semantic_name = out->data.location;
	unsigned semantic_index = out->data.index;
	unsigned n = out->data.driver_location;
	unsigned comp = 0;

	DBG("; out: %u:%u, len=%ux%u, loc=%u\n",
			semantic_name, semantic_index, array_len,
			ncomp, n);

	if (ctx->so->type == SHADER_VERTEX) {
		switch (semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			so->writes_pos = true;
			break;
		case TGSI_SEMANTIC_PSIZE:
			so->writes_psize = true;
			break;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
		case TGSI_SEMANTIC_GENERIC:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
			/* ordinary varyings, nothing extra to record: */
			break;
		default:
			compile_error(ctx, "unknown VS semantic name: %s\n",
					tgsi_semantic_names[semantic_name]);
		}
	} else {
		switch (semantic_name) {
		case TGSI_SEMANTIC_POSITION:
			comp = 2; /* tgsi will write to .z component */
			so->writes_pos = true;
			break;
		case TGSI_SEMANTIC_COLOR:
			break;
		default:
			compile_error(ctx, "unknown FS semantic name: %s\n",
					tgsi_semantic_names[semantic_name]);
		}
	}

	compile_assert(ctx, n < ARRAY_SIZE(so->outputs));

	so->outputs[n].semantic =
			ir3_semantic_name(semantic_name, semantic_index);
	so->outputs[n].regid = regid(n, comp);
	so->outputs_count = MAX2(so->outputs_count, n + 1);

	/* seed each output component with 0.0; store_output intrinsics
	 * overwrite these as the shader body is emitted:
	 */
	for (int i = 0; i < ncomp; i++) {
		unsigned idx = (n * 4) + i;

		ctx->block->outputs[idx] = create_immed(ctx->block, fui(0.0));
	}
}
||
1781 | |||
1782 | static void |
||
1783 | emit_instructions(struct ir3_compile *ctx) |
||
1784 | { |
||
1785 | unsigned ninputs = exec_list_length(&ctx->s->inputs) * 4; |
||
1786 | unsigned noutputs = exec_list_length(&ctx->s->outputs) * 4; |
||
1787 | |||
1788 | /* we need to allocate big enough outputs array so that |
||
1789 | * we can stuff the kill's at the end. Likewise for vtx |
||
1790 | * shaders, we need to leave room for sysvals: |
||
1791 | */ |
||
1792 | if (ctx->so->type == SHADER_FRAGMENT) { |
||
1793 | noutputs += ARRAY_SIZE(ctx->kill); |
||
1794 | } else if (ctx->so->type == SHADER_VERTEX) { |
||
1795 | ninputs += 8; |
||
1796 | } |
||
1797 | |||
1798 | ctx->block = ir3_block_create(ctx->ir, 0, ninputs, noutputs); |
||
1799 | |||
1800 | if (ctx->so->type == SHADER_FRAGMENT) { |
||
1801 | ctx->block->noutputs -= ARRAY_SIZE(ctx->kill); |
||
1802 | } else if (ctx->so->type == SHADER_VERTEX) { |
||
1803 | ctx->block->ninputs -= 8; |
||
1804 | } |
||
1805 | |||
1806 | /* for fragment shader, we have a single input register (usually |
||
1807 | * r0.xy) which is used as the base for bary.f varying fetch instrs: |
||
1808 | */ |
||
1809 | if (ctx->so->type == SHADER_FRAGMENT) { |
||
1810 | // TODO maybe a helper for fi since we need it a few places.. |
||
1811 | struct ir3_instruction *instr; |
||
1812 | instr = ir3_instr_create(ctx->block, -1, OPC_META_FI); |
||
1813 | ir3_reg_create(instr, 0, 0); |
||
1814 | ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.x */ |
||
1815 | ir3_reg_create(instr, 0, IR3_REG_SSA); /* r0.y */ |
||
1816 | ctx->frag_pos = instr; |
||
1817 | } |
||
1818 | |||
1819 | /* Setup inputs: */ |
||
1820 | foreach_list_typed(nir_variable, var, node, &ctx->s->inputs) { |
||
1821 | setup_input(ctx, var); |
||
1822 | } |
||
1823 | |||
1824 | /* Setup outputs: */ |
||
1825 | foreach_list_typed(nir_variable, var, node, &ctx->s->outputs) { |
||
1826 | setup_output(ctx, var); |
||
1827 | } |
||
1828 | |||
1829 | /* Setup variables (which should only be arrays): */ |
||
1830 | foreach_list_typed(nir_variable, var, node, &ctx->s->globals) { |
||
1831 | declare_var(ctx, var); |
||
1832 | } |
||
1833 | |||
1834 | /* Find the main function and emit the body: */ |
||
1835 | nir_foreach_overload(ctx->s, overload) { |
||
1836 | compile_assert(ctx, strcmp(overload->function->name, "main") == 0); |
||
1837 | compile_assert(ctx, overload->impl); |
||
1838 | emit_function(ctx, overload->impl); |
||
1839 | if (ctx->error) |
||
1840 | return; |
||
1841 | } |
||
1842 | } |
||
1843 | |||
1844 | /* from NIR perspective, we actually have inputs. But most of the "inputs" |
||
1845 | * for a fragment shader are just bary.f instructions. The *actual* inputs |
||
1846 | * from the hw perspective are the frag_pos and optionally frag_coord and |
||
1847 | * frag_face. |
||
1848 | */ |
||
1849 | static void |
||
1850 | fixup_frag_inputs(struct ir3_compile *ctx) |
||
1851 | { |
||
1852 | struct ir3_shader_variant *so = ctx->so; |
||
1853 | struct ir3_block *block = ctx->block; |
||
1854 | struct ir3_instruction **inputs; |
||
1855 | struct ir3_instruction *instr; |
||
1856 | int n, regid = 0; |
||
1857 | |||
1858 | block->ninputs = 0; |
||
1859 | |||
1860 | n = 4; /* always have frag_pos */ |
||
1861 | n += COND(so->frag_face, 4); |
||
1862 | n += COND(so->frag_coord, 4); |
||
1863 | |||
1864 | inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *))); |
||
1865 | |||
1866 | if (so->frag_face) { |
||
1867 | /* this ultimately gets assigned to hr0.x so doesn't conflict |
||
1868 | * with frag_coord/frag_pos.. |
||
1869 | */ |
||
1870 | inputs[block->ninputs++] = ctx->frag_face; |
||
1871 | ctx->frag_face->regs[0]->num = 0; |
||
1872 | |||
1873 | /* remaining channels not used, but let's avoid confusing |
||
1874 | * other parts that expect inputs to come in groups of vec4 |
||
1875 | */ |
||
1876 | inputs[block->ninputs++] = NULL; |
||
1877 | inputs[block->ninputs++] = NULL; |
||
1878 | inputs[block->ninputs++] = NULL; |
||
1879 | } |
||
1880 | |||
1881 | /* since we don't know where to set the regid for frag_coord, |
||
1882 | * we have to use r0.x for it. But we don't want to *always* |
||
1883 | * use r1.x for frag_pos as that could increase the register |
||
1884 | * footprint on simple shaders: |
||
1885 | */ |
||
1886 | if (so->frag_coord) { |
||
1887 | ctx->frag_coord[0]->regs[0]->num = regid++; |
||
1888 | ctx->frag_coord[1]->regs[0]->num = regid++; |
||
1889 | ctx->frag_coord[2]->regs[0]->num = regid++; |
||
1890 | ctx->frag_coord[3]->regs[0]->num = regid++; |
||
1891 | |||
1892 | inputs[block->ninputs++] = ctx->frag_coord[0]; |
||
1893 | inputs[block->ninputs++] = ctx->frag_coord[1]; |
||
1894 | inputs[block->ninputs++] = ctx->frag_coord[2]; |
||
1895 | inputs[block->ninputs++] = ctx->frag_coord[3]; |
||
1896 | } |
||
1897 | |||
1898 | /* we always have frag_pos: */ |
||
1899 | so->pos_regid = regid; |
||
1900 | |||
1901 | /* r0.x */ |
||
1902 | instr = create_input(block, NULL, block->ninputs); |
||
1903 | instr->regs[0]->num = regid++; |
||
1904 | inputs[block->ninputs++] = instr; |
||
1905 | ctx->frag_pos->regs[1]->instr = instr; |
||
1906 | |||
1907 | /* r0.y */ |
||
1908 | instr = create_input(block, NULL, block->ninputs); |
||
1909 | instr->regs[0]->num = regid++; |
||
1910 | inputs[block->ninputs++] = instr; |
||
1911 | ctx->frag_pos->regs[2]->instr = instr; |
||
1912 | |||
1913 | block->inputs = inputs; |
||
1914 | } |
||
1915 | |||
1916 | static void |
||
1917 | compile_dump(struct ir3_compile *ctx) |
||
1918 | { |
||
1919 | const char *name = (ctx->so->type == SHADER_VERTEX) ? "vert" : "frag"; |
||
1920 | static unsigned n = 0; |
||
1921 | char fname[16]; |
||
1922 | FILE *f; |
||
1923 | snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++); |
||
1924 | f = fopen(fname, "w"); |
||
1925 | if (!f) |
||
1926 | return; |
||
1927 | ir3_block_depth(ctx->block); |
||
1928 | ir3_dump(ctx->ir, name, ctx->block, f); |
||
1929 | fclose(f); |
||
1930 | } |
||
1931 | |||
1932 | int |
||
1933 | ir3_compile_shader_nir(struct ir3_shader_variant *so, |
||
1934 | const struct tgsi_token *tokens, struct ir3_shader_key key) |
||
1935 | { |
||
1936 | struct ir3_compile *ctx; |
||
1937 | struct ir3_block *block; |
||
1938 | struct ir3_instruction **inputs; |
||
1939 | unsigned i, j, actual_in; |
||
1940 | int ret = 0, max_bary; |
||
1941 | |||
1942 | assert(!so->ir); |
||
1943 | |||
1944 | so->ir = ir3_create(); |
||
1945 | |||
1946 | assert(so->ir); |
||
1947 | |||
1948 | ctx = compile_init(so, tokens); |
||
1949 | if (!ctx) { |
||
1950 | DBG("INIT failed!"); |
||
1951 | ret = -1; |
||
1952 | goto out; |
||
1953 | } |
||
1954 | |||
1955 | emit_instructions(ctx); |
||
1956 | |||
1957 | if (ctx->error) { |
||
1958 | DBG("EMIT failed!"); |
||
1959 | ret = -1; |
||
1960 | goto out; |
||
1961 | } |
||
1962 | |||
1963 | block = ctx->block; |
||
1964 | so->ir->block = block; |
||
1965 | |||
1966 | /* keep track of the inputs from TGSI perspective.. */ |
||
1967 | inputs = block->inputs; |
||
1968 | |||
1969 | /* but fixup actual inputs for frag shader: */ |
||
1970 | if (so->type == SHADER_FRAGMENT) |
||
1971 | fixup_frag_inputs(ctx); |
||
1972 | |||
1973 | /* at this point, for binning pass, throw away unneeded outputs: */ |
||
1974 | if (key.binning_pass) { |
||
1975 | for (i = 0, j = 0; i < so->outputs_count; i++) { |
||
1976 | unsigned name = sem2name(so->outputs[i].semantic); |
||
1977 | unsigned idx = sem2idx(so->outputs[i].semantic); |
||
1978 | |||
1979 | /* throw away everything but first position/psize */ |
||
1980 | if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) || |
||
1981 | (name == TGSI_SEMANTIC_PSIZE))) { |
||
1982 | if (i != j) { |
||
1983 | so->outputs[j] = so->outputs[i]; |
||
1984 | block->outputs[(j*4)+0] = block->outputs[(i*4)+0]; |
||
1985 | block->outputs[(j*4)+1] = block->outputs[(i*4)+1]; |
||
1986 | block->outputs[(j*4)+2] = block->outputs[(i*4)+2]; |
||
1987 | block->outputs[(j*4)+3] = block->outputs[(i*4)+3]; |
||
1988 | } |
||
1989 | j++; |
||
1990 | } |
||
1991 | } |
||
1992 | so->outputs_count = j; |
||
1993 | block->noutputs = j * 4; |
||
1994 | } |
||
1995 | |||
1996 | /* if we want half-precision outputs, mark the output registers |
||
1997 | * as half: |
||
1998 | */ |
||
1999 | if (key.half_precision) { |
||
2000 | for (i = 0; i < block->noutputs; i++) { |
||
2001 | if (!block->outputs[i]) |
||
2002 | continue; |
||
2003 | block->outputs[i]->regs[0]->flags |= IR3_REG_HALF; |
||
2004 | } |
||
2005 | } |
||
2006 | |||
2007 | /* at this point, we want the kill's in the outputs array too, |
||
2008 | * so that they get scheduled (since they have no dst).. we've |
||
2009 | * already ensured that the array is big enough in push_block(): |
||
2010 | */ |
||
2011 | if (so->type == SHADER_FRAGMENT) { |
||
2012 | for (i = 0; i < ctx->kill_count; i++) |
||
2013 | block->outputs[block->noutputs++] = ctx->kill[i]; |
||
2014 | } |
||
2015 | |||
2016 | if (fd_mesa_debug & FD_DBG_OPTDUMP) |
||
2017 | compile_dump(ctx); |
||
2018 | |||
2019 | if (fd_mesa_debug & FD_DBG_OPTMSGS) { |
||
2020 | printf("BEFORE CP:\n"); |
||
2021 | ir3_dump_instr_list(block->head); |
||
2022 | } |
||
2023 | |||
2024 | ir3_block_depth(block); |
||
2025 | |||
2026 | ir3_block_cp(block); |
||
2027 | |||
2028 | if (fd_mesa_debug & FD_DBG_OPTMSGS) { |
||
2029 | printf("BEFORE GROUPING:\n"); |
||
2030 | ir3_dump_instr_list(block->head); |
||
2031 | } |
||
2032 | |||
2033 | /* Group left/right neighbors, inserting mov's where needed to |
||
2034 | * solve conflicts: |
||
2035 | */ |
||
2036 | ir3_block_group(block); |
||
2037 | |||
2038 | if (fd_mesa_debug & FD_DBG_OPTDUMP) |
||
2039 | compile_dump(ctx); |
||
2040 | |||
2041 | ir3_block_depth(block); |
||
2042 | |||
2043 | if (fd_mesa_debug & FD_DBG_OPTMSGS) { |
||
2044 | printf("AFTER DEPTH:\n"); |
||
2045 | ir3_dump_instr_list(block->head); |
||
2046 | } |
||
2047 | |||
2048 | ret = ir3_block_sched(block); |
||
2049 | if (ret) { |
||
2050 | DBG("SCHED failed!"); |
||
2051 | goto out; |
||
2052 | } |
||
2053 | |||
2054 | if (fd_mesa_debug & FD_DBG_OPTMSGS) { |
||
2055 | printf("AFTER SCHED:\n"); |
||
2056 | ir3_dump_instr_list(block->head); |
||
2057 | } |
||
2058 | |||
2059 | ret = ir3_block_ra(block, so->type, so->frag_coord, so->frag_face); |
||
2060 | if (ret) { |
||
2061 | DBG("RA failed!"); |
||
2062 | goto out; |
||
2063 | } |
||
2064 | |||
2065 | if (fd_mesa_debug & FD_DBG_OPTMSGS) { |
||
2066 | printf("AFTER RA:\n"); |
||
2067 | ir3_dump_instr_list(block->head); |
||
2068 | } |
||
2069 | |||
2070 | ir3_block_legalize(block, &so->has_samp, &max_bary); |
||
2071 | |||
2072 | /* fixup input/outputs: */ |
||
2073 | for (i = 0; i < so->outputs_count; i++) { |
||
2074 | so->outputs[i].regid = block->outputs[i*4]->regs[0]->num; |
||
2075 | /* preserve hack for depth output.. tgsi writes depth to .z, |
||
2076 | * but what we give the hw is the scalar register: |
||
2077 | */ |
||
2078 | if ((so->type == SHADER_FRAGMENT) && |
||
2079 | (sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION)) |
||
2080 | so->outputs[i].regid += 2; |
||
2081 | } |
||
2082 | |||
2083 | /* Note that some or all channels of an input may be unused: */ |
||
2084 | actual_in = 0; |
||
2085 | for (i = 0; i < so->inputs_count; i++) { |
||
2086 | unsigned j, regid = ~0, compmask = 0; |
||
2087 | so->inputs[i].ncomp = 0; |
||
2088 | for (j = 0; j < 4; j++) { |
||
2089 | struct ir3_instruction *in = inputs[(i*4) + j]; |
||
2090 | if (in) { |
||
2091 | compmask |= (1 << j); |
||
2092 | regid = in->regs[0]->num - j; |
||
2093 | actual_in++; |
||
2094 | so->inputs[i].ncomp++; |
||
2095 | } |
||
2096 | } |
||
2097 | so->inputs[i].regid = regid; |
||
2098 | so->inputs[i].compmask = compmask; |
||
2099 | } |
||
2100 | |||
2101 | /* fragment shader always gets full vec4's even if it doesn't |
||
2102 | * fetch all components, but vertex shader we need to update |
||
2103 | * with the actual number of components fetch, otherwise thing |
||
2104 | * will hang due to mismaptch between VFD_DECODE's and |
||
2105 | * TOTALATTRTOVS |
||
2106 | */ |
||
2107 | if (so->type == SHADER_VERTEX) |
||
2108 | so->total_in = actual_in; |
||
2109 | else |
||
2110 | so->total_in = align(max_bary + 1, 4); |
||
2111 | |||
2112 | out: |
||
2113 | if (ret) { |
||
2114 | ir3_destroy(so->ir); |
||
2115 | so->ir = NULL; |
||
2116 | } |
||
2117 | compile_free(ctx); |
||
2118 | |||
2119 | return ret; |
||
2120 | }><>>>>>>>>>>><>>>>>>>>>>>>>>>>>>>>>>>>>>>>><>><>><>><>>>><>>>>><> |