/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
2
 
3
/*
4
 * Copyright (C) 2015 Rob Clark 
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a
7
 * copy of this software and associated documentation files (the "Software"),
8
 * to deal in the Software without restriction, including without limitation
9
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10
 * and/or sell copies of the Software, and to permit persons to whom the
11
 * Software is furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice (including the next
14
 * paragraph) shall be included in all copies or substantial portions of the
15
 * Software.
16
 *
17
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
 * SOFTWARE.
24
 *
25
 * Authors:
26
 *    Rob Clark 
27
 */
28
 
29
#include <stdarg.h>
30
 
31
#include "pipe/p_state.h"
32
#include "util/u_string.h"
33
#include "util/u_memory.h"
34
#include "util/u_inlines.h"
35
#include "tgsi/tgsi_lowering.h"
36
#include "tgsi/tgsi_strings.h"
37
 
38
#include "nir/tgsi_to_nir.h"
39
#include "glsl/shader_enums.h"
40
 
41
#include "freedreno_util.h"
42
 
43
#include "ir3_compiler.h"
44
#include "ir3_shader.h"
45
#include "ir3_nir.h"
46
 
47
#include "instr-a3xx.h"
48
#include "ir3.h"
49
 
50
 
51
static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
52
 
53
struct ir3_compile {
54
	const struct tgsi_token *tokens;
55
	struct nir_shader *s;
56
 
57
	struct ir3 *ir;
58
	struct ir3_shader_variant *so;
59
 
60
	/* bitmask of which samplers are integer: */
61
	uint16_t integer_s;
62
 
63
	struct ir3_block *block;
64
 
65
	/* For fragment shaders, from the hw perspective the only
66
	 * actual input is the r0.xy position register passed to bary.f.
	 * But TGSI doesn't know that; it still declares things as
	 * IN[] registers.  So we do all the input tracking normally
	 * and fix things up after compile_instructions()
	 *
	 * NOTE that frag_pos is the hardware position (possibly it
	 * is actually an index or tag or some such.. it is *not* a
	 * value that can be used directly for gl_FragCoord..)
74
	 */
75
	struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];
76
 
77
	/* For vertex shaders, keep track of the system values sources */
78
	struct ir3_instruction *vertex_id, *basevertex, *instance_id;
79
 
80
	/* mapping from nir_register to defining instruction: */
81
	struct hash_table *def_ht;
82
 
83
	/* mapping from nir_variable to ir3_array: */
84
	struct hash_table *var_ht;
85
	unsigned num_arrays;
86
 
87
	/* a common pattern for indirect addressing is to request the
88
	 * same address register multiple times.  To avoid generating
89
	 * duplicate instruction sequences (which our backend does not
90
	 * try to clean up, since that should be done at the NIR stage)
91
	 * we cache the address value generated for a given src value:
92
	 */
93
	struct hash_table *addr_ht;
94
 
95
	/* for calculating input/output positions/linkages: */
96
	unsigned next_inloc;
97
 
98
	/* a4xx (at least patchlevel 0) cannot seem to flat-interpolate
99
	 * so we need to use ldlv.u32 to load the varying directly:
100
	 */
101
	bool flat_bypass;
102
 
103
	/* on a3xx, we need to add one to # of array levels:
104
	 */
105
	bool levels_add_one;
106
 
107
	/* for looking up which system value is which */
108
	unsigned sysval_semantics[8];
109
 
110
	/* list of kill instructions: */
111
	struct ir3_instruction *kill[16];
112
	unsigned int kill_count;
113
 
114
	/* set if we encounter something we can't handle yet, so we
115
	 * can bail cleanly and fallback to TGSI compiler f/e
116
	 */
117
	bool error;
118
};
119
 
120
 
121
static struct nir_shader *to_nir(const struct tgsi_token *tokens)
122
{
123
	struct nir_shader_compiler_options options = {
124
			.lower_fpow = true,
125
			.lower_fsat = true,
126
			.lower_scmp = true,
127
			.lower_flrp = true,
128
			.native_integers = true,
129
	};
130
	bool progress;
131
 
132
	struct nir_shader *s = tgsi_to_nir(tokens, &options);
133
 
134
	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
135
		debug_printf("----------------------\n");
136
		nir_print_shader(s, stdout);
137
		debug_printf("----------------------\n");
138
	}
139
 
140
	nir_opt_global_to_local(s);
141
	nir_convert_to_ssa(s);
142
	nir_lower_idiv(s);
143
 
144
	do {
145
		progress = false;
146
 
147
		nir_lower_vars_to_ssa(s);
148
		nir_lower_alu_to_scalar(s);
149
 
150
		progress |= nir_copy_prop(s);
151
		progress |= nir_opt_dce(s);
152
		progress |= nir_opt_cse(s);
153
		progress |= ir3_nir_lower_if_else(s);
154
		progress |= nir_opt_algebraic(s);
155
		progress |= nir_opt_constant_folding(s);
156
 
157
	} while (progress);
158
 
159
	nir_remove_dead_variables(s);
160
	nir_validate_shader(s);
161
 
162
	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
163
		debug_printf("----------------------\n");
164
		nir_print_shader(s, stdout);
165
		debug_printf("----------------------\n");
166
	}
167
 
168
	return s;
169
}
170
 
171
/* TODO nir doesn't lower everything for us yet, but ideally it would: */
172
static const struct tgsi_token *
173
lower_tgsi(const struct tgsi_token *tokens, struct ir3_shader_variant *so)
174
{
175
	struct tgsi_shader_info info;
176
	struct tgsi_lowering_config lconfig = {
177
			.color_two_side = so->key.color_two_side,
178
			.lower_FRC = true,
179
	};
180
 
181
	switch (so->type) {
182
	case SHADER_FRAGMENT:
183
	case SHADER_COMPUTE:
184
		lconfig.saturate_s = so->key.fsaturate_s;
185
		lconfig.saturate_t = so->key.fsaturate_t;
186
		lconfig.saturate_r = so->key.fsaturate_r;
187
		break;
188
	case SHADER_VERTEX:
189
		lconfig.saturate_s = so->key.vsaturate_s;
190
		lconfig.saturate_t = so->key.vsaturate_t;
191
		lconfig.saturate_r = so->key.vsaturate_r;
192
		break;
193
	}
194
 
195
	if (!so->shader) {
196
		/* hack for standalone compiler which does not have
197
		 * screen/context:
198
		 */
199
	} else if (ir3_shader_gpuid(so->shader) >= 400) {
200
		/* a4xx seems to have *no* sam.p */
201
		lconfig.lower_TXP = ~0;  /* lower all txp */
202
	} else {
203
		/* a3xx just needs to avoid sam.p for 3d tex */
204
		lconfig.lower_TXP = (1 << TGSI_TEXTURE_3D);
205
	}
206
 
207
	return tgsi_transform_lowering(&lconfig, tokens, &info);
208
}
209
 
210
static struct ir3_compile *
211
compile_init(struct ir3_shader_variant *so,
212
		const struct tgsi_token *tokens)
213
{
214
	struct ir3_compile *ctx = rzalloc(NULL, struct ir3_compile);
215
	const struct tgsi_token *lowered_tokens;
216
 
217
	if (!so->shader) {
218
		/* hack for standalone compiler which does not have
219
		 * screen/context:
220
		 */
221
	} else if (ir3_shader_gpuid(so->shader) >= 400) {
222
		/* need special handling for "flat" */
223
		ctx->flat_bypass = true;
224
		ctx->levels_add_one = false;
225
	} else {
226
		/* no special handling for "flat" */
227
		ctx->flat_bypass = false;
228
		ctx->levels_add_one = true;
229
	}
230
 
231
	switch (so->type) {
232
	case SHADER_FRAGMENT:
233
	case SHADER_COMPUTE:
234
		ctx->integer_s = so->key.finteger_s;
235
		break;
236
	case SHADER_VERTEX:
237
		ctx->integer_s = so->key.vinteger_s;
238
		break;
239
	}
240
 
241
	ctx->ir = so->ir;
242
	ctx->so = so;
243
	ctx->next_inloc = 8;
244
	ctx->def_ht = _mesa_hash_table_create(ctx,
245
			_mesa_hash_pointer, _mesa_key_pointer_equal);
246
	ctx->var_ht = _mesa_hash_table_create(ctx,
247
			_mesa_hash_pointer, _mesa_key_pointer_equal);
248
	ctx->addr_ht = _mesa_hash_table_create(ctx,
249
			_mesa_hash_pointer, _mesa_key_pointer_equal);
250
 
251
	lowered_tokens = lower_tgsi(tokens, so);
252
	if (!lowered_tokens)
253
		lowered_tokens = tokens;
254
	ctx->s = to_nir(lowered_tokens);
255
 
256
	if (lowered_tokens != tokens)
257
		free((void *)lowered_tokens);
258
 
259
	so->first_driver_param = so->first_immediate = ctx->s->num_uniforms;
260
 
261
	/* one (vec4) slot for vertex id base: */
262
	if (so->type == SHADER_VERTEX)
263
		so->first_immediate++;
264
 
265
	/* reserve 4 (vec4) slots for ubo base addresses: */
266
	so->first_immediate += 4;
267
 
268
	return ctx;
269
}
270
 
271
static void
272
compile_error(struct ir3_compile *ctx, const char *format, ...)
273
{
274
	va_list ap;
275
	va_start(ap, format);
276
	_debug_vprintf(format, ap);
277
	va_end(ap);
278
	nir_print_shader(ctx->s, stdout);
279
	ctx->error = true;
280
	debug_assert(0);
281
}
282
 
283
#define compile_assert(ctx, cond) do { \
284
		if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
285
	} while (0)
286
 
287
static void
288
compile_free(struct ir3_compile *ctx)
289
{
290
	ralloc_free(ctx);
291
}
292
 
293
 
294
struct ir3_array {
295
	unsigned length, aid;
296
	struct ir3_instruction *arr[];
297
};
298
 
299
static void
300
declare_var(struct ir3_compile *ctx, nir_variable *var)
301
{
302
	unsigned length = glsl_get_length(var->type) * 4;  /* always vec4, at least with ttn */
303
	struct ir3_array *arr = ralloc_size(ctx, sizeof(*arr) +
304
			(length * sizeof(arr->arr[0])));
305
	arr->length = length;
306
	arr->aid = ++ctx->num_arrays;
307
	/* Some shaders end up reading array elements without first writing..
308
	 * so initialize things to prevent null instr ptrs later:
309
	 */
310
	for (unsigned i = 0; i < length; i++)
311
		arr->arr[i] = create_immed(ctx->block, 0);
312
	_mesa_hash_table_insert(ctx->var_ht, var, arr);
313
}
314
 
315
static struct ir3_array *
316
get_var(struct ir3_compile *ctx, nir_variable *var)
317
{
318
	struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var);
319
	return entry->data;
320
}
321
 
322
/* allocate a n element value array (to be populated by caller) and
323
 * insert in def_ht
324
 */
325
static struct ir3_instruction **
326
__get_dst(struct ir3_compile *ctx, void *key, unsigned n)
327
{
328
	struct ir3_instruction **value =
329
		ralloc_array(ctx->def_ht, struct ir3_instruction *, n);
330
	_mesa_hash_table_insert(ctx->def_ht, key, value);
331
	return value;
332
}
333
 
334
static struct ir3_instruction **
335
get_dst(struct ir3_compile *ctx, nir_dest *dst, unsigned n)
336
{
337
	if (dst->is_ssa) {
338
		return __get_dst(ctx, &dst->ssa, n);
339
	} else {
340
		return __get_dst(ctx, dst->reg.reg, n);
341
	}
342
}
343
 
344
static struct ir3_instruction **
345
get_dst_ssa(struct ir3_compile *ctx, nir_ssa_def *dst, unsigned n)
346
{
347
	return __get_dst(ctx, dst, n);
348
}
349
 
350
static struct ir3_instruction **
351
get_src(struct ir3_compile *ctx, nir_src *src)
352
{
353
	struct hash_entry *entry;
354
	if (src->is_ssa) {
355
		entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
356
	} else {
357
		entry = _mesa_hash_table_search(ctx->def_ht, src->reg.reg);
358
	}
359
	compile_assert(ctx, entry);
360
	return entry->data;
361
}
362
 
363
static struct ir3_instruction *
364
create_immed(struct ir3_block *block, uint32_t val)
365
{
366
	struct ir3_instruction *mov;
367
 
368
	mov = ir3_instr_create(block, 1, 0);
369
	mov->cat1.src_type = TYPE_U32;
370
	mov->cat1.dst_type = TYPE_U32;
371
	ir3_reg_create(mov, 0, 0);
372
	ir3_reg_create(mov, 0, IR3_REG_IMMED)->uim_val = val;
373
 
374
	return mov;
375
}
376
 
377
static struct ir3_instruction *
378
create_addr(struct ir3_block *block, struct ir3_instruction *src)
379
{
380
	struct ir3_instruction *instr, *immed;
381
 
382
	/* TODO in at least some cases, the backend could probably be
383
	 * made clever enough to propagate IR3_REG_HALF..
384
	 */
385
	instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
386
	instr->regs[0]->flags |= IR3_REG_HALF;
387
 
388
	immed = create_immed(block, 2);
389
	immed->regs[0]->flags |= IR3_REG_HALF;
390
 
391
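	/* shl by 2 == multiply by 4: presumably because the incoming value
	 * indexes vec4 slots, while relative addressing works in scalar
	 * components (declare_var() sizes arrays as length*4):
	 */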
	instr = ir3_SHL_B(block, instr, 0, immed, 0);
392
	instr->regs[0]->flags |= IR3_REG_HALF;
393
	instr->regs[1]->flags |= IR3_REG_HALF;
394
 
395
	instr = ir3_MOV(block, instr, TYPE_S16);
396
	instr->regs[0]->flags |= IR3_REG_ADDR | IR3_REG_HALF;
397
	instr->regs[1]->flags |= IR3_REG_HALF;
398
 
399
	return instr;
400
}
401
 
402
/* caches addr values to avoid generating multiple cov/shl/mova
403
 * sequences for each use of a given NIR level src as address
404
 */
405
static struct ir3_instruction *
406
get_addr(struct ir3_compile *ctx, struct ir3_instruction *src)
407
{
408
	struct ir3_instruction *addr;
409
	struct hash_entry *entry;
410
	entry = _mesa_hash_table_search(ctx->addr_ht, src);
411
	if (entry)
412
		return entry->data;
413
 
414
	/* TODO do we need to cache per block? */
415
	addr = create_addr(ctx->block, src);
416
	_mesa_hash_table_insert(ctx->addr_ht, src, addr);
417
 
418
	return addr;
419
}
420
 
421
static struct ir3_instruction *
422
create_uniform(struct ir3_compile *ctx, unsigned n)
423
{
424
	struct ir3_instruction *mov;
425
 
426
	mov = ir3_instr_create(ctx->block, 1, 0);
427
	/* TODO get types right? */
428
	mov->cat1.src_type = TYPE_F32;
429
	mov->cat1.dst_type = TYPE_F32;
430
	ir3_reg_create(mov, 0, 0);
431
	ir3_reg_create(mov, n, IR3_REG_CONST);
432
 
433
	return mov;
434
}
435
 
436
static struct ir3_instruction *
437
create_uniform_indirect(struct ir3_compile *ctx, unsigned n,
438
		struct ir3_instruction *address)
439
{
440
	struct ir3_instruction *mov;
441
 
442
	mov = ir3_instr_create(ctx->block, 1, 0);
443
	mov->cat1.src_type = TYPE_U32;
444
	mov->cat1.dst_type = TYPE_U32;
445
	ir3_reg_create(mov, 0, 0);
446
	ir3_reg_create(mov, n, IR3_REG_CONST | IR3_REG_RELATIV);
447
	mov->address = address;
448
 
449
	array_insert(ctx->ir->indirects, mov);
450
 
451
	return mov;
452
}
453
 
454
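/* gather a group of scalar values into one (fake) vector value, using the
 * OPC_META_FI (fan-in/collect) meta instruction:
 */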
static struct ir3_instruction *
455
create_collect(struct ir3_block *block, struct ir3_instruction **arr,
456
		unsigned arrsz)
457
{
458
	struct ir3_instruction *collect;
459
 
460
	if (arrsz == 0)
461
		return NULL;
462
 
463
	collect = ir3_instr_create2(block, -1, OPC_META_FI, 1 + arrsz);
464
	ir3_reg_create(collect, 0, 0);
465
	for (unsigned i = 0; i < arrsz; i++)
466
		ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = arr[i];
467
 
468
	return collect;
469
}
470
 
471
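/* relative (address-register) read of a single component out of a collected
 * array value; the mov src gets IR3_REG_RELATIV plus the base offset 'n':
 */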
static struct ir3_instruction *
472
create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
473
		struct ir3_instruction *address, struct ir3_instruction *collect)
474
{
475
	struct ir3_block *block = ctx->block;
476
	struct ir3_instruction *mov;
477
	struct ir3_register *src;
478
 
479
	mov = ir3_instr_create(block, 1, 0);
480
	mov->cat1.src_type = TYPE_U32;
481
	mov->cat1.dst_type = TYPE_U32;
482
	ir3_reg_create(mov, 0, 0);
483
	src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV);
484
	src->instr = collect;
485
	src->size  = arrsz;
486
	src->offset = n;
487
	mov->address = address;
488
 
489
	array_insert(ctx->ir->indirects, mov);
490
 
491
	return mov;
492
}
493
 
494
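/* relative write of a single component into a collected array value; the
 * resulting mov (with the collect hung off ->fanin) stands for the whole
 * array contents after the store:
 */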
static struct ir3_instruction *
495
create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
496
		struct ir3_instruction *src, struct ir3_instruction *address,
497
		struct ir3_instruction *collect)
498
{
499
	struct ir3_block *block = ctx->block;
500
	struct ir3_instruction *mov;
501
	struct ir3_register *dst;
502
 
503
	mov = ir3_instr_create(block, 1, 0);
504
	mov->cat1.src_type = TYPE_U32;
505
	mov->cat1.dst_type = TYPE_U32;
506
	dst = ir3_reg_create(mov, 0, IR3_REG_RELATIV);
507
	dst->size  = arrsz;
508
	dst->offset = n;
509
	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
510
	mov->address = address;
511
	mov->fanin = collect;
512
 
513
	array_insert(ctx->ir->indirects, mov);
514
 
515
	return mov;
516
}
517
 
518
static struct ir3_instruction *
519
create_input(struct ir3_block *block, struct ir3_instruction *instr,
520
		unsigned n)
521
{
522
	struct ir3_instruction *in;
523
 
524
	in = ir3_instr_create(block, -1, OPC_META_INPUT);
525
	in->inout.block = block;
526
	ir3_reg_create(in, n, 0);
527
	if (instr)
528
		ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr;
529
 
530
	return in;
531
}
532
 
533
static struct ir3_instruction *
534
create_frag_input(struct ir3_compile *ctx, unsigned n, bool use_ldlv)
535
{
536
	struct ir3_block *block = ctx->block;
537
	struct ir3_instruction *instr;
538
	struct ir3_instruction *inloc = create_immed(block, n);
539
 
540
	if (use_ldlv) {
541
		instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
542
		instr->cat6.type = TYPE_U32;
543
		instr->cat6.iim_val = 1;
544
	} else {
545
		instr = ir3_BARY_F(block, inloc, 0, ctx->frag_pos, 0);
546
		instr->regs[2]->wrmask = 0x3;
547
	}
548
 
549
	return instr;
550
}
551
 
552
static struct ir3_instruction *
553
create_frag_coord(struct ir3_compile *ctx, unsigned comp)
554
{
555
	struct ir3_block *block = ctx->block;
556
	struct ir3_instruction *instr;
557
 
558
	compile_assert(ctx, !ctx->frag_coord[comp]);
559
 
560
	ctx->frag_coord[comp] = create_input(ctx->block, NULL, 0);
561
 
562
	switch (comp) {
563
	case 0: /* .x */
564
	case 1: /* .y */
565
		/* for frag_coord, we get unsigned values.. we need
566
		 * to subtract (integer) 8 and divide by 16 (right-
567
		 * shift by 4) then convert to float:
568
		 *
569
		 *    sub.s tmp, src, 8
570
		 *    shr.b tmp, tmp, 4
571
		 *    mov.u32f32 dst, tmp
572
		 *
573
		 */
574
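		/* (i.e. the raw value looks like a fixed-point coordinate in
		 * 1/16-pixel units with a half-pixel (8/16) bias -- an inference
		 * from the sub/shr sequence below, not confirmed hw behaviour)
		 */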
		instr = ir3_SUB_S(block, ctx->frag_coord[comp], 0,
575
				create_immed(block, 8), 0);
576
		instr = ir3_SHR_B(block, instr, 0,
577
				create_immed(block, 4), 0);
578
		instr = ir3_COV(block, instr, TYPE_U32, TYPE_F32);
579
 
580
		return instr;
581
	case 2: /* .z */
582
	case 3: /* .w */
583
	default:
584
		/* seems that we can use these as-is: */
585
		return ctx->frag_coord[comp];
586
	}
587
}
588
 
589
static struct ir3_instruction *
590
create_frag_face(struct ir3_compile *ctx, unsigned comp)
591
{
592
	struct ir3_block *block = ctx->block;
593
	struct ir3_instruction *instr;
594
 
595
	switch (comp) {
596
	case 0: /* .x */
597
		compile_assert(ctx, !ctx->frag_face);
598
 
599
		ctx->frag_face = create_input(block, NULL, 0);
600
 
601
		/* for faceness, we always get -1 or 0 (int).. but TGSI expects
602
		 * positive vs negative float.. and piglit further seems to
603
		 * expect -1.0 or 1.0:
604
		 *
605
		 *    mul.s tmp, hr0.x, 2
606
		 *    add.s tmp, tmp, 1
607
		 *    mov.s32f32, dst, tmp
608
		 *
609
		 */
610
		instr = ir3_MUL_S(block, ctx->frag_face, 0,
611
				create_immed(block, 2), 0);
612
		instr = ir3_ADD_S(block, instr, 0,
613
				create_immed(block, 1), 0);
614
		instr = ir3_COV(block, instr, TYPE_S32, TYPE_F32);
615
 
616
		return instr;
617
	case 1: /* .y */
618
	case 2: /* .z */
619
		return create_immed(block, fui(0.0));
620
	default:
621
	case 3: /* .w */
622
		return create_immed(block, fui(1.0));
623
	}
624
}
625
 
626
/* helper for instructions that produce multiple consecutive scalar
627
 * outputs which need to have a split/fanout meta instruction inserted
628
 */
629
static void
630
split_dest(struct ir3_block *block, struct ir3_instruction **dst,
631
		struct ir3_instruction *src)
632
{
633
	struct ir3_instruction *prev = NULL;
634
	for (int i = 0, j = 0; i < 4; i++) {
635
		struct ir3_instruction *split =
636
				ir3_instr_create(block, -1, OPC_META_FO);
637
		ir3_reg_create(split, 0, IR3_REG_SSA);
638
		ir3_reg_create(split, 0, IR3_REG_SSA)->instr = src;
639
		split->fo.off = i;
640
 
641
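		/* link neighbouring fanout instrs via the cp left/right pointers,
		 * presumably so later grouping/copy-propagation passes can treat
		 * the split components as one vector value:
		 */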
		if (prev) {
642
			split->cp.left = prev;
643
			split->cp.left_cnt++;
644
			prev->cp.right = split;
645
			prev->cp.right_cnt++;
646
		}
647
		prev = split;
648
 
649
		if (src->regs[0]->wrmask & (1 << i))
650
			dst[j++] = split;
651
	}
652
}
653
 
654
/*
655
 * Adreno uses uint rather than having a dedicated bool type,
 * which (potentially) requires some conversion, in particular
 * when using the output of a bool instr as an int input, or
 * vice versa.
659
 *
660
 *         | Adreno  |  NIR  |
661
 *  -------+---------+-------+-
662
 *   true  |    1    |  ~0   |
663
 *   false |    0    |   0   |
664
 *
665
 * To convert from an adreno bool (uint) to nir, use:
666
 *
667
 *    absneg.s dst, (neg)src
668
 *
669
 * To convert back in the other direction:
670
 *
671
 *    absneg.s dst, (abs)src
672
 *
673
 * The CP step can clean up the absneg.s that cancel each other
674
 * out, and with a slight bit of extra cleverness (to recognize
675
 * the instructions which produce either a 0 or 1) can eliminate
676
 * the absneg.s's completely when an instruction that wants
677
 * 0/1 consumes the result.  For example, when a nir 'bcsel'
678
 * consumes the result of 'feq'.  So we should be able to get by
679
 * without a boolean resolve step, and without incurring any
680
 * extra penalty in instruction count.
681
 */
682
 
683
/* NIR bool -> native (adreno): */
684
static struct ir3_instruction *
685
ir3_b2n(struct ir3_block *block, struct ir3_instruction *instr)
686
{
687
	return ir3_ABSNEG_S(block, instr, IR3_REG_SABS);
688
}
689
 
690
/* native (adreno) -> NIR bool: */
691
static struct ir3_instruction *
692
ir3_n2b(struct ir3_block *block, struct ir3_instruction *instr)
693
{
694
	return ir3_ABSNEG_S(block, instr, IR3_REG_SNEG);
695
}
696
 
697
/*
698
 * alu/sfu instructions:
699
 */
700
 
701
static void
702
emit_alu(struct ir3_compile *ctx, nir_alu_instr *alu)
703
{
704
	const nir_op_info *info = &nir_op_infos[alu->op];
705
	struct ir3_instruction **dst, *src[info->num_inputs];
706
	struct ir3_block *b = ctx->block;
707
 
708
	dst = get_dst(ctx, &alu->dest.dest, MAX2(info->output_size, 1));
709
 
710
	/* Vectors are special in that they have non-scalarized writemasks,
711
	 * and just place the first swizzle channel of each argument, in
	 * order, into the corresponding writemask channel.
713
	 */
714
	if ((alu->op == nir_op_vec2) ||
715
			(alu->op == nir_op_vec3) ||
716
			(alu->op == nir_op_vec4)) {
717
 
718
		for (int i = 0; i < info->num_inputs; i++) {
719
			nir_alu_src *asrc = &alu->src[i];
720
 
721
			compile_assert(ctx, !asrc->abs);
722
			compile_assert(ctx, !asrc->negate);
723
 
724
			src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[0]];
725
			if (!src[i])
726
				src[i] = create_immed(ctx->block, 0);
727
			dst[i] = ir3_MOV(b, src[i], TYPE_U32);
728
		}
729
 
730
		return;
731
	}
732
 
733
	/* General case: We can just grab the one used channel per src. */
734
	for (int i = 0; i < info->num_inputs; i++) {
735
		unsigned chan = ffs(alu->dest.write_mask) - 1;
736
		nir_alu_src *asrc = &alu->src[i];
737
 
738
		compile_assert(ctx, !asrc->abs);
739
		compile_assert(ctx, !asrc->negate);
740
 
741
		src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
742
 
743
		compile_assert(ctx, src[i]);
744
	}
745
 
746
	switch (alu->op) {
747
	case nir_op_f2i:
748
		dst[0] = ir3_COV(b, src[0], TYPE_F32, TYPE_S32);
749
		break;
750
	case nir_op_f2u:
751
		dst[0] = ir3_COV(b, src[0], TYPE_F32, TYPE_U32);
752
		break;
753
	case nir_op_i2f:
754
		dst[0] = ir3_COV(b, src[0], TYPE_S32, TYPE_F32);
755
		break;
756
	case nir_op_u2f:
757
		dst[0] = ir3_COV(b, src[0], TYPE_U32, TYPE_F32);
758
		break;
759
	case nir_op_imov:
760
		dst[0] = ir3_MOV(b, src[0], TYPE_S32);
761
		break;
762
	case nir_op_fmov:
763
		dst[0] = ir3_MOV(b, src[0], TYPE_F32);
764
		break;
765
	case nir_op_f2b:
766
		dst[0] = ir3_CMPS_F(b, src[0], 0, create_immed(b, fui(0.0)), 0);
767
		dst[0]->cat2.condition = IR3_COND_NE;
768
		dst[0] = ir3_n2b(b, dst[0]);
769
		break;
770
	case nir_op_b2f:
771
		dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F32);
772
		break;
773
	case nir_op_b2i:
774
		dst[0] = ir3_b2n(b, src[0]);
775
		break;
776
	case nir_op_i2b:
777
		dst[0] = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
778
		dst[0]->cat2.condition = IR3_COND_NE;
779
		dst[0] = ir3_n2b(b, dst[0]);
780
		break;
781
 
782
	case nir_op_fneg:
783
		dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
784
		break;
785
	case nir_op_fabs:
786
		dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
787
		break;
788
	case nir_op_fmax:
789
		dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
790
		break;
791
	case nir_op_fmin:
792
		dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
793
		break;
794
	case nir_op_fmul:
795
		dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
796
		break;
797
	case nir_op_fadd:
798
		dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
799
		break;
800
	case nir_op_fsub:
801
		dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
802
		break;
803
	case nir_op_ffma:
804
		dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
805
		break;
806
	case nir_op_fddx:
807
		dst[0] = ir3_DSX(b, src[0], 0);
808
		dst[0]->cat5.type = TYPE_F32;
809
		break;
810
	case nir_op_fddy:
811
		dst[0] = ir3_DSY(b, src[0], 0);
812
		dst[0]->cat5.type = TYPE_F32;
813
		break;
815
	case nir_op_flt:
816
		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
817
		dst[0]->cat2.condition = IR3_COND_LT;
818
		dst[0] = ir3_n2b(b, dst[0]);
819
		break;
820
	case nir_op_fge:
821
		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
822
		dst[0]->cat2.condition = IR3_COND_GE;
823
		dst[0] = ir3_n2b(b, dst[0]);
824
		break;
825
	case nir_op_feq:
826
		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
827
		dst[0]->cat2.condition = IR3_COND_EQ;
828
		dst[0] = ir3_n2b(b, dst[0]);
829
		break;
830
	case nir_op_fne:
831
		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
832
		dst[0]->cat2.condition = IR3_COND_NE;
833
		dst[0] = ir3_n2b(b, dst[0]);
834
		break;
835
	case nir_op_fceil:
836
		dst[0] = ir3_CEIL_F(b, src[0], 0);
837
		break;
838
	case nir_op_ffloor:
839
		dst[0] = ir3_FLOOR_F(b, src[0], 0);
840
		break;
841
	case nir_op_ftrunc:
842
		dst[0] = ir3_TRUNC_F(b, src[0], 0);
843
		break;
844
	case nir_op_fround_even:
845
		dst[0] = ir3_RNDNE_F(b, src[0], 0);
846
		break;
847
	case nir_op_fsign:
848
		dst[0] = ir3_SIGN_F(b, src[0], 0);
849
		break;
850
 
851
	case nir_op_fsin:
852
		dst[0] = ir3_SIN(b, src[0], 0);
853
		break;
854
	case nir_op_fcos:
855
		dst[0] = ir3_COS(b, src[0], 0);
856
		break;
857
	case nir_op_frsq:
858
		dst[0] = ir3_RSQ(b, src[0], 0);
859
		break;
860
	case nir_op_frcp:
861
		dst[0] = ir3_RCP(b, src[0], 0);
862
		break;
863
	case nir_op_flog2:
864
		dst[0] = ir3_LOG2(b, src[0], 0);
865
		break;
866
	case nir_op_fexp2:
867
		dst[0] = ir3_EXP2(b, src[0], 0);
868
		break;
869
	case nir_op_fsqrt:
870
		dst[0] = ir3_SQRT(b, src[0], 0);
871
		break;
872
 
873
	case nir_op_iabs:
874
		dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
875
		break;
876
	case nir_op_iadd:
877
		dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
878
		break;
879
	case nir_op_iand:
880
		dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
881
		break;
882
	case nir_op_imax:
883
		dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
884
		break;
885
	case nir_op_imin:
886
		dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
887
		break;
888
	case nir_op_imul:
889
		/*
890
		 * dst = (al * bl) + (ah * bl << 16) + (al * bh << 16)
891
		 *   mull.u tmp0, a, b           ; mul low, i.e. al * bl
892
		 *   madsh.m16 tmp1, a, b, tmp0  ; mul-add shift high mix, i.e. ah * bl << 16
893
		 *   madsh.m16 dst, b, a, tmp1   ; i.e. al * bh << 16
894
		 */
895
		dst[0] = ir3_MADSH_M16(b, src[1], 0, src[0], 0,
896
					ir3_MADSH_M16(b, src[0], 0, src[1], 0,
897
						ir3_MULL_U(b, src[0], 0, src[1], 0), 0), 0);
898
		break;
899
	case nir_op_ineg:
900
		dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
901
		break;
902
	case nir_op_inot:
903
		dst[0] = ir3_NOT_B(b, src[0], 0);
904
		break;
905
	case nir_op_ior:
906
		dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
907
		break;
908
	case nir_op_ishl:
909
		dst[0] = ir3_SHL_B(b, src[0], 0, src[1], 0);
910
		break;
911
	case nir_op_ishr:
912
		dst[0] = ir3_ASHR_B(b, src[0], 0, src[1], 0);
913
		break;
914
	case nir_op_isign: {
915
		/* maybe this would be sane to lower in nir.. */
916
		struct ir3_instruction *neg, *pos;
917
 
918
		neg = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
919
		neg->cat2.condition = IR3_COND_LT;
920
 
921
		pos = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
922
		pos->cat2.condition = IR3_COND_GT;
923
 
924
		dst[0] = ir3_SUB_U(b, pos, 0, neg, 0);
925
 
926
		break;
927
	}
928
	case nir_op_isub:
929
		dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
930
		break;
931
	case nir_op_ixor:
932
		dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
933
		break;
934
	case nir_op_ushr:
935
		dst[0] = ir3_SHR_B(b, src[0], 0, src[1], 0);
936
		break;
937
	case nir_op_ilt:
938
		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
939
		dst[0]->cat2.condition = IR3_COND_LT;
940
		dst[0] = ir3_n2b(b, dst[0]);
941
		break;
942
	case nir_op_ige:
943
		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
944
		dst[0]->cat2.condition = IR3_COND_GE;
945
		dst[0] = ir3_n2b(b, dst[0]);
946
		break;
947
	case nir_op_ieq:
948
		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
949
		dst[0]->cat2.condition = IR3_COND_EQ;
950
		dst[0] = ir3_n2b(b, dst[0]);
951
		break;
952
	case nir_op_ine:
953
		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
954
		dst[0]->cat2.condition = IR3_COND_NE;
955
		dst[0] = ir3_n2b(b, dst[0]);
956
		break;
957
	case nir_op_ult:
958
		dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
959
		dst[0]->cat2.condition = IR3_COND_LT;
960
		dst[0] = ir3_n2b(b, dst[0]);
961
		break;
962
	case nir_op_uge:
963
		dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
964
		dst[0]->cat2.condition = IR3_COND_GE;
965
		dst[0] = ir3_n2b(b, dst[0]);
966
		break;
967
 
968
	case nir_op_bcsel:
969
		dst[0] = ir3_SEL_B32(b, src[1], 0, ir3_b2n(b, src[0]), 0, src[2], 0);
970
		break;
971
 
972
	default:
973
		compile_error(ctx, "Unhandled ALU op: %s\n",
974
				nir_op_infos[alu->op].name);
975
		break;
976
	}
977
}
978
 
979
/* handles direct/indirect UBO reads: */
980
static void
981
emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
982
		struct ir3_instruction **dst)
983
{
984
	struct ir3_block *b = ctx->block;
985
	struct ir3_instruction *addr, *src0, *src1;
986
	/* UBO addresses are the first driver params: */
987
	unsigned ubo = regid(ctx->so->first_driver_param, 0);
988
	unsigned off = intr->const_index[0];
989
 
990
	/* First src is ubo index, which could either be an immed or not: */
991
	src0 = get_src(ctx, &intr->src[0])[0];
992
	if (is_same_type_mov(src0) &&
993
			(src0->regs[1]->flags & IR3_REG_IMMED)) {
994
		addr = create_uniform(ctx, ubo + src0->regs[1]->iim_val);
995
	} else {
996
		addr = create_uniform_indirect(ctx, ubo, get_addr(ctx, src0));
997
	}
998
 
999
	if (intr->intrinsic == nir_intrinsic_load_ubo_indirect) {
1000
		/* For load_ubo_indirect, second src is indirect offset: */
1001
		src1 = get_src(ctx, &intr->src[1])[0];
1002
 
1003
		/* and add offset to addr: */
1004
		addr = ir3_ADD_S(b, addr, 0, src1, 0);
1005
	}
1006
 
1007
	/* if offset is too large to encode in the ldg, split it out: */
1008
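	/* (1024 here is presumably the limit of ldg's immediate byte-offset
	 * encoding)
	 */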
	if ((off + (intr->num_components * 4)) > 1024) {
1009
		/* split out the minimal amount to improve the odds that
1010
		 * cp can fit the immediate in the add.s instruction:
1011
		 */
1012
		unsigned off2 = off + (intr->num_components * 4) - 1024;
1013
		addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
1014
		off -= off2;
1015
	}
1016
 
1017
	for (int i = 0; i < intr->num_components; i++) {
1018
		struct ir3_instruction *load =
1019
				ir3_LDG(b, addr, 0, create_immed(b, 1), 0);
1020
		load->cat6.type = TYPE_U32;
1021
		load->cat6.offset = off + i * 4;    /* byte offset */
1022
		dst[i] = load;
1023
	}
1024
}
1025
 
1026
/* handles array reads: */
1027
static void
1028
emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
1029
		struct ir3_instruction **dst)
1030
{
1031
	nir_deref_var *dvar = intr->variables[0];
1032
	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
1033
	struct ir3_array *arr = get_var(ctx, dvar->var);
1034
 
1035
	compile_assert(ctx, dvar->deref.child &&
1036
		(dvar->deref.child->deref_type == nir_deref_type_array));
1037
 
1038
	switch (darr->deref_array_type) {
1039
	case nir_deref_array_type_direct:
1040
		/* direct access does not require anything special: */
1041
		for (int i = 0; i < intr->num_components; i++) {
1042
			unsigned n = darr->base_offset * 4 + i;
1043
			compile_assert(ctx, n < arr->length);
1044
			dst[i] = arr->arr[n];
1045
		}
1046
		break;
1047
	case nir_deref_array_type_indirect: {
1048
		/* for indirect, we need to collect all the array elements: */
1049
		struct ir3_instruction *collect =
1050
				create_collect(ctx->block, arr->arr, arr->length);
1051
		struct ir3_instruction *addr =
1052
				get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
1053
		for (int i = 0; i < intr->num_components; i++) {
1054
			unsigned n = darr->base_offset * 4 + i;
1055
			compile_assert(ctx, n < arr->length);
1056
			dst[i] = create_indirect_load(ctx, arr->length, n, addr, collect);
1057
		}
1058
		break;
1059
	}
1060
	default:
1061
		compile_error(ctx, "Unhandled load deref type: %u\n",
1062
				darr->deref_array_type);
1063
		break;
1064
	}
1065
}
1066
 
1067
/* handles array writes: */
1068
static void
1069
emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
1070
{
1071
	nir_deref_var *dvar = intr->variables[0];
1072
	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
1073
	struct ir3_array *arr = get_var(ctx, dvar->var);
1074
	struct ir3_instruction **src;
1075
 
1076
	compile_assert(ctx, dvar->deref.child &&
1077
		(dvar->deref.child->deref_type == nir_deref_type_array));
1078
 
1079
	src = get_src(ctx, &intr->src[0]);
1080
 
1081
	switch (darr->deref_array_type) {
1082
	case nir_deref_array_type_direct:
1083
		/* direct access does not require anything special: */
1084
		for (int i = 0; i < intr->num_components; i++) {
1085
			unsigned n = darr->base_offset * 4 + i;
1086
			compile_assert(ctx, n < arr->length);
1087
			arr->arr[n] = src[i];
1088
		}
1089
		break;
1090
	case nir_deref_array_type_indirect: {
1091
		/* for indirect, create indirect-store and fan that out: */
1092
		struct ir3_instruction *collect =
1093
				create_collect(ctx->block, arr->arr, arr->length);
1094
		struct ir3_instruction *addr =
1095
				get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
1096
		for (int i = 0; i < intr->num_components; i++) {
1097
			struct ir3_instruction *store;
1098
			unsigned n = darr->base_offset * 4 + i;
1099
			compile_assert(ctx, n < arr->length);
1100
 
1101
			store = create_indirect_store(ctx, arr->length,
1102
					n, src[i], addr, collect);
1103
 
1104
			store->fanin->fi.aid = arr->aid;
1105
 
1106
			/* TODO: probably split this out to be used for
1107
			 * store_output_indirect? or move this into
1108
			 * create_indirect_store()?
1109
			 */
1110
			for (int j = i; j < arr->length; j += 4) {
1111
				struct ir3_instruction *split;
1112
 
1113
				split = ir3_instr_create(ctx->block, -1, OPC_META_FO);
1114
				split->fo.off = j;
1115
				ir3_reg_create(split, 0, 0);
1116
				ir3_reg_create(split, 0, IR3_REG_SSA)->instr = store;
1117
 
1118
				arr->arr[j] = split;
1119
			}
1120
		}
1121
		break;
1122
	}
1123
	default:
1124
		compile_error(ctx, "Unhandled store deref type: %u\n",
1125
				darr->deref_array_type);
1126
		break;
1127
	}
1128
}
1129
 
1130
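/* append a synthetic input to the shader variant for a system value
 * (base-vertex, vertex-id, instance-id, ..) and wire it up as a
 * block-level input:
 */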
static void add_sysval_input(struct ir3_compile *ctx, unsigned name,
1131
		struct ir3_instruction *instr)
1132
{
1133
	struct ir3_shader_variant *so = ctx->so;
1134
	unsigned r = regid(so->inputs_count, 0);
1135
	unsigned n = so->inputs_count++;
1136
 
1137
	so->inputs[n].semantic = ir3_semantic_name(name, 0);
1138
	so->inputs[n].compmask = 1;
1139
	so->inputs[n].regid = r;
1140
	so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT;
1141
	so->total_in++;
1142
 
1143
	ctx->block->ninputs = MAX2(ctx->block->ninputs, r + 1);
1144
	ctx->block->inputs[r] = instr;
1145
}
1146
 
1147
static void
1148
emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
1149
{
1150
	const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
1151
	struct ir3_instruction **dst, **src;
1152
	struct ir3_block *b = ctx->block;
1153
	unsigned idx = intr->const_index[0];
1154
 
1155
	if (info->has_dest) {
1156
		dst = get_dst(ctx, &intr->dest, intr->num_components);
1157
	}
1158
 
1159
	switch (intr->intrinsic) {
1160
	case nir_intrinsic_load_uniform:
1161
		compile_assert(ctx, intr->const_index[1] == 1);
1162
		for (int i = 0; i < intr->num_components; i++) {
1163
			unsigned n = idx * 4 + i;
1164
			dst[i] = create_uniform(ctx, n);
1165
		}
1166
		break;
1167
	case nir_intrinsic_load_uniform_indirect:
1168
		compile_assert(ctx, intr->const_index[1] == 1);
1169
		src = get_src(ctx, &intr->src[0]);
1170
		for (int i = 0; i < intr->num_components; i++) {
1171
			unsigned n = idx * 4 + i;
1172
			dst[i] = create_uniform_indirect(ctx, n,
1173
					get_addr(ctx, src[0]));
1174
		}
1175
		break;
1176
	case nir_intrinsic_load_ubo:
1177
	case nir_intrinsic_load_ubo_indirect:
1178
		emit_intrinsic_load_ubo(ctx, intr, dst);
1179
		break;
1180
	case nir_intrinsic_load_input:
1181
		compile_assert(ctx, intr->const_index[1] == 1);
1182
		for (int i = 0; i < intr->num_components; i++) {
1183
			unsigned n = idx * 4 + i;
1184
			dst[i] = b->inputs[n];
1185
		}
1186
		break;
1187
	case nir_intrinsic_load_input_indirect:
1188
		compile_assert(ctx, intr->const_index[1] == 1);
1189
		src = get_src(ctx, &intr->src[0]);
1190
		struct ir3_instruction *collect =
1191
				create_collect(b, b->inputs, b->ninputs);
1192
		struct ir3_instruction *addr = get_addr(ctx, src[0]);
1193
		for (int i = 0; i < intr->num_components; i++) {
1194
			unsigned n = idx * 4 + i;
1195
			dst[i] = create_indirect_load(ctx, b->ninputs, n, addr, collect);
1196
		}
1197
		break;
1198
	case nir_intrinsic_load_var:
1199
		emit_intrinisic_load_var(ctx, intr, dst);
1200
		break;
1201
	case nir_intrinsic_store_var:
1202
		emit_intrinisic_store_var(ctx, intr);
1203
		break;
1204
	case nir_intrinsic_store_output:
1205
		compile_assert(ctx, intr->const_index[1] == 1);
1206
		src = get_src(ctx, &intr->src[0]);
1207
		for (int i = 0; i < intr->num_components; i++) {
1208
			unsigned n = idx * 4 + i;
1209
			b->outputs[n] = src[i];
1210
		}
1211
		break;
1212
	case nir_intrinsic_load_base_vertex:
1213
		if (!ctx->basevertex) {
1214
			/* first four vec4 sysval's reserved for UBOs: */
1215
			unsigned r = regid(ctx->so->first_driver_param + 4, 0);
1216
			ctx->basevertex = create_uniform(ctx, r);
1217
			add_sysval_input(ctx, TGSI_SEMANTIC_BASEVERTEX,
1218
					ctx->basevertex);
1219
		}
1220
		dst[0] = ctx->basevertex;
1221
		break;
1222
	case nir_intrinsic_load_vertex_id_zero_base:
1223
		if (!ctx->vertex_id) {
1224
			ctx->vertex_id = create_input(ctx->block, NULL, 0);
1225
			add_sysval_input(ctx, TGSI_SEMANTIC_VERTEXID_NOBASE,
1226
					ctx->vertex_id);
1227
		}
1228
		dst[0] = ctx->vertex_id;
1229
		break;
1230
	case nir_intrinsic_load_instance_id:
1231
		if (!ctx->instance_id) {
1232
			ctx->instance_id = create_input(ctx->block, NULL, 0);
1233
			add_sysval_input(ctx, TGSI_SEMANTIC_INSTANCEID,
1234
					ctx->instance_id);
1235
		}
1236
		dst[0] = ctx->instance_id;
1237
		break;
1238
	case nir_intrinsic_discard_if:
1239
	case nir_intrinsic_discard: {
1240
		struct ir3_instruction *cond, *kill;
1241
 
1242
		if (intr->intrinsic == nir_intrinsic_discard_if) {
1243
			/* conditional discard: */
1244
			src = get_src(ctx, &intr->src[0]);
1245
			cond = ir3_b2n(b, src[0]);
1246
		} else {
1247
			/* unconditional discard: */
1248
			cond = create_immed(b, 1);
1249
		}
1250
 
1251
		cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
1252
		cond->cat2.condition = IR3_COND_NE;
1253
 
1254
		/* condition always goes in predicate register: */
1255
		cond->regs[0]->num = regid(REG_P0, 0);
1256
 
1257
		kill = ir3_KILL(b, cond, 0);
1258
 
1259
		ctx->kill[ctx->kill_count++] = kill;
1260
		ctx->so->has_kill = true;
1261
 
1262
		break;
1263
	}
1264
	default:
1265
		compile_error(ctx, "Unhandled intrinsic type: %s\n",
1266
				nir_intrinsic_infos[intr->intrinsic].name);
1267
		break;
1268
	}
1269
}
1270
 
1271
static void
1272
emit_load_const(struct ir3_compile *ctx, nir_load_const_instr *instr)
1273
{
1274
	struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def,
1275
			instr->def.num_components);
1276
	for (int i = 0; i < instr->def.num_components; i++)
1277
		dst[i] = create_immed(ctx->block, instr->value.u[i]);
1278
}
1279
 
1280
static void
1281
emit_undef(struct ir3_compile *ctx, nir_ssa_undef_instr *undef)
1282
{
1283
	struct ir3_instruction **dst = get_dst_ssa(ctx, &undef->def,
1284
			undef->def.num_components);
1285
	/* backend doesn't want undefined instructions, so just plug
1286
	 * in 0.0..
1287
	 */
1288
	for (int i = 0; i < undef->def.num_components; i++)
1289
		dst[i] = create_immed(ctx->block, fui(0.0));
1290
}
1291
 
1292
/*
1293
 * texture fetch/sample instructions:
1294
 */
1295
 
1296
static void
1297
tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
1298
{
1299
	unsigned coords, flags = 0;
1300
 
1301
	/* note: would use tex->coord_components.. except txs.. also,
1302
	 * since array index goes after shadow ref, we don't want to
1303
	 * count it:
1304
	 */
1305
	switch (tex->sampler_dim) {
1306
	case GLSL_SAMPLER_DIM_1D:
1307
	case GLSL_SAMPLER_DIM_BUF:
1308
		coords = 1;
1309
		break;
1310
	case GLSL_SAMPLER_DIM_2D:
1311
	case GLSL_SAMPLER_DIM_RECT:
1312
	case GLSL_SAMPLER_DIM_EXTERNAL:
1313
	case GLSL_SAMPLER_DIM_MS:
1314
		coords = 2;
1315
		break;
1316
	case GLSL_SAMPLER_DIM_3D:
1317
	case GLSL_SAMPLER_DIM_CUBE:
1318
		coords = 3;
1319
		flags |= IR3_INSTR_3D;
1320
		break;
1321
	}
1322
 
1323
	if (tex->is_shadow)
1324
		flags |= IR3_INSTR_S;
1325
 
1326
	if (tex->is_array)
1327
		flags |= IR3_INSTR_A;
1328
 
1329
	*flagsp = flags;
1330
	*coordsp = coords;
1331
}
1332
 
1333
static void
1334
emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
1335
{
1336
	struct ir3_block *b = ctx->block;
1337
	struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
1338
	struct ir3_instruction **coord, *lod, *compare, *proj, **off, **ddx, **ddy;
1339
	bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
1340
	unsigned i, coords, flags;
1341
	unsigned nsrc0 = 0, nsrc1 = 0;
1342
	type_t type;
1343
	opc_t opc;
1344
 
1345
	/* TODO: might just be one component for gathers? */
1346
	dst = get_dst(ctx, &tex->dest, 4);
1347
 
1348
	for (unsigned i = 0; i < tex->num_srcs; i++) {
1349
		switch (tex->src[i].src_type) {
1350
		case nir_tex_src_coord:
1351
			coord = get_src(ctx, &tex->src[i].src);
1352
			break;
1353
		case nir_tex_src_bias:
1354
			lod = get_src(ctx, &tex->src[i].src)[0];
1355
			has_bias = true;
1356
			break;
1357
		case nir_tex_src_lod:
1358
			lod = get_src(ctx, &tex->src[i].src)[0];
1359
			has_lod = true;
1360
			break;
1361
		case nir_tex_src_comparitor: /* shadow comparator */
1362
			compare = get_src(ctx, &tex->src[i].src)[0];
1363
			break;
1364
		case nir_tex_src_projector:
1365
			proj = get_src(ctx, &tex->src[i].src)[0];
1366
			has_proj = true;
1367
			break;
1368
		case nir_tex_src_offset:
1369
			off = get_src(ctx, &tex->src[i].src);
1370
			has_off = true;
1371
			break;
1372
		case nir_tex_src_ddx:
1373
			ddx = get_src(ctx, &tex->src[i].src);
1374
			break;
1375
		case nir_tex_src_ddy:
1376
			ddy = get_src(ctx, &tex->src[i].src);
1377
			break;
1378
		default:
1379
			compile_error(ctx, "Unhandled NIR tex src type: %d\n",
1380
					tex->src[i].src_type);
1381
			return;
1382
		}
1383
	}
1384
 
1385
	switch (tex->op) {
1386
	case nir_texop_tex:      opc = OPC_SAM;      break;
1387
	case nir_texop_txb:      opc = OPC_SAMB;     break;
1388
	case nir_texop_txl:      opc = OPC_SAML;     break;
1389
	case nir_texop_txd:      opc = OPC_SAMGQ;    break;
1390
	case nir_texop_txf:      opc = OPC_ISAML;    break;
1391
	case nir_texop_txf_ms:
1392
	case nir_texop_txs:
1393
	case nir_texop_lod:
1394
	case nir_texop_tg4:
1395
	case nir_texop_query_levels:
1396
		compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
1397
		return;
1398
	}
1399
 
1400
	tex_info(tex, &flags, &coords);
1401
 
1402
	/* scale up integer coords for TXF based on the LOD */
1403
	if (opc == OPC_ISAML) {
1404
		assert(has_lod);
1405
		for (i = 0; i < coords; i++)
1406
			coord[i] = ir3_SHL_B(b, coord[i], 0, lod, 0);
1407
	}
1408
	/*
1409
	 * lay out the first argument in the proper order:
1410
	 *  - actual coordinates first
1411
	 *  - shadow reference
1412
	 *  - array index
1413
	 *  - projection w
1414
	 *  - starting at offset 4, dpdx.xy, dpdy.xy
1415
	 *
1416
	 * bias/lod go into the second arg
1417
	 */
1418
 
1419
	/* insert tex coords: */
1420
	for (i = 0; i < coords; i++)
1421
		src0[nsrc0++] = coord[i];
1422
 
1423
	if (coords == 1) {
1424
		/* hw doesn't do 1d, so we treat it as 2d with
1425
		 * height of 1, and patch up the y coord.
1426
		 * TODO: y coord should be (int)0 in some cases..
1427
		 */
1428
		src0[nsrc0++] = create_immed(b, fui(0.5));
1429
	}
1430
 
1431
	if (tex->is_shadow)
1432
		src0[nsrc0++] = compare;
1433
 
1434
	if (tex->is_array)
1435
		src0[nsrc0++] = coord[coords];
1436
 
1437
	if (has_proj) {
1438
		src0[nsrc0++] = proj;
1439
		flags |= IR3_INSTR_P;
1440
	}
1441
 
1442
	/* pad to 4, then ddx/ddy: */
1443
	if (tex->op == nir_texop_txd) {
1444
		while (nsrc0 < 4)
1445
			src0[nsrc0++] = create_immed(b, fui(0.0));
1446
		for (i = 0; i < coords; i++)
1447
			src0[nsrc0++] = ddx[i];
1448
		if (coords < 2)
1449
			src0[nsrc0++] = create_immed(b, fui(0.0));
1450
		for (i = 0; i < coords; i++)
1451
			src0[nsrc0++] = ddy[i];
1452
		if (coords < 2)
1453
			src0[nsrc0++] = create_immed(b, fui(0.0));
1454
	}
1455
 
1456
	/*
1457
	 * second argument (if applicable):
1458
	 *  - offsets
1459
	 *  - lod
1460
	 *  - bias
1461
	 */
1462
	if (has_off | has_lod | has_bias) {
1463
		if (has_off) {
1464
			for (i = 0; i < coords; i++)
1465
				src1[nsrc1++] = off[i];
1466
			if (coords < 2)
1467
				src1[nsrc1++] = create_immed(b, fui(0.0));
1468
			flags |= IR3_INSTR_O;
1469
		}
1470
 
1471
		if (has_lod | has_bias)
1472
			src1[nsrc1++] = lod;
1473
	}
1474
 
1475
	switch (tex->dest_type) {
1476
	case nir_type_invalid:
1477
	case nir_type_float:
1478
		type = TYPE_F32;
1479
		break;
1480
	case nir_type_int:
1481
		type = TYPE_S32;
1482
		break;
1483
	case nir_type_unsigned:
1484
	case nir_type_bool:
1485
		type = TYPE_U32;
1486
		break;
1487
	}
1488
 
1489
	sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_XYZW,
1490
			flags, tex->sampler_index, tex->sampler_index,
1491
			create_collect(b, src0, nsrc0),
1492
			create_collect(b, src1, nsrc1));
1493
 
1494
	split_dest(b, dst, sam);
1495
}
1496
 
1497
static void
1498
emit_tex_query_levels(struct ir3_compile *ctx, nir_tex_instr *tex)
1499
{
1500
	struct ir3_block *b = ctx->block;
1501
	struct ir3_instruction **dst, *sam;
1502
 
1503
	dst = get_dst(ctx, &tex->dest, 1);
1504
 
1505
	sam = ir3_SAM(b, OPC_GETINFO, TYPE_U32, TGSI_WRITEMASK_Z, 0,
1506
			tex->sampler_index, tex->sampler_index, NULL, NULL);
1507
 
1508
	/* even though there is only one component, since it ends
1509
	 * up in .z rather than .x, we need a split_dest()
1510
	 */
1511
	split_dest(b, dst, sam);
1512
 
1513
	/* The # of levels comes from getinfo.z. We need to add 1 to it, since
1514
	 * the value in TEX_CONST_0 is zero-based.
1515
	 */
1516
	if (ctx->levels_add_one)
1517
		dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
1518
}
1519
 
1520
static void
1521
emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
1522
{
1523
	struct ir3_block *b = ctx->block;
1524
	struct ir3_instruction **dst, *sam, *lod;
1525
	unsigned flags, coords;
1526
 
1527
	tex_info(tex, &flags, &coords);
1528
 
1529
	dst = get_dst(ctx, &tex->dest, 4);
1530
 
1531
	compile_assert(ctx, tex->num_srcs == 1);
1532
	compile_assert(ctx, tex->src[0].src_type == nir_tex_src_lod);
1533
 
1534
	lod = get_src(ctx, &tex->src[0].src)[0];
1535
 
1536
	sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, TGSI_WRITEMASK_XYZW, flags,
1537
			tex->sampler_index, tex->sampler_index, lod, NULL);
1538
 
1539
	split_dest(b, dst, sam);
1540
 
1541
	/* Array size actually ends up in .w rather than .z. This doesn't
1542
	 * matter for miplevel 0, but for higher mips the value in z is
1543
	 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
1544
	 * returned, which means that we have to add 1 to it for arrays.
1545
	 */
1546
	if (tex->is_array) {
1547
		if (ctx->levels_add_one) {
1548
			dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
1549
		} else {
1550
			dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
1551
		}
1552
	}
1553
}
1554
 
1555
static void
1556
emit_instr(struct ir3_compile *ctx, nir_instr *instr)
1557
{
1558
	switch (instr->type) {
1559
	case nir_instr_type_alu:
1560
		emit_alu(ctx, nir_instr_as_alu(instr));
1561
		break;
1562
	case nir_instr_type_intrinsic:
1563
		emit_intrinisic(ctx, nir_instr_as_intrinsic(instr));
1564
		break;
1565
	case nir_instr_type_load_const:
1566
		emit_load_const(ctx, nir_instr_as_load_const(instr));
1567
		break;
1568
	case nir_instr_type_ssa_undef:
1569
		emit_undef(ctx, nir_instr_as_ssa_undef(instr));
1570
		break;
1571
	case nir_instr_type_tex: {
1572
		nir_tex_instr *tex = nir_instr_as_tex(instr);
1573
		/* couple tex instructions get special-cased:
1574
		 */
1575
		switch (tex->op) {
1576
		case nir_texop_txs:
1577
			emit_tex_txs(ctx, tex);
1578
			break;
1579
		case nir_texop_query_levels:
1580
			emit_tex_query_levels(ctx, tex);
1581
			break;
1582
		default:
1583
			emit_tex(ctx, tex);
1584
			break;
1585
		}
1586
		break;
1587
	}
1588
	case nir_instr_type_call:
1589
	case nir_instr_type_jump:
1590
	case nir_instr_type_phi:
1591
	case nir_instr_type_parallel_copy:
1592
		compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type);
1593
		break;
1594
	}
1595
}
1596
 
1597
static void
1598
emit_block(struct ir3_compile *ctx, nir_block *block)
1599
{
1600
	nir_foreach_instr(block, instr) {
1601
		emit_instr(ctx, instr);
1602
		if (ctx->error)
1603
			return;
1604
	}
1605
}
1606
 
1607
static void
1608
emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
1609
{
1610
	foreach_list_typed(nir_cf_node, node, node, &impl->body) {
1611
		switch (node->type) {
1612
		case nir_cf_node_block:
1613
			emit_block(ctx, nir_cf_node_as_block(node));
1614
			break;
1615
		case nir_cf_node_if:
1616
		case nir_cf_node_loop:
1617
		case nir_cf_node_function:
1618
			compile_error(ctx, "TODO\n");
1619
			break;
1620
		}
1621
		if (ctx->error)
1622
			return;
1623
	}
1624
}
1625
 
1626
static void
1627
setup_input(struct ir3_compile *ctx, nir_variable *in)
1628
{
1629
	struct ir3_shader_variant *so = ctx->so;
1630
	unsigned array_len = MAX2(glsl_get_length(in->type), 1);
1631
	unsigned ncomp = glsl_get_components(in->type);
1632
	/* XXX: map loc slots to semantics */
1633
	unsigned semantic_name = in->data.location;
1634
	unsigned semantic_index = in->data.index;
1635
	unsigned n = in->data.driver_location;
1636
 
1637
	DBG("; in: %u:%u, len=%ux%u, loc=%u\n",
1638
			semantic_name, semantic_index, array_len,
1639
			ncomp, n);
1640
 
1641
	so->inputs[n].semantic =
1642
			ir3_semantic_name(semantic_name, semantic_index);
1643
	so->inputs[n].compmask = (1 << ncomp) - 1;
1644
	so->inputs[n].inloc = ctx->next_inloc;
1645
	so->inputs[n].interpolate = 0;
1646
	so->inputs_count = MAX2(so->inputs_count, n + 1);
1647
 
1648
	/* the fdN_program_emit() code expects tgsi consts here, so map
1649
	 * things back to tgsi for now:
1650
	 */
1651
	switch (in->data.interpolation) {
1652
	case INTERP_QUALIFIER_FLAT:
1653
		so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT;
1654
		break;
1655
	case INTERP_QUALIFIER_NOPERSPECTIVE:
1656
		so->inputs[n].interpolate = TGSI_INTERPOLATE_LINEAR;
1657
		break;
1658
	case INTERP_QUALIFIER_SMOOTH:
1659
		so->inputs[n].interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
1660
		break;
1661
	}
1662
 
1663
	for (int i = 0; i < ncomp; i++) {
1664
		struct ir3_instruction *instr = NULL;
1665
		unsigned idx = (n * 4) + i;
1666
 
1667
		if (ctx->so->type == SHADER_FRAGMENT) {
1668
			if (semantic_name == TGSI_SEMANTIC_POSITION) {
1669
				so->inputs[n].bary = false;
1670
				so->frag_coord = true;
1671
				instr = create_frag_coord(ctx, i);
1672
			} else if (semantic_name == TGSI_SEMANTIC_FACE) {
1673
				so->inputs[n].bary = false;
1674
				so->frag_face = true;
1675
				instr = create_frag_face(ctx, i);
1676
			} else {
1677
				bool use_ldlv = false;
1678
 
1679
				/* with NIR, we need to infer TGSI_INTERPOLATE_COLOR
1680
				 * from the semantic name:
1681
				 */
1682
				if ((in->data.interpolation == INTERP_QUALIFIER_NONE) &&
1683
						((semantic_name == TGSI_SEMANTIC_COLOR) ||
1684
							(semantic_name == TGSI_SEMANTIC_BCOLOR)))
1685
					so->inputs[n].interpolate = TGSI_INTERPOLATE_COLOR;
1686
 
1687
				if (ctx->flat_bypass) {
1688
					/* with NIR, we need to infer TGSI_INTERPOLATE_COLOR
1689
					 * from the semantic name:
1690
					 */
1691
					switch (so->inputs[n].interpolate) {
1692
					case TGSI_INTERPOLATE_COLOR:
1693
						if (!ctx->so->key.rasterflat)
1694
							break;
1695
						/* fallthrough */
1696
					case TGSI_INTERPOLATE_CONSTANT:
1697
						use_ldlv = true;
1698
						break;
1699
					}
1700
				}
1701
 
1702
				so->inputs[n].bary = true;
1703
 
1704
				instr = create_frag_input(ctx,
1705
						so->inputs[n].inloc + i - 8, use_ldlv);
1706
			}
1707
		} else {
1708
			instr = create_input(ctx->block, NULL, idx);
1709
		}
1710
 
1711
		ctx->block->inputs[idx] = instr;
1712
	}
1713
 
1714
	if (so->inputs[n].bary || (ctx->so->type == SHADER_VERTEX)) {
1715
		ctx->next_inloc += ncomp;
1716
		so->total_in += ncomp;
1717
	}
1718
}
1719
 
1720
static void
1721
setup_output(struct ir3_compile *ctx, nir_variable *out)
1722
{
1723
	struct ir3_shader_variant *so = ctx->so;
1724
	unsigned array_len = MAX2(glsl_get_length(out->type), 1);
1725
	unsigned ncomp = glsl_get_components(out->type);
1726
	/* XXX: map loc slots to semantics */
1727
	unsigned semantic_name = out->data.location;
1728
	unsigned semantic_index = out->data.index;
1729
	unsigned n = out->data.driver_location;
1730
	unsigned comp = 0;
1731
 
1732
	DBG("; out: %u:%u, len=%ux%u, loc=%u\n",
1733
			semantic_name, semantic_index, array_len,
1734
			ncomp, n);
1735
 
1736
	if (ctx->so->type == SHADER_VERTEX) {
1737
		switch (semantic_name) {
1738
		case TGSI_SEMANTIC_POSITION:
1739
			so->writes_pos = true;
1740
			break;
1741
		case TGSI_SEMANTIC_PSIZE:
1742
			so->writes_psize = true;
1743
			break;
1744
		case TGSI_SEMANTIC_COLOR:
1745
		case TGSI_SEMANTIC_BCOLOR:
1746
		case TGSI_SEMANTIC_GENERIC:
1747
		case TGSI_SEMANTIC_FOG:
1748
		case TGSI_SEMANTIC_TEXCOORD:
1749
			break;
1750
		default:
1751
			compile_error(ctx, "unknown VS semantic name: %s\n",
1752
					tgsi_semantic_names[semantic_name]);
1753
		}
1754
	} else {
1755
		switch (semantic_name) {
1756
		case TGSI_SEMANTIC_POSITION:
1757
			comp = 2;  /* tgsi will write to .z component */
1758
			so->writes_pos = true;
1759
			break;
1760
		case TGSI_SEMANTIC_COLOR:
1761
			break;
1762
		default:
1763
			compile_error(ctx, "unknown FS semantic name: %s\n",
1764
					tgsi_semantic_names[semantic_name]);
1765
		}
1766
	}
1767
 
1768
	compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
1769
 
1770
	so->outputs[n].semantic =
1771
			ir3_semantic_name(semantic_name, semantic_index);
1772
	so->outputs[n].regid = regid(n, comp);
1773
	so->outputs_count = MAX2(so->outputs_count, n + 1);
1774
 
1775
	for (int i = 0; i < ncomp; i++) {
1776
		unsigned idx = (n * 4) + i;
1777
 
1778
		ctx->block->outputs[idx] = create_immed(ctx->block, fui(0.0));
1779
	}
1780
}
1781
 
static void
emit_instructions(struct ir3_compile *ctx)
{
	unsigned ninputs  = exec_list_length(&ctx->s->inputs) * 4;
	unsigned noutputs = exec_list_length(&ctx->s->outputs) * 4;

	/* we need to allocate a big enough outputs array so that
	 * we can stuff the kills at the end.  Likewise for vtx
	 * shaders, we need to leave room for sysvals:
	 */
	if (ctx->so->type == SHADER_FRAGMENT) {
		noutputs += ARRAY_SIZE(ctx->kill);
	} else if (ctx->so->type == SHADER_VERTEX) {
		ninputs += 8;
	}

	ctx->block = ir3_block_create(ctx->ir, 0, ninputs, noutputs);

	if (ctx->so->type == SHADER_FRAGMENT) {
		ctx->block->noutputs -= ARRAY_SIZE(ctx->kill);
	} else if (ctx->so->type == SHADER_VERTEX) {
		ctx->block->ninputs -= 8;
	}

	/* for fragment shaders, we have a single input register (usually
	 * r0.xy) which is used as the base for bary.f varying fetch instrs:
	 */
	if (ctx->so->type == SHADER_FRAGMENT) {
		// TODO maybe a helper for fi since we need it a few places..
		struct ir3_instruction *instr;
		instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
		ir3_reg_create(instr, 0, 0);
		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.x */
		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.y */
		ctx->frag_pos = instr;
	}

	/* Setup inputs: */
	foreach_list_typed(nir_variable, var, node, &ctx->s->inputs) {
		setup_input(ctx, var);
	}

	/* Setup outputs: */
	foreach_list_typed(nir_variable, var, node, &ctx->s->outputs) {
		setup_output(ctx, var);
	}

	/* Setup variables (which should only be arrays): */
	foreach_list_typed(nir_variable, var, node, &ctx->s->globals) {
		declare_var(ctx, var);
	}

	/* Find the main function and emit its body: */
	nir_foreach_overload(ctx->s, overload) {
		compile_assert(ctx, strcmp(overload->function->name, "main") == 0);
		compile_assert(ctx, overload->impl);
		emit_function(ctx, overload->impl);
		if (ctx->error)
			return;
	}
}
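/* Sizing arithmetic from the top of emit_instructions(), as a quick
 * illustration (variable counts invented; the 8 reserved vertex slots
 * and ARRAY_SIZE(ctx->kill) padding are taken from the code above):
 *
 *   // vertex shader with 3 input and 2 output variables:
 *   //   ninputs  = 3 * 4 + 8 = 20   // vec4 per variable + sysval room
 *   //   noutputs = 2 * 4     = 8
 *   // ir3_block_create() sees the padded sizes; ninputs is then trimmed
 *   // back to 12 so the reserved slots stay invisible until used.
 */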
 
/* from NIR perspective, we actually have inputs.  But most of the "inputs"
 * for a fragment shader are just bary.f instructions.  The *actual* inputs
 * from the hw perspective are the frag_pos and optionally frag_coord and
 * frag_face.
 */
static void
fixup_frag_inputs(struct ir3_compile *ctx)
{
	struct ir3_shader_variant *so = ctx->so;
	struct ir3_block *block = ctx->block;
	struct ir3_instruction **inputs;
	struct ir3_instruction *instr;
	int n, regid = 0;

	block->ninputs = 0;

	n  = 4;  /* always have frag_pos */
	n += COND(so->frag_face, 4);
	n += COND(so->frag_coord, 4);

	inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *)));

	if (so->frag_face) {
		/* this ultimately gets assigned to hr0.x so doesn't conflict
		 * with frag_coord/frag_pos..
		 */
		inputs[block->ninputs++] = ctx->frag_face;
		ctx->frag_face->regs[0]->num = 0;

		/* remaining channels not used, but let's avoid confusing
		 * other parts that expect inputs to come in groups of vec4
		 */
		inputs[block->ninputs++] = NULL;
		inputs[block->ninputs++] = NULL;
		inputs[block->ninputs++] = NULL;
	}

	/* since we don't know where to set the regid for frag_coord,
	 * we have to use r0.x for it.  But we don't want to *always*
	 * use r1.x for frag_pos as that could increase the register
	 * footprint on simple shaders:
	 */
	if (so->frag_coord) {
		ctx->frag_coord[0]->regs[0]->num = regid++;
		ctx->frag_coord[1]->regs[0]->num = regid++;
		ctx->frag_coord[2]->regs[0]->num = regid++;
		ctx->frag_coord[3]->regs[0]->num = regid++;

		inputs[block->ninputs++] = ctx->frag_coord[0];
		inputs[block->ninputs++] = ctx->frag_coord[1];
		inputs[block->ninputs++] = ctx->frag_coord[2];
		inputs[block->ninputs++] = ctx->frag_coord[3];
	}

	/* we always have frag_pos: */
	so->pos_regid = regid;

	/* r0.x */
	instr = create_input(block, NULL, block->ninputs);
	instr->regs[0]->num = regid++;
	inputs[block->ninputs++] = instr;
	ctx->frag_pos->regs[1]->instr = instr;

	/* r0.y */
	instr = create_input(block, NULL, block->ninputs);
	instr->regs[0]->num = regid++;
	inputs[block->ninputs++] = instr;
	ctx->frag_pos->regs[2]->instr = instr;

	block->inputs = inputs;
}
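/* Resulting hw-side input layout, sketched for a shader using both
 * frag_face and frag_coord (the r0.x = regid 0, r0.y = regid 1, ...
 * mapping is assumed from the comments above, not restated by the code):
 *
 *   frag_face            -> hr0.x  (own vec4 group, padded with 3 NULLs)
 *   frag_coord[0..3]     -> r0.x, r0.y, r0.z, r0.w
 *   frag_pos (bary base) -> r1.x, r1.y   with so->pos_regid = 4
 *
 * With neither in use, frag_pos lands at r0.x/r0.y and pos_regid is 0,
 * which keeps the register footprint small on simple shaders.
 */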
 
static void
compile_dump(struct ir3_compile *ctx)
{
	const char *name = (ctx->so->type == SHADER_VERTEX) ? "vert" : "frag";
	static unsigned n = 0;
	char fname[16];
	FILE *f;
	snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++);
	f = fopen(fname, "w");
	if (!f)
		return;
	ir3_block_depth(ctx->block);
	ir3_dump(ctx->ir, name, ctx->block, f);
	fclose(f);
}
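/* Usage note: with FD_DBG_OPTDUMP set this writes files named like
 * vert-0000.dot or frag-0001.dot into the current directory, one per
 * dump.  They are plain graphviz files, so (assuming graphviz is
 * installed) something like
 *
 *   dot -Tpng frag-0001.dot -o frag-0001.png
 *
 * renders them into viewable graphs.
 */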
 
int
ir3_compile_shader_nir(struct ir3_shader_variant *so,
		const struct tgsi_token *tokens, struct ir3_shader_key key)
{
	struct ir3_compile *ctx;
	struct ir3_block *block;
	struct ir3_instruction **inputs;
	unsigned i, j, actual_in;
	int ret = 0, max_bary;

	assert(!so->ir);

	so->ir = ir3_create();

	assert(so->ir);

	ctx = compile_init(so, tokens);
	if (!ctx) {
		DBG("INIT failed!");
		ret = -1;
		goto out;
	}

	emit_instructions(ctx);

	if (ctx->error) {
		DBG("EMIT failed!");
		ret = -1;
		goto out;
	}

	block = ctx->block;
	so->ir->block = block;

	/* keep track of the inputs from TGSI perspective.. */
	inputs = block->inputs;

	/* but fixup actual inputs for frag shader: */
	if (so->type == SHADER_FRAGMENT)
		fixup_frag_inputs(ctx);

	/* at this point, for binning pass, throw away unneeded outputs: */
	if (key.binning_pass) {
		for (i = 0, j = 0; i < so->outputs_count; i++) {
			unsigned name = sem2name(so->outputs[i].semantic);
			unsigned idx = sem2idx(so->outputs[i].semantic);

			/* throw away everything but first position/psize */
			if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
					(name == TGSI_SEMANTIC_PSIZE))) {
				if (i != j) {
					so->outputs[j] = so->outputs[i];
					block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
					block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
					block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
					block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
				}
				j++;
			}
		}
		so->outputs_count = j;
		block->noutputs = j * 4;
	}
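	/* Example of the binning-pass trim above (output names invented for
	 * illustration; only the compaction pattern comes from the code):
	 *
	 *   before: outputs[] = { POSITION, COLOR, PSIZE }   outputs_count = 3
	 *   after:  outputs[] = { POSITION, PSIZE }          outputs_count = 2
	 *
	 * The matching block->outputs[] vec4 groups are moved down the same
	 * way, so the binning variant only computes position/psize.
	 */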
 
	/* if we want half-precision outputs, mark the output registers
	 * as half:
	 */
	if (key.half_precision) {
		for (i = 0; i < block->noutputs; i++) {
			if (!block->outputs[i])
				continue;
			block->outputs[i]->regs[0]->flags |= IR3_REG_HALF;
		}
	}

	/* at this point, we want the kills in the outputs array too,
	 * so that they get scheduled (since they have no dst).. we've
	 * already ensured that the array is big enough in emit_instructions():
	 */
	if (so->type == SHADER_FRAGMENT) {
		for (i = 0; i < ctx->kill_count; i++)
			block->outputs[block->noutputs++] = ctx->kill[i];
	}

	if (fd_mesa_debug & FD_DBG_OPTDUMP)
		compile_dump(ctx);

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("BEFORE CP:\n");
		ir3_dump_instr_list(block->head);
	}

	ir3_block_depth(block);

	ir3_block_cp(block);

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("BEFORE GROUPING:\n");
		ir3_dump_instr_list(block->head);
	}

	/* Group left/right neighbors, inserting mov's where needed to
	 * solve conflicts:
	 */
	ir3_block_group(block);

	if (fd_mesa_debug & FD_DBG_OPTDUMP)
		compile_dump(ctx);

	ir3_block_depth(block);

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("AFTER DEPTH:\n");
		ir3_dump_instr_list(block->head);
	}

	ret = ir3_block_sched(block);
	if (ret) {
		DBG("SCHED failed!");
		goto out;
	}

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("AFTER SCHED:\n");
		ir3_dump_instr_list(block->head);
	}

	ret = ir3_block_ra(block, so->type, so->frag_coord, so->frag_face);
	if (ret) {
		DBG("RA failed!");
		goto out;
	}

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("AFTER RA:\n");
		ir3_dump_instr_list(block->head);
	}

	ir3_block_legalize(block, &so->has_samp, &max_bary);

	/* fixup inputs/outputs: */
	for (i = 0; i < so->outputs_count; i++) {
		so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
		/* preserve hack for depth output.. tgsi writes depth to .z,
		 * but what we give the hw is the scalar register:
		 */
		if ((so->type == SHADER_FRAGMENT) &&
			(sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
			so->outputs[i].regid += 2;
	}

	/* Note that some or all channels of an input may be unused: */
	actual_in = 0;
	for (i = 0; i < so->inputs_count; i++) {
		unsigned j, regid = ~0, compmask = 0;
		so->inputs[i].ncomp = 0;
		for (j = 0; j < 4; j++) {
			struct ir3_instruction *in = inputs[(i*4) + j];
			if (in) {
				compmask |= (1 << j);
				regid = in->regs[0]->num - j;
				actual_in++;
				so->inputs[i].ncomp++;
			}
		}
		so->inputs[i].regid = regid;
		so->inputs[i].compmask = compmask;
	}
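	/* Worked example for the loop above (register numbers invented,
	 * assuming the scalar numbering seen in fixup_frag_inputs(), i.e.
	 * r0.x = 0, r0.y = 1, ...; the bit/offset math is exactly the loop's):
	 *
	 *   // only .x and .z of input i are used, living in r2.x and r2.z:
	 *   //   j = 0: compmask |= 1 << 0 -> 0x1,  regid = 8  - 0 = 8
	 *   //   j = 2: compmask |= 1 << 2 -> 0x5,  regid = 10 - 2 = 8
	 *   //   so->inputs[i].ncomp = 2, actual_in advances by 2
	 *
	 * compmask records which channels are live, and regid always ends up
	 * pointing at the base of the vec4 regardless of which channel was
	 * visited last.
	 */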
 
	/* The fragment shader always gets full vec4's even if it doesn't
	 * fetch all components, but for the vertex shader we need to update
	 * with the actual number of components fetched, otherwise things
	 * will hang due to mismatch between VFD_DECODE's and
	 * TOTALATTRTOVS
	 */
	if (so->type == SHADER_VERTEX)
		so->total_in = actual_in;
	else
		so->total_in = align(max_bary + 1, 4);
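	/* Quick arithmetic check (numbers invented for illustration):
	 *
	 *   // FS: highest bary.f slot reported by legalize is max_bary = 5
	 *   //     -> total_in = align(5 + 1, 4) = 8   (rounded to a full vec4)
	 *   // VS: two attributes with 3 and 2 live components
	 *   //     -> actual_in = 5, so total_in = 5
	 */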
 
out:
	if (ret) {
		ir3_destroy(so->ir);
		so->ir = NULL;
	}
	compile_free(ctx);

	return ret;
}