Subversion Repositories Kolibri OS

Rev

Details | Last modification | View Log | RSS feed

Rev Author Line No. Line
5564 serge 1
/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
2
 
3
/*
4
 * Copyright (C) 2013 Rob Clark 
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a
7
 * copy of this software and associated documentation files (the "Software"),
8
 * to deal in the Software without restriction, including without limitation
9
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10
 * and/or sell copies of the Software, and to permit persons to whom the
11
 * Software is furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice (including the next
14
 * paragraph) shall be included in all copies or substantial portions of the
15
 * Software.
16
 *
17
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
 * SOFTWARE.
24
 *
25
 * Authors:
26
 *    Rob Clark 
27
 */
28
 
29
#include <stdarg.h>

#include "pipe/p_state.h"
#include "util/u_string.h"
#include "util/u_memory.h"
#include "util/u_inlines.h"
#include "tgsi/tgsi_lowering.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_ureg.h"
#include "tgsi/tgsi_info.h"
#include "tgsi/tgsi_strings.h"
#include "tgsi/tgsi_dump.h"
#include "tgsi/tgsi_scan.h"

#include "freedreno_util.h"

#include "ir3_compiler.h"
#include "ir3_shader.h"

#include "instr-a3xx.h"
#include "ir3.h"
50
 
51
/* Per-compile state for translating one TGSI shader into ir3. */
struct ir3_compile_context {
	/* token stream being compiled (possibly the lowered copy): */
	const struct tgsi_token *tokens;
	/* true when 'tokens' came from tgsi_transform_lowering() and must
	 * be free()'d in compile_free():
	 */
	bool free_tokens;
	struct ir3 *ir;
	struct ir3_shader_variant *so;
	/* copied from so->key.finteger_s / vinteger_s -- presumably a
	 * per-sampler "integer sampler" bitmask (TODO confirm):
	 */
	uint16_t integer_s;

	/* innermost block currently being emitted into: */
	struct ir3_block *block;
	/* most recent instruction from instr_create(): */
	struct ir3_instruction *current_instr;

	/* we need to defer updates to block->outputs[] until the end
	 * of an instruction (so we don't see new value until *after*
	 * the src registers are processed)
	 */
	struct {
		struct ir3_instruction *instr, **instrp;
	} output_updates[64];
	unsigned num_output_updates;

	/* are we in a sequence of "atomic" instructions?
	 */
	bool atomic;

	/* For fragment shaders, from the hw perspective the only
	 * actual input is r0.xy position register passed to bary.f.
	 * But TGSI doesn't know that, it still declares things as
	 * IN[] registers.  So we do all the input tracking normally
	 * and fix things up after compile_instructions()
	 *
	 * NOTE that frag_pos is the hardware position (possibly it
	 * is actually an index or tag or some such.. it is *not*
	 * values that can be directly used for gl_FragCoord..)
	 */
	struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];

	/* For vertex shaders, keep track of the system values sources */
	struct ir3_instruction *vertex_id, *basevertex, *instance_id;

	struct tgsi_parse_context parser;
	/* TGSI_PROCESSOR_* (from parser.FullHeader.Processor.Processor): */
	unsigned type;

	struct tgsi_shader_info info;

	/* hmm, would be nice if tgsi_scan_shader figured this out
	 * for us:
	 */
	struct {
		unsigned first, last;
		/* cached collect (fanin) of all array elements, invalidated
		 * via array_dirty at instr_finish():
		 */
		struct ir3_instruction *fanin;
	} array[MAX_ARRAYS];
	/* bitmask of array-ids whose fanin cache must be dropped: */
	uint32_t array_dirty;
	/* offset into array[], per file, of first array info */
	uint8_t array_offsets[TGSI_FILE_COUNT];

	/* for calculating input/output positions/linkages: */
	unsigned next_inloc;

	/* a4xx (at least patchlevel 0) cannot seem to flat-interpolate
	 * so we need to use ldlv.u32 to load the varying directly:
	 */
	bool flat_bypass;

	unsigned num_internal_temps;
	struct tgsi_src_register internal_temps[8];

	/* for looking up which system value is which */
	unsigned sysval_semantics[8];

	/* idx/slot for last compiler generated immediate */
	unsigned immediate_idx;

	/* stack of branch instructions that mark (potentially nested)
	 * branch if/else/loop/etc
	 */
	struct {
		struct ir3_instruction *instr, *cond;
		bool inv;   /* true iff in else leg of branch */
	} branch[16];
	unsigned int branch_count;

	/* list of kill instructions: */
	struct ir3_instruction *kill[16];
	unsigned int kill_count;

	/* used when dst is same as one of the src, to avoid overwriting a
	 * src element before the remaining scalar instructions that make
	 * up the vector operation
	 */
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;

	/* just for catching incorrect use of get_dst()/put_dst():
	 */
	bool using_tmp_dst;
};
146
 
147
 
148
/* forward declarations (definitions appear later in this file): */
static void vectorize(struct ir3_compile_context *ctx,
		struct ir3_instruction *instr, struct tgsi_dst_register *dst,
		int nsrcs, ...);
static void create_mov(struct ir3_compile_context *ctx,
		struct tgsi_dst_register *dst, struct tgsi_src_register *src);
static type_t get_ftype(struct ir3_compile_context *ctx);
static type_t get_utype(struct ir3_compile_context *ctx);
155
 
156
/* Reserve array-id slots in ctx->array[] for the given TGSI register
 * file, starting at slot 'i', and record the file's base slot in
 * ctx->array_offsets[].
 *
 * NOTE: returns the *updated* offset (i plus the number of slots
 * reserved), not a delta -- callers must assign, not accumulate.
 */
static unsigned setup_arrays(struct ir3_compile_context *ctx, unsigned file, unsigned i)
{
	/* ArrayID 0 for a given file is the legacy array spanning the entire file: */
	ctx->array[i].first = 0;
	ctx->array[i].last = ctx->info.file_max[file];
	ctx->array_offsets[file] = i;
	i += ctx->info.array_max[file] + 1;
	return i;
}
165
 
166
static unsigned
167
compile_init(struct ir3_compile_context *ctx, struct ir3_shader_variant *so,
168
		const struct tgsi_token *tokens)
169
{
170
	unsigned ret, i;
171
	struct tgsi_shader_info *info = &ctx->info;
172
	struct tgsi_lowering_config lconfig = {
173
			.color_two_side = so->key.color_two_side,
174
			.lower_DST  = true,
175
			.lower_XPD  = true,
176
			.lower_SCS  = true,
177
			.lower_LRP  = true,
178
			.lower_FRC  = true,
179
			.lower_POW  = true,
180
			.lower_LIT  = true,
181
			.lower_EXP  = true,
182
			.lower_LOG  = true,
183
			.lower_DP4  = true,
184
			.lower_DP3  = true,
185
			.lower_DPH  = true,
186
			.lower_DP2  = true,
187
			.lower_DP2A = true,
188
	};
189
 
190
	switch (so->type) {
191
	case SHADER_FRAGMENT:
192
	case SHADER_COMPUTE:
193
		lconfig.saturate_s = so->key.fsaturate_s;
194
		lconfig.saturate_t = so->key.fsaturate_t;
195
		lconfig.saturate_r = so->key.fsaturate_r;
196
		ctx->integer_s = so->key.finteger_s;
197
		break;
198
	case SHADER_VERTEX:
199
		lconfig.saturate_s = so->key.vsaturate_s;
200
		lconfig.saturate_t = so->key.vsaturate_t;
201
		lconfig.saturate_r = so->key.vsaturate_r;
202
		ctx->integer_s = so->key.vinteger_s;
203
		break;
204
	}
205
 
206
	if (!so->shader) {
207
		/* hack for standalone compiler which does not have
208
		 * screen/context:
209
		 */
210
	} else if (ir3_shader_gpuid(so->shader) >= 400) {
211
		/* a4xx seems to have *no* sam.p */
212
		lconfig.lower_TXP = ~0;  /* lower all txp */
213
		/* need special handling for "flat" */
214
		ctx->flat_bypass = true;
215
	} else {
216
		/* a3xx just needs to avoid sam.p for 3d tex */
217
		lconfig.lower_TXP = (1 << TGSI_TEXTURE_3D);
218
		/* no special handling for "flat" */
219
		ctx->flat_bypass = false;
220
	}
221
 
222
	ctx->tokens = tgsi_transform_lowering(&lconfig, tokens, &ctx->info);
223
	ctx->free_tokens = !!ctx->tokens;
224
	if (!ctx->tokens) {
225
		/* no lowering */
226
		ctx->tokens = tokens;
227
	}
228
	ctx->ir = so->ir;
229
	ctx->so = so;
230
	ctx->array_dirty = 0;
231
	ctx->next_inloc = 8;
232
	ctx->num_internal_temps = 0;
233
	ctx->branch_count = 0;
234
	ctx->kill_count = 0;
235
	ctx->block = NULL;
236
	ctx->current_instr = NULL;
237
	ctx->num_output_updates = 0;
238
	ctx->atomic = false;
239
	ctx->frag_pos = NULL;
240
	ctx->frag_face = NULL;
241
	ctx->vertex_id = NULL;
242
	ctx->instance_id = NULL;
243
	ctx->tmp_src = NULL;
244
	ctx->using_tmp_dst = false;
245
 
246
	memset(ctx->frag_coord, 0, sizeof(ctx->frag_coord));
247
	memset(ctx->array, 0, sizeof(ctx->array));
248
	memset(ctx->array_offsets, 0, sizeof(ctx->array_offsets));
249
 
250
#define FM(x) (1 << TGSI_FILE_##x)
251
	/* NOTE: if relative addressing is used, we set constlen in
252
	 * the compiler (to worst-case value) since we don't know in
253
	 * the assembler what the max addr reg value can be:
254
	 */
255
	if (info->indirect_files & FM(CONSTANT))
256
		so->constlen = MIN2(255, ctx->info.const_file_max[0] + 1);
257
 
258
	i = 0;
259
	i += setup_arrays(ctx, TGSI_FILE_INPUT, i);
260
	i += setup_arrays(ctx, TGSI_FILE_TEMPORARY, i);
261
	i += setup_arrays(ctx, TGSI_FILE_OUTPUT, i);
262
	/* any others? we don't track arrays for const..*/
263
 
264
	/* Immediates go after constants: */
265
	so->first_immediate = so->first_driver_param =
266
		info->const_file_max[0] + 1;
267
	/* 1 unit for the vertex id base */
268
	if (so->type == SHADER_VERTEX)
269
		so->first_immediate++;
270
	/* 4 (vec4) units for ubo base addresses */
271
	so->first_immediate += 4;
272
	ctx->immediate_idx = 4 * (ctx->info.file_max[TGSI_FILE_IMMEDIATE] + 1);
273
 
274
	ret = tgsi_parse_init(&ctx->parser, ctx->tokens);
275
	if (ret != TGSI_PARSE_OK)
276
		return ret;
277
 
278
	ctx->type = ctx->parser.FullHeader.Processor.Processor;
279
 
280
	return ret;
281
}
282
 
283
static void
284
compile_error(struct ir3_compile_context *ctx, const char *format, ...)
285
{
286
	va_list ap;
287
	va_start(ap, format);
288
	_debug_vprintf(format, ap);
289
	va_end(ap);
290
	tgsi_dump(ctx->tokens, 0);
291
	debug_assert(0);
292
}
293
 
294
/* like assert(), but routes failures through compile_error() so the
 * offending TGSI is dumped along with the failed condition:
 */
#define compile_assert(ctx, cond) do { \
		if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
	} while (0)
297
 
298
static void
299
compile_free(struct ir3_compile_context *ctx)
300
{
301
	if (ctx->free_tokens)
302
		free((void *)ctx->tokens);
303
	tgsi_parse_free(&ctx->parser);
304
}
305
 
306
/* table entry mapping one TGSI opcode to its translation handler: */
struct instr_translater {
	/* handler that emits ir3 for the instruction: */
	void (*fxn)(const struct instr_translater *t,
			struct ir3_compile_context *ctx,
			struct tgsi_full_instruction *inst);
	unsigned tgsi_opc;
	opc_t opc;
	opc_t hopc;    /* opc to use for half_precision mode, if different */
	unsigned arg;  /* extra handler-specific argument */
};
315
 
316
static void
317
instr_finish(struct ir3_compile_context *ctx)
318
{
319
	unsigned i;
320
 
321
	if (ctx->atomic)
322
		return;
323
 
324
	for (i = 0; i < ctx->num_output_updates; i++)
325
		*(ctx->output_updates[i].instrp) = ctx->output_updates[i].instr;
326
 
327
	ctx->num_output_updates = 0;
328
 
329
	while (ctx->array_dirty) {
330
		unsigned aid = ffs(ctx->array_dirty) - 1;
331
		ctx->array[aid].fanin = NULL;
332
		ctx->array_dirty &= ~(1 << aid);
333
	}
334
}
335
 
336
/* For "atomic" groups of instructions, for example the four scalar
337
 * instructions to perform a vec4 operation.  Basically this just
338
 * blocks out handling of output_updates so the next scalar instruction
339
 * still sees the result from before the start of the atomic group.
340
 *
341
 * NOTE: when used properly, this could probably replace get/put_dst()
342
 * stuff.
343
 */
344
static void
instr_atomic_start(struct ir3_compile_context *ctx)
{
	/* suppress instr_finish() side effects until instr_atomic_end() */
	ctx->atomic = true;
}
349
 
350
static void
instr_atomic_end(struct ir3_compile_context *ctx)
{
	/* leave the atomic group and flush the updates deferred during it */
	ctx->atomic = false;
	instr_finish(ctx);
}
356
 
357
/* Create a new ir3 instruction in the current block, first committing
 * (via instr_finish()) the register updates of the previous one.
 */
static struct ir3_instruction *
instr_create(struct ir3_compile_context *ctx, int category, opc_t opc)
{
	instr_finish(ctx);
	return (ctx->current_instr = ir3_instr_create(ctx->block, category, opc));
}
363
 
364
/* Create a new block nested in the current one (or the top-level block
 * when ctx->block is NULL), sizing its temporary/input/output register
 * tracking arrays, and make it current.
 */
static struct ir3_block *
push_block(struct ir3_compile_context *ctx)
{
	struct ir3_block *block;
	unsigned ntmp, nin, nout;

/* scalar register count for a TGSI file: 4 per vec4 register */
#define SCALAR_REGS(file) (4 * (ctx->info.file_max[TGSI_FILE_ ## file] + 1))

	/* hmm, give ourselves room to create 8 extra temporaries (vec4):
	 */
	ntmp = SCALAR_REGS(TEMPORARY);
	ntmp += 8 * 4;

	nout = SCALAR_REGS(OUTPUT);
	nin  = SCALAR_REGS(INPUT) + SCALAR_REGS(SYSTEM_VALUE);

	/* for outermost block, 'inputs' are the actual shader INPUT
	 * register file.  Reads from INPUT registers always go back to
	 * top block.  For nested blocks, 'inputs' is used to track any
	 * TEMPORARY file register from one of the enclosing blocks that
	 * is ready in this block.
	 */
	if (!ctx->block) {
		/* NOTE: fragment shaders actually have two inputs (r0.xy, the
		 * position)
		 */
		if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
			int n = 2;
			if (ctx->info.reads_position)
				n += 4;
			if (ctx->info.uses_frontface)
				n += 4;
			nin = MAX2(n, nin);
			/* reserve extra output slots for kill instructions: */
			nout += ARRAY_SIZE(ctx->kill);
		}
	} else {
		/* nested block: any enclosing temporary may become an input */
		nin = ntmp;
	}

	block = ir3_block_create(ctx->ir, ntmp, nin, nout);

	/* the kill slots were allocated above but are not real outputs,
	 * so don't count them in noutputs:
	 */
	if ((ctx->type == TGSI_PROCESSOR_FRAGMENT) && !ctx->block)
		block->noutputs -= ARRAY_SIZE(ctx->kill);

	block->parent = ctx->block;
	ctx->block = block;

	return block;
}
413
 
414
/* Return to the enclosing block; must never pop the top-level block. */
static void
pop_block(struct ir3_compile_context *ctx)
{
	ctx->block = ctx->block->parent;
	compile_assert(ctx, ctx->block);
}
420
 
421
static struct ir3_instruction *
422
create_output(struct ir3_block *block, struct ir3_instruction *instr,
423
		unsigned n)
424
{
425
	struct ir3_instruction *out;
426
 
427
	out = ir3_instr_create(block, -1, OPC_META_OUTPUT);
428
	out->inout.block = block;
429
	ir3_reg_create(out, n, 0);
430
	if (instr)
431
		ir3_reg_create(out, 0, IR3_REG_SSA)->instr = instr;
432
 
433
	return out;
434
}
435
 
436
static struct ir3_instruction *
437
create_input(struct ir3_block *block, struct ir3_instruction *instr,
438
		unsigned n)
439
{
440
	struct ir3_instruction *in;
441
 
442
	in = ir3_instr_create(block, -1, OPC_META_INPUT);
443
	in->inout.block = block;
444
	ir3_reg_create(in, n, 0);
445
	if (instr)
446
		ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr;
447
 
448
	return in;
449
}
450
 
451
static struct ir3_instruction *
452
block_input(struct ir3_block *block, unsigned n)
453
{
454
	/* references to INPUT register file always go back up to
455
	 * top level:
456
	 */
457
	if (block->parent)
458
		return block_input(block->parent, n);
459
	return block->inputs[n];
460
}
461
 
462
/* return temporary in scope, creating if needed meta-input node
 * to track block inputs
 *
 * May return NULL when the temporary has never been assigned in any
 * enclosing scope (caller substitutes a zero immediate, see
 * ssa_instr_get()).
 */
static struct ir3_instruction *
block_temporary(struct ir3_block *block, unsigned n)
{
	/* references to TEMPORARY register file, find the nearest
	 * enclosing block which has already assigned this temporary,
	 * creating meta-input instructions along the way to keep
	 * track of block inputs
	 */
	if (block->parent && !block->temporaries[n]) {
		/* if already have input for this block, reuse: */
		if (!block->inputs[n])
			block->inputs[n] = block_temporary(block->parent, n);

		/* and create new input to return: */
		return create_input(block, block->inputs[n], n);
	}
	return block->temporaries[n];
}
483
 
484
/* Create a cat1 mov loading the float immediate 'val'. */
static struct ir3_instruction *
create_immed(struct ir3_compile_context *ctx, float val)
{
	/* NOTE: *don't* use instr_create() here!
	 * (it would call instr_finish() and prematurely flush the
	 * deferred register updates of the instruction in progress)
	 */
	struct ir3_instruction *instr;
	instr = ir3_instr_create(ctx->block, 1, 0);
	instr->cat1.src_type = get_ftype(ctx);
	instr->cat1.dst_type = get_ftype(ctx);
	ir3_reg_create(instr, 0, 0);
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->fim_val = val;
	return instr;
}
497
 
498
static void
499
ssa_instr_set(struct ir3_compile_context *ctx, unsigned file, unsigned n,
500
		struct ir3_instruction *instr)
501
{
502
	struct ir3_block *block = ctx->block;
503
	unsigned idx = ctx->num_output_updates;
504
 
505
	compile_assert(ctx, idx < ARRAY_SIZE(ctx->output_updates));
506
 
507
	/* NOTE: defer update of temporaries[idx] or output[idx]
508
	 * until instr_finish(), so that if the current instruction
509
	 * reads the same TEMP/OUT[] it gets the old value:
510
	 *
511
	 * bleh.. this might be a bit easier to just figure out
512
	 * in instr_finish().  But at that point we've already
513
	 * lost information about OUTPUT vs TEMPORARY register
514
	 * file..
515
	 */
516
 
517
	switch (file) {
518
	case TGSI_FILE_OUTPUT:
519
		compile_assert(ctx, n < block->noutputs);
520
		ctx->output_updates[idx].instrp = &block->outputs[n];
521
		ctx->output_updates[idx].instr = instr;
522
		ctx->num_output_updates++;
523
		break;
524
	case TGSI_FILE_TEMPORARY:
525
		compile_assert(ctx, n < block->ntemporaries);
526
		ctx->output_updates[idx].instrp = &block->temporaries[n];
527
		ctx->output_updates[idx].instr = instr;
528
		ctx->num_output_updates++;
529
		break;
530
	case TGSI_FILE_ADDRESS:
531
		compile_assert(ctx, n < 1);
532
		ctx->output_updates[idx].instrp = &block->address;
533
		ctx->output_updates[idx].instr = instr;
534
		ctx->num_output_updates++;
535
		break;
536
	}
537
}
538
 
539
/* Look up the instruction currently producing scalar register 'n' of
 * the given TGSI file.  May return NULL (e.g. unknown file, or a
 * system value that was never set up).
 */
static struct ir3_instruction *
ssa_instr_get(struct ir3_compile_context *ctx, unsigned file, unsigned n)
{
	struct ir3_block *block = ctx->block;
	struct ir3_instruction *instr = NULL;

	switch (file) {
	case TGSI_FILE_INPUT:
		instr = block_input(ctx->block, n);
		break;
	case TGSI_FILE_OUTPUT:
		/* really this should just happen in case of 'MOV_SAT OUT[n], ..',
		 * for the following clamp instructions:
		 */
		instr = block->outputs[n];
		/* we don't have to worry about read from an OUTPUT that was
		 * assigned outside of the current block, because the _SAT
		 * clamp instructions will always be in the same block as
		 * the original instruction which wrote the OUTPUT
		 */
		compile_assert(ctx, instr);
		break;
	case TGSI_FILE_TEMPORARY:
		instr = block_temporary(ctx->block, n);
		if (!instr) {
			/* this can happen when registers (or components of a TGSI
			 * register) are used as src before they have been assigned
			 * (undefined contents).  To avoid confusing the rest of the
			 * compiler, and to generally keep things peachy, substitute
			 * an instruction that sets the src to 0.0.  Or to keep
			 * things undefined, I could plug in a random number? :-P
			 *
			 * NOTE: *don't* use instr_create() here!
			 */
			instr = create_immed(ctx, 0.0);
			/* no need to recreate the immed for every access: */
			block->temporaries[n] = instr;
		}
		break;
	case TGSI_FILE_SYSTEM_VALUE:
		/* n is a scalar register index; n >> 2 recovers the vec4
		 * sysval slot recorded in sysval_semantics[]:
		 */
		switch (ctx->sysval_semantics[n >> 2]) {
		case TGSI_SEMANTIC_VERTEXID_NOBASE:
			instr = ctx->vertex_id;
			break;
		case TGSI_SEMANTIC_BASEVERTEX:
			instr = ctx->basevertex;
			break;
		case TGSI_SEMANTIC_INSTANCEID:
			instr = ctx->instance_id;
			break;
		}
		break;
	}

	return instr;
}
595
 
596
/* Map an indirect dst register to its global array-id: the TGSI
 * ArrayID (per-file) plus that file's base offset into ctx->array[].
 */
static int dst_array_id(struct ir3_compile_context *ctx,
		const struct tgsi_dst_register *dst)
{
	// XXX complete hack to recover tgsi_full_dst_register...
	// nothing that isn't wrapped in a tgsi_full_dst_register
	// should be indirect
	const struct tgsi_full_dst_register *fdst = (const void *)dst;
	return fdst->Indirect.ArrayID + ctx->array_offsets[dst->File];
}
605
 
606
/* Map an indirect src register to its global array-id (see
 * dst_array_id()).  CONSTANT srcs are handled separately in ssa_src().
 */
static int src_array_id(struct ir3_compile_context *ctx,
		const struct tgsi_src_register *src)
{
	// XXX complete hack to recover tgsi_full_src_register...
	// nothing that isn't wrapped in a tgsi_full_src_register
	// should be indirect
	const struct tgsi_full_src_register *fsrc = (const void *)src;
	debug_assert(src->File != TGSI_FILE_CONSTANT);
	return fsrc->Indirect.ArrayID + ctx->array_offsets[src->File];
}
616
 
617
/* Get (or build and cache) the collect (fanin) instruction gathering
 * every scalar element of array 'aid' in 'file', used by relative-
 * addressed accesses.  The cache entry is invalidated via array_dirty
 * in instr_finish() once the array is written.
 */
static struct ir3_instruction *
array_fanin(struct ir3_compile_context *ctx, unsigned aid, unsigned file)
{
	struct ir3_instruction *instr;

	if (ctx->array[aid].fanin) {
		instr = ctx->array[aid].fanin;
	} else {
		unsigned first = ctx->array[aid].first;
		unsigned last  = ctx->array[aid].last;
		unsigned i, j;

		/* one dst plus 4 scalar srcs per vec4 element: */
		instr = ir3_instr_create2(ctx->block, -1, OPC_META_FI,
				1 + (4 * (last + 1 - first)));
		ir3_reg_create(instr, 0, 0);
		for (i = first; i <= last; i++) {
			for (j = 0; j < 4; j++) {
				unsigned n = regid(i, j);
				ir3_reg_create(instr, 0, IR3_REG_SSA)->instr =
						ssa_instr_get(ctx, file, n);
			}
		}
		ctx->array[aid].fanin = instr;
		ctx->array_dirty |= (1 << aid);
	}

	return instr;
}
645
 
646
/* Record 'instr' as the writer of dst channel 'chan'.  For indirect
 * (relative-addressed) dsts, set up the fanin/fanout machinery so the
 * whole array stays coherent; otherwise just record the scalar write.
 */
static void
ssa_dst(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		const struct tgsi_dst_register *dst, unsigned chan)
{
	if (dst->Indirect) {
		struct ir3_register *reg = instr->regs[0];
		unsigned i, aid = dst_array_id(ctx, dst);
		unsigned first = ctx->array[aid].first;
		unsigned last  = ctx->array[aid].last;
		unsigned off   = dst->Index - first; /* vec4 offset */

		reg->size = 4 * (1 + last - first);
		reg->offset = regid(off, chan);

		instr->fanin = array_fanin(ctx, aid, dst->File);

		/* annotate with the array-id, to help out the register-
		 * assignment stage.  At least for the case of indirect
		 * writes, we should capture enough dependencies to
		 * preserve the order of reads/writes of the array, so
		 * the multiple "names" for the array should end up all
		 * assigned to the same registers.
		 */
		instr->fanin->fi.aid = aid;

		/* Since we are scalarizing vec4 tgsi instructions/regs, we
		 * run into a slight complication here.  To do the naive thing
		 * and setup a fanout for each scalar array element would end
		 * up with the result that the instructions generated for each
		 * component of the vec4 would end up clobbering each other.
		 * So we take advantage here of knowing that the array index
		 * (after the shl.b) will be a multiple of four, and only set
		 * every fourth scalar component in the array.  See also
		 * fixup_ssa_dst_array()
		 */
		for (i = first; i <= last; i++) {
			struct ir3_instruction *split;
			unsigned n = regid(i, chan);
			/* NOTE: this 'off' (scalar split offset) intentionally
			 * shadows the vec4 'off' above:
			 */
			int off = (4 * (i - first)) + chan;

			if (is_meta(instr) && (instr->opc == OPC_META_FO))
				off -= instr->fo.off;

			split = ir3_instr_create(ctx->block, -1, OPC_META_FO);
			split->fo.off = off;
			ir3_reg_create(split, 0, 0);
			ir3_reg_create(split, 0, IR3_REG_SSA)->instr = instr;

			ssa_instr_set(ctx, dst->File, n, split);
		}
	} else {
		/* normal case (not relative addressed GPR) */
		ssa_instr_set(ctx, dst->File, regid(dst->Index, chan), instr);
	}
}
701
 
702
/* Wire up the SSA source for channel 'chan' of 'src' into register
 * 'reg'.  Handles three cases: relative-addressed GPRs (via a fanin
 * collect), 2D CONSTANT srcs (UBO loads via ldg), and the plain
 * direct-register case.
 */
static void
ssa_src(struct ir3_compile_context *ctx, struct ir3_register *reg,
		const struct tgsi_src_register *src, unsigned chan)
{
	struct ir3_instruction *instr;

	if (src->Indirect && (src->File != TGSI_FILE_CONSTANT)) {
		/* for relative addressing of gpr's (due to register assignment)
		 * we must generate a fanin instruction to collect all possible
		 * array elements that the instruction could address together:
		 */
		unsigned aid   = src_array_id(ctx, src);
		unsigned first = ctx->array[aid].first;
		unsigned last  = ctx->array[aid].last;
		unsigned off   = src->Index - first; /* vec4 offset */

		reg->size = 4 * (1 + last - first);
		reg->offset = regid(off, chan);

		instr = array_fanin(ctx, aid, src->File);
	} else if (src->File == TGSI_FILE_CONSTANT && src->Dimension) {
		/* 2D constant == UBO access; compute the address and emit an
		 * ldg (global load):
		 */
		const struct tgsi_full_src_register *fsrc = (const void *)src;
		struct ir3_instruction *temp = NULL;
		/* UBO base pointers live in the driver-params consts: */
		int ubo_regid = regid(ctx->so->first_driver_param, 0) +
			fsrc->Dimension.Index - 1;
		int offset = 0;

		/* We don't handle indirect UBO array accesses... yet. */
		compile_assert(ctx, !fsrc->Dimension.Indirect);
		/* UBOs start at index 1. */
		compile_assert(ctx, fsrc->Dimension.Index > 0);

		if (src->Indirect) {
			/* In case of an indirect index, it will have been loaded into an
			 * address register. There will be a sequence of
			 *
			 *   shl.b x, val, 2
			 *   mova a0, x
			 *
			 * We rely on this sequence to get the original val out and shift
			 * it by 4, since we're dealing in vec4 units.
			 */
			compile_assert(ctx, ctx->block->address);
			compile_assert(ctx, ctx->block->address->regs[1]->instr->opc ==
						   OPC_SHL_B);

			temp = instr = instr_create(ctx, 2, OPC_SHL_B);
			ir3_reg_create(instr, 0, 0);
			ir3_reg_create(instr, 0, IR3_REG_HALF | IR3_REG_SSA)->instr =
				ctx->block->address->regs[1]->instr->regs[1]->instr;
			ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;
		} else if (src->Index >= 64) {
			/* Otherwise it's a plain index (in vec4 units). Move it into a
			 * register.
			 */
			temp = instr = instr_create(ctx, 1, 0);
			instr->cat1.src_type = get_utype(ctx);
			instr->cat1.dst_type = get_utype(ctx);
			ir3_reg_create(instr, 0, 0);
			ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = src->Index * 16;
		} else {
			/* The offset is small enough to fit into the ldg instruction
			 * directly.
			 */
			offset = src->Index * 16;
		}

		if (temp) {
			/* If there was an offset (most common), add it to the buffer
			 * address.
			 */
			instr = instr_create(ctx, 2, OPC_ADD_S);
			ir3_reg_create(instr, 0, 0);
			ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = temp;
			ir3_reg_create(instr, ubo_regid, IR3_REG_CONST);
		} else {
			/* Otherwise just load the buffer address directly */
			instr = instr_create(ctx, 1, 0);
			instr->cat1.src_type = get_utype(ctx);
			instr->cat1.dst_type = get_utype(ctx);
			ir3_reg_create(instr, 0, 0);
			ir3_reg_create(instr, ubo_regid, IR3_REG_CONST);
		}

		temp = instr;

		/* the actual load: address in 'temp', byte offset folded in */
		instr = instr_create(ctx, 6, OPC_LDG);
		instr->cat6.type = TYPE_U32;
		instr->cat6.offset = offset + chan * 4;
		ir3_reg_create(instr, 0, 0);
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = temp;
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;

		/* the value now comes from the ldg result, not a const/relative: */
		reg->flags &= ~(IR3_REG_RELATIV | IR3_REG_CONST);
	} else {
		/* normal case (not relative addressed GPR) */
		instr = ssa_instr_get(ctx, src->File, regid(src->Index, chan));
	}

	if (instr) {
		reg->flags |= IR3_REG_SSA;
		reg->instr = instr;
	} else if (reg->flags & IR3_REG_SSA) {
		/* special hack for trans_samp() which calls ssa_src() directly
		 * to build up the collect (fanin) for const src.. (so SSA flag
		 * set but no src instr... it basically gets lucky because we
		 * default to 0.0 for "undefined" src instructions, which is
		 * what it wants.  We probably need to give it a better way to
		 * do this, but for now this hack:
		 */
		reg->instr = create_immed(ctx, 0.0);
	}
}
815
 
816
/* Create the dst register for 'instr' from a TGSI dst, handling the
 * relative-addressing bookkeeping and, for multi-component writes
 * (wrmask != 0x1), the per-component fanout (split) placeholders.
 * Returns the created ir3 register.
 */
static struct ir3_register *
add_dst_reg_wrmask(struct ir3_compile_context *ctx,
		struct ir3_instruction *instr, const struct tgsi_dst_register *dst,
		unsigned chan, unsigned wrmask)
{
	unsigned flags = 0, num = 0;
	struct ir3_register *reg;

	switch (dst->File) {
	case TGSI_FILE_OUTPUT:
	case TGSI_FILE_TEMPORARY:
		/* uses SSA */
		break;
	case TGSI_FILE_ADDRESS:
		flags |= IR3_REG_ADDR;
		/* uses SSA */
		break;
	default:
		compile_error(ctx, "unsupported dst register file: %s\n",
			tgsi_file_name(dst->File));
		break;
	}

	if (dst->Indirect) {
		flags |= IR3_REG_RELATIV;

		/* shouldn't happen, and we can't cope with it below: */
		compile_assert(ctx, wrmask == 0x1);

		compile_assert(ctx, ctx->block->address);
		if (instr->address)
			compile_assert(ctx, ctx->block->address == instr->address);

		instr->address = ctx->block->address;
		array_insert(ctx->ir->indirects, instr);
	}

	reg = ir3_reg_create(instr, regid(num, chan), flags);
	reg->wrmask = wrmask;

	if (wrmask == 0x1) {
		/* normal case */
		ssa_dst(ctx, instr, dst, chan);
	} else if ((dst->File == TGSI_FILE_TEMPORARY) ||
			(dst->File == TGSI_FILE_OUTPUT) ||
			(dst->File == TGSI_FILE_ADDRESS)) {
		struct ir3_instruction *prev = NULL;
		unsigned i;

		compile_assert(ctx, !dst->Indirect);

		/* if instruction writes multiple, we need to create
		 * some place-holder collect the registers:
		 */
		for (i = 0; i < 4; i++) {
			/* NOTE: slightly ugly that we setup neighbor ptrs
			 * for FO here, but handle FI in CP pass.. we should
			 * probably just always setup neighbor ptrs in the
			 * frontend?
			 */
			struct ir3_instruction *split =
					ir3_instr_create(ctx->block, -1, OPC_META_FO);
			split->fo.off = i;
			/* unused dst reg: */
			/* NOTE: set SSA flag on dst here, because unused FO's
			 * which don't get scheduled will end up not in the
			 * instruction list when RA sets SSA flag on each dst.
			 * Slight hack.  We really should set SSA flag on
			 * every dst register in the frontend.
			 */
			ir3_reg_create(split, 0, IR3_REG_SSA);
			/* and src reg used to hold original instr */
			ir3_reg_create(split, 0, IR3_REG_SSA)->instr = instr;
			/* link the splits as left/right neighbors: */
			if (prev) {
				split->cp.left = prev;
				split->cp.left_cnt++;
				prev->cp.right = split;
				prev->cp.right_cnt++;
			}
			if ((wrmask & (1 << i)) && !ctx->atomic)
				ssa_dst(ctx, split, dst, chan+i);
			prev = split;
		}
	}

	return reg;
}
903
 
904
/* convenience wrapper for the common single-component (wrmask=0x1) case: */
static struct ir3_register *
add_dst_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		const struct tgsi_dst_register *dst, unsigned chan)
{
	return add_dst_reg_wrmask(ctx, instr, dst, chan, 0x1);
}
910
 
911
static struct ir3_register *
912
add_src_reg_wrmask(struct ir3_compile_context *ctx,
913
		struct ir3_instruction *instr, const struct tgsi_src_register *src,
914
		unsigned chan, unsigned wrmask)
915
{
916
	unsigned flags = 0, num = 0;
917
	struct ir3_register *reg;
918
 
919
	switch (src->File) {
920
	case TGSI_FILE_IMMEDIATE:
921
		/* TODO if possible, use actual immediate instead of const.. but
922
		 * TGSI has vec4 immediates, we can only embed scalar (of limited
923
		 * size, depending on instruction..)
924
		 */
925
		flags |= IR3_REG_CONST;
926
		num = src->Index + ctx->so->first_immediate;
927
		break;
928
	case TGSI_FILE_CONSTANT:
929
		flags |= IR3_REG_CONST;
930
		num = src->Index;
931
		break;
932
	case TGSI_FILE_OUTPUT:
933
		/* NOTE: we should only end up w/ OUTPUT file for things like
934
		 * clamp()'ing saturated dst instructions
935
		 */
936
	case TGSI_FILE_INPUT:
937
	case TGSI_FILE_TEMPORARY:
938
	case TGSI_FILE_SYSTEM_VALUE:
939
		/* uses SSA */
940
		break;
941
	default:
942
		compile_error(ctx, "unsupported src register file: %s\n",
943
			tgsi_file_name(src->File));
944
		break;
945
	}
946
 
947
	/* We seem to have 8 bits (6.2) for dst register always, so I think
948
	 * it is safe to assume GPR cannot be >=64
949
	 *
950
	 * cat3 instructions only have 8 bits for src2, but cannot take a
951
	 * const for src2
952
	 *
953
	 * cat5 and cat6 in some cases only has 8 bits, but cannot take a
954
	 * const for any src.
955
	 *
956
	 * Other than that we seem to have 12 bits to encode const src,
957
	 * except for cat1 which may only have 11 bits (but that seems like
958
	 * a bug)
959
	 */
960
	if (flags & IR3_REG_CONST)
961
		compile_assert(ctx, src->Index < (1 << 9));
962
	else
963
		compile_assert(ctx, src->Index < (1 << 6));
964
 
965
	/* NOTE: abs/neg modifiers in tgsi only apply to float */
966
	if (src->Absolute)
967
		flags |= IR3_REG_FABS;
968
	if (src->Negate)
969
		flags |= IR3_REG_FNEG;
970
 
971
	if (src->Indirect) {
972
		flags |= IR3_REG_RELATIV;
973
 
974
		/* shouldn't happen, and we can't cope with it below: */
975
		compile_assert(ctx, wrmask == 0x1);
976
 
977
		compile_assert(ctx, ctx->block->address);
978
		if (instr->address)
979
			compile_assert(ctx, ctx->block->address == instr->address);
980
 
981
		instr->address = ctx->block->address;
982
		array_insert(ctx->ir->indirects, instr);
983
	}
984
 
985
	reg = ir3_reg_create(instr, regid(num, chan), flags);
986
	reg->wrmask = wrmask;
987
 
988
	if (wrmask == 0x1) {
989
		/* normal case */
990
		ssa_src(ctx, reg, src, chan);
991
	} else if ((src->File == TGSI_FILE_TEMPORARY) ||
992
			(src->File == TGSI_FILE_OUTPUT) ||
993
			(src->File == TGSI_FILE_INPUT)) {
994
		struct ir3_instruction *collect;
995
		unsigned i;
996
 
997
		compile_assert(ctx, !src->Indirect);
998
 
999
		/* if instruction reads multiple, we need to create
1000
		 * some place-holder collect the registers:
1001
		 */
1002
		collect = ir3_instr_create(ctx->block, -1, OPC_META_FI);
1003
		ir3_reg_create(collect, 0, 0);   /* unused dst reg */
1004
 
1005
		for (i = 0; i < 4; i++) {
1006
			if (wrmask & (1 << i)) {
1007
				/* and src reg used point to the original instr */
1008
				ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
1009
						src, chan + i);
1010
			} else if (wrmask & ~((i << i) - 1)) {
1011
				/* if any remaining components, then dummy
1012
				 * placeholder src reg to fill in the blanks:
1013
				 */
1014
				ir3_reg_create(collect, 0, 0);
1015
			}
1016
		}
1017
 
1018
		reg->flags |= IR3_REG_SSA;
1019
		reg->instr = collect;
1020
	}
1021
 
1022
	return reg;
1023
}
1024
 
1025
static struct ir3_register *
add_src_reg(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		const struct tgsi_src_register *src, unsigned chan)
{
	/* Scalar read: delegate with only the .x bit of the wrmask set. */
	const unsigned scalar_wrmask = 0x1;
	return add_src_reg_wrmask(ctx, instr, src, chan, scalar_wrmask);
}
1031
 
1032
static void
1033
src_from_dst(struct tgsi_src_register *src, struct tgsi_dst_register *dst)
1034
{
1035
	src->File      = dst->File;
1036
	src->Indirect  = dst->Indirect;
1037
	src->Dimension = dst->Dimension;
1038
	src->Index     = dst->Index;
1039
	src->Absolute  = 0;
1040
	src->Negate    = 0;
1041
	src->SwizzleX  = TGSI_SWIZZLE_X;
1042
	src->SwizzleY  = TGSI_SWIZZLE_Y;
1043
	src->SwizzleZ  = TGSI_SWIZZLE_Z;
1044
	src->SwizzleW  = TGSI_SWIZZLE_W;
1045
}
1046
 
1047
/* Get internal-temp src/dst to use for a sequence of instructions
1048
 * generated by a single TGSI op.
1049
 */
1050
static struct tgsi_src_register *
1051
get_internal_temp(struct ir3_compile_context *ctx,
1052
		struct tgsi_dst_register *tmp_dst)
1053
{
1054
	struct tgsi_src_register *tmp_src;
1055
	int n;
1056
 
1057
	tmp_dst->File      = TGSI_FILE_TEMPORARY;
1058
	tmp_dst->WriteMask = TGSI_WRITEMASK_XYZW;
1059
	tmp_dst->Indirect  = 0;
1060
	tmp_dst->Dimension = 0;
1061
 
1062
	/* assign next temporary: */
1063
	n = ctx->num_internal_temps++;
1064
	compile_assert(ctx, n < ARRAY_SIZE(ctx->internal_temps));
1065
	tmp_src = &ctx->internal_temps[n];
1066
 
1067
	tmp_dst->Index = ctx->info.file_max[TGSI_FILE_TEMPORARY] + n + 1;
1068
 
1069
	src_from_dst(tmp_src, tmp_dst);
1070
 
1071
	return tmp_src;
1072
}
1073
 
1074
static inline bool
1075
is_const(struct tgsi_src_register *src)
1076
{
1077
	return (src->File == TGSI_FILE_CONSTANT) ||
1078
			(src->File == TGSI_FILE_IMMEDIATE);
1079
}
1080
 
1081
static inline bool
1082
is_relative(struct tgsi_src_register *src)
1083
{
1084
	return src->Indirect;
1085
}
1086
 
1087
static inline bool
is_rel_or_const(struct tgsi_src_register *src)
{
	/* srcs that cannot be consumed directly by some instructions: */
	if (is_relative(src))
		return true;
	return is_const(src);
}
1092
 
1093
static type_t
1094
get_ftype(struct ir3_compile_context *ctx)
1095
{
1096
	return TYPE_F32;
1097
}
1098
 
1099
static type_t
1100
get_utype(struct ir3_compile_context *ctx)
1101
{
1102
	return TYPE_U32;
1103
}
1104
 
1105
static type_t
1106
get_stype(struct ir3_compile_context *ctx)
1107
{
1108
	return TYPE_S32;
1109
}
1110
 
1111
static unsigned
1112
src_swiz(struct tgsi_src_register *src, int chan)
1113
{
1114
	switch (chan) {
1115
	case 0: return src->SwizzleX;
1116
	case 1: return src->SwizzleY;
1117
	case 2: return src->SwizzleZ;
1118
	case 3: return src->SwizzleW;
1119
	}
1120
	assert(0);
1121
	return 0;
1122
}
1123
 
1124
/* for instructions that cannot take a const register as src, if needed
1125
 * generate a move to temporary gpr:
1126
 */
1127
static struct tgsi_src_register *
1128
get_unconst(struct ir3_compile_context *ctx, struct tgsi_src_register *src)
1129
{
1130
	struct tgsi_dst_register tmp_dst;
1131
	struct tgsi_src_register *tmp_src;
1132
 
1133
	compile_assert(ctx, is_rel_or_const(src));
1134
 
1135
	tmp_src = get_internal_temp(ctx, &tmp_dst);
1136
 
1137
	create_mov(ctx, &tmp_dst, src);
1138
 
1139
	return tmp_src;
1140
}
1141
 
1142
/* Find (or allocate) a slot in the shader's immediate-constant table
 * for 'val', and fill in 'reg' as a TGSI immediate src with the
 * matching component replicated across all four swizzle channels.
 */
static void
get_immediate(struct ir3_compile_context *ctx,
		struct tgsi_src_register *reg, uint32_t val)
{
	unsigned neg, swiz, idx, i;
	/* actually maps 1:1 currently.. not sure if that is safe to rely on: */
	static const unsigned swiz2tgsi[] = {
			TGSI_SWIZZLE_X, TGSI_SWIZZLE_Y, TGSI_SWIZZLE_Z, TGSI_SWIZZLE_W,
	};

	/* scan already-emitted immediate components for one we can reuse: */
	for (i = 0; i < ctx->immediate_idx; i++) {
		swiz = i % 4;
		idx  = i / 4;

		if (ctx->so->immediates[idx].val[swiz] == val) {
			neg = 0;
			break;
		}

		/* NOTE(review): matching '-val' reuses an entry with the negate
		 * flag set; this compares the two's-complement negation of the
		 * raw 32b pattern, which is not float negation -- presumably
		 * only effective/intended for integer-valued immediates; confirm.
		 */
		if (ctx->so->immediates[idx].val[swiz] == -val) {
			neg = 1;
			break;
		}
	}

	if (i == ctx->immediate_idx) {
		/* need to generate a new immediate: */
		swiz = i % 4;
		idx  = i / 4;
		neg  = 0;
		ctx->so->immediates[idx].val[swiz] = val;
		ctx->so->immediates_count = idx + 1;
		ctx->immediate_idx++;
	}

	/* broadcast the chosen vec4 component to every channel: */
	reg->File      = TGSI_FILE_IMMEDIATE;
	reg->Indirect  = 0;
	reg->Dimension = 0;
	reg->Index     = idx;
	reg->Absolute  = 0;
	reg->Negate    = neg;
	reg->SwizzleX  = swiz2tgsi[swiz];
	reg->SwizzleY  = swiz2tgsi[swiz];
	reg->SwizzleZ  = swiz2tgsi[swiz];
	reg->SwizzleW  = swiz2tgsi[swiz];
}
1188
 
1189
static void
1190
create_mov(struct ir3_compile_context *ctx, struct tgsi_dst_register *dst,
1191
		struct tgsi_src_register *src)
1192
{
1193
	type_t type_mov = get_ftype(ctx);
1194
	unsigned i;
1195
 
1196
	for (i = 0; i < 4; i++) {
1197
		/* move to destination: */
1198
		if (dst->WriteMask & (1 << i)) {
1199
			struct ir3_instruction *instr;
1200
 
1201
			if (src->Absolute || src->Negate) {
1202
				/* can't have abs or neg on a mov instr, so use
1203
				 * absneg.f instead to handle these cases:
1204
				 */
1205
				instr = instr_create(ctx, 2, OPC_ABSNEG_F);
1206
			} else {
1207
				instr = instr_create(ctx, 1, 0);
1208
				instr->cat1.src_type = type_mov;
1209
				instr->cat1.dst_type = type_mov;
1210
			}
1211
 
1212
			add_dst_reg(ctx, instr, dst, i);
1213
			add_src_reg(ctx, instr, src, src_swiz(src, i));
1214
		}
1215
	}
1216
}
1217
 
1218
static void
1219
create_clamp(struct ir3_compile_context *ctx,
1220
		struct tgsi_dst_register *dst, struct tgsi_src_register *val,
1221
		struct tgsi_src_register *minval, struct tgsi_src_register *maxval)
1222
{
1223
	struct ir3_instruction *instr;
1224
 
1225
	instr = instr_create(ctx, 2, OPC_MAX_F);
1226
	vectorize(ctx, instr, dst, 2, val, 0, minval, 0);
1227
 
1228
	instr = instr_create(ctx, 2, OPC_MIN_F);
1229
	vectorize(ctx, instr, dst, 2, val, 0, maxval, 0);
1230
}
1231
 
1232
static void
1233
create_clamp_imm(struct ir3_compile_context *ctx,
1234
		struct tgsi_dst_register *dst,
1235
		uint32_t minval, uint32_t maxval)
1236
{
1237
	struct tgsi_src_register minconst, maxconst;
1238
	struct tgsi_src_register src;
1239
 
1240
	src_from_dst(&src, dst);
1241
 
1242
	get_immediate(ctx, &minconst, minval);
1243
	get_immediate(ctx, &maxconst, maxval);
1244
 
1245
	create_clamp(ctx, dst, &src, &minconst, &maxconst);
1246
}
1247
 
1248
/* Return the dst register to use for an instruction, substituting an
 * internal temporary when the dst aliases one of the srcs (so that
 * per-channel writes cannot clobber src components that are still to
 * be read).  Must be paired with put_dst().
 */
static struct tgsi_dst_register *
get_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	unsigned i;

	/* only one get_dst()/put_dst() pair may be outstanding: */
	compile_assert(ctx, !ctx->using_tmp_dst);
	ctx->using_tmp_dst = true;

	for (i = 0; i < inst->Instruction.NumSrcRegs; i++) {
		struct tgsi_src_register *src = &inst->Src[i].Register;
		if ((src->File == dst->File) && (src->Index == dst->Index)) {
			/* full xyzw write with identity swizzle is safe, since
			 * each per-channel instruction reads only the channel
			 * it is about to write:
			 */
			if ((dst->WriteMask == TGSI_WRITEMASK_XYZW) &&
					(src->SwizzleX == TGSI_SWIZZLE_X) &&
					(src->SwizzleY == TGSI_SWIZZLE_Y) &&
					(src->SwizzleZ == TGSI_SWIZZLE_Z) &&
					(src->SwizzleW == TGSI_SWIZZLE_W))
				continue;
			/* otherwise redirect the write into a fresh temp;
			 * put_dst() will mov it back to the real dst:
			 */
			ctx->tmp_src = get_internal_temp(ctx, &ctx->tmp_dst);
			ctx->tmp_dst.WriteMask = dst->WriteMask;
			dst = &ctx->tmp_dst;
			break;
		}
	}
	return dst;
}
1274
 
1275
static void
1276
put_dst(struct ir3_compile_context *ctx, struct tgsi_full_instruction *inst,
1277
		struct tgsi_dst_register *dst)
1278
{
1279
	compile_assert(ctx, ctx->using_tmp_dst);
1280
	ctx->using_tmp_dst = false;
1281
 
1282
	/* if necessary, add mov back into original dst: */
1283
	if (dst != &inst->Dst[0].Register) {
1284
		create_mov(ctx, &inst->Dst[0].Register, ctx->tmp_src);
1285
	}
1286
}
1287
 
1288
/* helper to generate the necessary repeat and/or additional instructions
 * to turn a scalar instruction into a vector operation:
 *
 * 'instr' is used for the first enabled channel; an identically-
 * configured clone is created for each further channel.  The varargs
 * are 'nsrcs' pairs of (struct tgsi_src_register *, unsigned flags);
 * if flags contains IR3_REG_IMMED, the pointer slot actually carries
 * an integer immediate value instead of a src pointer.
 */
static void
vectorize(struct ir3_compile_context *ctx, struct ir3_instruction *instr,
		struct tgsi_dst_register *dst, int nsrcs, ...)
{
	va_list ap;
	int i, j, n = 0;

	/* keep the emitted per-channel group together: */
	instr_atomic_start(ctx);

	for (i = 0; i < 4; i++) {
		if (dst->WriteMask & (1 << i)) {
			struct ir3_instruction *cur;

			if (n++ == 0) {
				cur = instr;
			} else {
				/* clone the template instruction for this channel: */
				cur = instr_create(ctx, instr->category, instr->opc);
				memcpy(cur->info, instr->info, sizeof(cur->info));
			}

			add_dst_reg(ctx, cur, dst, i);

			/* re-walk the vararg list for every channel: */
			va_start(ap, nsrcs);
			for (j = 0; j < nsrcs; j++) {
				struct tgsi_src_register *src =
						va_arg(ap, struct tgsi_src_register *);
				unsigned flags = va_arg(ap, unsigned);
				struct ir3_register *reg;
				if (flags & IR3_REG_IMMED) {
					reg = ir3_reg_create(cur, 0, IR3_REG_IMMED);
					/* this is an ugly cast.. should have put flags first! */
					reg->iim_val = *(int *)&src;
				} else {
					reg = add_src_reg(ctx, cur, src, src_swiz(src, i));
				}
				/* OR in caller flags, except negate bits which must
				 * toggle (the src may already carry a negate):
				 */
				reg->flags |= flags & ~(IR3_REG_FNEG | IR3_REG_SNEG);
				if (flags & IR3_REG_FNEG)
					reg->flags ^= IR3_REG_FNEG;
				if (flags & IR3_REG_SNEG)
					reg->flags ^= IR3_REG_SNEG;
			}
			va_end(ap);
		}
	}

	instr_atomic_end(ctx);
}
1338
 
1339
/*
1340
 * Handlers for TGSI instructions which do not have a 1:1 mapping to
1341
 * native instructions:
1342
 */
1343
 
1344
static void
1345
trans_clamp(const struct instr_translater *t,
1346
		struct ir3_compile_context *ctx,
1347
		struct tgsi_full_instruction *inst)
1348
{
1349
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
1350
	struct tgsi_src_register *src0 = &inst->Src[0].Register;
1351
	struct tgsi_src_register *src1 = &inst->Src[1].Register;
1352
	struct tgsi_src_register *src2 = &inst->Src[2].Register;
1353
 
1354
	create_clamp(ctx, dst, src0, src1, src2);
1355
 
1356
	put_dst(ctx, inst, dst);
1357
}
1358
 
1359
/* ARL(x) = x, but mova from hrN.x to a0..
 *
 * Converts the selected src component to s16, scales it by 4 (TGSI
 * indirect indices are in vec4 units, shl by 2 -> component units),
 * and moves the result into the address register a0.
 */
static void
trans_arl(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *src = &inst->Src[0].Register;
	unsigned chan = src->SwizzleX;

	compile_assert(ctx, dst->File == TGSI_FILE_ADDRESS);

	/* NOTE: we allocate a temporary from a flat register
	 * namespace (ignoring half vs full).  It turns out
	 * not to really matter since registers get reassigned
	 * later in ir3_ra which (hopefully!) can deal a bit
	 * better with mixed half and full precision.
	 */
	tmp_src = get_internal_temp(ctx, &tmp_dst);

	/* cov.{u,f}{32,16}s16 Rtmp, Rsrc -- ARL takes a float src,
	 * UARL (the non-ARL case) an unsigned one:
	 */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = (t->tgsi_opc == TGSI_OPCODE_ARL) ?
			get_ftype(ctx) : get_utype(ctx);
	instr->cat1.dst_type = TYPE_S16;
	add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, src, chan);

	/* shl.b Rtmp, Rtmp, 2 */
	instr = instr_create(ctx, 2, OPC_SHL_B);
	add_dst_reg(ctx, instr, &tmp_dst, chan)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;

	/* mova a0, Rtmp */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = TYPE_S16;
	instr->cat1.dst_type = TYPE_S16;
	add_dst_reg(ctx, instr, dst, 0)->flags |= IR3_REG_HALF;
	add_src_reg(ctx, instr, tmp_src, chan)->flags |= IR3_REG_HALF;
}
1403
 
1404
/*
1405
 * texture fetch/sample instructions:
1406
 */
1407
 
1408
/* per-sample-instruction layout of the coordinate argument, derived
 * from the TGSI opcode + texture target by fill_tex_info():
 */
struct tex_info {
	int8_t order[4];            /* coord slot -> src component (-1 = unused) */
	int8_t args;                /* number of argument groups (1 or 2) */
	unsigned src_wrmask, flags; /* coord wrmask + IR3_INSTR_* flags */
};
1413
 
1414
/* static properties of each TGSI texture target: */
struct target_info {
	uint8_t dims;    /* number of coordinate dimensions */
	uint8_t cube;    /* cube-map target? */
	uint8_t array;   /* array target (extra layer-index coord)? */
	uint8_t shadow;  /* shadow target (extra compare-ref coord)? */
};
1420
 
1421
/* table indexed by TGSI_TEXTURE_*: { dims, cube, array, shadow } */
static const struct target_info tex_targets[] = {
	[TGSI_TEXTURE_1D]               = { 1, 0, 0, 0 },
	[TGSI_TEXTURE_2D]               = { 2, 0, 0, 0 },
	[TGSI_TEXTURE_3D]               = { 3, 0, 0, 0 },
	[TGSI_TEXTURE_CUBE]             = { 3, 1, 0, 0 },
	[TGSI_TEXTURE_RECT]             = { 2, 0, 0, 0 },
	[TGSI_TEXTURE_SHADOW1D]         = { 1, 0, 0, 1 },
	[TGSI_TEXTURE_SHADOW2D]         = { 2, 0, 0, 1 },
	[TGSI_TEXTURE_SHADOWRECT]       = { 2, 0, 0, 1 },
	[TGSI_TEXTURE_1D_ARRAY]         = { 1, 0, 1, 0 },
	[TGSI_TEXTURE_2D_ARRAY]         = { 2, 0, 1, 0 },
	[TGSI_TEXTURE_SHADOW1D_ARRAY]   = { 1, 0, 1, 1 },
	[TGSI_TEXTURE_SHADOW2D_ARRAY]   = { 2, 0, 1, 1 },
	[TGSI_TEXTURE_SHADOWCUBE]       = { 3, 1, 0, 1 },
	[TGSI_TEXTURE_2D_MSAA]          = { 2, 0, 0, 0 },
	[TGSI_TEXTURE_2D_ARRAY_MSAA]    = { 2, 0, 1, 0 },
	[TGSI_TEXTURE_CUBE_ARRAY]       = { 3, 1, 1, 0 },
	[TGSI_TEXTURE_SHADOWCUBE_ARRAY] = { 3, 1, 1, 1 },
};
1440
 
1441
/* Derive the tex_info (instruction flags, argument count, and coord
 * component ordering) for a TGSI sample instruction from its opcode
 * and texture target.
 */
static void
fill_tex_info(struct ir3_compile_context *ctx,
			  struct tgsi_full_instruction *inst,
			  struct tex_info *info)
{
	const struct target_info *tgt = &tex_targets[inst->Texture.Texture];

	if (tgt->dims == 3)
		info->flags |= IR3_INSTR_3D;
	if (tgt->array)
		info->flags |= IR3_INSTR_A;
	if (tgt->shadow)
		info->flags |= IR3_INSTR_S;

	/* opcodes with an explicit bias/lod take a second argument group;
	 * note: opcodes outside this list leave info->args at the caller's
	 * zero-initialized value.
	 */
	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_TXB:
	case TGSI_OPCODE_TXB2:
	case TGSI_OPCODE_TXL:
	case TGSI_OPCODE_TXF:
		info->args = 2;
		break;
	case TGSI_OPCODE_TXP:
		info->flags |= IR3_INSTR_P;
		/* fallthrough */
	case TGSI_OPCODE_TEX:
	case TGSI_OPCODE_TXD:
		info->args = 1;
		break;
	}

	/*
	 * lay out the first argument in the proper order:
	 *  - actual coordinates first
	 *  - shadow reference
	 *  - array index
	 *  - projection w
	 *
	 * bias/lod go into the second arg
	 */
	int arg, pos = 0;
	for (arg = 0; arg < tgt->dims; arg++)
		info->order[arg] = pos++;
	/* 1D targets get a dummy .y slot (filled in by get_tex_coord): */
	if (tgt->dims == 1)
		info->order[pos++] = -1;
	if (tgt->shadow)
		info->order[pos++] = MAX2(arg + tgt->array, 2);
	if (tgt->array)
		info->order[pos++] = arg++;
	if (info->flags & IR3_INSTR_P)
		info->order[pos++] = 3;

	info->src_wrmask = (1 << pos) - 1;

	/* mark the remaining slots unused: */
	for (; pos < 4; pos++)
		info->order[pos] = -1;

	assert(pos <= 4);
}
1499
 
1500
static bool check_swiz(struct tgsi_src_register *src, const int8_t order[4])
{
	/* Check whether the src swizzle already matches the required
	 * component order, relative to its first component.
	 */
	unsigned base = src_swiz(src, 0);
	unsigned i = 1;

	while ((i < 4) && (order[i] >= 0)) {
		if (src_swiz(src, i) != (base + order[i]))
			return false;
		i++;
	}

	return true;
}
1508
 
1509
static bool is_1d(unsigned tex)
1510
{
1511
	return tex_targets[tex].dims == 1;
1512
}
1513
 
1514
/* Shuffle the sample instruction's coordinate components into the
 * slot order required by the hw (per tinf->order[]), via movs into a
 * fresh internal temp, and return that temp as the coord src.
 */
static struct tgsi_src_register *
get_tex_coord(struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst,
		const struct tex_info *tinf)
{
	struct tgsi_src_register *coord = &inst->Src[0].Register;
	struct ir3_instruction *instr;
	unsigned tex = inst->Texture.Texture;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	type_t type_mov = get_ftype(ctx);
	unsigned j;

	/* need to move things around: */
	tmp_src = get_internal_temp(ctx, &tmp_dst);

	for (j = 0; j < 4; j++) {
		if (tinf->order[j] < 0)
			continue;
		instr = instr_create(ctx, 1, 0);  /* mov */
		instr->cat1.src_type = type_mov;
		instr->cat1.dst_type = type_mov;
		add_dst_reg(ctx, instr, &tmp_dst, j);
		add_src_reg(ctx, instr, coord,
				src_swiz(coord, tinf->order[j]));
	}

	/* fix up .y coord -- 1D targets still need a second component:
	 * integer 0 for TXF (texel fetch), float 0.5 otherwise:
	 */
	if (is_1d(tex)) {
		struct ir3_register *imm;
		instr = instr_create(ctx, 1, 0);  /* mov */
		instr->cat1.src_type = type_mov;
		instr->cat1.dst_type = type_mov;
		add_dst_reg(ctx, instr, &tmp_dst, 1);  /* .y */
		imm = ir3_reg_create(instr, 0, IR3_REG_IMMED);
		if (inst->Instruction.Opcode == TGSI_OPCODE_TXF)
			imm->iim_val = 0;
		else
			imm->fim_val = 0.5;
	}

	return tmp_src;
}
1557
 
1558
/* Translate TGSI sample instructions (TEX/TXP/TXB/TXB2/TXL/TXF/TXD)
 * into a cat5 sample instruction whose srcs are meta fan-in (collect)
 * instructions: the first gathers the coordinate (plus derivatives
 * for TXD), the second, when present, gathers offsets and lod/bias.
 */
static void
trans_samp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr, *collect;
	struct ir3_register *reg;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *orig, *coord, *samp, *offset, *dpdx, *dpdy;
	struct tgsi_src_register zero;
	const struct target_info *tgt = &tex_targets[inst->Texture.Texture];
	struct tex_info tinf;
	int i;

	memset(&tinf, 0, sizeof(tinf));
	fill_tex_info(ctx, inst, &tinf);
	coord = get_tex_coord(ctx, inst, &tinf);
	get_immediate(ctx, &zero, 0);

	/* locate the original coord ('orig') and sampler srcs, which sit
	 * at different positions depending on the opcode; TXD additionally
	 * has dpdx/dpdy srcs (which, like 'orig' below, must be moved out
	 * of const/relative registers before use):
	 */
	switch (inst->Instruction.Opcode) {
	case TGSI_OPCODE_TXB2:
		orig = &inst->Src[1].Register;
		samp = &inst->Src[2].Register;
		break;
	case TGSI_OPCODE_TXD:
		orig = &inst->Src[0].Register;
		dpdx = &inst->Src[1].Register;
		dpdy = &inst->Src[2].Register;
		samp = &inst->Src[3].Register;
		if (is_rel_or_const(dpdx))
				dpdx = get_unconst(ctx, dpdx);
		if (is_rel_or_const(dpdy))
				dpdy = get_unconst(ctx, dpdy);
		break;
	default:
		orig = &inst->Src[0].Register;
		samp = &inst->Src[1].Register;
		break;
	}
	if (tinf.args > 1 && is_rel_or_const(orig))
		orig = get_unconst(ctx, orig);

	/* scale up integer coords for TXF based on the LOD */
	if (inst->Instruction.Opcode == TGSI_OPCODE_TXF) {
		struct tgsi_dst_register tmp_dst;
		struct tgsi_src_register *tmp_src;
		type_t type_mov = get_utype(ctx);

		tmp_src = get_internal_temp(ctx, &tmp_dst);
		/* coord component i <<= lod (lod lives in orig's .w): */
		for (i = 0; i < tgt->dims; i++) {
			instr = instr_create(ctx, 2, OPC_SHL_B);
			add_dst_reg(ctx, instr, &tmp_dst, i);
			add_src_reg(ctx, instr, coord, src_swiz(coord, i));
			add_src_reg(ctx, instr, orig, orig->SwizzleW);
		}
		/* 1D: zero the dummy .y slot: */
		if (tgt->dims < 2) {
			instr = instr_create(ctx, 1, 0);
			instr->cat1.src_type = type_mov;
			instr->cat1.dst_type = type_mov;
			add_dst_reg(ctx, instr, &tmp_dst, i);
			add_src_reg(ctx, instr, &zero, zero.SwizzleX);
			i++;
		}
		/* the array index is copied through unscaled: */
		if (tgt->array) {
			instr = instr_create(ctx, 1, 0);
			instr->cat1.src_type = type_mov;
			instr->cat1.dst_type = type_mov;
			add_dst_reg(ctx, instr, &tmp_dst, i);
			add_src_reg(ctx, instr, coord, src_swiz(coord, i));
		}
		coord = tmp_src;
	}

	/* texel offsets come in as a separate descriptor; move them into
	 * a GPR temp ('offset' is only read below when NumOffsets != 0):
	 */
	if (inst->Texture.NumOffsets) {
		struct tgsi_texture_offset *tex_offset = &inst->TexOffsets[0];
		struct tgsi_src_register offset_src = {0};

		offset_src.File = tex_offset->File;
		offset_src.Index = tex_offset->Index;
		offset_src.SwizzleX = tex_offset->SwizzleX;
		offset_src.SwizzleY = tex_offset->SwizzleY;
		offset_src.SwizzleZ = tex_offset->SwizzleZ;
		offset = get_unconst(ctx, &offset_src);
		tinf.flags |= IR3_INSTR_O;
	}

	instr = instr_create(ctx, 5, t->opc);
	/* integer_s is a bitmask of samplers returning integer results: */
	if (ctx->integer_s & (1 << samp->Index))
		instr->cat5.type = get_utype(ctx);
	else
		instr->cat5.type = get_ftype(ctx);
	instr->cat5.samp = samp->Index;
	instr->cat5.tex  = samp->Index;
	instr->flags |= tinf.flags;

	add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);

	/* first src: fan-in collecting the coordinate components: */
	reg = ir3_reg_create(instr, 0, IR3_REG_SSA);

	collect = ir3_instr_create2(ctx->block, -1, OPC_META_FI, 12);
	ir3_reg_create(collect, 0, 0);
	for (i = 0; i < 4; i++) {
		if (tinf.src_wrmask & (1 << i))
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
					coord, src_swiz(coord, i));
		else if (tinf.src_wrmask & ~((1 << i) - 1))
			/* placeholder to keep later components aligned: */
			ir3_reg_create(collect, 0, 0);
	}

	/* Attach derivatives onto the end of the fan-in. Derivatives start after
	 * the 4th argument, so make sure that fi is padded up to 4 first.
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_TXD) {
		while (collect->regs_count < 5)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
					&zero, zero.SwizzleX);
		for (i = 0; i < tgt->dims; i++)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), dpdx, i);
		if (tgt->dims < 2)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
					&zero, zero.SwizzleX);
		for (i = 0; i < tgt->dims; i++)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA), dpdy, i);
		if (tgt->dims < 2)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
					&zero, zero.SwizzleX);
		tinf.src_wrmask |= ((1 << (2 * MAX2(tgt->dims, 2))) - 1) << 4;
	}

	reg->instr = collect;
	reg->wrmask = tinf.src_wrmask;

	/* The second argument contains the offsets, followed by the lod/bias
	 * argument. This is constructed more manually due to the dynamic nature.
	 */
	if (inst->Texture.NumOffsets == 0 && tinf.args == 1)
		return;

	reg = ir3_reg_create(instr, 0, IR3_REG_SSA);

	collect = ir3_instr_create2(ctx->block, -1, OPC_META_FI, 5);
	ir3_reg_create(collect, 0, 0);

	if (inst->Texture.NumOffsets) {
		for (i = 0; i < tgt->dims; i++)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
					offset, i);
		if (tgt->dims < 2)
			ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
					&zero, zero.SwizzleX);
	}
	/* TXB2 carries the bias in src1.x; other two-arg opcodes carry
	 * lod/bias in the coord src's .w:
	 */
	if (inst->Instruction.Opcode == TGSI_OPCODE_TXB2)
		ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
				orig, orig->SwizzleX);
	else if (tinf.args > 1)
		ssa_src(ctx, ir3_reg_create(collect, 0, IR3_REG_SSA),
				orig, orig->SwizzleW);

	reg->instr = collect;
	reg->wrmask = (1 << (collect->regs_count - 1)) - 1;
}
1719
 
1720
/* Translate TGSI_OPCODE_TXQ (texture size/levels query) into cat5
 * getsize (dimensions at the given mip level) and, for .w, getinfo
 * (number of mip levels).
 */
static void
trans_txq(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *level = &inst->Src[0].Register;
	struct tgsi_src_register *samp = &inst->Src[1].Register;
	const struct target_info *tgt = &tex_targets[inst->Texture.Texture];
	struct tex_info tinf;

	memset(&tinf, 0, sizeof(tinf));
	fill_tex_info(ctx, inst, &tinf);
	if (is_rel_or_const(level))
		level = get_unconst(ctx, level);

	instr = instr_create(ctx, 5, OPC_GETSIZE);
	instr->cat5.type = get_utype(ctx);
	instr->cat5.samp = samp->Index;
	instr->cat5.tex  = samp->Index;
	instr->flags |= tinf.flags;

	if (tgt->array && (dst->WriteMask & (1 << tgt->dims))) {
		/* Array size actually ends up in .w rather than .z. This doesn't
		 * matter for miplevel 0, but for higher mips the value in z is
		 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
		 * returned, which means that we have to add 1 to it for arrays.
		 */
		struct tgsi_dst_register tmp_dst;
		struct tgsi_src_register *tmp_src;
		type_t type_mov = get_utype(ctx);

		/* query into a temp, then redistribute components to dst: */
		tmp_src = get_internal_temp(ctx, &tmp_dst);
		add_dst_reg_wrmask(ctx, instr, &tmp_dst, 0,
						   dst->WriteMask | TGSI_WRITEMASK_W);
		add_src_reg_wrmask(ctx, instr, level, level->SwizzleX, 0x1);

		if (dst->WriteMask & TGSI_WRITEMASK_X) {
			instr = instr_create(ctx, 1, 0);
			instr->cat1.src_type = type_mov;
			instr->cat1.dst_type = type_mov;
			add_dst_reg(ctx, instr, dst, 0);
			add_src_reg(ctx, instr, tmp_src, src_swiz(tmp_src, 0));
		}

		if (tgt->dims == 2) {
			if (dst->WriteMask & TGSI_WRITEMASK_Y) {
				instr = instr_create(ctx, 1, 0);
				instr->cat1.src_type = type_mov;
				instr->cat1.dst_type = type_mov;
				add_dst_reg(ctx, instr, dst, 1);
				add_src_reg(ctx, instr, tmp_src, src_swiz(tmp_src, 1));
			}
		}

		/* array size: .w component of the query, plus 1: */
		instr = instr_create(ctx, 2, OPC_ADD_U);
		add_dst_reg(ctx, instr, dst, tgt->dims);
		add_src_reg(ctx, instr, tmp_src, src_swiz(tmp_src, 3));
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
	} else {
		/* non-array case: the query result maps straight to dst: */
		add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask);
		add_src_reg_wrmask(ctx, instr, level, level->SwizzleX, 0x1);
	}

	if (dst->WriteMask & TGSI_WRITEMASK_W) {
		/* The # of levels comes from getinfo.z. We need to add 1 to it, since
		 * the value in TEX_CONST_0 is zero-based.
		 */
		struct tgsi_dst_register tmp_dst;
		struct tgsi_src_register *tmp_src;

		tmp_src = get_internal_temp(ctx, &tmp_dst);
		instr = instr_create(ctx, 5, OPC_GETINFO);
		instr->cat5.type = get_utype(ctx);
		instr->cat5.samp = samp->Index;
		instr->cat5.tex  = samp->Index;
		add_dst_reg_wrmask(ctx, instr, &tmp_dst, 0, TGSI_WRITEMASK_Z);

		instr = instr_create(ctx, 2, OPC_ADD_U);
		add_dst_reg(ctx, instr, dst, 3);
		add_src_reg(ctx, instr, tmp_src, src_swiz(tmp_src, 2));
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
	}
}
1805
 
1806
/* DDX/DDY -- screen-space partial derivatives, emitted as two cat5
 * instructions covering .xy and .zw respectively.
 */
static void
trans_deriv(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
	struct tgsi_src_register *src = &inst->Src[0].Register;
	static const int8_t order[4] = {0, 1, 2, 3};

	/* the hw op needs an identity-swizzled src; copy through a temp
	 * if the swizzle is anything else:
	 */
	if (!check_swiz(src, order)) {
		struct tgsi_dst_register tmp_dst;
		struct tgsi_src_register *tmp_src;

		tmp_src = get_internal_temp(ctx, &tmp_dst);
		create_mov(ctx, &tmp_dst, src);

		src = tmp_src;
	}

	/* This might be a workaround for hw bug?  Blob compiler always
	 * seems to work two components at a time for dsy/dsx.  It does
	 * actually seem to work in some cases (or at least some piglit
	 * tests) for four components at a time.  But seems more reliable
	 * to split this into two instructions like the blob compiler
	 * does:
	 */

	instr = instr_create(ctx, 5, t->opc);
	instr->cat5.type = get_ftype(ctx);
	add_dst_reg_wrmask(ctx, instr, dst, 0, dst->WriteMask & 0x3);
	add_src_reg_wrmask(ctx, instr, src, 0, dst->WriteMask & 0x3);

	instr = instr_create(ctx, 5, t->opc);
	instr->cat5.type = get_ftype(ctx);
	add_dst_reg_wrmask(ctx, instr, dst, 2, (dst->WriteMask >> 2) & 0x3);
	add_src_reg_wrmask(ctx, instr, src, 2, (dst->WriteMask >> 2) & 0x3);
}
1845
 
1846
/*
 * Float set-on-condition / conditional-move opcodes.
 *
 * SEQ(a,b) = (a == b) ? 1.0 : 0.0
 *   cmps.f.eq tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SNE(a,b) = (a != b) ? 1.0 : 0.0
 *   cmps.f.ne tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SGE(a,b) = (a >= b) ? 1.0 : 0.0
 *   cmps.f.ge tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SLE(a,b) = (a <= b) ? 1.0 : 0.0
 *   cmps.f.le tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SGT(a,b) = (a > b)  ? 1.0 : 0.0
 *   cmps.f.gt tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * SLT(a,b) = (a < b)  ? 1.0 : 0.0
 *   cmps.f.lt tmp0, a, b
 *   cov.u16f16 dst, tmp0
 *
 * CMP(a,b,c) = (a < 0.0) ? b : c
 *   cmps.f.lt tmp0, a, {0.0}
 *   sel.b16 dst, b, tmp0, c
 */
static void
trans_cmp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_src_register constval0;
	/* final instruction for CMP() uses orig src1 and src2: */
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *a0, *a1, *a2;
	unsigned condition;

	tmp_src = get_internal_temp(ctx, &tmp_dst);

	a0 = &inst->Src[0].Register;  /* a */
	a1 = &inst->Src[1].Register;  /* b */

	/* map tgsi opcode to the hw compare condition: */
	switch (t->tgsi_opc) {
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_FSEQ:
		condition = IR3_COND_EQ;
		break;
	case TGSI_OPCODE_SNE:
	case TGSI_OPCODE_FSNE:
		condition = IR3_COND_NE;
		break;
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_FSGE:
		condition = IR3_COND_GE;
		break;
	case TGSI_OPCODE_SLT:
	case TGSI_OPCODE_FSLT:
		condition = IR3_COND_LT;
		break;
	case TGSI_OPCODE_SLE:
		condition = IR3_COND_LE;
		break;
	case TGSI_OPCODE_SGT:
		condition = IR3_COND_GT;
		break;
	case TGSI_OPCODE_CMP:
		/* CMP compares its first src against 0.0: */
		get_immediate(ctx, &constval0, fui(0.0));
		a0 = &inst->Src[0].Register;  /* a */
		a1 = &constval0;              /* {0.0} */
		condition = IR3_COND_LT;
		break;
	default:
		compile_assert(ctx, 0);
		return;
	}

	/* cat2 can't take two const srcs, move one through a temp: */
	if (is_const(a0) && is_const(a1))
		a0 = get_unconst(ctx, a0);

	/* cmps.f.cond tmp, a0, a1 */
	instr = instr_create(ctx, 2, OPC_CMPS_F);
	instr->cat2.condition = condition;
	vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);

	switch (t->tgsi_opc) {
	case TGSI_OPCODE_SEQ:
	case TGSI_OPCODE_SGE:
	case TGSI_OPCODE_SLE:
	case TGSI_OPCODE_SNE:
	case TGSI_OPCODE_SGT:
	case TGSI_OPCODE_SLT:
		/* legacy opcodes want a float 0.0/1.0 result: */
		/* cov.u16f16 dst, tmp0 */
		instr = instr_create(ctx, 1, 0);
		instr->cat1.src_type = get_utype(ctx);
		instr->cat1.dst_type = get_ftype(ctx);
		vectorize(ctx, instr, dst, 1, tmp_src, 0);
		break;
	case TGSI_OPCODE_FSEQ:
	case TGSI_OPCODE_FSGE:
	case TGSI_OPCODE_FSNE:
	case TGSI_OPCODE_FSLT:
		/* FS* opcodes want ~0/0, i.e. negate the 1/0 cmps result: */
		/* absneg.s dst, (neg)tmp0 */
		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
		vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_SNEG);
		break;
	case TGSI_OPCODE_CMP:
		a1 = &inst->Src[1].Register;
		a2 = &inst->Src[2].Register;
		/* sel.{b32,b16} dst, src2, tmp, src1 */
		instr = instr_create(ctx, 3, OPC_SEL_B32);
		vectorize(ctx, instr, dst, 3, a1, 0, tmp_src, 0, a2, 0);

		break;
	}

	put_dst(ctx, inst, dst);
}
1969
 
1970
/*
 * Integer set-on-condition opcodes (result is ~0 / 0):
 *
 * USNE(a,b) = (a != b) ? ~0 : 0
 *   cmps.u32.ne dst, a, b
 *
 * USEQ(a,b) = (a == b) ? ~0 : 0
 *   cmps.u32.eq dst, a, b
 *
 * ISGE(a,b) = (a >= b) ? ~0 : 0
 *   cmps.s32.ge dst, a, b
 *
 * USGE(a,b) = (a >= b) ? ~0 : 0
 *   cmps.u32.ge dst, a, b
 *
 * ISLT(a,b) = (a < b) ? ~0 : 0
 *   cmps.s32.lt dst, a, b
 *
 * USLT(a,b) = (a < b) ? ~0 : 0
 *   cmps.u32.lt dst, a, b
 *
 */
static void
trans_icmp(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_src_register *a0, *a1;
	unsigned condition;

	a0 = &inst->Src[0].Register;  /* a */
	a1 = &inst->Src[1].Register;  /* b */

	/* map tgsi opcode to the hw compare condition (t->opc
	 * supplies the signed vs unsigned cmps variant):
	 */
	switch (t->tgsi_opc) {
	case TGSI_OPCODE_USNE:
		condition = IR3_COND_NE;
		break;
	case TGSI_OPCODE_USEQ:
		condition = IR3_COND_EQ;
		break;
	case TGSI_OPCODE_ISGE:
	case TGSI_OPCODE_USGE:
		condition = IR3_COND_GE;
		break;
	case TGSI_OPCODE_ISLT:
	case TGSI_OPCODE_USLT:
		condition = IR3_COND_LT;
		break;

	default:
		compile_assert(ctx, 0);
		return;
	}

	/* cat2 can't take two const srcs, move one through a temp: */
	if (is_const(a0) && is_const(a1))
		a0 = get_unconst(ctx, a0);

	tmp_src = get_internal_temp(ctx, &tmp_dst);
	/* cmps.{u32,s32}.cond tmp, a0, a1 */
	instr = instr_create(ctx, 2, t->opc);
	instr->cat2.condition = condition;
	vectorize(ctx, instr, &tmp_dst, 2, a0, 0, a1, 0);

	/* negate the 1/0 cmps result to get the ~0/0 convention: */
	/* absneg.s dst, (neg)tmp */
	instr = instr_create(ctx, 2, OPC_ABSNEG_S);
	vectorize(ctx, instr, dst, 1, tmp_src, IR3_REG_SNEG);

	put_dst(ctx, inst, dst);
}
2041
 
2042
/*
2043
 * UCMP(a,b,c) = a ? b : c
2044
 *   sel.b16 dst, b, a, c
2045
 */
2046
static void
2047
trans_ucmp(const struct instr_translater *t,
2048
		struct ir3_compile_context *ctx,
2049
		struct tgsi_full_instruction *inst)
2050
{
2051
	struct ir3_instruction *instr;
2052
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
2053
	struct tgsi_src_register *a0, *a1, *a2;
2054
 
2055
	a0 = &inst->Src[0].Register;  /* a */
2056
	a1 = &inst->Src[1].Register;  /* b */
2057
	a2 = &inst->Src[2].Register;  /* c */
2058
 
2059
	if (is_rel_or_const(a0))
2060
		a0 = get_unconst(ctx, a0);
2061
 
2062
	/* sel.{b32,b16} dst, b, a, c */
2063
	instr = instr_create(ctx, 3, OPC_SEL_B32);
2064
	vectorize(ctx, instr, dst, 3, a1, 0, a0, 0, a2, 0);
2065
	put_dst(ctx, inst, dst);
2066
}
2067
 
2068
/*
 * ISSG(a) = a < 0 ? -1 : a > 0 ? 1 : 0
 *   cmps.s.lt tmp_neg, a, 0  # 1 if a is negative
 *   cmps.s.gt tmp_pos, a, 0  # 1 if a is positive
 *   sub.u dst, tmp_pos, tmp_neg
 */
static void
trans_issg(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *a = &inst->Src[0].Register;
	struct tgsi_dst_register neg_dst, pos_dst;
	struct tgsi_src_register *neg_src, *pos_src;

	neg_src = get_internal_temp(ctx, &neg_dst);
	pos_src = get_internal_temp(ctx, &pos_dst);

	/* cmps.s.lt neg, a, 0  (second src is immediate zero) */
	instr = instr_create(ctx, 2, OPC_CMPS_S);
	instr->cat2.condition = IR3_COND_LT;
	vectorize(ctx, instr, &neg_dst, 2, a, 0, 0, IR3_REG_IMMED);

	/* cmps.s.gt pos, a, 0 */
	instr = instr_create(ctx, 2, OPC_CMPS_S);
	instr->cat2.condition = IR3_COND_GT;
	vectorize(ctx, instr, &pos_dst, 2, a, 0, 0, IR3_REG_IMMED);

	/* sub.u dst, pos, neg  (0/1 minus 0/1 yields -1/0/1) */
	instr = instr_create(ctx, 2, OPC_SUB_U);
	vectorize(ctx, instr, dst, 2, pos_src, 0, neg_src, 0);

	put_dst(ctx, inst, dst);
}
2104
 
2105
 
2106
 
2107
/*
2108
 * Conditional / Flow control
2109
 */
2110
 
2111
static void
2112
push_branch(struct ir3_compile_context *ctx, bool inv,
2113
		struct ir3_instruction *instr, struct ir3_instruction *cond)
2114
{
2115
	unsigned int idx = ctx->branch_count++;
2116
	compile_assert(ctx, idx < ARRAY_SIZE(ctx->branch));
2117
	ctx->branch[idx].instr = instr;
2118
	ctx->branch[idx].inv = inv;
2119
	/* else side of branch has same condition: */
2120
	if (!inv)
2121
		ctx->branch[idx].cond = cond;
2122
}
2123
 
2124
static struct ir3_instruction *
2125
pop_branch(struct ir3_compile_context *ctx)
2126
{
2127
	unsigned int idx = --ctx->branch_count;
2128
	return ctx->branch[idx].instr;
2129
}
2130
 
2131
/* IF/UIF: compare the condition src against zero, emit a meta:flow
 * instruction, and push a new block for the if-side body.
 */
static void
trans_if(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr, *cond;
	struct tgsi_src_register *src = &inst->Src[0].Register;
	struct tgsi_dst_register tmp_dst;
	struct tgsi_src_register *tmp_src;
	struct tgsi_src_register constval;

	get_immediate(ctx, &constval, fui(0.0));
	tmp_src = get_internal_temp(ctx, &tmp_dst);

	if (is_const(src))
		src = get_unconst(ctx, src);

	/* cmps.{f,u}.ne tmp0, b, {0.0}  (t->opc picks float vs uint cmp) */
	instr = instr_create(ctx, 2, t->opc);
	add_dst_reg(ctx, instr, &tmp_dst, 0);
	add_src_reg(ctx, instr, src, src->SwizzleX);
	add_src_reg(ctx, instr, &constval, constval.SwizzleX);
	instr->cat2.condition = IR3_COND_NE;

	compile_assert(ctx, instr->regs[1]->flags & IR3_REG_SSA); /* because get_unconst() */
	cond = instr->regs[1]->instr;

	/* meta:flow tmp0 */
	instr = instr_create(ctx, -1, OPC_META_FLOW);
	ir3_reg_create(instr, 0, 0);  /* dummy dst */
	add_src_reg(ctx, instr, tmp_src, TGSI_SWIZZLE_X);

	push_branch(ctx, false, instr, cond);
	instr->flow.if_block = push_block(ctx);
}
2166
 
2167
/* ELSE: close the if-side block and open the else-side block on the
 * same meta:flow instruction (pushed back with inverted condition).
 */
static void
trans_else(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;

	pop_block(ctx);

	instr = pop_branch(ctx);

	/* must match the meta:flow pushed by trans_if(): */
	compile_assert(ctx, (instr->category == -1) &&
			(instr->opc == OPC_META_FLOW));

	push_branch(ctx, true, instr, NULL);
	instr->flow.else_block = push_block(ctx);
}
2184
 
2185
static struct ir3_instruction *
2186
find_temporary(struct ir3_block *block, unsigned n)
2187
{
2188
	if (block->parent && !block->temporaries[n])
2189
		return find_temporary(block->parent, n);
2190
	return block->temporaries[n];
2191
}
2192
 
2193
static struct ir3_instruction *
2194
find_output(struct ir3_block *block, unsigned n)
2195
{
2196
	if (block->parent && !block->outputs[n])
2197
		return find_output(block->parent, n);
2198
	return block->outputs[n];
2199
}
2200
 
2201
/* Create a meta:phi instruction selecting between 'a' (value from the
 * if-side) and 'b' (value from the else-side) based on 'cond' (the
 * meta:flow instruction for the branch).
 */
static struct ir3_instruction *
create_phi(struct ir3_compile_context *ctx, struct ir3_instruction *cond,
		struct ir3_instruction *a, struct ir3_instruction *b)
{
	struct ir3_instruction *phi;

	compile_assert(ctx, cond);

	/* Either side of the condition could be null..  which
	 * indicates a variable written on only one side of the
	 * branch.  Normally this should only be variables not
	 * used outside of that side of the branch.  So we could
	 * just 'return a ? a : b;' in that case.  But for better
	 * defined undefined behavior we just stick in imm{0.0}.
	 * In the common case of a value only used within the
	 * one side of the branch, the PHI instruction will not
	 * get scheduled
	 */
	if (!a)
		a = create_immed(ctx, 0.0);
	if (!b)
		b = create_immed(ctx, 0.0);

	phi = instr_create(ctx, -1, OPC_META_PHI);
	ir3_reg_create(phi, 0, 0);  /* dummy dst */
	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = cond;
	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = a;
	ir3_reg_create(phi, 0, IR3_REG_SSA)->instr = b;

	return phi;
}
2232
 
2233
/* ENDIF: close the current block, then stitch the if/else blocks back
 * into the enclosing block by emitting PHI instructions for every
 * temporary and shader output written on either side, and compact the
 * if/else blocks' output arrays down to just those PHI sources.
 */
static void
trans_endif(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct ir3_block *ifb, *elseb;
	struct ir3_instruction **ifout, **elseout;
	unsigned i, ifnout = 0, elsenout = 0;

	pop_block(ctx);

	instr = pop_branch(ctx);

	/* must match the meta:flow pushed by trans_if()/trans_else(): */
	compile_assert(ctx, (instr->category == -1) &&
			(instr->opc == OPC_META_FLOW));

	ifb = instr->flow.if_block;
	elseb = instr->flow.else_block;
	/* if there is no else block, the parent block is used for the
	 * branch-not-taken src of the PHI instructions:
	 */
	if (!elseb)
		elseb = ifb->parent;

	/* worst case sizes: */
	ifnout = ifb->ntemporaries + ifb->noutputs;
	elsenout = elseb->ntemporaries + elseb->noutputs;

	ifout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * ifnout);
	if (elseb != ifb->parent)
		elseout = ir3_alloc(ctx->ir, sizeof(ifb->outputs[0]) * elsenout);

	/* reset counters; they are re-used below as running indices
	 * into the freshly allocated output arrays:
	 */
	ifnout = 0;
	elsenout = 0;

	/* generate PHI instructions for any temporaries written: */
	for (i = 0; i < ifb->ntemporaries; i++) {
		struct ir3_instruction *a = ifb->temporaries[i];
		struct ir3_instruction *b = elseb->temporaries[i];

		/* if temporary written in if-block, or if else block
		 * is present and temporary written in else-block:
		 */
		if (a || ((elseb != ifb->parent) && b)) {
			struct ir3_instruction *phi;

			/* if only written on one side, find the closest
			 * enclosing update on other side:
			 */
			if (!a)
				a = find_temporary(ifb, i);
			if (!b)
				b = find_temporary(elseb, i);

			ifout[ifnout] = a;
			a = create_output(ifb, a, ifnout++);

			if (elseb != ifb->parent) {
				elseout[elsenout] = b;
				b = create_output(elseb, b, elsenout++);
			}

			phi = create_phi(ctx, instr, a, b);
			ctx->block->temporaries[i] = phi;
		}
	}

	compile_assert(ctx, ifb->noutputs == elseb->noutputs);

	/* .. and any outputs written: */
	for (i = 0; i < ifb->noutputs; i++) {
		struct ir3_instruction *a = ifb->outputs[i];
		struct ir3_instruction *b = elseb->outputs[i];

		/* if output written in if-block, or if else block
		 * is present and output written in else-block:
		 */
		if (a || ((elseb != ifb->parent) && b)) {
			struct ir3_instruction *phi;

			/* if only written on one side, find the closest
			 * enclosing update on other side:
			 */
			if (!a)
				a = find_output(ifb, i);
			if (!b)
				b = find_output(elseb, i);

			ifout[ifnout] = a;
			a = create_output(ifb, a, ifnout++);

			if (elseb != ifb->parent) {
				elseout[elsenout] = b;
				b = create_output(elseb, b, elsenout++);
			}

			phi = create_phi(ctx, instr, a, b);
			ctx->block->outputs[i] = phi;
		}
	}

	/* shrink the if/else blocks' output arrays to the actual count: */
	ifb->noutputs = ifnout;
	ifb->outputs = ifout;

	if (elseb != ifb->parent) {
		elseb->noutputs = elsenout;
		elseb->outputs = elseout;
	}

	// TODO maybe we want to compact block->inputs?
}
2345
 
2346
/*
 * Kill
 */

/* Unconditional TGSI KILL.  Since the hw kill is predicated, we derive
 * the predicate from the enclosing if/else condition (so the kill only
 * fires when this code path is actually taken), or from a constant 1.0
 * when there is no enclosing branch.
 */
static void
trans_kill(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr, *immed, *cond = NULL;
	bool inv = false;

	/* unconditional kill, use enclosing if condition: */
	if (ctx->branch_count > 0) {
		unsigned int idx = ctx->branch_count - 1;
		cond = ctx->branch[idx].cond;
		inv = ctx->branch[idx].inv;
	} else {
		cond = create_immed(ctx, 1.0);
	}

	compile_assert(ctx, cond);

	immed = create_immed(ctx, 0.0);

	/* cmps.f.ne p0.x, cond, {0.0} */
	instr = instr_create(ctx, 2, OPC_CMPS_F);
	instr->cat2.condition = IR3_COND_NE;
	ir3_reg_create(instr, regid(REG_P0, 0), 0);
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
	cond = instr;

	/* kill p0.x  (inverted if we're on the else side of the branch) */
	instr = instr_create(ctx, 0, OPC_KILL);
	instr->cat0.inv = inv;
	ir3_reg_create(instr, 0, 0);  /* dummy dst */
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;

	/* track kills so later passes can add scheduling deps: */
	ctx->kill[ctx->kill_count++] = instr;

	ctx->so->has_kill = true;
}
2389
 
2390
/*
 * Kill-If
 */

/* Conditional TGSI KILL_IF: kill if src.x is non-zero.  Unlike
 * trans_kill() the predicate comes directly from the instruction's
 * src, so 'inv' is always false here.
 */
static void
trans_killif(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_src_register *src = &inst->Src[0].Register;
	struct ir3_instruction *instr, *immed, *cond = NULL;
	bool inv = false;

	immed = create_immed(ctx, 0.0);

	/* cmps.f.ne p0.x, cond, {0.0} */
	instr = instr_create(ctx, 2, OPC_CMPS_F);
	instr->cat2.condition = IR3_COND_NE;
	ir3_reg_create(instr, regid(REG_P0, 0), 0);
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = immed;
	add_src_reg(ctx, instr, src, src->SwizzleX);

	cond = instr;

	/* kill p0.x */
	instr = instr_create(ctx, 0, OPC_KILL);
	instr->cat0.inv = inv;
	ir3_reg_create(instr, 0, 0);  /* dummy dst */
	ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = cond;

	/* track kills so later passes can add scheduling deps: */
	ctx->kill[ctx->kill_count++] = instr;

	ctx->so->has_kill = true;

}
2425
/*
2426
 * I2F / U2F / F2I / F2U
2427
 */
2428
 
2429
static void
2430
trans_cov(const struct instr_translater *t,
2431
		struct ir3_compile_context *ctx,
2432
		struct tgsi_full_instruction *inst)
2433
{
2434
	struct ir3_instruction *instr;
2435
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
2436
	struct tgsi_src_register *src = &inst->Src[0].Register;
2437
 
2438
	// cov.f32s32 dst, tmp0 /
2439
	instr = instr_create(ctx, 1, 0);
2440
	switch (t->tgsi_opc) {
2441
	case TGSI_OPCODE_U2F:
2442
		instr->cat1.src_type = TYPE_U32;
2443
		instr->cat1.dst_type = TYPE_F32;
2444
		break;
2445
	case TGSI_OPCODE_I2F:
2446
		instr->cat1.src_type = TYPE_S32;
2447
		instr->cat1.dst_type = TYPE_F32;
2448
		break;
2449
	case TGSI_OPCODE_F2U:
2450
		instr->cat1.src_type = TYPE_F32;
2451
		instr->cat1.dst_type = TYPE_U32;
2452
		break;
2453
	case TGSI_OPCODE_F2I:
2454
		instr->cat1.src_type = TYPE_F32;
2455
		instr->cat1.dst_type = TYPE_S32;
2456
		break;
2457
 
2458
	}
2459
	vectorize(ctx, instr, dst, 1, src, 0);
2460
	put_dst(ctx, inst, dst);
2461
}
2462
 
2463
/*
 * UMUL / UMAD
 *
 * There is no 32-bit multiply instruction, so splitting a and b into high and
 * low components, we get that
 *
 * dst = al * bl + ah * bl << 16 + al * bh << 16
 *
 *  mull.u tmp0, a, b (mul low, i.e. al * bl)
 *  madsh.m16 tmp1, a, b, tmp0 (mul-add shift high mix, i.e. ah * bl << 16)
 *  madsh.m16 dst, b, a, tmp1 (i.e. al * bh << 16)
 *
 * For UMAD, add in the extra argument after mull.u.
 */
static void
trans_umul(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *a = &inst->Src[0].Register;
	struct tgsi_src_register *b = &inst->Src[1].Register;

	struct tgsi_dst_register tmp0_dst, tmp1_dst;
	struct tgsi_src_register *tmp0_src, *tmp1_src;

	tmp0_src = get_internal_temp(ctx, &tmp0_dst);
	tmp1_src = get_internal_temp(ctx, &tmp1_dst);

	/* a and b are used repeatedly below, so neither may be
	 * const/relative:
	 */
	if (is_rel_or_const(a))
		a = get_unconst(ctx, a);
	if (is_rel_or_const(b))
		b = get_unconst(ctx, b);

	/* mull.u tmp0, a, b */
	instr = instr_create(ctx, 2, OPC_MULL_U);
	vectorize(ctx, instr, &tmp0_dst, 2, a, 0, b, 0);

	if (t->tgsi_opc == TGSI_OPCODE_UMAD) {
		struct tgsi_src_register *c = &inst->Src[2].Register;

		/* add.u tmp0, tmp0, c */
		instr = instr_create(ctx, 2, OPC_ADD_U);
		vectorize(ctx, instr, &tmp0_dst, 2, tmp0_src, 0, c, 0);
	}

	/* madsh.m16 tmp1, a, b, tmp0 */
	instr = instr_create(ctx, 3, OPC_MADSH_M16);
	vectorize(ctx, instr, &tmp1_dst, 3, a, 0, b, 0, tmp0_src, 0);

	/* madsh.m16 dst, b, a, tmp1 */
	instr = instr_create(ctx, 3, OPC_MADSH_M16);
	vectorize(ctx, instr, dst, 3, b, 0, a, 0, tmp1_src, 0);
	put_dst(ctx, inst, dst);
}
2519
 
2520
/*
 * IDIV / UDIV / MOD / UMOD
 *
 * See NV50LegalizeSSA::handleDIV for the origin of this implementation. For
 * MOD/UMOD, it becomes a - [IU]DIV(a, modulus) * modulus.
 *
 * The quotient is computed via a float reciprocal estimate, then
 * refined/corrected with integer arithmetic.  Temps used below:
 *   af/bf - float copies of |a| / |b|
 *   a/b   - integer |a| / |b|
 *   q     - quotient estimate, then corrected quotient
 *   r     - remainder / correction term
 */
static void
trans_idiv(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct ir3_instruction *instr;
	struct tgsi_dst_register *dst = get_dst(ctx, inst), *premod_dst = dst;
	struct tgsi_src_register *a = &inst->Src[0].Register;
	struct tgsi_src_register *b = &inst->Src[1].Register;

	struct tgsi_dst_register af_dst, bf_dst, q_dst, r_dst, a_dst, b_dst;
	struct tgsi_src_register *af_src, *bf_src, *q_src, *r_src, *a_src, *b_src;

	struct tgsi_src_register negative_2, thirty_one;
	type_t src_type;

	if (t->tgsi_opc == TGSI_OPCODE_IDIV || t->tgsi_opc == TGSI_OPCODE_MOD)
		src_type = get_stype(ctx);
	else
		src_type = get_utype(ctx);

	af_src = get_internal_temp(ctx, &af_dst);
	bf_src = get_internal_temp(ctx, &bf_dst);
	q_src = get_internal_temp(ctx, &q_dst);
	r_src = get_internal_temp(ctx, &r_dst);
	a_src = get_internal_temp(ctx, &a_dst);
	b_src = get_internal_temp(ctx, &b_dst);

	get_immediate(ctx, &negative_2, -2);
	get_immediate(ctx, &thirty_one, 31);

	/* for MOD/UMOD, route the division result into q so the final
	 * a - q*b computation below can use it:
	 */
	if (t->tgsi_opc == TGSI_OPCODE_MOD || t->tgsi_opc == TGSI_OPCODE_UMOD)
		premod_dst = &q_dst;

	/* cov.[us]32f32 af, numerator */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = src_type;
	instr->cat1.dst_type = get_ftype(ctx);
	vectorize(ctx, instr, &af_dst, 1, a, 0);

	/* cov.[us]32f32 bf, denominator */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = src_type;
	instr->cat1.dst_type = get_ftype(ctx);
	vectorize(ctx, instr, &bf_dst, 1, b, 0);

	/* Get the absolute values for IDIV */
	if (type_sint(src_type)) {
		/* absneg.f af, (abs)af */
		instr = instr_create(ctx, 2, OPC_ABSNEG_F);
		vectorize(ctx, instr, &af_dst, 1, af_src, IR3_REG_FABS);

		/* absneg.f bf, (abs)bf */
		instr = instr_create(ctx, 2, OPC_ABSNEG_F);
		vectorize(ctx, instr, &bf_dst, 1, bf_src, IR3_REG_FABS);

		/* absneg.s a, (abs)numerator */
		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
		vectorize(ctx, instr, &a_dst, 1, a, IR3_REG_SABS);

		/* absneg.s b, (abs)denominator */
		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
		vectorize(ctx, instr, &b_dst, 1, b, IR3_REG_SABS);
	} else {
		/* mov.u32u32 a, numerator */
		instr = instr_create(ctx, 1, 0);
		instr->cat1.src_type = src_type;
		instr->cat1.dst_type = src_type;
		vectorize(ctx, instr, &a_dst, 1, a, 0);

		/* mov.u32u32 b, denominator */
		instr = instr_create(ctx, 1, 0);
		instr->cat1.src_type = src_type;
		instr->cat1.dst_type = src_type;
		vectorize(ctx, instr, &b_dst, 1, b, 0);
	}

	/* rcp.f bf, bf */
	instr = instr_create(ctx, 4, OPC_RCP);
	vectorize(ctx, instr, &bf_dst, 1, bf_src, 0);

	/* That's right, subtract 2 as an integer from the float */
	/* add.u bf, bf, -2 */
	instr = instr_create(ctx, 2, OPC_ADD_U);
	vectorize(ctx, instr, &bf_dst, 2, bf_src, 0, &negative_2, 0);

	/* mul.f q, af, bf */
	instr = instr_create(ctx, 2, OPC_MUL_F);
	vectorize(ctx, instr, &q_dst, 2, af_src, 0, bf_src, 0);

	/* cov.f32[us]32 q, q */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = get_ftype(ctx);
	instr->cat1.dst_type = src_type;
	vectorize(ctx, instr, &q_dst, 1, q_src, 0);

	/* integer multiply q by b */
	/* mull.u r, q, b */
	instr = instr_create(ctx, 2, OPC_MULL_U);
	vectorize(ctx, instr, &r_dst, 2, q_src, 0, b_src, 0);

	/* madsh.m16 r, q, b, r */
	instr = instr_create(ctx, 3, OPC_MADSH_M16);
	vectorize(ctx, instr, &r_dst, 3, q_src, 0, b_src, 0, r_src, 0);

	/* madsh.m16, r, b, q, r */
	instr = instr_create(ctx, 3, OPC_MADSH_M16);
	vectorize(ctx, instr, &r_dst, 3, b_src, 0, q_src, 0, r_src, 0);

	/* sub.u r, a, r */
	instr = instr_create(ctx, 2, OPC_SUB_U);
	vectorize(ctx, instr, &r_dst, 2, a_src, 0, r_src, 0);

	/* cov.u32f32, r, r */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = get_utype(ctx);
	instr->cat1.dst_type = get_ftype(ctx);
	vectorize(ctx, instr, &r_dst, 1, r_src, 0);

	/* mul.f r, r, bf */
	instr = instr_create(ctx, 2, OPC_MUL_F);
	vectorize(ctx, instr, &r_dst, 2, r_src, 0, bf_src, 0);

	/* cov.f32u32 r, r */
	instr = instr_create(ctx, 1, 0);
	instr->cat1.src_type = get_ftype(ctx);
	instr->cat1.dst_type = get_utype(ctx);
	vectorize(ctx, instr, &r_dst, 1, r_src, 0);

	/* add.u q, q, r */
	instr = instr_create(ctx, 2, OPC_ADD_U);
	vectorize(ctx, instr, &q_dst, 2, q_src, 0, r_src, 0);

	/* mull.u r, q, b */
	instr = instr_create(ctx, 2, OPC_MULL_U);
	vectorize(ctx, instr, &r_dst, 2, q_src, 0, b_src, 0);

	/* madsh.m16 r, q, b, r */
	instr = instr_create(ctx, 3, OPC_MADSH_M16);
	vectorize(ctx, instr, &r_dst, 3, q_src, 0, b_src, 0, r_src, 0);

	/* madsh.m16 r, b, q, r */
	instr = instr_create(ctx, 3, OPC_MADSH_M16);
	vectorize(ctx, instr, &r_dst, 3, b_src, 0, q_src, 0, r_src, 0);

	/* sub.u r, a, r */
	instr = instr_create(ctx, 2, OPC_SUB_U);
	vectorize(ctx, instr, &r_dst, 2, a_src, 0, r_src, 0);

	/* final correction: bump q by one if the remainder is still >= b */
	/* cmps.u.ge r, r, b */
	instr = instr_create(ctx, 2, OPC_CMPS_U);
	instr->cat2.condition = IR3_COND_GE;
	vectorize(ctx, instr, &r_dst, 2, r_src, 0, b_src, 0);

	if (type_uint(src_type)) {
		/* add.u dst, q, r */
		instr = instr_create(ctx, 2, OPC_ADD_U);
		vectorize(ctx, instr, premod_dst, 2, q_src, 0, r_src, 0);
	} else {
		/* add.u q, q, r */
		instr = instr_create(ctx, 2, OPC_ADD_U);
		vectorize(ctx, instr, &q_dst, 2, q_src, 0, r_src, 0);

		/* negate result based on the original arguments */
		if (is_const(a) && is_const(b))
			a = get_unconst(ctx, a);

		/* xor.b r, numerator, denominator */
		instr = instr_create(ctx, 2, OPC_XOR_B);
		vectorize(ctx, instr, &r_dst, 2, a, 0, b, 0);

		/* shr.b r, r, 31 */
		instr = instr_create(ctx, 2, OPC_SHR_B);
		vectorize(ctx, instr, &r_dst, 2, r_src, 0, &thirty_one, 0);

		/* absneg.s b, (neg)q */
		instr = instr_create(ctx, 2, OPC_ABSNEG_S);
		vectorize(ctx, instr, &b_dst, 1, q_src, IR3_REG_SNEG);

		/* sel.b dst, b, r, q */
		instr = instr_create(ctx, 3, OPC_SEL_B32);
		vectorize(ctx, instr, premod_dst, 3, b_src, 0, r_src, 0, q_src, 0);
	}

	if (t->tgsi_opc == TGSI_OPCODE_MOD || t->tgsi_opc == TGSI_OPCODE_UMOD) {
		/* The division result will have ended up in q. */

		if (is_rel_or_const(b))
			b = get_unconst(ctx, b);

		/* 32-bit q*b via the usual mull.u + 2x madsh.m16 sequence: */
		/* mull.u r, q, b */
		instr = instr_create(ctx, 2, OPC_MULL_U);
		vectorize(ctx, instr, &r_dst, 2, q_src, 0, b, 0);

		/* madsh.m16 r, q, b, r */
		instr = instr_create(ctx, 3, OPC_MADSH_M16);
		vectorize(ctx, instr, &r_dst, 3, q_src, 0, b, 0, r_src, 0);

		/* madsh.m16 r, b, q, r */
		instr = instr_create(ctx, 3, OPC_MADSH_M16);
		vectorize(ctx, instr, &r_dst, 3, b, 0, q_src, 0, r_src, 0);

		/* sub.u dst, a, r */
		instr = instr_create(ctx, 2, OPC_SUB_U);
		vectorize(ctx, instr, dst, 2, a, 0, r_src, 0);
	}

	put_dst(ctx, inst, dst);
}
2735
 
2736
/*
2737
 * Handlers for TGSI instructions which do have 1:1 mapping to native
2738
 * instructions:
2739
 */
2740
 
2741
static void
2742
instr_cat0(const struct instr_translater *t,
2743
		struct ir3_compile_context *ctx,
2744
		struct tgsi_full_instruction *inst)
2745
{
2746
	instr_create(ctx, 0, t->opc);
2747
}
2748
 
2749
static void
2750
instr_cat1(const struct instr_translater *t,
2751
		struct ir3_compile_context *ctx,
2752
		struct tgsi_full_instruction *inst)
2753
{
2754
	struct tgsi_dst_register *dst = &inst->Dst[0].Register;
2755
	struct tgsi_src_register *src = &inst->Src[0].Register;
2756
 
2757
	/* NOTE: atomic start/end, rather than in create_mov() since
2758
	 * create_mov() is used already w/in atomic sequences (and
2759
	 * we aren't clever enough to deal with the nesting)
2760
	 */
2761
	instr_atomic_start(ctx);
2762
	create_mov(ctx, dst, src);
2763
	instr_atomic_end(ctx);
2764
}
2765
 
2766
/* category 2 (ALU) instructions.  Some tgsi opcodes are implemented
 * by applying abs/neg src modifiers to a shared native opcode (e.g.
 * SUB -> add.f with negated src1).
 */
static void
instr_cat2(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *src0 = &inst->Src[0].Register;
	struct tgsi_src_register *src1 = &inst->Src[1].Register;
	struct ir3_instruction *instr;
	unsigned src0_flags = 0, src1_flags = 0;

	/* fold the tgsi opcode's implicit abs/neg into src modifiers: */
	switch (t->tgsi_opc) {
	case TGSI_OPCODE_ABS:
		src0_flags = IR3_REG_FABS;
		break;
	case TGSI_OPCODE_IABS:
		src0_flags = IR3_REG_SABS;
		break;
	case TGSI_OPCODE_INEG:
		src0_flags = IR3_REG_SNEG;
		break;
	case TGSI_OPCODE_SUB:
		src1_flags = IR3_REG_FNEG;
		break;
	}

	switch (t->opc) {
	case OPC_ABSNEG_F:
	case OPC_ABSNEG_S:
	case OPC_CLZ_B:
	case OPC_CLZ_S:
	case OPC_SIGN_F:
	case OPC_FLOOR_F:
	case OPC_CEIL_F:
	case OPC_RNDNE_F:
	case OPC_RNDAZ_F:
	case OPC_TRUNC_F:
	case OPC_NOT_B:
	case OPC_BFREV_B:
	case OPC_SETRM:
	case OPC_CBITS_B:
		/* these only have one src reg */
		instr = instr_create(ctx, 2, t->opc);
		vectorize(ctx, instr, dst, 1, src0, src0_flags);
		break;
	default:
		/* cat2 can't take two const srcs, move one through a temp: */
		if (is_const(src0) && is_const(src1))
			src0 = get_unconst(ctx, src0);

		instr = instr_create(ctx, 2, t->opc);
		vectorize(ctx, instr, dst, 2, src0, src0_flags,
				src1, src1_flags);
		break;
	}

	put_dst(ctx, inst, dst);
}
2823
 
2824
/* category 3 (three-src ALU, e.g. mad) instructions: */
static void
instr_cat3(const struct instr_translater *t,
		struct ir3_compile_context *ctx,
		struct tgsi_full_instruction *inst)
{
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
	struct tgsi_src_register *src0 = &inst->Src[0].Register;
	struct tgsi_src_register *src1 = &inst->Src[1].Register;
	struct ir3_instruction *instr;

	/* in particular, can't handle const for src1 for cat3..
	 * for mad, we can swap first two src's if needed:
	 */
	if (is_rel_or_const(src1)) {
		if (is_mad(t->opc) && !is_rel_or_const(src0)) {
			/* mad is commutative in its first two srcs, swap: */
			struct tgsi_src_register *tmp;
			tmp = src0;
			src0 = src1;
			src1 = tmp;
		} else {
			/* otherwise move src1 through a temp: */
			src1 = get_unconst(ctx, src1);
		}
	}

	instr = instr_create(ctx, 3, t->opc);
	vectorize(ctx, instr, dst, 3, src0, 0, src1, 0,
			&inst->Src[2].Register, 0);
	put_dst(ctx, inst, dst);
}
2853
 
2854
static void
2855
instr_cat4(const struct instr_translater *t,
2856
		struct ir3_compile_context *ctx,
2857
		struct tgsi_full_instruction *inst)
2858
{
2859
	struct tgsi_dst_register *dst = get_dst(ctx, inst);
2860
	struct tgsi_src_register *src = &inst->Src[0].Register;
2861
	struct ir3_instruction *instr;
2862
	unsigned i;
2863
 
2864
	/* seems like blob compiler avoids const as src.. */
2865
	if (is_const(src))
2866
		src = get_unconst(ctx, src);
2867
 
2868
	/* we need to replicate into each component: */
2869
	for (i = 0; i < 4; i++) {
2870
		if (dst->WriteMask & (1 << i)) {
2871
			instr = instr_create(ctx, 4, t->opc);
2872
			add_dst_reg(ctx, instr, dst, i);
2873
			add_src_reg(ctx, instr, src, src->SwizzleX);
2874
		}
2875
	}
2876
 
2877
	put_dst(ctx, inst, dst);
2878
}
2879
 
2880
static const struct instr_translater translaters[TGSI_OPCODE_LAST] = {
2881
#define INSTR(n, f, ...) \
2882
	[TGSI_OPCODE_ ## n] = { .fxn = (f), .tgsi_opc = TGSI_OPCODE_ ## n, ##__VA_ARGS__ }
2883
 
2884
	INSTR(MOV,          instr_cat1),
2885
	INSTR(RCP,          instr_cat4, .opc = OPC_RCP),
2886
	INSTR(RSQ,          instr_cat4, .opc = OPC_RSQ),
2887
	INSTR(SQRT,         instr_cat4, .opc = OPC_SQRT),
2888
	INSTR(MUL,          instr_cat2, .opc = OPC_MUL_F),
2889
	INSTR(ADD,          instr_cat2, .opc = OPC_ADD_F),
2890
	INSTR(SUB,          instr_cat2, .opc = OPC_ADD_F),
2891
	INSTR(MIN,          instr_cat2, .opc = OPC_MIN_F),
2892
	INSTR(MAX,          instr_cat2, .opc = OPC_MAX_F),
2893
	INSTR(UADD,         instr_cat2, .opc = OPC_ADD_U),
2894
	INSTR(IMIN,         instr_cat2, .opc = OPC_MIN_S),
2895
	INSTR(UMIN,         instr_cat2, .opc = OPC_MIN_U),
2896
	INSTR(IMAX,         instr_cat2, .opc = OPC_MAX_S),
2897
	INSTR(UMAX,         instr_cat2, .opc = OPC_MAX_U),
2898
	INSTR(AND,          instr_cat2, .opc = OPC_AND_B),
2899
	INSTR(OR,           instr_cat2, .opc = OPC_OR_B),
2900
	INSTR(NOT,          instr_cat2, .opc = OPC_NOT_B),
2901
	INSTR(XOR,          instr_cat2, .opc = OPC_XOR_B),
2902
	INSTR(UMUL,         trans_umul),
2903
	INSTR(UMAD,         trans_umul),
2904
	INSTR(UDIV,         trans_idiv),
2905
	INSTR(IDIV,         trans_idiv),
2906
	INSTR(MOD,          trans_idiv),
2907
	INSTR(UMOD,         trans_idiv),
2908
	INSTR(SHL,          instr_cat2, .opc = OPC_SHL_B),
2909
	INSTR(USHR,         instr_cat2, .opc = OPC_SHR_B),
2910
	INSTR(ISHR,         instr_cat2, .opc = OPC_ASHR_B),
2911
	INSTR(IABS,         instr_cat2, .opc = OPC_ABSNEG_S),
2912
	INSTR(INEG,         instr_cat2, .opc = OPC_ABSNEG_S),
2913
	INSTR(AND,          instr_cat2, .opc = OPC_AND_B),
2914
	INSTR(MAD,          instr_cat3, .opc = OPC_MAD_F32, .hopc = OPC_MAD_F16),
2915
	INSTR(TRUNC,        instr_cat2, .opc = OPC_TRUNC_F),
2916
	INSTR(CLAMP,        trans_clamp),
2917
	INSTR(FLR,          instr_cat2, .opc = OPC_FLOOR_F),
2918
	INSTR(ROUND,        instr_cat2, .opc = OPC_RNDNE_F),
2919
	INSTR(SSG,          instr_cat2, .opc = OPC_SIGN_F),
2920
	INSTR(CEIL,         instr_cat2, .opc = OPC_CEIL_F),
2921
	INSTR(ARL,          trans_arl),
2922
	INSTR(UARL,         trans_arl),
2923
	INSTR(EX2,          instr_cat4, .opc = OPC_EXP2),
2924
	INSTR(LG2,          instr_cat4, .opc = OPC_LOG2),
2925
	INSTR(ABS,          instr_cat2, .opc = OPC_ABSNEG_F),
2926
	INSTR(COS,          instr_cat4, .opc = OPC_COS),
2927
	INSTR(SIN,          instr_cat4, .opc = OPC_SIN),
2928
	INSTR(TEX,          trans_samp, .opc = OPC_SAM),
2929
	INSTR(TXP,          trans_samp, .opc = OPC_SAM),
2930
	INSTR(TXB,          trans_samp, .opc = OPC_SAMB),
2931
	INSTR(TXB2,         trans_samp, .opc = OPC_SAMB),
2932
	INSTR(TXL,          trans_samp, .opc = OPC_SAML),
2933
	INSTR(TXD,          trans_samp, .opc = OPC_SAMGQ),
2934
	INSTR(TXF,          trans_samp, .opc = OPC_ISAML),
2935
	INSTR(TXQ,          trans_txq),
2936
	INSTR(DDX,          trans_deriv, .opc = OPC_DSX),
2937
	INSTR(DDY,          trans_deriv, .opc = OPC_DSY),
2938
	INSTR(SGT,          trans_cmp),
2939
	INSTR(SLT,          trans_cmp),
2940
	INSTR(FSLT,         trans_cmp),
2941
	INSTR(SGE,          trans_cmp),
2942
	INSTR(FSGE,         trans_cmp),
2943
	INSTR(SLE,          trans_cmp),
2944
	INSTR(SNE,          trans_cmp),
2945
	INSTR(FSNE,         trans_cmp),
2946
	INSTR(SEQ,          trans_cmp),
2947
	INSTR(FSEQ,         trans_cmp),
2948
	INSTR(CMP,          trans_cmp),
2949
	INSTR(USNE,         trans_icmp, .opc = OPC_CMPS_U),
2950
	INSTR(USEQ,         trans_icmp, .opc = OPC_CMPS_U),
2951
	INSTR(ISGE,         trans_icmp, .opc = OPC_CMPS_S),
2952
	INSTR(USGE,         trans_icmp, .opc = OPC_CMPS_U),
2953
	INSTR(ISLT,         trans_icmp, .opc = OPC_CMPS_S),
2954
	INSTR(USLT,         trans_icmp, .opc = OPC_CMPS_U),
2955
	INSTR(UCMP,         trans_ucmp),
2956
	INSTR(ISSG,         trans_issg),
2957
	INSTR(IF,           trans_if,   .opc = OPC_CMPS_F),
2958
	INSTR(UIF,          trans_if,   .opc = OPC_CMPS_U),
2959
	INSTR(ELSE,         trans_else),
2960
	INSTR(ENDIF,        trans_endif),
2961
	INSTR(END,          instr_cat0, .opc = OPC_END),
2962
	INSTR(KILL,         trans_kill, .opc = OPC_KILL),
2963
	INSTR(KILL_IF,      trans_killif, .opc = OPC_KILL),
2964
	INSTR(I2F,          trans_cov),
2965
	INSTR(U2F,          trans_cov),
2966
	INSTR(F2I,          trans_cov),
2967
	INSTR(F2U,          trans_cov),
2968
};
2969
 
2970
static ir3_semantic
2971
decl_semantic(const struct tgsi_declaration_semantic *sem)
2972
{
2973
	return ir3_semantic_name(sem->Name, sem->Index);
2974
}
2975
 
2976
/* Emit the instruction that fetches one scalar component of a fragment
 * shader varying input:
 *
 *  - normally a "bary.f", interpolating the varying at slot 'inloc'
 *    using the barycentric/position base held in ctx->frag_pos, or
 *  - an "ldlv.u32" local-memory load when use_ldlv is set (used for
 *    flat-shaded inputs on the flat-bypass path).
 *
 * 'regid' is only a placeholder destination; real register numbers are
 * assigned later by RA.  Returns the created instruction, which acts
 * as the SSA value for this input component.
 * NOTE(review): parameter 'j' (component index) is currently unused
 * here — kept for signature symmetry with the other decl_in_* helpers.
 */
static struct ir3_instruction *
decl_in_frag_bary(struct ir3_compile_context *ctx, unsigned regid,
		unsigned j, unsigned inloc, bool use_ldlv)
{
	struct ir3_instruction *instr;
	struct ir3_register *src;

	if (use_ldlv) {
		/* ldlv.u32 dst, l[#inloc], 1 */
		instr = instr_create(ctx, 6, OPC_LDLV);
		instr->cat6.type = TYPE_U32;
		instr->cat6.iim_val = 1;
		ir3_reg_create(instr, regid, 0);   /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc;
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;

		return instr;
	}

	/* bary.f dst, #inloc, r0.x */
	instr = instr_create(ctx, 2, OPC_BARY_F);
	ir3_reg_create(instr, regid, 0);   /* dummy dst */
	ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = inloc;
	src = ir3_reg_create(instr, 0, IR3_REG_SSA);
	src->wrmask = 0x3;   /* consumes both components of frag_pos */
	src->instr = ctx->frag_pos;

	return instr;
}
3005
 
3006
/* TGSI_SEMANTIC_POSITION
3007
 * """"""""""""""""""""""
3008
 *
3009
 * For fragment shaders, TGSI_SEMANTIC_POSITION is used to indicate that
3010
 * fragment shader input contains the fragment's window position.  The X
3011
 * component starts at zero and always increases from left to right.
3012
 * The Y component starts at zero and always increases but Y=0 may either
3013
 * indicate the top of the window or the bottom depending on the fragment
3014
 * coordinate origin convention (see TGSI_PROPERTY_FS_COORD_ORIGIN).
3015
 * The Z coordinate ranges from 0 to 1 to represent depth from the front
3016
 *  to the back of the Z buffer.  The W component contains the reciprocal
3017
 * of the interpolated vertex position W component.
3018
 */
3019
/* Emit the instruction chain producing one channel (j: 0..3 = x..w)
 * of the fragment window-position (TGSI_SEMANTIC_POSITION) input.
 *
 * The raw hw input is recorded in ctx->frag_coord[j] so that
 * fixup_frag_inputs() can later wire it to an actual input register.
 * For .x/.y the raw value is an unsigned fixed-point coordinate and
 * must be converted (subtract 8, shift right 4, then int->float);
 * .z/.w are usable directly.  Returns the SSA value for the channel.
 */
static struct ir3_instruction *
decl_in_frag_coord(struct ir3_compile_context *ctx, unsigned regid,
		unsigned j)
{
	struct ir3_instruction *instr, *src;

	/* each channel's raw input may only be declared once: */
	compile_assert(ctx, !ctx->frag_coord[j]);

	ctx->frag_coord[j] = create_input(ctx->block, NULL, 0);


	switch (j) {
	case 0: /* .x */
	case 1: /* .y */
		/* for frag_coord, we get unsigned values.. we need
		 * to subtract (integer) 8 and divide by 16 (right-
		 * shift by 4) then convert to float:
		 */

		/* add.s tmp, src, -8 */
		instr = instr_create(ctx, 2, OPC_ADD_S);
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_coord[j];
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = -8;
		src = instr;

		/* shr.b tmp, tmp, 4 */
		instr = instr_create(ctx, 2, OPC_SHR_B);
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 4;
		src = instr;

		/* mov.u32f32 dst, tmp */
		instr = instr_create(ctx, 1, 0);
		instr->cat1.src_type = TYPE_U32;
		instr->cat1.dst_type = TYPE_F32;
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;

		break;
	case 2: /* .z */
	case 3: /* .w */
		/* seems that we can use these as-is: */
		instr = ctx->frag_coord[j];
		break;
	default:
		compile_error(ctx, "invalid channel\n");
		instr = create_immed(ctx, 0.0);
		break;
	}

	return instr;
}
3073
 
3074
/* TGSI_SEMANTIC_FACE
3075
 * """"""""""""""""""
3076
 *
3077
 * This label applies to fragment shader inputs only and indicates that
3078
 * the register contains front/back-face information of the form (F, 0,
3079
 * 0, 1).  The first component will be positive when the fragment belongs
3080
 * to a front-facing polygon, and negative when the fragment belongs to a
3081
 * back-facing polygon.
3082
 */
3083
/* Emit the instruction(s) producing one channel (j: 0..3 = x..w) of
 * the front/back-facing (TGSI_SEMANTIC_FACE) fragment input.
 *
 * Only .x carries real data; the raw hw input (recorded in
 * ctx->frag_face for fixup_frag_inputs()) is -1 or 0 and is remapped
 * to -1.0 / 1.0 float.  .y/.z are constant 0.0 and .w is constant 1.0
 * per the TGSI (F, 0, 0, 1) convention.  Returns the SSA value.
 */
static struct ir3_instruction *
decl_in_frag_face(struct ir3_compile_context *ctx, unsigned regid,
		unsigned j)
{
	struct ir3_instruction *instr, *src;

	switch (j) {
	case 0: /* .x */
		/* the face input may only be declared once: */
		compile_assert(ctx, !ctx->frag_face);

		ctx->frag_face = create_input(ctx->block, NULL, 0);

		/* for faceness, we always get -1 or 0 (int).. but TGSI expects
		 * positive vs negative float.. and piglit further seems to
		 * expect -1.0 or 1.0:
		 *
		 *    mul.s tmp, hr0.x, 2
		 *    add.s tmp, tmp, 1
		 *    mov.s16f32, dst, tmp
		 *
		 */

		/* -1/0 -> -2/0 */
		instr = instr_create(ctx, 2, OPC_MUL_S);
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = ctx->frag_face;
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 2;
		src = instr;

		/* -2/0 -> -1/1 */
		instr = instr_create(ctx, 2, OPC_ADD_S);
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;
		ir3_reg_create(instr, 0, IR3_REG_IMMED)->iim_val = 1;
		src = instr;

		/* int -> float */
		instr = instr_create(ctx, 1, 0); /* mov */
		instr->cat1.src_type = TYPE_S32;
		instr->cat1.dst_type = TYPE_F32;
		ir3_reg_create(instr, regid, 0);    /* dummy dst */
		ir3_reg_create(instr, 0, IR3_REG_SSA)->instr = src;

		break;
	case 1: /* .y */
	case 2: /* .z */
		instr = create_immed(ctx, 0.0);
		break;
	case 3: /* .w */
		instr = create_immed(ctx, 1.0);
		break;
	default:
		compile_error(ctx, "invalid channel\n");
		instr = create_immed(ctx, 0.0);
		break;
	}

	return instr;
}
3139
 
3140
/* Process a TGSI INPUT declaration: record each declared register in
 * the shader variant's inputs[] and create the instruction(s) that
 * produce each channel's value in ctx->block->inputs[].
 *
 * Vertex shaders get plain hw inputs.  Fragment shaders get bary.f
 * (or ldlv) fetches per varying channel, except POSITION and FACE
 * which have dedicated handling.  Also advances ctx->next_inloc /
 * so->total_in for inputs that occupy varying slots.
 */
static void
decl_in(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
{
	struct ir3_shader_variant *so = ctx->so;
	unsigned name = decl->Semantic.Name;
	unsigned i;

	/* I don't think we should get frag shader input without
	 * semantic info?  Otherwise how do inputs get linked to
	 * vert outputs?
	 */
	compile_assert(ctx, (ctx->type == TGSI_PROCESSOR_VERTEX) ||
			decl->Declaration.Semantic);

	for (i = decl->Range.First; i <= decl->Range.Last; i++) {
		unsigned n = so->inputs_count++;
		unsigned r = regid(i, 0);
		unsigned ncomp, j;

		/* we'll figure out the actual components used after scheduling */
		ncomp = 4;

		DBG("decl in -> r%d", i);

		compile_assert(ctx, n < ARRAY_SIZE(so->inputs));

		so->inputs[n].semantic = decl_semantic(&decl->Semantic);
		so->inputs[n].compmask = (1 << ncomp) - 1;
		so->inputs[n].regid = r;
		so->inputs[n].inloc = ctx->next_inloc;
		so->inputs[n].interpolate = decl->Interp.Interpolate;

		for (j = 0; j < ncomp; j++) {
			struct ir3_instruction *instr = NULL;

			if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
				/* for fragment shaders, POSITION and FACE are handled
				 * specially, not using normal varying / bary.f
				 */
				if (name == TGSI_SEMANTIC_POSITION) {
					so->inputs[n].bary = false;
					so->frag_coord = true;
					instr = decl_in_frag_coord(ctx, r + j, j);
				} else if (name == TGSI_SEMANTIC_FACE) {
					so->inputs[n].bary = false;
					so->frag_face = true;
					instr = decl_in_frag_face(ctx, r + j, j);
				} else {
					bool use_ldlv = false;

					/* if no interpolation given, pick based on
					 * semantic:
					 */
					if (!decl->Declaration.Interpolate) {
						switch (decl->Semantic.Name) {
						case TGSI_SEMANTIC_COLOR:
							so->inputs[n].interpolate =
									TGSI_INTERPOLATE_COLOR;
							break;
						default:
							so->inputs[n].interpolate =
									TGSI_INTERPOLATE_LINEAR;
						}
					}

					/* flat-shaded varyings bypass interpolation and
					 * are fetched with ldlv (COLOR only counts as flat
					 * when the rasterflat key is set):
					 */
					if (ctx->flat_bypass) {
						switch (so->inputs[n].interpolate) {
						case TGSI_INTERPOLATE_COLOR:
							if (!ctx->so->key.rasterflat)
								break;
							/* fallthrough */
						case TGSI_INTERPOLATE_CONSTANT:
							use_ldlv = true;
							break;
						}
					}

					so->inputs[n].bary = true;

					instr = decl_in_frag_bary(ctx, r + j, j,
							so->inputs[n].inloc + j - 8, use_ldlv);
				}
			} else {
				instr = create_input(ctx->block, NULL, (i * 4) + j);
			}

			ctx->block->inputs[(i * 4) + j] = instr;
		}

		/* only bary'd FS inputs and all VS inputs consume inloc /
		 * total_in slots:
		 */
		if (so->inputs[n].bary || (ctx->type == TGSI_PROCESSOR_VERTEX)) {
			ctx->next_inloc += ncomp;
			so->total_in += ncomp;
		}
	}
}
3235
 
3236
/* Process a TGSI SYSTEM_VALUE declaration (vertex-id, base-vertex,
 * instance-id): record it as a scalar shader input and create the
 * instruction producing its value.  BASEVERTEX comes from a driver
 * param const register rather than a hw input.
 */
static void
decl_sv(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
{
	struct ir3_shader_variant *so = ctx->so;
	unsigned r = regid(so->inputs_count, 0);
	unsigned n = so->inputs_count++;

	DBG("decl sv -> r%d", n);

	compile_assert(ctx, n < ARRAY_SIZE(so->inputs));
	compile_assert(ctx, decl->Range.First < ARRAY_SIZE(ctx->sysval_semantics));

	ctx->sysval_semantics[decl->Range.First] = decl->Semantic.Name;
	so->inputs[n].semantic = decl_semantic(&decl->Semantic);
	so->inputs[n].compmask = 1;      /* sysvals are scalar (.x only) */
	so->inputs[n].regid = r;
	so->inputs[n].inloc = ctx->next_inloc;
	so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT;

	struct ir3_instruction *instr = NULL;

	switch (decl->Semantic.Name) {
	case TGSI_SEMANTIC_VERTEXID_NOBASE:
		ctx->vertex_id = instr = create_input(ctx->block, NULL, r);
		break;
	case TGSI_SEMANTIC_BASEVERTEX:
		/* mov from driver-params const reg: */
		ctx->basevertex = instr = instr_create(ctx, 1, 0);
		instr->cat1.src_type = get_stype(ctx);
		instr->cat1.dst_type = get_stype(ctx);
		ir3_reg_create(instr, 0, 0);
		ir3_reg_create(instr, regid(so->first_driver_param + 4, 0),
					   IR3_REG_CONST);
		break;
	case TGSI_SEMANTIC_INSTANCEID:
		ctx->instance_id = instr = create_input(ctx->block, NULL, r);
		break;
	default:
		compile_error(ctx, "Unknown semantic: %s\n",
					  tgsi_semantic_names[decl->Semantic.Name]);
	}

	ctx->block->inputs[r] = instr;
	ctx->next_inloc++;
	so->total_in++;
}
3281
 
3282
/* Process a TGSI OUTPUT declaration: record the output's semantic and
 * register in the shader variant, set writes_pos/writes_psize flags,
 * and pre-seed every channel of block->outputs[] with an immediate
 * 0.0 so nothing is left undefined if the shader never writes it.
 * Unhandled semantics are a compile error.
 */
static void
decl_out(struct ir3_compile_context *ctx, struct tgsi_full_declaration *decl)
{
	struct ir3_shader_variant *so = ctx->so;
	unsigned comp = 0;
	unsigned name = decl->Semantic.Name;
	unsigned i;

	compile_assert(ctx, decl->Declaration.Semantic);

	DBG("decl out[%d] -> r%d", name, decl->Range.First);

	if (ctx->type == TGSI_PROCESSOR_VERTEX) {
		switch (name) {
		case TGSI_SEMANTIC_POSITION:
			so->writes_pos = true;
			break;
		case TGSI_SEMANTIC_PSIZE:
			so->writes_psize = true;
			break;
		case TGSI_SEMANTIC_COLOR:
		case TGSI_SEMANTIC_BCOLOR:
		case TGSI_SEMANTIC_GENERIC:
		case TGSI_SEMANTIC_FOG:
		case TGSI_SEMANTIC_TEXCOORD:
			break;
		default:
			compile_error(ctx, "unknown VS semantic name: %s\n",
					tgsi_semantic_names[name]);
		}
	} else {
		switch (name) {
		case TGSI_SEMANTIC_POSITION:
			comp = 2;  /* tgsi will write to .z component */
			so->writes_pos = true;
			break;
		case TGSI_SEMANTIC_COLOR:
			break;
		default:
			compile_error(ctx, "unknown FS semantic name: %s\n",
					tgsi_semantic_names[name]);
		}
	}

	for (i = decl->Range.First; i <= decl->Range.Last; i++) {
		unsigned n = so->outputs_count++;
		unsigned ncomp, j;

		ncomp = 4;

		compile_assert(ctx, n < ARRAY_SIZE(so->outputs));

		so->outputs[n].semantic = decl_semantic(&decl->Semantic);
		so->outputs[n].regid = regid(i, comp);

		/* avoid undefined outputs, stick a dummy mov from imm{0.0},
		 * which if the output is actually assigned will be over-
		 * written
		 */
		for (j = 0; j < ncomp; j++)
			ctx->block->outputs[(i * 4) + j] = create_immed(ctx, 0.0);
	}
}
3345
 
3346
/* from TGSI perspective, we actually have inputs.  But most of the "inputs"
3347
 * for a fragment shader are just bary.f instructions.  The *actual* inputs
3348
 * from the hw perspective are the frag_pos and optionally frag_coord and
3349
 * frag_face.
3350
 */
3351
/* Rebuild block->inputs for a fragment shader so it holds the *hw*
 * inputs (optional frag_face, optional frag_coord, then the two
 * frag_pos components) instead of the TGSI-level varyings, assigning
 * fixed register numbers to each.  The caller keeps the old TGSI-view
 * inputs array around separately (see ir3_compile_shader()).
 */
static void
fixup_frag_inputs(struct ir3_compile_context *ctx)
{
	struct ir3_shader_variant *so = ctx->so;
	struct ir3_block *block = ctx->block;
	struct ir3_instruction **inputs;
	struct ir3_instruction *instr;
	int n, regid = 0;

	block->ninputs = 0;

	/* size the new input array; each present group is padded to vec4: */
	n  = 4;  /* always have frag_pos */
	n += COND(so->frag_face, 4);
	n += COND(so->frag_coord, 4);

	inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *)));

	if (so->frag_face) {
		/* this ultimately gets assigned to hr0.x so doesn't conflict
		 * with frag_coord/frag_pos..
		 */
		inputs[block->ninputs++] = ctx->frag_face;
		ctx->frag_face->regs[0]->num = 0;

		/* remaining channels not used, but let's avoid confusing
		 * other parts that expect inputs to come in groups of vec4
		 */
		inputs[block->ninputs++] = NULL;
		inputs[block->ninputs++] = NULL;
		inputs[block->ninputs++] = NULL;
	}

	/* since we don't know where to set the regid for frag_coord,
	 * we have to use r0.x for it.  But we don't want to *always*
	 * use r1.x for frag_pos as that could increase the register
	 * footprint on simple shaders:
	 */
	if (so->frag_coord) {
		ctx->frag_coord[0]->regs[0]->num = regid++;
		ctx->frag_coord[1]->regs[0]->num = regid++;
		ctx->frag_coord[2]->regs[0]->num = regid++;
		ctx->frag_coord[3]->regs[0]->num = regid++;

		inputs[block->ninputs++] = ctx->frag_coord[0];
		inputs[block->ninputs++] = ctx->frag_coord[1];
		inputs[block->ninputs++] = ctx->frag_coord[2];
		inputs[block->ninputs++] = ctx->frag_coord[3];
	}

	/* we always have frag_pos: */
	so->pos_regid = regid;

	/* r0.x -- wire into the first src of the frag_pos meta instr */
	instr = create_input(block, NULL, block->ninputs);
	instr->regs[0]->num = regid++;
	inputs[block->ninputs++] = instr;
	ctx->frag_pos->regs[1]->instr = instr;

	/* r0.y -- second src of the frag_pos meta instr */
	instr = create_input(block, NULL, block->ninputs);
	instr->regs[0]->num = regid++;
	inputs[block->ninputs++] = instr;
	ctx->frag_pos->regs[2]->instr = instr;

	block->inputs = inputs;
}
3417
 
3418
/* Main TGSI token loop: walk declarations, immediates, instructions
 * and properties, building up the ir3 block via the translaters[]
 * dispatch table.  Also sets up the frag_pos meta instruction that
 * bary.f varying fetches hang off of for fragment shaders.
 */
static void
compile_instructions(struct ir3_compile_context *ctx)
{
	push_block(ctx);

	/* for fragment shader, we have a single input register (usually
	 * r0.xy) which is used as the base for bary.f varying fetch instrs:
	 */
	if (ctx->type == TGSI_PROCESSOR_FRAGMENT) {
		struct ir3_instruction *instr;
		instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
		ir3_reg_create(instr, 0, 0);
		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.x */
		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.y */
		ctx->frag_pos = instr;
	}

	while (!tgsi_parse_end_of_tokens(&ctx->parser)) {
		tgsi_parse_token(&ctx->parser);

		switch (ctx->parser.FullToken.Token.Type) {
		case TGSI_TOKEN_TYPE_DECLARATION: {
			struct tgsi_full_declaration *decl =
					&ctx->parser.FullToken.FullDeclaration;
			unsigned file = decl->Declaration.File;
			if (file == TGSI_FILE_OUTPUT) {
				decl_out(ctx, decl);
			} else if (file == TGSI_FILE_INPUT) {
				decl_in(ctx, decl);
			} else if (decl->Declaration.File == TGSI_FILE_SYSTEM_VALUE) {
				decl_sv(ctx, decl);
			}

			/* record array ranges (used for relative addressing): */
			if ((file != TGSI_FILE_CONSTANT) && decl->Declaration.Array) {
				int aid = decl->Array.ArrayID + ctx->array_offsets[file];

				compile_assert(ctx, aid < ARRAY_SIZE(ctx->array));

				/* legacy ArrayID==0 stuff probably isn't going to work
				 * well (and is at least untested).. let's just scream:
				 */
				compile_assert(ctx, aid != 0);

				ctx->array[aid].first = decl->Range.First;
				ctx->array[aid].last  = decl->Range.Last;
			}
			break;
		}
		case TGSI_TOKEN_TYPE_IMMEDIATE: {
			/* TODO: if we know the immediate is small enough, and only
			 * used with instructions that can embed an immediate, we
			 * can skip this:
			 */
			struct tgsi_full_immediate *imm =
					&ctx->parser.FullToken.FullImmediate;
			unsigned n = ctx->so->immediates_count++;
			compile_assert(ctx, n < ARRAY_SIZE(ctx->so->immediates));
			memcpy(ctx->so->immediates[n].val, imm->u, 16);
			break;
		}
		case TGSI_TOKEN_TYPE_INSTRUCTION: {
			struct tgsi_full_instruction *inst =
					&ctx->parser.FullToken.FullInstruction;
			unsigned opc = inst->Instruction.Opcode;
			const struct instr_translater *t = &translaters[opc];

			if (t->fxn) {
				t->fxn(t, ctx, inst);
				/* internal temps are per-instruction scratch: */
				ctx->num_internal_temps = 0;

				compile_assert(ctx, !ctx->using_tmp_dst);
			} else {
				compile_error(ctx, "unknown TGSI opc: %s\n",
						tgsi_get_opcode_name(opc));
			}

			/* apply saturate modifier as an extra clamp after the op: */
			switch (inst->Instruction.Saturate) {
			case TGSI_SAT_ZERO_ONE:
				create_clamp_imm(ctx, &inst->Dst[0].Register,
						fui(0.0), fui(1.0));
				break;
			case TGSI_SAT_MINUS_PLUS_ONE:
				create_clamp_imm(ctx, &inst->Dst[0].Register,
						fui(-1.0), fui(1.0));
				break;
			}

			instr_finish(ctx);

			break;
		}
		case TGSI_TOKEN_TYPE_PROPERTY: {
			struct tgsi_full_property *prop =
				&ctx->parser.FullToken.FullProperty;
			switch (prop->Property.PropertyName) {
			case TGSI_PROPERTY_FS_COLOR0_WRITES_ALL_CBUFS:
				ctx->so->color0_mrt = !!prop->u[0].Data;
				break;
			}
		}
		/* NOTE(review): no break above -- falls through into the
		 * (empty) default case.  Harmless, but looks unintentional;
		 * confirm before adding a break.
		 */
		default:
			break;
		}
	}
}
3523
 
3524
static void
3525
compile_dump(struct ir3_compile_context *ctx)
3526
{
3527
	const char *name = (ctx->so->type == SHADER_VERTEX) ? "vert" : "frag";
3528
	static unsigned n = 0;
3529
	char fname[16];
3530
	FILE *f;
3531
	snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++);
3532
	f = fopen(fname, "w");
3533
	if (!f)
3534
		return;
3535
	ir3_block_depth(ctx->block);
3536
	ir3_dump(ctx->ir, name, ctx->block, f);
3537
	fclose(f);
3538
}
3539
 
3540
/* Top-level compile entry point: translate 'tokens' (TGSI) into an
 * ir3 shader for the variant 'so', then run the backend pipeline
 * (flatten -> copy-prop -> grouping -> depth -> sched -> RA ->
 * legalize) and fix up the final input/output register assignments.
 *
 * 'key' selects variant options (binning pass, half precision, ..);
 * 'cp' enables the copy-propagation pass.  Returns 0 on success or
 * negative on error, in which case so->ir is destroyed.
 */
int
ir3_compile_shader(struct ir3_shader_variant *so,
		const struct tgsi_token *tokens, struct ir3_shader_key key,
		bool cp)
{
	struct ir3_compile_context ctx;
	struct ir3_block *block;
	struct ir3_instruction **inputs;
	unsigned i, j, actual_in;
	int ret = 0, max_bary;

	assert(!so->ir);

	so->ir = ir3_create();

	assert(so->ir);

	if (compile_init(&ctx, so, tokens) != TGSI_PARSE_OK) {
		DBG("INIT failed!");
		ret = -1;
		goto out;
	}

	/* for now, until the edge cases are worked out: */
	if (ctx.info.indirect_files_written & (FM(TEMPORARY) | FM(INPUT) | FM(OUTPUT)))
		cp = false;

	compile_instructions(&ctx);

	block = ctx.block;
	so->ir->block = block;

	/* keep track of the inputs from TGSI perspective.. */
	inputs = block->inputs;

	/* but fixup actual inputs for frag shader: */
	if (ctx.type == TGSI_PROCESSOR_FRAGMENT)
		fixup_frag_inputs(&ctx);

	/* at this point, for binning pass, throw away unneeded outputs: */
	if (key.binning_pass) {
		for (i = 0, j = 0; i < so->outputs_count; i++) {
			unsigned name = sem2name(so->outputs[i].semantic);
			unsigned idx = sem2idx(so->outputs[i].semantic);

			/* throw away everything but first position/psize */
			if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
					(name == TGSI_SEMANTIC_PSIZE))) {
				if (i != j) {
					so->outputs[j] = so->outputs[i];
					block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
					block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
					block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
					block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
				}
				j++;
			}
		}
		so->outputs_count = j;
		block->noutputs = j * 4;
	}

	/* if we want half-precision outputs, mark the output registers
	 * as half:
	 */
	if (key.half_precision) {
		for (i = 0; i < block->noutputs; i++) {
			if (!block->outputs[i])
				continue;
			block->outputs[i]->regs[0]->flags |= IR3_REG_HALF;
		}
	}

	/* at this point, we want the kill's in the outputs array too,
	 * so that they get scheduled (since they have no dst).. we've
	 * already ensured that the array is big enough in push_block():
	 */
	if (ctx.type == TGSI_PROCESSOR_FRAGMENT) {
		for (i = 0; i < ctx.kill_count; i++)
			block->outputs[block->noutputs++] = ctx.kill[i];
	}

	if (fd_mesa_debug & FD_DBG_OPTDUMP)
		compile_dump(&ctx);

	ret = ir3_block_flatten(block);
	if (ret < 0) {
		DBG("FLATTEN failed!");
		goto out;
	}
	if ((ret > 0) && (fd_mesa_debug & FD_DBG_OPTDUMP))
		compile_dump(&ctx);

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("BEFORE CP:\n");
		ir3_dump_instr_list(block->head);
	}

	ir3_block_depth(block);

	/* First remove all the extra mov's (which we could skip if the
	 * front-end was clever enough not to insert them in the first
	 * place).  Then figure out left/right neighbors, re-inserting
	 * extra mov's when needed to avoid conflicts.
	 */
	if (cp && !(fd_mesa_debug & FD_DBG_NOCP))
		ir3_block_cp(block);

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("BEFORE GROUPING:\n");
		ir3_dump_instr_list(block->head);
	}

	/* Group left/right neighbors, inserting mov's where needed to
	 * solve conflicts:
	 */
	ir3_block_group(block);

	if (fd_mesa_debug & FD_DBG_OPTDUMP)
		compile_dump(&ctx);

	ir3_block_depth(block);

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("AFTER DEPTH:\n");
		ir3_dump_instr_list(block->head);
	}

	ret = ir3_block_sched(block);
	if (ret) {
		DBG("SCHED failed!");
		goto out;
	}

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("AFTER SCHED:\n");
		ir3_dump_instr_list(block->head);
	}

	ret = ir3_block_ra(block, so->type, so->frag_coord, so->frag_face);
	if (ret) {
		DBG("RA failed!");
		goto out;
	}

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("AFTER RA:\n");
		ir3_dump_instr_list(block->head);
	}

	ir3_block_legalize(block, &so->has_samp, &max_bary);

	/* fixup input/outputs: */
	for (i = 0; i < so->outputs_count; i++) {
		so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
		/* preserve hack for depth output.. tgsi writes depth to .z,
		 * but what we give the hw is the scalar register:
		 */
		if ((ctx.type == TGSI_PROCESSOR_FRAGMENT) &&
			(sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
			so->outputs[i].regid += 2;
	}
	/* Note that some or all channels of an input may be unused: */
	actual_in = 0;
	for (i = 0; i < so->inputs_count; i++) {
		unsigned j, regid = ~0, compmask = 0;
		so->inputs[i].ncomp = 0;
		for (j = 0; j < 4; j++) {
			struct ir3_instruction *in = inputs[(i*4) + j];
			if (in) {
				compmask |= (1 << j);
				regid = in->regs[0]->num - j;
				actual_in++;
				so->inputs[i].ncomp++;
			}
		}
		so->inputs[i].regid = regid;
		so->inputs[i].compmask = compmask;
	}

	/* fragment shader always gets full vec4's even if it doesn't
	 * fetch all components, but vertex shader we need to update
	 * with the actual number of components fetch, otherwise thing
	 * will hang due to mismatch between VFD_DECODE's and
	 * TOTALATTRTOVS
	 */
	if (so->type == SHADER_VERTEX)
		so->total_in = actual_in;
	else
		so->total_in = align(max_bary + 1, 4);

out:
	if (ret) {
		ir3_destroy(so->ir);
		so->ir = NULL;
	}
	compile_free(&ctx);

	return ret;
}