/* -*- mode: C; c-file-style: "k&r"; tab-width 4; indent-tabs-mode: t; -*- */
2
 
3
/*
4
 * Copyright (C) 2015 Rob Clark 
5
 *
6
 * Permission is hereby granted, free of charge, to any person obtaining a
7
 * copy of this software and associated documentation files (the "Software"),
8
 * to deal in the Software without restriction, including without limitation
9
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10
 * and/or sell copies of the Software, and to permit persons to whom the
11
 * Software is furnished to do so, subject to the following conditions:
12
 *
13
 * The above copyright notice and this permission notice (including the next
14
 * paragraph) shall be included in all copies or substantial portions of the
15
 * Software.
16
 *
17
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
18
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
19
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
20
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
21
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
22
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
23
 * SOFTWARE.
24
 *
25
 * Authors:
26
 *    Rob Clark 
27
 */
28
 
29
#include <stdarg.h>
30
 
31
#include "pipe/p_state.h"
32
#include "util/u_string.h"
33
#include "util/u_memory.h"
34
#include "util/u_inlines.h"
35
#include "tgsi/tgsi_lowering.h"
36
#include "tgsi/tgsi_strings.h"
37
 
38
#include "nir/tgsi_to_nir.h"
39
#include "glsl/shader_enums.h"
40
 
41
#include "freedreno_util.h"
42
 
43
#include "ir3_compiler.h"
44
#include "ir3_shader.h"
45
#include "ir3_nir.h"
46
 
47
#include "instr-a3xx.h"
48
#include "ir3.h"
49
 
50
 
51
static struct ir3_instruction * create_immed(struct ir3_block *block, uint32_t val);
52
 
53
struct ir3_compile {
54
	const struct tgsi_token *tokens;
55
	struct nir_shader *s;
56
 
57
	struct ir3 *ir;
58
	struct ir3_shader_variant *so;
59
 
60
	/* bitmask of which samplers are integer: */
61
	uint16_t integer_s;
62
 
63
	struct ir3_block *block;
64
 
65
	/* For fragment shaders, from the hw perspective the only
66
	 * actual input is the r0.xy position register passed to bary.f.
	 * But TGSI doesn't know that; it still declares things as
	 * IN[] registers.  So we do all the input tracking normally
	 * and fix things up after compile_instructions()
	 *
	 * NOTE that frag_pos is the hardware position (possibly it
	 * is actually an index or tag or some such.. it is *not* a
	 * value that can be used directly for gl_FragCoord..)
74
	 */
75
	struct ir3_instruction *frag_pos, *frag_face, *frag_coord[4];
76
 
77
	/* For vertex shaders, keep track of the system values sources */
78
	struct ir3_instruction *vertex_id, *basevertex, *instance_id;
79
 
80
	/* mapping from nir_register to defining instruction: */
81
	struct hash_table *def_ht;
82
 
83
	/* mapping from nir_variable to ir3_array: */
84
	struct hash_table *var_ht;
85
	unsigned num_arrays;
86
 
87
	/* a common pattern for indirect addressing is to request the
88
	 * same address register multiple times.  To avoid generating
89
	 * duplicate instruction sequences (which our backend does not
90
	 * try to clean up, since that should be done at the NIR stage)
91
	 * we cache the address value generated for a given src value:
92
	 */
93
	struct hash_table *addr_ht;
94
 
95
	/* for calculating input/output positions/linkages: */
96
	unsigned next_inloc;
97
 
98
	/* a4xx (at least patchlevel 0) cannot seem to flat-interpolate
99
	 * so we need to use ldlv.u32 to load the varying directly:
100
	 */
101
	bool flat_bypass;
102
 
103
	/* on a3xx, we need to add one to # of array levels:
104
	 */
105
	bool levels_add_one;
106
 
107
	/* for looking up which system value is which */
108
	unsigned sysval_semantics[8];
109
 
110
	/* list of kill instructions: */
111
	struct ir3_instruction *kill[16];
112
	unsigned int kill_count;
113
 
114
	/* set if we encounter something we can't handle yet, so we
115
	 * can bail cleanly and fallback to TGSI compiler f/e
116
	 */
117
	bool error;
118
};
119
 
120
 
121
static struct nir_shader *to_nir(const struct tgsi_token *tokens)
122
{
123
	struct nir_shader_compiler_options options = {
124
			.lower_fpow = true,
125
			.lower_fsat = true,
126
			.lower_scmp = true,
127
			.lower_flrp = true,
128
			.native_integers = true,
129
	};
130
	bool progress;
131
 
132
	struct nir_shader *s = tgsi_to_nir(tokens, &options);
133
 
134
	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
135
		debug_printf("----------------------\n");
136
		nir_print_shader(s, stdout);
137
		debug_printf("----------------------\n");
138
	}
139
 
140
	nir_opt_global_to_local(s);
141
	nir_convert_to_ssa(s);
142
	nir_lower_idiv(s);
143
 
144
	do {
145
		progress = false;
146
 
147
		nir_lower_vars_to_ssa(s);
148
		nir_lower_alu_to_scalar(s);
149
 
150
		progress |= nir_copy_prop(s);
151
		progress |= nir_opt_dce(s);
152
		progress |= nir_opt_cse(s);
153
		progress |= ir3_nir_lower_if_else(s);
154
		progress |= nir_opt_algebraic(s);
155
		progress |= nir_opt_constant_folding(s);
156
 
157
	} while (progress);
158
 
159
	nir_remove_dead_variables(s);
160
	nir_validate_shader(s);
161
 
162
	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
163
		debug_printf("----------------------\n");
164
		nir_print_shader(s, stdout);
165
		debug_printf("----------------------\n");
166
	}
167
 
168
	return s;
169
}
170
 
171
/* TODO nir doesn't lower everything for us yet, but ideally it would: */
172
static const struct tgsi_token *
173
lower_tgsi(const struct tgsi_token *tokens, struct ir3_shader_variant *so)
174
{
175
	struct tgsi_shader_info info;
176
	struct tgsi_lowering_config lconfig = {
177
			.color_two_side = so->key.color_two_side,
178
			.lower_FRC = true,
179
	};
180
 
181
	switch (so->type) {
182
	case SHADER_FRAGMENT:
183
	case SHADER_COMPUTE:
184
		lconfig.saturate_s = so->key.fsaturate_s;
185
		lconfig.saturate_t = so->key.fsaturate_t;
186
		lconfig.saturate_r = so->key.fsaturate_r;
187
		break;
188
	case SHADER_VERTEX:
189
		lconfig.saturate_s = so->key.vsaturate_s;
190
		lconfig.saturate_t = so->key.vsaturate_t;
191
		lconfig.saturate_r = so->key.vsaturate_r;
192
		break;
193
	}
194
 
195
	if (!so->shader) {
196
		/* hack for standalone compiler which does not have
197
		 * screen/context:
198
		 */
199
	} else if (ir3_shader_gpuid(so->shader) >= 400) {
200
		/* a4xx seems to have *no* sam.p */
201
		lconfig.lower_TXP = ~0;  /* lower all txp */
202
	} else {
203
		/* a3xx just needs to avoid sam.p for 3d tex */
204
		lconfig.lower_TXP = (1 << TGSI_TEXTURE_3D);
205
	}
206
 
207
	return tgsi_transform_lowering(&lconfig, tokens, &info);
208
}
209
 
210
static struct ir3_compile *
211
compile_init(struct ir3_shader_variant *so,
212
		const struct tgsi_token *tokens)
213
{
214
	struct ir3_compile *ctx = rzalloc(NULL, struct ir3_compile);
215
	const struct tgsi_token *lowered_tokens;
216
 
217
	if (!so->shader) {
218
		/* hack for standalone compiler which does not have
219
		 * screen/context:
220
		 */
221
	} else if (ir3_shader_gpuid(so->shader) >= 400) {
222
		/* need special handling for "flat" */
223
		ctx->flat_bypass = true;
224
		ctx->levels_add_one = false;
225
	} else {
226
		/* no special handling for "flat" */
227
		ctx->flat_bypass = false;
228
		ctx->levels_add_one = true;
229
	}
230
 
231
	switch (so->type) {
232
	case SHADER_FRAGMENT:
233
	case SHADER_COMPUTE:
234
		ctx->integer_s = so->key.finteger_s;
235
		break;
236
	case SHADER_VERTEX:
237
		ctx->integer_s = so->key.vinteger_s;
238
		break;
239
	}
240
 
241
	ctx->ir = so->ir;
242
	ctx->so = so;
243
	ctx->next_inloc = 8;
244
	ctx->def_ht = _mesa_hash_table_create(ctx,
245
			_mesa_hash_pointer, _mesa_key_pointer_equal);
246
	ctx->var_ht = _mesa_hash_table_create(ctx,
247
			_mesa_hash_pointer, _mesa_key_pointer_equal);
248
	ctx->addr_ht = _mesa_hash_table_create(ctx,
249
			_mesa_hash_pointer, _mesa_key_pointer_equal);
250
 
251
	lowered_tokens = lower_tgsi(tokens, so);
252
	if (!lowered_tokens)
253
		lowered_tokens = tokens;
254
	ctx->s = to_nir(lowered_tokens);
255
 
256
	if (lowered_tokens != tokens)
257
		free((void *)lowered_tokens);
258
 
259
	so->first_driver_param = so->first_immediate = ctx->s->num_uniforms;
260
 
261
	/* one (vec4) slot for vertex id base: */
262
	if (so->type == SHADER_VERTEX)
263
		so->first_immediate++;
264
 
265
	/* reserve 4 (vec4) slots for ubo base addresses: */
266
	so->first_immediate += 4;
267
 
268
	return ctx;
269
}
270
 
271
static void
272
compile_error(struct ir3_compile *ctx, const char *format, ...)
273
{
274
	va_list ap;
275
	va_start(ap, format);
276
	_debug_vprintf(format, ap);
277
	va_end(ap);
278
	nir_print_shader(ctx->s, stdout);
279
	ctx->error = true;
280
	debug_assert(0);
281
}
282
 
283
#define compile_assert(ctx, cond) do { \
284
		if (!(cond)) compile_error((ctx), "failed assert: "#cond"\n"); \
285
	} while (0)
286
 
287
static void
288
compile_free(struct ir3_compile *ctx)
289
{
290
	ralloc_free(ctx);
291
}
292
 
293
 
294
struct ir3_array {
295
	unsigned length, aid;
296
	struct ir3_instruction *arr[];
297
};
298
 
299
static void
300
declare_var(struct ir3_compile *ctx, nir_variable *var)
301
{
302
	unsigned length = glsl_get_length(var->type) * 4;  /* always vec4, at least with ttn */
303
	struct ir3_array *arr = ralloc_size(ctx, sizeof(*arr) +
304
			(length * sizeof(arr->arr[0])));
305
	arr->length = length;
306
	arr->aid = ++ctx->num_arrays;
307
	/* Some shaders end up reading array elements without first writing..
308
	 * so initialize things to prevent null instr ptrs later:
309
	 */
310
	for (unsigned i = 0; i < length; i++)
311
		arr->arr[i] = create_immed(ctx->block, 0);
312
	_mesa_hash_table_insert(ctx->var_ht, var, arr);
313
}
314
 
315
static struct ir3_array *
316
get_var(struct ir3_compile *ctx, nir_variable *var)
317
{
318
	struct hash_entry *entry = _mesa_hash_table_search(ctx->var_ht, var);
319
	return entry->data;
320
}
321
 
322
/* allocate a n element value array (to be populated by caller) and
323
 * insert in def_ht
324
 */
325
static struct ir3_instruction **
326
__get_dst(struct ir3_compile *ctx, void *key, unsigned n)
327
{
328
	struct ir3_instruction **value =
329
		ralloc_array(ctx->def_ht, struct ir3_instruction *, n);
330
	_mesa_hash_table_insert(ctx->def_ht, key, value);
331
	return value;
332
}
333
 
334
static struct ir3_instruction **
335
get_dst(struct ir3_compile *ctx, nir_dest *dst, unsigned n)
336
{
337
	if (dst->is_ssa) {
338
		return __get_dst(ctx, &dst->ssa, n);
339
	} else {
340
		return __get_dst(ctx, dst->reg.reg, n);
341
	}
342
}
343
 
344
static struct ir3_instruction **
345
get_dst_ssa(struct ir3_compile *ctx, nir_ssa_def *dst, unsigned n)
346
{
347
	return __get_dst(ctx, dst, n);
348
}
349
 
350
static struct ir3_instruction **
351
get_src(struct ir3_compile *ctx, nir_src *src)
352
{
353
	struct hash_entry *entry;
354
	if (src->is_ssa) {
355
		entry = _mesa_hash_table_search(ctx->def_ht, src->ssa);
356
	} else {
357
		entry = _mesa_hash_table_search(ctx->def_ht, src->reg.reg);
358
	}
359
	compile_assert(ctx, entry);
360
	return entry->data;
361
}
362
 
363
static struct ir3_instruction *
364
create_immed(struct ir3_block *block, uint32_t val)
365
{
366
	struct ir3_instruction *mov;
367
 
368
	mov = ir3_instr_create(block, 1, 0);
369
	mov->cat1.src_type = TYPE_U32;
370
	mov->cat1.dst_type = TYPE_U32;
371
	ir3_reg_create(mov, 0, 0);
372
	ir3_reg_create(mov, 0, IR3_REG_IMMED)->uim_val = val;
373
 
374
	return mov;
375
}
376
 
377
static struct ir3_instruction *
378
create_addr(struct ir3_block *block, struct ir3_instruction *src)
379
{
380
	struct ir3_instruction *instr, *immed;
381
 
382
	/* TODO in at least some cases, the backend could probably be
383
	 * made clever enough to propagate IR3_REG_HALF..
384
	 */
385
	instr = ir3_COV(block, src, TYPE_U32, TYPE_S16);
386
	instr->regs[0]->flags |= IR3_REG_HALF;
387
 
388
	immed = create_immed(block, 2);
389
	immed->regs[0]->flags |= IR3_REG_HALF;
390
 
391
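	/* shl by 2 == multiply by 4: presumably because the incoming value
	 * indexes vec4 slots, while relative addressing works in scalar
	 * components (declare_var() sizes arrays as length*4):
	 */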
	instr = ir3_SHL_B(block, instr, 0, immed, 0);
392
	instr->regs[0]->flags |= IR3_REG_HALF;
393
	instr->regs[1]->flags |= IR3_REG_HALF;
394
 
395
	instr = ir3_MOV(block, instr, TYPE_S16);
396
	instr->regs[0]->flags |= IR3_REG_ADDR | IR3_REG_HALF;
397
	instr->regs[1]->flags |= IR3_REG_HALF;
398
 
399
	return instr;
400
}
401
 
402
/* caches addr values to avoid generating multiple cov/shl/mova
403
 * sequences for each use of a given NIR level src as address
404
 */
405
static struct ir3_instruction *
406
get_addr(struct ir3_compile *ctx, struct ir3_instruction *src)
407
{
408
	struct ir3_instruction *addr;
409
	struct hash_entry *entry;
410
	entry = _mesa_hash_table_search(ctx->addr_ht, src);
411
	if (entry)
412
		return entry->data;
413
 
414
	/* TODO do we need to cache per block? */
415
	addr = create_addr(ctx->block, src);
416
	_mesa_hash_table_insert(ctx->addr_ht, src, addr);
417
 
418
	return addr;
419
}
420
 
421
static struct ir3_instruction *
422
create_uniform(struct ir3_compile *ctx, unsigned n)
423
{
424
	struct ir3_instruction *mov;
425
 
426
	mov = ir3_instr_create(ctx->block, 1, 0);
427
	/* TODO get types right? */
428
	mov->cat1.src_type = TYPE_F32;
429
	mov->cat1.dst_type = TYPE_F32;
430
	ir3_reg_create(mov, 0, 0);
431
	ir3_reg_create(mov, n, IR3_REG_CONST);
432
 
433
	return mov;
434
}
435
 
436
static struct ir3_instruction *
437
create_uniform_indirect(struct ir3_compile *ctx, unsigned n,
438
		struct ir3_instruction *address)
439
{
440
	struct ir3_instruction *mov;
441
 
442
	mov = ir3_instr_create(ctx->block, 1, 0);
443
	mov->cat1.src_type = TYPE_U32;
444
	mov->cat1.dst_type = TYPE_U32;
445
	ir3_reg_create(mov, 0, 0);
446
	ir3_reg_create(mov, n, IR3_REG_CONST | IR3_REG_RELATIV);
447
	mov->address = address;
448
 
449
	array_insert(ctx->ir->indirects, mov);
450
 
451
	return mov;
452
}
453
 
454
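/* gather a group of scalar values into one (fake) vector value, using the
 * OPC_META_FI (fan-in/collect) meta instruction:
 */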
static struct ir3_instruction *
455
create_collect(struct ir3_block *block, struct ir3_instruction **arr,
456
		unsigned arrsz)
457
{
458
	struct ir3_instruction *collect;
459
 
460
	if (arrsz == 0)
461
		return NULL;
462
 
463
	collect = ir3_instr_create2(block, -1, OPC_META_FI, 1 + arrsz);
464
	ir3_reg_create(collect, 0, 0);
465
	for (unsigned i = 0; i < arrsz; i++)
466
		ir3_reg_create(collect, 0, IR3_REG_SSA)->instr = arr[i];
467
 
468
	return collect;
469
}
470
 
471
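/* relative (address-register) read of a single component out of a collected
 * array value; the mov src gets IR3_REG_RELATIV plus the base offset 'n':
 */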
static struct ir3_instruction *
472
create_indirect_load(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
473
		struct ir3_instruction *address, struct ir3_instruction *collect)
474
{
475
	struct ir3_block *block = ctx->block;
476
	struct ir3_instruction *mov;
477
	struct ir3_register *src;
478
 
479
	mov = ir3_instr_create(block, 1, 0);
480
	mov->cat1.src_type = TYPE_U32;
481
	mov->cat1.dst_type = TYPE_U32;
482
	ir3_reg_create(mov, 0, 0);
483
	src = ir3_reg_create(mov, 0, IR3_REG_SSA | IR3_REG_RELATIV);
484
	src->instr = collect;
485
	src->size  = arrsz;
486
	src->offset = n;
487
	mov->address = address;
488
 
489
	array_insert(ctx->ir->indirects, mov);
490
 
491
	return mov;
492
}
493
 
494
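/* relative write of a single component into a collected array value; the
 * resulting mov (with the collect hung off ->fanin) stands for the whole
 * array contents after the store:
 */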
static struct ir3_instruction *
495
create_indirect_store(struct ir3_compile *ctx, unsigned arrsz, unsigned n,
496
		struct ir3_instruction *src, struct ir3_instruction *address,
497
		struct ir3_instruction *collect)
498
{
499
	struct ir3_block *block = ctx->block;
500
	struct ir3_instruction *mov;
501
	struct ir3_register *dst;
502
 
503
	mov = ir3_instr_create(block, 1, 0);
504
	mov->cat1.src_type = TYPE_U32;
505
	mov->cat1.dst_type = TYPE_U32;
506
	dst = ir3_reg_create(mov, 0, IR3_REG_RELATIV);
507
	dst->size  = arrsz;
508
	dst->offset = n;
509
	ir3_reg_create(mov, 0, IR3_REG_SSA)->instr = src;
510
	mov->address = address;
511
	mov->fanin = collect;
512
 
513
	array_insert(ctx->ir->indirects, mov);
514
 
515
	return mov;
516
}
517
 
518
static struct ir3_instruction *
519
create_input(struct ir3_block *block, struct ir3_instruction *instr,
520
		unsigned n)
521
{
522
	struct ir3_instruction *in;
523
 
524
	in = ir3_instr_create(block, -1, OPC_META_INPUT);
525
	in->inout.block = block;
526
	ir3_reg_create(in, n, 0);
527
	if (instr)
528
		ir3_reg_create(in, 0, IR3_REG_SSA)->instr = instr;
529
 
530
	return in;
531
}
532
 
533
static struct ir3_instruction *
534
create_frag_input(struct ir3_compile *ctx, unsigned n, bool use_ldlv)
535
{
536
	struct ir3_block *block = ctx->block;
537
	struct ir3_instruction *instr;
538
	struct ir3_instruction *inloc = create_immed(block, n);
539
 
540
	if (use_ldlv) {
541
		instr = ir3_LDLV(block, inloc, 0, create_immed(block, 1), 0);
542
		instr->cat6.type = TYPE_U32;
543
		instr->cat6.iim_val = 1;
544
	} else {
545
		instr = ir3_BARY_F(block, inloc, 0, ctx->frag_pos, 0);
546
		instr->regs[2]->wrmask = 0x3;
547
	}
548
 
549
	return instr;
550
}
551
 
552
static struct ir3_instruction *
553
create_frag_coord(struct ir3_compile *ctx, unsigned comp)
554
{
555
	struct ir3_block *block = ctx->block;
556
	struct ir3_instruction *instr;
557
 
558
	compile_assert(ctx, !ctx->frag_coord[comp]);
559
 
560
	ctx->frag_coord[comp] = create_input(ctx->block, NULL, 0);
561
 
562
	switch (comp) {
563
	case 0: /* .x */
564
	case 1: /* .y */
565
		/* for frag_coord, we get unsigned values.. we need
566
		 * to subtract (integer) 8 and divide by 16 (right-
567
		 * shift by 4) then convert to float:
568
		 *
569
		 *    sub.s tmp, src, 8
570
		 *    shr.b tmp, tmp, 4
571
		 *    mov.u32f32 dst, tmp
572
		 *
573
		 */
574
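		/* (i.e. the raw value looks like a fixed-point coordinate in
		 * 1/16-pixel units with a half-pixel (8/16) bias -- an inference
		 * from the sub/shr sequence below, not confirmed hw behaviour)
		 */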
		instr = ir3_SUB_S(block, ctx->frag_coord[comp], 0,
575
				create_immed(block, 8), 0);
576
		instr = ir3_SHR_B(block, instr, 0,
577
				create_immed(block, 4), 0);
578
		instr = ir3_COV(block, instr, TYPE_U32, TYPE_F32);
579
 
580
		return instr;
581
	case 2: /* .z */
582
	case 3: /* .w */
583
	default:
584
		/* seems that we can use these as-is: */
585
		return ctx->frag_coord[comp];
586
	}
587
}
588
 
589
static struct ir3_instruction *
590
create_frag_face(struct ir3_compile *ctx, unsigned comp)
591
{
592
	struct ir3_block *block = ctx->block;
593
	struct ir3_instruction *instr;
594
 
595
	switch (comp) {
596
	case 0: /* .x */
597
		compile_assert(ctx, !ctx->frag_face);
598
 
599
		ctx->frag_face = create_input(block, NULL, 0);
600
 
601
		/* for faceness, we always get -1 or 0 (int).. but TGSI expects
602
		 * positive vs negative float.. and piglit further seems to
603
		 * expect -1.0 or 1.0:
604
		 *
605
		 *    mul.s tmp, hr0.x, 2
606
		 *    add.s tmp, tmp, 1
607
		 *    mov.s32f32, dst, tmp
608
		 *
609
		 */
610
		instr = ir3_MUL_S(block, ctx->frag_face, 0,
611
				create_immed(block, 2), 0);
612
		instr = ir3_ADD_S(block, instr, 0,
613
				create_immed(block, 1), 0);
614
		instr = ir3_COV(block, instr, TYPE_S32, TYPE_F32);
615
 
616
		return instr;
617
	case 1: /* .y */
618
	case 2: /* .z */
619
		return create_immed(block, fui(0.0));
620
	default:
621
	case 3: /* .w */
622
		return create_immed(block, fui(1.0));
623
	}
624
}
625
 
626
/* helper for instructions that produce multiple consecutive scalar
627
 * outputs which need to have a split/fanout meta instruction inserted
628
 */
629
static void
630
split_dest(struct ir3_block *block, struct ir3_instruction **dst,
631
		struct ir3_instruction *src)
632
{
633
	struct ir3_instruction *prev = NULL;
634
	for (int i = 0, j = 0; i < 4; i++) {
635
		struct ir3_instruction *split =
636
				ir3_instr_create(block, -1, OPC_META_FO);
637
		ir3_reg_create(split, 0, IR3_REG_SSA);
638
		ir3_reg_create(split, 0, IR3_REG_SSA)->instr = src;
639
		split->fo.off = i;
640
 
641
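		/* link neighbouring fanout instrs via the cp left/right pointers,
		 * presumably so later grouping/copy-propagation passes can treat
		 * the split components as one vector value:
		 */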
		if (prev) {
642
			split->cp.left = prev;
643
			split->cp.left_cnt++;
644
			prev->cp.right = split;
645
			prev->cp.right_cnt++;
646
		}
647
		prev = split;
648
 
649
		if (src->regs[0]->wrmask & (1 << i))
650
			dst[j++] = split;
651
	}
652
}
653
 
654
/*
655
 * Adreno uses uint rather than having a dedicated bool type,
 * which (potentially) requires some conversion, in particular
 * when using the output of a bool instr as an int input, or
 * vice versa.
659
 *
660
 *         | Adreno  |  NIR  |
661
 *  -------+---------+-------+-
662
 *   true  |    1    |  ~0   |
663
 *   false |    0    |   0   |
664
 *
665
 * To convert from an adreno bool (uint) to nir, use:
666
 *
667
 *    absneg.s dst, (neg)src
668
 *
669
 * To convert back in the other direction:
670
 *
671
 *    absneg.s dst, (abs)src
672
 *
673
 * The CP step can clean up the absneg.s that cancel each other
674
 * out, and with a slight bit of extra cleverness (to recognize
675
 * the instructions which produce either a 0 or 1) can eliminate
676
 * the absneg.s's completely when an instruction that wants
677
 * 0/1 consumes the result.  For example, when a nir 'bcsel'
678
 * consumes the result of 'feq'.  So we should be able to get by
679
 * without a boolean resolve step, and without incurring any
680
 * extra penalty in instruction count.
681
 */
682
 
683
/* NIR bool -> native (adreno): */
684
static struct ir3_instruction *
685
ir3_b2n(struct ir3_block *block, struct ir3_instruction *instr)
686
{
687
	return ir3_ABSNEG_S(block, instr, IR3_REG_SABS);
688
}
689
 
690
/* native (adreno) -> NIR bool: */
691
static struct ir3_instruction *
692
ir3_n2b(struct ir3_block *block, struct ir3_instruction *instr)
693
{
694
	return ir3_ABSNEG_S(block, instr, IR3_REG_SNEG);
695
}
696
 
697
/*
698
 * alu/sfu instructions:
699
 */
700
 
701
static void
702
emit_alu(struct ir3_compile *ctx, nir_alu_instr *alu)
703
{
704
	const nir_op_info *info = &nir_op_infos[alu->op];
705
	struct ir3_instruction **dst, *src[info->num_inputs];
706
	struct ir3_block *b = ctx->block;
707
 
708
	dst = get_dst(ctx, &alu->dest.dest, MAX2(info->output_size, 1));
709
 
710
	/* Vectors are special in that they have non-scalarized writemasks,
711
	 * and just place the first swizzle channel of each argument, in
	 * order, into the corresponding writemask channel.
713
	 */
714
	if ((alu->op == nir_op_vec2) ||
715
			(alu->op == nir_op_vec3) ||
716
			(alu->op == nir_op_vec4)) {
717
 
718
		for (int i = 0; i < info->num_inputs; i++) {
719
			nir_alu_src *asrc = &alu->src[i];
720
 
721
			compile_assert(ctx, !asrc->abs);
722
			compile_assert(ctx, !asrc->negate);
723
 
724
			src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[0]];
725
			if (!src[i])
726
				src[i] = create_immed(ctx->block, 0);
727
			dst[i] = ir3_MOV(b, src[i], TYPE_U32);
728
		}
729
 
730
		return;
731
	}
732
 
733
	/* General case: We can just grab the one used channel per src. */
734
	for (int i = 0; i < info->num_inputs; i++) {
735
		unsigned chan = ffs(alu->dest.write_mask) - 1;
736
		nir_alu_src *asrc = &alu->src[i];
737
 
738
		compile_assert(ctx, !asrc->abs);
739
		compile_assert(ctx, !asrc->negate);
740
 
741
		src[i] = get_src(ctx, &asrc->src)[asrc->swizzle[chan]];
742
 
743
		compile_assert(ctx, src[i]);
744
	}
745
 
746
	switch (alu->op) {
747
	case nir_op_f2i:
748
		dst[0] = ir3_COV(b, src[0], TYPE_F32, TYPE_S32);
749
		break;
750
	case nir_op_f2u:
751
		dst[0] = ir3_COV(b, src[0], TYPE_F32, TYPE_U32);
752
		break;
753
	case nir_op_i2f:
754
		dst[0] = ir3_COV(b, src[0], TYPE_S32, TYPE_F32);
755
		break;
756
	case nir_op_u2f:
757
		dst[0] = ir3_COV(b, src[0], TYPE_U32, TYPE_F32);
758
		break;
759
	case nir_op_imov:
760
		dst[0] = ir3_MOV(b, src[0], TYPE_S32);
761
		break;
762
	case nir_op_fmov:
763
		dst[0] = ir3_MOV(b, src[0], TYPE_F32);
764
		break;
765
	case nir_op_f2b:
766
		dst[0] = ir3_CMPS_F(b, src[0], 0, create_immed(b, fui(0.0)), 0);
767
		dst[0]->cat2.condition = IR3_COND_NE;
768
		dst[0] = ir3_n2b(b, dst[0]);
769
		break;
770
	case nir_op_b2f:
771
		dst[0] = ir3_COV(b, ir3_b2n(b, src[0]), TYPE_U32, TYPE_F32);
772
		break;
773
	case nir_op_b2i:
774
		dst[0] = ir3_b2n(b, src[0]);
775
		break;
776
	case nir_op_i2b:
777
		dst[0] = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
778
		dst[0]->cat2.condition = IR3_COND_NE;
779
		dst[0] = ir3_n2b(b, dst[0]);
780
		break;
781
 
782
	case nir_op_fneg:
783
		dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FNEG);
784
		break;
785
	case nir_op_fabs:
786
		dst[0] = ir3_ABSNEG_F(b, src[0], IR3_REG_FABS);
787
		break;
788
	case nir_op_fmax:
789
		dst[0] = ir3_MAX_F(b, src[0], 0, src[1], 0);
790
		break;
791
	case nir_op_fmin:
792
		dst[0] = ir3_MIN_F(b, src[0], 0, src[1], 0);
793
		break;
794
	case nir_op_fmul:
795
		dst[0] = ir3_MUL_F(b, src[0], 0, src[1], 0);
796
		break;
797
	case nir_op_fadd:
798
		dst[0] = ir3_ADD_F(b, src[0], 0, src[1], 0);
799
		break;
800
	case nir_op_fsub:
801
		dst[0] = ir3_ADD_F(b, src[0], 0, src[1], IR3_REG_FNEG);
802
		break;
803
	case nir_op_ffma:
804
		dst[0] = ir3_MAD_F32(b, src[0], 0, src[1], 0, src[2], 0);
805
		break;
806
	case nir_op_fddx:
807
		dst[0] = ir3_DSX(b, src[0], 0);
808
		dst[0]->cat5.type = TYPE_F32;
809
		break;
810
	case nir_op_fddy:
811
		dst[0] = ir3_DSY(b, src[0], 0);
812
		dst[0]->cat5.type = TYPE_F32;
813
		break;
815
	case nir_op_flt:
816
		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
817
		dst[0]->cat2.condition = IR3_COND_LT;
818
		dst[0] = ir3_n2b(b, dst[0]);
819
		break;
820
	case nir_op_fge:
821
		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
822
		dst[0]->cat2.condition = IR3_COND_GE;
823
		dst[0] = ir3_n2b(b, dst[0]);
824
		break;
825
	case nir_op_feq:
826
		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
827
		dst[0]->cat2.condition = IR3_COND_EQ;
828
		dst[0] = ir3_n2b(b, dst[0]);
829
		break;
830
	case nir_op_fne:
831
		dst[0] = ir3_CMPS_F(b, src[0], 0, src[1], 0);
832
		dst[0]->cat2.condition = IR3_COND_NE;
833
		dst[0] = ir3_n2b(b, dst[0]);
834
		break;
835
	case nir_op_fceil:
836
		dst[0] = ir3_CEIL_F(b, src[0], 0);
837
		break;
838
	case nir_op_ffloor:
839
		dst[0] = ir3_FLOOR_F(b, src[0], 0);
840
		break;
841
	case nir_op_ftrunc:
842
		dst[0] = ir3_TRUNC_F(b, src[0], 0);
843
		break;
844
	case nir_op_fround_even:
845
		dst[0] = ir3_RNDNE_F(b, src[0], 0);
846
		break;
847
	case nir_op_fsign:
848
		dst[0] = ir3_SIGN_F(b, src[0], 0);
849
		break;
850
 
851
	case nir_op_fsin:
852
		dst[0] = ir3_SIN(b, src[0], 0);
853
		break;
854
	case nir_op_fcos:
855
		dst[0] = ir3_COS(b, src[0], 0);
856
		break;
857
	case nir_op_frsq:
858
		dst[0] = ir3_RSQ(b, src[0], 0);
859
		break;
860
	case nir_op_frcp:
861
		dst[0] = ir3_RCP(b, src[0], 0);
862
		break;
863
	case nir_op_flog2:
864
		dst[0] = ir3_LOG2(b, src[0], 0);
865
		break;
866
	case nir_op_fexp2:
867
		dst[0] = ir3_EXP2(b, src[0], 0);
868
		break;
869
	case nir_op_fsqrt:
870
		dst[0] = ir3_SQRT(b, src[0], 0);
871
		break;
872
 
873
	case nir_op_iabs:
874
		dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SABS);
875
		break;
876
	case nir_op_iadd:
877
		dst[0] = ir3_ADD_U(b, src[0], 0, src[1], 0);
878
		break;
879
	case nir_op_iand:
880
		dst[0] = ir3_AND_B(b, src[0], 0, src[1], 0);
881
		break;
882
	case nir_op_imax:
883
		dst[0] = ir3_MAX_S(b, src[0], 0, src[1], 0);
884
		break;
885
	case nir_op_imin:
886
		dst[0] = ir3_MIN_S(b, src[0], 0, src[1], 0);
887
		break;
888
	case nir_op_imul:
889
		/*
890
		 * dst = (al * bl) + (ah * bl << 16) + (al * bh << 16)
891
		 *   mull.u tmp0, a, b           ; mul low, i.e. al * bl
892
		 *   madsh.m16 tmp1, a, b, tmp0  ; mul-add shift high mix, i.e. ah * bl << 16
893
		 *   madsh.m16 dst, b, a, tmp1   ; i.e. al * bh << 16
894
		 */
895
		dst[0] = ir3_MADSH_M16(b, src[1], 0, src[0], 0,
896
					ir3_MADSH_M16(b, src[0], 0, src[1], 0,
897
						ir3_MULL_U(b, src[0], 0, src[1], 0), 0), 0);
898
		break;
899
	case nir_op_ineg:
900
		dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
901
		break;
902
	case nir_op_inot:
903
		dst[0] = ir3_NOT_B(b, src[0], 0);
904
		break;
905
	case nir_op_ior:
906
		dst[0] = ir3_OR_B(b, src[0], 0, src[1], 0);
907
		break;
908
	case nir_op_ishl:
909
		dst[0] = ir3_SHL_B(b, src[0], 0, src[1], 0);
910
		break;
911
	case nir_op_ishr:
912
		dst[0] = ir3_ASHR_B(b, src[0], 0, src[1], 0);
913
		break;
914
	case nir_op_isign: {
915
		/* maybe this would be sane to lower in nir.. */
916
		struct ir3_instruction *neg, *pos;
917
 
918
		neg = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
919
		neg->cat2.condition = IR3_COND_LT;
920
 
921
		pos = ir3_CMPS_S(b, src[0], 0, create_immed(b, 0), 0);
922
		pos->cat2.condition = IR3_COND_GT;
923
 
924
		dst[0] = ir3_SUB_U(b, pos, 0, neg, 0);
925
 
926
		break;
927
	}
928
	case nir_op_isub:
929
		dst[0] = ir3_SUB_U(b, src[0], 0, src[1], 0);
930
		break;
931
	case nir_op_ixor:
932
		dst[0] = ir3_XOR_B(b, src[0], 0, src[1], 0);
933
		break;
934
	case nir_op_ushr:
935
		dst[0] = ir3_SHR_B(b, src[0], 0, src[1], 0);
936
		break;
937
	case nir_op_ilt:
938
		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
939
		dst[0]->cat2.condition = IR3_COND_LT;
940
		dst[0] = ir3_n2b(b, dst[0]);
941
		break;
942
	case nir_op_ige:
943
		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
944
		dst[0]->cat2.condition = IR3_COND_GE;
945
		dst[0] = ir3_n2b(b, dst[0]);
946
		break;
947
	case nir_op_ieq:
948
		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
949
		dst[0]->cat2.condition = IR3_COND_EQ;
950
		dst[0] = ir3_n2b(b, dst[0]);
951
		break;
952
	case nir_op_ine:
953
		dst[0] = ir3_CMPS_S(b, src[0], 0, src[1], 0);
954
		dst[0]->cat2.condition = IR3_COND_NE;
955
		dst[0] = ir3_n2b(b, dst[0]);
956
		break;
957
	case nir_op_ult:
958
		dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
959
		dst[0]->cat2.condition = IR3_COND_LT;
960
		dst[0] = ir3_n2b(b, dst[0]);
961
		break;
962
	case nir_op_uge:
963
		dst[0] = ir3_CMPS_U(b, src[0], 0, src[1], 0);
964
		dst[0]->cat2.condition = IR3_COND_GE;
965
		dst[0] = ir3_n2b(b, dst[0]);
966
		break;
967
 
968
	case nir_op_bcsel:
969
		dst[0] = ir3_SEL_B32(b, src[1], 0, ir3_b2n(b, src[0]), 0, src[2], 0);
970
		break;
971
 
972
	default:
973
		compile_error(ctx, "Unhandled ALU op: %s\n",
974
				nir_op_infos[alu->op].name);
975
		break;
976
	}
977
}
978
 
979
/* handles direct/indirect UBO reads: */
980
static void
981
emit_intrinsic_load_ubo(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
982
		struct ir3_instruction **dst)
983
{
984
	struct ir3_block *b = ctx->block;
985
	struct ir3_instruction *addr, *src0, *src1;
986
	/* UBO addresses are the first driver params: */
987
	unsigned ubo = regid(ctx->so->first_driver_param, 0);
988
	unsigned off = intr->const_index[0];
989
 
990
	/* First src is ubo index, which could either be an immed or not: */
991
	src0 = get_src(ctx, &intr->src[0])[0];
992
	if (is_same_type_mov(src0) &&
993
			(src0->regs[1]->flags & IR3_REG_IMMED)) {
994
		addr = create_uniform(ctx, ubo + src0->regs[1]->iim_val);
995
	} else {
996
		addr = create_uniform_indirect(ctx, ubo, get_addr(ctx, src0));
997
	}
998
 
999
	if (intr->intrinsic == nir_intrinsic_load_ubo_indirect) {
1000
		/* For load_ubo_indirect, second src is indirect offset: */
1001
		src1 = get_src(ctx, &intr->src[1])[0];
1002
 
1003
		/* and add offset to addr: */
1004
		addr = ir3_ADD_S(b, addr, 0, src1, 0);
1005
	}
1006
 
1007
	/* if offset is too large to encode in the ldg, split it out: */
1008
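	/* (1024 here is presumably the limit of ldg's immediate byte-offset
	 * encoding)
	 */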
	if ((off + (intr->num_components * 4)) > 1024) {
1009
		/* split out the minimal amount to improve the odds that
1010
		 * cp can fit the immediate in the add.s instruction:
1011
		 */
1012
		unsigned off2 = off + (intr->num_components * 4) - 1024;
1013
		addr = ir3_ADD_S(b, addr, 0, create_immed(b, off2), 0);
1014
		off -= off2;
1015
	}
1016
 
1017
	for (int i = 0; i < intr->num_components; i++) {
1018
		struct ir3_instruction *load =
1019
				ir3_LDG(b, addr, 0, create_immed(b, 1), 0);
1020
		load->cat6.type = TYPE_U32;
1021
		load->cat6.offset = off + i * 4;    /* byte offset */
1022
		dst[i] = load;
1023
	}
1024
}
1025
 
1026
/* handles array reads: */
1027
static void
1028
emit_intrinisic_load_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr,
1029
		struct ir3_instruction **dst)
1030
{
1031
	nir_deref_var *dvar = intr->variables[0];
1032
	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
1033
	struct ir3_array *arr = get_var(ctx, dvar->var);
1034
 
1035
	compile_assert(ctx, dvar->deref.child &&
1036
		(dvar->deref.child->deref_type == nir_deref_type_array));
1037
 
1038
	switch (darr->deref_array_type) {
1039
	case nir_deref_array_type_direct:
1040
		/* direct access does not require anything special: */
1041
		for (int i = 0; i < intr->num_components; i++) {
1042
			unsigned n = darr->base_offset * 4 + i;
1043
			compile_assert(ctx, n < arr->length);
1044
			dst[i] = arr->arr[n];
1045
		}
1046
		break;
1047
	case nir_deref_array_type_indirect: {
1048
		/* for indirect, we need to collect all the array elements: */
1049
		struct ir3_instruction *collect =
1050
				create_collect(ctx->block, arr->arr, arr->length);
1051
		struct ir3_instruction *addr =
1052
				get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
1053
		for (int i = 0; i < intr->num_components; i++) {
1054
			unsigned n = darr->base_offset * 4 + i;
1055
			compile_assert(ctx, n < arr->length);
1056
			dst[i] = create_indirect_load(ctx, arr->length, n, addr, collect);
1057
		}
1058
		break;
1059
	}
1060
	default:
1061
		compile_error(ctx, "Unhandled load deref type: %u\n",
1062
				darr->deref_array_type);
1063
		break;
1064
	}
1065
}
1066
 
1067
/* handles array writes: */
1068
static void
1069
emit_intrinisic_store_var(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
1070
{
1071
	nir_deref_var *dvar = intr->variables[0];
1072
	nir_deref_array *darr = nir_deref_as_array(dvar->deref.child);
1073
	struct ir3_array *arr = get_var(ctx, dvar->var);
1074
	struct ir3_instruction **src;
1075
 
1076
	compile_assert(ctx, dvar->deref.child &&
1077
		(dvar->deref.child->deref_type == nir_deref_type_array));
1078
 
1079
	src = get_src(ctx, &intr->src[0]);
1080
 
1081
	switch (darr->deref_array_type) {
1082
	case nir_deref_array_type_direct:
1083
		/* direct access does not require anything special: */
1084
		for (int i = 0; i < intr->num_components; i++) {
1085
			unsigned n = darr->base_offset * 4 + i;
1086
			compile_assert(ctx, n < arr->length);
1087
			arr->arr[n] = src[i];
1088
		}
1089
		break;
1090
	case nir_deref_array_type_indirect: {
1091
		/* for indirect, create indirect-store and fan that out: */
1092
		struct ir3_instruction *collect =
1093
				create_collect(ctx->block, arr->arr, arr->length);
1094
		struct ir3_instruction *addr =
1095
				get_addr(ctx, get_src(ctx, &darr->indirect)[0]);
1096
		for (int i = 0; i < intr->num_components; i++) {
1097
			struct ir3_instruction *store;
1098
			unsigned n = darr->base_offset * 4 + i;
1099
			compile_assert(ctx, n < arr->length);
1100
 
1101
			store = create_indirect_store(ctx, arr->length,
1102
					n, src[i], addr, collect);
1103
 
1104
			store->fanin->fi.aid = arr->aid;
1105
 
1106
			/* TODO: probably split this out to be used for
1107
			 * store_output_indirect? or move this into
1108
			 * create_indirect_store()?
1109
			 */
1110
			for (int j = i; j < arr->length; j += 4) {
1111
				struct ir3_instruction *split;
1112
 
1113
				split = ir3_instr_create(ctx->block, -1, OPC_META_FO);
1114
				split->fo.off = j;
1115
				ir3_reg_create(split, 0, 0);
1116
				ir3_reg_create(split, 0, IR3_REG_SSA)->instr = store;
1117
 
1118
				arr->arr[j] = split;
1119
			}
1120
		}
1121
		break;
1122
	}
1123
	default:
1124
		compile_error(ctx, "Unhandled store deref type: %u\n",
1125
				darr->deref_array_type);
1126
		break;
1127
	}
1128
}
1129
 
1130
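/* append a synthetic input to the shader variant for a system value
 * (base-vertex, vertex-id, instance-id, ..) and wire it up as a
 * block-level input:
 */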
static void add_sysval_input(struct ir3_compile *ctx, unsigned name,
1131
		struct ir3_instruction *instr)
1132
{
1133
	struct ir3_shader_variant *so = ctx->so;
1134
	unsigned r = regid(so->inputs_count, 0);
1135
	unsigned n = so->inputs_count++;
1136
 
1137
	so->inputs[n].semantic = ir3_semantic_name(name, 0);
1138
	so->inputs[n].compmask = 1;
1139
	so->inputs[n].regid = r;
1140
	so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT;
1141
	so->total_in++;
1142
 
1143
	ctx->block->ninputs = MAX2(ctx->block->ninputs, r + 1);
1144
	ctx->block->inputs[r] = instr;
1145
}
1146
 
1147
static void
1148
emit_intrinisic(struct ir3_compile *ctx, nir_intrinsic_instr *intr)
1149
{
1150
	const nir_intrinsic_info *info = &nir_intrinsic_infos[intr->intrinsic];
1151
	struct ir3_instruction **dst, **src;
1152
	struct ir3_block *b = ctx->block;
1153
	unsigned idx = intr->const_index[0];
1154
 
1155
	if (info->has_dest) {
1156
		dst = get_dst(ctx, &intr->dest, intr->num_components);
1157
	}
1158
 
1159
	switch (intr->intrinsic) {
1160
	case nir_intrinsic_load_uniform:
1161
		compile_assert(ctx, intr->const_index[1] == 1);
1162
		for (int i = 0; i < intr->num_components; i++) {
1163
			unsigned n = idx * 4 + i;
1164
			dst[i] = create_uniform(ctx, n);
1165
		}
1166
		break;
1167
	case nir_intrinsic_load_uniform_indirect:
1168
		compile_assert(ctx, intr->const_index[1] == 1);
1169
		src = get_src(ctx, &intr->src[0]);
1170
		for (int i = 0; i < intr->num_components; i++) {
1171
			unsigned n = idx * 4 + i;
1172
			dst[i] = create_uniform_indirect(ctx, n,
1173
					get_addr(ctx, src[0]));
1174
		}
1175
		break;
1176
	case nir_intrinsic_load_ubo:
1177
	case nir_intrinsic_load_ubo_indirect:
1178
		emit_intrinsic_load_ubo(ctx, intr, dst);
1179
		break;
1180
	case nir_intrinsic_load_input:
1181
		compile_assert(ctx, intr->const_index[1] == 1);
1182
		for (int i = 0; i < intr->num_components; i++) {
1183
			unsigned n = idx * 4 + i;
1184
			dst[i] = b->inputs[n];
1185
		}
1186
		break;
1187
	case nir_intrinsic_load_input_indirect:
1188
		compile_assert(ctx, intr->const_index[1] == 1);
1189
		src = get_src(ctx, &intr->src[0]);
1190
		struct ir3_instruction *collect =
1191
				create_collect(b, b->inputs, b->ninputs);
1192
		struct ir3_instruction *addr = get_addr(ctx, src[0]);
1193
		for (int i = 0; i < intr->num_components; i++) {
1194
			unsigned n = idx * 4 + i;
1195
			dst[i] = create_indirect_load(ctx, b->ninputs, n, addr, collect);
1196
		}
1197
		break;
1198
	case nir_intrinsic_load_var:
1199
		emit_intrinisic_load_var(ctx, intr, dst);
1200
		break;
1201
	case nir_intrinsic_store_var:
1202
		emit_intrinisic_store_var(ctx, intr);
1203
		break;
1204
	case nir_intrinsic_store_output:
1205
		compile_assert(ctx, intr->const_index[1] == 1);
1206
		src = get_src(ctx, &intr->src[0]);
1207
		for (int i = 0; i < intr->num_components; i++) {
1208
			unsigned n = idx * 4 + i;
1209
			b->outputs[n] = src[i];
1210
		}
1211
		break;
1212
	case nir_intrinsic_load_base_vertex:
1213
		if (!ctx->basevertex) {
1214
			/* first four vec4 sysval's reserved for UBOs: */
1215
			unsigned r = regid(ctx->so->first_driver_param + 4, 0);
1216
			ctx->basevertex = create_uniform(ctx, r);
1217
			add_sysval_input(ctx, TGSI_SEMANTIC_BASEVERTEX,
1218
					ctx->basevertex);
1219
		}
1220
		dst[0] = ctx->basevertex;
1221
		break;
1222
	case nir_intrinsic_load_vertex_id_zero_base:
1223
		if (!ctx->vertex_id) {
1224
			ctx->vertex_id = create_input(ctx->block, NULL, 0);
1225
			add_sysval_input(ctx, TGSI_SEMANTIC_VERTEXID_NOBASE,
1226
					ctx->vertex_id);
1227
		}
1228
		dst[0] = ctx->vertex_id;
1229
		break;
1230
	case nir_intrinsic_load_instance_id:
1231
		if (!ctx->instance_id) {
1232
			ctx->instance_id = create_input(ctx->block, NULL, 0);
1233
			add_sysval_input(ctx, TGSI_SEMANTIC_INSTANCEID,
1234
					ctx->instance_id);
1235
		}
1236
		dst[0] = ctx->instance_id;
1237
		break;
1238
	case nir_intrinsic_discard_if:
1239
	case nir_intrinsic_discard: {
1240
		struct ir3_instruction *cond, *kill;
1241
 
1242
		if (intr->intrinsic == nir_intrinsic_discard_if) {
1243
			/* conditional discard: */
1244
			src = get_src(ctx, &intr->src[0]);
1245
			cond = ir3_b2n(b, src[0]);
1246
		} else {
1247
			/* unconditional discard: */
1248
			cond = create_immed(b, 1);
1249
		}
1250
 
1251
		cond = ir3_CMPS_S(b, cond, 0, create_immed(b, 0), 0);
1252
		cond->cat2.condition = IR3_COND_NE;
1253
 
1254
		/* condition always goes in predicate register: */
1255
		cond->regs[0]->num = regid(REG_P0, 0);
1256
 
1257
		kill = ir3_KILL(b, cond, 0);
1258
 
1259
		ctx->kill[ctx->kill_count++] = kill;
1260
		ctx->so->has_kill = true;
1261
 
1262
		break;
1263
	}
1264
	default:
1265
		compile_error(ctx, "Unhandled intrinsic type: %s\n",
1266
				nir_intrinsic_infos[intr->intrinsic].name);
1267
		break;
1268
	}
1269
}
1270
 
1271
static void
1272
emit_load_const(struct ir3_compile *ctx, nir_load_const_instr *instr)
1273
{
1274
	struct ir3_instruction **dst = get_dst_ssa(ctx, &instr->def,
1275
			instr->def.num_components);
1276
	for (int i = 0; i < instr->def.num_components; i++)
1277
		dst[i] = create_immed(ctx->block, instr->value.u[i]);
1278
}
1279
 
1280
static void
1281
emit_undef(struct ir3_compile *ctx, nir_ssa_undef_instr *undef)
1282
{
1283
	struct ir3_instruction **dst = get_dst_ssa(ctx, &undef->def,
1284
			undef->def.num_components);
1285
	/* backend doesn't want undefined instructions, so just plug
1286
	 * in 0.0..
1287
	 */
1288
	for (int i = 0; i < undef->def.num_components; i++)
1289
		dst[i] = create_immed(ctx->block, fui(0.0));
1290
}
1291
 
1292
/*
1293
 * texture fetch/sample instructions:
1294
 */
1295
 
1296
static void
1297
tex_info(nir_tex_instr *tex, unsigned *flagsp, unsigned *coordsp)
1298
{
1299
	unsigned coords, flags = 0;
1300
 
1301
	/* note: would use tex->coord_components.. except txs.. also,
1302
	 * since array index goes after shadow ref, we don't want to
1303
	 * count it:
1304
	 */
1305
	switch (tex->sampler_dim) {
1306
	case GLSL_SAMPLER_DIM_1D:
1307
	case GLSL_SAMPLER_DIM_BUF:
1308
		coords = 1;
1309
		break;
1310
	case GLSL_SAMPLER_DIM_2D:
1311
	case GLSL_SAMPLER_DIM_RECT:
1312
	case GLSL_SAMPLER_DIM_EXTERNAL:
1313
	case GLSL_SAMPLER_DIM_MS:
1314
		coords = 2;
1315
		break;
1316
	case GLSL_SAMPLER_DIM_3D:
1317
	case GLSL_SAMPLER_DIM_CUBE:
1318
		coords = 3;
1319
		flags |= IR3_INSTR_3D;
1320
		break;
1321
	}
1322
 
1323
	if (tex->is_shadow)
1324
		flags |= IR3_INSTR_S;
1325
 
1326
	if (tex->is_array)
1327
		flags |= IR3_INSTR_A;
1328
 
1329
	*flagsp = flags;
1330
	*coordsp = coords;
1331
}
1332
 
1333
static void
1334
emit_tex(struct ir3_compile *ctx, nir_tex_instr *tex)
1335
{
1336
	struct ir3_block *b = ctx->block;
1337
	struct ir3_instruction **dst, *sam, *src0[12], *src1[4];
1338
	struct ir3_instruction **coord, *lod, *compare, *proj, **off, **ddx, **ddy;
1339
	bool has_bias = false, has_lod = false, has_proj = false, has_off = false;
1340
	unsigned i, coords, flags;
1341
	unsigned nsrc0 = 0, nsrc1 = 0;
1342
	type_t type;
1343
	opc_t opc;
1344
 
1345
	/* TODO: might just be one component for gathers? */
1346
	dst = get_dst(ctx, &tex->dest, 4);
1347
 
1348
	for (unsigned i = 0; i < tex->num_srcs; i++) {
1349
		switch (tex->src[i].src_type) {
1350
		case nir_tex_src_coord:
1351
			coord = get_src(ctx, &tex->src[i].src);
1352
			break;
1353
		case nir_tex_src_bias:
1354
			lod = get_src(ctx, &tex->src[i].src)[0];
1355
			has_bias = true;
1356
			break;
1357
		case nir_tex_src_lod:
1358
			lod = get_src(ctx, &tex->src[i].src)[0];
1359
			has_lod = true;
1360
			break;
1361
		case nir_tex_src_comparitor: /* shadow comparator */
1362
			compare = get_src(ctx, &tex->src[i].src)[0];
1363
			break;
1364
		case nir_tex_src_projector:
1365
			proj = get_src(ctx, &tex->src[i].src)[0];
1366
			has_proj = true;
1367
			break;
1368
		case nir_tex_src_offset:
1369
			off = get_src(ctx, &tex->src[i].src);
1370
			has_off = true;
1371
			break;
1372
		case nir_tex_src_ddx:
1373
			ddx = get_src(ctx, &tex->src[i].src);
1374
			break;
1375
		case nir_tex_src_ddy:
1376
			ddy = get_src(ctx, &tex->src[i].src);
1377
			break;
1378
		default:
1379
			compile_error(ctx, "Unhandled NIR tex src type: %d\n",
1380
					tex->src[i].src_type);
1381
			return;
1382
		}
1383
	}
1384
 
1385
	switch (tex->op) {
1386
	case nir_texop_tex:      opc = OPC_SAM;      break;
1387
	case nir_texop_txb:      opc = OPC_SAMB;     break;
1388
	case nir_texop_txl:      opc = OPC_SAML;     break;
1389
	case nir_texop_txd:      opc = OPC_SAMGQ;    break;
1390
	case nir_texop_txf:      opc = OPC_ISAML;    break;
1391
	case nir_texop_txf_ms:
1392
	case nir_texop_txs:
1393
	case nir_texop_lod:
1394
	case nir_texop_tg4:
1395
	case nir_texop_query_levels:
1396
		compile_error(ctx, "Unhandled NIR tex type: %d\n", tex->op);
1397
		return;
1398
	}
1399
 
1400
	tex_info(tex, &flags, &coords);
1401
 
1402
	/* scale up integer coords for TXF based on the LOD */
1403
	if (opc == OPC_ISAML) {
1404
		assert(has_lod);
1405
		for (i = 0; i < coords; i++)
1406
			coord[i] = ir3_SHL_B(b, coord[i], 0, lod, 0);
1407
	}
1408
	/*
1409
	 * lay out the first argument in the proper order:
1410
	 *  - actual coordinates first
1411
	 *  - shadow reference
1412
	 *  - array index
1413
	 *  - projection w
1414
	 *  - starting at offset 4, dpdx.xy, dpdy.xy
1415
	 *
1416
	 * bias/lod go into the second arg
1417
	 */
1418
 
1419
	/* insert tex coords: */
1420
	for (i = 0; i < coords; i++)
1421
		src0[nsrc0++] = coord[i];
1422
 
1423
	if (coords == 1) {
1424
		/* hw doesn't do 1d, so we treat it as 2d with
1425
		 * height of 1, and patch up the y coord.
1426
		 * TODO: y coord should be (int)0 in some cases..
1427
		 */
1428
		src0[nsrc0++] = create_immed(b, fui(0.5));
1429
	}
1430
 
1431
	if (tex->is_shadow)
1432
		src0[nsrc0++] = compare;
1433
 
1434
	if (tex->is_array)
1435
		src0[nsrc0++] = coord[coords];
1436
 
1437
	if (has_proj) {
1438
		src0[nsrc0++] = proj;
1439
		flags |= IR3_INSTR_P;
1440
	}
1441
 
1442
	/* pad to 4, then ddx/ddy: */
1443
	if (tex->op == nir_texop_txd) {
1444
		while (nsrc0 < 4)
1445
			src0[nsrc0++] = create_immed(b, fui(0.0));
1446
		for (i = 0; i < coords; i++)
1447
			src0[nsrc0++] = ddx[i];
1448
		if (coords < 2)
1449
			src0[nsrc0++] = create_immed(b, fui(0.0));
1450
		for (i = 0; i < coords; i++)
1451
			src0[nsrc0++] = ddy[i];
1452
		if (coords < 2)
1453
			src0[nsrc0++] = create_immed(b, fui(0.0));
1454
	}
1455
 
1456
	/*
1457
	 * second argument (if applicable):
1458
	 *  - offsets
1459
	 *  - lod
1460
	 *  - bias
1461
	 */
1462
	if (has_off | has_lod | has_bias) {
1463
		if (has_off) {
1464
			for (i = 0; i < coords; i++)
1465
				src1[nsrc1++] = off[i];
1466
			if (coords < 2)
1467
				src1[nsrc1++] = create_immed(b, fui(0.0));
1468
			flags |= IR3_INSTR_O;
1469
		}
1470
 
1471
		if (has_lod | has_bias)
1472
			src1[nsrc1++] = lod;
1473
	}
1474
 
1475
	switch (tex->dest_type) {
1476
	case nir_type_invalid:
1477
	case nir_type_float:
1478
		type = TYPE_F32;
1479
		break;
1480
	case nir_type_int:
1481
		type = TYPE_S32;
1482
		break;
1483
	case nir_type_unsigned:
1484
	case nir_type_bool:
1485
		type = TYPE_U32;
1486
		break;
1487
	}
1488
 
1489
	sam = ir3_SAM(b, opc, type, TGSI_WRITEMASK_XYZW,
1490
			flags, tex->sampler_index, tex->sampler_index,
1491
			create_collect(b, src0, nsrc0),
1492
			create_collect(b, src1, nsrc1));
1493
 
1494
	split_dest(b, dst, sam);
1495
}
1496
 
1497
static void
1498
emit_tex_query_levels(struct ir3_compile *ctx, nir_tex_instr *tex)
1499
{
1500
	struct ir3_block *b = ctx->block;
1501
	struct ir3_instruction **dst, *sam;
1502
 
1503
	dst = get_dst(ctx, &tex->dest, 1);
1504
 
1505
	sam = ir3_SAM(b, OPC_GETINFO, TYPE_U32, TGSI_WRITEMASK_Z, 0,
1506
			tex->sampler_index, tex->sampler_index, NULL, NULL);
1507
 
1508
	/* even though there is only one component, since it ends
1509
	 * up in .z rather than .x, we need a split_dest()
1510
	 */
1511
	split_dest(b, dst, sam);
1512
 
1513
	/* The # of levels comes from getinfo.z. We need to add 1 to it, since
1514
	 * the value in TEX_CONST_0 is zero-based.
1515
	 */
1516
	if (ctx->levels_add_one)
1517
		dst[0] = ir3_ADD_U(b, dst[0], 0, create_immed(b, 1), 0);
1518
}
1519
 
1520
static void
1521
emit_tex_txs(struct ir3_compile *ctx, nir_tex_instr *tex)
1522
{
1523
	struct ir3_block *b = ctx->block;
1524
	struct ir3_instruction **dst, *sam, *lod;
1525
	unsigned flags, coords;
1526
 
1527
	tex_info(tex, &flags, &coords);
1528
 
1529
	dst = get_dst(ctx, &tex->dest, 4);
1530
 
1531
	compile_assert(ctx, tex->num_srcs == 1);
1532
	compile_assert(ctx, tex->src[0].src_type == nir_tex_src_lod);
1533
 
1534
	lod = get_src(ctx, &tex->src[0].src)[0];
1535
 
1536
	sam = ir3_SAM(b, OPC_GETSIZE, TYPE_U32, TGSI_WRITEMASK_XYZW, flags,
1537
			tex->sampler_index, tex->sampler_index, lod, NULL);
1538
 
1539
	split_dest(b, dst, sam);
1540
 
1541
	/* Array size actually ends up in .w rather than .z. This doesn't
1542
	 * matter for miplevel 0, but for higher mips the value in z is
1543
	 * minified whereas w stays. Also, the value in TEX_CONST_3_DEPTH is
1544
	 * returned, which means that we have to add 1 to it for arrays.
1545
	 */
1546
	if (tex->is_array) {
1547
		if (ctx->levels_add_one) {
1548
			dst[coords] = ir3_ADD_U(b, dst[3], 0, create_immed(b, 1), 0);
1549
		} else {
1550
			dst[coords] = ir3_MOV(b, dst[3], TYPE_U32);
1551
		}
1552
	}
1553
}
1554
 
1555
static void
1556
emit_instr(struct ir3_compile *ctx, nir_instr *instr)
1557
{
1558
	switch (instr->type) {
1559
	case nir_instr_type_alu:
1560
		emit_alu(ctx, nir_instr_as_alu(instr));
1561
		break;
1562
	case nir_instr_type_intrinsic:
1563
		emit_intrinisic(ctx, nir_instr_as_intrinsic(instr));
1564
		break;
1565
	case nir_instr_type_load_const:
1566
		emit_load_const(ctx, nir_instr_as_load_const(instr));
1567
		break;
1568
	case nir_instr_type_ssa_undef:
1569
		emit_undef(ctx, nir_instr_as_ssa_undef(instr));
1570
		break;
1571
	case nir_instr_type_tex: {
1572
		nir_tex_instr *tex = nir_instr_as_tex(instr);
1573
		/* couple tex instructions get special-cased:
1574
		 */
1575
		switch (tex->op) {
1576
		case nir_texop_txs:
1577
			emit_tex_txs(ctx, tex);
1578
			break;
1579
		case nir_texop_query_levels:
1580
			emit_tex_query_levels(ctx, tex);
1581
			break;
1582
		default:
1583
			emit_tex(ctx, tex);
1584
			break;
1585
		}
1586
		break;
1587
	}
1588
	case nir_instr_type_call:
1589
	case nir_instr_type_jump:
1590
	case nir_instr_type_phi:
1591
	case nir_instr_type_parallel_copy:
1592
		compile_error(ctx, "Unhandled NIR instruction type: %d\n", instr->type);
1593
		break;
1594
	}
1595
}
1596
 
1597
static void
1598
emit_block(struct ir3_compile *ctx, nir_block *block)
1599
{
1600
	nir_foreach_instr(block, instr) {
1601
		emit_instr(ctx, instr);
1602
		if (ctx->error)
1603
			return;
1604
	}
1605
}
1606
 
1607
static void
1608
emit_function(struct ir3_compile *ctx, nir_function_impl *impl)
1609
{
1610
	foreach_list_typed(nir_cf_node, node, node, &impl->body) {
1611
		switch (node->type) {
1612
		case nir_cf_node_block:
1613
			emit_block(ctx, nir_cf_node_as_block(node));
1614
			break;
1615
		case nir_cf_node_if:
1616
		case nir_cf_node_loop:
1617
		case nir_cf_node_function:
1618
			compile_error(ctx, "TODO\n");
1619
			break;
1620
		}
1621
		if (ctx->error)
1622
			return;
1623
	}
1624
}
1625
 
1626
static void
1627
setup_input(struct ir3_compile *ctx, nir_variable *in)
1628
{
1629
	struct ir3_shader_variant *so = ctx->so;
1630
	unsigned array_len = MAX2(glsl_get_length(in->type), 1);
1631
	unsigned ncomp = glsl_get_components(in->type);
1632
	/* XXX: map loc slots to semantics */
1633
	unsigned semantic_name = in->data.location;
1634
	unsigned semantic_index = in->data.index;
1635
	unsigned n = in->data.driver_location;
1636
 
1637
	DBG("; in: %u:%u, len=%ux%u, loc=%u\n",
1638
			semantic_name, semantic_index, array_len,
1639
			ncomp, n);
1640
 
1641
	so->inputs[n].semantic =
1642
			ir3_semantic_name(semantic_name, semantic_index);
1643
	so->inputs[n].compmask = (1 << ncomp) - 1;
1644
	so->inputs[n].inloc = ctx->next_inloc;
1645
	so->inputs[n].interpolate = 0;
1646
	so->inputs_count = MAX2(so->inputs_count, n + 1);
1647
 
1648
	/* the fdN_program_emit() code expects tgsi consts here, so map
1649
	 * things back to tgsi for now:
1650
	 */
1651
	switch (in->data.interpolation) {
1652
	case INTERP_QUALIFIER_FLAT:
1653
		so->inputs[n].interpolate = TGSI_INTERPOLATE_CONSTANT;
1654
		break;
1655
	case INTERP_QUALIFIER_NOPERSPECTIVE:
1656
		so->inputs[n].interpolate = TGSI_INTERPOLATE_LINEAR;
1657
		break;
1658
	case INTERP_QUALIFIER_SMOOTH:
1659
		so->inputs[n].interpolate = TGSI_INTERPOLATE_PERSPECTIVE;
1660
		break;
1661
	}
1662
 
1663
	for (int i = 0; i < ncomp; i++) {
1664
		struct ir3_instruction *instr = NULL;
1665
		unsigned idx = (n * 4) + i;
1666
 
1667
		if (ctx->so->type == SHADER_FRAGMENT) {
1668
			if (semantic_name == TGSI_SEMANTIC_POSITION) {
1669
				so->inputs[n].bary = false;
1670
				so->frag_coord = true;
1671
				instr = create_frag_coord(ctx, i);
1672
			} else if (semantic_name == TGSI_SEMANTIC_FACE) {
1673
				so->inputs[n].bary = false;
1674
				so->frag_face = true;
1675
				instr = create_frag_face(ctx, i);
1676
			} else {
1677
				bool use_ldlv = false;
1678
 
1679
				/* with NIR, we need to infer TGSI_INTERPOLATE_COLOR
1680
				 * from the semantic name:
1681
				 */
1682
				if ((in->data.interpolation == INTERP_QUALIFIER_NONE) &&
1683
						((semantic_name == TGSI_SEMANTIC_COLOR) ||
1684
							(semantic_name == TGSI_SEMANTIC_BCOLOR)))
1685
					so->inputs[n].interpolate = TGSI_INTERPOLATE_COLOR;
1686
 
1687
				if (ctx->flat_bypass) {
1688
					/* with NIR, we need to infer TGSI_INTERPOLATE_COLOR
1689
					 * from the semantic name:
1690
					 */
1691
					switch (so->inputs[n].interpolate) {
1692
					case TGSI_INTERPOLATE_COLOR:
1693
						if (!ctx->so->key.rasterflat)
1694
							break;
1695
						/* fallthrough */
1696
					case TGSI_INTERPOLATE_CONSTANT:
1697
						use_ldlv = true;
1698
						break;
1699
					}
1700
				}
1701
 
1702
				so->inputs[n].bary = true;
1703
 
1704
				instr = create_frag_input(ctx,
1705
						so->inputs[n].inloc + i - 8, use_ldlv);
1706
			}
1707
		} else {
1708
			instr = create_input(ctx->block, NULL, idx);
1709
		}
1710
 
1711
		ctx->block->inputs[idx] = instr;
1712
	}
1713
 
1714
	if (so->inputs[n].bary || (ctx->so->type == SHADER_VERTEX)) {
1715
		ctx->next_inloc += ncomp;
1716
		so->total_in += ncomp;
1717
	}
1718
}
1719
 
1720
static void
1721
setup_output(struct ir3_compile *ctx, nir_variable *out)
1722
{
1723
	struct ir3_shader_variant *so = ctx->so;
1724
	unsigned array_len = MAX2(glsl_get_length(out->type), 1);
1725
	unsigned ncomp = glsl_get_components(out->type);
1726
	/* XXX: map loc slots to semantics */
1727
	unsigned semantic_name = out->data.location;
1728
	unsigned semantic_index = out->data.index;
1729
	unsigned n = out->data.driver_location;
1730
	unsigned comp = 0;
1731
 
1732
	DBG("; out: %u:%u, len=%ux%u, loc=%u\n",
1733
			semantic_name, semantic_index, array_len,
1734
			ncomp, n);
1735
 
1736
	if (ctx->so->type == SHADER_VERTEX) {
1737
		switch (semantic_name) {
1738
		case TGSI_SEMANTIC_POSITION:
1739
			so->writes_pos = true;
1740
			break;
1741
		case TGSI_SEMANTIC_PSIZE:
1742
			so->writes_psize = true;
1743
			break;
1744
		case TGSI_SEMANTIC_COLOR:
1745
		case TGSI_SEMANTIC_BCOLOR:
1746
		case TGSI_SEMANTIC_GENERIC:
1747
		case TGSI_SEMANTIC_FOG:
1748
		case TGSI_SEMANTIC_TEXCOORD:
1749
			break;
1750
		default:
1751
			compile_error(ctx, "unknown VS semantic name: %s\n",
1752
					tgsi_semantic_names[semantic_name]);
1753
		}
1754
	} else {
1755
		switch (semantic_name) {
1756
		case TGSI_SEMANTIC_POSITION:
1757
			comp = 2;  /* tgsi will write to .z component */
1758
			so->writes_pos = true;
1759
			break;
1760
		case TGSI_SEMANTIC_COLOR:
1761
			break;
1762
		default:
1763
			compile_error(ctx, "unknown FS semantic name: %s\n",
1764
					tgsi_semantic_names[semantic_name]);
1765
		}
1766
	}
1767
 
1768
	compile_assert(ctx, n < ARRAY_SIZE(so->outputs));
1769
 
1770
	so->outputs[n].semantic =
1771
			ir3_semantic_name(semantic_name, semantic_index);
1772
	so->outputs[n].regid = regid(n, comp);
1773
	so->outputs_count = MAX2(so->outputs_count, n + 1);
1774
 
1775
	for (int i = 0; i < ncomp; i++) {
1776
		unsigned idx = (n * 4) + i;
1777
 
1778
		ctx->block->outputs[idx] = create_immed(ctx->block, fui(0.0));
1779
	}
1780
}
1781
 
static void
emit_instructions(struct ir3_compile *ctx)
{
	unsigned ninputs  = exec_list_length(&ctx->s->inputs) * 4;
	unsigned noutputs = exec_list_length(&ctx->s->outputs) * 4;

	/* we need to allocate a big enough outputs array so that
	 * we can stuff the kills at the end.  Likewise for vtx
	 * shaders, we need to leave room for sysvals:
	 */
	if (ctx->so->type == SHADER_FRAGMENT) {
		noutputs += ARRAY_SIZE(ctx->kill);
	} else if (ctx->so->type == SHADER_VERTEX) {
		ninputs += 8;
	}

	ctx->block = ir3_block_create(ctx->ir, 0, ninputs, noutputs);

	if (ctx->so->type == SHADER_FRAGMENT) {
		ctx->block->noutputs -= ARRAY_SIZE(ctx->kill);
	} else if (ctx->so->type == SHADER_VERTEX) {
		ctx->block->ninputs -= 8;
	}

	/* for fragment shaders, we have a single input register (usually
	 * r0.xy) which is used as the base for bary.f varying fetch instrs:
	 */
	if (ctx->so->type == SHADER_FRAGMENT) {
		// TODO maybe a helper for fi since we need it a few places..
		struct ir3_instruction *instr;
		instr = ir3_instr_create(ctx->block, -1, OPC_META_FI);
		ir3_reg_create(instr, 0, 0);
		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.x */
		ir3_reg_create(instr, 0, IR3_REG_SSA);    /* r0.y */
		ctx->frag_pos = instr;
	}

	/* Setup inputs: */
	foreach_list_typed(nir_variable, var, node, &ctx->s->inputs) {
		setup_input(ctx, var);
	}

	/* Setup outputs: */
	foreach_list_typed(nir_variable, var, node, &ctx->s->outputs) {
		setup_output(ctx, var);
	}

	/* Setup variables (which should only be arrays): */
	foreach_list_typed(nir_variable, var, node, &ctx->s->globals) {
		declare_var(ctx, var);
	}

	/* Find the main function and emit its body: */
	nir_foreach_overload(ctx->s, overload) {
		compile_assert(ctx, strcmp(overload->function->name, "main") == 0);
		compile_assert(ctx, overload->impl);
		emit_function(ctx, overload->impl);
		if (ctx->error)
			return;
	}
}
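/* Sizing arithmetic from the top of emit_instructions(), as a quick
 * illustration (variable counts invented; the 8 reserved vertex slots
 * and ARRAY_SIZE(ctx->kill) padding are taken from the code above):
 *
 *   // vertex shader with 3 input and 2 output variables:
 *   //   ninputs  = 3 * 4 + 8 = 20   // vec4 per variable + sysval room
 *   //   noutputs = 2 * 4     = 8
 *   // ir3_block_create() sees the padded sizes; ninputs is then trimmed
 *   // back to 12 so the reserved slots stay invisible until used.
 */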
 
/* from NIR perspective, we actually have inputs.  But most of the "inputs"
 * for a fragment shader are just bary.f instructions.  The *actual* inputs
 * from the hw perspective are the frag_pos and optionally frag_coord and
 * frag_face.
 */
static void
fixup_frag_inputs(struct ir3_compile *ctx)
{
	struct ir3_shader_variant *so = ctx->so;
	struct ir3_block *block = ctx->block;
	struct ir3_instruction **inputs;
	struct ir3_instruction *instr;
	int n, regid = 0;

	block->ninputs = 0;

	n  = 4;  /* always have frag_pos */
	n += COND(so->frag_face, 4);
	n += COND(so->frag_coord, 4);

	inputs = ir3_alloc(ctx->ir, n * (sizeof(struct ir3_instruction *)));

	if (so->frag_face) {
		/* this ultimately gets assigned to hr0.x so doesn't conflict
		 * with frag_coord/frag_pos..
		 */
		inputs[block->ninputs++] = ctx->frag_face;
		ctx->frag_face->regs[0]->num = 0;

		/* remaining channels not used, but let's avoid confusing
		 * other parts that expect inputs to come in groups of vec4
		 */
		inputs[block->ninputs++] = NULL;
		inputs[block->ninputs++] = NULL;
		inputs[block->ninputs++] = NULL;
	}

	/* since we don't know where to set the regid for frag_coord,
	 * we have to use r0.x for it.  But we don't want to *always*
	 * use r1.x for frag_pos as that could increase the register
	 * footprint on simple shaders:
	 */
	if (so->frag_coord) {
		ctx->frag_coord[0]->regs[0]->num = regid++;
		ctx->frag_coord[1]->regs[0]->num = regid++;
		ctx->frag_coord[2]->regs[0]->num = regid++;
		ctx->frag_coord[3]->regs[0]->num = regid++;

		inputs[block->ninputs++] = ctx->frag_coord[0];
		inputs[block->ninputs++] = ctx->frag_coord[1];
		inputs[block->ninputs++] = ctx->frag_coord[2];
		inputs[block->ninputs++] = ctx->frag_coord[3];
	}

	/* we always have frag_pos: */
	so->pos_regid = regid;

	/* r0.x */
	instr = create_input(block, NULL, block->ninputs);
	instr->regs[0]->num = regid++;
	inputs[block->ninputs++] = instr;
	ctx->frag_pos->regs[1]->instr = instr;

	/* r0.y */
	instr = create_input(block, NULL, block->ninputs);
	instr->regs[0]->num = regid++;
	inputs[block->ninputs++] = instr;
	ctx->frag_pos->regs[2]->instr = instr;

	block->inputs = inputs;
}
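/* Resulting hw-side input layout, sketched for a shader using both
 * frag_face and frag_coord (the r0.x = regid 0, r0.y = regid 1, ...
 * mapping is assumed from the comments above, not restated by the code):
 *
 *   frag_face            -> hr0.x  (own vec4 group, padded with 3 NULLs)
 *   frag_coord[0..3]     -> r0.x, r0.y, r0.z, r0.w
 *   frag_pos (bary base) -> r1.x, r1.y   with so->pos_regid = 4
 *
 * With neither in use, frag_pos lands at r0.x/r0.y and pos_regid is 0,
 * which keeps the register footprint small on simple shaders.
 */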
 
static void
compile_dump(struct ir3_compile *ctx)
{
	const char *name = (ctx->so->type == SHADER_VERTEX) ? "vert" : "frag";
	static unsigned n = 0;
	char fname[16];
	FILE *f;
	snprintf(fname, sizeof(fname), "%s-%04u.dot", name, n++);
	f = fopen(fname, "w");
	if (!f)
		return;
	ir3_block_depth(ctx->block);
	ir3_dump(ctx->ir, name, ctx->block, f);
	fclose(f);
}
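/* Usage note: with FD_DBG_OPTDUMP set this writes files named like
 * vert-0000.dot or frag-0001.dot into the current directory, one per
 * dump.  They are plain graphviz files, so (assuming graphviz is
 * installed) something like
 *
 *   dot -Tpng frag-0001.dot -o frag-0001.png
 *
 * renders them into viewable graphs.
 */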
 
int
ir3_compile_shader_nir(struct ir3_shader_variant *so,
		const struct tgsi_token *tokens, struct ir3_shader_key key)
{
	struct ir3_compile *ctx;
	struct ir3_block *block;
	struct ir3_instruction **inputs;
	unsigned i, j, actual_in;
	int ret = 0, max_bary;

	assert(!so->ir);

	so->ir = ir3_create();

	assert(so->ir);

	ctx = compile_init(so, tokens);
	if (!ctx) {
		DBG("INIT failed!");
		ret = -1;
		goto out;
	}

	emit_instructions(ctx);

	if (ctx->error) {
		DBG("EMIT failed!");
		ret = -1;
		goto out;
	}

	block = ctx->block;
	so->ir->block = block;

	/* keep track of the inputs from TGSI perspective.. */
	inputs = block->inputs;

	/* but fixup actual inputs for frag shader: */
	if (so->type == SHADER_FRAGMENT)
		fixup_frag_inputs(ctx);

	/* at this point, for binning pass, throw away unneeded outputs: */
	if (key.binning_pass) {
		for (i = 0, j = 0; i < so->outputs_count; i++) {
			unsigned name = sem2name(so->outputs[i].semantic);
			unsigned idx = sem2idx(so->outputs[i].semantic);

			/* throw away everything but first position/psize */
			if ((idx == 0) && ((name == TGSI_SEMANTIC_POSITION) ||
					(name == TGSI_SEMANTIC_PSIZE))) {
				if (i != j) {
					so->outputs[j] = so->outputs[i];
					block->outputs[(j*4)+0] = block->outputs[(i*4)+0];
					block->outputs[(j*4)+1] = block->outputs[(i*4)+1];
					block->outputs[(j*4)+2] = block->outputs[(i*4)+2];
					block->outputs[(j*4)+3] = block->outputs[(i*4)+3];
				}
				j++;
			}
		}
		so->outputs_count = j;
		block->noutputs = j * 4;
	}
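	/* Example of the binning-pass trim above (output names invented for
	 * illustration; only the compaction pattern comes from the code):
	 *
	 *   before: outputs[] = { POSITION, COLOR, PSIZE }   outputs_count = 3
	 *   after:  outputs[] = { POSITION, PSIZE }          outputs_count = 2
	 *
	 * The matching block->outputs[] vec4 groups are moved down the same
	 * way, so the binning variant only computes position/psize.
	 */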
 
	/* if we want half-precision outputs, mark the output registers
	 * as half:
	 */
	if (key.half_precision) {
		for (i = 0; i < block->noutputs; i++) {
			if (!block->outputs[i])
				continue;
			block->outputs[i]->regs[0]->flags |= IR3_REG_HALF;
		}
	}

	/* at this point, we want the kills in the outputs array too,
	 * so that they get scheduled (since they have no dst).. we've
	 * already ensured that the array is big enough in emit_instructions():
	 */
	if (so->type == SHADER_FRAGMENT) {
		for (i = 0; i < ctx->kill_count; i++)
			block->outputs[block->noutputs++] = ctx->kill[i];
	}

	if (fd_mesa_debug & FD_DBG_OPTDUMP)
		compile_dump(ctx);

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("BEFORE CP:\n");
		ir3_dump_instr_list(block->head);
	}

	ir3_block_depth(block);

	ir3_block_cp(block);

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("BEFORE GROUPING:\n");
		ir3_dump_instr_list(block->head);
	}

	/* Group left/right neighbors, inserting mov's where needed to
	 * solve conflicts:
	 */
	ir3_block_group(block);

	if (fd_mesa_debug & FD_DBG_OPTDUMP)
		compile_dump(ctx);

	ir3_block_depth(block);

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("AFTER DEPTH:\n");
		ir3_dump_instr_list(block->head);
	}

	ret = ir3_block_sched(block);
	if (ret) {
		DBG("SCHED failed!");
		goto out;
	}

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("AFTER SCHED:\n");
		ir3_dump_instr_list(block->head);
	}

	ret = ir3_block_ra(block, so->type, so->frag_coord, so->frag_face);
	if (ret) {
		DBG("RA failed!");
		goto out;
	}

	if (fd_mesa_debug & FD_DBG_OPTMSGS) {
		printf("AFTER RA:\n");
		ir3_dump_instr_list(block->head);
	}

	ir3_block_legalize(block, &so->has_samp, &max_bary);

	/* fixup inputs/outputs: */
	for (i = 0; i < so->outputs_count; i++) {
		so->outputs[i].regid = block->outputs[i*4]->regs[0]->num;
		/* preserve hack for depth output.. tgsi writes depth to .z,
		 * but what we give the hw is the scalar register:
		 */
		if ((so->type == SHADER_FRAGMENT) &&
			(sem2name(so->outputs[i].semantic) == TGSI_SEMANTIC_POSITION))
			so->outputs[i].regid += 2;
	}

	/* Note that some or all channels of an input may be unused: */
	actual_in = 0;
	for (i = 0; i < so->inputs_count; i++) {
		unsigned j, regid = ~0, compmask = 0;
		so->inputs[i].ncomp = 0;
		for (j = 0; j < 4; j++) {
			struct ir3_instruction *in = inputs[(i*4) + j];
			if (in) {
				compmask |= (1 << j);
				regid = in->regs[0]->num - j;
				actual_in++;
				so->inputs[i].ncomp++;
			}
		}
		so->inputs[i].regid = regid;
		so->inputs[i].compmask = compmask;
	}
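	/* Worked example for the loop above (register numbers invented,
	 * assuming the scalar numbering seen in fixup_frag_inputs(), i.e.
	 * r0.x = 0, r0.y = 1, ...; the bit/offset math is exactly the loop's):
	 *
	 *   // only .x and .z of input i are used, living in r2.x and r2.z:
	 *   //   j = 0: compmask |= 1 << 0 -> 0x1,  regid = 8  - 0 = 8
	 *   //   j = 2: compmask |= 1 << 2 -> 0x5,  regid = 10 - 2 = 8
	 *   //   so->inputs[i].ncomp = 2, actual_in advances by 2
	 *
	 * compmask records which channels are live, and regid always ends up
	 * pointing at the base of the vec4 regardless of which channel was
	 * visited last.
	 */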
 
	/* The fragment shader always gets full vec4's even if it doesn't
	 * fetch all components, but for the vertex shader we need to update
	 * with the actual number of components fetched, otherwise things
	 * will hang due to mismatch between VFD_DECODE's and
	 * TOTALATTRTOVS
	 */
	if (so->type == SHADER_VERTEX)
		so->total_in = actual_in;
	else
		so->total_in = align(max_bary + 1, 4);
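	/* Quick arithmetic check (numbers invented for illustration):
	 *
	 *   // FS: highest bary.f slot reported by legalize is max_bary = 5
	 *   //     -> total_in = align(5 + 1, 4) = 8   (rounded to a full vec4)
	 *   // VS: two attributes with 3 and 2 live components
	 *   //     -> actual_in = 5, so total_in = 5
	 */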
 
out:
	if (ret) {
		ir3_destroy(so->ir);
		so->ir = NULL;
	}
	compile_free(ctx);

	return ret;
}