WebSVN – Kolibri OS – Blame – /contrib/sdk/sources/Mesa/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp

Rev	Author	Line No.	Line
4358	Serge	1	/*
		2	* Copyright 2011 Christoph Bumiller
		3	*
		4	* Permission is hereby granted, free of charge, to any person obtaining a
		5	* copy of this software and associated documentation files (the "Software"),
		6	* to deal in the Software without restriction, including without limitation
		7	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
		8	* and/or sell copies of the Software, and to permit persons to whom the
		9	* Software is furnished to do so, subject to the following conditions:
		10	*
		11	* The above copyright notice and this permission notice shall be included in
		12	* all copies or substantial portions of the Software.
		13	*
		14	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
		15	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
		16	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
		17	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
		18	* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
		19	* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
		20	* OTHER DEALINGS IN THE SOFTWARE.
		21	*/
		22
		23	#include "nv50/codegen/nv50_ir.h"
		24	#include "nv50/codegen/nv50_ir_build_util.h"
		25
		26	#include "nv50_ir_target_nv50.h"
		27
		28	namespace nv50_ir {
		29
		30	// nv50 doesn't support 32 bit integer multiplication
		31	//
		32	// ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
		33	// -------------------
		34	// albh 00 HI32: (al bh + ah * bl) >> 16 + (ah * bh) +
		35	// ah*bh 00 00 ( carry1) << 16 + ( carry2)
		36	// al*bl
		37	// ah*bl 00
		38	//
		39	// fffe0001 + fffe0001
		40	static bool
		41	expandIntegerMUL(BuildUtil bld, Instruction mul)
		42	{
		43	const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
		44
		45	DataType fTy = mul->sType; // full type
		46	DataType hTy;
		47	switch (fTy) {
		48	case TYPE_S32: hTy = TYPE_S16; break;
		49	case TYPE_U32: hTy = TYPE_U16; break;
		50	case TYPE_U64: hTy = TYPE_U32; break;
		51	case TYPE_S64: hTy = TYPE_S32; break;
		52	default:
		53	return false;
		54	}
		55	unsigned int fullSize = typeSizeof(fTy);
		56	unsigned int halfSize = typeSizeof(hTy);
		57
		58	Instruction *i[9];
		59
		60	bld->setPosition(mul, true);
		61
		62	Value a[2], b[2];
		63	Value *c[2];
		64	Value *t[4];
		65	for (int j = 0; j < 4; ++j)
		66	t[j] = bld->getSSA(fullSize);
		67
		68	// split sources into halves
		69	i[0] = bld->mkSplit(a, halfSize, mul->getSrc(0));
		70	i[1] = bld->mkSplit(b, halfSize, mul->getSrc(1));
		71
		72	i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
		73	i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
		74	i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
		75	i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
		76
		77	if (highResult) {
		78	Value *r[3];
		79	Value imm = bld->loadImm(NULL, 1 << (halfSize 8));
		80	c[0] = bld->getSSA(1, FILE_FLAGS);
		81	c[1] = bld->getSSA(1, FILE_FLAGS);
		82	for (int j = 0; j < 3; ++j)
		83	r[j] = bld->getSSA(fullSize);
		84
		85	i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
		86	i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
		87	bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[0]);
		88	i[5] = bld->mkOp3(OP_MAD, fTy, mul->getDef(0), a[1], b[1], r[2]);
		89
		90	// set carry defs / sources
		91	i[3]->setFlagsDef(1, c[0]);
		92	i[4]->setFlagsDef(0, c[1]); // actual result not required, just the carry
		93	i[6]->setPredicate(CC_C, c[0]);
		94	i[5]->setFlagsSrc(3, c[1]);
		95	} else {
		96	bld->mkMov(mul->getDef(0), t[3]);
		97	}
		98	delete_Instruction(bld->getProgram(), mul);
		99
		100	for (int j = 2; j <= (highResult ? 5 : 4); ++j)
		101	if (i[j])
		102	i[j]->sType = hTy;
		103
		104	return true;
		105	}
		106
		// Operation selectors that can be packed into a QUADOP() byte.
		#define QOP_ADD  0
		#define QOP_SUBR 1
		#define QOP_SUB  2
		#define QOP_MOV2 3

		// Packs four QOP_* selectors into one byte, two bits each, with the first
		// argument in the most significant bits. Arguments are given without the
		// QOP_ prefix, e.g. QUADOP(SUBR, SUBR, SUBR, SUBR).
		//             UL UR LL LR
		#define QUADOP(q, r, s, t)            \
		   ((QOP_##q << 6) | (QOP_##r << 4) | \
		    (QOP_##s << 2) | (QOP_##t << 0))
		116
		117	class NV50LegalizePostRA : public Pass
		118	{
		119	private:
		120	virtual bool visit(Function *);
		121	virtual bool visit(BasicBlock *);
		122
		123	void handlePRERET(FlowInstruction *);
		124	void replaceZero(Instruction *);
		125
		126	LValue *r63;
		127	};
		128
		129	bool
		130	NV50LegalizePostRA::visit(Function *fn)
		131	{
		132	Program *prog = fn->getProgram();
		133
		134	r63 = new_LValue(fn, FILE_GPR);
		135	r63->reg.data.id = 63;
		136
		137	// this is actually per-program, but we can do it all on visiting main()
		138	std::list *outWrites =
		139	reinterpret_cast *>(prog->targetPriv);
		140
		141	if (outWrites) {
		142	for (std::list::iterator it = outWrites->begin();
		143	it != outWrites->end(); ++it)
		144	(it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (it)->getSrc(0));
		145	// instructions will be deleted on exit
		146	outWrites->clear();
		147	}
		148
		149	return true;
		150	}
		151
		152	void
		153	NV50LegalizePostRA::replaceZero(Instruction *i)
		154	{
		155	for (int s = 0; i->srcExists(s); ++s) {
		156	ImmediateValue *imm = i->getSrc(s)->asImm();
		157	if (imm && imm->reg.data.u64 == 0)
		158	i->setSrc(s, r63);
		159	}
		160	}
		161
		162	// Emulate PRERET: jump to the target and call to the origin from there
		163	//
		164	// WARNING: atm only works if BBs are affected by at most a single PRERET
		165	//
		166	// BB:0
		167	// preret BB:3
		168	// (...)
		169	// BB:3
		170	// (...)
		171	// --->
		172	// BB:0
		173	// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
		174	// (...)
		175	// BB:3
		176	// bra BB:3 + n1 (skip the call)
		177	// call BB:0 + n2 (skip bra at beginning of BB:0)
		178	// (...)
		179	void
		180	NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
		181	{
		182	BasicBlock *bbE = pre->bb;
		183	BasicBlock *bbT = pre->target.bb;
		184
		185	pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
		186	bbE->remove(pre);
		187	bbE->insertHead(pre);
		188
		189	Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
		190	Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
		191
		192	bbT->insertHead(call);
		193	bbT->insertHead(skip);
		194
		195	// NOTE: maybe split blocks to prevent the instructions from moving ?
		196
		197	skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
		198	call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
		199	}
		200
		201	bool
		202	NV50LegalizePostRA::visit(BasicBlock *bb)
		203	{
		204	Instruction i, next;
		205
		206	// remove pseudo operations and non-fixed no-ops, split 64 bit operations
		207	for (i = bb->getFirst(); i; i = next) {
		208	next = i->next;
		209	if (i->isNop()) {
		210	bb->remove(i);
		211	} else
		212	if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
		213	handlePRERET(i->asFlow());
		214	} else {
		215	// TODO: We will want to do this before register allocation,
		216	// since have to use a $c register for the carry flag.
		217	if (typeSizeof(i->dType) == 8) {
		218	Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
		219	if (hi)
		220	next = hi;
		221	}
		222
		223	if (i->op != OP_MOV && i->op != OP_PFETCH &&
		224	i->op != OP_BAR &&
		225	(!i->defExists(0) \|\| i->def(0).getFile() != FILE_ADDRESS))
		226	replaceZero(i);
		227	}
		228	}
		229	if (!bb->getEntry())
		230	return true;
		231
		232	return true;
		233	}
		234
		235	class NV50LegalizeSSA : public Pass
		236	{
		237	public:
		238	NV50LegalizeSSA(Program *);
		239
		240	virtual bool visit(BasicBlock *bb);
		241
		242	private:
		243	void propagateWriteToOutput(Instruction *);
		244	void handleDIV(Instruction *);
		245	void handleMOD(Instruction *);
		246	void handleMUL(Instruction *);
		247	void handleAddrDef(Instruction *);
		248
		249	inline bool isARL(const Instruction *) const;
		250
		251	BuildUtil bld;
		252
		253	std::list *outWrites;
		254	};
		255
		256	NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
		257	{
		258	bld.setProgram(prog);
		259
		260	if (prog->optLevel >= 2 &&
		261	(prog->getType() == Program::TYPE_GEOMETRY \|\|
		262	prog->getType() == Program::TYPE_VERTEX))
		263	outWrites =
		264	reinterpret_cast *>(prog->targetPriv);
		265	else
		266	outWrites = NULL;
		267	}
		268
		269	void
		270	NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
		271	{
		272	if (st->src(0).isIndirect(0) \|\| st->getSrc(1)->refCount() != 1)
		273	return;
		274
		275	// check def instruction can store
		276	Instruction *di = st->getSrc(1)->defs.front()->getInsn();
		277
		278	// TODO: move exports (if beneficial) in common opt pass
		279	if (di->isPseudo() \|\| isTextureOp(di->op) \|\| di->defCount(0xff, true) > 1)
		280	return;
		281	for (int s = 0; di->srcExists(s); ++s)
		282	if (di->src(s).getFile() == FILE_IMMEDIATE)
		283	return;
		284
		285	// We cannot set defs to non-lvalues before register allocation, so
		286	// save & remove (to save registers) the exports and replace later.
		287	outWrites->push_back(st);
		288	st->bb->remove(st);
		289	}
		290
		291	bool
		292	NV50LegalizeSSA::isARL(const Instruction *i) const
		293	{
		294	ImmediateValue imm;
		295
		296	if (i->op != OP_SHL \|\| i->src(0).getFile() != FILE_GPR)
		297	return false;
		298	if (!i->src(1).getImmediate(imm))
		299	return false;
		300	return imm.isInteger(0);
		301	}
		302
		303	void
		304	NV50LegalizeSSA::handleAddrDef(Instruction *i)
		305	{
		306	Instruction *arl;
		307
		308	i->getDef(0)->reg.size = 2; // $aX are only 16 bit
		309
		310	// only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
		311	if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
		312	if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
		313	return;
		314	if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
		315	return;
		316	}
		317
		318	// turn $a sources into $r sources (can't operate on $a)
		319	for (int s = 0; i->srcExists(s); ++s) {
		320	Value *a = i->getSrc(s);
		321	Value *r;
		322	if (a->reg.file == FILE_ADDRESS) {
		323	if (a->getInsn() && isARL(a->getInsn())) {
		324	i->setSrc(s, a->getInsn()->getSrc(0));
		325	} else {
		326	bld.setPosition(i, false);
		327	r = bld.getSSA();
		328	bld.mkMov(r, a);
		329	i->setSrc(s, r);
		330	}
		331	}
		332	}
		333	if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
		334	return;
		335
		336	// turn result back into $a
		337	bld.setPosition(i, true);
		338	arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
		339	i->setDef(0, arl->getSrc(0));
		340	}
		341
		342	void
		343	NV50LegalizeSSA::handleMUL(Instruction *mul)
		344	{
		345	if (isFloatType(mul->sType) \|\| typeSizeof(mul->sType) <= 2)
		346	return;
		347	Value *def = mul->getDef(0);
		348	Value *pred = mul->getPredicate();
		349	CondCode cc = mul->cc;
		350	if (pred)
		351	mul->setPredicate(CC_ALWAYS, NULL);
		352
		353	if (mul->op == OP_MAD) {
		354	Instruction *add = mul;
		355	bld.setPosition(add, false);
		356	Value *res = cloneShallow(func, mul->getDef(0));
		357	mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
		358	add->op = OP_ADD;
		359	add->setSrc(0, mul->getDef(0));
		360	add->setSrc(1, add->getSrc(2));
		361	for (int s = 2; add->srcExists(s); ++s)
		362	add->setSrc(s, NULL);
		363	mul->subOp = add->subOp;
		364	add->subOp = 0;
		365	}
		366	expandIntegerMUL(&bld, mul);
		367	if (pred)
		368	def->getInsn()->setPredicate(cc, pred);
		369	}
		370
		371	// Use f32 division: first compute an approximate result, use it to reduce
		372	// the dividend, which should then be representable as f32, divide the reduced
		373	// dividend, and add the quotients.
		374	void
		375	NV50LegalizeSSA::handleDIV(Instruction *div)
		376	{
		377	const DataType ty = div->sType;
		378
		379	if (ty != TYPE_U32 && ty != TYPE_S32)
		380	return;
		381
		382	Value q, q0, qf, aR, aRf, qRf, qR, t, s, m, *cond;
		383
		384	bld.setPosition(div, false);
		385
		386	Value a, af = bld.getSSA();
		387	Value b, bf = bld.getSSA();
		388
		389	bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
		390	bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
		391
		392	if (isSignedType(ty)) {
		393	af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
		394	bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
		395	a = bld.getSSA();
		396	b = bld.getSSA();
		397	bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
		398	bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
		399	} else {
		400	a = div->getSrc(0);
		401	b = div->getSrc(1);
		402	}
		403
		404	bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
		405	bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
		406
		407	bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
		408	bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
		409
		410	// get error of 1st result
		411	expandIntegerMUL(&bld,
		412	bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
		413	bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
		414
		415	bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
		416
		417	bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
		418	bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
		419	->rnd = ROUND_Z;
		420	bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
		421
		422	// correction: if modulus >= divisor, add 1
		423	expandIntegerMUL(&bld,
		424	bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
		425	bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
		426	bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), m, b);
		427	if (!isSignedType(ty)) {
		428	div->op = OP_SUB;
		429	div->setSrc(0, q);
		430	div->setSrc(1, s);
		431	} else {
		432	t = q;
		433	bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
		434	s = bld.getSSA();
		435	t = bld.getSSA();
		436	// fix the sign
		437	bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
		438	->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
		439	bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
		440	bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
		441
		442	div->op = OP_UNION;
		443	div->setSrc(0, s);
		444	div->setSrc(1, t);
		445	}
		446	}
		447
		448	void
		449	NV50LegalizeSSA::handleMOD(Instruction *mod)
		450	{
		451	if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
		452	return;
		453	bld.setPosition(mod, false);
		454
		455	Value *q = bld.getSSA();
		456	Value *m = bld.getSSA();
		457
		458	bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
		459	handleDIV(q->getInsn());
		460
		461	bld.setPosition(mod, false);
		462	expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
		463
		464	mod->op = OP_SUB;
		465	mod->setSrc(1, m);
		466	}
		467
		468	bool
		469	NV50LegalizeSSA::visit(BasicBlock *bb)
		470	{
		471	Instruction insn, next;
		472	// skipping PHIs (don't pass them to handleAddrDef) !
		473	for (insn = bb->getEntry(); insn; insn = next) {
		474	next = insn->next;
		475
		476	switch (insn->op) {
		477	case OP_EXPORT:
		478	if (outWrites)
		479	propagateWriteToOutput(insn);
		480	break;
		481	case OP_DIV:
		482	handleDIV(insn);
		483	break;
		484	case OP_MOD:
		485	handleMOD(insn);
		486	break;
		487	case OP_MAD:
		488	case OP_MUL:
		489	handleMUL(insn);
		490	break;
		491	default:
		492	break;
		493	}
		494
		495	if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
		496	handleAddrDef(insn);
		497	}
		498	return true;
		499	}
		500
		501	class NV50LoweringPreSSA : public Pass
		502	{
		503	public:
		504	NV50LoweringPreSSA(Program *);
		505
		506	private:
		507	virtual bool visit(Instruction *);
		508	virtual bool visit(Function *);
		509
		510	bool handleRDSV(Instruction *);
		511	bool handleWRSV(Instruction *);
		512
		513	bool handleEXPORT(Instruction *);
		514
		515	bool handleDIV(Instruction *);
		516	bool handleSQRT(Instruction *);
		517	bool handlePOW(Instruction *);
		518
		519	bool handleSET(Instruction *);
		520	bool handleSLCT(CmpInstruction *);
		521	bool handleSELP(Instruction *);
		522
		523	bool handleTEX(TexInstruction *);
		524	bool handleTXB(TexInstruction *); // I really
		525	bool handleTXL(TexInstruction *); // hate
		526	bool handleTXD(TexInstruction *); // these 3
		527
		528	bool handleCALL(Instruction *);
		529	bool handlePRECONT(Instruction *);
		530	bool handleCONT(Instruction *);
		531
		532	void checkPredicate(Instruction *);
		533
		534	private:
		535	const Target *const targ;
		536
		537	BuildUtil bld;
		538
		539	Value *tid;
		540	};
		541
		542	NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
		543	targ(prog->getTarget()), tid(NULL)
		544	{
		545	bld.setProgram(prog);
		546	}
		547
		548	bool
		549	NV50LoweringPreSSA::visit(Function *f)
		550	{
		551	BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
		552
		553	if (prog->getType() == Program::TYPE_COMPUTE) {
		554	// Add implicit "thread id" argument in $r0 to the function
		555	Value *arg = new_LValue(func, FILE_GPR);
		556	arg->reg.data.id = 0;
		557	f->ins.push_back(arg);
		558
		559	bld.setPosition(root, false);
		560	tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
		561	}
		562
		563	return true;
		564	}
		565
		566	bool
		567	NV50LoweringPreSSA::handleTEX(TexInstruction *i)
		568	{
		569	const int arg = i->tex.target.getArgCount();
		570	const int dref = arg;
		571	const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
		572
		573	// dref comes before bias/lod
		574	if (i->tex.target.isShadow())
		575	if (i->op == OP_TXB \|\| i->op == OP_TXL)
		576	i->swapSources(dref, lod);
		577
		578	// array index must be converted to u32
		579	if (i->tex.target.isArray()) {
		580	Value *layer = i->getSrc(arg - 1);
		581	LValue *src = new_LValue(func, FILE_GPR);
		582	bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
		583	bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
		584	i->setSrc(arg - 1, src);
		585
		586	if (i->tex.target.isCube()) {
		587	std::vector acube, a2d;
		588	int c;
		589
		590	acube.resize(4);
		591	for (c = 0; c < 4; ++c)
		592	acube[c] = i->getSrc(c);
		593	a2d.resize(4);
		594	for (c = 0; c < 3; ++c)
		595	a2d[c] = new_LValue(func, FILE_GPR);
		596	a2d[3] = NULL;
		597
		598	bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
		599	a2d, acube)->asTex()->tex.mask = 0x7;
		600
		601	for (c = 0; c < 3; ++c)
		602	i->setSrc(c, a2d[c]);
		603	i->setSrc(c, NULL);
		604	for (; i->srcExists(c + 1); ++c)
		605	i->setSrc(c, i->getSrc(c + 1));
		606
		607	i->tex.target = i->tex.target.isShadow() ?
		608	TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
		609	}
		610	}
		611
		612	// texel offsets are 3 immediate fields in the instruction,
		613	// nv50 cannot do textureGatherOffsets
		614	assert(i->tex.useOffsets <= 1);
		615
		616	return true;
		617	}
		618
		619	// Bias must be equal for all threads of a quad or lod calculation will fail.
		620	//
		621	// The lanes of a quad are grouped by the bit in the condition register they
		622	// have set, which is selected by differing bias values.
		623	// Move the input values for TEX into a new register set for each group and
		624	// execute TEX only for a specific group.
		625	// We always need to use 4 new registers for the inputs/outputs because the
		626	// implicitly calculated derivatives must be correct.
		627	//
		628	// TODO: move to SSA phase so we can easily determine whether bias is constant
		629	bool
		630	NV50LoweringPreSSA::handleTXB(TexInstruction *i)
		631	{
		632	const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
		633	int l, d;
		634
		635	handleTEX(i);
		636	Value *bias = i->getSrc(i->tex.target.getArgCount());
		637	if (bias->isUniform())
		638	return true;
		639
		640	Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
		641	bld.loadImm(NULL, 1));
		642	bld.setPosition(cond, false);
		643
		644	for (l = 1; l < 4; ++l) {
		645	const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
		646	Value *bit = bld.getSSA();
		647	Value *pred = bld.getScratch(1, FILE_FLAGS);
		648	Value *imm = bld.loadImm(NULL, (1 << l));
		649	bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
		650	bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
		651	cond->setSrc(l, bit);
		652	}
		653	Value *flags = bld.getScratch(1, FILE_FLAGS);
		654	bld.setPosition(cond, true);
		655	bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
		656
		657	Instruction *tex[4];
		658	for (l = 0; l < 4; ++l) {
		659	(tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
		660	bld.insert(tex[l]);
		661	}
		662
		663	Value *res[4][4];
		664	for (d = 0; i->defExists(d); ++d)
		665	res[0][d] = tex[0]->getDef(d);
		666	for (l = 1; l < 4; ++l) {
		667	for (d = 0; tex[l]->defExists(d); ++d) {
		668	res[l][d] = cloneShallow(func, res[0][d]);
		669	bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
		670	}
		671	}
		672
		673	for (d = 0; i->defExists(d); ++d) {
		674	Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
		675	for (l = 0; l < 4; ++l)
		676	dst->setSrc(l, res[l][d]);
		677	}
		678	delete_Instruction(prog, i);
		679	return true;
		680	}
		681
		682	// LOD must be equal for all threads of a quad.
		683	// Unlike with TXB, here we can just diverge since there's no LOD calculation
		684	// that would require all 4 threads' sources to be set up properly.
		685	bool
		686	NV50LoweringPreSSA::handleTXL(TexInstruction *i)
		687	{
		688	handleTEX(i);
		689	Value *lod = i->getSrc(i->tex.target.getArgCount());
		690	if (lod->isUniform())
		691	return true;
		692
		693	BasicBlock *currBB = i->bb;
		694	BasicBlock *texiBB = i->bb->splitBefore(i, false);
		695	BasicBlock *joinBB = i->bb->splitAfter(i);
		696
		697	bld.setPosition(currBB, true);
		698	currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
		699
		700	for (int l = 0; l <= 3; ++l) {
		701	const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
		702	Value *pred = bld.getScratch(1, FILE_FLAGS);
		703	bld.setPosition(currBB, true);
		704	bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
		705	bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
		706	currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
		707	if (l <= 2) {
		708	BasicBlock *laneBB = new BasicBlock(func);
		709	currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
		710	currBB = laneBB;
		711	}
		712	}
		713	bld.setPosition(joinBB, false);
		714	bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
		715	return true;
		716	}
		717
		718	bool
		719	NV50LoweringPreSSA::handleTXD(TexInstruction *i)
		720	{
		721	static const uint8_t qOps[4][2] =
		722	{
		723	{ QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0
		724	{ QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1
		725	{ QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
		726	{ QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
		727	};
		728	Value *def[4][4];
		729	Value *crd[3];
		730	Instruction *tex;
		731	Value *zero = bld.loadImm(bld.getSSA(), 0);
		732	int l, c;
		733	const int dim = i->tex.target.getDim();
		734
		735	handleTEX(i);
		736	i->op = OP_TEX; // no need to clone dPdx/dPdy later
		737
		738	for (c = 0; c < dim; ++c)
		739	crd[c] = bld.getScratch();
		740
		741	bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
		742	for (l = 0; l < 4; ++l) {
		743	// mov coordinates from lane l to all lanes
		744	for (c = 0; c < dim; ++c)
		745	bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
		746	// add dPdx from lane l to lanes dx
		747	for (c = 0; c < dim; ++c)
		748	bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
		749	// add dPdy from lane l to lanes dy
		750	for (c = 0; c < dim; ++c)
		751	bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
		752	// texture
		753	bld.insert(tex = cloneForward(func, i));
		754	for (c = 0; c < dim; ++c)
		755	tex->setSrc(c, crd[c]);
		756	// save results
		757	for (c = 0; i->defExists(c); ++c) {
		758	Instruction *mov;
		759	def[c][l] = bld.getSSA();
		760	mov = bld.mkMov(def[c][l], tex->getDef(c));
		761	mov->fixed = 1;
		762	mov->lanes = 1 << l;
		763	}
		764	}
		765	bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
		766
		767	for (c = 0; i->defExists(c); ++c) {
		768	Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
		769	for (l = 0; l < 4; ++l)
		770	u->setSrc(l, def[c][l]);
		771	}
		772
		773	i->bb->remove(i);
		774	return true;
		775	}
		776
		777	bool
		778	NV50LoweringPreSSA::handleSET(Instruction *i)
		779	{
		780	if (i->dType == TYPE_F32) {
		781	bld.setPosition(i, true);
		782	i->dType = TYPE_U32;
		783	bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
		784	bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
		785	}
		786	return true;
		787	}
		788
		789	bool
		790	NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
		791	{
		792	Value *src0 = bld.getSSA();
		793	Value *src1 = bld.getSSA();
		794	Value *pred = bld.getScratch(1, FILE_FLAGS);
		795
		796	Value *v0 = i->getSrc(0);
		797	Value *v1 = i->getSrc(1);
		798	// XXX: these probably shouldn't be immediates in the first place ...
		799	if (v0->asImm())
		800	v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
		801	if (v1->asImm())
		802	v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
		803
		804	bld.setPosition(i, true);
		805	bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
		806	bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
		807	bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
		808
		809	bld.setPosition(i, false);
		810	i->op = OP_SET;
		811	i->setFlagsDef(0, pred);
		812	i->dType = TYPE_U8;
		813	i->setSrc(0, i->getSrc(2));
		814	i->setSrc(2, NULL);
		815	i->setSrc(1, bld.loadImm(NULL, 0));
		816
		817	return true;
		818	}
		819
		820	bool
		821	NV50LoweringPreSSA::handleSELP(Instruction *i)
		822	{
		823	Value *src0 = bld.getSSA();
		824	Value *src1 = bld.getSSA();
		825
		826	Value *v0 = i->getSrc(0);
		827	Value *v1 = i->getSrc(1);
		828	if (v0->asImm())
		829	v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
		830	if (v1->asImm())
		831	v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
		832
		833	bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
		834	bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
		835	bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
		836	delete_Instruction(prog, i);
		837	return true;
		838	}
		839
		840	bool
		841	NV50LoweringPreSSA::handleWRSV(Instruction *i)
		842	{
		843	Symbol *sym = i->getSrc(0)->asSym();
		844
		845	// these are all shader outputs, $sreg are not writeable
		846	uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
		847	if (addr >= 0x400)
		848	return false;
		849	sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
		850
		851	bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
		852
		853	bld.getBB()->remove(i);
		854	return true;
		855	}
		856
		857	bool
		858	NV50LoweringPreSSA::handleCALL(Instruction *i)
		859	{
		860	if (prog->getType() == Program::TYPE_COMPUTE) {
		861	// Add implicit "thread id" argument in $r0 to the function
		862	i->setSrc(i->srcCount(), tid);
		863	}
		864	return true;
		865	}
		866
		867	bool
		868	NV50LoweringPreSSA::handlePRECONT(Instruction *i)
		869	{
		870	delete_Instruction(prog, i);
		871	return true;
		872	}
		873
		874	bool
		875	NV50LoweringPreSSA::handleCONT(Instruction *i)
		876	{
		877	i->op = OP_BRA;
		878	return true;
		879	}
		880
		881	bool
		882	NV50LoweringPreSSA::handleRDSV(Instruction *i)
		883	{
		884	Symbol *sym = i->getSrc(0)->asSym();
		885	uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
		886	Value *def = i->getDef(0);
		887	SVSemantic sv = sym->reg.data.sv.sv;
		888	int idx = sym->reg.data.sv.index;
		889
		890	if (addr >= 0x400) // mov $sreg
		891	return true;
		892
		893	switch (sv) {
		894	case SV_POSITION:
		895	assert(prog->getType() == Program::TYPE_FRAGMENT);
		896	bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
		897	break;
		898	case SV_FACE:
		899	bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
		900	if (i->dType == TYPE_F32) {
		901	bld.mkOp2(OP_AND, TYPE_U32, def, def, bld.mkImm(0x80000000));
		902	bld.mkOp2(OP_XOR, TYPE_U32, def, def, bld.mkImm(0xbf800000));
		903	}
		904	break;
		905	case SV_NCTAID:
		906	case SV_CTAID:
		907	case SV_NTID:
		908	if ((sv == SV_NCTAID && idx >= 2) \|\|
		909	(sv == SV_NTID && idx >= 3)) {
		910	bld.mkMov(def, bld.mkImm(1));
		911	} else if (sv == SV_CTAID && idx >= 2) {
		912	bld.mkMov(def, bld.mkImm(0));
		913	} else {
		914	Value *x = bld.getSSA(2);
		915	bld.mkOp1(OP_LOAD, TYPE_U16, x,
		916	bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
		917	bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
		918	}
		919	break;
		920	case SV_TID:
		921	if (idx == 0) {
		922	bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
		923	} else if (idx == 1) {
		924	bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
		925	bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
		926	} else if (idx == 2) {
		927	bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
		928	} else {
		929	bld.mkMov(def, bld.mkImm(0));
		930	}
		931	break;
		932	default:
		933	bld.mkFetch(i->getDef(0), i->dType,
		934	FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
		935	break;
		936	}
		937	bld.getBB()->remove(i);
		938	return true;
		939	}
		940
		941	bool
		942	NV50LoweringPreSSA::handleDIV(Instruction *i)
		943	{
		944	if (!isFloatType(i->dType))
		945	return true;
		946	bld.setPosition(i, false);
		947	Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
		948	i->op = OP_MUL;
		949	i->setSrc(1, rcp->getDef(0));
		950	return true;
		951	}
		952
		953	bool
		954	NV50LoweringPreSSA::handleSQRT(Instruction *i)
		955	{
		956	Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
		957	bld.getSSA(), i->getSrc(0));
		958	i->op = OP_MUL;
		959	i->setSrc(1, rsq->getDef(0));
		960
		961	return true;
		962	}
		963
		964	bool
		965	NV50LoweringPreSSA::handlePOW(Instruction *i)
		966	{
		967	LValue *val = bld.getScratch();
		968
		969	bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
		970	bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
		971	bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
		972
		973	i->op = OP_EX2;
		974	i->setSrc(0, val);
		975	i->setSrc(1, NULL);
		976
		977	return true;
		978	}
		979
		980	bool
		981	NV50LoweringPreSSA::handleEXPORT(Instruction *i)
		982	{
		983	if (prog->getType() == Program::TYPE_FRAGMENT) {
		984	if (i->getIndirect(0, 0)) {
		985	// TODO: redirect to l[] here, load to GPRs at exit
		986	return false;
		987	} else {
		988	int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
		989
		990	i->op = OP_MOV;
		991	i->subOp = NV50_IR_SUBOP_MOV_FINAL;
		992	i->src(0).set(i->src(1));
		993	i->setSrc(1, NULL);
		994	i->setDef(0, new_LValue(func, FILE_GPR));
		995	i->getDef(0)->reg.data.id = id;
		996
		997	prog->maxGPR = MAX2(prog->maxGPR, id);
		998	}
		999	}
		1000	return true;
		1001	}
		1002
		1003	// Set flags according to predicate and make the instruction read $cX.
		1004	void
		1005	NV50LoweringPreSSA::checkPredicate(Instruction *insn)
		1006	{
		1007	Value *pred = insn->getPredicate();
		1008	Value *cdst;
		1009
		1010	if (!pred \|\| pred->reg.file == FILE_FLAGS)
		1011	return;
		1012	cdst = bld.getSSA(1, FILE_FLAGS);
		1013
		1014	bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, bld.loadImm(NULL, 0), pred);
		1015
		1016	insn->setPredicate(insn->cc, cdst);
		1017	}
		1018
		1019	//
		1020	// - add quadop dance for texturing
		1021	// - put FP outputs in GPRs
		1022	// - convert instruction sequences
		1023	//
		1024	bool
		1025	NV50LoweringPreSSA::visit(Instruction *i)
		1026	{
		1027	bld.setPosition(i, false);
		1028
		1029	if (i->cc != CC_ALWAYS)
		1030	checkPredicate(i);
		1031
		1032	switch (i->op) {
		1033	case OP_TEX:
		1034	case OP_TXF:
		1035	case OP_TXG:
		1036	return handleTEX(i->asTex());
		1037	case OP_TXB:
		1038	return handleTXB(i->asTex());
		1039	case OP_TXL:
		1040	return handleTXL(i->asTex());
		1041	case OP_TXD:
		1042	return handleTXD(i->asTex());
		1043	case OP_EX2:
		1044	bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
		1045	i->setSrc(0, i->getDef(0));
		1046	break;
		1047	case OP_SET:
		1048	return handleSET(i);
		1049	case OP_SLCT:
		1050	return handleSLCT(i->asCmp());
		1051	case OP_SELP:
		1052	return handleSELP(i);
		1053	case OP_POW:
		1054	return handlePOW(i);
		1055	case OP_DIV:
		1056	return handleDIV(i);
		1057	case OP_SQRT:
		1058	return handleSQRT(i);
		1059	case OP_EXPORT:
		1060	return handleEXPORT(i);
		1061	case OP_RDSV:
		1062	return handleRDSV(i);
		1063	case OP_WRSV:
		1064	return handleWRSV(i);
		1065	case OP_CALL:
		1066	return handleCALL(i);
		1067	case OP_PRECONT:
		1068	return handlePRECONT(i);
		1069	case OP_CONT:
		1070	return handleCONT(i);
		1071	default:
		1072	break;
		1073	}
		1074	return true;
		1075	}
		1076
		1077	bool
		1078	TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
		1079	{
		1080	bool ret = false;
		1081
		1082	if (stage == CG_STAGE_PRE_SSA) {
		1083	NV50LoweringPreSSA pass(prog);
		1084	ret = pass.run(prog, false, true);
		1085	} else
		1086	if (stage == CG_STAGE_SSA) {
		1087	if (!prog->targetPriv)
		1088	prog->targetPriv = new std::list();
		1089	NV50LegalizeSSA pass(prog);
		1090	ret = pass.run(prog, false, true);
		1091	} else
		1092	if (stage == CG_STAGE_POST_RA) {
		1093	NV50LegalizePostRA pass;
		1094	ret = pass.run(prog, false, true);
		1095	if (prog->targetPriv)
		1096	delete reinterpret_cast *>(prog->targetPriv);
		1097	}
		1098	return ret;
		1099	}
		1100
		1101	} // namespace nv50_ir

Subversion Repositories Kolibri OS

(root)/contrib/sdk/sources/Mesa/src/gallium/drivers/nv50/codegen/nv50_ir_lowering_nv50.cpp – Rev 4826