WebSVN – Kolibri OS – Blame – /contrib/sdk/sources/Mesa/mesa-10.6.0/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp

Rev	Author	Line No.	Line
5564	serge	1	/*
		2	* Copyright 2011 Christoph Bumiller
		3	*
		4	* Permission is hereby granted, free of charge, to any person obtaining a
		5	* copy of this software and associated documentation files (the "Software"),
		6	* to deal in the Software without restriction, including without limitation
		7	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
		8	* and/or sell copies of the Software, and to permit persons to whom the
		9	* Software is furnished to do so, subject to the following conditions:
		10	*
		11	* The above copyright notice and this permission notice shall be included in
		12	* all copies or substantial portions of the Software.
		13	*
		14	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
		15	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
		16	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
		17	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
		18	* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
		19	* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
		20	* OTHER DEALINGS IN THE SOFTWARE.
		21	*/
		22
		23	#include "codegen/nv50_ir.h"
		24	#include "codegen/nv50_ir_build_util.h"
		25
		26	#include "codegen/nv50_ir_target_nv50.h"
		27
		28	namespace nv50_ir {
		29
		30	// nv50 doesn't support 32 bit integer multiplication
		31	//
		32	// ah al * bh bl = LO32: (al * bh + ah * bl) << 16 + (al * bl)
		33	// -------------------
		34	// albh 00 HI32: (al bh + ah * bl) >> 16 + (ah * bh) +
		35	// ah*bh 00 00 ( carry1) << 16 + ( carry2)
		36	// al*bl
		37	// ah*bl 00
		38	//
		39	// fffe0001 + fffe0001
		40	//
		41	// Note that this sort of splitting doesn't work for signed values, so we
		42	// compute the sign on those manually and then perform an unsigned multiply.
		43	static bool
		44	expandIntegerMUL(BuildUtil bld, Instruction mul)
		45	{
		46	const bool highResult = mul->subOp == NV50_IR_SUBOP_MUL_HIGH;
		47
		48	DataType fTy; // full type
		49	switch (mul->sType) {
		50	case TYPE_S32: fTy = TYPE_U32; break;
		51	case TYPE_S64: fTy = TYPE_U64; break;
		52	default: fTy = mul->sType; break;
		53	}
		54
		55	DataType hTy; // half type
		56	switch (fTy) {
		57	case TYPE_U32: hTy = TYPE_U16; break;
		58	case TYPE_U64: hTy = TYPE_U32; break;
		59	default:
		60	return false;
		61	}
		62	unsigned int fullSize = typeSizeof(fTy);
		63	unsigned int halfSize = typeSizeof(hTy);
		64
		65	Instruction *i[9];
		66
		67	bld->setPosition(mul, true);
		68
		69	Value *s[2];
		70	Value a[2], b[2];
		71	Value *t[4];
		72	for (int j = 0; j < 4; ++j)
		73	t[j] = bld->getSSA(fullSize);
		74
		75	s[0] = mul->getSrc(0);
		76	s[1] = mul->getSrc(1);
		77
		78	if (isSignedType(mul->sType)) {
		79	s[0] = bld->getSSA(fullSize);
		80	s[1] = bld->getSSA(fullSize);
		81	bld->mkOp1(OP_ABS, mul->sType, s[0], mul->getSrc(0));
		82	bld->mkOp1(OP_ABS, mul->sType, s[1], mul->getSrc(1));
		83	}
		84
		85	// split sources into halves
		86	i[0] = bld->mkSplit(a, halfSize, s[0]);
		87	i[1] = bld->mkSplit(b, halfSize, s[1]);
		88
		89	i[2] = bld->mkOp2(OP_MUL, fTy, t[0], a[0], b[1]);
		90	i[3] = bld->mkOp3(OP_MAD, fTy, t[1], a[1], b[0], t[0]);
		91	i[7] = bld->mkOp2(OP_SHL, fTy, t[2], t[1], bld->mkImm(halfSize * 8));
		92	i[4] = bld->mkOp3(OP_MAD, fTy, t[3], a[0], b[0], t[2]);
		93
		94	if (highResult) {
		95	Value *c[2];
		96	Value *r[5];
		97	Value imm = bld->loadImm(NULL, 1 << (halfSize 8));
		98	c[0] = bld->getSSA(1, FILE_FLAGS);
		99	c[1] = bld->getSSA(1, FILE_FLAGS);
		100	for (int j = 0; j < 5; ++j)
		101	r[j] = bld->getSSA(fullSize);
		102
		103	i[8] = bld->mkOp2(OP_SHR, fTy, r[0], t[1], bld->mkImm(halfSize * 8));
		104	i[6] = bld->mkOp2(OP_ADD, fTy, r[1], r[0], imm);
		105	bld->mkMov(r[3], r[0])->setPredicate(CC_NC, c[0]);
		106	bld->mkOp2(OP_UNION, TYPE_U32, r[2], r[1], r[3]);
		107	i[5] = bld->mkOp3(OP_MAD, fTy, r[4], a[1], b[1], r[2]);
		108
		109	// set carry defs / sources
		110	i[3]->setFlagsDef(1, c[0]);
		111	// actual result required in negative case, but ignored for
		112	// unsigned. for some reason the compiler ends up dropping the whole
		113	// instruction if the destination is unused but the flags are.
		114	if (isSignedType(mul->sType))
		115	i[4]->setFlagsDef(1, c[1]);
		116	else
		117	i[4]->setFlagsDef(0, c[1]);
		118	i[6]->setPredicate(CC_C, c[0]);
		119	i[5]->setFlagsSrc(3, c[1]);
		120
		121	if (isSignedType(mul->sType)) {
		122	Value *cc[2];
		123	Value *rr[7];
		124	Value *one = bld->getSSA(fullSize);
		125	bld->loadImm(one, 1);
		126	for (int j = 0; j < 7; j++)
		127	rr[j] = bld->getSSA(fullSize);
		128
		129	// NOTE: this logic uses predicates because splitting basic blocks is
		130	// ~impossible during the SSA phase. The RA relies on a correlation
		131	// between edge order and phi node sources.
		132
		133	// Set the sign of the result based on the inputs
		134	bld->mkOp2(OP_XOR, fTy, NULL, mul->getSrc(0), mul->getSrc(1))
		135	->setFlagsDef(0, (cc[0] = bld->getSSA(1, FILE_FLAGS)));
		136
		137	// 1s complement of 64-bit value
		138	bld->mkOp1(OP_NOT, fTy, rr[0], r[4])
		139	->setPredicate(CC_S, cc[0]);
		140	bld->mkOp1(OP_NOT, fTy, rr[1], t[3])
		141	->setPredicate(CC_S, cc[0]);
		142
		143	// add to low 32-bits, keep track of the carry
		144	Instruction *n = bld->mkOp2(OP_ADD, fTy, NULL, rr[1], one);
		145	n->setPredicate(CC_S, cc[0]);
		146	n->setFlagsDef(0, (cc[1] = bld->getSSA(1, FILE_FLAGS)));
		147
		148	// If there was a carry, add 1 to the upper 32 bits
		149	// XXX: These get executed even if they shouldn't be
		150	bld->mkOp2(OP_ADD, fTy, rr[2], rr[0], one)
		151	->setPredicate(CC_C, cc[1]);
		152	bld->mkMov(rr[3], rr[0])
		153	->setPredicate(CC_NC, cc[1]);
		154	bld->mkOp2(OP_UNION, fTy, rr[4], rr[2], rr[3]);
		155
		156	// Merge the results from the negative and non-negative paths
		157	bld->mkMov(rr[5], rr[4])
		158	->setPredicate(CC_S, cc[0]);
		159	bld->mkMov(rr[6], r[4])
		160	->setPredicate(CC_NS, cc[0]);
		161	bld->mkOp2(OP_UNION, mul->sType, mul->getDef(0), rr[5], rr[6]);
		162	} else {
		163	bld->mkMov(mul->getDef(0), r[4]);
		164	}
		165	} else {
		166	bld->mkMov(mul->getDef(0), t[3]);
		167	}
		168	delete_Instruction(bld->getProgram(), mul);
		169
		170	for (int j = 2; j <= (highResult ? 5 : 4); ++j)
		171	if (i[j])
		172	i[j]->sType = hTy;
		173
		174	return true;
		175	}
		176
		177	#define QOP_ADD 0
		178	#define QOP_SUBR 1
		179	#define QOP_SUB 2
		180	#define QOP_MOV2 3
		181
		182	// UL UR LL LR
		183	#define QUADOP(q, r, s, t) \
		184	((QOP_##q << 6) \| (QOP_##r << 4) \| \
		185	(QOP_##s << 2) \| (QOP_##t << 0))
		186
		187	class NV50LegalizePostRA : public Pass
		188	{
		189	private:
		190	virtual bool visit(Function *);
		191	virtual bool visit(BasicBlock *);
		192
		193	void handlePRERET(FlowInstruction *);
		194	void replaceZero(Instruction *);
		195
		196	LValue *r63;
		197	};
		198
		199	bool
		200	NV50LegalizePostRA::visit(Function *fn)
		201	{
		202	Program *prog = fn->getProgram();
		203
		204	r63 = new_LValue(fn, FILE_GPR);
		205	r63->reg.data.id = 63;
		206
		207	// this is actually per-program, but we can do it all on visiting main()
		208	std::list *outWrites =
		209	reinterpret_cast *>(prog->targetPriv);
		210
		211	if (outWrites) {
		212	for (std::list::iterator it = outWrites->begin();
		213	it != outWrites->end(); ++it)
		214	(it)->getSrc(1)->defs.front()->getInsn()->setDef(0, (it)->getSrc(0));
		215	// instructions will be deleted on exit
		216	outWrites->clear();
		217	}
		218
		219	return true;
		220	}
		221
		222	void
		223	NV50LegalizePostRA::replaceZero(Instruction *i)
		224	{
		225	for (int s = 0; i->srcExists(s); ++s) {
		226	ImmediateValue *imm = i->getSrc(s)->asImm();
		227	if (imm && imm->reg.data.u64 == 0)
		228	i->setSrc(s, r63);
		229	}
		230	}
		231
		232	// Emulate PRERET: jump to the target and call to the origin from there
		233	//
		234	// WARNING: atm only works if BBs are affected by at most a single PRERET
		235	//
		236	// BB:0
		237	// preret BB:3
		238	// (...)
		239	// BB:3
		240	// (...)
		241	// --->
		242	// BB:0
		243	// bra BB:3 + n0 (directly to the call; move to beginning of BB and fixate)
		244	// (...)
		245	// BB:3
		246	// bra BB:3 + n1 (skip the call)
		247	// call BB:0 + n2 (skip bra at beginning of BB:0)
		248	// (...)
		249	void
		250	NV50LegalizePostRA::handlePRERET(FlowInstruction *pre)
		251	{
		252	BasicBlock *bbE = pre->bb;
		253	BasicBlock *bbT = pre->target.bb;
		254
		255	pre->subOp = NV50_IR_SUBOP_EMU_PRERET + 0;
		256	bbE->remove(pre);
		257	bbE->insertHead(pre);
		258
		259	Instruction *skip = new_FlowInstruction(func, OP_PRERET, bbT);
		260	Instruction *call = new_FlowInstruction(func, OP_PRERET, bbE);
		261
		262	bbT->insertHead(call);
		263	bbT->insertHead(skip);
		264
		265	// NOTE: maybe split blocks to prevent the instructions from moving ?
		266
		267	skip->subOp = NV50_IR_SUBOP_EMU_PRERET + 1;
		268	call->subOp = NV50_IR_SUBOP_EMU_PRERET + 2;
		269	}
		270
		271	bool
		272	NV50LegalizePostRA::visit(BasicBlock *bb)
		273	{
		274	Instruction i, next;
		275
		276	// remove pseudo operations and non-fixed no-ops, split 64 bit operations
		277	for (i = bb->getFirst(); i; i = next) {
		278	next = i->next;
		279	if (i->isNop()) {
		280	bb->remove(i);
		281	} else
		282	if (i->op == OP_PRERET && prog->getTarget()->getChipset() < 0xa0) {
		283	handlePRERET(i->asFlow());
		284	} else {
		285	// TODO: We will want to do this before register allocation,
		286	// since have to use a $c register for the carry flag.
		287	if (typeSizeof(i->dType) == 8) {
		288	Instruction *hi = BuildUtil::split64BitOpPostRA(func, i, r63, NULL);
		289	if (hi)
		290	next = hi;
		291	}
		292
		293	if (i->op != OP_MOV && i->op != OP_PFETCH &&
		294	i->op != OP_BAR &&
		295	(!i->defExists(0) \|\| i->def(0).getFile() != FILE_ADDRESS))
		296	replaceZero(i);
		297	}
		298	}
		299	if (!bb->getEntry())
		300	return true;
		301
		302	return true;
		303	}
		304
		305	class NV50LegalizeSSA : public Pass
		306	{
		307	public:
		308	NV50LegalizeSSA(Program *);
		309
		310	virtual bool visit(BasicBlock *bb);
		311
		312	private:
		313	void propagateWriteToOutput(Instruction *);
		314	void handleDIV(Instruction *);
		315	void handleMOD(Instruction *);
		316	void handleMUL(Instruction *);
		317	void handleAddrDef(Instruction *);
		318
		319	inline bool isARL(const Instruction *) const;
		320
		321	BuildUtil bld;
		322
		323	std::list *outWrites;
		324	};
		325
		326	NV50LegalizeSSA::NV50LegalizeSSA(Program *prog)
		327	{
		328	bld.setProgram(prog);
		329
		330	if (prog->optLevel >= 2 &&
		331	(prog->getType() == Program::TYPE_GEOMETRY \|\|
		332	prog->getType() == Program::TYPE_VERTEX))
		333	outWrites =
		334	reinterpret_cast *>(prog->targetPriv);
		335	else
		336	outWrites = NULL;
		337	}
		338
		339	void
		340	NV50LegalizeSSA::propagateWriteToOutput(Instruction *st)
		341	{
		342	if (st->src(0).isIndirect(0) \|\| st->getSrc(1)->refCount() != 1)
		343	return;
		344
		345	// check def instruction can store
		346	Instruction *di = st->getSrc(1)->defs.front()->getInsn();
		347
		348	// TODO: move exports (if beneficial) in common opt pass
		349	if (di->isPseudo() \|\| isTextureOp(di->op) \|\| di->defCount(0xff, true) > 1)
		350	return;
		351
		352	for (int s = 0; di->srcExists(s); ++s)
		353	if (di->src(s).getFile() == FILE_IMMEDIATE)
		354	return;
		355
		356	if (prog->getType() == Program::TYPE_GEOMETRY) {
		357	// Only propagate output writes in geometry shaders when we can be sure
		358	// that we are propagating to the same output vertex.
		359	if (di->bb != st->bb)
		360	return;
		361	Instruction *i;
		362	for (i = di; i != st; i = i->next) {
		363	if (i->op == OP_EMIT \|\| i->op == OP_RESTART)
		364	return;
		365	}
		366	assert(i); // st after di
		367	}
		368
		369	// We cannot set defs to non-lvalues before register allocation, so
		370	// save & remove (to save registers) the exports and replace later.
		371	outWrites->push_back(st);
		372	st->bb->remove(st);
		373	}
		374
		375	bool
		376	NV50LegalizeSSA::isARL(const Instruction *i) const
		377	{
		378	ImmediateValue imm;
		379
		380	if (i->op != OP_SHL \|\| i->src(0).getFile() != FILE_GPR)
		381	return false;
		382	if (!i->src(1).getImmediate(imm))
		383	return false;
		384	return imm.isInteger(0);
		385	}
		386
		387	void
		388	NV50LegalizeSSA::handleAddrDef(Instruction *i)
		389	{
		390	Instruction *arl;
		391
		392	i->getDef(0)->reg.size = 2; // $aX are only 16 bit
		393
		394	// PFETCH can always write to $a
		395	if (i->op == OP_PFETCH)
		396	return;
		397	// only ADDR <- SHL(GPR, IMM) and ADDR <- ADD(ADDR, IMM) are valid
		398	if (i->srcExists(1) && i->src(1).getFile() == FILE_IMMEDIATE) {
		399	if (i->op == OP_SHL && i->src(0).getFile() == FILE_GPR)
		400	return;
		401	if (i->op == OP_ADD && i->src(0).getFile() == FILE_ADDRESS)
		402	return;
		403	}
		404
		405	// turn $a sources into $r sources (can't operate on $a)
		406	for (int s = 0; i->srcExists(s); ++s) {
		407	Value *a = i->getSrc(s);
		408	Value *r;
		409	if (a->reg.file == FILE_ADDRESS) {
		410	if (a->getInsn() && isARL(a->getInsn())) {
		411	i->setSrc(s, a->getInsn()->getSrc(0));
		412	} else {
		413	bld.setPosition(i, false);
		414	r = bld.getSSA();
		415	bld.mkMov(r, a);
		416	i->setSrc(s, r);
		417	}
		418	}
		419	}
		420	if (i->op == OP_SHL && i->src(1).getFile() == FILE_IMMEDIATE)
		421	return;
		422
		423	// turn result back into $a
		424	bld.setPosition(i, true);
		425	arl = bld.mkOp2(OP_SHL, TYPE_U32, i->getDef(0), bld.getSSA(), bld.mkImm(0));
		426	i->setDef(0, arl->getSrc(0));
		427	}
		428
		429	void
		430	NV50LegalizeSSA::handleMUL(Instruction *mul)
		431	{
		432	if (isFloatType(mul->sType) \|\| typeSizeof(mul->sType) <= 2)
		433	return;
		434	Value *def = mul->getDef(0);
		435	Value *pred = mul->getPredicate();
		436	CondCode cc = mul->cc;
		437	if (pred)
		438	mul->setPredicate(CC_ALWAYS, NULL);
		439
		440	if (mul->op == OP_MAD) {
		441	Instruction *add = mul;
		442	bld.setPosition(add, false);
		443	Value *res = cloneShallow(func, mul->getDef(0));
		444	mul = bld.mkOp2(OP_MUL, add->sType, res, add->getSrc(0), add->getSrc(1));
		445	add->op = OP_ADD;
		446	add->setSrc(0, mul->getDef(0));
		447	add->setSrc(1, add->getSrc(2));
		448	for (int s = 2; add->srcExists(s); ++s)
		449	add->setSrc(s, NULL);
		450	mul->subOp = add->subOp;
		451	add->subOp = 0;
		452	}
		453	expandIntegerMUL(&bld, mul);
		454	if (pred)
		455	def->getInsn()->setPredicate(cc, pred);
		456	}
		457
		458	// Use f32 division: first compute an approximate result, use it to reduce
		459	// the dividend, which should then be representable as f32, divide the reduced
		460	// dividend, and add the quotients.
		461	void
		462	NV50LegalizeSSA::handleDIV(Instruction *div)
		463	{
		464	const DataType ty = div->sType;
		465
		466	if (ty != TYPE_U32 && ty != TYPE_S32)
		467	return;
		468
		469	Value q, q0, qf, aR, aRf, qRf, qR, t, s, m, *cond;
		470
		471	bld.setPosition(div, false);
		472
		473	Value a, af = bld.getSSA();
		474	Value b, bf = bld.getSSA();
		475
		476	bld.mkCvt(OP_CVT, TYPE_F32, af, ty, div->getSrc(0));
		477	bld.mkCvt(OP_CVT, TYPE_F32, bf, ty, div->getSrc(1));
		478
		479	if (isSignedType(ty)) {
		480	af->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
		481	bf->getInsn()->src(0).mod = Modifier(NV50_IR_MOD_ABS);
		482	a = bld.getSSA();
		483	b = bld.getSSA();
		484	bld.mkOp1(OP_ABS, ty, a, div->getSrc(0));
		485	bld.mkOp1(OP_ABS, ty, b, div->getSrc(1));
		486	} else {
		487	a = div->getSrc(0);
		488	b = div->getSrc(1);
		489	}
		490
		491	bf = bld.mkOp1v(OP_RCP, TYPE_F32, bld.getSSA(), bf);
		492	bf = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getSSA(), bf, bld.mkImm(-2));
		493
		494	bld.mkOp2(OP_MUL, TYPE_F32, (qf = bld.getSSA()), af, bf)->rnd = ROUND_Z;
		495	bld.mkCvt(OP_CVT, ty, (q0 = bld.getSSA()), TYPE_F32, qf)->rnd = ROUND_Z;
		496
		497	// get error of 1st result
		498	expandIntegerMUL(&bld,
		499	bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q0, b));
		500	bld.mkOp2(OP_SUB, TYPE_U32, (aRf = bld.getSSA()), a, t);
		501
		502	bld.mkCvt(OP_CVT, TYPE_F32, (aR = bld.getSSA()), TYPE_U32, aRf);
		503
		504	bld.mkOp2(OP_MUL, TYPE_F32, (qRf = bld.getSSA()), aR, bf)->rnd = ROUND_Z;
		505	bld.mkCvt(OP_CVT, TYPE_U32, (qR = bld.getSSA()), TYPE_F32, qRf)
		506	->rnd = ROUND_Z;
		507	bld.mkOp2(OP_ADD, ty, (q = bld.getSSA()), q0, qR); // add quotients
		508
		509	// correction: if modulus >= divisor, add 1
		510	expandIntegerMUL(&bld,
		511	bld.mkOp2(OP_MUL, TYPE_U32, (t = bld.getSSA()), q, b));
		512	bld.mkOp2(OP_SUB, TYPE_U32, (m = bld.getSSA()), a, t);
		513	bld.mkCmp(OP_SET, CC_GE, TYPE_U32, (s = bld.getSSA()), TYPE_U32, m, b);
		514	if (!isSignedType(ty)) {
		515	div->op = OP_SUB;
		516	div->setSrc(0, q);
		517	div->setSrc(1, s);
		518	} else {
		519	t = q;
		520	bld.mkOp2(OP_SUB, TYPE_U32, (q = bld.getSSA()), t, s);
		521	s = bld.getSSA();
		522	t = bld.getSSA();
		523	// fix the sign
		524	bld.mkOp2(OP_XOR, TYPE_U32, NULL, div->getSrc(0), div->getSrc(1))
		525	->setFlagsDef(0, (cond = bld.getSSA(1, FILE_FLAGS)));
		526	bld.mkOp1(OP_NEG, ty, s, q)->setPredicate(CC_S, cond);
		527	bld.mkOp1(OP_MOV, ty, t, q)->setPredicate(CC_NS, cond);
		528
		529	div->op = OP_UNION;
		530	div->setSrc(0, s);
		531	div->setSrc(1, t);
		532	}
		533	}
		534
		535	void
		536	NV50LegalizeSSA::handleMOD(Instruction *mod)
		537	{
		538	if (mod->dType != TYPE_U32 && mod->dType != TYPE_S32)
		539	return;
		540	bld.setPosition(mod, false);
		541
		542	Value *q = bld.getSSA();
		543	Value *m = bld.getSSA();
		544
		545	bld.mkOp2(OP_DIV, mod->dType, q, mod->getSrc(0), mod->getSrc(1));
		546	handleDIV(q->getInsn());
		547
		548	bld.setPosition(mod, false);
		549	expandIntegerMUL(&bld, bld.mkOp2(OP_MUL, TYPE_U32, m, q, mod->getSrc(1)));
		550
		551	mod->op = OP_SUB;
		552	mod->setSrc(1, m);
		553	}
		554
		555	bool
		556	NV50LegalizeSSA::visit(BasicBlock *bb)
		557	{
		558	Instruction insn, next;
		559	// skipping PHIs (don't pass them to handleAddrDef) !
		560	for (insn = bb->getEntry(); insn; insn = next) {
		561	next = insn->next;
		562
		563	if (insn->defExists(0) && insn->getDef(0)->reg.file == FILE_ADDRESS)
		564	handleAddrDef(insn);
		565
		566	switch (insn->op) {
		567	case OP_EXPORT:
		568	if (outWrites)
		569	propagateWriteToOutput(insn);
		570	break;
		571	case OP_DIV:
		572	handleDIV(insn);
		573	break;
		574	case OP_MOD:
		575	handleMOD(insn);
		576	break;
		577	case OP_MAD:
		578	case OP_MUL:
		579	handleMUL(insn);
		580	break;
		581	default:
		582	break;
		583	}
		584	}
		585	return true;
		586	}
		587
		588	class NV50LoweringPreSSA : public Pass
		589	{
		590	public:
		591	NV50LoweringPreSSA(Program *);
		592
		593	private:
		594	virtual bool visit(Instruction *);
		595	virtual bool visit(Function *);
		596
		597	bool handleRDSV(Instruction *);
		598	bool handleWRSV(Instruction *);
		599
		600	bool handlePFETCH(Instruction *);
		601	bool handleEXPORT(Instruction *);
		602	bool handleLOAD(Instruction *);
		603
		604	bool handleDIV(Instruction *);
		605	bool handleSQRT(Instruction *);
		606	bool handlePOW(Instruction *);
		607
		608	bool handleSET(Instruction *);
		609	bool handleSLCT(CmpInstruction *);
		610	bool handleSELP(Instruction *);
		611
		612	bool handleTEX(TexInstruction *);
		613	bool handleTXB(TexInstruction *); // I really
		614	bool handleTXL(TexInstruction *); // hate
		615	bool handleTXD(TexInstruction *); // these 3
		616	bool handleTXLQ(TexInstruction *);
		617
		618	bool handleCALL(Instruction *);
		619	bool handlePRECONT(Instruction *);
		620	bool handleCONT(Instruction *);
		621
		622	void checkPredicate(Instruction *);
		623	void loadTexMsInfo(uint32_t off, Value ms, Value ms_x, Value **ms_y);
		624	void loadMsInfo(Value ms, Value s, Value dx, Value dy);
		625
		626	private:
		627	const Target *const targ;
		628
		629	BuildUtil bld;
		630
		631	Value *tid;
		632	};
		633
		634	NV50LoweringPreSSA::NV50LoweringPreSSA(Program *prog) :
		635	targ(prog->getTarget()), tid(NULL)
		636	{
		637	bld.setProgram(prog);
		638	}
		639
		640	bool
		641	NV50LoweringPreSSA::visit(Function *f)
		642	{
		643	BasicBlock *root = BasicBlock::get(func->cfg.getRoot());
		644
		645	if (prog->getType() == Program::TYPE_COMPUTE) {
		646	// Add implicit "thread id" argument in $r0 to the function
		647	Value *arg = new_LValue(func, FILE_GPR);
		648	arg->reg.data.id = 0;
		649	f->ins.push_back(arg);
		650
		651	bld.setPosition(root, false);
		652	tid = bld.mkMov(bld.getScratch(), arg, TYPE_U32)->getDef(0);
		653	}
		654
		655	return true;
		656	}
		657
		658	void NV50LoweringPreSSA::loadTexMsInfo(uint32_t off, Value **ms,
		659	Value ms_x, Value ms_y) {
		660	// This loads the texture-indexed ms setting from the constant buffer
		661	Value *tmp = new_LValue(func, FILE_GPR);
		662	uint8_t b = prog->driver->io.resInfoCBSlot;
		663	off += prog->driver->io.suInfoBase;
		664	if (prog->getType() > Program::TYPE_VERTEX)
		665	off += 16 * 2 * 4;
		666	if (prog->getType() > Program::TYPE_GEOMETRY)
		667	off += 16 * 2 * 4;
		668	*ms_x = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
		669	FILE_MEMORY_CONST, b, TYPE_U32, off + 0), NULL);
		670	*ms_y = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
		671	FILE_MEMORY_CONST, b, TYPE_U32, off + 4), NULL);
		672	ms = bld.mkOp2v(OP_ADD, TYPE_U32, tmp, ms_x, *ms_y);
		673	}
		674
		675	void NV50LoweringPreSSA::loadMsInfo(Value ms, Value s, Value dx, Value dy) {
		676	// Given a MS level, and a sample id, compute the delta x/y
		677	uint8_t b = prog->driver->io.msInfoCBSlot;
		678	Value off = new_LValue(func, FILE_ADDRESS), t = new_LValue(func, FILE_GPR);
		679
		680	// The required information is at mslevel * 16 * 4 + sample * 8
		681	// = (mslevel * 8 + sample) * 8
		682	bld.mkOp2(OP_SHL,
		683	TYPE_U32,
		684	off,
		685	bld.mkOp2v(OP_ADD, TYPE_U32, t,
		686	bld.mkOp2v(OP_SHL, TYPE_U32, t, ms, bld.mkImm(3)),
		687	s),
		688	bld.mkImm(3));
		689	*dx = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
		690	FILE_MEMORY_CONST, b, TYPE_U32,
		691	prog->driver->io.msInfoBase), off);
		692	*dy = bld.mkLoadv(TYPE_U32, bld.mkSymbol(
		693	FILE_MEMORY_CONST, b, TYPE_U32,
		694	prog->driver->io.msInfoBase + 4), off);
		695	}
		696
		697	bool
		698	NV50LoweringPreSSA::handleTEX(TexInstruction *i)
		699	{
		700	const int arg = i->tex.target.getArgCount();
		701	const int dref = arg;
		702	const int lod = i->tex.target.isShadow() ? (arg + 1) : arg;
		703
		704	// handle MS, which means looking up the MS params for this texture, and
		705	// adjusting the input coordinates to point at the right sample.
		706	if (i->tex.target.isMS()) {
		707	Value *x = i->getSrc(0);
		708	Value *y = i->getSrc(1);
		709	Value *s = i->getSrc(arg - 1);
		710	Value tx = new_LValue(func, FILE_GPR), ty = new_LValue(func, FILE_GPR),
		711	ms, ms_x, ms_y, dx, *dy;
		712
		713	i->tex.target.clearMS();
		714
		715	loadTexMsInfo(i->tex.r * 4 * 2, &ms, &ms_x, &ms_y);
		716	loadMsInfo(ms, s, &dx, &dy);
		717
		718	bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
		719	bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
		720	bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
		721	bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
		722	i->setSrc(0, tx);
		723	i->setSrc(1, ty);
		724	i->setSrc(arg - 1, bld.loadImm(NULL, 0));
		725	}
		726
		727	// dref comes before bias/lod
		728	if (i->tex.target.isShadow())
		729	if (i->op == OP_TXB \|\| i->op == OP_TXL)
		730	i->swapSources(dref, lod);
		731
		732	if (i->tex.target.isArray()) {
		733	if (i->op != OP_TXF) {
		734	// array index must be converted to u32, but it's already an integer
		735	// for TXF
		736	Value *layer = i->getSrc(arg - 1);
		737	LValue *src = new_LValue(func, FILE_GPR);
		738	bld.mkCvt(OP_CVT, TYPE_U32, src, TYPE_F32, layer);
		739	bld.mkOp2(OP_MIN, TYPE_U32, src, src, bld.loadImm(NULL, 511));
		740	i->setSrc(arg - 1, src);
		741	}
		742	if (i->tex.target.isCube() && i->srcCount() > 4) {
		743	std::vector acube, a2d;
		744	int c;
		745
		746	acube.resize(4);
		747	for (c = 0; c < 4; ++c)
		748	acube[c] = i->getSrc(c);
		749	a2d.resize(4);
		750	for (c = 0; c < 3; ++c)
		751	a2d[c] = new_LValue(func, FILE_GPR);
		752	a2d[3] = NULL;
		753
		754	bld.mkTex(OP_TEXPREP, TEX_TARGET_CUBE_ARRAY, i->tex.r, i->tex.s,
		755	a2d, acube)->asTex()->tex.mask = 0x7;
		756
		757	for (c = 0; c < 3; ++c)
		758	i->setSrc(c, a2d[c]);
		759	for (; i->srcExists(c + 1); ++c)
		760	i->setSrc(c, i->getSrc(c + 1));
		761	i->setSrc(c, NULL);
		762	assert(c <= 4);
		763
		764	i->tex.target = i->tex.target.isShadow() ?
		765	TEX_TARGET_2D_ARRAY_SHADOW : TEX_TARGET_2D_ARRAY;
		766	}
		767	}
		768
		769	// texel offsets are 3 immediate fields in the instruction,
		770	// nv50 cannot do textureGatherOffsets
		771	assert(i->tex.useOffsets <= 1);
		772	if (i->tex.useOffsets) {
		773	for (int c = 0; c < 3; ++c) {
		774	ImmediateValue val;
		775	if (!i->offset[0][c].getImmediate(val))
		776	assert(!"non-immediate offset");
		777	i->tex.offset[c] = val.reg.data.u32;
		778	i->offset[0][c].set(NULL);
		779	}
		780	}
		781
		782	return true;
		783	}
		784
		785	// Bias must be equal for all threads of a quad or lod calculation will fail.
		786	//
		787	// The lanes of a quad are grouped by the bit in the condition register they
		788	// have set, which is selected by differing bias values.
		789	// Move the input values for TEX into a new register set for each group and
		790	// execute TEX only for a specific group.
		791	// We always need to use 4 new registers for the inputs/outputs because the
		792	// implicitly calculated derivatives must be correct.
		793	//
		794	// TODO: move to SSA phase so we can easily determine whether bias is constant
		795	bool
		796	NV50LoweringPreSSA::handleTXB(TexInstruction *i)
		797	{
		798	const CondCode cc[4] = { CC_EQU, CC_S, CC_C, CC_O };
		799	int l, d;
		800
		801	// We can't actually apply bias and do a compare for a cube
		802	// texture. Since the compare has to be done before the filtering, just
		803	// drop the bias on the floor.
		804	if (i->tex.target == TEX_TARGET_CUBE_SHADOW) {
		805	i->op = OP_TEX;
		806	i->setSrc(3, i->getSrc(4));
		807	i->setSrc(4, NULL);
		808	return handleTEX(i);
		809	}
		810
		811	handleTEX(i);
		812	Value *bias = i->getSrc(i->tex.target.getArgCount());
		813	if (bias->isUniform())
		814	return true;
		815
		816	Instruction *cond = bld.mkOp1(OP_UNION, TYPE_U32, bld.getScratch(),
		817	bld.loadImm(NULL, 1));
		818	bld.setPosition(cond, false);
		819
		820	for (l = 1; l < 4; ++l) {
		821	const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
		822	Value *bit = bld.getSSA();
		823	Value *pred = bld.getScratch(1, FILE_FLAGS);
		824	Value *imm = bld.loadImm(NULL, (1 << l));
		825	bld.mkQuadop(qop, pred, l, bias, bias)->flagsDef = 0;
		826	bld.mkMov(bit, imm)->setPredicate(CC_EQ, pred);
		827	cond->setSrc(l, bit);
		828	}
		829	Value *flags = bld.getScratch(1, FILE_FLAGS);
		830	bld.setPosition(cond, true);
		831	bld.mkCvt(OP_CVT, TYPE_U8, flags, TYPE_U32, cond->getDef(0));
		832
		833	Instruction *tex[4];
		834	for (l = 0; l < 4; ++l) {
		835	(tex[l] = cloneForward(func, i))->setPredicate(cc[l], flags);
		836	bld.insert(tex[l]);
		837	}
		838
		839	Value *res[4][4];
		840	for (d = 0; i->defExists(d); ++d)
		841	res[0][d] = tex[0]->getDef(d);
		842	for (l = 1; l < 4; ++l) {
		843	for (d = 0; tex[l]->defExists(d); ++d) {
		844	res[l][d] = cloneShallow(func, res[0][d]);
		845	bld.mkMov(res[l][d], tex[l]->getDef(d))->setPredicate(cc[l], flags);
		846	}
		847	}
		848
		849	for (d = 0; i->defExists(d); ++d) {
		850	Instruction *dst = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(d));
		851	for (l = 0; l < 4; ++l)
		852	dst->setSrc(l, res[l][d]);
		853	}
		854	delete_Instruction(prog, i);
		855	return true;
		856	}
		857
		858	// LOD must be equal for all threads of a quad.
		859	// Unlike with TXB, here we can just diverge since there's no LOD calculation
		860	// that would require all 4 threads' sources to be set up properly.
		861	bool
		862	NV50LoweringPreSSA::handleTXL(TexInstruction *i)
		863	{
		864	handleTEX(i);
		865	Value *lod = i->getSrc(i->tex.target.getArgCount());
		866	if (lod->isUniform())
		867	return true;
		868
		869	BasicBlock *currBB = i->bb;
		870	BasicBlock *texiBB = i->bb->splitBefore(i, false);
		871	BasicBlock *joinBB = i->bb->splitAfter(i);
		872
		873	bld.setPosition(currBB, true);
		874	currBB->joinAt = bld.mkFlow(OP_JOINAT, joinBB, CC_ALWAYS, NULL);
		875
		876	for (int l = 0; l <= 3; ++l) {
		877	const uint8_t qop = QUADOP(SUBR, SUBR, SUBR, SUBR);
		878	Value *pred = bld.getScratch(1, FILE_FLAGS);
		879	bld.setPosition(currBB, true);
		880	bld.mkQuadop(qop, pred, l, lod, lod)->flagsDef = 0;
		881	bld.mkFlow(OP_BRA, texiBB, CC_EQ, pred)->fixed = 1;
		882	currBB->cfg.attach(&texiBB->cfg, Graph::Edge::FORWARD);
		883	if (l <= 2) {
		884	BasicBlock *laneBB = new BasicBlock(func);
		885	currBB->cfg.attach(&laneBB->cfg, Graph::Edge::TREE);
		886	currBB = laneBB;
		887	}
		888	}
		889	bld.setPosition(joinBB, false);
		890	bld.mkOp(OP_JOIN, TYPE_NONE, NULL);
		891	return true;
		892	}
		893
		894	bool
		895	NV50LoweringPreSSA::handleTXD(TexInstruction *i)
		896	{
		897	static const uint8_t qOps[4][2] =
		898	{
		899	{ QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0
		900	{ QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1
		901	{ QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
		902	{ QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
		903	};
		904	Value *def[4][4];
		905	Value *crd[3];
		906	Instruction *tex;
		907	Value *zero = bld.loadImm(bld.getSSA(), 0);
		908	int l, c;
		909	const int dim = i->tex.target.getDim();
		910
		911	handleTEX(i);
		912	i->op = OP_TEX; // no need to clone dPdx/dPdy later
		913
		914	for (c = 0; c < dim; ++c)
		915	crd[c] = bld.getScratch();
		916
		917	bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
		918	for (l = 0; l < 4; ++l) {
		919	// mov coordinates from lane l to all lanes
		920	for (c = 0; c < dim; ++c)
		921	bld.mkQuadop(0x00, crd[c], l, i->getSrc(c), zero);
		922	// add dPdx from lane l to lanes dx
		923	for (c = 0; c < dim; ++c)
		924	bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
		925	// add dPdy from lane l to lanes dy
		926	for (c = 0; c < dim; ++c)
		927	bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
		928	// texture
		929	bld.insert(tex = cloneForward(func, i));
		930	for (c = 0; c < dim; ++c)
		931	tex->setSrc(c, crd[c]);
		932	// save results
		933	for (c = 0; i->defExists(c); ++c) {
		934	Instruction *mov;
		935	def[c][l] = bld.getSSA();
		936	mov = bld.mkMov(def[c][l], tex->getDef(c));
		937	mov->fixed = 1;
		938	mov->lanes = 1 << l;
		939	}
		940	}
		941	bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
		942
		943	for (c = 0; i->defExists(c); ++c) {
		944	Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
		945	for (l = 0; l < 4; ++l)
		946	u->setSrc(l, def[c][l]);
		947	}
		948
		949	i->bb->remove(i);
		950	return true;
		951	}
		952
		953	bool
		954	NV50LoweringPreSSA::handleTXLQ(TexInstruction *i)
		955	{
		956	handleTEX(i);
		957	bld.setPosition(i, true);
		958
		959	/* The returned values are not quite what we want:
		960	* (a) convert from s32 to f32
		961	* (b) multiply by 1/256
		962	*/
		963	for (int def = 0; def < 2; ++def) {
		964	if (!i->defExists(def))
		965	continue;
		966	bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), TYPE_S32, i->getDef(def));
		967	bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
		968	i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
		969	}
		970	return true;
		971	}
		972
		973	bool
		974	NV50LoweringPreSSA::handleSET(Instruction *i)
		975	{
		976	if (i->dType == TYPE_F32) {
		977	bld.setPosition(i, true);
		978	i->dType = TYPE_U32;
		979	bld.mkOp1(OP_ABS, TYPE_S32, i->getDef(0), i->getDef(0));
		980	bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(0), TYPE_S32, i->getDef(0));
		981	}
		982	return true;
		983	}
		984
		985	bool
		986	NV50LoweringPreSSA::handleSLCT(CmpInstruction *i)
		987	{
		988	Value *src0 = bld.getSSA();
		989	Value *src1 = bld.getSSA();
		990	Value *pred = bld.getScratch(1, FILE_FLAGS);
		991
		992	Value *v0 = i->getSrc(0);
		993	Value *v1 = i->getSrc(1);
		994	// XXX: these probably shouldn't be immediates in the first place ...
		995	if (v0->asImm())
		996	v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
		997	if (v1->asImm())
		998	v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
		999
		1000	bld.setPosition(i, true);
		1001	bld.mkMov(src0, v0)->setPredicate(CC_NE, pred);
		1002	bld.mkMov(src1, v1)->setPredicate(CC_EQ, pred);
		1003	bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
		1004
		1005	bld.setPosition(i, false);
		1006	i->op = OP_SET;
		1007	i->setFlagsDef(0, pred);
		1008	i->dType = TYPE_U8;
		1009	i->setSrc(0, i->getSrc(2));
		1010	i->setSrc(2, NULL);
		1011	i->setSrc(1, bld.loadImm(NULL, 0));
		1012
		1013	return true;
		1014	}
		1015
		1016	bool
		1017	NV50LoweringPreSSA::handleSELP(Instruction *i)
		1018	{
		1019	Value *src0 = bld.getSSA();
		1020	Value *src1 = bld.getSSA();
		1021
		1022	Value *v0 = i->getSrc(0);
		1023	Value *v1 = i->getSrc(1);
		1024	if (v0->asImm())
		1025	v0 = bld.mkMov(bld.getSSA(), v0)->getDef(0);
		1026	if (v1->asImm())
		1027	v1 = bld.mkMov(bld.getSSA(), v1)->getDef(0);
		1028
		1029	bld.mkMov(src0, v0)->setPredicate(CC_NE, i->getSrc(2));
		1030	bld.mkMov(src1, v1)->setPredicate(CC_EQ, i->getSrc(2));
		1031	bld.mkOp2(OP_UNION, i->dType, i->getDef(0), src0, src1);
		1032	delete_Instruction(prog, i);
		1033	return true;
		1034	}
		1035
		1036	bool
		1037	NV50LoweringPreSSA::handleWRSV(Instruction *i)
		1038	{
		1039	Symbol *sym = i->getSrc(0)->asSym();
		1040
		1041	// these are all shader outputs, $sreg are not writeable
		1042	uint32_t addr = targ->getSVAddress(FILE_SHADER_OUTPUT, sym);
		1043	if (addr >= 0x400)
		1044	return false;
		1045	sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
		1046
		1047	bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0), i->getSrc(1));
		1048
		1049	bld.getBB()->remove(i);
		1050	return true;
		1051	}
		1052
		1053	bool
		1054	NV50LoweringPreSSA::handleCALL(Instruction *i)
		1055	{
		1056	if (prog->getType() == Program::TYPE_COMPUTE) {
		1057	// Add implicit "thread id" argument in $r0 to the function
		1058	i->setSrc(i->srcCount(), tid);
		1059	}
		1060	return true;
		1061	}
		1062
		1063	bool
		1064	NV50LoweringPreSSA::handlePRECONT(Instruction *i)
		1065	{
		1066	delete_Instruction(prog, i);
		1067	return true;
		1068	}
		1069
		1070	bool
		1071	NV50LoweringPreSSA::handleCONT(Instruction *i)
		1072	{
		1073	i->op = OP_BRA;
		1074	return true;
		1075	}
		1076
		1077	bool
		1078	NV50LoweringPreSSA::handleRDSV(Instruction *i)
		1079	{
		1080	Symbol *sym = i->getSrc(0)->asSym();
		1081	uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
		1082	Value *def = i->getDef(0);
		1083	SVSemantic sv = sym->reg.data.sv.sv;
		1084	int idx = sym->reg.data.sv.index;
		1085
		1086	if (addr >= 0x400) // mov $sreg
		1087	return true;
		1088
		1089	switch (sv) {
		1090	case SV_POSITION:
		1091	assert(prog->getType() == Program::TYPE_FRAGMENT);
		1092	bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
		1093	break;
		1094	case SV_FACE:
		1095	bld.mkInterp(NV50_IR_INTERP_FLAT, def, addr, NULL);
		1096	if (i->dType == TYPE_F32) {
		1097	bld.mkOp2(OP_OR, TYPE_U32, def, def, bld.mkImm(0x00000001));
		1098	bld.mkOp1(OP_NEG, TYPE_S32, def, def);
		1099	bld.mkCvt(OP_CVT, TYPE_F32, def, TYPE_S32, def);
		1100	}
		1101	break;
		1102	case SV_NCTAID:
		1103	case SV_CTAID:
		1104	case SV_NTID:
		1105	if ((sv == SV_NCTAID && idx >= 2) \|\|
		1106	(sv == SV_NTID && idx >= 3)) {
		1107	bld.mkMov(def, bld.mkImm(1));
		1108	} else if (sv == SV_CTAID && idx >= 2) {
		1109	bld.mkMov(def, bld.mkImm(0));
		1110	} else {
		1111	Value *x = bld.getSSA(2);
		1112	bld.mkOp1(OP_LOAD, TYPE_U16, x,
		1113	bld.mkSymbol(FILE_MEMORY_SHARED, 0, TYPE_U16, addr));
		1114	bld.mkCvt(OP_CVT, TYPE_U32, def, TYPE_U16, x);
		1115	}
		1116	break;
		1117	case SV_TID:
		1118	if (idx == 0) {
		1119	bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x0000ffff));
		1120	} else if (idx == 1) {
		1121	bld.mkOp2(OP_AND, TYPE_U32, def, tid, bld.mkImm(0x03ff0000));
		1122	bld.mkOp2(OP_SHR, TYPE_U32, def, def, bld.mkImm(16));
		1123	} else if (idx == 2) {
		1124	bld.mkOp2(OP_SHR, TYPE_U32, def, tid, bld.mkImm(26));
		1125	} else {
		1126	bld.mkMov(def, bld.mkImm(0));
		1127	}
		1128	break;
		1129	case SV_SAMPLE_POS: {
		1130	Value *off = new_LValue(func, FILE_ADDRESS);
		1131	bld.mkOp1(OP_RDSV, TYPE_U32, def, bld.mkSysVal(SV_SAMPLE_INDEX, 0));
		1132	bld.mkOp2(OP_SHL, TYPE_U32, off, def, bld.mkImm(3));
		1133	bld.mkLoad(TYPE_F32,
		1134	def,
		1135	bld.mkSymbol(
		1136	FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
		1137	TYPE_U32, prog->driver->io.sampleInfoBase + 4 * idx),
		1138	off);
		1139	break;
		1140	}
		1141	default:
		1142	bld.mkFetch(i->getDef(0), i->dType,
		1143	FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), NULL);
		1144	break;
		1145	}
		1146	bld.getBB()->remove(i);
		1147	return true;
		1148	}
		1149
		1150	bool
		1151	NV50LoweringPreSSA::handleDIV(Instruction *i)
		1152	{
		1153	if (!isFloatType(i->dType))
		1154	return true;
		1155	bld.setPosition(i, false);
		1156	Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(), i->getSrc(1));
		1157	i->op = OP_MUL;
		1158	i->setSrc(1, rcp->getDef(0));
		1159	return true;
		1160	}
		1161
		1162	bool
		1163	NV50LoweringPreSSA::handleSQRT(Instruction *i)
		1164	{
		1165	Instruction *rsq = bld.mkOp1(OP_RSQ, TYPE_F32,
		1166	bld.getSSA(), i->getSrc(0));
		1167	i->op = OP_MUL;
		1168	i->setSrc(1, rsq->getDef(0));
		1169
		1170	return true;
		1171	}
		1172
		1173	bool
		1174	NV50LoweringPreSSA::handlePOW(Instruction *i)
		1175	{
		1176	LValue *val = bld.getScratch();
		1177
		1178	bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
		1179	bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
		1180	bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
		1181
		1182	i->op = OP_EX2;
		1183	i->setSrc(0, val);
		1184	i->setSrc(1, NULL);
		1185
		1186	return true;
		1187	}
		1188
		1189	bool
		1190	NV50LoweringPreSSA::handleEXPORT(Instruction *i)
		1191	{
		1192	if (prog->getType() == Program::TYPE_FRAGMENT) {
		1193	if (i->getIndirect(0, 0)) {
		1194	// TODO: redirect to l[] here, load to GPRs at exit
		1195	return false;
		1196	} else {
		1197	int id = i->getSrc(0)->reg.data.offset / 4; // in 32 bit reg units
		1198
		1199	i->op = OP_MOV;
		1200	i->subOp = NV50_IR_SUBOP_MOV_FINAL;
		1201	i->src(0).set(i->src(1));
		1202	i->setSrc(1, NULL);
		1203	i->setDef(0, new_LValue(func, FILE_GPR));
		1204	i->getDef(0)->reg.data.id = id;
		1205
		1206	prog->maxGPR = MAX2(prog->maxGPR, id);
		1207	}
		1208	}
		1209	return true;
		1210	}
		1211
		1212	// Handle indirect addressing in geometry shaders:
		1213	//
		1214	// ld $r0 a[$a1][$a2+k] ->
		1215	// ld $r0 a[($a1 + $a2 * $vstride) + k], where k *= $vstride is implicit
		1216	//
		1217	bool
		1218	NV50LoweringPreSSA::handleLOAD(Instruction *i)
		1219	{
		1220	ValueRef src = i->src(0);
		1221
		1222	if (src.isIndirect(1)) {
		1223	assert(prog->getType() == Program::TYPE_GEOMETRY);
		1224	Value *addr = i->getIndirect(0, 1);
		1225
		1226	if (src.isIndirect(0)) {
		1227	// base address is in an address register, so move to a GPR
		1228	Value *base = bld.getScratch();
		1229	bld.mkMov(base, addr);
		1230
		1231	Symbol *sv = bld.mkSysVal(SV_VERTEX_STRIDE, 0);
		1232	Value *vstride = bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getSSA(), sv);
		1233	Value *attrib = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
		1234	i->getIndirect(0, 0), bld.mkImm(2));
		1235
		1236	// Calculate final address: addr = base + attr*vstride; use 16-bit
		1237	// multiplication since 32-bit would be lowered to multiple
		1238	// instructions, and we only need the low 16 bits of the result
		1239	Value a[2], b[2];
		1240	bld.mkSplit(a, 2, attrib);
		1241	bld.mkSplit(b, 2, vstride);
		1242	Value *sum = bld.mkOp3v(OP_MAD, TYPE_U16, bld.getSSA(), a[0], b[0],
		1243	base);
		1244
		1245	// move address from GPR into an address register
		1246	addr = bld.getSSA(2, FILE_ADDRESS);
		1247	bld.mkMov(addr, sum);
		1248	}
		1249
		1250	i->setIndirect(0, 1, NULL);
		1251	i->setIndirect(0, 0, addr);
		1252	}
		1253
		1254	return true;
		1255	}
		1256
		1257	bool
		1258	NV50LoweringPreSSA::handlePFETCH(Instruction *i)
		1259	{
		1260	assert(prog->getType() == Program::TYPE_GEOMETRY);
		1261
		1262	// NOTE: cannot use getImmediate here, not in SSA form yet, move to
		1263	// later phase if that assertion ever triggers:
		1264
		1265	ImmediateValue *imm = i->getSrc(0)->asImm();
		1266	assert(imm);
		1267
		1268	assert(imm->reg.data.u32 <= 127); // TODO: use address reg if that happens
		1269
		1270	if (i->srcExists(1)) {
		1271	// indirect addressing of vertex in primitive space
		1272
		1273	LValue *val = bld.getScratch();
		1274	Value *ptr = bld.getSSA(2, FILE_ADDRESS);
		1275	bld.mkOp2v(OP_SHL, TYPE_U32, ptr, i->getSrc(1), bld.mkImm(2));
		1276	bld.mkOp2v(OP_PFETCH, TYPE_U32, val, imm, ptr);
		1277
		1278	// NOTE: PFETCH directly to an $aX only works with direct addressing
		1279	i->op = OP_SHL;
		1280	i->setSrc(0, val);
		1281	i->setSrc(1, bld.mkImm(0));
		1282	}
		1283
		1284	return true;
		1285	}
		1286
		1287	// Set flags according to predicate and make the instruction read $cX.
		1288	void
		1289	NV50LoweringPreSSA::checkPredicate(Instruction *insn)
		1290	{
		1291	Value *pred = insn->getPredicate();
		1292	Value *cdst;
		1293
		1294	// FILE_PREDICATE will simply be changed to FLAGS on conversion to SSA
		1295	if (!pred \|\|
		1296	pred->reg.file == FILE_FLAGS \|\| pred->reg.file == FILE_PREDICATE)
		1297	return;
		1298
		1299	cdst = bld.getSSA(1, FILE_FLAGS);
		1300
		1301	bld.mkCmp(OP_SET, CC_NEU, insn->dType, cdst, insn->dType, bld.loadImm(NULL, 0), pred);
		1302
		1303	insn->setPredicate(insn->cc, cdst);
		1304	}
		1305
		1306	//
		1307	// - add quadop dance for texturing
		1308	// - put FP outputs in GPRs
		1309	// - convert instruction sequences
		1310	//
		1311	bool
		1312	NV50LoweringPreSSA::visit(Instruction *i)
		1313	{
		1314	bld.setPosition(i, false);
		1315
		1316	if (i->cc != CC_ALWAYS)
		1317	checkPredicate(i);
		1318
		1319	switch (i->op) {
		1320	case OP_TEX:
		1321	case OP_TXF:
		1322	case OP_TXG:
		1323	return handleTEX(i->asTex());
		1324	case OP_TXB:
		1325	return handleTXB(i->asTex());
		1326	case OP_TXL:
		1327	return handleTXL(i->asTex());
		1328	case OP_TXD:
		1329	return handleTXD(i->asTex());
		1330	case OP_TXLQ:
		1331	return handleTXLQ(i->asTex());
		1332	case OP_EX2:
		1333	bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
		1334	i->setSrc(0, i->getDef(0));
		1335	break;
		1336	case OP_SET:
		1337	return handleSET(i);
		1338	case OP_SLCT:
		1339	return handleSLCT(i->asCmp());
		1340	case OP_SELP:
		1341	return handleSELP(i);
		1342	case OP_POW:
		1343	return handlePOW(i);
		1344	case OP_DIV:
		1345	return handleDIV(i);
		1346	case OP_SQRT:
		1347	return handleSQRT(i);
		1348	case OP_EXPORT:
		1349	return handleEXPORT(i);
		1350	case OP_LOAD:
		1351	return handleLOAD(i);
		1352	case OP_RDSV:
		1353	return handleRDSV(i);
		1354	case OP_WRSV:
		1355	return handleWRSV(i);
		1356	case OP_CALL:
		1357	return handleCALL(i);
		1358	case OP_PRECONT:
		1359	return handlePRECONT(i);
		1360	case OP_CONT:
		1361	return handleCONT(i);
		1362	case OP_PFETCH:
		1363	return handlePFETCH(i);
		1364	default:
		1365	break;
		1366	}
		1367	return true;
		1368	}
		1369
		1370	bool
		1371	TargetNV50::runLegalizePass(Program *prog, CGStage stage) const
		1372	{
		1373	bool ret = false;
		1374
		1375	if (stage == CG_STAGE_PRE_SSA) {
		1376	NV50LoweringPreSSA pass(prog);
		1377	ret = pass.run(prog, false, true);
		1378	} else
		1379	if (stage == CG_STAGE_SSA) {
		1380	if (!prog->targetPriv)
		1381	prog->targetPriv = new std::list();
		1382	NV50LegalizeSSA pass(prog);
		1383	ret = pass.run(prog, false, true);
		1384	} else
		1385	if (stage == CG_STAGE_POST_RA) {
		1386	NV50LegalizePostRA pass;
		1387	ret = pass.run(prog, false, true);
		1388	if (prog->targetPriv)
		1389	delete reinterpret_cast *>(prog->targetPriv);
		1390	}
		1391	return ret;
		1392	}
		1393
		1394	} // namespace nv50_ir

Subversion Repositories Kolibri OS

(root)/contrib/sdk/sources/Mesa/mesa-10.6.0/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nv50.cpp – Rev 5571