WebSVN – Kolibri OS – Blame – /contrib/sdk/sources/Mesa/mesa-10.6.0/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp

Rev	Author	Line No.	Line
5564	serge	1	/*
		2	* Copyright 2011 Christoph Bumiller
		3	*
		4	* Permission is hereby granted, free of charge, to any person obtaining a
		5	* copy of this software and associated documentation files (the "Software"),
		6	* to deal in the Software without restriction, including without limitation
		7	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
		8	* and/or sell copies of the Software, and to permit persons to whom the
		9	* Software is furnished to do so, subject to the following conditions:
		10	*
		11	* The above copyright notice and this permission notice shall be included in
		12	* all copies or substantial portions of the Software.
		13	*
		14	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
		15	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
		16	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
		17	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR
		18	* OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
		19	* ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
		20	* OTHER DEALINGS IN THE SOFTWARE.
		21	*/
		22
#include "codegen/nv50_ir.h"
#include "codegen/nv50_ir_build_util.h"

#include "codegen/nv50_ir_target_nvc0.h"
#include "codegen/nv50_ir_lowering_nvc0.h"

#include <limits>
		30
		31	namespace nv50_ir {
		32
		33	#define QOP_ADD 0
		34	#define QOP_SUBR 1
		35	#define QOP_SUB 2
		36	#define QOP_MOV2 3
		37
		38	// UL UR LL LR
		39	#define QUADOP(q, r, s, t) \
		40	((QOP_##q << 6) \| (QOP_##r << 4) \| \
		41	(QOP_##s << 2) \| (QOP_##t << 0))
		42
		43	void
		44	NVC0LegalizeSSA::handleDIV(Instruction *i)
		45	{
		46	FlowInstruction *call;
		47	int builtin;
		48	Value *def[2];
		49
		50	bld.setPosition(i, false);
		51	def[0] = bld.mkMovToReg(0, i->getSrc(0))->getDef(0);
		52	def[1] = bld.mkMovToReg(1, i->getSrc(1))->getDef(0);
		53	switch (i->dType) {
		54	case TYPE_U32: builtin = NVC0_BUILTIN_DIV_U32; break;
		55	case TYPE_S32: builtin = NVC0_BUILTIN_DIV_S32; break;
		56	default:
		57	return;
		58	}
		59	call = bld.mkFlow(OP_CALL, NULL, CC_ALWAYS, NULL);
		60	bld.mkMov(i->getDef(0), def[(i->op == OP_DIV) ? 0 : 1]);
		61	bld.mkClobber(FILE_GPR, (i->op == OP_DIV) ? 0xe : 0xd, 2);
		62	bld.mkClobber(FILE_PREDICATE, (i->dType == TYPE_S32) ? 0xf : 0x3, 0);
		63
		64	call->fixed = 1;
		65	call->absolute = call->builtin = 1;
		66	call->target.builtin = builtin;
		67	delete_Instruction(prog, i);
		68	}
		69
		70	void
		71	NVC0LegalizeSSA::handleRCPRSQ(Instruction *i)
		72	{
		73	assert(i->dType == TYPE_F64);
		74	// There are instructions that will compute the high 32 bits of the 64-bit
		75	// float. We will just stick 0 in the bottom 32 bits.
		76
		77	bld.setPosition(i, false);
		78
		79	// 1. Take the source and it up.
		80	Value src[2], dst[2], *def = i->getDef(0);
		81	bld.mkSplit(src, 4, i->getSrc(0));
		82
		83	// 2. We don't care about the low 32 bits of the destination. Stick a 0 in.
		84	dst[0] = bld.loadImm(NULL, 0);
		85	dst[1] = bld.getSSA();
		86
		87	// 3. The new version of the instruction takes the high 32 bits of the
		88	// source and outputs the high 32 bits of the destination.
		89	i->setSrc(0, src[1]);
		90	i->setDef(0, dst[1]);
		91	i->setType(TYPE_F32);
		92	i->subOp = NV50_IR_SUBOP_RCPRSQ_64H;
		93
		94	// 4. Recombine the two dst pieces back into the original destination.
		95	bld.setPosition(i, true);
		96	bld.mkOp2(OP_MERGE, TYPE_U64, def, dst[0], dst[1]);
		97	}
		98
		99	void
		100	NVC0LegalizeSSA::handleFTZ(Instruction *i)
		101	{
		102	// Only want to flush float inputs
		103	assert(i->sType == TYPE_F32);
		104
		105	// If we're already flushing denorms (and NaN's) to zero, no need for this.
		106	if (i->dnz)
		107	return;
		108
		109	// Only certain classes of operations can flush
		110	OpClass cls = prog->getTarget()->getOpClass(i->op);
		111	if (cls != OPCLASS_ARITH && cls != OPCLASS_COMPARE &&
		112	cls != OPCLASS_CONVERT)
		113	return;
		114
		115	i->ftz = true;
		116	}
		117
		118	bool
		119	NVC0LegalizeSSA::visit(Function *fn)
		120	{
		121	bld.setProgram(fn->getProgram());
		122	return true;
		123	}
		124
		125	bool
		126	NVC0LegalizeSSA::visit(BasicBlock *bb)
		127	{
		128	Instruction *next;
		129	for (Instruction *i = bb->getEntry(); i; i = next) {
		130	next = i->next;
		131	if (i->sType == TYPE_F32) {
		132	if (prog->getType() != Program::TYPE_COMPUTE)
		133	handleFTZ(i);
		134	continue;
		135	}
		136	switch (i->op) {
		137	case OP_DIV:
		138	case OP_MOD:
		139	handleDIV(i);
		140	break;
		141	case OP_RCP:
		142	case OP_RSQ:
		143	if (i->dType == TYPE_F64)
		144	handleRCPRSQ(i);
		145	break;
		146	default:
		147	break;
		148	}
		149	}
		150	return true;
		151	}
		152
		153	NVC0LegalizePostRA::NVC0LegalizePostRA(const Program *prog)
		154	: rZero(NULL),
		155	carry(NULL),
		156	needTexBar(prog->getTarget()->getChipset() >= 0xe0)
		157	{
		158	}
		159
		160	bool
		161	NVC0LegalizePostRA::insnDominatedBy(const Instruction *later,
		162	const Instruction *early) const
		163	{
		164	if (early->bb == later->bb)
		165	return early->serial < later->serial;
		166	return later->bb->dominatedBy(early->bb);
		167	}
		168
		169	void
		170	NVC0LegalizePostRA::addTexUse(std::list &uses,
		171	Instruction usei, const Instruction insn)
		172	{
		173	bool add = true;
		174	for (std::list::iterator it = uses.begin();
		175	it != uses.end();) {
		176	if (insnDominatedBy(usei, it->insn)) {
		177	add = false;
		178	break;
		179	}
		180	if (insnDominatedBy(it->insn, usei))
		181	it = uses.erase(it);
		182	else
		183	++it;
		184	}
		185	if (add)
		186	uses.push_back(TexUse(usei, insn));
		187	}
		188
		189	void
		190	NVC0LegalizePostRA::findOverwritingDefs(const Instruction *texi,
		191	Instruction *insn,
		192	const BasicBlock *term,
		193	std::list &uses)
		194	{
		195	while (insn->op == OP_MOV && insn->getDef(0)->equals(insn->getSrc(0)))
		196	insn = insn->getSrc(0)->getUniqueInsn();
		197
		198	if (!insn->bb->reachableBy(texi->bb, term))
		199	return;
		200
		201	switch (insn->op) {
		202	/* Values not connected to the tex's definition through any of these should
		203	* not be conflicting.
		204	*/
		205	case OP_SPLIT:
		206	case OP_MERGE:
		207	case OP_PHI:
		208	case OP_UNION:
		209	/* recurse again */
		210	for (int s = 0; insn->srcExists(s); ++s)
		211	findOverwritingDefs(texi, insn->getSrc(s)->getUniqueInsn(), term,
		212	uses);
		213	break;
		214	default:
		215	// if (!isTextureOp(insn->op)) // TODO: are TEXes always ordered ?
		216	addTexUse(uses, insn, texi);
		217	break;
		218	}
		219	}
		220
		221	void
		222	NVC0LegalizePostRA::findFirstUses(
		223	const Instruction *texi,
		224	const Instruction *insn,
		225	std::list &uses,
		226	std::tr1::unordered_set& visited)
		227	{
		228	for (int d = 0; insn->defExists(d); ++d) {
		229	Value *v = insn->getDef(d);
		230	for (Value::UseIterator u = v->uses.begin(); u != v->uses.end(); ++u) {
		231	Instruction usei = (u)->getInsn();
		232
		233	// NOTE: In case of a loop that overwrites a value but never uses
		234	// it, it can happen that we have a cycle of uses that consists only
		235	// of phis and no-op moves and will thus cause an infinite loop here
		236	// since these are not considered actual uses.
		237	// The most obvious (and perhaps the only) way to prevent this is to
		238	// remember which instructions we've already visited.
		239
		240	if (visited.find(usei) != visited.end())
		241	continue;
		242
		243	visited.insert(usei);
		244
		245	if (usei->op == OP_PHI \|\| usei->op == OP_UNION) {
		246	// need a barrier before WAW cases
		247	for (int s = 0; usei->srcExists(s); ++s) {
		248	Instruction *defi = usei->getSrc(s)->getUniqueInsn();
		249	if (defi && &usei->src(s) != *u)
		250	findOverwritingDefs(texi, defi, usei->bb, uses);
		251	}
		252	}
		253
		254	if (usei->op == OP_SPLIT \|\|
		255	usei->op == OP_MERGE \|\|
		256	usei->op == OP_PHI \|\|
		257	usei->op == OP_UNION) {
		258	// these uses don't manifest in the machine code
		259	findFirstUses(texi, usei, uses, visited);
		260	} else
		261	if (usei->op == OP_MOV && usei->getDef(0)->equals(usei->getSrc(0)) &&
		262	usei->subOp != NV50_IR_SUBOP_MOV_FINAL) {
		263	findFirstUses(texi, usei, uses, visited);
		264	} else {
		265	addTexUse(uses, usei, insn);
		266	}
		267	}
		268	}
		269	}
		270
		271	// Texture barriers:
		272	// This pass is a bit long and ugly and can probably be optimized.
		273	//
		274	// 1. obtain a list of TEXes and their outputs' first use(s)
		275	// 2. calculate the barrier level of each first use (minimal number of TEXes,
		276	// over all paths, between the TEX and the use in question)
		277	// 3. for each barrier, if all paths from the source TEX to that barrier
		278	// contain a barrier of lesser level, it can be culled
		279	bool
		280	NVC0LegalizePostRA::insertTextureBarriers(Function *fn)
		281	{
		282	std::list *uses;
		283	std::vector texes;
		284	std::vector bbFirstTex;
		285	std::vector bbFirstUse;
		286	std::vector texCounts;
		287	std::vector useVec;
		288	ArrayList insns;
		289
		290	fn->orderInstructions(insns);
		291
		292	texCounts.resize(fn->allBBlocks.getSize(), 0);
		293	bbFirstTex.resize(fn->allBBlocks.getSize(), insns.getSize());
		294	bbFirstUse.resize(fn->allBBlocks.getSize(), insns.getSize());
		295
		296	// tag BB CFG nodes by their id for later
		297	for (ArrayList::Iterator i = fn->allBBlocks.iterator(); !i.end(); i.next()) {
		298	BasicBlock *bb = reinterpret_cast(i.get());
		299	if (bb)
		300	bb->cfg.tag = bb->getId();
		301	}
		302
		303	// gather the first uses for each TEX
		304	for (int i = 0; i < insns.getSize(); ++i) {
		305	Instruction *tex = reinterpret_cast(insns.get(i));
		306	if (isTextureOp(tex->op)) {
		307	texes.push_back(tex);
		308	if (!texCounts.at(tex->bb->getId()))
		309	bbFirstTex[tex->bb->getId()] = texes.size() - 1;
		310	texCounts[tex->bb->getId()]++;
		311	}
		312	}
		313	insns.clear();
		314	if (texes.empty())
		315	return false;
		316	uses = new std::list[texes.size()];
		317	if (!uses)
		318	return false;
		319	for (size_t i = 0; i < texes.size(); ++i) {
		320	std::tr1::unordered_set visited;
		321	findFirstUses(texes[i], texes[i], uses[i], visited);
		322	}
		323
		324	// determine the barrier level at each use
		325	for (size_t i = 0; i < texes.size(); ++i) {
		326	for (std::list::iterator u = uses[i].begin(); u != uses[i].end();
		327	++u) {
		328	BasicBlock *tb = texes[i]->bb;
		329	BasicBlock *ub = u->insn->bb;
		330	if (tb == ub) {
		331	u->level = 0;
		332	for (size_t j = i + 1; j < texes.size() &&
		333	texes[j]->bb == tb && texes[j]->serial < u->insn->serial;
		334	++j)
		335	u->level++;
		336	} else {
		337	u->level = fn->cfg.findLightestPathWeight(&tb->cfg,
		338	&ub->cfg, texCounts);
		339	if (u->level < 0) {
		340	WARN("Failed to find path TEX -> TEXBAR\n");
		341	u->level = 0;
		342	continue;
		343	}
		344	// this counted all TEXes in the origin block, correct that
		345	u->level -= i - bbFirstTex.at(tb->getId()) + 1 /* this TEX */;
		346	// and did not count the TEXes in the destination block, add those
		347	for (size_t j = bbFirstTex.at(ub->getId()); j < texes.size() &&
		348	texes[j]->bb == ub && texes[j]->serial < u->insn->serial;
		349	++j)
		350	u->level++;
		351	}
		352	assert(u->level >= 0);
		353	useVec.push_back(*u);
		354	}
		355	}
		356	delete[] uses;
		357
		358	// insert the barriers
		359	for (size_t i = 0; i < useVec.size(); ++i) {
		360	Instruction *prev = useVec[i].insn->prev;
		361	if (useVec[i].level < 0)
		362	continue;
		363	if (prev && prev->op == OP_TEXBAR) {
		364	if (prev->subOp > useVec[i].level)
		365	prev->subOp = useVec[i].level;
		366	prev->setSrc(prev->srcCount(), useVec[i].tex->getDef(0));
		367	} else {
		368	Instruction *bar = new_Instruction(func, OP_TEXBAR, TYPE_NONE);
		369	bar->fixed = 1;
		370	bar->subOp = useVec[i].level;
		371	// make use explicit to ease latency calculation
		372	bar->setSrc(bar->srcCount(), useVec[i].tex->getDef(0));
		373	useVec[i].insn->bb->insertBefore(useVec[i].insn, bar);
		374	}
		375	}
		376
		377	if (fn->getProgram()->optLevel < 3)
		378	return true;
		379
		380	std::vector limitT, limitB, limitS; // entry, exit, single
		381
		382	limitT.resize(fn->allBBlocks.getSize(), Limits(0, 0));
		383	limitB.resize(fn->allBBlocks.getSize(), Limits(0, 0));
		384	limitS.resize(fn->allBBlocks.getSize());
		385
		386	// cull unneeded barriers (should do that earlier, but for simplicity)
		387	IteratorRef bi = fn->cfg.iteratorCFG();
		388	// first calculate min/max outstanding TEXes for each BB
		389	for (bi->reset(); !bi->end(); bi->next()) {
		390	Graph::Node *n = reinterpret_cast(bi->get());
		391	BasicBlock *bb = BasicBlock::get(n);
		392	int min = 0;
		393	int max = std::numeric_limits::max();
		394	for (Instruction *i = bb->getFirst(); i; i = i->next) {
		395	if (isTextureOp(i->op)) {
		396	min++;
		397	if (max < std::numeric_limits::max())
		398	max++;
		399	} else
		400	if (i->op == OP_TEXBAR) {
		401	min = MIN2(min, i->subOp);
		402	max = MIN2(max, i->subOp);
		403	}
		404	}
		405	// limits when looking at an isolated block
		406	limitS[bb->getId()].min = min;
		407	limitS[bb->getId()].max = max;
		408	}
		409	// propagate the min/max values
		410	for (unsigned int l = 0; l <= fn->loopNestingBound; ++l) {
		411	for (bi->reset(); !bi->end(); bi->next()) {
		412	Graph::Node *n = reinterpret_cast(bi->get());
		413	BasicBlock *bb = BasicBlock::get(n);
		414	const int bbId = bb->getId();
		415	for (Graph::EdgeIterator ei = n->incident(); !ei.end(); ei.next()) {
		416	BasicBlock *in = BasicBlock::get(ei.getNode());
		417	const int inId = in->getId();
		418	limitT[bbId].min = MAX2(limitT[bbId].min, limitB[inId].min);
		419	limitT[bbId].max = MAX2(limitT[bbId].max, limitB[inId].max);
		420	}
		421	// I just hope this is correct ...
		422	if (limitS[bbId].max == std::numeric_limits::max()) {
		423	// no barrier
		424	limitB[bbId].min = limitT[bbId].min + limitS[bbId].min;
		425	limitB[bbId].max = limitT[bbId].max + limitS[bbId].min;
		426	} else {
		427	// block contained a barrier
		428	limitB[bbId].min = MIN2(limitS[bbId].max,
		429	limitT[bbId].min + limitS[bbId].min);
		430	limitB[bbId].max = MIN2(limitS[bbId].max,
		431	limitT[bbId].max + limitS[bbId].min);
		432	}
		433	}
		434	}
		435	// finally delete unnecessary barriers
		436	for (bi->reset(); !bi->end(); bi->next()) {
		437	Graph::Node *n = reinterpret_cast(bi->get());
		438	BasicBlock *bb = BasicBlock::get(n);
		439	Instruction *prev = NULL;
		440	Instruction *next;
		441	int max = limitT[bb->getId()].max;
		442	for (Instruction *i = bb->getFirst(); i; i = next) {
		443	next = i->next;
		444	if (i->op == OP_TEXBAR) {
		445	if (i->subOp >= max) {
		446	delete_Instruction(prog, i);
		447	i = NULL;
		448	} else {
		449	max = i->subOp;
		450	if (prev && prev->op == OP_TEXBAR && prev->subOp >= max) {
		451	delete_Instruction(prog, prev);
		452	prev = NULL;
		453	}
		454	}
		455	} else
		456	if (isTextureOp(i->op)) {
		457	max++;
		458	}
		459	if (i && !i->isNop())
		460	prev = i;
		461	}
		462	}
		463	return true;
		464	}
		465
		466	bool
		467	NVC0LegalizePostRA::visit(Function *fn)
		468	{
		469	if (needTexBar)
		470	insertTextureBarriers(fn);
		471
		472	rZero = new_LValue(fn, FILE_GPR);
		473	carry = new_LValue(fn, FILE_FLAGS);
		474
		475	rZero->reg.data.id = prog->getTarget()->getFileSize(FILE_GPR);
		476	carry->reg.data.id = 0;
		477
		478	return true;
		479	}
		480
		481	void
		482	NVC0LegalizePostRA::replaceZero(Instruction *i)
		483	{
		484	for (int s = 0; i->srcExists(s); ++s) {
		485	if (s == 2 && i->op == OP_SUCLAMP)
		486	continue;
		487	ImmediateValue *imm = i->getSrc(s)->asImm();
		488	if (imm && imm->reg.data.u64 == 0)
		489	i->setSrc(s, rZero);
		490	}
		491	}
		492
		493	// replace CONT with BRA for single unconditional continue
		494	bool
		495	NVC0LegalizePostRA::tryReplaceContWithBra(BasicBlock *bb)
		496	{
		497	if (bb->cfg.incidentCount() != 2 \|\| bb->getEntry()->op != OP_PRECONT)
		498	return false;
		499	Graph::EdgeIterator ei = bb->cfg.incident();
		500	if (ei.getType() != Graph::Edge::BACK)
		501	ei.next();
		502	if (ei.getType() != Graph::Edge::BACK)
		503	return false;
		504	BasicBlock *contBB = BasicBlock::get(ei.getNode());
		505
		506	if (!contBB->getExit() \|\| contBB->getExit()->op != OP_CONT \|\|
		507	contBB->getExit()->getPredicate())
		508	return false;
		509	contBB->getExit()->op = OP_BRA;
		510	bb->remove(bb->getEntry()); // delete PRECONT
		511
		512	ei.next();
		513	assert(ei.end() \|\| ei.getType() != Graph::Edge::BACK);
		514	return true;
		515	}
		516
		517	// replace branches to join blocks with join ops
		518	void
		519	NVC0LegalizePostRA::propagateJoin(BasicBlock *bb)
		520	{
		521	if (bb->getEntry()->op != OP_JOIN \|\| bb->getEntry()->asFlow()->limit)
		522	return;
		523	for (Graph::EdgeIterator ei = bb->cfg.incident(); !ei.end(); ei.next()) {
		524	BasicBlock *in = BasicBlock::get(ei.getNode());
		525	Instruction *exit = in->getExit();
		526	if (!exit) {
		527	in->insertTail(new FlowInstruction(func, OP_JOIN, bb));
		528	// there should always be a terminator instruction
		529	WARN("inserted missing terminator in BB:%i\n", in->getId());
		530	} else
		531	if (exit->op == OP_BRA) {
		532	exit->op = OP_JOIN;
		533	exit->asFlow()->limit = 1; // must-not-propagate marker
		534	}
		535	}
		536	bb->remove(bb->getEntry());
		537	}
		538
		539	bool
		540	NVC0LegalizePostRA::visit(BasicBlock *bb)
		541	{
		542	Instruction i, next;
		543
		544	// remove pseudo operations and non-fixed no-ops, split 64 bit operations
		545	for (i = bb->getFirst(); i; i = next) {
		546	next = i->next;
		547	if (i->op == OP_EMIT \|\| i->op == OP_RESTART) {
		548	if (!i->getDef(0)->refCount())
		549	i->setDef(0, NULL);
		550	if (i->src(0).getFile() == FILE_IMMEDIATE)
		551	i->setSrc(0, rZero); // initial value must be 0
		552	replaceZero(i);
		553	} else
		554	if (i->isNop()) {
		555	bb->remove(i);
		556	} else {
		557	// TODO: Move this to before register allocation for operations that
		558	// need the $c register !
		559	if (typeSizeof(i->dType) == 8) {
		560	Instruction *hi;
		561	hi = BuildUtil::split64BitOpPostRA(func, i, rZero, carry);
		562	if (hi)
		563	next = hi;
		564	}
		565
		566	if (i->op != OP_MOV && i->op != OP_PFETCH)
		567	replaceZero(i);
		568	}
		569	}
		570	if (!bb->getEntry())
		571	return true;
		572
		573	if (!tryReplaceContWithBra(bb))
		574	propagateJoin(bb);
		575
		576	return true;
		577	}
		578
		579	NVC0LoweringPass::NVC0LoweringPass(Program *prog) : targ(prog->getTarget())
		580	{
		581	bld.setProgram(prog);
		582	gMemBase = NULL;
		583	}
		584
		585	bool
		586	NVC0LoweringPass::visit(Function *fn)
		587	{
		588	if (prog->getType() == Program::TYPE_GEOMETRY) {
		589	assert(!strncmp(fn->getName(), "MAIN", 4));
		590	// TODO: when we generate actual functions pass this value along somehow
		591	bld.setPosition(BasicBlock::get(fn->cfg.getRoot()), false);
		592	gpEmitAddress = bld.loadImm(NULL, 0)->asLValue();
		593	if (fn->cfgExit) {
		594	bld.setPosition(BasicBlock::get(fn->cfgExit)->getExit(), false);
		595	bld.mkMovToReg(0, gpEmitAddress);
		596	}
		597	}
		598	return true;
		599	}
		600
		601	bool
		602	NVC0LoweringPass::visit(BasicBlock *bb)
		603	{
		604	return true;
		605	}
		606
		607	inline Value *
		608	NVC0LoweringPass::loadTexHandle(Value *ptr, unsigned int slot)
		609	{
		610	uint8_t b = prog->driver->io.resInfoCBSlot;
		611	uint32_t off = prog->driver->io.texBindBase + slot * 4;
		612	return bld.
		613	mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
		614	}
		615
		616	// move array source to first slot, convert to u16, add indirections
		617	bool
		618	NVC0LoweringPass::handleTEX(TexInstruction *i)
		619	{
		620	const int dim = i->tex.target.getDim() + i->tex.target.isCube();
		621	const int arg = i->tex.target.getArgCount();
		622	const int lyr = arg - (i->tex.target.isMS() ? 2 : 1);
		623	const int chipset = prog->getTarget()->getChipset();
		624
		625	// Arguments to the TEX instruction are a little insane. Even though the
		626	// encoding is identical between SM20 and SM30, the arguments mean
		627	// different things between Fermi and Kepler+. A lot of arguments are
		628	// optional based on flags passed to the instruction. This summarizes the
		629	// order of things.
		630	//
		631	// Fermi:
		632	// array/indirect
		633	// coords
		634	// sample
		635	// lod bias
		636	// depth compare
		637	// offsets:
		638	// - tg4: 8 bits each, either 2 (1 offset reg) or 8 (2 offset reg)
		639	// - other: 4 bits each, single reg
		640	//
		641	// Kepler+:
		642	// indirect handle
		643	// array (+ offsets for txd in upper 16 bits)
		644	// coords
		645	// sample
		646	// lod bias
		647	// depth compare
		648	// offsets (same as fermi, except txd which takes it with array)
		649	//
		650	// Maxwell (tex):
		651	// array
		652	// coords
		653	// indirect handle
		654	// sample
		655	// lod bias
		656	// depth compare
		657	// offsets
		658	//
		659	// Maxwell (txd):
		660	// indirect handle
		661	// coords
		662	// array + offsets
		663	// derivatives
		664
		665	if (chipset >= NVISA_GK104_CHIPSET) {
		666	if (i->tex.rIndirectSrc >= 0 \|\| i->tex.sIndirectSrc >= 0) {
		667	// XXX this ignores tsc, and assumes a 1:1 mapping
		668	assert(i->tex.rIndirectSrc >= 0);
		669	Value *hnd = loadTexHandle(
		670	bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
		671	i->getIndirectR(), bld.mkImm(2)),
		672	i->tex.r);
		673	i->tex.r = 0xff;
		674	i->tex.s = 0x1f;
		675	i->setIndirectR(hnd);
		676	i->setIndirectS(NULL);
		677	} else if (i->tex.r == i->tex.s) {
		678	i->tex.r += prog->driver->io.texBindBase / 4;
		679	i->tex.s = 0; // only a single cX[] value possible here
		680	} else {
		681	Value *hnd = bld.getScratch();
		682	Value *rHnd = loadTexHandle(NULL, i->tex.r);
		683	Value *sHnd = loadTexHandle(NULL, i->tex.s);
		684
		685	bld.mkOp3(OP_INSBF, TYPE_U32, hnd, rHnd, bld.mkImm(0x1400), sHnd);
		686
		687	i->tex.r = 0; // not used for indirect tex
		688	i->tex.s = 0;
		689	i->setIndirectR(hnd);
		690	}
		691	if (i->tex.target.isArray()) {
		692	LValue *layer = new_LValue(func, FILE_GPR);
		693	Value *src = i->getSrc(lyr);
		694	const int sat = (i->op == OP_TXF) ? 1 : 0;
		695	DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
		696	bld.mkCvt(OP_CVT, TYPE_U16, layer, sTy, src)->saturate = sat;
		697	if (i->op != OP_TXD \|\| chipset < NVISA_GM107_CHIPSET) {
		698	for (int s = dim; s >= 1; --s)
		699	i->setSrc(s, i->getSrc(s - 1));
		700	i->setSrc(0, layer);
		701	} else {
		702	i->setSrc(dim, layer);
		703	}
		704	}
		705	// Move the indirect reference to the first place
		706	if (i->tex.rIndirectSrc >= 0 && (
		707	i->op == OP_TXD \|\| chipset < NVISA_GM107_CHIPSET)) {
		708	Value *hnd = i->getIndirectR();
		709
		710	i->setIndirectR(NULL);
		711	i->moveSources(0, 1);
		712	i->setSrc(0, hnd);
		713	i->tex.rIndirectSrc = 0;
		714	i->tex.sIndirectSrc = -1;
		715	}
		716	} else
		717	// (nvc0) generate and move the tsc/tic/array source to the front
		718	if (i->tex.target.isArray() \|\| i->tex.rIndirectSrc >= 0 \|\| i->tex.sIndirectSrc >= 0) {
		719	LValue *src = new_LValue(func, FILE_GPR); // 0xttxsaaaa
		720
		721	Value *ticRel = i->getIndirectR();
		722	Value *tscRel = i->getIndirectS();
		723
		724	if (ticRel) {
		725	i->setSrc(i->tex.rIndirectSrc, NULL);
		726	if (i->tex.r)
		727	ticRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
		728	ticRel, bld.mkImm(i->tex.r));
		729	}
		730	if (tscRel) {
		731	i->setSrc(i->tex.sIndirectSrc, NULL);
		732	if (i->tex.s)
		733	tscRel = bld.mkOp2v(OP_ADD, TYPE_U32, bld.getScratch(),
		734	tscRel, bld.mkImm(i->tex.s));
		735	}
		736
		737	Value *arrayIndex = i->tex.target.isArray() ? i->getSrc(lyr) : NULL;
		738	for (int s = dim; s >= 1; --s)
		739	i->setSrc(s, i->getSrc(s - 1));
		740	i->setSrc(0, arrayIndex);
		741
		742	if (arrayIndex) {
		743	int sat = (i->op == OP_TXF) ? 1 : 0;
		744	DataType sTy = (i->op == OP_TXF) ? TYPE_U32 : TYPE_F32;
		745	bld.mkCvt(OP_CVT, TYPE_U16, src, sTy, arrayIndex)->saturate = sat;
		746	} else {
		747	bld.loadImm(src, 0);
		748	}
		749
		750	if (ticRel)
		751	bld.mkOp3(OP_INSBF, TYPE_U32, src, ticRel, bld.mkImm(0x0917), src);
		752	if (tscRel)
		753	bld.mkOp3(OP_INSBF, TYPE_U32, src, tscRel, bld.mkImm(0x0710), src);
		754
		755	i->setSrc(0, src);
		756	}
		757
		758	// For nvc0, the sample id has to be in the second operand, as the offset
		759	// does. Right now we don't know how to pass both in, and this case can't
		760	// happen with OpenGL. On nve0, the sample id is part of the texture
		761	// coordinate argument.
		762	assert(chipset >= NVISA_GK104_CHIPSET \|\|
		763	!i->tex.useOffsets \|\| !i->tex.target.isMS());
		764
		765	// offset is between lod and dc
		766	if (i->tex.useOffsets) {
		767	int n, c;
		768	int s = i->srcCount(0xff, true);
		769	if (i->op != OP_TXD \|\| chipset < NVISA_GK104_CHIPSET) {
		770	if (i->tex.target.isShadow())
		771	s--;
		772	if (i->srcExists(s)) // move potential predicate out of the way
		773	i->moveSources(s, 1);
		774	if (i->tex.useOffsets == 4 && i->srcExists(s + 1))
		775	i->moveSources(s + 1, 1);
		776	}
		777	if (i->op == OP_TXG) {
		778	// Either there is 1 offset, which goes into the 2 low bytes of the
		779	// first source, or there are 4 offsets, which go into 2 sources (8
		780	// values, 1 byte each).
		781	Value *offs[2] = {NULL, NULL};
		782	for (n = 0; n < i->tex.useOffsets; n++) {
		783	for (c = 0; c < 2; ++c) {
		784	if ((n % 2) == 0 && c == 0)
		785	offs[n / 2] = i->offset[n][c].get();
		786	else
		787	bld.mkOp3(OP_INSBF, TYPE_U32,
		788	offs[n / 2],
		789	i->offset[n][c].get(),
		790	bld.mkImm(0x800 \| ((n * 16 + c * 8) % 32)),
		791	offs[n / 2]);
		792	}
		793	}
		794	i->setSrc(s, offs[0]);
		795	if (offs[1])
		796	i->setSrc(s + 1, offs[1]);
		797	} else {
		798	unsigned imm = 0;
		799	assert(i->tex.useOffsets == 1);
		800	for (c = 0; c < 3; ++c) {
		801	ImmediateValue val;
		802	if (!i->offset[0][c].getImmediate(val))
		803	assert(!"non-immediate offset passed to non-TXG");
		804	imm \|= (val.reg.data.u32 & 0xf) << (c * 4);
		805	}
		806	if (i->op == OP_TXD && chipset >= NVISA_GK104_CHIPSET) {
		807	// The offset goes into the upper 16 bits of the array index. So
		808	// create it if it's not already there, and INSBF it if it already
		809	// is.
		810	s = (i->tex.rIndirectSrc >= 0) ? 1 : 0;
		811	if (chipset >= NVISA_GM107_CHIPSET)
		812	s += dim;
		813	if (i->tex.target.isArray()) {
		814	bld.mkOp3(OP_INSBF, TYPE_U32, i->getSrc(s),
		815	bld.loadImm(NULL, imm), bld.mkImm(0xc10),
		816	i->getSrc(s));
		817	} else {
		818	i->moveSources(s, 1);
		819	i->setSrc(s, bld.loadImm(NULL, imm << 16));
		820	}
		821	} else {
		822	i->setSrc(s, bld.loadImm(NULL, imm));
		823	}
		824	}
		825	}
		826
		827	if (chipset >= NVISA_GK104_CHIPSET) {
		828	//
		829	// If TEX requires more than 4 sources, the 2nd register tuple must be
		830	// aligned to 4, even if it consists of just a single 4-byte register.
		831	//
		832	// XXX HACK: We insert 0 sources to avoid the 5 or 6 regs case.
		833	//
		834	int s = i->srcCount(0xff, true);
		835	if (s > 4 && s < 7) {
		836	if (i->srcExists(s)) // move potential predicate out of the way
		837	i->moveSources(s, 7 - s);
		838	while (s < 7)
		839	i->setSrc(s++, bld.loadImm(NULL, 0));
		840	}
		841	}
		842
		843	return true;
		844	}
		845
		846	bool
		847	NVC0LoweringPass::handleManualTXD(TexInstruction *i)
		848	{
		849	static const uint8_t qOps[4][2] =
		850	{
		851	{ QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(MOV2, MOV2, ADD, ADD) }, // l0
		852	{ QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(MOV2, MOV2, ADD, ADD) }, // l1
		853	{ QUADOP(MOV2, ADD, MOV2, ADD), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l2
		854	{ QUADOP(SUBR, MOV2, SUBR, MOV2), QUADOP(SUBR, SUBR, MOV2, MOV2) }, // l3
		855	};
		856	Value *def[4][4];
		857	Value *crd[3];
		858	Instruction *tex;
		859	Value *zero = bld.loadImm(bld.getSSA(), 0);
		860	int l, c;
		861	const int dim = i->tex.target.getDim();
		862	const int array = i->tex.target.isArray();
		863
		864	i->op = OP_TEX; // no need to clone dPdx/dPdy later
		865
		866	for (c = 0; c < dim; ++c)
		867	crd[c] = bld.getScratch();
		868
		869	bld.mkOp(OP_QUADON, TYPE_NONE, NULL);
		870	for (l = 0; l < 4; ++l) {
		871	// mov coordinates from lane l to all lanes
		872	for (c = 0; c < dim; ++c)
		873	bld.mkQuadop(0x00, crd[c], l, i->getSrc(c + array), zero);
		874	// add dPdx from lane l to lanes dx
		875	for (c = 0; c < dim; ++c)
		876	bld.mkQuadop(qOps[l][0], crd[c], l, i->dPdx[c].get(), crd[c]);
		877	// add dPdy from lane l to lanes dy
		878	for (c = 0; c < dim; ++c)
		879	bld.mkQuadop(qOps[l][1], crd[c], l, i->dPdy[c].get(), crd[c]);
		880	// texture
		881	bld.insert(tex = cloneForward(func, i));
		882	for (c = 0; c < dim; ++c)
		883	tex->setSrc(c + array, crd[c]);
		884	// save results
		885	for (c = 0; i->defExists(c); ++c) {
		886	Instruction *mov;
		887	def[c][l] = bld.getSSA();
		888	mov = bld.mkMov(def[c][l], tex->getDef(c));
		889	mov->fixed = 1;
		890	mov->lanes = 1 << l;
		891	}
		892	}
		893	bld.mkOp(OP_QUADPOP, TYPE_NONE, NULL);
		894
		895	for (c = 0; i->defExists(c); ++c) {
		896	Instruction *u = bld.mkOp(OP_UNION, TYPE_U32, i->getDef(c));
		897	for (l = 0; l < 4; ++l)
		898	u->setSrc(l, def[c][l]);
		899	}
		900
		901	i->bb->remove(i);
		902	return true;
		903	}
		904
		905	bool
		906	NVC0LoweringPass::handleTXD(TexInstruction *txd)
		907	{
		908	int dim = txd->tex.target.getDim();
		909	unsigned arg = txd->tex.target.getArgCount();
		910	unsigned expected_args = arg;
		911	const int chipset = prog->getTarget()->getChipset();
		912
		913	if (chipset >= NVISA_GK104_CHIPSET) {
		914	if (!txd->tex.target.isArray() && txd->tex.useOffsets)
		915	expected_args++;
		916	if (txd->tex.rIndirectSrc >= 0 \|\| txd->tex.sIndirectSrc >= 0)
		917	expected_args++;
		918	} else {
		919	if (txd->tex.useOffsets)
		920	expected_args++;
		921	if (!txd->tex.target.isArray() && (
		922	txd->tex.rIndirectSrc >= 0 \|\| txd->tex.sIndirectSrc >= 0))
		923	expected_args++;
		924	}
		925
		926	if (expected_args > 4 \|\|
		927	dim > 2 \|\|
		928	txd->tex.target.isShadow() \|\|
		929	txd->tex.target.isCube())
		930	txd->op = OP_TEX;
		931
		932	handleTEX(txd);
		933	while (txd->srcExists(arg))
		934	++arg;
		935
		936	txd->tex.derivAll = true;
		937	if (txd->op == OP_TEX)
		938	return handleManualTXD(txd);
		939
		940	assert(arg == expected_args);
		941	for (int c = 0; c < dim; ++c) {
		942	txd->setSrc(arg + c * 2 + 0, txd->dPdx[c]);
		943	txd->setSrc(arg + c * 2 + 1, txd->dPdy[c]);
		944	txd->dPdx[c].set(NULL);
		945	txd->dPdy[c].set(NULL);
		946	}
		947	return true;
		948	}
		949
		950	bool
		951	NVC0LoweringPass::handleTXQ(TexInstruction *txq)
		952	{
		953	// TODO: indirect resource/sampler index
		954	return true;
		955	}
		956
		957	bool
		958	NVC0LoweringPass::handleTXLQ(TexInstruction *i)
		959	{
		960	/* The outputs are inverted compared to what the TGSI instruction
		961	* expects. Take that into account in the mask.
		962	*/
		963	assert((i->tex.mask & ~3) == 0);
		964	if (i->tex.mask == 1)
		965	i->tex.mask = 2;
		966	else if (i->tex.mask == 2)
		967	i->tex.mask = 1;
		968	handleTEX(i);
		969	bld.setPosition(i, true);
		970
		971	/* The returned values are not quite what we want:
		972	* (a) convert from s16/u16 to f32
		973	* (b) multiply by 1/256
		974	*/
		975	for (int def = 0; def < 2; ++def) {
		976	if (!i->defExists(def))
		977	continue;
		978	enum DataType type = TYPE_S16;
		979	if (i->tex.mask == 2 \|\| def > 0)
		980	type = TYPE_U16;
		981	bld.mkCvt(OP_CVT, TYPE_F32, i->getDef(def), type, i->getDef(def));
		982	bld.mkOp2(OP_MUL, TYPE_F32, i->getDef(def),
		983	i->getDef(def), bld.loadImm(NULL, 1.0f / 256));
		984	}
		985	if (i->tex.mask == 3) {
		986	LValue *t = new_LValue(func, FILE_GPR);
		987	bld.mkMov(t, i->getDef(0));
		988	bld.mkMov(i->getDef(0), i->getDef(1));
		989	bld.mkMov(i->getDef(1), t);
		990	}
		991	return true;
		992	}
		993
		994
		995	bool
		996	NVC0LoweringPass::handleATOM(Instruction *atom)
		997	{
		998	SVSemantic sv;
		999
		1000	switch (atom->src(0).getFile()) {
		1001	case FILE_MEMORY_LOCAL:
		1002	sv = SV_LBASE;
		1003	break;
		1004	case FILE_MEMORY_SHARED:
		1005	sv = SV_SBASE;
		1006	break;
		1007	default:
		1008	assert(atom->src(0).getFile() == FILE_MEMORY_GLOBAL);
		1009	return true;
		1010	}
		1011	Value *base =
		1012	bld.mkOp1v(OP_RDSV, TYPE_U32, bld.getScratch(), bld.mkSysVal(sv, 0));
		1013	Value *ptr = atom->getIndirect(0, 0);
		1014
		1015	atom->setSrc(0, cloneShallow(func, atom->getSrc(0)));
		1016	atom->getSrc(0)->reg.file = FILE_MEMORY_GLOBAL;
		1017	if (ptr)
		1018	base = bld.mkOp2v(OP_ADD, TYPE_U32, base, base, ptr);
		1019	atom->setIndirect(0, 0, base);
		1020
		1021	return true;
		1022	}
		1023
		1024	bool
		1025	NVC0LoweringPass::handleCasExch(Instruction *cas, bool needCctl)
		1026	{
		1027	if (cas->subOp != NV50_IR_SUBOP_ATOM_CAS &&
		1028	cas->subOp != NV50_IR_SUBOP_ATOM_EXCH)
		1029	return false;
		1030	bld.setPosition(cas, true);
		1031
		1032	if (needCctl) {
		1033	Instruction *cctl = bld.mkOp1(OP_CCTL, TYPE_NONE, NULL, cas->getSrc(0));
		1034	cctl->setIndirect(0, 0, cas->getIndirect(0, 0));
		1035	cctl->fixed = 1;
		1036	cctl->subOp = NV50_IR_SUBOP_CCTL_IV;
		1037	if (cas->isPredicated())
		1038	cctl->setPredicate(cas->cc, cas->getPredicate());
		1039	}
		1040
		1041	if (cas->defExists(0) && cas->subOp == NV50_IR_SUBOP_ATOM_CAS) {
		1042	// CAS is crazy. It's 2nd source is a double reg, and the 3rd source
		1043	// should be set to the high part of the double reg or bad things will
		1044	// happen elsewhere in the universe.
		1045	// Also, it sometimes returns the new value instead of the old one
		1046	// under mysterious circumstances.
		1047	Value *dreg = bld.getSSA(8);
		1048	bld.setPosition(cas, false);
		1049	bld.mkOp2(OP_MERGE, TYPE_U64, dreg, cas->getSrc(1), cas->getSrc(2));
		1050	cas->setSrc(1, dreg);
		1051	}
		1052
		1053	return true;
		1054	}
		1055
		1056	inline Value *
		1057	NVC0LoweringPass::loadResInfo32(Value *ptr, uint32_t off)
		1058	{
		1059	uint8_t b = prog->driver->io.resInfoCBSlot;
		1060	off += prog->driver->io.suInfoBase;
		1061	return bld.
		1062	mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
		1063	}
		1064
		1065	inline Value *
		1066	NVC0LoweringPass::loadMsInfo32(Value *ptr, uint32_t off)
		1067	{
		1068	uint8_t b = prog->driver->io.msInfoCBSlot;
		1069	off += prog->driver->io.msInfoBase;
		1070	return bld.
		1071	mkLoadv(TYPE_U32, bld.mkSymbol(FILE_MEMORY_CONST, b, TYPE_U32, off), ptr);
		1072	}
		1073
		1074	/* On nvc0, surface info is obtained via the surface binding points passed
		1075	* to the SULD/SUST instructions.
		1076	* On nve4, surface info is stored in c[] and is used by various special
		1077	* instructions, e.g. for clamping coordinates or generating an address.
		1078	* They could just have added an equivalent to TIC, couldn't they ?
		1079	*/
			// Byte offsets of the individual 32-bit fields inside one surface's info
			// record. The lowering code adds them to
			// (surface index * NVE4_SU_INFO__STRIDE) before loading.
		1080	#define NVE4_SU_INFO_ADDR 0x00
		1081	#define NVE4_SU_INFO_FMT 0x04
		1082	#define NVE4_SU_INFO_DIM_X 0x08
		1083	#define NVE4_SU_INFO_PITCH 0x0c
		1084	#define NVE4_SU_INFO_DIM_Y 0x10
		1085	#define NVE4_SU_INFO_ARRAY 0x14
		1086	#define NVE4_SU_INFO_DIM_Z 0x18
		1087	#define NVE4_SU_INFO_UNK1C 0x1c
		1088	#define NVE4_SU_INFO_WIDTH 0x20
		1089	#define NVE4_SU_INFO_HEIGHT 0x24
		1090	#define NVE4_SU_INFO_DEPTH 0x28
		1091	#define NVE4_SU_INFO_TARGET 0x2c
		1092	#define NVE4_SU_INFO_CALL 0x30
		1093	#define NVE4_SU_INFO_RAW_X 0x34
		1094	#define NVE4_SU_INFO_MS_X 0x38
		1095	#define NVE4_SU_INFO_MS_Y 0x3c
		1096
			// Size in bytes of one surface's info record.
		1097	#define NVE4_SU_INFO__STRIDE 0x40
		1098
			// Indexed accessors for the per-axis fields above:
			//   DIM(0..2)  -> DIM_X, DIM_Y, DIM_Z   (0x08, 0x10, 0x18)
			//   SIZE(0..2) -> WIDTH, HEIGHT, DEPTH  (0x20, 0x24, 0x28)
			//   MS(0..1)   -> MS_X, MS_Y            (0x38, 0x3c)
		1099	#define NVE4_SU_INFO_DIM(i) (0x08 + (i) * 8)
		1100	#define NVE4_SU_INFO_SIZE(i) (0x20 + (i) * 4)
		1101	#define NVE4_SU_INFO_MS(i) (0x38 + (i) * 4)
		1102
		1103	static inline uint16_t getSuClampSubOp(const TexInstruction *su, int c)
		1104	{
		1105	switch (su->tex.target.getEnum()) {
		1106	case TEX_TARGET_BUFFER: return NV50_IR_SUBOP_SUCLAMP_PL(0, 1);
		1107	case TEX_TARGET_RECT: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
		1108	case TEX_TARGET_1D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
		1109	case TEX_TARGET_1D_ARRAY: return (c == 1) ?
		1110	NV50_IR_SUBOP_SUCLAMP_PL(0, 2) :
		1111	NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
		1112	case TEX_TARGET_2D: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
		1113	case TEX_TARGET_2D_MS: return NV50_IR_SUBOP_SUCLAMP_BL(0, 2);
		1114	case TEX_TARGET_2D_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
		1115	case TEX_TARGET_2D_MS_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
		1116	case TEX_TARGET_3D: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
		1117	case TEX_TARGET_CUBE: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
		1118	case TEX_TARGET_CUBE_ARRAY: return NV50_IR_SUBOP_SUCLAMP_SD(0, 2);
		1119	default:
		1120	assert(0);
		1121	return 0;
		1122	}
		1123	}
		1124
		1125	void
		1126	NVC0LoweringPass::adjustCoordinatesMS(TexInstruction *tex)
		1127	{
		1128	const uint16_t base = tex->tex.r * NVE4_SU_INFO__STRIDE;
		1129	const int arg = tex->tex.target.getArgCount();
		1130
		1131	if (tex->tex.target == TEX_TARGET_2D_MS)
		1132	tex->tex.target = TEX_TARGET_2D;
		1133	else
		1134	if (tex->tex.target == TEX_TARGET_2D_MS_ARRAY)
		1135	tex->tex.target = TEX_TARGET_2D_ARRAY;
		1136	else
		1137	return;
		1138
		1139	Value *x = tex->getSrc(0);
		1140	Value *y = tex->getSrc(1);
		1141	Value *s = tex->getSrc(arg - 1);
		1142
		1143	Value tx = bld.getSSA(), ty = bld.getSSA(), *ts = bld.getSSA();
		1144
		1145	Value *ms_x = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(0));
		1146	Value *ms_y = loadResInfo32(NULL, base + NVE4_SU_INFO_MS(1));
		1147
		1148	bld.mkOp2(OP_SHL, TYPE_U32, tx, x, ms_x);
		1149	bld.mkOp2(OP_SHL, TYPE_U32, ty, y, ms_y);
		1150
		1151	s = bld.mkOp2v(OP_AND, TYPE_U32, ts, s, bld.loadImm(NULL, 0x7));
		1152	s = bld.mkOp2v(OP_SHL, TYPE_U32, ts, ts, bld.mkImm(3));
		1153
		1154	Value *dx = loadMsInfo32(ts, 0x0);
		1155	Value *dy = loadMsInfo32(ts, 0x4);
		1156
		1157	bld.mkOp2(OP_ADD, TYPE_U32, tx, tx, dx);
		1158	bld.mkOp2(OP_ADD, TYPE_U32, ty, ty, dy);
		1159
		1160	tex->setSrc(0, tx);
		1161	tex->setSrc(1, ty);
		1162	tex->moveSources(arg, -1);
		1163	}
		1164
		1165	// Sets 64-bit "generic address", predicate and format sources for SULD/SUST.
		1166	// They're computed from the coordinates using the surface info in c[] space.
		1167	void
		1168	NVC0LoweringPass::processSurfaceCoordsNVE4(TexInstruction *su)
		1169	{
		1170	Instruction *insn;
		1171	const bool atom = su->op == OP_SUREDB \|\| su->op == OP_SUREDP;
		1172	const bool raw =
		1173	su->op == OP_SULDB \|\| su->op == OP_SUSTB \|\| su->op == OP_SUREDB;
		1174	const int idx = su->tex.r;
		1175	const int dim = su->tex.target.getDim();
		1176	const int arg = dim + (su->tex.target.isArray() ? 1 : 0);
		1177	const uint16_t base = idx * NVE4_SU_INFO__STRIDE;
		1178	int c;
		1179	Value *zero = bld.mkImm(0);
		1180	Value *p1 = NULL;
		1181	Value *v;
		1182	Value *src[3];
		1183	Value bf, eau, *off;
		1184	Value addr, pred;
		1185
		1186	off = bld.getScratch(4);
		1187	bf = bld.getScratch(4);
		1188	addr = bld.getSSA(8);
		1189	pred = bld.getScratch(1, FILE_PREDICATE);
		1190
		1191	bld.setPosition(su, false);
		1192
		1193	adjustCoordinatesMS(su);
		1194
		1195	// calculate clamped coordinates
		1196	for (c = 0; c < arg; ++c) {
		1197	src[c] = bld.getScratch();
		1198	if (c == 0 && raw)
		1199	v = loadResInfo32(NULL, base + NVE4_SU_INFO_RAW_X);
		1200	else
		1201	v = loadResInfo32(NULL, base + NVE4_SU_INFO_DIM(c));
		1202	bld.mkOp3(OP_SUCLAMP, TYPE_S32, src[c], su->getSrc(c), v, zero)
		1203	->subOp = getSuClampSubOp(su, c);
		1204	}
		1205	for (; c < 3; ++c)
		1206	src[c] = zero;
		1207
		1208	// set predicate output
		1209	if (su->tex.target == TEX_TARGET_BUFFER) {
		1210	src[0]->getInsn()->setFlagsDef(1, pred);
		1211	} else
		1212	if (su->tex.target.isArray()) {
		1213	p1 = bld.getSSA(1, FILE_PREDICATE);
		1214	src[dim]->getInsn()->setFlagsDef(1, p1);
		1215	}
		1216
		1217	// calculate pixel offset
		1218	if (dim == 1) {
		1219	if (su->tex.target != TEX_TARGET_BUFFER)
		1220	bld.mkOp2(OP_AND, TYPE_U32, off, src[0], bld.loadImm(NULL, 0xffff));
		1221	} else
		1222	if (dim == 3) {
		1223	v = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
		1224	bld.mkOp3(OP_MADSP, TYPE_U32, off, src[2], v, src[1])
		1225	->subOp = NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
		1226
		1227	v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
		1228	bld.mkOp3(OP_MADSP, TYPE_U32, off, off, v, src[0])
		1229	->subOp = NV50_IR_SUBOP_MADSP(0,2,8); // u32 u16l u16l
		1230	} else {
		1231	assert(dim == 2);
		1232	v = loadResInfo32(NULL, base + NVE4_SU_INFO_PITCH);
		1233	bld.mkOp3(OP_MADSP, TYPE_U32, off, src[1], v, src[0])
		1234	->subOp = su->tex.target.isArray() ?
		1235	NV50_IR_SUBOP_MADSP_SD : NV50_IR_SUBOP_MADSP(4,2,8); // u16l u16l u16l
		1236	}
		1237
		1238	// calculate effective address part 1
		1239	if (su->tex.target == TEX_TARGET_BUFFER) {
		1240	if (raw) {
		1241	bf = src[0];
		1242	} else {
		1243	v = loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
		1244	bld.mkOp3(OP_VSHL, TYPE_U32, bf, src[0], v, zero)
		1245	->subOp = NV50_IR_SUBOP_V1(7,6,8\|2);
		1246	}
		1247	} else {
		1248	Value *y = src[1];
		1249	Value *z = src[2];
		1250	uint16_t subOp = 0;
		1251
		1252	switch (dim) {
		1253	case 1:
		1254	y = zero;
		1255	z = zero;
		1256	break;
		1257	case 2:
		1258	z = off;
		1259	if (!su->tex.target.isArray()) {
		1260	z = loadResInfo32(NULL, base + NVE4_SU_INFO_UNK1C);
		1261	subOp = NV50_IR_SUBOP_SUBFM_3D;
		1262	}
		1263	break;
		1264	default:
		1265	subOp = NV50_IR_SUBOP_SUBFM_3D;
		1266	assert(dim == 3);
		1267	break;
		1268	}
		1269	insn = bld.mkOp3(OP_SUBFM, TYPE_U32, bf, src[0], y, z);
		1270	insn->subOp = subOp;
		1271	insn->setFlagsDef(1, pred);
		1272	}
		1273
		1274	// part 2
		1275	v = loadResInfo32(NULL, base + NVE4_SU_INFO_ADDR);
		1276
		1277	if (su->tex.target == TEX_TARGET_BUFFER) {
		1278	eau = v;
		1279	} else {
		1280	eau = bld.mkOp3v(OP_SUEAU, TYPE_U32, bld.getScratch(4), off, bf, v);
		1281	}
		1282	// add array layer offset
		1283	if (su->tex.target.isArray()) {
		1284	v = loadResInfo32(NULL, base + NVE4_SU_INFO_ARRAY);
		1285	if (dim == 1)
		1286	bld.mkOp3(OP_MADSP, TYPE_U32, eau, src[1], v, eau)
		1287	->subOp = NV50_IR_SUBOP_MADSP(4,0,0); // u16 u24 u32
		1288	else
		1289	bld.mkOp3(OP_MADSP, TYPE_U32, eau, v, src[2], eau)
		1290	->subOp = NV50_IR_SUBOP_MADSP(0,0,0); // u32 u24 u32
		1291	// combine predicates
		1292	assert(p1);
		1293	bld.mkOp2(OP_OR, TYPE_U8, pred, pred, p1);
		1294	}
		1295
		1296	if (atom) {
		1297	Value *lo = bf;
		1298	if (su->tex.target == TEX_TARGET_BUFFER) {
		1299	lo = zero;
		1300	bld.mkMov(off, bf);
		1301	}
		1302	// bf == g[] address & 0xff
		1303	// eau == g[] address >> 8
		1304	bld.mkOp3(OP_PERMT, TYPE_U32, bf, lo, bld.loadImm(NULL, 0x6540), eau);
		1305	bld.mkOp3(OP_PERMT, TYPE_U32, eau, zero, bld.loadImm(NULL, 0x0007), eau);
		1306	} else
		1307	if (su->op == OP_SULDP && su->tex.target == TEX_TARGET_BUFFER) {
		1308	// Convert from u32 to u8 address format, which is what the library code
		1309	// doing SULDP currently uses.
		1310	// XXX: can SUEAU do this ?
		1311	// XXX: does it matter that we don't mask high bytes in bf ?
		1312	// Grrr.
		1313	bld.mkOp2(OP_SHR, TYPE_U32, off, bf, bld.mkImm(8));
		1314	bld.mkOp2(OP_ADD, TYPE_U32, eau, eau, off);
		1315	}
		1316
		1317	bld.mkOp2(OP_MERGE, TYPE_U64, addr, bf, eau);
		1318
		1319	if (atom && su->tex.target == TEX_TARGET_BUFFER)
		1320	bld.mkOp2(OP_ADD, TYPE_U64, addr, addr, off);
		1321
		1322	// let's just set it 0 for raw access and hope it works
		1323	v = raw ?
		1324	bld.mkImm(0) : loadResInfo32(NULL, base + NVE4_SU_INFO_FMT);
		1325
		1326	// get rid of old coordinate sources, make space for fmt info and predicate
		1327	su->moveSources(arg, 3 - arg);
		1328	// set 64 bit address and 32-bit format sources
		1329	su->setSrc(0, addr);
		1330	su->setSrc(1, v);
		1331	su->setSrc(2, pred);
		1332	}
		1333
		1334	void
		1335	NVC0LoweringPass::handleSurfaceOpNVE4(TexInstruction *su)
		1336	{
		1337	processSurfaceCoordsNVE4(su);
		1338
		1339	// Who do we hate more ? The person who decided that nvc0's SULD doesn't
		1340	// have to support conversion or the person who decided that, in OpenCL,
		1341	// you don't have to specify the format here like you do in OpenGL ?
		1342
		1343	if (su->op == OP_SULDP) {
		1344	// We don't patch shaders. Ever.
		1345	// You get an indirect call to our library blob here.
		1346	// But at least it's uniform.
		1347	FlowInstruction *call;
		1348	LValue *p[3];
		1349	LValue *r[5];
		1350	uint16_t base = su->tex.r * NVE4_SU_INFO__STRIDE + NVE4_SU_INFO_CALL;
		1351
		1352	for (int i = 0; i < 4; ++i)
		1353	(r[i] = bld.getScratch(4, FILE_GPR))->reg.data.id = i;
		1354	for (int i = 0; i < 3; ++i)
		1355	(p[i] = bld.getScratch(1, FILE_PREDICATE))->reg.data.id = i;
		1356	(r[4] = bld.getScratch(8, FILE_GPR))->reg.data.id = 4;
		1357
		1358	bld.mkMov(p[1], bld.mkImm((su->cache == CACHE_CA) ? 1 : 0), TYPE_U8);
		1359	bld.mkMov(p[2], bld.mkImm((su->cache == CACHE_CG) ? 1 : 0), TYPE_U8);
		1360	bld.mkMov(p[0], su->getSrc(2), TYPE_U8);
		1361	bld.mkMov(r[4], su->getSrc(0), TYPE_U64);
		1362	bld.mkMov(r[2], su->getSrc(1), TYPE_U32);
		1363
		1364	call = bld.mkFlow(OP_CALL, NULL, su->cc, su->getPredicate());
		1365
		1366	call->indirect = 1;
		1367	call->absolute = 1;
		1368	call->setSrc(0, bld.mkSymbol(FILE_MEMORY_CONST,
		1369	prog->driver->io.resInfoCBSlot, TYPE_U32,
		1370	prog->driver->io.suInfoBase + base));
		1371	call->setSrc(1, r[2]);
		1372	call->setSrc(2, r[4]);
		1373	for (int i = 0; i < 3; ++i)
		1374	call->setSrc(3 + i, p[i]);
		1375	for (int i = 0; i < 4; ++i) {
		1376	call->setDef(i, r[i]);
		1377	bld.mkMov(su->getDef(i), r[i]);
		1378	}
		1379	call->setDef(4, p[1]);
		1380	delete_Instruction(bld.getProgram(), su);
		1381	}
		1382
		1383	if (su->op == OP_SUREDB \|\| su->op == OP_SUREDP) {
		1384	// FIXME: for out of bounds access, destination value will be undefined !
		1385	Value *pred = su->getSrc(2);
		1386	CondCode cc = CC_NOT_P;
		1387	if (su->getPredicate()) {
		1388	pred = bld.getScratch(1, FILE_PREDICATE);
		1389	cc = su->cc;
		1390	if (cc == CC_NOT_P) {
		1391	bld.mkOp2(OP_OR, TYPE_U8, pred, su->getPredicate(), su->getSrc(2));
		1392	} else {
		1393	bld.mkOp2(OP_AND, TYPE_U8, pred, su->getPredicate(), su->getSrc(2));
		1394	pred->getInsn()->src(1).mod = Modifier(NV50_IR_MOD_NOT);
		1395	}
		1396	}
		1397	Instruction *red = bld.mkOp(OP_ATOM, su->dType, su->getDef(0));
		1398	red->subOp = su->subOp;
		1399	if (!gMemBase)
		1400	gMemBase = bld.mkSymbol(FILE_MEMORY_GLOBAL, 0, TYPE_U32, 0);
		1401	red->setSrc(0, gMemBase);
		1402	red->setSrc(1, su->getSrc(3));
		1403	if (su->subOp == NV50_IR_SUBOP_ATOM_CAS)
		1404	red->setSrc(2, su->getSrc(4));
		1405	red->setIndirect(0, 0, su->getSrc(0));
		1406	red->setPredicate(cc, pred);
		1407	delete_Instruction(bld.getProgram(), su);
		1408	handleCasExch(red, true);
		1409	} else {
		1410	su->sType = (su->tex.target == TEX_TARGET_BUFFER) ? TYPE_U32 : TYPE_U8;
		1411	}
		1412	}
		1413
		1414	bool
		1415	NVC0LoweringPass::handleWRSV(Instruction *i)
		1416	{
		1417	Instruction *st;
		1418	Symbol *sym;
		1419	uint32_t addr;
		1420
		1421	// must replace, $sreg are not writeable
		1422	addr = targ->getSVAddress(FILE_SHADER_OUTPUT, i->getSrc(0)->asSym());
		1423	if (addr >= 0x400)
		1424	return false;
		1425	sym = bld.mkSymbol(FILE_SHADER_OUTPUT, 0, i->sType, addr);
		1426
		1427	st = bld.mkStore(OP_EXPORT, i->dType, sym, i->getIndirect(0, 0),
		1428	i->getSrc(1));
		1429	st->perPatch = i->perPatch;
		1430
		1431	bld.getBB()->remove(i);
		1432	return true;
		1433	}
		1434
		1435	void
		1436	NVC0LoweringPass::readTessCoord(LValue *dst, int c)
		1437	{
		1438	Value *laneid = bld.getSSA();
		1439	Value x, y;
		1440
		1441	bld.mkOp1(OP_RDSV, TYPE_U32, laneid, bld.mkSysVal(SV_LANEID, 0));
		1442
		1443	if (c == 0) {
		1444	x = dst;
		1445	y = NULL;
		1446	} else
		1447	if (c == 1) {
		1448	x = NULL;
		1449	y = dst;
		1450	} else {
		1451	assert(c == 2);
		1452	x = bld.getSSA();
		1453	y = bld.getSSA();
		1454	}
		1455	if (x)
		1456	bld.mkFetch(x, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f0, NULL, laneid);
		1457	if (y)
		1458	bld.mkFetch(y, TYPE_F32, FILE_SHADER_OUTPUT, 0x2f4, NULL, laneid);
		1459
		1460	if (c == 2) {
		1461	bld.mkOp2(OP_ADD, TYPE_F32, dst, x, y);
		1462	bld.mkOp2(OP_SUB, TYPE_F32, dst, bld.loadImm(NULL, 1.0f), dst);
		1463	}
		1464	}
		1465
		1466	bool
		1467	NVC0LoweringPass::handleRDSV(Instruction *i)
		1468	{
		1469	Symbol *sym = i->getSrc(0)->asSym();
		1470	const SVSemantic sv = sym->reg.data.sv.sv;
		1471	Value *vtx = NULL;
		1472	Instruction *ld;
		1473	uint32_t addr = targ->getSVAddress(FILE_SHADER_INPUT, sym);
		1474
		1475	if (addr >= 0x400) {
		1476	// mov $sreg
		1477	if (sym->reg.data.sv.index == 3) {
		1478	// TGSI backend may use 4th component of TID,NTID,CTAID,NCTAID
		1479	i->op = OP_MOV;
		1480	i->setSrc(0, bld.mkImm((sv == SV_NTID \|\| sv == SV_NCTAID) ? 1 : 0));
		1481	}
		1482	return true;
		1483	}
		1484
		1485	switch (sv) {
		1486	case SV_POSITION:
		1487	assert(prog->getType() == Program::TYPE_FRAGMENT);
		1488	if (i->srcExists(1)) {
		1489	// Pass offset through to the interpolation logic
		1490	ld = bld.mkInterp(NV50_IR_INTERP_LINEAR \| NV50_IR_INTERP_OFFSET,
		1491	i->getDef(0), addr, NULL);
		1492	ld->setSrc(1, i->getSrc(1));
		1493	} else {
		1494	bld.mkInterp(NV50_IR_INTERP_LINEAR, i->getDef(0), addr, NULL);
		1495	}
		1496	break;
		1497	case SV_FACE:
		1498	{
		1499	Value *face = i->getDef(0);
		1500	bld.mkInterp(NV50_IR_INTERP_FLAT, face, addr, NULL);
		1501	if (i->dType == TYPE_F32) {
		1502	bld.mkOp2(OP_OR, TYPE_U32, face, face, bld.mkImm(0x00000001));
		1503	bld.mkOp1(OP_NEG, TYPE_S32, face, face);
		1504	bld.mkCvt(OP_CVT, TYPE_F32, face, TYPE_S32, face);
		1505	}
		1506	}
		1507	break;
		1508	case SV_TESS_COORD:
		1509	assert(prog->getType() == Program::TYPE_TESSELLATION_EVAL);
		1510	readTessCoord(i->getDef(0)->asLValue(), i->getSrc(0)->reg.data.sv.index);
		1511	break;
		1512	case SV_NTID:
		1513	case SV_NCTAID:
		1514	case SV_GRIDID:
		1515	assert(targ->getChipset() >= NVISA_GK104_CHIPSET); // mov $sreg otherwise
		1516	if (sym->reg.data.sv.index == 3) {
		1517	i->op = OP_MOV;
		1518	i->setSrc(0, bld.mkImm(sv == SV_GRIDID ? 0 : 1));
		1519	return true;
		1520	}
		1521	addr += prog->driver->prop.cp.gridInfoBase;
		1522	bld.mkLoad(TYPE_U32, i->getDef(0),
		1523	bld.mkSymbol(FILE_MEMORY_CONST, 0, TYPE_U32, addr), NULL);
		1524	break;
		1525	case SV_SAMPLE_INDEX:
		1526	// TODO: Properly pass source as an address in the PIX address space
		1527	// (which can be of the form [r0+offset]). But this is currently
		1528	// unnecessary.
		1529	ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
		1530	ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
		1531	break;
		1532	case SV_SAMPLE_POS: {
		1533	Value *off = new_LValue(func, FILE_GPR);
		1534	ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
		1535	ld->subOp = NV50_IR_SUBOP_PIXLD_SAMPLEID;
		1536	bld.mkOp2(OP_SHL, TYPE_U32, off, i->getDef(0), bld.mkImm(3));
		1537	bld.mkLoad(TYPE_F32,
		1538	i->getDef(0),
		1539	bld.mkSymbol(
		1540	FILE_MEMORY_CONST, prog->driver->io.resInfoCBSlot,
		1541	TYPE_U32, prog->driver->io.sampleInfoBase +
		1542	4 * sym->reg.data.sv.index),
		1543	off);
		1544	break;
		1545	}
		1546	case SV_SAMPLE_MASK:
		1547	ld = bld.mkOp1(OP_PIXLD, TYPE_U32, i->getDef(0), bld.mkImm(0));
		1548	ld->subOp = NV50_IR_SUBOP_PIXLD_COVMASK;
		1549	break;
		1550	default:
		1551	if (prog->getType() == Program::TYPE_TESSELLATION_EVAL)
		1552	vtx = bld.mkOp1v(OP_PFETCH, TYPE_U32, bld.getSSA(), bld.mkImm(0));
		1553	ld = bld.mkFetch(i->getDef(0), i->dType,
		1554	FILE_SHADER_INPUT, addr, i->getIndirect(0, 0), vtx);
		1555	ld->perPatch = i->perPatch;
		1556	break;
		1557	}
		1558	bld.getBB()->remove(i);
		1559	return true;
		1560	}
		1561
		1562	bool
		1563	NVC0LoweringPass::handleDIV(Instruction *i)
		1564	{
		1565	if (!isFloatType(i->dType))
		1566	return true;
		1567	bld.setPosition(i, false);
		1568	Instruction *rcp = bld.mkOp1(OP_RCP, i->dType, bld.getSSA(typeSizeof(i->dType)), i->getSrc(1));
		1569	i->op = OP_MUL;
		1570	i->setSrc(1, rcp->getDef(0));
		1571	return true;
		1572	}
		1573
		1574	bool
		1575	NVC0LoweringPass::handleMOD(Instruction *i)
		1576	{
		1577	if (!isFloatType(i->dType))
		1578	return true;
		1579	LValue *value = bld.getScratch(typeSizeof(i->dType));
		1580	bld.mkOp1(OP_RCP, i->dType, value, i->getSrc(1));
		1581	bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(0), value);
		1582	bld.mkOp1(OP_TRUNC, i->dType, value, value);
		1583	bld.mkOp2(OP_MUL, i->dType, value, i->getSrc(1), value);
		1584	i->op = OP_SUB;
		1585	i->setSrc(1, value);
		1586	return true;
		1587	}
		1588
		1589	bool
		1590	NVC0LoweringPass::handleSQRT(Instruction *i)
		1591	{
		1592	Value *pred = bld.getSSA(1, FILE_PREDICATE);
		1593	Value *zero = bld.getSSA();
		1594	Instruction *rsq;
		1595
		1596	bld.mkOp1(OP_MOV, TYPE_U32, zero, bld.mkImm(0));
		1597	if (i->dType == TYPE_F64)
		1598	zero = bld.mkOp2v(OP_MERGE, TYPE_U64, bld.getSSA(8), zero, zero);
		1599	bld.mkCmp(OP_SET, CC_LE, i->dType, pred, i->dType, i->getSrc(0), zero);
		1600	bld.mkOp1(OP_MOV, i->dType, i->getDef(0), zero)->setPredicate(CC_P, pred);
		1601	rsq = bld.mkOp1(OP_RSQ, i->dType,
		1602	bld.getSSA(typeSizeof(i->dType)), i->getSrc(0));
		1603	rsq->setPredicate(CC_NOT_P, pred);
		1604	i->op = OP_MUL;
		1605	i->setSrc(1, rsq->getDef(0));
		1606	i->setPredicate(CC_NOT_P, pred);
		1607
		1608
		1609	return true;
		1610	}
		1611
		1612	bool
		1613	NVC0LoweringPass::handlePOW(Instruction *i)
		1614	{
		1615	LValue *val = bld.getScratch();
		1616
		1617	bld.mkOp1(OP_LG2, TYPE_F32, val, i->getSrc(0));
		1618	bld.mkOp2(OP_MUL, TYPE_F32, val, i->getSrc(1), val)->dnz = 1;
		1619	bld.mkOp1(OP_PREEX2, TYPE_F32, val, val);
		1620
		1621	i->op = OP_EX2;
		1622	i->setSrc(0, val);
		1623	i->setSrc(1, NULL);
		1624
		1625	return true;
		1626	}
		1627
		1628	bool
		1629	NVC0LoweringPass::handleEXPORT(Instruction *i)
		1630	{
		1631	if (prog->getType() == Program::TYPE_FRAGMENT) {
		1632	int id = i->getSrc(0)->reg.data.offset / 4;
		1633
		1634	if (i->src(0).isIndirect(0)) // TODO, ugly
		1635	return false;
		1636	i->op = OP_MOV;
		1637	i->subOp = NV50_IR_SUBOP_MOV_FINAL;
		1638	i->src(0).set(i->src(1));
		1639	i->setSrc(1, NULL);
		1640	i->setDef(0, new_LValue(func, FILE_GPR));
		1641	i->getDef(0)->reg.data.id = id;
		1642
		1643	prog->maxGPR = MAX2(prog->maxGPR, id);
		1644	} else
		1645	if (prog->getType() == Program::TYPE_GEOMETRY) {
		1646	i->setIndirect(0, 1, gpEmitAddress);
		1647	}
		1648	return true;
		1649	}
		1650
		1651	bool
		1652	NVC0LoweringPass::handleOUT(Instruction *i)
		1653	{
		1654	Instruction *prev = i->prev;
		1655	ImmediateValue stream, prevStream;
		1656
		1657	// Only merge if the stream ids match. Also, note that the previous
		1658	// instruction would have already been lowered, so we take arg1 from it.
		1659	if (i->op == OP_RESTART && prev && prev->op == OP_EMIT &&
		1660	i->src(0).getImmediate(stream) &&
		1661	prev->src(1).getImmediate(prevStream) &&
		1662	stream.reg.data.u32 == prevStream.reg.data.u32) {
		1663	i->prev->subOp = NV50_IR_SUBOP_EMIT_RESTART;
		1664	delete_Instruction(prog, i);
		1665	} else {
		1666	assert(gpEmitAddress);
		1667	i->setDef(0, gpEmitAddress);
		1668	i->setSrc(1, i->getSrc(0));
		1669	i->setSrc(0, gpEmitAddress);
		1670	}
		1671	return true;
		1672	}
		1673
		1674	// Generate a binary predicate if an instruction is predicated by
		1675	// e.g. an f32 value.
		1676	void
		1677	NVC0LoweringPass::checkPredicate(Instruction *insn)
		1678	{
		1679	Value *pred = insn->getPredicate();
		1680	Value *pdst;
		1681
		1682	if (!pred \|\| pred->reg.file == FILE_PREDICATE)
		1683	return;
		1684	pdst = new_LValue(func, FILE_PREDICATE);
		1685
		1686	// CAUTION: don't use pdst->getInsn, the definition might not be unique,
		1687	// delay turning PSET(FSET(x,y),0) into PSET(x,y) to a later pass
		1688
		1689	bld.mkCmp(OP_SET, CC_NEU, insn->dType, pdst, insn->dType, bld.mkImm(0), pred);
		1690
		1691	insn->setPredicate(insn->cc, pdst);
		1692	}
		1693
		1694	//
		1695	// - add quadop dance for texturing
		1696	// - put FP outputs in GPRs
		1697	// - convert instruction sequences
		1698	//
		1699	bool
		1700	NVC0LoweringPass::visit(Instruction *i)
		1701	{
		1702	bld.setPosition(i, false);
		1703
		1704	if (i->cc != CC_ALWAYS)
		1705	checkPredicate(i);
		1706
		1707	switch (i->op) {
		1708	case OP_TEX:
		1709	case OP_TXB:
		1710	case OP_TXL:
		1711	case OP_TXF:
		1712	case OP_TXG:
		1713	return handleTEX(i->asTex());
		1714	case OP_TXD:
		1715	return handleTXD(i->asTex());
		1716	case OP_TXLQ:
		1717	return handleTXLQ(i->asTex());
		1718	case OP_TXQ:
		1719	return handleTXQ(i->asTex());
		1720	case OP_EX2:
		1721	bld.mkOp1(OP_PREEX2, TYPE_F32, i->getDef(0), i->getSrc(0));
		1722	i->setSrc(0, i->getDef(0));
		1723	break;
		1724	case OP_POW:
		1725	return handlePOW(i);
		1726	case OP_DIV:
		1727	return handleDIV(i);
		1728	case OP_MOD:
		1729	return handleMOD(i);
		1730	case OP_SQRT:
		1731	return handleSQRT(i);
		1732	case OP_EXPORT:
		1733	return handleEXPORT(i);
		1734	case OP_EMIT:
		1735	case OP_RESTART:
		1736	return handleOUT(i);
		1737	case OP_RDSV:
		1738	return handleRDSV(i);
		1739	case OP_WRSV:
		1740	return handleWRSV(i);
		1741	case OP_LOAD:
		1742	if (i->src(0).getFile() == FILE_SHADER_INPUT) {
		1743	if (prog->getType() == Program::TYPE_COMPUTE) {
		1744	i->getSrc(0)->reg.file = FILE_MEMORY_CONST;
		1745	i->getSrc(0)->reg.fileIndex = 0;
		1746	} else
		1747	if (prog->getType() == Program::TYPE_GEOMETRY &&
		1748	i->src(0).isIndirect(0)) {
		1749	// XXX: this assumes vec4 units
		1750	Value *ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
		1751	i->getIndirect(0, 0), bld.mkImm(4));
		1752	i->setIndirect(0, 0, ptr);
		1753	i->op = OP_VFETCH;
		1754	} else {
		1755	i->op = OP_VFETCH;
		1756	assert(prog->getType() != Program::TYPE_FRAGMENT); // INTERP
		1757	}
		1758	} else if (i->src(0).getFile() == FILE_MEMORY_CONST) {
		1759	if (i->src(0).isIndirect(1)) {
		1760	Value *ptr;
		1761	if (i->src(0).isIndirect(0))
		1762	ptr = bld.mkOp3v(OP_INSBF, TYPE_U32, bld.getSSA(),
		1763	i->getIndirect(0, 1), bld.mkImm(0x1010),
		1764	i->getIndirect(0, 0));
		1765	else
		1766	ptr = bld.mkOp2v(OP_SHL, TYPE_U32, bld.getSSA(),
		1767	i->getIndirect(0, 1), bld.mkImm(16));
		1768	i->setIndirect(0, 1, NULL);
		1769	i->setIndirect(0, 0, ptr);
		1770	i->subOp = NV50_IR_SUBOP_LDC_IS;
		1771	}
		1772	}
		1773	break;
		1774	case OP_ATOM:
		1775	{
		1776	const bool cctl = i->src(0).getFile() == FILE_MEMORY_GLOBAL;
		1777	handleATOM(i);
		1778	handleCasExch(i, cctl);
		1779	}
		1780	break;
		1781	case OP_SULDB:
		1782	case OP_SULDP:
		1783	case OP_SUSTB:
		1784	case OP_SUSTP:
		1785	case OP_SUREDB:
		1786	case OP_SUREDP:
		1787	if (targ->getChipset() >= NVISA_GK104_CHIPSET)
		1788	handleSurfaceOpNVE4(i->asTex());
		1789	break;
		1790	default:
		1791	break;
		1792	}
		1793	return true;
		1794	}
		1795
		1796	bool
		1797	TargetNVC0::runLegalizePass(Program *prog, CGStage stage) const
		1798	{
		1799	if (stage == CG_STAGE_PRE_SSA) {
		1800	NVC0LoweringPass pass(prog);
		1801	return pass.run(prog, false, true);
		1802	} else
		1803	if (stage == CG_STAGE_POST_RA) {
		1804	NVC0LegalizePostRA pass(prog);
		1805	return pass.run(prog, false, true);
		1806	} else
		1807	if (stage == CG_STAGE_SSA) {
		1808	NVC0LegalizeSSA pass;
		1809	return pass.run(prog, false, true);
		1810	}
		1811	return false;
		1812	}
		1813
		1814	} // namespace nv50_ir

Subversion Repositories Kolibri OS

(root)/contrib/sdk/sources/Mesa/mesa-10.6.0/src/gallium/drivers/nouveau/codegen/nv50_ir_lowering_nvc0.cpp – Rev 5571