Subversion Repositories Kolibri OS

Rev

Rev 4358 | Go to most recent revision | Details | Compare with Previous | Last modification | View Log | RSS feed

Rev Author Line No. Line
4358 Serge 1
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */
23
/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */
30
 
31
extern "C" {

#include <sys/types.h>

#include "main/hash_table.h"
#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
51
 
52
void
53
fs_inst::init()
54
{
55
   memset(this, 0, sizeof(*this));
56
   this->opcode = BRW_OPCODE_NOP;
57
   this->conditional_mod = BRW_CONDITIONAL_NONE;
58
 
59
   this->dst = reg_undef;
60
   this->src[0] = reg_undef;
61
   this->src[1] = reg_undef;
62
   this->src[2] = reg_undef;
63
 
64
   /* This will be the case for almost all instructions. */
65
   this->regs_written = 1;
66
}
67
 
68
/** Default constructor: a NOP with undefined operands. */
fs_inst::fs_inst()
{
   this->init();
}
72
 
73
/** Constructor for an instruction with no destination or sources. */
fs_inst::fs_inst(enum opcode opcode)
{
   this->init();
   this->opcode = opcode;
}
78
 
79
/** Constructor for a destination-only instruction. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   this->init();
   this->opcode = opcode;
   this->dst = dst;

   /* Virtual GRF destinations must have a non-negative register offset. */
   assert(dst.file != GRF || dst.reg_offset >= 0);
}
88
 
89
/** Constructor for a one-source instruction. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   this->init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   /* Virtual GRF operands must have non-negative register offsets. */
   assert(dst.file != GRF || dst.reg_offset >= 0);
   assert(src[0].file != GRF || src[0].reg_offset >= 0);
}
101
 
102
/** Constructor for a two-source instruction. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   this->init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   /* Virtual GRF operands must have non-negative register offsets. */
   assert(dst.file != GRF || dst.reg_offset >= 0);
   assert(src[0].file != GRF || src[0].reg_offset >= 0);
   assert(src[1].file != GRF || src[1].reg_offset >= 0);
}
117
 
118
/** Constructor for a three-source instruction. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
		 fs_reg src0, fs_reg src1, fs_reg src2)
{
   this->init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   /* Virtual GRF operands must have non-negative register offsets. */
   assert(dst.file != GRF || dst.reg_offset >= 0);
   for (int i = 0; i < 3; i++)
      assert(src[i].file != GRF || src[i].reg_offset >= 0);
}
137
 
138
/* Helper macros that stamp out one fs_visitor factory method per ALU
 * opcode, for one-, two- and three-source forms.  Each generated method
 * allocates a new fs_inst for BRW_OPCODE_<op> out of mem_ctx and returns
 * it without emitting it into the instruction stream.
 */
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

/* Instantiate the factory methods for every ALU opcode the visitor uses. */
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
182
 
183
/** Gen4 predicated IF. */
184
fs_inst *
185
fs_visitor::IF(uint32_t predicate)
186
{
187
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
188
   inst->predicate = predicate;
189
   return inst;
190
}
191
 
192
/** Gen6+ IF with embedded comparison. */
193
fs_inst *
194
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
195
{
196
   assert(brw->gen >= 6);
197
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
198
                                        reg_null_d, src0, src1);
199
   inst->conditional_mod = condition;
200
   return inst;
201
}
202
 
203
/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 *
 * @param dst       destination register (type may be rewritten on gen4)
 * @param src0      first comparison operand
 * @param src1      second comparison operand
 * @param condition BRW conditional modifier (e.g. BRW_CONDITIONAL_Z)
 * @return the new (not yet emitted) CMP instruction
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null src0 src1
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (brw->gen == 4) {
      dst.type = src0.type;
      if (dst.file == HW_REG)
	 dst.fixed_hw_reg.type = dst.type;
   }

   /* Lower negate modifiers on unsigned sources before comparing —
    * presumably negate is not directly representable on UD operands
    * (TODO confirm against resolve_ud_negate).
    */
   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
239
 
240
/**
 * Builds the instruction sequence for a pull-constant load whose offset
 * is computed at run time (a "varying" offset).
 *
 * @param dst            destination register for the loaded component
 * @param surf_index     surface (binding table) index of the constant buffer
 * @param varying_offset per-channel offset register
 * @param const_offset   compile-time constant part of the offset
 * @return the list of emitted-ready instructions (ADD, load, MOV)
 */
exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
   fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, const_offset & ~3));

   int scale = 1;
   if (brw->gen == 4 && dispatch_width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   enum opcode op;
   if (brw->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
   /* The load returns a full vec4 (scaled for the SIMD16 trick above). */
   fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = 4 * scale;
   instructions.push_tail(inst);

   /* Pre-gen7 sends go through MRFs and need an explicit header. */
   if (brw->gen < 7) {
      inst->base_mrf = 13;
      inst->header_present = true;
      if (brw->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   /* Select the requested component out of the loaded vec4. */
   vec4_result.reg_offset += (const_offset & 3) * scale;
   instructions.push_tail(MOV(dst, vec4_result));

   return instructions;
}
297
 
298
/**
299
 * A helper for MOV generation for fixing up broken hardware SEND dependency
300
 * handling.
301
 */
302
fs_inst *
303
fs_visitor::DEP_RESOLVE_MOV(int grf)
304
{
305
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
306
 
307
   inst->ir = NULL;
308
   inst->annotation = "send dependency resolve";
309
 
310
   /* The caller always wants uncompressed to emit the minimal extra
311
    * dependencies, and to avoid having to deal with aligning its regs to 2.
312
    */
313
   inst->force_uncompressed = true;
314
 
315
   return inst;
316
}
317
 
318
bool
319
fs_inst::equals(fs_inst *inst)
320
{
321
   return (opcode == inst->opcode &&
322
           dst.equals(inst->dst) &&
323
           src[0].equals(inst->src[0]) &&
324
           src[1].equals(inst->src[1]) &&
325
           src[2].equals(inst->src[2]) &&
326
           saturate == inst->saturate &&
327
           predicate == inst->predicate &&
328
           conditional_mod == inst->conditional_mod &&
329
           mlen == inst->mlen &&
330
           base_mrf == inst->base_mrf &&
331
           sampler == inst->sampler &&
332
           target == inst->target &&
333
           eot == inst->eot &&
334
           header_present == inst->header_present &&
335
           shadow_compare == inst->shadow_compare &&
336
           offset == inst->offset);
337
}
338
 
339
bool
340
fs_inst::overwrites_reg(const fs_reg ®)
341
{
342
   return (reg.file == dst.file &&
343
           reg.reg == dst.reg &&
344
           reg.reg_offset >= dst.reg_offset  &&
345
           reg.reg_offset < dst.reg_offset + regs_written);
346
}
347
 
348
bool
349
fs_inst::is_send_from_grf()
350
{
351
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
352
           opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
353
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
354
            src[1].file == GRF));
355
}
356
 
357
bool
358
fs_visitor::can_do_source_mods(fs_inst *inst)
359
{
360
   if (brw->gen == 6 && inst->is_math())
361
      return false;
362
 
363
   if (inst->is_send_from_grf())
364
      return false;
365
 
366
   return true;
367
}
368
 
369
void
370
fs_reg::init()
371
{
372
   memset(this, 0, sizeof(*this));
373
   this->smear = -1;
374
}
375
 
376
/** Generic unset register constructor. */
377
fs_reg::fs_reg()
378
{
379
   init();
380
   this->file = BAD_FILE;
381
}
382
 
383
/** Immediate value constructor. */
384
fs_reg::fs_reg(float f)
385
{
386
   init();
387
   this->file = IMM;
388
   this->type = BRW_REGISTER_TYPE_F;
389
   this->imm.f = f;
390
}
391
 
392
/** Immediate value constructor. */
393
fs_reg::fs_reg(int32_t i)
394
{
395
   init();
396
   this->file = IMM;
397
   this->type = BRW_REGISTER_TYPE_D;
398
   this->imm.i = i;
399
}
400
 
401
/** Immediate value constructor. */
402
fs_reg::fs_reg(uint32_t u)
403
{
404
   init();
405
   this->file = IMM;
406
   this->type = BRW_REGISTER_TYPE_UD;
407
   this->imm.u = u;
408
}
409
 
410
/** Fixed brw_reg Immediate value constructor. */
411
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
412
{
413
   init();
414
   this->file = HW_REG;
415
   this->fixed_hw_reg = fixed_hw_reg;
416
   this->type = fixed_hw_reg.type;
417
}
418
 
419
bool
420
fs_reg::equals(const fs_reg &r) const
421
{
422
   return (file == r.file &&
423
           reg == r.reg &&
424
           reg_offset == r.reg_offset &&
425
           type == r.type &&
426
           negate == r.negate &&
427
           abs == r.abs &&
428
           !reladdr && !r.reladdr &&
429
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
430
                  sizeof(fixed_hw_reg)) == 0 &&
431
           smear == r.smear &&
432
           imm.u == r.imm.u);
433
}
434
 
435
bool
436
fs_reg::is_zero() const
437
{
438
   if (file != IMM)
439
      return false;
440
 
441
   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
442
}
443
 
444
bool
445
fs_reg::is_one() const
446
{
447
   if (file != IMM)
448
      return false;
449
 
450
   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
451
}
452
 
453
bool
454
fs_reg::is_valid_3src() const
455
{
456
   return file == GRF || file == UNIFORM;
457
}
458
 
459
/**
 * Returns the number of scalar register components needed to store a
 * value of the given GLSL type (arrays/structs are summed recursively;
 * samplers occupy no space).
 */
int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      /* Scalars/vectors/matrices: one slot per component. */
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
	 size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}
492
 
493
/**
 * Reads the TIMESTAMP architecture register into a fresh virtual GRF and
 * returns it (with smear set to channel 0, the low 32 bits).
 * Only valid on gen7+.
 */
fs_reg
fs_visitor::get_timestamp()
{
   assert(brw->gen >= 7);

   /* The timestamp lives in the ARF; read it as unsigned dwords. */
   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}
526
 
527
/**
 * Records the starting timestamp for the INTEL_DEBUG shader-time
 * instrumentation; paired with emit_shader_time_end().
 */
void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}
533
 
534
/**
 * Closes the shader-time measurement opened by emit_shader_time_begin():
 * reads the end timestamp, guards against timestamp-reset events, and
 * accumulates either (diff, written) or (reset) into the per-type
 * shader-time buffer slots.
 */
void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   /* Pick the bucket triple matching our dispatch width. */
   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   /* diff = end - start, computed as end + (-start). */
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   /* A reset happened mid-shader: record that instead of a bogus diff. */
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}
582
 
583
void
584
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
585
                                   fs_reg value)
586
{
587
   int shader_time_index =
588
      brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
589
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
590
 
591
   fs_reg payload;
592
   if (dispatch_width == 8)
593
      payload = fs_reg(this, glsl_type::uvec2_type);
594
   else
595
      payload = fs_reg(this, glsl_type::uint_type);
596
 
597
   emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
598
                fs_reg(), payload, offset, value));
599
}
600
 
601
/**
 * Marks the compile as failed, recording a printf-formatted reason in
 * fail_msg.  Only the first failure is recorded; later calls are no-ops.
 * The message is also printed to stderr when DEBUG_WM is enabled.
 */
void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s",  msg);
   }
}
623
 
624
/** Emits a no-operand instruction into the stream. */
fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   fs_inst inst(opcode);
   return emit(inst);
}
629
 
630
/** Emits a destination-only instruction into the stream. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   fs_inst inst(opcode, dst);
   return emit(inst);
}
635
 
636
/** Emits a one-source instruction into the stream. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   fs_inst inst(opcode, dst, src0);
   return emit(inst);
}
641
 
642
/** Emits a two-source instruction into the stream. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   fs_inst inst(opcode, dst, src0, src1);
   return emit(inst);
}
647
 
648
/** Emits a three-source instruction into the stream. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   fs_inst inst(opcode, dst, src0, src1, src2);
   return emit(inst);
}
654
 
655
void
656
fs_visitor::push_force_uncompressed()
657
{
658
   force_uncompressed_stack++;
659
}
660
 
661
void
662
fs_visitor::pop_force_uncompressed()
663
{
664
   force_uncompressed_stack--;
665
   assert(force_uncompressed_stack >= 0);
666
}
667
 
668
void
669
fs_visitor::push_force_sechalf()
670
{
671
   force_sechalf_stack++;
672
}
673
 
674
void
675
fs_visitor::pop_force_sechalf()
676
{
677
   force_sechalf_stack--;
678
   assert(force_sechalf_stack >= 0);
679
}
680
 
681
/**
682
 * Returns true if the instruction has a flag that means it won't
683
 * update an entire destination register.
684
 *
685
 * For example, dead code elimination and live variable analysis want to know
686
 * when a write to a variable screens off any preceding values that were in
687
 * it.
688
 */
689
bool
690
fs_inst::is_partial_write()
691
{
692
   return (this->predicate ||
693
           this->force_uncompressed ||
694
           this->force_sechalf);
695
}
696
 
697
/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   /* A message length of zero means nothing is sent, so no MRFs written. */
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   /* Unary math: one payload register per SIMD8 group. */
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   /* Binary math: two payload registers per SIMD8 group. */
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   /* Texturing messages build their payload with explicit MOVs; only the
    * header MRF is implied here.
    */
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_MS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
745
 
746
int
747
fs_visitor::virtual_grf_alloc(int size)
748
{
749
   if (virtual_grf_array_size <= virtual_grf_count) {
750
      if (virtual_grf_array_size == 0)
751
	 virtual_grf_array_size = 16;
752
      else
753
	 virtual_grf_array_size *= 2;
754
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
755
				   virtual_grf_array_size);
756
   }
757
   virtual_grf_sizes[virtual_grf_count] = size;
758
   return virtual_grf_count++;
759
}
760
 
761
/** Fixed HW reg constructor. */
762
fs_reg::fs_reg(enum register_file file, int reg)
763
{
764
   init();
765
   this->file = file;
766
   this->reg = reg;
767
   this->type = BRW_REGISTER_TYPE_F;
768
}
769
 
770
/** Fixed HW reg constructor. */
771
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
772
{
773
   init();
774
   this->file = file;
775
   this->reg = reg;
776
   this->type = type;
777
}
778
 
779
/** Automatic reg constructor. */
780
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
781
{
782
   init();
783
 
784
   this->file = GRF;
785
   this->reg = v->virtual_grf_alloc(v->type_size(type));
786
   this->reg_offset = 0;
787
   this->type = brw_type_for_base_type(type);
788
}
789
 
790
/** Looks up the register backing \p var, or NULL if none is recorded. */
fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   void *entry = hash_table_find(this->variable_ht, var);
   return static_cast<fs_reg *>(entry);
}
795
 
796
void
797
import_uniforms_callback(const void *key,
798
			 void *data,
799
			 void *closure)
800
{
801
   struct hash_table *dst_ht = (struct hash_table *)closure;
802
   const fs_reg *reg = (const fs_reg *)data;
803
 
804
   if (reg->file != UNIFORM)
805
      return;
806
 
807
   hash_table_insert(dst_ht, data, key);
808
}
809
 
810
/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
811
 * This brings in those uniform definitions
812
 */
813
void
814
fs_visitor::import_uniforms(fs_visitor *v)
815
{
816
   hash_table_call_foreach(v->variable_ht,
817
			   import_uniforms_callback,
818
			   variable_ht);
819
   this->params_remap = v->params_remap;
820
   this->nr_params_remap = v->nr_params_remap;
821
}
822
 
823
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      /* Match "name" exactly, or "name.field" / "name[idx]" prefixes. */
      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      /* Point each param at the driver storage for one component. */
      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
   (void)params_before;
}
865
 
866
 
867
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					    (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
	 int swiz = GET_SWZ(slots[i].swizzle, j);
	 /* A repeated swizzle marks the end of the unique components. */
	 if (swiz == last_swiz)
	    break;
	 last_swiz = swiz;

	 c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}
900
 
901
/**
 * Emits the instructions computing gl_FragCoord (x, y, z, w) into a fresh
 * register, honoring the layout qualifiers pixel_center_integer and
 * origin_upper_left, and returns that register.
 */
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   /* Flip Y when the shader's origin convention disagrees with the FBO's. */
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
	 /* y' = (height - 1) - y, folded into negate + offset. */
	 pixel_y.negate = true;
	 offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (brw->gen >= 6) {
      /* Gen6+ delivers source depth in the thread payload. */
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}
948
 
949
/**
 * Emits a LINTERP of \p interp into \p attr using the barycentric
 * coordinate set selected by the interpolation qualifier and centroid
 * flag (gen6+); earlier hardware has only one mode.
 */
fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;

   if (brw->gen >= 6) {
      const bool persp = (interpolation_mode == INTERP_QUALIFIER_SMOOTH);
      if (is_centroid) {
         barycoord_mode = persp ? BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC
                                : BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else {
         barycoord_mode = persp ? BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC
                                : BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }

   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}
978
 
979
/**
 * Emits interpolation (or flat-shaded constant moves) for every element,
 * column and component of the input variable \p ir, returning the fresh
 * register the results land in.  Slots with no incoming URB setup are
 * skipped (their register space is still reserved).
 */
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolate at the scalar base type even for matrices/arrays. */
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
	    /* Constant interpolation (flat shading) case. The SF has
	     * handed us defined values in only the constant offset
	     * field of the setup reg.
	     */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       interp = suboffset(interp, 3);
               interp.type = reg->type;
	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
	       attr.reg_offset++;
	    }
	 } else {
	    /* Smooth/noperspective interpolation case. */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       /* FINISHME: At some point we probably want to push
		* this farther by giving similar treatment to the
		* other potentially constant components of the
		* attribute, as well as making brw_vs_constval.c
		* handle varyings other than gl_TexCoord.
		*/
               struct brw_reg interp = interp_reg(location, k);
               emit_linterp(attr, fs_reg(interp), interpolation_mode,
                            ir->centroid);
               if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
                  fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                               interpolation_mode, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
               }
               if (brw->gen < 6) {
                  /* Pre-gen6 interpolation yields attr/w; multiply the
                   * perspective term back in.
                   */
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
               }
	       attr.reg_offset++;
	    }

	 }
	 location++;
      }
   }

   return reg;
}
1064
 
1065
/**
 * Emit code producing a 0/1 value for gl_FrontFacing.
 *
 * The facing flag arrives as a bit in the thread payload; where it lives
 * (and its polarity) differs between hardware generations.
 */
fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (brw->gen >= 6) {
      /* Arithmetic-shift g0.0 (as signed dword) right by 15 to bring the
       * facing bit to bit 0, then NOT + AND 1 to invert it into a 0/1
       * front-facing value.  (Assumes bit 15 of g0.0 is "back facing" --
       * confirm against the PRM payload description.)
       */
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      /* CMP writes all-ones on pass; mask down to a 0/1 boolean. */
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
1088
 
1089
/**
 * Legalize one operand of a math instruction for the current generation.
 *
 * Returns the source unchanged when the hardware can consume it directly;
 * otherwise emits a MOV into a fresh temporary and returns that.
 */
fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   bool legal_on_gen6 = (brw->gen == 6 &&
                         src.file != UNIFORM && src.file != IMM &&
                         !src.abs && !src.negate);

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   bool legal_on_gen7 = (brw->gen >= 7 && src.file != IMM);

   if (legal_on_gen6 || legal_on_gen7)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}
1115
 
1116
/**
 * Emit a single-source math instruction (RCP/RSQ/SQRT/EXP2/LOG2/SIN/COS),
 * legalizing the operand on gen6+ and configuring the MRF-based math
 * message on earlier hardware.  Returns the emitted instruction, or NULL
 * on a bad opcode (debug builds assert first).
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   /* Reject anything that isn't one of the unary math functions. */
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Gen6+ math executes on registers directly, but can't take scalar
    * strides, immediates, or source modifiers; copy such operands to a
    * temporary first (see fix_math_operand).
    */
   if (brw->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *math_inst = emit(opcode, dst, src);

   if (brw->gen < 6) {
      /* Pre-gen6 math is a message to the extended math unit: the
       * operand travels in MRFs, one register per 8 channels.
       */
      math_inst->base_mrf = 2;
      math_inst->mlen = dispatch_width / 8;
   }

   return math_inst;
}
1153
 
1154
/**
 * Emit a two-source math instruction (POW, INT_QUOTIENT, INT_REMAINDER).
 *
 * On gen6+ the operands are legalized and the instruction executes
 * directly; before gen6 the second operand is delivered via an MRF
 * message payload (and the INT DIV operands are swapped per the PRM).
 * Returns the emitted instruction, or NULL on a bad opcode (debug builds
 * assert first).
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      /* SIMD16 integer division is not supported on gen7+. */
      if (brw->gen >= 7 && dispatch_width == 16)
	 fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (brw->gen >= 6) {
      /* Math can't take immediate operands (and, on gen6, several other
       * operand forms); copy illegal sources into temporaries.
       */
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      /* So for INT DIV, swap the operands to put the denominator first. */
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      /* The second operand goes out in the message payload (MRF); the
       * instruction itself reads op0 and a null second source.
       */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      /* Two operands' worth of payload, one register per 8 channels. */
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}
1200
 
1201
void
1202
fs_visitor::assign_curb_setup()
1203
{
1204
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1205
   if (dispatch_width == 8) {
1206
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
1207
   } else {
1208
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1209
   }
1210
 
1211
   /* Map the offsets in the UNIFORM file to fixed HW regs. */
1212
   foreach_list(node, &this->instructions) {
1213
      fs_inst *inst = (fs_inst *)node;
1214
 
1215
      for (unsigned int i = 0; i < 3; i++) {
1216
	 if (inst->src[i].file == UNIFORM) {
1217
	    int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1218
	    struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1219
						  constant_nr / 8,
1220
						  constant_nr % 8);
1221
 
1222
	    inst->src[i].file = HW_REG;
1223
	    inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1224
	 }
1225
      }
1226
   }
1227
}
1228
 
1229
/**
 * Decide which URB setup slot each incoming varying lands in.
 *
 * Fills urb_setup[] with a slot index per varying slot (-1 when no setup
 * data is delivered for it) and sets prog_data.urb_read_length.
 */
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (brw->gen >= 6) {
      /* Gen6+: attributes are simply packed in InputsRead order. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
	 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
	    urb_setup[i] = urb_next++;
	 }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

	 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
	    /* The back color slot is skipped when the front color is
	     * also written to.  In addition, some slots can be
	     * written in the vertex shader and not read in the
	     * fragment shader.  So the register number must always be
	     * incremented, mapped or not.
	     */
	    if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
	       urb_setup[i] = urb_next;
            urb_next++;
	 }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}
1277
 
1278
void
1279
fs_visitor::assign_urb_setup()
1280
{
1281
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1282
 
1283
   /* Offset all the urb_setup[] index by the actual position of the
1284
    * setup regs, now that the location of the constants has been chosen.
1285
    */
1286
   foreach_list(node, &this->instructions) {
1287
      fs_inst *inst = (fs_inst *)node;
1288
 
1289
      if (inst->opcode == FS_OPCODE_LINTERP) {
1290
	 assert(inst->src[2].file == HW_REG);
1291
	 inst->src[2].fixed_hw_reg.nr += urb_start;
1292
      }
1293
 
1294
      if (inst->opcode == FS_OPCODE_CINTERP) {
1295
	 assert(inst->src[0].file == HW_REG);
1296
	 inst->src[0].fixed_hw_reg.nr += urb_start;
1297
      }
1298
   }
1299
 
1300
   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1301
}
1302
 
1303
/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   /* NOTE: variable-length arrays are a GCC extension in C++. */
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
	 split_grf[i] = true;
      else
	 split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written > 1) {
	 split_grf[inst->dst.reg] = false;
      }

      /* If we're sending from a GRF, don't split it, on the assumption that
       * the send is reading the whole thing.
       */
      if (inst->is_send_from_grf()) {
         for (int i = 0; i < 3; i++) {
            if (inst->src[i].file == GRF) {
               split_grf[inst->src[i].reg] = false;
            }
         }
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
	 /* The original reg keeps reg_offset 0; new_virtual_grf[i] becomes
	  * the reg for offset 1, with offsets 2..size-1 following
	  * contiguously (verified by the assert below).
	  */
	 new_virtual_grf[i] = virtual_grf_alloc(1);
	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
	    int reg = virtual_grf_alloc(1);
	    assert(reg == new_virtual_grf[i] + j - 1);
	    (void) reg;
	 }
	 this->virtual_grf_sizes[i] = 1;
      }
   }

   /* Rewrite every non-zero-offset access of a split reg to use the
    * newly allocated single-register VGRFs.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
	  split_grf[inst->dst.reg] &&
	  inst->dst.reg_offset != 0) {
	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
			  inst->dst.reg_offset - 1);
	 inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
	 if (inst->src[i].file == GRF &&
	     split_grf[inst->src[i].reg] &&
	     inst->src[i].reg_offset != 0) {
	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
				inst->src[i].reg_offset - 1);
	    inst->src[i].reg_offset = 0;
	 }
      }
   }
   this->live_intervals_valid = false;
}
1406
 
1407
/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many.
    * (remap_table[i] == -1 means unused; any other value means used.)
    */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   /* The initializer above hard-codes these sizes; keep it in sync. */
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
	 remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         /* remap_table now maps old index -> new (compacted) index. */
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_start[new_index] = virtual_grf_start[i];
            virtual_grf_end[new_index] = virtual_grf_end[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
	 special[i]->reg = remap_table[special[i]->reg];
   }
}
1491
 
1492
bool
1493
fs_visitor::remove_dead_constants()
1494
{
1495
   if (dispatch_width == 8) {
1496
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1497
      this->nr_params_remap = c->prog_data.nr_params;
1498
 
1499
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1500
	 this->params_remap[i] = -1;
1501
 
1502
      /* Find which params are still in use. */
1503
      foreach_list(node, &this->instructions) {
1504
	 fs_inst *inst = (fs_inst *)node;
1505
 
1506
	 for (int i = 0; i < 3; i++) {
1507
	    int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1508
 
1509
	    if (inst->src[i].file != UNIFORM)
1510
	       continue;
1511
 
1512
	    /* Section 5.11 of the OpenGL 4.3 spec says:
1513
	     *
1514
	     *     "Out-of-bounds reads return undefined values, which include
1515
	     *     values from other variables of the active program or zero."
1516
	     */
1517
	    if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1518
	       constant_nr = 0;
1519
	    }
1520
 
1521
	    /* For now, set this to non-negative.  We'll give it the
1522
	     * actual new number in a moment, in order to keep the
1523
	     * register numbers nicely ordered.
1524
	     */
1525
	    this->params_remap[constant_nr] = 0;
1526
	 }
1527
      }
1528
 
1529
      /* Figure out what the new numbers for the params will be.  At some
1530
       * point when we're doing uniform array access, we're going to want
1531
       * to keep the distinction between .reg and .reg_offset, but for
1532
       * now we don't care.
1533
       */
1534
      unsigned int new_nr_params = 0;
1535
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1536
	 if (this->params_remap[i] != -1) {
1537
	    this->params_remap[i] = new_nr_params++;
1538
	 }
1539
      }
1540
 
1541
      /* Update the list of params to be uploaded to match our new numbering. */
1542
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1543
	 int remapped = this->params_remap[i];
1544
 
1545
	 if (remapped == -1)
1546
	    continue;
1547
 
1548
	 c->prog_data.param[remapped] = c->prog_data.param[i];
1549
      }
1550
 
1551
      c->prog_data.nr_params = new_nr_params;
1552
   } else {
1553
      /* This should have been generated in the 8-wide pass already. */
1554
      assert(this->params_remap);
1555
   }
1556
 
1557
   /* Now do the renumbering of the shader to remove unused params. */
1558
   foreach_list(node, &this->instructions) {
1559
      fs_inst *inst = (fs_inst *)node;
1560
 
1561
      for (int i = 0; i < 3; i++) {
1562
	 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1563
 
1564
	 if (inst->src[i].file != UNIFORM)
1565
	    continue;
1566
 
1567
	 /* as above alias to 0 */
1568
	 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1569
	    constant_nr = 0;
1570
	 }
1571
	 assert(this->params_remap[constant_nr] != -1);
1572
	 inst->src[i].reg = this->params_remap[constant_nr];
1573
	 inst->src[i].reg_offset = 0;
1574
      }
1575
   }
1576
 
1577
   return true;
1578
}
1579
 
1580
/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   /* NOTE: variable-length arrays are a GCC extension in C++. */
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         /* Only relative-addressed (reladdr) uniform reads are moved. */
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            /* Copy the whole array's worth of params to the pull list. */
            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index,
                                                     *inst->src[i].reladdr,
                                                     pull_constant_loc[uniform] +
                                                     inst->src[i].reg_offset);
         inst->insert_before(&list);

         /* Rewrite the source to read the freshly loaded temporary. */
         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}
1652
 
1653
/**
1654
 * Choose accesses from the UNIFORM file to demote to using the pull
1655
 * constant buffer.
1656
 *
1657
 * We allow a fragment shader to have more than the specified minimum
1658
 * maximum number of fragment shader uniform components (64).  If
1659
 * there are too many of these, they'd fill up all of register space.
1660
 * So, this will push some of them out to the pull constant buffer and
1661
 * update the program to load them.
1662
 */
1663
void
1664
fs_visitor::setup_pull_constants()
1665
{
1666
   /* Only allow 16 registers (128 uniform components) as push constants. */
1667
   unsigned int max_uniform_components = 16 * 8;
1668
   if (c->prog_data.nr_params <= max_uniform_components)
1669
      return;
1670
 
1671
   if (dispatch_width == 16) {
1672
      fail("Pull constants not supported in 16-wide\n");
1673
      return;
1674
   }
1675
 
1676
   /* Just demote the end of the list.  We could probably do better
1677
    * here, demoting things that are rarely used in the program first.
1678
    */
1679
   unsigned int pull_uniform_base = max_uniform_components;
1680
 
1681
   int pull_constant_loc[c->prog_data.nr_params];
1682
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1683
      if (i < pull_uniform_base) {
1684
         pull_constant_loc[i] = -1;
1685
      } else {
1686
         pull_constant_loc[i] = -1;
1687
         /* If our constant is already being uploaded for reladdr purposes,
1688
          * reuse it.
1689
          */
1690
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1691
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1692
               pull_constant_loc[i] = j;
1693
               break;
1694
            }
1695
         }
1696
         if (pull_constant_loc[i] == -1) {
1697
            int pull_index = c->prog_data.nr_pull_params++;
1698
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1699
            pull_constant_loc[i] = pull_index;;
1700
         }
1701
      }
1702
   }
1703
   c->prog_data.nr_params = pull_uniform_base;
1704
 
1705
   foreach_list(node, &this->instructions) {
1706
      fs_inst *inst = (fs_inst *)node;
1707
 
1708
      for (int i = 0; i < 3; i++) {
1709
	 if (inst->src[i].file != UNIFORM)
1710
	    continue;
1711
 
1712
         int pull_index = pull_constant_loc[inst->src[i].reg +
1713
                                            inst->src[i].reg_offset];
1714
         if (pull_index == -1)
1715
	    continue;
1716
 
1717
         assert(!inst->src[i].reladdr);
1718
 
1719
	 fs_reg dst = fs_reg(this, glsl_type::float_type);
1720
	 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1721
	 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1722
	 fs_inst *pull =
1723
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1724
                                 dst, index, offset);
1725
	 pull->ir = inst->ir;
1726
	 pull->annotation = inst->annotation;
1727
 
1728
	 inst->insert_before(pull);
1729
 
1730
	 inst->src[i].file = GRF;
1731
	 inst->src[i].reg = dst.reg;
1732
	 inst->src[i].reg_offset = 0;
1733
	 inst->src[i].smear = pull_index & 3;
1734
      }
1735
   }
1736
}
1737
 
1738
bool
1739
fs_visitor::opt_algebraic()
1740
{
1741
   bool progress = false;
1742
 
1743
   foreach_list(node, &this->instructions) {
1744
      fs_inst *inst = (fs_inst *)node;
1745
 
1746
      switch (inst->opcode) {
1747
      case BRW_OPCODE_MUL:
1748
	 if (inst->src[1].file != IMM)
1749
	    continue;
1750
 
1751
	 /* a * 1.0 = a */
1752
	 if (inst->src[1].is_one()) {
1753
	    inst->opcode = BRW_OPCODE_MOV;
1754
	    inst->src[1] = reg_undef;
1755
	    progress = true;
1756
	    break;
1757
	 }
1758
 
1759
         /* a * 0.0 = 0.0 */
1760
         if (inst->src[1].is_zero()) {
1761
            inst->opcode = BRW_OPCODE_MOV;
1762
            inst->src[0] = inst->src[1];
1763
            inst->src[1] = reg_undef;
1764
            progress = true;
1765
            break;
1766
         }
1767
 
1768
	 break;
1769
      case BRW_OPCODE_ADD:
1770
         if (inst->src[1].file != IMM)
1771
            continue;
1772
 
1773
         /* a + 0.0 = a */
1774
         if (inst->src[1].is_zero()) {
1775
            inst->opcode = BRW_OPCODE_MOV;
1776
            inst->src[1] = reg_undef;
1777
            progress = true;
1778
            break;
1779
         }
1780
         break;
1781
      default:
1782
	 break;
1783
      }
1784
   }
1785
 
1786
   return progress;
1787
}
1788
 
1789
/**
1790
 * Removes any instructions writing a VGRF where that VGRF is not used by any
1791
 * later instruction.
1792
 */
1793
bool
1794
fs_visitor::dead_code_eliminate()
1795
{
1796
   bool progress = false;
1797
   int pc = 0;
1798
 
1799
   calculate_live_intervals();
1800
 
1801
   foreach_list_safe(node, &this->instructions) {
1802
      fs_inst *inst = (fs_inst *)node;
1803
 
1804
      if (inst->dst.file == GRF) {
1805
         assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1806
         if (this->virtual_grf_end[inst->dst.reg] == pc) {
1807
            inst->remove();
1808
            progress = true;
1809
         }
1810
      }
1811
 
1812
      pc++;
1813
   }
1814
 
1815
   if (progress)
1816
      live_intervals_valid = false;
1817
 
1818
   return progress;
1819
}
1820
 
1821
/* Key identifying one (virtual GRF, register offset) write tracked by the
 * local dead-code pass's hash table.
 */
struct dead_code_hash_key
{
   int vgrf;
   int reg_offset;
};

/* Hash-table key-equality callback: keys match when both fields match.
 * (Equivalent to the byte-wise comparison of this padding-free struct.)
 */
static bool
dead_code_hash_compare(const void *a, const void *b)
{
   const struct dead_code_hash_key *ka =
      static_cast<const struct dead_code_hash_key *>(a);
   const struct dead_code_hash_key *kb =
      static_cast<const struct dead_code_hash_key *>(b);

   return ka->vgrf == kb->vgrf && ka->reg_offset == kb->reg_offset;
}
1832
 
1833
static void
1834
clear_dead_code_hash(struct hash_table *ht)
1835
{
1836
   struct hash_entry *entry;
1837
 
1838
   hash_table_foreach(ht, entry) {
1839
      _mesa_hash_table_remove(ht, entry);
1840
   }
1841
}
1842
 
1843
static void
1844
insert_dead_code_hash(struct hash_table *ht,
1845
                      int vgrf, int reg_offset, fs_inst *inst)
1846
{
1847
   /* We don't bother freeing keys, because they'll be GCed with the ht. */
1848
   struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1849
 
1850
   key->vgrf = vgrf;
1851
   key->reg_offset = reg_offset;
1852
 
1853
   _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1854
}
1855
 
1856
static struct hash_entry *
1857
get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1858
{
1859
   struct dead_code_hash_key key;
1860
 
1861
   key.vgrf = vgrf;
1862
   key.reg_offset = reg_offset;
1863
 
1864
   return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1865
}
1866
 
1867
static void
1868
remove_dead_code_hash(struct hash_table *ht,
1869
                      int vgrf, int reg_offset)
1870
{
1871
   struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1872
   if (!entry)
1873
      return;
1874
 
1875
   _mesa_hash_table_remove(ht, entry);
1876
}
1877
 
1878
/**
1879
 * Walks basic blocks, removing any regs that are written but not read before
1880
 * being redefined.
1881
 *
1882
 * The dead_code_eliminate() function implements a global dead code
1883
 * elimination, but it only handles the removing the last write to a register
1884
 * if it's never read.  This one can handle intermediate writes, but only
1885
 * within a basic block.
1886
 */
1887
bool
1888
fs_visitor::dead_code_eliminate_local()
1889
{
1890
   struct hash_table *ht;
1891
   bool progress = false;
1892
 
1893
   ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1894
 
1895
   foreach_list_safe(node, &this->instructions) {
1896
      fs_inst *inst = (fs_inst *)node;
1897
 
1898
      /* At a basic block, empty the HT since we don't understand dataflow
1899
       * here.
1900
       */
1901
      if (inst->is_control_flow()) {
1902
         clear_dead_code_hash(ht);
1903
         continue;
1904
      }
1905
 
1906
      /* Clear the HT of any instructions that got read. */
1907
      for (int i = 0; i < 3; i++) {
1908
         fs_reg src = inst->src[i];
1909
         if (src.file != GRF)
1910
            continue;
1911
 
1912
         int read = 1;
1913
         if (inst->is_send_from_grf())
1914
            read = virtual_grf_sizes[src.reg] - src.reg_offset;
1915
 
1916
         for (int reg_offset = src.reg_offset;
1917
              reg_offset < src.reg_offset + read;
1918
              reg_offset++) {
1919
            remove_dead_code_hash(ht, src.reg, reg_offset);
1920
         }
1921
      }
1922
 
1923
      /* Add any update of a GRF to the HT, removing a previous write if it
1924
       * wasn't read.
1925
       */
1926
      if (inst->dst.file == GRF) {
1927
         if (inst->regs_written > 1) {
1928
            /* We don't know how to trim channels from an instruction's
1929
             * writes, so we can't incrementally remove unread channels from
1930
             * it.  Just remove whatever it overwrites from the table
1931
             */
1932
            for (int i = 0; i < inst->regs_written; i++) {
1933
               remove_dead_code_hash(ht,
1934
                                     inst->dst.reg,
1935
                                     inst->dst.reg_offset + i);
1936
            }
1937
         } else {
1938
            struct hash_entry *entry =
1939
               get_dead_code_hash_entry(ht, inst->dst.reg,
1940
                                        inst->dst.reg_offset);
1941
 
1942
            if (inst->is_partial_write()) {
1943
               /* For a partial write, we can't remove any previous dead code
1944
                * candidate, since we're just modifying their result, but we can
1945
                * be dead code eliminiated ourselves.
1946
                */
1947
               if (entry) {
1948
                  entry->data = inst;
1949
               } else {
1950
                  insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1951
                                        inst);
1952
               }
1953
            } else {
1954
               if (entry) {
1955
                  /* We're completely updating a channel, and there was a
1956
                   * previous write to the channel that wasn't read.  Kill it!
1957
                   */
1958
                  fs_inst *inst = (fs_inst *)entry->data;
1959
                  inst->remove();
1960
                  progress = true;
1961
                  _mesa_hash_table_remove(ht, entry);
1962
               }
1963
 
1964
               insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1965
                                     inst);
1966
            }
1967
         }
1968
      }
1969
   }
1970
 
1971
   _mesa_hash_table_destroy(ht, NULL);
1972
 
1973
   if (progress)
1974
      live_intervals_valid = false;
1975
 
1976
   return progress;
1977
}
1978
 
1979
/**
1980
 * Implements a second type of register coalescing: This one checks if
1981
 * the two regs involved in a raw move don't interfere, in which case
1982
 * they can both by stored in the same place and the MOV removed.
1983
 */
1984
bool
1985
fs_visitor::register_coalesce_2()
1986
{
1987
   bool progress = false;
1988
 
1989
   calculate_live_intervals();
1990
 
1991
   foreach_list_safe(node, &this->instructions) {
1992
      fs_inst *inst = (fs_inst *)node;
1993
 
1994
      if (inst->opcode != BRW_OPCODE_MOV ||
1995
	  inst->is_partial_write() ||
1996
	  inst->saturate ||
1997
	  inst->src[0].file != GRF ||
1998
	  inst->src[0].negate ||
1999
	  inst->src[0].abs ||
2000
	  inst->src[0].smear != -1 ||
2001
	  inst->dst.file != GRF ||
2002
	  inst->dst.type != inst->src[0].type ||
2003
	  virtual_grf_sizes[inst->src[0].reg] != 1 ||
2004
	  virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2005
	 continue;
2006
      }
2007
 
2008
      int reg_from = inst->src[0].reg;
2009
      assert(inst->src[0].reg_offset == 0);
2010
      int reg_to = inst->dst.reg;
2011
      int reg_to_offset = inst->dst.reg_offset;
2012
 
2013
      foreach_list(node, &this->instructions) {
2014
	 fs_inst *scan_inst = (fs_inst *)node;
2015
 
2016
	 if (scan_inst->dst.file == GRF &&
2017
	     scan_inst->dst.reg == reg_from) {
2018
	    scan_inst->dst.reg = reg_to;
2019
	    scan_inst->dst.reg_offset = reg_to_offset;
2020
	 }
2021
	 for (int i = 0; i < 3; i++) {
2022
	    if (scan_inst->src[i].file == GRF &&
2023
		scan_inst->src[i].reg == reg_from) {
2024
	       scan_inst->src[i].reg = reg_to;
2025
	       scan_inst->src[i].reg_offset = reg_to_offset;
2026
	    }
2027
	 }
2028
      }
2029
 
2030
      inst->remove();
2031
 
2032
      /* We don't need to recalculate live intervals inside the loop despite
2033
       * flagging live_intervals_valid because we only use live intervals for
2034
       * the interferes test, and we must have had a situation where the
2035
       * intervals were:
2036
       *
2037
       *  from  to
2038
       *  ^
2039
       *  |
2040
       *  v
2041
       *        ^
2042
       *        |
2043
       *        v
2044
       *
2045
       * Some register R that might get coalesced with one of these two could
2046
       * only be referencing "to", otherwise "from"'s range would have been
2047
       * longer.  R's range could also only start at the end of "to" or later,
2048
       * otherwise it will conflict with "to" when we try to coalesce "to"
2049
       * into Rw anyway.
2050
       */
2051
      live_intervals_valid = false;
2052
 
2053
      progress = true;
2054
      continue;
2055
   }
2056
 
2057
   return progress;
2058
}
2059
 
2060
bool
2061
fs_visitor::register_coalesce()
2062
{
2063
   bool progress = false;
2064
   int if_depth = 0;
2065
   int loop_depth = 0;
2066
 
2067
   foreach_list_safe(node, &this->instructions) {
2068
      fs_inst *inst = (fs_inst *)node;
2069
 
2070
      /* Make sure that we dominate the instructions we're going to
2071
       * scan for interfering with our coalescing, or we won't have
2072
       * scanned enough to see if anything interferes with our
2073
       * coalescing.  We don't dominate the following instructions if
2074
       * we're in a loop or an if block.
2075
       */
2076
      switch (inst->opcode) {
2077
      case BRW_OPCODE_DO:
2078
	 loop_depth++;
2079
	 break;
2080
      case BRW_OPCODE_WHILE:
2081
	 loop_depth--;
2082
	 break;
2083
      case BRW_OPCODE_IF:
2084
	 if_depth++;
2085
	 break;
2086
      case BRW_OPCODE_ENDIF:
2087
	 if_depth--;
2088
	 break;
2089
      default:
2090
	 break;
2091
      }
2092
      if (loop_depth || if_depth)
2093
	 continue;
2094
 
2095
      if (inst->opcode != BRW_OPCODE_MOV ||
2096
	  inst->is_partial_write() ||
2097
	  inst->saturate ||
2098
	  inst->dst.file != GRF || (inst->src[0].file != GRF &&
2099
				    inst->src[0].file != UNIFORM)||
2100
	  inst->dst.type != inst->src[0].type)
2101
	 continue;
2102
 
2103
      bool has_source_modifiers = (inst->src[0].abs ||
2104
                                   inst->src[0].negate ||
2105
                                   inst->src[0].smear != -1 ||
2106
                                   inst->src[0].file == UNIFORM);
2107
 
2108
      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
2109
       * them: check for no writes to either one until the exit of the
2110
       * program.
2111
       */
2112
      bool interfered = false;
2113
 
2114
      for (fs_inst *scan_inst = (fs_inst *)inst->next;
2115
	   !scan_inst->is_tail_sentinel();
2116
	   scan_inst = (fs_inst *)scan_inst->next) {
2117
	 if (scan_inst->dst.file == GRF) {
2118
	    if (scan_inst->overwrites_reg(inst->dst) ||
2119
                scan_inst->overwrites_reg(inst->src[0])) {
2120
	       interfered = true;
2121
	       break;
2122
	    }
2123
	 }
2124
 
2125
         if (has_source_modifiers) {
2126
            for (int i = 0; i < 3; i++) {
2127
               if (scan_inst->src[i].file == GRF &&
2128
                   scan_inst->src[i].reg == inst->dst.reg &&
2129
                   scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2130
                   inst->dst.type != scan_inst->src[i].type)
2131
               {
2132
                 interfered = true;
2133
                 break;
2134
               }
2135
            }
2136
         }
2137
 
2138
 
2139
	 /* The gen6 MATH instruction can't handle source modifiers or
2140
	  * unusual register regions, so avoid coalescing those for
2141
	  * now.  We should do something more specific.
2142
	  */
2143
	 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2144
            interfered = true;
2145
	    break;
2146
	 }
2147
 
2148
	 /* The accumulator result appears to get used for the
2149
	  * conditional modifier generation.  When negating a UD
2150
	  * value, there is a 33rd bit generated for the sign in the
2151
	  * accumulator value, so now you can't check, for example,
2152
	  * equality with a 32-bit value.  See piglit fs-op-neg-uint.
2153
	  */
2154
	 if (scan_inst->conditional_mod &&
2155
	     inst->src[0].negate &&
2156
	     inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2157
	    interfered = true;
2158
	    break;
2159
	 }
2160
      }
2161
      if (interfered) {
2162
	 continue;
2163
      }
2164
 
2165
      /* Rewrite the later usage to point at the source of the move to
2166
       * be removed.
2167
       */
2168
      for (fs_inst *scan_inst = inst;
2169
	   !scan_inst->is_tail_sentinel();
2170
	   scan_inst = (fs_inst *)scan_inst->next) {
2171
	 for (int i = 0; i < 3; i++) {
2172
	    if (scan_inst->src[i].file == GRF &&
2173
		scan_inst->src[i].reg == inst->dst.reg &&
2174
		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2175
	       fs_reg new_src = inst->src[0];
2176
               if (scan_inst->src[i].abs) {
2177
                  new_src.negate = 0;
2178
                  new_src.abs = 1;
2179
               }
2180
	       new_src.negate ^= scan_inst->src[i].negate;
2181
	       scan_inst->src[i] = new_src;
2182
	    }
2183
	 }
2184
      }
2185
 
2186
      inst->remove();
2187
      progress = true;
2188
   }
2189
 
2190
   if (progress)
2191
      live_intervals_valid = false;
2192
 
2193
   return progress;
2194
}
2195
 
2196
 
2197
bool
2198
fs_visitor::compute_to_mrf()
2199
{
2200
   bool progress = false;
2201
   int next_ip = 0;
2202
 
2203
   calculate_live_intervals();
2204
 
2205
   foreach_list_safe(node, &this->instructions) {
2206
      fs_inst *inst = (fs_inst *)node;
2207
 
2208
      int ip = next_ip;
2209
      next_ip++;
2210
 
2211
      if (inst->opcode != BRW_OPCODE_MOV ||
2212
	  inst->is_partial_write() ||
2213
	  inst->dst.file != MRF || inst->src[0].file != GRF ||
2214
	  inst->dst.type != inst->src[0].type ||
2215
	  inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2216
	 continue;
2217
 
2218
      /* Work out which hardware MRF registers are written by this
2219
       * instruction.
2220
       */
2221
      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2222
      int mrf_high;
2223
      if (inst->dst.reg & BRW_MRF_COMPR4) {
2224
	 mrf_high = mrf_low + 4;
2225
      } else if (dispatch_width == 16 &&
2226
		 (!inst->force_uncompressed && !inst->force_sechalf)) {
2227
	 mrf_high = mrf_low + 1;
2228
      } else {
2229
	 mrf_high = mrf_low;
2230
      }
2231
 
2232
      /* Can't compute-to-MRF this GRF if someone else was going to
2233
       * read it later.
2234
       */
2235
      if (this->virtual_grf_end[inst->src[0].reg] > ip)
2236
	 continue;
2237
 
2238
      /* Found a move of a GRF to a MRF.  Let's see if we can go
2239
       * rewrite the thing that made this GRF to write into the MRF.
2240
       */
2241
      fs_inst *scan_inst;
2242
      for (scan_inst = (fs_inst *)inst->prev;
2243
	   scan_inst->prev != NULL;
2244
	   scan_inst = (fs_inst *)scan_inst->prev) {
2245
	 if (scan_inst->dst.file == GRF &&
2246
	     scan_inst->dst.reg == inst->src[0].reg) {
2247
	    /* Found the last thing to write our reg we want to turn
2248
	     * into a compute-to-MRF.
2249
	     */
2250
 
2251
	    /* If this one instruction didn't populate all the
2252
	     * channels, bail.  We might be able to rewrite everything
2253
	     * that writes that reg, but it would require smarter
2254
	     * tracking to delay the rewriting until complete success.
2255
	     */
2256
	    if (scan_inst->is_partial_write())
2257
	       break;
2258
 
2259
            /* Things returning more than one register would need us to
2260
             * understand coalescing out more than one MOV at a time.
2261
             */
2262
            if (scan_inst->regs_written > 1)
2263
               break;
2264
 
2265
	    /* SEND instructions can't have MRF as a destination. */
2266
	    if (scan_inst->mlen)
2267
	       break;
2268
 
2269
	    if (brw->gen == 6) {
2270
	       /* gen6 math instructions must have the destination be
2271
		* GRF, so no compute-to-MRF for them.
2272
		*/
2273
	       if (scan_inst->is_math()) {
2274
		  break;
2275
	       }
2276
	    }
2277
 
2278
	    if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2279
	       /* Found the creator of our MRF's source value. */
2280
	       scan_inst->dst.file = MRF;
2281
	       scan_inst->dst.reg = inst->dst.reg;
2282
	       scan_inst->saturate |= inst->saturate;
2283
	       inst->remove();
2284
	       progress = true;
2285
	    }
2286
	    break;
2287
	 }
2288
 
2289
	 /* We don't handle control flow here.  Most computation of
2290
	  * values that end up in MRFs are shortly before the MRF
2291
	  * write anyway.
2292
	  */
2293
	 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2294
	    break;
2295
 
2296
	 /* You can't read from an MRF, so if someone else reads our
2297
	  * MRF's source GRF that we wanted to rewrite, that stops us.
2298
	  */
2299
	 bool interfered = false;
2300
	 for (int i = 0; i < 3; i++) {
2301
	    if (scan_inst->src[i].file == GRF &&
2302
		scan_inst->src[i].reg == inst->src[0].reg &&
2303
		scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2304
	       interfered = true;
2305
	    }
2306
	 }
2307
	 if (interfered)
2308
	    break;
2309
 
2310
	 if (scan_inst->dst.file == MRF) {
2311
	    /* If somebody else writes our MRF here, we can't
2312
	     * compute-to-MRF before that.
2313
	     */
2314
	    int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2315
	    int scan_mrf_high;
2316
 
2317
	    if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2318
	       scan_mrf_high = scan_mrf_low + 4;
2319
	    } else if (dispatch_width == 16 &&
2320
		       (!scan_inst->force_uncompressed &&
2321
			!scan_inst->force_sechalf)) {
2322
	       scan_mrf_high = scan_mrf_low + 1;
2323
	    } else {
2324
	       scan_mrf_high = scan_mrf_low;
2325
	    }
2326
 
2327
	    if (mrf_low == scan_mrf_low ||
2328
		mrf_low == scan_mrf_high ||
2329
		mrf_high == scan_mrf_low ||
2330
		mrf_high == scan_mrf_high) {
2331
	       break;
2332
	    }
2333
	 }
2334
 
2335
	 if (scan_inst->mlen > 0) {
2336
	    /* Found a SEND instruction, which means that there are
2337
	     * live values in MRFs from base_mrf to base_mrf +
2338
	     * scan_inst->mlen - 1.  Don't go pushing our MRF write up
2339
	     * above it.
2340
	     */
2341
	    if (mrf_low >= scan_inst->base_mrf &&
2342
		mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2343
	       break;
2344
	    }
2345
	    if (mrf_high >= scan_inst->base_mrf &&
2346
		mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2347
	       break;
2348
	    }
2349
	 }
2350
      }
2351
   }
2352
 
2353
   if (progress)
2354
      live_intervals_valid = false;
2355
 
2356
   return progress;
2357
}
2358
 
2359
/**
2360
 * Walks through basic blocks, looking for repeated MRF writes and
2361
 * removing the later ones.
2362
 */
2363
bool
2364
fs_visitor::remove_duplicate_mrf_writes()
2365
{
2366
   fs_inst *last_mrf_move[16];
2367
   bool progress = false;
2368
 
2369
   /* Need to update the MRF tracking for compressed instructions. */
2370
   if (dispatch_width == 16)
2371
      return false;
2372
 
2373
   memset(last_mrf_move, 0, sizeof(last_mrf_move));
2374
 
2375
   foreach_list_safe(node, &this->instructions) {
2376
      fs_inst *inst = (fs_inst *)node;
2377
 
2378
      if (inst->is_control_flow()) {
2379
	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2380
      }
2381
 
2382
      if (inst->opcode == BRW_OPCODE_MOV &&
2383
	  inst->dst.file == MRF) {
2384
	 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2385
	 if (prev_inst && inst->equals(prev_inst)) {
2386
	    inst->remove();
2387
	    progress = true;
2388
	    continue;
2389
	 }
2390
      }
2391
 
2392
      /* Clear out the last-write records for MRFs that were overwritten. */
2393
      if (inst->dst.file == MRF) {
2394
	 last_mrf_move[inst->dst.reg] = NULL;
2395
      }
2396
 
2397
      if (inst->mlen > 0) {
2398
	 /* Found a SEND instruction, which will include two or fewer
2399
	  * implied MRF writes.  We could do better here.
2400
	  */
2401
	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2402
	    last_mrf_move[inst->base_mrf + i] = NULL;
2403
	 }
2404
      }
2405
 
2406
      /* Clear out any MRF move records whose sources got overwritten. */
2407
      if (inst->dst.file == GRF) {
2408
	 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2409
	    if (last_mrf_move[i] &&
2410
		last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2411
	       last_mrf_move[i] = NULL;
2412
	    }
2413
	 }
2414
      }
2415
 
2416
      if (inst->opcode == BRW_OPCODE_MOV &&
2417
	  inst->dst.file == MRF &&
2418
	  inst->src[0].file == GRF &&
2419
	  !inst->is_partial_write()) {
2420
	 last_mrf_move[inst->dst.reg] = inst;
2421
      }
2422
   }
2423
 
2424
   if (progress)
2425
      live_intervals_valid = false;
2426
 
2427
   return progress;
2428
}
2429
 
2430
static void
2431
clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2432
                        int first_grf, int grf_len)
2433
{
2434
   bool inst_16wide = (dispatch_width > 8 &&
2435
                       !inst->force_uncompressed &&
2436
                       !inst->force_sechalf);
2437
 
2438
   /* Clear the flag for registers that actually got read (as expected). */
2439
   for (int i = 0; i < 3; i++) {
2440
      int grf;
2441
      if (inst->src[i].file == GRF) {
2442
         grf = inst->src[i].reg;
2443
      } else if (inst->src[i].file == HW_REG &&
2444
                 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2445
         grf = inst->src[i].fixed_hw_reg.nr;
2446
      } else {
2447
         continue;
2448
      }
2449
 
2450
      if (grf >= first_grf &&
2451
          grf < first_grf + grf_len) {
2452
         deps[grf - first_grf] = false;
2453
         if (inst_16wide)
2454
            deps[grf - first_grf + 1] = false;
2455
      }
2456
   }
2457
}
2458
 
2459
/**
2460
 * Implements this workaround for the original 965:
2461
 *
2462
 *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2463
 *      check for post destination dependencies on this instruction, software
2464
 *      must ensure that there is no destination hazard for the case of ‘write
2465
 *      followed by a posted write’ shown in the following example.
2466
 *
2467
 *      1. mov r3 0
2468
 *      2. send r3.xy 
2469
 *      3. mov r2 r3
2470
 *
2471
 *      Due to no post-destination dependency check on the ‘send’, the above
2472
 *      code sequence could have two instructions (1 and 2) in flight at the
2473
 *      same time that both consider ‘r3’ as the target of their final writes.
2474
 */
2475
void
2476
fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2477
{
2478
   int reg_size = dispatch_width / 8;
2479
   int write_len = inst->regs_written * reg_size;
2480
   int first_write_grf = inst->dst.reg;
2481
   bool needs_dep[BRW_MAX_MRF];
2482
   assert(write_len < (int)sizeof(needs_dep) - 1);
2483
 
2484
   memset(needs_dep, false, sizeof(needs_dep));
2485
   memset(needs_dep, true, write_len);
2486
 
2487
   clear_deps_for_inst_src(inst, dispatch_width,
2488
                           needs_dep, first_write_grf, write_len);
2489
 
2490
   /* Walk backwards looking for writes to registers we're writing which
2491
    * aren't read since being written.  If we hit the start of the program,
2492
    * we assume that there are no outstanding dependencies on entry to the
2493
    * program.
2494
    */
2495
   for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2496
        scan_inst != NULL;
2497
        scan_inst = (fs_inst *)scan_inst->prev) {
2498
 
2499
      /* If we hit control flow, assume that there *are* outstanding
2500
       * dependencies, and force their cleanup before our instruction.
2501
       */
2502
      if (scan_inst->is_control_flow()) {
2503
         for (int i = 0; i < write_len; i++) {
2504
            if (needs_dep[i]) {
2505
               inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2506
            }
2507
         }
2508
         return;
2509
      }
2510
 
2511
      bool scan_inst_16wide = (dispatch_width > 8 &&
2512
                               !scan_inst->force_uncompressed &&
2513
                               !scan_inst->force_sechalf);
2514
 
2515
      /* We insert our reads as late as possible on the assumption that any
2516
       * instruction but a MOV that might have left us an outstanding
2517
       * dependency has more latency than a MOV.
2518
       */
2519
      if (scan_inst->dst.file == GRF) {
2520
         for (int i = 0; i < scan_inst->regs_written; i++) {
2521
            int reg = scan_inst->dst.reg + i * reg_size;
2522
 
2523
            if (reg >= first_write_grf &&
2524
                reg < first_write_grf + write_len &&
2525
                needs_dep[reg - first_write_grf]) {
2526
               inst->insert_before(DEP_RESOLVE_MOV(reg));
2527
               needs_dep[reg - first_write_grf] = false;
2528
               if (scan_inst_16wide)
2529
                  needs_dep[reg - first_write_grf + 1] = false;
2530
            }
2531
         }
2532
      }
2533
 
2534
      /* Clear the flag for registers that actually got read (as expected). */
2535
      clear_deps_for_inst_src(scan_inst, dispatch_width,
2536
                              needs_dep, first_write_grf, write_len);
2537
 
2538
      /* Continue the loop only if we haven't resolved all the dependencies */
2539
      int i;
2540
      for (i = 0; i < write_len; i++) {
2541
         if (needs_dep[i])
2542
            break;
2543
      }
2544
      if (i == write_len)
2545
         return;
2546
   }
2547
}
2548
 
2549
/**
2550
 * Implements this workaround for the original 965:
2551
 *
2552
 *     "[DevBW, DevCL] Errata: A destination register from a send can not be
2553
 *      used as a destination register until after it has been sourced by an
2554
 *      instruction with a different destination register.
2555
 */
2556
void
2557
fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2558
{
2559
   int write_len = inst->regs_written * dispatch_width / 8;
2560
   int first_write_grf = inst->dst.reg;
2561
   bool needs_dep[BRW_MAX_MRF];
2562
   assert(write_len < (int)sizeof(needs_dep) - 1);
2563
 
2564
   memset(needs_dep, false, sizeof(needs_dep));
2565
   memset(needs_dep, true, write_len);
2566
   /* Walk forwards looking for writes to registers we're writing which aren't
2567
    * read before being written.
2568
    */
2569
   for (fs_inst *scan_inst = (fs_inst *)inst->next;
2570
        !scan_inst->is_tail_sentinel();
2571
        scan_inst = (fs_inst *)scan_inst->next) {
2572
      /* If we hit control flow, force resolve all remaining dependencies. */
2573
      if (scan_inst->is_control_flow()) {
2574
         for (int i = 0; i < write_len; i++) {
2575
            if (needs_dep[i])
2576
               scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2577
         }
2578
         return;
2579
      }
2580
 
2581
      /* Clear the flag for registers that actually got read (as expected). */
2582
      clear_deps_for_inst_src(scan_inst, dispatch_width,
2583
                              needs_dep, first_write_grf, write_len);
2584
 
2585
      /* We insert our reads as late as possible since they're reading the
2586
       * result of a SEND, which has massive latency.
2587
       */
2588
      if (scan_inst->dst.file == GRF &&
2589
          scan_inst->dst.reg >= first_write_grf &&
2590
          scan_inst->dst.reg < first_write_grf + write_len &&
2591
          needs_dep[scan_inst->dst.reg - first_write_grf]) {
2592
         scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2593
         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2594
      }
2595
 
2596
      /* Continue the loop only if we haven't resolved all the dependencies */
2597
      int i;
2598
      for (i = 0; i < write_len; i++) {
2599
         if (needs_dep[i])
2600
            break;
2601
      }
2602
      if (i == write_len)
2603
         return;
2604
   }
2605
 
2606
   /* If we hit the end of the program, resolve all remaining dependencies out
2607
    * of paranoia.
2608
    */
2609
   fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2610
   assert(last_inst->eot);
2611
   for (int i = 0; i < write_len; i++) {
2612
      if (needs_dep[i])
2613
         last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2614
   }
2615
}
2616
 
2617
void
2618
fs_visitor::insert_gen4_send_dependency_workarounds()
2619
{
2620
   if (brw->gen != 4 || brw->is_g4x)
2621
      return;
2622
 
2623
   /* Note that we're done with register allocation, so GRF fs_regs always
2624
    * have a .reg_offset of 0.
2625
    */
2626
 
2627
   foreach_list_safe(node, &this->instructions) {
2628
      fs_inst *inst = (fs_inst *)node;
2629
 
2630
      if (inst->mlen != 0 && inst->dst.file == GRF) {
2631
         insert_gen4_pre_send_dependency_workarounds(inst);
2632
         insert_gen4_post_send_dependency_workarounds(inst);
2633
      }
2634
   }
2635
}
2636
 
2637
/**
2638
 * Turns the generic expression-style uniform pull constant load instruction
2639
 * into a hardware-specific series of instructions for loading a pull
2640
 * constant.
2641
 *
2642
 * The expression style allows the CSE pass before this to optimize out
2643
 * repeated loads from the same offset, and gives the pre-register-allocation
2644
 * scheduling full flexibility, while the conversion to native instructions
2645
 * allows the post-register-allocation scheduler the best information
2646
 * possible.
2647
 *
2648
 * Note that execution masking for setting up pull constant loads is special:
2649
 * the channels that need to be written are unrelated to the current execution
2650
 * mask, since a later instruction will use one of the result channels as a
2651
 * source operand for all 8 or 16 of its channels.
2652
 */
2653
void
2654
fs_visitor::lower_uniform_pull_constant_loads()
2655
{
2656
   foreach_list(node, &this->instructions) {
2657
      fs_inst *inst = (fs_inst *)node;
2658
 
2659
      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2660
         continue;
2661
 
2662
      if (brw->gen >= 7) {
2663
         /* The offset arg before was a vec4-aligned byte offset.  We need to
2664
          * turn it into a dword offset.
2665
          */
2666
         fs_reg const_offset_reg = inst->src[1];
2667
         assert(const_offset_reg.file == IMM &&
2668
                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2669
         const_offset_reg.imm.u /= 4;
2670
         fs_reg payload = fs_reg(this, glsl_type::uint_type);
2671
 
2672
         /* This is actually going to be a MOV, but since only the first dword
2673
          * is accessed, we have a special opcode to do just that one.  Note
2674
          * that this needs to be an operation that will be considered a def
2675
          * by live variable analysis, or register allocation will explode.
2676
          */
2677
         fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2678
                                               payload, const_offset_reg);
2679
         setup->force_writemask_all = true;
2680
 
2681
         setup->ir = inst->ir;
2682
         setup->annotation = inst->annotation;
2683
         inst->insert_before(setup);
2684
 
2685
         /* Similarly, this will only populate the first 4 channels of the
2686
          * result register (since we only use smear values from 0-3), but we
2687
          * don't tell the optimizer.
2688
          */
2689
         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2690
         inst->src[1] = payload;
2691
 
2692
         this->live_intervals_valid = false;
2693
      } else {
2694
         /* Before register allocation, we didn't tell the scheduler about the
2695
          * MRF we use.  We know it's safe to use this MRF because nothing
2696
          * else does except for register spill/unspill, which generates and
2697
          * uses its MRF within a single IR instruction.
2698
          */
2699
         inst->base_mrf = 14;
2700
         inst->mlen = 1;
2701
      }
2702
   }
2703
}
2704
 
2705
void
2706
fs_visitor::dump_instruction(backend_instruction *be_inst)
2707
{
2708
   fs_inst *inst = (fs_inst *)be_inst;
2709
 
2710
   if (inst->predicate) {
2711
      printf("(%cf0.%d) ",
2712
             inst->predicate_inverse ? '-' : '+',
2713
             inst->flag_subreg);
2714
   }
2715
 
2716
   printf("%s", brw_instruction_name(inst->opcode));
2717
   if (inst->saturate)
2718
      printf(".sat");
2719
   if (inst->conditional_mod) {
2720
      printf(".cmod");
2721
      if (!inst->predicate &&
2722
          (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2723
                              inst->opcode != BRW_OPCODE_IF &&
2724
                              inst->opcode != BRW_OPCODE_WHILE))) {
2725
         printf(".f0.%d\n", inst->flag_subreg);
2726
      }
2727
   }
2728
   printf(" ");
2729
 
2730
 
2731
   switch (inst->dst.file) {
2732
   case GRF:
2733
      printf("vgrf%d", inst->dst.reg);
2734
      if (inst->dst.reg_offset)
2735
         printf("+%d", inst->dst.reg_offset);
2736
      break;
2737
   case MRF:
2738
      printf("m%d", inst->dst.reg);
2739
      break;
2740
   case BAD_FILE:
2741
      printf("(null)");
2742
      break;
2743
   case UNIFORM:
2744
      printf("***u%d***", inst->dst.reg);
2745
      break;
2746
   default:
2747
      printf("???");
2748
      break;
2749
   }
2750
   printf(", ");
2751
 
2752
   for (int i = 0; i < 3; i++) {
2753
      if (inst->src[i].negate)
2754
         printf("-");
2755
      if (inst->src[i].abs)
2756
         printf("|");
2757
      switch (inst->src[i].file) {
2758
      case GRF:
2759
         printf("vgrf%d", inst->src[i].reg);
2760
         if (inst->src[i].reg_offset)
2761
            printf("+%d", inst->src[i].reg_offset);
2762
         break;
2763
      case MRF:
2764
         printf("***m%d***", inst->src[i].reg);
2765
         break;
2766
      case UNIFORM:
2767
         printf("u%d", inst->src[i].reg);
2768
         if (inst->src[i].reg_offset)
2769
            printf(".%d", inst->src[i].reg_offset);
2770
         break;
2771
      case BAD_FILE:
2772
         printf("(null)");
2773
         break;
2774
      case IMM:
2775
         switch (inst->src[i].type) {
2776
         case BRW_REGISTER_TYPE_F:
2777
            printf("%ff", inst->src[i].imm.f);
2778
            break;
2779
         case BRW_REGISTER_TYPE_D:
2780
            printf("%dd", inst->src[i].imm.i);
2781
            break;
2782
         case BRW_REGISTER_TYPE_UD:
2783
            printf("%uu", inst->src[i].imm.u);
2784
            break;
2785
         default:
2786
            printf("???");
2787
            break;
2788
         }
2789
         break;
2790
      default:
2791
         printf("???");
2792
         break;
2793
      }
2794
      if (inst->src[i].abs)
2795
         printf("|");
2796
 
2797
      if (i < 3)
2798
         printf(", ");
2799
   }
2800
 
2801
   printf(" ");
2802
 
2803
   if (inst->force_uncompressed)
2804
      printf("1sthalf ");
2805
 
2806
   if (inst->force_sechalf)
2807
      printf("2ndhalf ");
2808
 
2809
   printf("\n");
2810
}
2811
 
2812
/**
2813
 * Possibly returns an instruction that set up @param reg.
2814
 *
2815
 * Sometimes we want to take the result of some expression/variable
2816
 * dereference tree and rewrite the instruction generating the result
2817
 * of the tree.  When processing the tree, we know that the
2818
 * instructions generated are all writing temporaries that are dead
2819
 * outside of this tree.  So, if we have some instructions that write
2820
 * a temporary, we're free to point that temp write somewhere else.
2821
 *
2822
 * Note that this doesn't guarantee that the instruction generated
2823
 * only reg -- it might be the size=4 destination of a texture instruction.
2824
 */
2825
fs_inst *
2826
fs_visitor::get_instruction_generating_reg(fs_inst *start,
2827
					   fs_inst *end,
2828
					   fs_reg reg)
2829
{
2830
   if (end == start ||
2831
       end->is_partial_write() ||
2832
       reg.reladdr ||
2833
       !reg.equals(end->dst)) {
2834
      return NULL;
2835
   } else {
2836
      return end;
2837
   }
2838
}
2839
 
2840
void
2841
fs_visitor::setup_payload_gen6()
2842
{
2843
   bool uses_depth =
2844
      (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2845
   unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2846
 
2847
   assert(brw->gen >= 6);
2848
 
2849
   /* R0-1: masks, pixel X/Y coordinates. */
2850
   c->nr_payload_regs = 2;
2851
   /* R2: only for 32-pixel dispatch.*/
2852
 
2853
   /* R3-26: barycentric interpolation coordinates.  These appear in the
2854
    * same order that they appear in the brw_wm_barycentric_interp_mode
2855
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
2856
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
2857
    * appear if they were enabled using the "Barycentric Interpolation
2858
    * Mode" bits in WM_STATE.
2859
    */
2860
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2861
      if (barycentric_interp_modes & (1 << i)) {
2862
         c->barycentric_coord_reg[i] = c->nr_payload_regs;
2863
         c->nr_payload_regs += 2;
2864
         if (dispatch_width == 16) {
2865
            c->nr_payload_regs += 2;
2866
         }
2867
      }
2868
   }
2869
 
2870
   /* R27: interpolated depth if uses source depth */
2871
   if (uses_depth) {
2872
      c->source_depth_reg = c->nr_payload_regs;
2873
      c->nr_payload_regs++;
2874
      if (dispatch_width == 16) {
2875
         /* R28: interpolated depth if not 8-wide. */
2876
         c->nr_payload_regs++;
2877
      }
2878
   }
2879
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2880
   if (uses_depth) {
2881
      c->source_w_reg = c->nr_payload_regs;
2882
      c->nr_payload_regs++;
2883
      if (dispatch_width == 16) {
2884
         /* R30: interpolated W if not 8-wide. */
2885
         c->nr_payload_regs++;
2886
      }
2887
   }
2888
   /* R31: MSAA position offsets. */
2889
   /* R32-: bary for 32-pixel. */
2890
   /* R58-59: interp W for 32-pixel. */
2891
 
2892
   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2893
      c->source_depth_to_render_target = true;
2894
   }
2895
}
2896
 
2897
/**
 * Main driver for a single FS compile at this visitor's dispatch width:
 * payload setup, IR visit, the optimization loop, scheduling and register
 * allocation.  Returns false (with fail_msg set elsewhere) on failure.
 *
 * NOTE(review): the pass ordering below is deliberate and fragile --
 * e.g. lower_uniform_pull_constant_loads() must run after the first
 * schedule_instructions(false), and the Gen4 SEND workaround must run
 * after register allocation.  Do not reorder without care.
 */
bool
fs_visitor::run()
{
   /* Snapshot parameter counts so we can assert nothing was appended
    * behind the uniform storage's back (see the asserts at the bottom).
    */
   sanity_param_count = fp->Base.Parameters->NumParameters;
   uint32_t orig_nr_params = c->prog_data.nr_params;

   if (brw->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   /* "if (0)" is a developer switch: flip to emit a trivial dummy shader
    * instead of compiling the real program.
    */
   if (0) {
      emit_dummy_fs();
   } else {
      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_begin();

      calculate_urb_setup();
      if (brw->gen < 6)
	 emit_interpolation_setup_gen4();
      else
	 emit_interpolation_setup_gen6();

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (fp->UsesKill || c->key.alpha_test_func) {
         fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
         discard_init->flag_subreg = 1;
      }

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      if (shader) {
         /* GLSL path: visit each top-level IR instruction. */
         foreach_list(node, &*shader->ir) {
            ir_instruction *ir = (ir_instruction *)node;
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         /* ARB_fragment_program / fixed-function path. */
         emit_fragment_program_code();
      }
      base_ir = NULL;
      if (failed)
	 return false;

      /* Placeholder where discarded-pixel control flow will jump to. */
      emit(FS_OPCODE_PLACEHOLDER_HALT);

      if (c->key.alpha_test_func)
         emit_alpha_test();

      emit_fb_writes();

      split_virtual_grfs();

      move_uniform_array_access_to_pull_constants();
      setup_pull_constants();

      /* Run the optimization passes to a fixed point: iterate until a
       * full round makes no further progress.
       */
      bool progress;
      do {
	 progress = false;

         compact_virtual_grfs();

	 progress = remove_duplicate_mrf_writes() || progress;

	 progress = opt_algebraic() || progress;
	 progress = opt_cse() || progress;
	 progress = opt_copy_propagate() || progress;
	 progress = dead_code_eliminate() || progress;
	 progress = dead_code_eliminate_local() || progress;
	 progress = register_coalesce() || progress;
	 progress = register_coalesce_2() || progress;
	 progress = compute_to_mrf() || progress;
      } while (progress);

      remove_dead_constants();

      /* Pre-register-allocation scheduling pass. */
      schedule_instructions(false);

      lower_uniform_pull_constant_loads();

      assign_curb_setup();
      assign_urb_setup();

      /* "if (0)" is a developer switch for debugging register spilling:
       * force every virtual GRF to be spilled.
       */
      if (0) {
	 /* Debug of register spilling: Go spill everything. */
	 for (int i = 0; i < virtual_grf_count; i++) {
	    spill_reg(i);
	 }
      }

      if (0)
	 assign_regs_trivial();
      else {
	 /* assign_regs() returns false when it had to spill; retry until
	  * allocation succeeds or the compile fails outright.
	  */
	 while (!assign_regs()) {
	    if (failed)
	       break;
	 }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   /* This must come after all optimization and register allocation, since
    * it inserts dead code that happens to have side effects, and it does
    * so based on the actual physical registers in use.
    */
   insert_gen4_send_dependency_workarounds();

   if (failed)
      return false;

   /* Post-register-allocation scheduling pass. */
   schedule_instructions(true);

   if (dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);

      /* Make sure we didn't try to sneak in an extra uniform */
      assert(orig_nr_params == c->prog_data.nr_params);
      (void) orig_nr_params;
   }

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == fp->Base.Parameters->NumParameters);

   return !failed;
}
3033
 
3034
/**
 * Top-level entry point for compiling a fragment program to native code.
 *
 * Always compiles an 8-wide (SIMD8) version; additionally attempts a
 * 16-wide (SIMD16) version on Gen5+ when there are no pull parameters,
 * falling back silently (with a perf_debug note) if that compile fails.
 *
 * Returns the generated assembly (size in *final_assembly_size), or NULL
 * on compile failure (with prog->InfoLog updated when prog is non-NULL).
 */
const unsigned *
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   bool start_busy = false;
   float start_time = 0;

   /* Record whether the GPU was already busy so we can report compiles
    * that stalled it (see the perf_debug block at the bottom).
    */
   if (unlikely(brw->perf_debug)) {
      start_busy = (brw->batch.last_bo &&
                    drm_intel_bo_busy(brw->batch.last_bo));
      start_time = get_time();
   }

   /* prog is NULL for ARB_fragment_program / fixed-function programs. */
   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (prog) {
         printf("GLSL IR for native fragment shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_fragment_program %d ir for native fragment shader\n",
                fp->Base.Id);
         _mesa_print_program(&fp->Base);
      }
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw, c, prog, fp, 8);
   if (!v.run()) {
      /* The mandatory SIMD8 compile failed: report and bail out. */
      if (prog) {
         prog->LinkStatus = false;
         ralloc_strcat(&prog->InfoLog, v.fail_msg);
      }

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }

   /* Optional SIMD16 compile; its failure is only a performance issue. */
   exec_list *simd16_instructions = NULL;
   fs_visitor v2(brw, c, prog, fp, 16);
   bool no16 = INTEL_DEBUG & DEBUG_NO16;
   if (brw->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
      /* Reuse the SIMD8 visitor's uniform layout so both variants agree. */
      v2.import_uniforms(&v);
      if (!v2.run()) {
         perf_debug("16-wide shader failed to compile, falling back to "
                    "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
      } else {
         simd16_instructions = &v2.instructions;
      }
   }

   c->prog_data.dispatch_width = 8;

   fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
   const unsigned *generated = g.generate_assembly(&v.instructions,
                                                   simd16_instructions,
                                                   final_assembly_size);

   if (unlikely(brw->perf_debug) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, &c->key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return generated;
}
3113
 
3114
/**
 * Precompiles the fragment shader at link time with a guessed program key,
 * so the common state combination is already in the cache at draw time.
 *
 * The key fields below are heuristic defaults (e.g. depth test on, no
 * texture swizzles, one color region); a draw with different state will
 * simply trigger a recompile later.  Returns the do_wm_prog() result,
 * or true when there is no fragment shader to compile.
 */
bool
brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_wm_prog_key key;

   if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
      return true;

   struct gl_fragment_program *fp = (struct gl_fragment_program *)
      prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
   struct brw_fragment_program *bfp = brw_fragment_program(fp);
   bool program_uses_dfdy = fp->UsesDFdy;

   memset(&key, 0, sizeof(key));

   /* Gen4-5 need the interpolation/Z-buffer (iz) lookup bits. */
   if (brw->gen < 6) {
      if (fp->UsesKill)
         key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;

      if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
         key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

      /* Just assume depth testing. */
      key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
      key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
   }

   if (brw->gen < 6)
      key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);

   /* On Gen4-5 only varyings the FS can actually read are valid slots. */
   for (int i = 0; i < VARYING_SLOT_MAX; i++) {
      if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
	 continue;

      if (brw->gen < 6) {
         if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
            key.input_slots_valid |= BITFIELD64_BIT(i);
      }
   }

   key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;

   for (int i = 0; i < MAX_SAMPLERS; i++) {
      if (fp->Base.ShadowSamplers & (1 << i)) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         key.tex.swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         key.tex.swizzles[i] = SWIZZLE_XYZW;
      }
   }

   /* gl_FragCoord (and dFdy) behavior depends on the drawable. */
   if (fp->Base.InputsRead & VARYING_BIT_POS) {
      key.drawable_height = ctx->DrawBuffer->Height;
   }

   if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
   }

   key.nr_color_regions = 1;

   key.program_string_id = bfp->id;

   /* do_wm_prog() updates brw->wm.prog_offset/prog_data as a side effect;
    * save and restore them so this precompile doesn't disturb the bound
    * program state.
    */
   uint32_t old_prog_offset = brw->wm.prog_offset;
   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;

   bool success = do_wm_prog(brw, prog, bfp, &key);

   brw->wm.prog_offset = old_prog_offset;
   brw->wm.prog_data = old_prog_data;

   return success;
}