/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

#include <sys/types.h>

#include "util/hash_table.h"
#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "util/register_allocate.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
#include "brw_fs.h"
#include "brw_cfg.h"
#include "brw_dead_control_flow.h"
#include "main/uniforms.h"
#include "brw_fs_live_variables.h"
#include "glsl/glsl_types.h"
#include "program/sampler.h"

void
fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
              const fs_reg *src, unsigned sources)
{
   memset(this, 0, sizeof(*this));

   this->src = new fs_reg[MAX2(sources, 3)];
   for (unsigned i = 0; i < sources; i++)
      this->src[i] = src[i];

   this->opcode = opcode;
   this->dst = dst;
   this->sources = sources;
   this->exec_size = exec_size;

   assert(dst.file != IMM && dst.file != UNIFORM);

   /* If exec_size == 0, try to guess it from the registers.  Since all
    * manner of things may use hardware registers, we first try to guess
    * based on GRF registers.  If this fails, we will go ahead and take the
    * width from the destination register.
    */
   if (this->exec_size == 0) {
      if (dst.file == GRF) {
         this->exec_size = dst.width;
      } else {
         for (unsigned i = 0; i < sources; ++i) {
            if (src[i].file != GRF && src[i].file != ATTR)
               continue;

            if (this->exec_size <= 1)
               this->exec_size = src[i].width;
            assert(src[i].width == 1 || src[i].width == this->exec_size);
         }
      }

      if (this->exec_size == 0 && dst.file != BAD_FILE)
         this->exec_size = dst.width;
   }
   assert(this->exec_size != 0);

   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* This will be the case for almost all instructions. */
   switch (dst.file) {
   case GRF:
   case HW_REG:
   case MRF:
   case ATTR:
      this->regs_written =
         DIV_ROUND_UP(MAX2(dst.width * dst.stride, 1) * type_sz(dst.type), 32);
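      /* Illustrative example (not in the original source): a SIMD16 float
       * destination with stride 1 covers 16 * 1 * 4 = 64 bytes, so
       * regs_written = DIV_ROUND_UP(64, 32) = 2 GRFs.
       */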
      break;
   case BAD_FILE:
      this->regs_written = 0;
      break;
   case IMM:
   case UNIFORM:
      unreachable("Invalid destination register file");
   default:
      unreachable("Invalid register file");
   }

   this->writes_accumulator = false;
}

fs_inst::fs_inst()
{
   init(BRW_OPCODE_NOP, 8, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
{
   init(opcode, exec_size, reg_undef, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
{
   init(opcode, 0, dst, NULL, 0);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0)
{
   const fs_reg src[1] = { src0 };
   init(opcode, exec_size, dst, src, 1);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
{
   const fs_reg src[1] = { src0 };
   init(opcode, 0, dst, src, 1);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1)
{
   const fs_reg src[2] = { src0, src1 };
   init(opcode, exec_size, dst, src, 2);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1)
{
   const fs_reg src[2] = { src0, src1 };
   init(opcode, 0, dst, src, 2);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
                 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
{
   const fs_reg src[3] = { src0, src1, src2 };
   init(opcode, exec_size, dst, src, 3);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1, const fs_reg &src2)
{
   const fs_reg src[3] = { src0, src1, src2 };
   init(opcode, 0, dst, src, 3);
}

fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst,
                 const fs_reg src[], unsigned sources)
{
   init(opcode, 0, dst, src, sources);
}

fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
                 const fs_reg src[], unsigned sources)
{
   init(opcode, exec_width, dst, src, sources);
}

fs_inst::fs_inst(const fs_inst &that)
{
   memcpy(this, &that, sizeof(that));

   this->src = new fs_reg[MAX2(that.sources, 3)];

   for (unsigned i = 0; i < that.sources; i++)
      this->src[i] = that.src[i];
}

fs_inst::~fs_inst()
{
   delete[] this->src;
}

void
fs_inst::resize_sources(uint8_t num_sources)
{
   if (this->sources != num_sources) {
      fs_reg *src = new fs_reg[MAX2(num_sources, 3)];

      for (unsigned i = 0; i < MIN2(this->sources, num_sources); ++i)
         src[i] = this->src[i];

      delete[] this->src;
      this->src = src;
      this->sources = num_sources;
   }
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0)                \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1)                                   \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU2_ACC(op)                                                    \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1)                                   \
   {                                                                    \
      fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
      inst->writes_accumulator = true;                                  \
      return inst;                                                      \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(const fs_reg &dst, const fs_reg &src0,                \
                  const fs_reg &src1, const fs_reg &src2)               \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }
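
/* Illustrative note (not in the original source): ALU2(ADD) below expands to
 *
 *    fs_inst *
 *    fs_visitor::ADD(const fs_reg &dst, const fs_reg &src0,
 *                    const fs_reg &src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */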

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2_ACC(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)
ALU1(BFREV)
ALU3(BFE)
ALU2(BFI1)
ALU3(BFI2)
ALU1(FBH)
ALU1(FBL)
ALU1(CBIT)
ALU3(MAD)
ALU2_ACC(ADDC)
ALU2_ACC(SUBB)
ALU2(SEL)
ALU2(MAC)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(enum brw_predicate predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
   inst->predicate = predicate;
   return inst;
}

/** Gen6 IF with embedded comparison. */
fs_inst *
fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
               enum brw_conditional_mod condition)
{
   assert(devinfo->gen == 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
                enum brw_conditional_mod condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null src0 src1
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    *
    * The destination type doesn't matter on newer generations, so we set the
    * type to match src0 so we can compact the instruction.
    */
   dst.type = src0.type;
   if (dst.file == HW_REG)
      dst.fixed_hw_reg.type = dst.type;

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
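
/* Illustrative usage (not in the original source):
 *
 *    emit(CMP(reg_null_f, x, fs_reg(0.0f), BRW_CONDITIONAL_GE));
 *
 * compares each channel of x against 0.0 and packs the per-channel results
 * into the flag register for later predication.
 */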

fs_inst *
fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources,
                         int header_size)
{
   assert(dst.width % 8 == 0);
   fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst.width,
                                        dst, src, sources);
   inst->header_size = header_size;

   for (int i = 0; i < header_size; i++)
      assert(src[i].file != GRF || src[i].width * type_sz(src[i].type) == 32);
   inst->regs_written = header_size;

   for (int i = header_size; i < sources; ++i)
      assert(src[i].file != GRF || src[i].width == dst.width);
   inst->regs_written += (sources - header_size) * (dst.width / 8);
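   /* Illustrative example (not in the original source): a payload with a
    * 1-GRF header and 3 SIMD16 sources has regs_written =
    * 1 + 3 * (16 / 8) = 7 GRFs.
    */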

   return inst;
}

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
                                       const fs_reg &surf_index,
                                       const fs_reg &varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   /* We have our constant surface use a pitch of 4 bytes, so our index can
    * be any component of a vector, and then we load 4 contiguous
    * components starting from that.
    *
    * We break down the const_offset to a portion added to the variable
    * offset and a portion done using reg_offset, which means that if you
    * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
    * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
    * CSE can later notice that those loads are all the same and eliminate
    * the redundant ones.
    */
   fs_reg vec4_offset = vgrf(glsl_type::int_type);
   instructions.push_tail(ADD(vec4_offset,
                              varying_offset, fs_reg(const_offset & ~3)));

   int scale = 1;
   if (devinfo->gen == 4 && dst.width == 8) {
      /* Pre-gen5, we can either use a SIMD8 message that requires (header,
       * u, v, r) as parameters, or we can just use the SIMD16 message
       * consisting of (header, u).  We choose the second, at the cost of a
       * longer return length.
       */
      scale = 2;
   }

   enum opcode op;
   if (devinfo->gen >= 7)
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
   else
      op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;

   assert(dst.width % 8 == 0);
   int regs_written = 4 * (dst.width / 8) * scale;
   fs_reg vec4_result = fs_reg(GRF, alloc.allocate(regs_written),
                               dst.type, dst.width);
   inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
   inst->regs_written = regs_written;
   instructions.push_tail(inst);

   if (devinfo->gen < 7) {
      inst->base_mrf = 13;
      inst->header_size = 1;
      if (devinfo->gen == 4)
         inst->mlen = 3;
      else
         inst->mlen = 1 + dispatch_width / 8;
   }

   fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
   instructions.push_tail(MOV(dst, result));

   return instructions;
}
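
/* Illustrative example (not in the original source): for const_offset = 22,
 * the vec4-aligned part (22 & ~3) = 20 is added to varying_offset, and the
 * remaining component (22 & 3) = 2 selects the element within the loaded
 * vec4 via offset(vec4_result, 2 * scale).
 */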

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->exec_size = 8;

   return inst;
}

bool
fs_inst::equals(fs_inst *inst) const
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           target == inst->target &&
           eot == inst->eot &&
           header_size == inst->header_size &&
           shadow_compare == inst->shadow_compare &&
           exec_size == inst->exec_size &&
           offset == inst->offset);
}

bool
fs_inst::overwrites_reg(const fs_reg &reg) const
{
   return reg.in_range(dst, regs_written);
}

bool
fs_inst::is_send_from_grf() const
{
   switch (opcode) {
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
   case SHADER_OPCODE_SHADER_TIME_ADD:
   case FS_OPCODE_INTERPOLATE_AT_CENTROID:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
      return true;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
      return src[1].file == GRF;
   case FS_OPCODE_FB_WRITE:
      return src[0].file == GRF;
   default:
      if (is_tex())
         return src[0].file == GRF;

      return false;
   }
}

bool
fs_inst::is_copy_payload(const brw::simple_allocator &grf_alloc) const
{
   if (this->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
      return false;

   fs_reg reg = this->src[0];
   if (reg.file != GRF || reg.reg_offset != 0 || reg.stride == 0)
      return false;

   if (grf_alloc.sizes[reg.reg] != this->regs_written)
      return false;

   for (int i = 0; i < this->sources; i++) {
      reg.type = this->src[i].type;
      reg.width = this->src[i].width;
      if (!this->src[i].equals(reg))
         return false;
      reg = ::offset(reg, 1);
   }

   return true;
}

bool
fs_inst::can_do_source_mods(const struct brw_device_info *devinfo)
{
   if (devinfo->gen == 6 && is_math())
      return false;

   if (is_send_from_grf())
      return false;

   if (!backend_instruction::can_do_source_mods())
      return false;

   return true;
}

bool
fs_inst::has_side_effects() const
{
   return this->eot || backend_instruction::has_side_effects();
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   stride = 1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->fixed_hw_reg.dw1.f = f;
   this->width = 1;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->fixed_hw_reg.dw1.d = i;
   this->width = 1;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->fixed_hw_reg.dw1.ud = u;
   this->width = 1;
}

/** Vector float immediate value constructor. */
fs_reg::fs_reg(uint8_t vf[4])
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
}

/** Vector float immediate value constructor. */
fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_VF;
   this->fixed_hw_reg.dw1.ud = (vf0 <<  0) |
                               (vf1 <<  8) |
                               (vf2 << 16) |
                               (vf3 << 24);
}

/** Fixed brw_reg. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
   this->width = 1 << fixed_hw_reg.width;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           subreg_offset == r.subreg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
           width == r.width &&
           stride == r.stride);
}

fs_reg &
fs_reg::set_smear(unsigned subreg)
{
   assert(file != HW_REG && file != IMM);
   subreg_offset = subreg * type_sz(type);
   stride = 0;
   return *this;
}

bool
fs_reg::is_contiguous() const
{
   return stride == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_ATOMIC_UINT:
      return 0;
   case GLSL_TYPE_IMAGE:
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
   case GLSL_TYPE_DOUBLE:
      unreachable("not reached");
   }

   return 0;
}

/**
 * Create a MOV to read the timestamp register.
 *
 * The caller is responsible for emitting the MOV.  The return value is
 * the destination of the MOV, with extra parameters set.
 */
fs_reg
fs_visitor::get_timestamp(fs_inst **out_mov)
{
   assert(devinfo->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 4);

   fs_inst *mov = MOV(dst, ts);
   /* We want to read the 3 fields we care about even if it's not enabled in
    * the dispatch.
    */
   mov->force_writemask_all = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
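   /* Illustrative arithmetic (not in the original source): a 32-bit counter
    * at ~1.2 GHz wraps after 2^32 / 1.2e9 ≈ 3.6 seconds, which is where the
    * "~3 seconds" figure above comes from.
    */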
   dst.set_smear(0);

   *out_mov = mov;
   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   fs_inst *mov;
   shader_start_time = get_timestamp(&mov);
   emit(mov);
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   switch (stage) {
   case MESA_SHADER_VERTEX:
      type = ST_VS;
      written_type = ST_VS_WRITTEN;
      reset_type = ST_VS_RESET;
      break;
   case MESA_SHADER_GEOMETRY:
      type = ST_GS;
      written_type = ST_GS_WRITTEN;
      reset_type = ST_GS_RESET;
      break;
   case MESA_SHADER_FRAGMENT:
      if (dispatch_width == 8) {
         type = ST_FS8;
         written_type = ST_FS8_WRITTEN;
         reset_type = ST_FS8_RESET;
      } else {
         assert(dispatch_width == 16);
         type = ST_FS16;
         written_type = ST_FS16_WRITTEN;
         reset_type = ST_FS16_RESET;
      }
      break;
   case MESA_SHADER_COMPUTE:
      type = ST_CS;
      written_type = ST_CS_WRITTEN;
      reset_type = ST_CS_RESET;
      break;
   default:
      unreachable("fs_visitor::emit_shader_time_end missing code");
   }

   /* Insert our code just before the final SEND with EOT. */
   exec_node *end = this->instructions.get_tail();
   assert(end && ((fs_inst *) end)->eot);

   fs_inst *tm_read;
   fs_reg shader_end_time = get_timestamp(&tm_read);
   end->insert_before(tm_read);

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.set_smear(2);
   fs_inst *test = AND(reg_null_d, reset, fs_reg(1u));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   test->force_writemask_all = true;
   end->insert_before(test);
   end->insert_before(IF(BRW_PREDICATE_NORMAL));

   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(GRF, alloc.allocate(1), BRW_REGISTER_TYPE_UD, 1);
   diff.set_smear(0);
   fs_inst *add = ADD(diff, start, shader_end_time);
   add->force_writemask_all = true;
   end->insert_before(add);

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   add = ADD(diff, diff, fs_reg(-2u));
   add->force_writemask_all = true;
   end->insert_before(add);

   end->insert_before(SHADER_TIME_ADD(type, diff));
   end->insert_before(SHADER_TIME_ADD(written_type, fs_reg(1u)));
   end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ELSE, dispatch_width));
   end->insert_before(SHADER_TIME_ADD(reset_type, fs_reg(1u)));
   end->insert_before(new(mem_ctx) fs_inst(BRW_OPCODE_ENDIF, dispatch_width));
}

fs_inst *
fs_visitor::SHADER_TIME_ADD(enum shader_time_shader_type type, fs_reg value)
{
   int shader_time_index =
      brw_get_shader_time_index(brw, shader_prog, prog, type);
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = vgrf(glsl_type::uvec2_type);
   else
      payload = vgrf(glsl_type::uint_type);

   return new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                               fs_reg(), payload, offset, value);
}

void
fs_visitor::vfail(const char *format, va_list va)
{
   char *msg;

   if (failed)
      return;

   failed = true;

   msg = ralloc_vasprintf(mem_ctx, format, va);
   msg = ralloc_asprintf(mem_ctx, "%s compile failed: %s\n", stage_abbrev, msg);

   this->fail_msg = msg;

   if (debug_enabled) {
      fprintf(stderr, "%s",  msg);
   }
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;

   va_start(va, format);
   vfail(format, va);
   va_end(va);
}

/**
 * Mark this program as impossible to compile in SIMD16 mode.
 *
 * During the SIMD8 compile (which happens first), we can detect and flag
 * things that are unsupported in SIMD16 mode, so the compiler can skip
 * the SIMD16 compile altogether.
 *
 * During a SIMD16 compile (if one happens anyway), this just calls fail().
 */
void
fs_visitor::no16(const char *format, ...)
{
   va_list va;

   va_start(va, format);

   if (dispatch_width == 16) {
      vfail(format, va);
   } else {
      simd16_unsupported = true;

      if (brw->perf_debug) {
         if (no16_msg)
            ralloc_vasprintf_append(&no16_msg, format, va);
         else
            no16_msg = ralloc_vasprintf(mem_ctx, format, va);
      }
   }

   va_end(va);
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
                 const fs_reg &src1, const fs_reg &src2)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
                 fs_reg src[], int sources)
{
   return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
}

/**
 * Returns true if the instruction has a flag that means it won't
 * update an entire destination register.
 *
 * For example, dead code elimination and live variable analysis want to know
 * when a write to a variable screens off any preceding values that were in
 * it.
 */
bool
fs_inst::is_partial_write() const
{
   return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
           (this->dst.width * type_sz(this->dst.type)) < 32 ||
           !this->dst.is_contiguous());
}
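
/* Illustrative example (not in the original source): an unpredicated,
 * contiguous SIMD8 float write covers 8 * 4 = 32 bytes (a full GRF), so it
 * is not partial, while a SIMD8 word (UW) write covers only 16 bytes and is.
 */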

int
fs_inst::regs_read(int arg) const
{
   if (is_tex() && arg == 0 && src[0].file == GRF) {
      return mlen;
   } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_WRITE && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_TYPED_ATOMIC && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_READ && arg == 0) {
      return mlen;
   } else if (opcode == SHADER_OPCODE_TYPED_SURFACE_WRITE && arg == 0) {
      return mlen;
   } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
      return mlen;
   } else if (opcode == FS_OPCODE_LINTERP && arg == 0) {
      return exec_size / 4;
   }

   switch (src[arg].file) {
   case BAD_FILE:
   case UNIFORM:
   case IMM:
      return 1;
   case GRF:
   case HW_REG:
      if (src[arg].stride == 0) {
         return 1;
      } else {
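         /* Illustrative example (not in the original source): a SIMD16
          * float source with stride 1 reads 16 * 1 * 4 = 64 bytes, i.e.
          * (64 + 31) / 32 = 2 GRFs.
          */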
         int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
         return (size + 31) / 32;
      }
   case MRF:
      unreachable("MRF registers are not allowed as sources");
   default:
      unreachable("Invalid register file");
   }
}

bool
fs_inst::reads_flag() const
{
   return predicate;
}

bool
fs_inst::writes_flag() const
{
   return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
                               opcode != BRW_OPCODE_IF &&
                               opcode != BRW_OPCODE_WHILE)) ||
          opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   if (inst->base_mrf == -1)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_CMS:
   case SHADER_OPCODE_TXF_MCS:
   case SHADER_OPCODE_TG4:
   case SHADER_OPCODE_TG4_OFFSET:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case SHADER_OPCODE_GEN4_SCRATCH_READ:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->mlen;
   case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
      return inst->mlen;
   case SHADER_OPCODE_UNTYPED_ATOMIC:
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
   case SHADER_OPCODE_TYPED_ATOMIC:
   case SHADER_OPCODE_TYPED_SURFACE_READ:
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
   case SHADER_OPCODE_URB_WRITE_SIMD8:
   case FS_OPCODE_INTERPOLATE_AT_CENTROID:
   case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
   case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
   case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
      return 0;
   default:
      unreachable("not reached");
   }
}

fs_reg
fs_visitor::vgrf(const glsl_type *const type)
{
   int reg_width = dispatch_width / 8;
   return fs_reg(GRF, alloc.allocate(type_size(type) * reg_width),
                 brw_type_for_base_type(type), dispatch_width);
}

fs_reg
fs_visitor::vgrf(int num_components)
{
   int reg_width = dispatch_width / 8;
   return fs_reg(GRF, alloc.allocate(num_components * reg_width),
                 BRW_REGISTER_TYPE_F, dispatch_width);
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;

   switch (file) {
   case UNIFORM:
      this->width = 1;
      break;
   default:
      this->width = 8;
   }
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;

   switch (file) {
   case UNIFORM:
      this->width = 1;
      break;
   default:
      this->width = 8;
   }
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
               uint8_t width)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
   this->width = width;
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For SIMD16, we need to follow from the uniform setup of SIMD8 dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->push_constant_loc = v->push_constant_loc;
   this->pull_constant_loc = v->pull_constant_loc;
   this->uniforms = v->uniforms;
   this->param_size = v->param_size;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
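   /* Illustrative example (not in the original source): a uniform named "a"
    * matches storage entries "a", "a[2]" and "a.b", but not "ab", because
    * the character after the prefix must be '\0', '.' or '['.
    */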
   unsigned params_before = uniforms;
   for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         stage_prog_data->param[uniforms++] = &storage->storage[i];
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() == uniforms);
   (void)params_before;
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->get_state_slots();
   assert(slots != NULL);

   for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->prog->Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         stage_prog_data->param[uniforms++] =
            &prog->Parameters->ParameterValues[index][swiz];
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
                                         bool origin_upper_left)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
   fs_reg wpos = *reg;
   bool flip = !origin_upper_left ^ key->render_to_fbo;

   /* gl_FragCoord.x */
   if (pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos = offset(wpos, 1);

   /* gl_FragCoord.y */
   if (!flip && pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += key->drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos = offset(wpos, 1);

   /* gl_FragCoord.z */
   if (devinfo->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_xy[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(VARYING_SLOT_POS, 2));
   }
   wpos = offset(wpos, 1);

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid, bool is_sample)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (devinfo->gen >= 6) {
      if (is_centroid) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
      } else if (is_sample) {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
      } else {
         if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
            barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
         else
            barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
      }
   } else {
      /* On Ironlake and below, there is only one interpolation mode.
       * Centroid interpolation doesn't mean anything on this hardware --
       * there is no multisampling.
       */
      barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_xy[barycoord_mode], interp);
}

void
fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
                                       const glsl_type *type,
                                       glsl_interp_qualifier interpolation_mode,
                                       int location, bool mod_centroid,
                                       bool mod_sample)
{
   attr.type = brw_type_for_base_type(type->get_scalar_type());

   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   unsigned int array_elements;

   if (type->is_array()) {
      array_elements = type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", name);
      }
      type = type->fields.array;
   } else {
      array_elements = 1;
   }

   if (interpolation_mode == INTERP_QUALIFIER_NONE) {
      bool is_gl_Color =
         location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
      if (key->flat_shade && is_gl_Color) {
         interpolation_mode = INTERP_QUALIFIER_FLAT;
      } else {
         interpolation_mode = INTERP_QUALIFIER_SMOOTH;
      }
   }

   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (prog_data->urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr = offset(attr, type->vector_elements);
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = attr.type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr = offset(attr, 1);
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               if (devinfo->needs_unlit_centroid_workaround && mod_centroid) {
                  /* Get the pixel/sample mask into f0 so that we know
                   * which pixels are lit.  Then, for each channel that is
                   * unlit, replace the centroid data with non-centroid
                   * data.
                   */
                  emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);

                  fs_inst *inst;
                  inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
                                      false, false);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = true;
                  if (devinfo->has_pln)
                     inst->no_dd_clear = true;

                  inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
                                      mod_centroid && !key->persample_shading,
                                      mod_sample || key->persample_shading);
                  inst->predicate = BRW_PREDICATE_NORMAL;
                  inst->predicate_inverse = false;
                  if (devinfo->has_pln)
                     inst->no_dd_check = true;

               } else {
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               mod_centroid && !key->persample_shading,
                               mod_sample || key->persample_shading);
               }
               if (devinfo->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
               }
               attr = offset(attr, 1);
            }

         }
         location++;
      }
   }
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation()
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));

   if (devinfo->gen >= 6) {
      /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (~0/true or 0/false).
       *
       * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
       * this task in only one instruction:
       *    - a negation source modifier will flip the bit; and
       *    - a W -> D type conversion will sign extend the bit into the high
       *      word of the destination.
       *
       * An ASR 15 fills the low word of the destination.
       */
      fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
      g0.negate = true;

      emit(ASR(*reg, g0, fs_reg(15)));
   } else {
      /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
       * a boolean result from this (1/true or 0/false).
       *
       * Like in the above case, since the bit is the MSB of g1.6:UD we can use
       * the negation source modifier to flip it. Unfortunately the SHR
       * instruction only operates on UD (or D with an abs source modifier)
       * sources without negation.
       *
       * Instead, use ASR (which will give ~0/true or 0/false).
       */
      fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
      g1_6.negate = true;

      emit(ASR(*reg, g1_6, fs_reg(31)));
   }

   return reg;
}

void
fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   assert(dst.type == BRW_REGISTER_TYPE_F);

   if (key->compute_pos_offset) {
      /* Convert int_sample_pos to floating point */
      emit(MOV(dst, int_sample_pos));
      /* Scale to the range [0, 1] */
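      /* (Illustrative note, not in the original source: the payload sample
       * positions are in 1/16-pixel units, so values 0..15 map onto [0, 1).)
       */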
      emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
   }
   else {
      /* From ARB_sample_shading specification:
       * "When rendering to a non-multisample buffer, or if multisample
       *  rasterization is disabled, gl_SamplePosition will always be
       *  (0.5, 0.5)."
1489
       */
1490
      emit(MOV(dst, fs_reg(0.5f)));
1491
   }
1492
}
1493
 
1494
fs_reg *
1495
fs_visitor::emit_samplepos_setup()
1496
{
1497
   assert(devinfo->gen >= 6);
1498
 
1499
   this->current_annotation = "compute sample position";
1500
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1501
   fs_reg pos = *reg;
1502
   fs_reg int_sample_x = vgrf(glsl_type::int_type);
1503
   fs_reg int_sample_y = vgrf(glsl_type::int_type);
1504
 
1505
   /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1506
    * mode will be enabled.
1507
    *
1508
    * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1509
    * R31.1:0         Position Offset X/Y for Slot[3:0]
1510
    * R31.3:2         Position Offset X/Y for Slot[7:4]
1511
    * .....
1512
    *
1513
    * The X, Y sample positions come in as bytes in  thread payload. So, read
1514
    * the positions using vstride=16, width=8, hstride=2.
1515
    */
1516
   struct brw_reg sample_pos_reg =
1517
      stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1518
                    BRW_REGISTER_TYPE_B), 16, 8, 2);
1519
 
1520
   if (dispatch_width == 8) {
1521
      emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1522
   } else {
1523
      emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1524
      emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1525
         ->force_sechalf = true;
1526
   }
1527
   /* Compute gl_SamplePosition.x */
1528
   compute_sample_position(pos, int_sample_x);
1529
   pos = offset(pos, 1);
1530
   if (dispatch_width == 8) {
1531
      emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1532
   } else {
1533
      emit(MOV(half(int_sample_y, 0),
1534
               fs_reg(suboffset(sample_pos_reg, 1))));
1535
      emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1536
         ->force_sechalf = true;
1537
   }
1538
   /* Compute gl_SamplePosition.y */
1539
   compute_sample_position(pos, int_sample_y);
1540
   return reg;
1541
}
1542
 
1543
fs_reg *
1544
fs_visitor::emit_sampleid_setup()
1545
{
1546
   assert(stage == MESA_SHADER_FRAGMENT);
1547
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1548
   assert(devinfo->gen >= 6);
1549
 
1550
   this->current_annotation = "compute sample id";
1551
   fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1552
 
1553
   if (key->compute_sample_id) {
1554
      fs_reg t1 = vgrf(glsl_type::int_type);
1555
      fs_reg t2 = vgrf(glsl_type::int_type);
1556
      t2.type = BRW_REGISTER_TYPE_UW;
1557
 
1558
      /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1559
       * 8x multisampling, subspan 0 will represent sample N (where N
1560
       * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1561
       * 7. We can find the value of N by looking at R0.0 bits 7:6
1562
       * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1563
       * (since samples are always delivered in pairs). That is, we
1564
       * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1565
       * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1566
       * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1567
       * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1568
       * populating a temporary variable with the sequence (0, 1, 2, 3),
1569
       * and then reading from it using vstride=1, width=4, hstride=0.
1570
       * These computations hold good for 4x multisampling as well.
1571
       *
1572
       * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1573
       * the first four slots are sample 0 of subspan 0; the next four
1574
       * are sample 1 of subspan 0; the third group is sample 0 of
1575
       * subspan 1, and finally sample 1 of subspan 1.
1576
       */
1577
      fs_inst *inst;
1578
      inst = emit(BRW_OPCODE_AND, t1,
1579
                  fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1580
                  fs_reg(0xc0));
1581
      inst->force_writemask_all = true;
1582
      inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1583
      inst->force_writemask_all = true;
1584
      /* This works for both SIMD8 and SIMD16 */
1585
      inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1586
      inst->force_writemask_all = true;
1587
      /* This special instruction takes care of setting vstride=1,
1588
       * width=4, hstride=0 of t2 during an ADD instruction.
1589
       */
1590
      emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1591
   } else {
1592
      /* As per GL_ARB_sample_shading specification:
1593
       * "When rendering to a non-multisample buffer, or if multisample
1594
       *  rasterization is disabled, gl_SampleID will always be zero."
1595
       */
1596
      emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1597
   }
1598
 
1599
   return reg;
1600
}
1601
 
1602
void
1603
fs_visitor::resolve_source_modifiers(fs_reg *src)
1604
{
1605
   if (!src->abs && !src->negate)
1606
      return;
1607
 
1608
   fs_reg temp = retype(vgrf(1), src->type);
1609
   emit(MOV(temp, *src));
1610
   *src = temp;
1611
}
1612
 
1613
fs_reg
1614
fs_visitor::fix_math_operand(fs_reg src)
1615
{
1616
   /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1617
    * might be able to do better by doing execsize = 1 math and then
1618
    * expanding that result out, but we would need to be careful with
1619
    * masking.
1620
    *
1621
    * The hardware ignores source modifiers (negate and abs) on math
1622
    * instructions, so we also move to a temp to set those up.
1623
    */
1624
   if (devinfo->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1625
       !src.abs && !src.negate)
1626
      return src;
1627
 
1628
   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1629
    * operands to math
1630
    */
1631
   if (devinfo->gen >= 7 && src.file != IMM)
1632
      return src;
1633
 
1634
   fs_reg expanded = vgrf(glsl_type::float_type);
1635
   expanded.type = src.type;
1636
   emit(BRW_OPCODE_MOV, expanded, src);
1637
   return expanded;
1638
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      unreachable("not reached: bad math opcode");
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (devinfo->gen == 6 || devinfo->gen == 7)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (devinfo->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   if (devinfo->gen >= 8) {
      inst = emit(opcode, dst, src0, src1);
   } else if (devinfo->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}
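
/* Note on the Gen4/5 operand swap above (illustrative): for an integer
 * division such as emit_math(SHADER_OPCODE_INT_QUOTIENT, dst, a, b), the
 * message needs the denominator b in Operand0 and the numerator a in
 * Operand1, so b becomes the instruction source while a is MOV'd into
 * base_mrf + 1; POW, by contrast, keeps its sources in natural order.
 */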

void
fs_visitor::emit_discard_jump()
{
   assert(((brw_wm_prog_data*) this->prog_data)->uses_kill);

   /* For performance, after a discard, jump to the end of the
    * shader if all relevant channels have been discarded.
    */
   fs_inst *discard_jump = emit(FS_OPCODE_DISCARD_JUMP);
   discard_jump->flag_subreg = 1;

   discard_jump->predicate = (dispatch_width == 8)
                             ? BRW_PREDICATE_ALIGN1_ANY8H
                             : BRW_PREDICATE_ALIGN1_ANY16H;
   discard_jump->predicate_inverse = true;
}

void
fs_visitor::assign_curb_setup()
{
   if (dispatch_width == 8) {
      prog_data->dispatch_grf_start_reg = payload.num_regs;
   } else {
      if (stage == MESA_SHADER_FRAGMENT) {
         brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
         prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
      } else if (stage == MESA_SHADER_COMPUTE) {
         brw_cs_prog_data *prog_data = (brw_cs_prog_data*) this->prog_data;
         prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
      } else {
         unreachable("Unsupported shader type!");
      }
   }

   prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (unsigned int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == UNIFORM) {
            int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
            int constant_nr;
            if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
               constant_nr = push_constant_loc[uniform_nr];
            } else {
               /* Section 5.11 of the OpenGL 4.1 spec says:
                * "Out-of-bounds reads return undefined values, which include
                *  values from other variables of the active program or zero."
                * Just return the first push constant.
                */
               constant_nr = 0;
            }

            struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg = byte_offset(
               retype(brw_reg, inst->src[i].type),
               inst->src[i].subreg_offset);
         }
      }
   }
}
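
/* Worked example of the UNIFORM -> GRF mapping above: each GRF holds eight
 * 32-bit push constants, so with payload.num_regs == 2, a constant_nr of 11
 * resolves to brw_vec1_grf(2 + 11 / 8, 11 % 8), i.e. the scalar region
 * g3.3<0,1,0>, the fourth dword of the second CURBE register.
 */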

void
fs_visitor::calculate_urb_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   memset(prog_data->urb_setup, -1,
          sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (devinfo->gen >= 6) {
      if (_mesa_bitcount_64(prog->InputsRead &
                            BRW_FS_VARYING_INPUT_MASK) <= 16) {
         /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
          * first 16 varying inputs, so we can put them wherever we want.
          * Just put them in order.
          *
          * This is useful because it means that (a) inputs not used by the
          * fragment shader won't take up valuable register space, and (b) we
          * won't have to recompile the fragment shader if it gets paired with
          * a different vertex (or geometry) shader.
          */
         for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
            if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
                BITFIELD64_BIT(i)) {
               prog_data->urb_setup[i] = urb_next++;
            }
         }
      } else {
         /* We have enough input varyings that the SF/SBE pipeline stage can't
          * arbitrarily rearrange them to suit our whim; we have to put them
          * in an order that matches the output of the previous pipeline stage
          * (geometry or vertex shader).
          */
         struct brw_vue_map prev_stage_vue_map;
         brw_compute_vue_map(devinfo, &prev_stage_vue_map,
                             key->input_slots_valid);
         int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
         assert(prev_stage_vue_map.num_slots <= first_slot + 32);
         for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
              slot++) {
            int varying = prev_stage_vue_map.slot_to_varying[slot];
            /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
             * unused.
             */
            if (varying != BRW_VARYING_SLOT_COUNT &&
                (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
                 BITFIELD64_BIT(varying))) {
               prog_data->urb_setup[varying] = slot - first_slot;
            }
         }
         urb_next = prev_stage_vue_map.num_slots - first_slot;
      }
   } else {
      /* FINISHME: The SF doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute. */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (key->input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               prog_data->urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's an FS-only attribute, and we did interpolation for this
       * attribute in the SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   prog_data->num_varying_inputs = urb_next;
}

void
fs_visitor::assign_urb_setup()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;

   int urb_start = payload.num_regs + prog_data->base.curb_read_length;

   /* Offset all the urb_setup[] indices by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[1].file == HW_REG);
         inst->src[1].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   this->first_non_payload_grf =
      urb_start + prog_data->num_varying_inputs * 2;
}

void
fs_visitor::assign_vs_urb_setup()
{
   brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
   int grf, count, slot, channel, attr;

   assert(stage == MESA_SHADER_VERTEX);
   count = _mesa_bitcount_64(vs_prog_data->inputs_read);
   if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
      count++;

   /* Each attribute is 4 regs. */
   this->first_non_payload_grf =
      payload.num_regs + prog_data->curb_read_length + count * 4;

   unsigned vue_entries =
      MAX2(count, vs_prog_data->base.vue_map.num_slots);

   vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
   vs_prog_data->base.urb_read_length = (count + 1) / 2;

   assert(vs_prog_data->base.urb_read_length <= 15);

   /* Rewrite all ATTR file references to the hw grf that they land in. */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == ATTR) {

            if (inst->src[i].reg == VERT_ATTRIB_MAX) {
               slot = count - 1;
            } else {
               /* Attributes come in a contiguous block, ordered by their
                * gl_vert_attrib value.  That means we can compute the slot
                * number for an attribute by masking out the enabled
                * attributes before it and counting the bits.
                */
               attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
               slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
                                        BITFIELD64_MASK(attr));
            }

            channel = inst->src[i].reg_offset & 3;

            grf = payload.num_regs +
               prog_data->curb_read_length +
               slot * 4 + channel;

            inst->src[i].file = HW_REG;
            inst->src[i].fixed_hw_reg =
               retype(brw_vec8_grf(grf, 0), inst->src[i].type);
         }
      }
   }
}
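
/* Worked example of the slot computation above: if inputs_read == 0b10011
 * (attributes 0, 1 and 4 enabled), then attribute 4 lands in slot
 * _mesa_bitcount_64(0b10011 & BITFIELD64_MASK(4)) ==
 * _mesa_bitcount_64(0b00011) == 2, the third slot of the contiguous block.
 */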

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->alloc.count;

   /* Count the total number of registers */
   int reg_count = 0;
   int vgrf_to_reg[num_vars];
   for (int i = 0; i < num_vars; i++) {
      vgrf_to_reg[i] = reg_count;
      reg_count += alloc.sizes[i];
   }

   /* An array of "split points".  For each register slot, this indicates
    * if this slot can be separated from the previous slot.  Every time an
    * instruction uses multiple elements of a register (as a source or
    * destination), we mark the used slots as inseparable.  Then we go
    * through and split the registers into the smallest pieces we can.
    */
   bool split_points[reg_count];
   memset(split_points, 0, sizeof(split_points));

   /* Mark all used registers as fully splittable */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF) {
         int reg = vgrf_to_reg[inst->dst.reg];
         for (unsigned j = 1; j < this->alloc.sizes[inst->dst.reg]; j++)
            split_points[reg + j] = true;
      }

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
            int reg = vgrf_to_reg[inst->src[i].reg];
            for (unsigned j = 1; j < this->alloc.sizes[inst->src[i].reg]; j++)
               split_points[reg + j] = true;
         }
      }
   }

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF) {
         int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
         for (int j = 1; j < inst->regs_written; j++)
            split_points[reg + j] = false;
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
            int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
            for (int j = 1; j < inst->regs_read(i); j++)
               split_points[reg + j] = false;
         }
      }
   }

   int new_virtual_grf[reg_count];
   int new_reg_offset[reg_count];

   int reg = 0;
   for (int i = 0; i < num_vars; i++) {
      /* The first one should always be 0 as a quick sanity check. */
      assert(split_points[reg] == false);

      /* j = 0 case */
      new_reg_offset[reg] = 0;
      reg++;
      int offset = 1;

      /* j > 0 case */
      for (unsigned j = 1; j < alloc.sizes[i]; j++) {
         /* If this is a split point, reset the offset to 0 and allocate a
          * new virtual GRF for the previous offset many registers
          */
         if (split_points[reg]) {
            assert(offset <= MAX_VGRF_SIZE);
            int grf = alloc.allocate(offset);
            for (int k = reg - offset; k < reg; k++)
               new_virtual_grf[k] = grf;
            offset = 0;
         }
         new_reg_offset[reg] = offset;
         offset++;
         reg++;
      }

      /* The last one gets the original register number */
      assert(offset <= MAX_VGRF_SIZE);
      alloc.sizes[i] = offset;
      for (int k = reg - offset; k < reg; k++)
         new_virtual_grf[k] = i;
   }
   assert(reg == reg_count);

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF) {
         reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
         inst->dst.reg = new_virtual_grf[reg];
         inst->dst.reg_offset = new_reg_offset[reg];
         assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
      }
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF) {
            reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
            inst->src[i].reg = new_virtual_grf[reg];
            inst->src[i].reg_offset = new_reg_offset[reg];
            assert((unsigned)new_reg_offset[reg] < alloc.sizes[new_virtual_grf[reg]]);
         }
      }
   }
   invalidate_live_intervals();
}
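
/* Small example of how the split points behave (illustrative): for a 4-slot
 * VGRF whose only multi-register access is a two-register write covering
 * slots 1-2, the pass keeps split_points true at slots 1 and 3 but clears
 * it at slot 2, so the VGRF splits into three pieces of sizes 1, 2 and 1,
 * each with its own (shorter) live interval.
 */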

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
bool
fs_visitor::compact_virtual_grfs()
{
   bool progress = false;
   int remap_table[this->alloc.count];
   memset(remap_table, -1, sizeof(remap_table));

   /* Mark which virtual GRFs are used. */
   foreach_block_and_inst(block, const fs_inst, inst, cfg) {
      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (unsigned i = 0; i < this->alloc.count; i++) {
      if (remap_table[i] == -1) {
         /* We just found an unused register.  This means that we are
          * actually going to compact something.
          */
         progress = true;
      } else {
         remap_table[i] = new_index;
         alloc.sizes[new_index] = alloc.sizes[i];
         invalidate_live_intervals();
         ++new_index;
      }
   }

   this->alloc.count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to delta_xy, since they're used in register
    * allocation.  If they're unused, switch them to BAD_FILE so we don't
    * think some random VGRF is delta_xy.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
      if (delta_xy[i].file == GRF) {
         if (remap_table[delta_xy[i].reg] != -1) {
            delta_xy[i].reg = remap_table[delta_xy[i].reg];
         } else {
            delta_xy[i].file = BAD_FILE;
         }
      }
   }

   return progress;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   if (dispatch_width != 8)
      return;

   pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
   memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const gl_constant_value **values = &stage_prog_data->param[uniform];

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;

               stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
                  values[j];
            }
         }
      }
   }
}

/**
 * Assign UNIFORM file registers to either push constants or pull constants.
 *
 * We allow a fragment shader to have more than the spec-required minimum
 * for the maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of the register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::assign_constant_locations()
{
   /* Only the first compile (SIMD8 mode) gets to decide on locations. */
   if (dispatch_width != 8)
      return;

   /* Find which UNIFORM registers are still in use. */
   bool is_live[uniforms];
   for (unsigned int i = 0; i < uniforms; i++) {
      is_live[i] = false;
   }

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
         if (constant_nr >= 0 && constant_nr < (int) uniforms)
            is_live[constant_nr] = true;
      }
   }

   /* Only allow 16 registers (128 uniform components) as push constants.
    *
    * Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    *
    * If changing this value, note the limitation about total_regs in
    * brw_curbe.c.
    */
   unsigned int max_push_components = 16 * 8;
   unsigned int num_push_constants = 0;

   push_constant_loc = ralloc_array(mem_ctx, int, uniforms);

   for (unsigned int i = 0; i < uniforms; i++) {
      if (!is_live[i] || pull_constant_loc[i] != -1) {
         /* This UNIFORM register is either dead, or has already been demoted
          * to a pull const.  Mark it as no longer living in the param[] array.
          */
         push_constant_loc[i] = -1;
         continue;
      }

      if (num_push_constants < max_push_components) {
         /* Retain as a push constant.  Record the location in the params[]
          * array.
          */
         push_constant_loc[i] = num_push_constants++;
      } else {
         /* Demote to a pull constant. */
         push_constant_loc[i] = -1;

         int pull_index = stage_prog_data->nr_pull_params++;
         stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
         pull_constant_loc[i] = pull_index;
      }
   }

   stage_prog_data->nr_params = num_push_constants;

   /* Up until now, the param[] array has been indexed by reg + reg_offset
    * of UNIFORM registers.  Condense it to only contain the uniforms we
    * chose to upload as push constants.
    */
   for (unsigned int i = 0; i < uniforms; i++) {
      int remapped = push_constant_loc[i];

      if (remapped == -1)
         continue;

      assert(remapped <= (int)i);
      stage_prog_data->param[remapped] = stage_prog_data->param[i];
   }
}
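
/* Worked example of the push/pull split above: max_push_components is
 * 16 * 8 == 128, so a shader with 150 live uniform components keeps the
 * first 128 in the CURBE as push constants and demotes the remaining 22
 * to pull constants, appending them to stage_prog_data->pull_param[].
 */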

/**
 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
 */
void
fs_visitor::demote_pull_constants()
{
   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index;
         unsigned location = inst->src[i].reg + inst->src[i].reg_offset;
         if (location >= uniforms) /* Out of bounds access */
            pull_index = -1;
         else
            pull_index = pull_constant_loc[location];

         if (pull_index == -1)
            continue;

         /* Set up the annotation tracking for newly generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
         fs_reg dst = vgrf(glsl_type::float_type);

         /* Generate a pull load into dst. */
         if (inst->src[i].reladdr) {
            exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
                                                        surf_index,
                                                        *inst->src[i].reladdr,
                                                        pull_index);
            inst->insert_before(block, &list);
            inst->src[i].reladdr = NULL;
         } else {
            fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
            fs_inst *pull =
               new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
                                    dst, surf_index, offset);
            inst->insert_before(block, pull);
            inst->src[i].set_smear(pull_index & 3);
         }

         /* Rewrite the instruction to use the temporary VGRF. */
         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].width = dispatch_width;
      }
   }
   invalidate_live_intervals();
}
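
/* Worked example of the non-reladdr path above: pull constants are fetched
 * as aligned vec4s, so pull_index == 6 yields offset (6 * 4) & ~15 == 16
 * (the second vec4 in the buffer), and set_smear(6 & 3) == set_smear(2)
 * selects the third component of the loaded vector.
 */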

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_MOV:
         if (inst->src[0].file != IMM)
            break;

         if (inst->saturate) {
            if (inst->dst.type != inst->src[0].type)
               assert(!"unimplemented: saturate mixed types");

            if (brw_saturate_immediate(inst->dst.type,
                                       &inst->src[0].fixed_hw_reg)) {
               inst->saturate = false;
               progress = true;
            }
         }
         break;

      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * -1.0 = -a */
         if (inst->src[1].is_negative_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].negate = !inst->src[0].negate;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         if (inst->src[0].file == IMM) {
            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].fixed_hw_reg.dw1.f *= inst->src[1].fixed_hw_reg.dw1.f;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         if (inst->src[0].file == IMM) {
            assert(inst->src[0].type == BRW_REGISTER_TYPE_F);
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0].fixed_hw_reg.dw1.f += inst->src[1].fixed_hw_reg.dw1.f;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_OR:
         if (inst->src[0].equals(inst->src[1])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_LRP:
         if (inst->src[1].equals(inst->src[2])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            inst->src[2] = reg_undef;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_CMP:
         if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
             inst->src[0].abs &&
             inst->src[0].negate &&
             inst->src[1].is_zero()) {
            inst->src[0].abs = false;
            inst->src[0].negate = false;
            inst->conditional_mod = BRW_CONDITIONAL_Z;
            progress = true;
            break;
         }
         break;
      case BRW_OPCODE_SEL:
         if (inst->src[0].equals(inst->src[1])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            inst->predicate = BRW_PREDICATE_NONE;
            inst->predicate_inverse = false;
            progress = true;
         } else if (inst->saturate && inst->src[1].file == IMM) {
            switch (inst->conditional_mod) {
            case BRW_CONDITIONAL_LE:
            case BRW_CONDITIONAL_L:
               switch (inst->src[1].type) {
               case BRW_REGISTER_TYPE_F:
                  if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
                     inst->opcode = BRW_OPCODE_MOV;
                     inst->src[1] = reg_undef;
                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
                     progress = true;
                  }
                  break;
               default:
                  break;
               }
               break;
            case BRW_CONDITIONAL_GE:
            case BRW_CONDITIONAL_G:
               switch (inst->src[1].type) {
               case BRW_REGISTER_TYPE_F:
                  if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
                     inst->opcode = BRW_OPCODE_MOV;
                     inst->src[1] = reg_undef;
                     inst->conditional_mod = BRW_CONDITIONAL_NONE;
                     progress = true;
                  }
                  break;
               default:
                  break;
               }
            default:
               break;
            }
         }
         break;
      case BRW_OPCODE_MAD:
         if (inst->src[1].is_zero() || inst->src[2].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            inst->src[2] = reg_undef;
            progress = true;
         } else if (inst->src[0].is_zero()) {
            inst->opcode = BRW_OPCODE_MUL;
            inst->src[0] = inst->src[2];
            inst->src[2] = reg_undef;
            progress = true;
         } else if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_ADD;
            inst->src[1] = inst->src[2];
            inst->src[2] = reg_undef;
            progress = true;
         } else if (inst->src[2].is_one()) {
            inst->opcode = BRW_OPCODE_ADD;
            inst->src[2] = reg_undef;
            progress = true;
         } else if (inst->src[1].file == IMM && inst->src[2].file == IMM) {
            inst->opcode = BRW_OPCODE_ADD;
            inst->src[1].fixed_hw_reg.dw1.f *= inst->src[2].fixed_hw_reg.dw1.f;
            inst->src[2] = reg_undef;
            progress = true;
         }
         break;
      case SHADER_OPCODE_RCP: {
         fs_inst *prev = (fs_inst *)inst->prev;
         if (prev->opcode == SHADER_OPCODE_SQRT) {
            if (inst->src[0].equals(prev->dst)) {
               inst->opcode = SHADER_OPCODE_RSQ;
               inst->src[0] = prev->src[0];
               progress = true;
            }
         }
         break;
      }
      case SHADER_OPCODE_BROADCAST:
         if (is_uniform(inst->src[0])) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->sources = 1;
            inst->force_writemask_all = true;
            progress = true;
         } else if (inst->src[1].file == IMM) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = component(inst->src[0],
                                     inst->src[1].fixed_hw_reg.dw1.ud);
            inst->sources = 1;
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }

      /* Swap if src[0] is immediate. */
      if (progress && inst->is_commutative()) {
         if (inst->src[0].file == IMM) {
            fs_reg tmp = inst->src[1];
            inst->src[1] = inst->src[0];
            inst->src[0] = tmp;
         }
      }
   }
   return progress;
}

/**
 * Optimize sample messages that have constant zero values for the trailing
 * texture coordinates. We can just reduce the message length for these
 * instructions instead of reserving a register for it. Trailing parameters
 * that aren't sent default to zero anyway. This will cause the dead code
 * eliminator to remove the MOV instruction that would otherwise be emitted to
 * set up the zero value.
 */
bool
fs_visitor::opt_zero_samples()
{
   /* Gen4 infers the texturing opcode based on the message length so we can't
    * change it.
    */
   if (devinfo->gen < 5)
      return false;

   bool progress = false;

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (!inst->is_tex())
         continue;

      fs_inst *load_payload = (fs_inst *) inst->prev;

      if (load_payload->is_head_sentinel() ||
          load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      /* We don't want to remove the message header or the first parameter.
       * Removing the first parameter is not allowed, see the Haswell PRM
       * volume 7, page 149:
       *
       *     "Parameter 0 is required except for the sampleinfo message, which
       *      has no parameter 0"
       */
      while (inst->mlen > inst->header_size + dispatch_width / 8 &&
             load_payload->src[(inst->mlen - inst->header_size) /
                               (dispatch_width / 8) +
                               inst->header_size - 1].is_zero()) {
         inst->mlen -= dispatch_width / 8;
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
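
/* Worked example of the trimming loop above (assuming a SIMD8 message,
 * where each parameter occupies one register): a sampler message with a
 * one-register header, mlen == 5 and a zero value in the last payload slot
 * shrinks to mlen == 4; the MOV that zeroed that slot becomes dead and is
 * cleaned up by dead code elimination later.
 */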

/**
 * Optimize sample messages which are followed by the final RT write.
 *
 * CHV and GEN9+ can mark a texturing SEND instruction with EOT to have its
 * results sent directly to the framebuffer, bypassing the EU.  Recognize the
 * final texturing results copied to the framebuffer write payload and modify
 * them to write to the framebuffer directly.
 */
bool
fs_visitor::opt_sampler_eot()
{
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;

   if (stage != MESA_SHADER_FRAGMENT)
      return false;

   if (devinfo->gen < 9 && !devinfo->is_cherryview)
      return false;

   /* FINISHME: It should be possible to implement this optimization when there
    * are multiple drawbuffers.
    */
   if (key->nr_color_regions != 1)
      return false;

   /* Look for a texturing instruction immediately before the final FB_WRITE. */
   fs_inst *fb_write = (fs_inst *) cfg->blocks[cfg->num_blocks - 1]->end();
   assert(fb_write->eot);
   assert(fb_write->opcode == FS_OPCODE_FB_WRITE);

   fs_inst *tex_inst = (fs_inst *) fb_write->prev;

   /* There wasn't one; nothing to do. */
   if (unlikely(tex_inst->is_head_sentinel()) || !tex_inst->is_tex())
      return false;

   /* This optimization doesn't seem to work for textureGather for some
    * reason. I can't find any documentation or known workarounds to indicate
    * that this is expected, but considering that it is probably pretty
    * unlikely that a shader would directly write out the results from
    * textureGather we might as well just disable it.
    */
   if (tex_inst->opcode == SHADER_OPCODE_TG4 ||
       tex_inst->opcode == SHADER_OPCODE_TG4_OFFSET)
      return false;

   /* If there's no header present, we need to munge the LOAD_PAYLOAD as well.
    * It's very likely to be the previous instruction.
    */
   fs_inst *load_payload = (fs_inst *) tex_inst->prev;
   if (load_payload->is_head_sentinel() ||
       load_payload->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
      return false;

   assert(!tex_inst->eot); /* We can't get here twice */
   assert((tex_inst->offset & (0xff << 24)) == 0);

   tex_inst->offset |= fb_write->target << 24;
   tex_inst->eot = true;
   tex_inst->dst = reg_null_ud;
   fb_write->remove(cfg->blocks[cfg->num_blocks - 1]);

   /* If a header is present, marking the eot is sufficient. Otherwise, we need
    * to create a new LOAD_PAYLOAD command with the same sources and a space
    * saved for the header. Using a new destination register not only makes sure
    * we have enough space, but it will make sure the dead code eliminator kills
    * the instruction that this will replace.
    */
   if (tex_inst->header_size != 0)
      return true;

   fs_reg send_header = vgrf(load_payload->sources + 1);
   fs_reg *new_sources =
      ralloc_array(mem_ctx, fs_reg, load_payload->sources + 1);

   new_sources[0] = fs_reg();
   for (int i = 0; i < load_payload->sources; i++)
      new_sources[i+1] = load_payload->src[i];

   /* The LOAD_PAYLOAD helper seems like the obvious choice here. However, it
    * requires a lot of information about the sources to appropriately figure
    * out the number of registers needed to be used. Given this stage in our
    * optimization, we may not have the appropriate GRFs required by
    * LOAD_PAYLOAD at this point (copy propagation). Therefore, we need to
    * manually emit the instruction.
    */
   fs_inst *new_load_payload = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD,
                                                    load_payload->exec_size,
                                                    send_header,
                                                    new_sources,
                                                    load_payload->sources + 1);

   new_load_payload->regs_written = load_payload->regs_written + 1;
   new_load_payload->header_size = 1;
   tex_inst->mlen++;
   tex_inst->header_size = 1;
   tex_inst->insert_before(cfg->blocks[cfg->num_blocks - 1], new_load_payload);
   tex_inst->src[0] = send_header;

   return true;
}

bool
fs_visitor::opt_register_renaming()
{
   bool progress = false;
   int depth = 0;

   int remap[alloc.count];
   memset(remap, -1, sizeof(int) * alloc.count);

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
         depth++;
      } else if (inst->opcode == BRW_OPCODE_ENDIF ||
                 inst->opcode == BRW_OPCODE_WHILE) {
         depth--;
      }

      /* Rewrite instruction sources. */
      for (int i = 0; i < inst->sources; i++) {
         if (inst->src[i].file == GRF &&
             remap[inst->src[i].reg] != -1 &&
             remap[inst->src[i].reg] != inst->src[i].reg) {
            inst->src[i].reg = remap[inst->src[i].reg];
            progress = true;
         }
      }

      const int dst = inst->dst.reg;

      if (depth == 0 &&
          inst->dst.file == GRF &&
          alloc.sizes[inst->dst.reg] == inst->dst.width / 8 &&
          !inst->is_partial_write()) {
         if (remap[dst] == -1) {
            remap[dst] = dst;
         } else {
            remap[dst] = alloc.allocate(inst->dst.width / 8);
            inst->dst.reg = remap[dst];
            progress = true;
         }
      } else if (inst->dst.file == GRF &&
                 remap[dst] != -1 &&
                 remap[dst] != dst) {
         inst->dst.reg = remap[dst];
         progress = true;
      }
   }

   if (progress) {
      invalidate_live_intervals();

      for (unsigned i = 0; i < ARRAY_SIZE(delta_xy); i++) {
         if (delta_xy[i].file == GRF && remap[delta_xy[i].reg] != -1) {
            delta_xy[i].reg = remap[delta_xy[i].reg];
         }
      }
   }

   return progress;
}

/**
 * Remove redundant or useless discard jumps.
 *
 * For example, we can eliminate jumps in the following sequence:
 *
 * discard-jump       (redundant with the next jump)
 * discard-jump       (useless; jumps to the next instruction)
 * placeholder-halt
 */
bool
fs_visitor::opt_redundant_discard_jumps()
{
   bool progress = false;

   bblock_t *last_bblock = cfg->blocks[cfg->num_blocks - 1];

   fs_inst *placeholder_halt = NULL;
   foreach_inst_in_block_reverse(fs_inst, inst, last_bblock) {
      if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT) {
         placeholder_halt = inst;
         break;
      }
   }

   if (!placeholder_halt)
      return false;

   /* Delete any HALTs immediately before the placeholder halt. */
   for (fs_inst *prev = (fs_inst *) placeholder_halt->prev;
        !prev->is_head_sentinel() && prev->opcode == FS_OPCODE_DISCARD_JUMP;
        prev = (fs_inst *) placeholder_halt->prev) {
      prev->remove(last_bblock);
      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   /* No MRFs on Gen >= 7. */
   if (devinfo->gen >= 7)
      return false;

   calculate_live_intervals();

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->is_partial_write() ||
          inst->dst.file != MRF || inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate ||
          !inst->src[0].is_contiguous() ||
          inst->src[0].subreg_offset)
         continue;

      /* Work out which hardware MRF registers are written by this
       * instruction.
       */
      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
      int mrf_high;
      if (inst->dst.reg & BRW_MRF_COMPR4) {
         mrf_high = mrf_low + 4;
      } else if (inst->exec_size == 16) {
         mrf_high = mrf_low + 1;
      } else {
         mrf_high = mrf_low;
      }

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_end[inst->src[0].reg] > ip)
         continue;

      /* Found a move of a GRF to an MRF.  Let's see if we can go
       * rewrite the thing that made this GRF to write into the MRF.
       */
      foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg) {
            /* Found the last thing to write our reg we want to turn
             * into a compute-to-MRF.
             */

            /* If this one instruction didn't populate all the
             * channels, bail.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking to delay the rewriting until complete success.
             */
            if (scan_inst->is_partial_write())
               break;

            /* Things returning more than one register would need us to
             * understand coalescing out more than one MOV at a time.
             */
            if (scan_inst->regs_written > scan_inst->dst.width / 8)
               break;

            /* SEND instructions can't have MRF as a destination. */
            if (scan_inst->mlen)
               break;

            if (devinfo->gen == 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               /* Found the creator of our MRF's source value. */
               scan_inst->dst.file = MRF;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->saturate |= inst->saturate;
               inst->remove(block);
               progress = true;
            }
            break;
         }

         /* We don't handle control flow here.  Most computation of
          * values that end up in MRFs are shortly before the MRF
          * write anyway.
          */
         if (block->start() == scan_inst)
            break;

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < scan_inst->sources; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (scan_inst->dst.file == MRF) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
            int scan_mrf_high;

            if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
               scan_mrf_high = scan_mrf_low + 4;
            } else if (scan_inst->exec_size == 16) {
               scan_mrf_high = scan_mrf_low + 1;
            } else {
               scan_mrf_high = scan_mrf_low;
            }

            if (mrf_low == scan_mrf_low ||
                mrf_low == scan_mrf_high ||
                mrf_high == scan_mrf_low ||
                mrf_high == scan_mrf_high) {
               break;
            }
         }

         if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            if (mrf_low >= scan_inst->base_mrf &&
                mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
            if (mrf_high >= scan_inst->base_mrf &&
                mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}
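
/* Note on the MRF ranges above (to the best of my understanding of COMPR4
 * addressing): a SIMD16 write to message register m2 with the BRW_MRF_COMPR4
 * bit set stores its halves in m2 and m6, hence mrf_high == mrf_low + 4 in
 * that case, while an ordinary SIMD16 write uses the consecutive pair
 * m2/m3, hence mrf_high == mrf_low + 1.
 */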

/**
 * Eliminate FIND_LIVE_CHANNEL instructions occurring outside any control
 * flow.  We could probably do better here with some form of divergence
 * analysis.
 */
bool
fs_visitor::eliminate_find_live_channel()
{
   bool progress = false;
   unsigned depth = 0;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      switch (inst->opcode) {
      case BRW_OPCODE_IF:
      case BRW_OPCODE_DO:
         depth++;
         break;

      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_WHILE:
         depth--;
         break;

      case FS_OPCODE_DISCARD_JUMP:
         /* This can potentially make control flow non-uniform until the end
          * of the program.
          */
         return progress;

      case SHADER_OPCODE_FIND_LIVE_CHANNEL:
         if (depth == 0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = fs_reg(0);
            inst->sources = 1;
            inst->force_writemask_all = true;
            progress = true;
         }
         break;

      default:
         break;
      }
   }

   return progress;
}

/**
 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
 * instructions to FS_OPCODE_REP_FB_WRITE.
 */
void
fs_visitor::emit_repclear_shader()
{
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   int base_mrf = 1;
   int color_mrf = base_mrf + 2;

   fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
                           fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
   mov->force_writemask_all = true;

   fs_inst *write;
   if (key->nr_color_regions == 1) {
      write = emit(FS_OPCODE_REP_FB_WRITE);
      write->saturate = key->clamp_fragment_color;
      write->base_mrf = color_mrf;
      write->target = 0;
      write->header_size = 0;
      write->mlen = 1;
   } else {
      assume(key->nr_color_regions > 0);
      for (int i = 0; i < key->nr_color_regions; ++i) {
         write = emit(FS_OPCODE_REP_FB_WRITE);
         write->saturate = key->clamp_fragment_color;
         write->base_mrf = base_mrf;
         write->target = i;
         write->header_size = 2;
         write->mlen = 3;
      }
   }
   write->eot = true;

   calculate_cfg();

   assign_constant_locations();
   assign_curb_setup();

   /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
   assert(mov->src[0].file == HW_REG);
   mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
}

/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->is_control_flow()) {
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove(block);
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0 && inst->base_mrf != -1) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < ARRAY_SIZE(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->is_partial_write()) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

static void
clear_deps_for_inst_src(fs_inst *inst, bool *deps, int first_grf, int grf_len)
{
   /* Clear the flag for registers that actually got read (as expected). */
   for (int i = 0; i < inst->sources; i++) {
      int grf;
      if (inst->src[i].file == GRF) {
         grf = inst->src[i].reg;
      } else if (inst->src[i].file == HW_REG &&
                 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         grf = inst->src[i].fixed_hw_reg.nr;
      } else {
         continue;
      }

      if (grf >= first_grf &&
          grf < first_grf + grf_len) {
         deps[grf - first_grf] = false;
         if (inst->exec_size == 16)
            deps[grf - first_grf + 1] = false;
      }
   }
}

/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
 *      check for post destination dependencies on this instruction, software
 *      must ensure that there is no destination hazard for the case of ‘write
 *      followed by a posted write’ shown in the following example.
 *
 *      1. mov r3 0
 *      2. send r3.xy <rest of send instruction>
 *      3. mov r2 r3
 *
 *      Due to no post-destination dependency check on the ‘send’, the above
 *      code sequence could have two instructions (1 and 2) in flight at the
 *      same time that both consider ‘r3’ as the target of their final writes.
 */
void
fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
                                                        fs_inst *inst)
{
   int write_len = inst->regs_written;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);

   clear_deps_for_inst_src(inst, needs_dep, first_write_grf, write_len);

   /* Walk backwards looking for writes to registers we're writing which
    * aren't read since being written.  If we hit the start of the program,
    * we assume that there are no outstanding dependencies on entry to the
    * program.
    */
   foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
      /* If we hit control flow, assume that there *are* outstanding
       * dependencies, and force their cleanup before our instruction.
       */
      if (block->start() == scan_inst) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i]) {
               inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
            }
         }
         return;
      }

      /* We insert our reads as late as possible on the assumption that any
       * instruction but a MOV that might have left us an outstanding
       * dependency has more latency than a MOV.
       */
      if (scan_inst->dst.file == GRF) {
         for (int i = 0; i < scan_inst->regs_written; i++) {
            int reg = scan_inst->dst.reg + i;

            if (reg >= first_write_grf &&
                reg < first_write_grf + write_len &&
                needs_dep[reg - first_write_grf]) {
               inst->insert_before(block, DEP_RESOLVE_MOV(reg));
               needs_dep[reg - first_write_grf] = false;
               if (scan_inst->exec_size == 16)
                  needs_dep[reg - first_write_grf + 1] = false;
            }
         }
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}

/**
 * Implements this workaround for the original 965:
 *
 *     "[DevBW, DevCL] Errata: A destination register from a send can not be
 *      used as a destination register until after it has been sourced by an
 *      instruction with a different destination register.
 */
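/* Illustratively, after
 *
 *      send r3 ...
 *
 * no instruction may make r3 a destination again until r3 has been
 * sourced, so the pass below inserts a dependency-resolving MOV on the
 * hazarded register ahead of any such write.
 */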
void
fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
{
   int write_len = inst->regs_written;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);
   /* Walk forwards looking for writes to registers we're writing which aren't
    * read before being written.
    */
   foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
      /* If we hit control flow, force resolve all remaining dependencies. */
      if (block->end() == scan_inst) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               scan_inst->insert_before(block,
                                        DEP_RESOLVE_MOV(first_write_grf + i));
         }
         return;
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, needs_dep, first_write_grf, write_len);

      /* We insert our reads as late as possible since they're reading the
       * result of a SEND, which has massive latency.
       */
      if (scan_inst->dst.file == GRF &&
          scan_inst->dst.reg >= first_write_grf &&
          scan_inst->dst.reg < first_write_grf + write_len &&
          needs_dep[scan_inst->dst.reg - first_write_grf]) {
         scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
      }

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}

void
fs_visitor::insert_gen4_send_dependency_workarounds()
{
   if (devinfo->gen != 4 || devinfo->is_g4x)
      return;

   bool progress = false;

   /* Note that we're done with register allocation, so GRF fs_regs always
    * have a .reg_offset of 0.
    */

   foreach_block_and_inst(block, fs_inst, inst, cfg) {
      if (inst->mlen != 0 && inst->dst.file == GRF) {
         insert_gen4_pre_send_dependency_workarounds(block, inst);
         insert_gen4_post_send_dependency_workarounds(block, inst);
         progress = true;
      }
   }

   if (progress)
      invalidate_live_intervals();
}

/**
 * Turns the generic expression-style uniform pull constant load instruction
 * into a hardware-specific series of instructions for loading a pull
 * constant.
 *
 * The expression style allows the CSE pass before this to optimize out
 * repeated loads from the same offset, and gives the pre-register-allocation
 * scheduling full flexibility, while the conversion to native instructions
 * allows the post-register-allocation scheduler the best information
 * possible.
 *
 * Note that execution masking for setting up pull constant loads is special:
 * the channels that need to be written are unrelated to the current execution
 * mask, since a later instruction will use one of the result channels as a
 * source operand for all 8 or 16 of its channels.
 */
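/* As a worked example (assuming Gen7+): a vec4-aligned byte offset of 16
 * in src[1] becomes the dword offset 4 (see the division by 4 below)
 * before being placed in the single-register payload of the SIMD4x2
 * message.
 */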
void
fs_visitor::lower_uniform_pull_constant_loads()
{
   foreach_block_and_inst (block, fs_inst, inst, cfg) {
      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
         continue;

      if (devinfo->gen >= 7) {
         /* The offset arg before was a vec4-aligned byte offset.  We need to
          * turn it into a dword offset.
          */
         fs_reg const_offset_reg = inst->src[1];
         assert(const_offset_reg.file == IMM &&
                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
         const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
         fs_reg payload = fs_reg(GRF, alloc.allocate(1));

         /* We have to use a message header on Skylake to get SIMD4x2 mode.
          * Reserve space for the register.
          */
         if (devinfo->gen >= 9) {
            payload.reg_offset++;
            alloc.sizes[payload.reg] = 2;
         }

         /* This is actually going to be a MOV, but since only the first dword
          * is accessed, we have a special opcode to do just that one.  Note
          * that this needs to be an operation that will be considered a def
          * by live variable analysis, or register allocation will explode.
          */
         fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
                                               8, payload, const_offset_reg);
         setup->force_writemask_all = true;

         setup->ir = inst->ir;
         setup->annotation = inst->annotation;
         inst->insert_before(block, setup);

         /* Similarly, this will only populate the first 4 channels of the
          * result register (since we only use smear values from 0-3), but we
          * don't tell the optimizer.
          */
         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
         inst->src[1] = payload;

         invalidate_live_intervals();
      } else {
         /* Before register allocation, we didn't tell the scheduler about the
          * MRF we use.  We know it's safe to use this MRF because nothing
          * else does except for register spill/unspill, which generates and
          * uses its MRF within a single IR instruction.
          */
         inst->base_mrf = 14;
         inst->mlen = 1;
      }
   }
}

bool
fs_visitor::lower_load_payload()
{
   bool progress = false;

   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->opcode != SHADER_OPCODE_LOAD_PAYLOAD)
         continue;

      assert(inst->dst.file == MRF || inst->dst.file == GRF);
      assert(inst->saturate == false);

      fs_reg dst = inst->dst;

      /* Get rid of COMPR4.  We'll add it back in if we need it */
      if (dst.file == MRF)
         dst.reg = dst.reg & ~BRW_MRF_COMPR4;

      dst.width = 8;
      for (uint8_t i = 0; i < inst->header_size; i++) {
         if (inst->src[i].file != BAD_FILE) {
            fs_reg mov_dst = retype(dst, BRW_REGISTER_TYPE_UD);
            fs_reg mov_src = retype(inst->src[i], BRW_REGISTER_TYPE_UD);
            mov_src.width = 8;
            fs_inst *mov = MOV(mov_dst, mov_src);
            mov->force_writemask_all = true;
            inst->insert_before(block, mov);
         }
         dst = offset(dst, 1);
      }

      dst.width = inst->exec_size;
      if (inst->dst.file == MRF && (inst->dst.reg & BRW_MRF_COMPR4) &&
          inst->exec_size > 8) {
         /* In this case, the payload portion of the LOAD_PAYLOAD isn't
          * a straightforward copy.  Instead, the result of the
          * LOAD_PAYLOAD is treated as interleaved and the first four
          * non-header sources are unpacked as:
          *
          * m + 0: r0
          * m + 1: g0
          * m + 2: b0
          * m + 3: a0
          * m + 4: r1
          * m + 5: g1
          * m + 6: b1
          * m + 7: a1
          *
          * This is used for gen <= 5 fb writes.
          */
         assert(inst->exec_size == 16);
         assert(inst->header_size + 4 <= inst->sources);
         for (uint8_t i = inst->header_size; i < inst->header_size + 4; i++) {
            if (inst->src[i].file != BAD_FILE) {
               if (devinfo->has_compr4) {
                  fs_reg compr4_dst = retype(dst, inst->src[i].type);
                  compr4_dst.reg |= BRW_MRF_COMPR4;

                  fs_inst *mov = MOV(compr4_dst, inst->src[i]);
                  mov->force_writemask_all = inst->force_writemask_all;
                  inst->insert_before(block, mov);
               } else {
                  /* Platform doesn't have COMPR4.  We have to fake it */
                  fs_reg mov_dst = retype(dst, inst->src[i].type);
                  mov_dst.width = 8;

                  fs_inst *mov = MOV(mov_dst, half(inst->src[i], 0));
                  mov->force_writemask_all = inst->force_writemask_all;
                  inst->insert_before(block, mov);

                  mov = MOV(offset(mov_dst, 4), half(inst->src[i], 1));
                  mov->force_writemask_all = inst->force_writemask_all;
                  mov->force_sechalf = true;
                  inst->insert_before(block, mov);
               }
            }

            dst.reg++;
         }

         /* The loop above only ever incremented us through the first set
          * of 4 registers.  However, thanks to the magic of COMPR4, we
          * actually wrote to the first 8 registers, so we need to take
          * that into account now.
          */
         dst.reg += 4;

         /* The COMPR4 code took care of the first 4 sources.  We'll let
          * the regular path handle any remaining sources.  Yes, we are
          * modifying the instruction but we're about to delete it so
          * this really doesn't hurt anything.
          */
         inst->header_size += 4;
      }

      for (uint8_t i = inst->header_size; i < inst->sources; i++) {
         if (inst->src[i].file != BAD_FILE) {
            fs_inst *mov = MOV(retype(dst, inst->src[i].type),
                               inst->src[i]);
            mov->force_writemask_all = inst->force_writemask_all;
            mov->force_sechalf = inst->force_sechalf;
            inst->insert_before(block, mov);
         }
         dst = offset(dst, 1);
      }

      inst->remove(block);
      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

bool
fs_visitor::lower_integer_multiplication()
{
   bool progress = false;

   /* Gen8's MUL instruction can do a 32-bit x 32-bit -> 32-bit operation
    * directly, but Cherryview cannot.
    */
   if (devinfo->gen >= 8 && !devinfo->is_cherryview)
      return false;

   foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
      if (inst->opcode != BRW_OPCODE_MUL ||
          inst->dst.is_accumulator() ||
          (inst->dst.type != BRW_REGISTER_TYPE_D &&
           inst->dst.type != BRW_REGISTER_TYPE_UD))
         continue;

#define insert(instr) inst->insert_before(block, instr)

      /* The MUL instruction isn't commutative. On Gen <= 6, only the low
       * 16-bits of src0 are read, and on Gen >= 7 only the low 16-bits of
       * src1 are used.
       *
       * If multiplying by an immediate value that fits in 16 bits, do a
       * single MUL instruction with that value in the proper location.
       */
      if (inst->src[1].file == IMM &&
          inst->src[1].fixed_hw_reg.dw1.ud < (1 << 16)) {
         if (devinfo->gen < 7) {
            fs_reg imm(GRF, alloc.allocate(dispatch_width / 8),
                       inst->dst.type, dispatch_width);
            insert(MOV(imm, inst->src[1]));
            insert(MUL(inst->dst, imm, inst->src[0]));
         } else {
            insert(MUL(inst->dst, inst->src[0], inst->src[1]));
         }
      } else {
         /* Gen < 8 (and some Gen8+ low-power parts like Cherryview) cannot
          * do 32-bit integer multiplication in one instruction, but instead
          * must do a sequence (which actually calculates a 64-bit result):
          *
          *    mul(8)  acc0<1>D   g3<8,8,1>D      g4<8,8,1>D
          *    mach(8) null       g3<8,8,1>D      g4<8,8,1>D
          *    mov(8)  g2<1>D     acc0<8,8,1>D
          *
          * But on Gen > 6, the ability to use the second accumulator
          * register (acc1) for non-float data types was removed, preventing
          * a simple implementation in SIMD16. A 16-channel result can be
          * calculated by executing the three instructions twice in SIMD8,
          * once with quarter control of 1Q for the first eight channels and
          * again with 2Q for the second eight channels.
          *
          * Which accumulator register is implicitly accessed (by AccWrEnable
          * for instance) is determined by the quarter control. Unfortunately
          * Ivybridge (and presumably Baytrail) has a hardware bug in which an
          * implicit accumulator access by an instruction with 2Q will access
          * acc1 regardless of whether the data type is usable in acc1.
          *
          * Specifically, the 2Q mach(8) writes acc1 which does not exist for
          * integer data types.
          *
          * Since we only want the low 32-bits of the result, we can do two
          * 32-bit x 16-bit multiplies (like the mul and mach are doing), and
          * adjust the high result and add them (like the mach is doing):
          *
          *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<8,8,1>UW
          *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<8,8,1>UW
          *    shl(8)  g9<1>D     g8<8,8,1>D      16D
          *    add(8)  g2<1>D     g7<8,8,1>D      g9<8,8,1>D
          *
          * We avoid the shl instruction by realizing that we only want to add
          * the low 16-bits of the "high" result to the high 16-bits of the
          * "low" result and using proper regioning on the add:
          *
          *    mul(8)  g7<1>D     g3<8,8,1>D      g4.0<16,8,2>UW
          *    mul(8)  g8<1>D     g3<8,8,1>D      g4.1<16,8,2>UW
          *    add(8)  g7.1<2>UW  g7.1<16,8,2>UW  g8<16,8,2>UW
          *
          * Since it does not use the (single) accumulator register, we can
          * schedule multi-component multiplications much better.
          */
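         /* Arithmetic sketch of why the low 32 bits come out right: with
          * src1 = b_lo + (b_hi << 16),
          *
          *    a * src1 mod 2^32 = (a * b_lo + ((a * b_hi) << 16)) mod 2^32
          *
          * The shift discards the upper half of a * b_hi, so only its low
          * 16 bits can reach the result -- exactly the contribution the UW
          * regioning on the final add makes to the high 16 bits of the low
          * product.
          */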

         fs_reg low = inst->dst;
         fs_reg high(GRF, alloc.allocate(dispatch_width / 8),
                     inst->dst.type, dispatch_width);

         if (brw->gen >= 7) {
            fs_reg src1_0_w = inst->src[1];
            fs_reg src1_1_w = inst->src[1];

            if (inst->src[1].file == IMM) {
               src1_0_w.fixed_hw_reg.dw1.ud &= 0xffff;
               src1_1_w.fixed_hw_reg.dw1.ud >>= 16;
            } else {
               src1_0_w.type = BRW_REGISTER_TYPE_UW;
               src1_0_w.stride = 2;

               src1_1_w.type = BRW_REGISTER_TYPE_UW;
               src1_1_w.stride = 2;
               src1_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);
            }
            insert(MUL(low, inst->src[0], src1_0_w));
            insert(MUL(high, inst->src[0], src1_1_w));
         } else {
            fs_reg src0_0_w = inst->src[0];
            fs_reg src0_1_w = inst->src[0];

            src0_0_w.type = BRW_REGISTER_TYPE_UW;
            src0_0_w.stride = 2;

            src0_1_w.type = BRW_REGISTER_TYPE_UW;
            src0_1_w.stride = 2;
            src0_1_w.subreg_offset += type_sz(BRW_REGISTER_TYPE_UW);

            insert(MUL(low, src0_0_w, inst->src[1]));
            insert(MUL(high, src0_1_w, inst->src[1]));
         }

         fs_reg dst = inst->dst;
         dst.type = BRW_REGISTER_TYPE_UW;
         dst.subreg_offset = 2;
         dst.stride = 2;

         high.type = BRW_REGISTER_TYPE_UW;
         high.stride = 2;

         low.type = BRW_REGISTER_TYPE_UW;
         low.subreg_offset = 2;
         low.stride = 2;

         insert(ADD(dst, low, high));
      }
#undef insert

      inst->remove(block);
      progress = true;
   }

   if (progress)
      invalidate_live_intervals();

   return progress;
}

void
fs_visitor::dump_instructions()
{
   dump_instructions(NULL);
}

void
fs_visitor::dump_instructions(const char *name)
{
   FILE *file = stderr;
   if (name && geteuid() != 0) {
      file = fopen(name, "w");
      if (!file)
         file = stderr;
   }

   if (cfg) {
      calculate_register_pressure();
      int ip = 0, max_pressure = 0;
      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
         max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
         fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
         dump_instruction(inst, file);
         ip++;
      }
      fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
   } else {
      int ip = 0;
      foreach_in_list(backend_instruction, inst, &instructions) {
         fprintf(file, "%4d: ", ip++);
         dump_instruction(inst, file);
      }
   }

   if (file != stderr) {
      fclose(file);
   }
}

void
fs_visitor::dump_instruction(backend_instruction *be_inst)
{
   dump_instruction(be_inst, stderr);
}

void
fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
{
   fs_inst *inst = (fs_inst *)be_inst;

   if (inst->predicate) {
      fprintf(file, "(%cf0.%d) ",
             inst->predicate_inverse ? '-' : '+',
             inst->flag_subreg);
   }

   fprintf(file, "%s", brw_instruction_name(inst->opcode));
   if (inst->saturate)
      fprintf(file, ".sat");
   if (inst->conditional_mod) {
      fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
      if (!inst->predicate &&
          (devinfo->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                              inst->opcode != BRW_OPCODE_IF &&
                              inst->opcode != BRW_OPCODE_WHILE))) {
         fprintf(file, ".f0.%d", inst->flag_subreg);
      }
   }
   fprintf(file, "(%d) ", inst->exec_size);

   switch (inst->dst.file) {
   case GRF:
      fprintf(file, "vgrf%d", inst->dst.reg);
      if (inst->dst.width != dispatch_width)
         fprintf(file, "@%d", inst->dst.width);
      if (alloc.sizes[inst->dst.reg] != inst->dst.width / 8 ||
          inst->dst.subreg_offset)
         fprintf(file, "+%d.%d",
                 inst->dst.reg_offset, inst->dst.subreg_offset);
      break;
   case MRF:
      fprintf(file, "m%d", inst->dst.reg);
      break;
   case BAD_FILE:
      fprintf(file, "(null)");
      break;
   case UNIFORM:
      fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
      break;
   case ATTR:
      fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
      break;
   case HW_REG:
      if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
         switch (inst->dst.fixed_hw_reg.nr) {
         case BRW_ARF_NULL:
            fprintf(file, "null");
            break;
         case BRW_ARF_ADDRESS:
            fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_ACCUMULATOR:
            fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
            break;
         case BRW_ARF_FLAG:
            fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                             inst->dst.fixed_hw_reg.subnr);
            break;
         default:
            fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
                               inst->dst.fixed_hw_reg.subnr);
            break;
         }
      } else {
         fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
      }
      if (inst->dst.fixed_hw_reg.subnr)
         fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
      break;
   default:
      fprintf(file, "???");
      break;
   }
   fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));

   for (int i = 0; i < inst->sources; i++) {
      if (inst->src[i].negate)
         fprintf(file, "-");
      if (inst->src[i].abs)
         fprintf(file, "|");
      switch (inst->src[i].file) {
      case GRF:
         fprintf(file, "vgrf%d", inst->src[i].reg);
         if (inst->src[i].width != dispatch_width)
            fprintf(file, "@%d", inst->src[i].width);
         if (alloc.sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
             inst->src[i].subreg_offset)
            fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                    inst->src[i].subreg_offset);
         break;
      case MRF:
         fprintf(file, "***m%d***", inst->src[i].reg);
         break;
      case ATTR:
         fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
         break;
      case UNIFORM:
         fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
         if (inst->src[i].reladdr) {
            fprintf(file, "+reladdr");
         } else if (inst->src[i].subreg_offset) {
            fprintf(file, "+%d.%d", inst->src[i].reg_offset,
                    inst->src[i].subreg_offset);
         }
         break;
      case BAD_FILE:
         fprintf(file, "(null)");
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
            break;
         case BRW_REGISTER_TYPE_W:
         case BRW_REGISTER_TYPE_D:
            fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
            break;
         case BRW_REGISTER_TYPE_UW:
         case BRW_REGISTER_TYPE_UD:
            fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
            break;
         case BRW_REGISTER_TYPE_VF:
            fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  0) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >>  8) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
                    brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
            break;
         default:
            fprintf(file, "???");
            break;
         }
         break;
      case HW_REG:
         if (inst->src[i].fixed_hw_reg.negate)
            fprintf(file, "-");
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
            switch (inst->src[i].fixed_hw_reg.nr) {
            case BRW_ARF_NULL:
               fprintf(file, "null");
               break;
            case BRW_ARF_ADDRESS:
               fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_ACCUMULATOR:
               fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
               break;
            case BRW_ARF_FLAG:
               fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                                inst->src[i].fixed_hw_reg.subnr);
               break;
            default:
               fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
                                  inst->src[i].fixed_hw_reg.subnr);
               break;
            }
         } else {
            fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
         }
         if (inst->src[i].fixed_hw_reg.subnr)
            fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
         if (inst->src[i].fixed_hw_reg.abs)
            fprintf(file, "|");
         break;
      default:
         fprintf(file, "???");
         break;
      }
      if (inst->src[i].abs)
         fprintf(file, "|");

      if (inst->src[i].file != IMM) {
         fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
      }

      if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
         fprintf(file, ", ");
   }

   fprintf(file, " ");

   if (dispatch_width == 16 && inst->exec_size == 8) {
      if (inst->force_sechalf)
         fprintf(file, "2ndhalf ");
      else
         fprintf(file, "1sthalf ");
   }

   fprintf(file, "\n");
}

/**
 * Possibly returns an instruction that set up @param reg.
 *
 * Sometimes we want to take the result of some expression/variable
 * dereference tree and rewrite the instruction generating the result
 * of the tree.  When processing the tree, we know that the
 * instructions generated are all writing temporaries that are dead
 * outside of this tree.  So, if we have some instructions that write
 * a temporary, we're free to point that temp write somewhere else.
 *
 * Note that this doesn't guarantee that the instruction generated
 * only reg -- it might be the size=4 destination of a texture instruction.
 */
fs_inst *
fs_visitor::get_instruction_generating_reg(fs_inst *start,
                                           fs_inst *end,
                                           const fs_reg &reg)
{
   if (end == start ||
       end->is_partial_write() ||
       reg.reladdr ||
       !reg.equals(end->dst)) {
      return NULL;
   } else {
      return end;
   }
}

void
fs_visitor::setup_payload_gen6()
{
   bool uses_depth =
      (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
   unsigned barycentric_interp_modes =
      (stage == MESA_SHADER_FRAGMENT) ?
      ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;

   assert(devinfo->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   payload.num_regs = 2;
   /* R2: only for 32-pixel dispatch. */

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
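   /* For example (illustrative): with two barycentric modes enabled, the
    * two coordinate sets consume 4 payload registers in SIMD8 and 8 in
    * SIMD16.
    */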
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         payload.barycentric_coord_reg[i] = payload.num_regs;
         payload.num_regs += 2;
         if (dispatch_width == 16) {
            payload.num_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      payload.source_depth_reg = payload.num_regs;
      payload.num_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not SIMD8. */
         payload.num_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
   if (uses_depth) {
      payload.source_w_reg = payload.num_regs;
      payload.num_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not SIMD8. */
         payload.num_regs++;
      }
   }

   if (stage == MESA_SHADER_FRAGMENT) {
      brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
      brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
      prog_data->uses_pos_offset = key->compute_pos_offset;
      /* R31: MSAA position offsets. */
      if (prog_data->uses_pos_offset) {
         payload.sample_pos_reg = payload.num_regs;
         payload.num_regs++;
      }
   }

   /* R32: MSAA input coverage mask */
   if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
      assert(devinfo->gen >= 7);
      payload.sample_mask_in_reg = payload.num_regs;
      payload.num_regs++;
      if (dispatch_width == 16) {
         /* R33: input coverage mask if not SIMD8. */
         payload.num_regs++;
      }
   }

   /* R34-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      source_depth_to_render_target = true;
   }
}

void
fs_visitor::setup_vs_payload()
{
   /* R0: thread header, R1: urb handles */
   payload.num_regs = 2;
}

void
fs_visitor::setup_cs_payload()
{
   assert(brw->gen >= 7);

   payload.num_regs = 1;
}

void
fs_visitor::assign_binding_table_offsets()
{
   assert(stage == MESA_SHADER_FRAGMENT);
   brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
   brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
   uint32_t next_binding_table_offset = 0;

   /* If there are no color regions, we still perform an FB write to a null
    * renderbuffer, which we place at surface index 0.
    */
   prog_data->binding_table.render_target_start = next_binding_table_offset;
   next_binding_table_offset += MAX2(key->nr_color_regions, 1);

   assign_common_binding_table_offsets(next_binding_table_offset);
}

void
fs_visitor::calculate_register_pressure()
{
   invalidate_live_intervals();
   calculate_live_intervals();

   unsigned num_instructions = 0;
   foreach_block(block, cfg)
      num_instructions += block->instructions.length();

   regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);

   for (unsigned reg = 0; reg < alloc.count; reg++) {
      for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
         regs_live_at_ip[ip] += alloc.sizes[reg];
   }
}

void
fs_visitor::optimize()
{
   split_virtual_grfs();

   move_uniform_array_access_to_pull_constants();
   assign_constant_locations();
   demote_pull_constants();

#define OPT(pass, args...) ({                                           \
      pass_num++;                                                       \
      bool this_progress = pass(args);                                  \
                                                                        \
      if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) {   \
         char filename[64];                                             \
         snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass,           \
                  stage_abbrev, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
                                                                        \
         backend_visitor::dump_instructions(filename);                  \
      }                                                                 \
                                                                        \
      progress = progress || this_progress;                             \
      this_progress;                                                    \
   })
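   /* Note that OPT() is a GNU statement expression: besides accumulating
    * `progress` and optionally dumping the IR when INTEL_DEBUG=optimizer
    * is set, it evaluates to whether this particular pass made progress,
    * which is why it can appear as a condition (see lower_load_payload()
    * below).
    */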

   if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
      char filename[64];
      snprintf(filename, 64, "%s%d-%04d-00-start",
               stage_abbrev, dispatch_width,
               shader_prog ? shader_prog->Name : 0);

      backend_visitor::dump_instructions(filename);
   }

   bool progress;
   int iteration = 0;
   int pass_num = 0;
   do {
      progress = false;
      pass_num = 0;
      iteration++;

      OPT(remove_duplicate_mrf_writes);

      OPT(opt_algebraic);
      OPT(opt_cse);
      OPT(opt_copy_propagate);
      OPT(opt_peephole_predicated_break);
      OPT(opt_cmod_propagation);
      OPT(dead_code_eliminate);
      OPT(opt_peephole_sel);
      OPT(dead_control_flow_eliminate, this);
      OPT(opt_register_renaming);
      OPT(opt_redundant_discard_jumps);
      OPT(opt_saturate_propagation);
      OPT(opt_zero_samples);
      OPT(register_coalesce);
      OPT(compute_to_mrf);
      OPT(eliminate_find_live_channel);

      OPT(compact_virtual_grfs);
   } while (progress);

   pass_num = 0;

   OPT(opt_sampler_eot);

   if (OPT(lower_load_payload)) {
      split_virtual_grfs();
      OPT(register_coalesce);
      OPT(compute_to_mrf);
      OPT(dead_code_eliminate);
   }

   OPT(opt_combine_constants);
   OPT(lower_integer_multiplication);

   lower_uniform_pull_constant_loads();
}

/**
 * A three-source instruction must have a GRF/MRF destination register.
 * ARF NULL is not allowed.  Fix that up by allocating a temporary GRF.
 */
void
fs_visitor::fixup_3src_null_dest()
{
   foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
      if (inst->is_3src() && inst->dst.is_null()) {
         inst->dst = fs_reg(GRF, alloc.allocate(dispatch_width / 8),
                            inst->dst.type);
      }
   }
}

void
fs_visitor::allocate_registers()
{
   bool allocated_without_spills;

   static const enum instruction_scheduler_mode pre_modes[] = {
      SCHEDULE_PRE,
      SCHEDULE_PRE_NON_LIFO,
      SCHEDULE_PRE_LIFO,
   };

   /* Try each scheduling heuristic to see if it can successfully register
    * allocate without spilling.  They should be ordered by decreasing
    * performance but increasing likelihood of allocating.
    */
   for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
      schedule_instructions(pre_modes[i]);

      if (0) {
         assign_regs_trivial();
         allocated_without_spills = true;
      } else {
         allocated_without_spills = assign_regs(false);
      }
      if (allocated_without_spills)
         break;
   }

   if (!allocated_without_spills) {
      /* We assume that any spilling is worse than just dropping back to
       * SIMD8.  There's probably actually some intermediate point where
       * SIMD16 with a couple of spills is still better.
       */
      if (dispatch_width == 16) {
         fail("Failure to register allocate.  Reduce number of "
              "live scalar values to avoid this.");
      } else {
         perf_debug("%s shader triggered register spilling.  "
                    "Try reducing the number of live scalar values to "
                    "improve performance.\n", stage_name);
      }

      /* Since we're out of heuristics, just go spill registers until we
       * get an allocation.
       */
      while (!assign_regs(true)) {
         if (failed)
            break;
      }
   }

   /* This must come after all optimization and register allocation, since
    * it inserts dead code that happens to have side effects, and it does
    * so based on the actual physical registers in use.
    */
   insert_gen4_send_dependency_workarounds();

   if (failed)
      return;

   if (!allocated_without_spills)
      schedule_instructions(SCHEDULE_POST);

   if (last_scratch > 0)
      prog_data->total_scratch = brw_get_scratch_size(last_scratch);
}

bool
fs_visitor::run_vs()
{
   assert(stage == MESA_SHADER_VERTEX);

   assign_common_binding_table_offsets(0);
   setup_vs_payload();

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_begin();

   if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_VERTEX].NirOptions) {
      emit_nir_code();
   } else {
      foreach_in_list(ir_instruction, ir, shader->base.ir) {
         base_ir = ir;
         this->result = reg_undef;
         ir->accept(this);
      }
      base_ir = NULL;
   }

   if (failed)
      return false;

   emit_urb_writes();

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_end();

   calculate_cfg();

   optimize();

   assign_curb_setup();
   assign_vs_urb_setup();

   fixup_3src_null_dest();
   allocate_registers();

   return !failed;
}

bool
fs_visitor::run_fs()
{
   brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
   brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;

   assert(stage == MESA_SHADER_FRAGMENT);

   sanity_param_count = prog->Parameters->NumParameters;

   assign_binding_table_offsets();

   if (devinfo->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      emit_dummy_fs();
   } else if (brw->use_rep_send && dispatch_width == 16) {
      emit_repclear_shader();
   } else {
      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_begin();

      calculate_urb_setup();
      if (prog->InputsRead > 0) {
         if (devinfo->gen < 6)
            emit_interpolation_setup_gen4();
         else
            emit_interpolation_setup_gen6();
      }

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (wm_prog_data->uses_kill) {
         fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
         discard_init->flag_subreg = 1;
      }

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      if (brw->ctx.Const.ShaderCompilerOptions[MESA_SHADER_FRAGMENT].NirOptions) {
         emit_nir_code();
      } else if (shader) {
         foreach_in_list(ir_instruction, ir, shader->base.ir) {
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         emit_fragment_program_code();
      }
      base_ir = NULL;
      if (failed)
         return false;

      if (wm_prog_data->uses_kill)
         emit(FS_OPCODE_PLACEHOLDER_HALT);

      if (wm_key->alpha_test_func)
         emit_alpha_test();

      emit_fb_writes();

      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_end();

      calculate_cfg();

      optimize();

      assign_curb_setup();
      assign_urb_setup();

      fixup_3src_null_dest();
      allocate_registers();

      if (failed)
         return false;
   }

   if (dispatch_width == 8)
      wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
   else
      wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == prog->Parameters->NumParameters);

   return !failed;
}

bool
fs_visitor::run_cs()
{
   assert(stage == MESA_SHADER_COMPUTE);
   assert(shader);

   sanity_param_count = prog->Parameters->NumParameters;

   assign_common_binding_table_offsets(0);

   setup_cs_payload();

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_begin();

   emit_nir_code();

   if (failed)
      return false;

   emit_cs_terminate();

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_end();

   calculate_cfg();

   optimize();

   assign_curb_setup();

   fixup_3src_null_dest();
   allocate_registers();

   if (failed)
      return false;

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == prog->Parameters->NumParameters);

   return !failed;
}

const unsigned *
brw_wm_fs_emit(struct brw_context *brw,
               void *mem_ctx,
               const struct brw_wm_prog_key *key,
               struct brw_wm_prog_data *prog_data,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   bool start_busy = false;
   double start_time = 0;

   if (unlikely(brw->perf_debug)) {
      start_busy = (brw->batch.last_bo &&
                    drm_intel_bo_busy(brw->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM))
      brw_dump_ir("fragment", prog, &shader->base, &fp->Base);

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
                prog, &fp->Base, 8);
   if (!v.run_fs()) {
      if (prog) {
         prog->LinkStatus = false;
         ralloc_strcat(&prog->InfoLog, v.fail_msg);
      }

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }

   cfg_t *simd16_cfg = NULL;
   fs_visitor v2(brw, mem_ctx, MESA_SHADER_FRAGMENT, key, &prog_data->base,
                 prog, &fp->Base, 16);
   if (likely(!(INTEL_DEBUG & DEBUG_NO16) || brw->use_rep_send)) {
      if (!v.simd16_unsupported) {
         /* Try a SIMD16 compile */
         v2.import_uniforms(&v);
         if (!v2.run_fs()) {
            perf_debug("SIMD16 shader failed to compile, falling back to "
                       "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
         } else {
            simd16_cfg = v2.cfg;
         }
      } else {
         perf_debug("SIMD16 shader unsupported, falling back to "
                    "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
      }
   }

   cfg_t *simd8_cfg;
   int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
   if ((no_simd8 || brw->gen < 5) && simd16_cfg) {
      simd8_cfg = NULL;
      prog_data->no_8 = true;
   } else {
      simd8_cfg = v.cfg;
      prog_data->no_8 = false;
   }

   fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
                  &fp->Base, v.promoted_constants, v.runtime_check_aads_emit, "FS");

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      char *name;
      if (prog)
         name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
                                prog->Label ? prog->Label : "unnamed",
                                prog->Name);
      else
         name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);

      g.enable_debug(name);
   }

   if (simd8_cfg)
      g.generate_code(simd8_cfg, 8);
   if (simd16_cfg)
      prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);

   if (unlikely(brw->perf_debug) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return g.get_assembly(final_assembly_size);
}

extern "C" bool
brw_fs_precompile(struct gl_context *ctx,
                  struct gl_shader_program *shader_prog,
                  struct gl_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct brw_wm_prog_key key;

   struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
   struct brw_fragment_program *bfp = brw_fragment_program(fp);
   bool program_uses_dfdy = fp->UsesDFdy;

   memset(&key, 0, sizeof(key));

   if (brw->gen < 6) {
      if (fp->UsesKill)
         key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;

      if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
         key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

      /* Just assume depth testing. */
      key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
      key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
   }

   if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
                                         BRW_FS_VARYING_INPUT_MASK) > 16)
      key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;

   brw_setup_tex_for_precompile(brw, &key.tex, &fp->Base);

   if (fp->Base.InputsRead & VARYING_BIT_POS) {
      key.drawable_height = ctx->DrawBuffer->Height;
   }

   key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
         ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
         BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));

   if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
                          key.nr_color_regions > 1;
   }

   key.program_string_id = bfp->id;

   uint32_t old_prog_offset = brw->wm.base.prog_offset;
   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;

   bool success = brw_codegen_wm_prog(brw, shader_prog, bfp, &key);

   brw->wm.base.prog_offset = old_prog_offset;
   brw->wm.prog_data = old_prog_data;

   return success;
}

void
brw_setup_tex_for_precompile(struct brw_context *brw,
                             struct brw_sampler_prog_key_data *tex,
                             struct gl_program *prog)
{
   const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
   unsigned sampler_count = _mesa_fls(prog->SamplersUsed);
   for (unsigned i = 0; i < sampler_count; i++) {
      if (!has_shader_channel_select && (prog->ShadowSamplers & (1 << i))) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         tex->swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         tex->swizzles[i] = SWIZZLE_XYZW;
      }
   }
}