/*
2
 * Copyright © 2010 Intel Corporation
3
 *
4
 * Permission is hereby granted, free of charge, to any person obtaining a
5
 * copy of this software and associated documentation files (the "Software"),
6
 * to deal in the Software without restriction, including without limitation
7
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8
 * and/or sell copies of the Software, and to permit persons to whom the
9
 * Software is furnished to do so, subject to the following conditions:
10
 *
11
 * The above copyright notice and this permission notice (including the next
12
 * paragraph) shall be included in all copies or substantial portions of the
13
 * Software.
14
 *
15
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21
 * IN THE SOFTWARE.
22
 *
23
 * Authors:
24
 *    Eric Anholt <eric@anholt.net>
25
 *
26
 */
27
 
28
#include "brw_fs.h"
29
#include "brw_vec4.h"
30
#include "brw_cfg.h"
31
#include "brw_shader.h"
32
#include "glsl/glsl_types.h"
33
#include "glsl/ir_optimization.h"
34
 
35
using namespace brw;
36
 
37
/** @file brw_fs_schedule_instructions.cpp
38
 *
39
 * List scheduling of FS instructions.
40
 *
41
 * The basic model of the list scheduler is to take a basic block,
42
 * compute a DAG of the dependencies (RAW ordering with latency, WAW
43
 * ordering with latency, WAR ordering), and make a list of the DAG heads.
44
 * Heuristically pick a DAG head, then put all the children that are
45
 * now DAG heads into the list of things to schedule.
46
 *
47
 * The heuristic is the important part.  We're trying to be cheap,
48
 * since actually computing the optimal scheduling is NP-complete.
49
 * What we do is track a "current clock".  When we schedule a node, we
50
 * update the earliest-unblocked clock time of its children, and
51
 * increment the clock.  Then, when trying to schedule, we just pick
52
 * the earliest-unblocked instruction to schedule.
53
 *
54
 * Note that often there will be many things which could execute
55
 * immediately, and there are a range of heuristic options to choose
56
 * from in picking among those.
57
 */
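/* Concretely, the flow implemented below is roughly the following (a
 * simplified sketch of run() and schedule_instructions(), not the exact
 * code):
 *
 *    for each basic block:
 *       add_insts_from_block(block);    // move the insts onto a work list
 *       calculate_deps();               // build the RAW/WAW/WAR dependency DAG
 *       for each node n:
 *          compute_delay(n);            // critical-path delay to end of block
 *       schedule_instructions(block);   // repeatedly pick an unblocked DAG
 *                                       // head and advance the clock
 */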
58
 
59
static bool debug = false;
60
 
61
class instruction_scheduler;
62
 
63
class schedule_node : public exec_node
64
{
65
public:
66
   schedule_node(backend_instruction *inst, instruction_scheduler *sched);
67
   void set_latency_gen4();
68
   void set_latency_gen7(bool is_haswell);
69
 
70
   backend_instruction *inst;
71
   schedule_node **children;
72
   int *child_latency;
73
   int child_count;
74
   int parent_count;
75
   int child_array_size;
76
   int unblocked_time;
77
   int latency;
78
 
79
   /**
80
    * Which iteration of pushing groups of children onto the candidates list
81
    * this node was a part of.
82
    */
83
   unsigned cand_generation;
84
 
85
   /**
86
    * This is the sum of the instruction's latency plus the maximum delay of
87
    * its children, or just the issue_time if it's a leaf node.
88
    */
89
   int delay;
90
};
91
 
92
void
93
schedule_node::set_latency_gen4()
94
{
95
   int chans = 8;
96
   int math_latency = 22;
97
 
98
   switch (inst->opcode) {
99
   case SHADER_OPCODE_RCP:
100
      this->latency = 1 * chans * math_latency;
101
      break;
102
   case SHADER_OPCODE_RSQ:
103
      this->latency = 2 * chans * math_latency;
104
      break;
105
   case SHADER_OPCODE_INT_QUOTIENT:
106
   case SHADER_OPCODE_SQRT:
107
   case SHADER_OPCODE_LOG2:
108
      /* full precision log.  partial is 2. */
109
      this->latency = 3 * chans * math_latency;
110
      break;
111
   case SHADER_OPCODE_INT_REMAINDER:
112
   case SHADER_OPCODE_EXP2:
113
      /* full precision.  partial is 3, same throughput. */
114
      this->latency = 4 * chans * math_latency;
115
      break;
116
   case SHADER_OPCODE_POW:
117
      this->latency = 8 * chans * math_latency;
118
      break;
119
   case SHADER_OPCODE_SIN:
120
   case SHADER_OPCODE_COS:
121
      /* minimum latency, max is 12 rounds. */
122
      this->latency = 5 * chans * math_latency;
123
      break;
124
   default:
125
      this->latency = 2;
126
      break;
127
   }
128
}
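
/* As a worked example of the model above: SHADER_OPCODE_POW is treated as 8
 * rounds through the shared math unit, so its modeled latency is
 * 8 * 8 channels * 22 cycles = 1408 cycles, while an ordinary ALU instruction
 * is just 2.
 */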
129
 
130
void
131
schedule_node::set_latency_gen7(bool is_haswell)
132
{
133
   switch (inst->opcode) {
134
   case BRW_OPCODE_MAD:
135
      /* 2 cycles
136
       *  (since the last two src operands are in different register banks):
137
       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
138
       *
139
       * 3 cycles on IVB, 4 on HSW
140
       *  (since the last two src operands are in the same register bank):
141
       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
142
       *
143
       * 18 cycles on IVB, 16 on HSW
144
       *  (since the last two src operands are in different register banks):
145
       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
146
       * mov(8) null   g4<4,5,1>F                     { align16 WE_normal 1Q };
147
       *
148
       * 20 cycles on IVB, 18 on HSW
149
       *  (since the last two src operands are in the same register bank):
150
       * mad(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
151
       * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
152
       */
153
 
154
      /* Our register allocator doesn't know about register banks, so use the
155
       * higher latency.
156
       */
157
      latency = is_haswell ? 16 : 18;
158
      break;
159
 
160
   case BRW_OPCODE_LRP:
161
      /* 2 cycles
162
       *  (since the last two src operands are in different register banks):
163
       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
164
       *
165
       * 3 cycles on IVB, 4 on HSW
166
       *  (since the last two src operands are in the same register bank):
167
       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
168
       *
169
       * 16 cycles on IVB, 14 on HSW
170
       *  (since the last two src operands are in different register banks):
171
       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g3.1<4,4,1>F.x { align16 WE_normal 1Q };
172
       * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
173
       *
174
       * 16 cycles
175
       *  (since the last two src operands are in the same register bank):
176
       * lrp(8) g4<1>F g2.2<4,4,1>F.x  g2<4,4,1>F.x g2.1<4,4,1>F.x { align16 WE_normal 1Q };
177
       * mov(8) null   g4<4,4,1>F                     { align16 WE_normal 1Q };
178
       */
179
 
180
      /* Our register allocator doesn't know about register banks, so use the
181
       * higher latency.
182
       */
183
      latency = 14;
184
      break;
185
 
186
   case SHADER_OPCODE_RCP:
187
   case SHADER_OPCODE_RSQ:
188
   case SHADER_OPCODE_SQRT:
189
   case SHADER_OPCODE_LOG2:
190
   case SHADER_OPCODE_EXP2:
191
   case SHADER_OPCODE_SIN:
192
   case SHADER_OPCODE_COS:
193
      /* 2 cycles:
194
       * math inv(8) g4<1>F g2<0,1,0>F      null       { align1 WE_normal 1Q };
195
       *
196
       * 18 cycles:
197
       * math inv(8) g4<1>F g2<0,1,0>F      null       { align1 WE_normal 1Q };
198
       * mov(8)      null   g4<8,8,1>F                 { align1 WE_normal 1Q };
199
       *
200
       * Same for exp2, log2, rsq, sqrt, sin, cos.
201
       */
202
      latency = is_haswell ? 14 : 16;
203
      break;
204
 
205
   case SHADER_OPCODE_POW:
206
      /* 2 cycles:
207
       * math pow(8) g4<1>F g2<0,1,0>F   g2.1<0,1,0>F  { align1 WE_normal 1Q };
208
       *
209
       * 26 cycles:
210
       * math pow(8) g4<1>F g2<0,1,0>F   g2.1<0,1,0>F  { align1 WE_normal 1Q };
211
       * mov(8)      null   g4<8,8,1>F                 { align1 WE_normal 1Q };
212
       */
213
      latency = is_haswell ? 22 : 24;
214
      break;
215
 
216
   case SHADER_OPCODE_TEX:
217
   case SHADER_OPCODE_TXD:
218
   case SHADER_OPCODE_TXF:
219
   case SHADER_OPCODE_TXL:
220
      /* 18 cycles:
221
       * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
222
       * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
223
       * send(8) g4<1>UW    g114<8,8,1>F
224
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
225
       *
226
       * 697 +/-49 cycles (min 610, n=26):
227
       * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
228
       * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
229
       * send(8) g4<1>UW    g114<8,8,1>F
230
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
231
       * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
232
       *
233
       * So the latency on our first texture load of the batchbuffer takes
234
       * ~700 cycles, since the caches are cold at that point.
235
       *
236
       * 840 +/- 92 cycles (min 720, n=25):
237
       * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
238
       * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
239
       * send(8) g4<1>UW    g114<8,8,1>F
240
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
241
       * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
242
       * send(8) g4<1>UW    g114<8,8,1>F
243
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
244
       * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
245
       *
246
       * On the second load, it takes just an extra ~140 cycles, and after
247
       * accounting for the 14 cycles of the MOV's latency, that makes ~130.
248
       *
249
       * 683 +/- 49 cycles (min = 602, n=47):
250
       * mov(8)  g115<1>F   0F                         { align1 WE_normal 1Q };
251
       * mov(8)  g114<1>F   0F                         { align1 WE_normal 1Q };
252
       * send(8) g4<1>UW    g114<8,8,1>F
253
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
254
       * send(8) g50<1>UW   g114<8,8,1>F
255
       *   sampler (10, 0, 0, 1) mlen 2 rlen 4         { align1 WE_normal 1Q };
256
       * mov(8)  null       g4<8,8,1>F                 { align1 WE_normal 1Q };
257
       *
258
       * The unit appears to be pipelined, since this matches up with the
259
       * cache-cold case, despite there being two loads here.  If you replace
260
       * the g4 in the MOV to null with g50, it's still 693 +/- 52 (n=39).
261
       *
262
       * So, take some number between the cache-hot 140 cycles and the
263
       * cache-cold 700 cycles.  No particular tuning was done on this.
264
       *
265
       * I haven't done significant testing of the non-TEX opcodes.  TXL at
266
       * least looked about the same as TEX.
267
       */
268
      latency = 200;
269
      break;
270
 
271
   case SHADER_OPCODE_TXS:
272
      /* Testing textureSize(sampler2D, 0), one load was 420 +/- 41
273
       * cycles (n=15):
274
       * mov(8)   g114<1>UD  0D                        { align1 WE_normal 1Q };
275
       * send(8)  g6<1>UW    g114<8,8,1>F
276
       *   sampler (10, 0, 10, 1) mlen 1 rlen 4        { align1 WE_normal 1Q };
277
       * mov(16)  g6<1>F     g6<8,8,1>D                { align1 WE_normal 1Q };
278
       *
279
       *
280
       * Two loads was 535 +/- 30 cycles (n=19):
281
       * mov(16)   g114<1>UD  0D                       { align1 WE_normal 1H };
282
       * send(16)  g6<1>UW    g114<8,8,1>F
283
       *   sampler (10, 0, 10, 2) mlen 2 rlen 8        { align1 WE_normal 1H };
284
       * mov(16)   g114<1>UD  0D                       { align1 WE_normal 1H };
285
       * mov(16)   g6<1>F     g6<8,8,1>D               { align1 WE_normal 1H };
286
       * send(16)  g8<1>UW    g114<8,8,1>F
287
       *   sampler (10, 0, 10, 2) mlen 2 rlen 8        { align1 WE_normal 1H };
288
       * mov(16)   g8<1>F     g8<8,8,1>D               { align1 WE_normal 1H };
289
       * add(16)   g6<1>F     g6<8,8,1>F   g8<8,8,1>F  { align1 WE_normal 1H };
290
       *
291
       * Since the only caches that should matter are just the
292
       * instruction/state cache containing the surface state, assume that we
293
       * always have hot caches.
294
       */
295
      latency = 100;
296
      break;
297
 
298
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
299
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
300
   case VS_OPCODE_PULL_CONSTANT_LOAD:
301
      /* testing using varying-index pull constants:
302
       *
303
       * 16 cycles:
304
       * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
305
       * send(8) g4<1>F  g4<8,8,1>D
306
       *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
307
       *
308
       * ~480 cycles:
309
       * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
310
       * send(8) g4<1>F  g4<8,8,1>D
311
       *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
312
       * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
313
       *
314
       * ~620 cycles:
315
       * mov(8)  g4<1>D  g2.1<0,1,0>F                  { align1 WE_normal 1Q };
316
       * send(8) g4<1>F  g4<8,8,1>D
317
       *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
318
       * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
319
       * send(8) g4<1>F  g4<8,8,1>D
320
       *   data (9, 2, 3) mlen 1 rlen 1                { align1 WE_normal 1Q };
321
       * mov(8)  null    g4<8,8,1>F                    { align1 WE_normal 1Q };
322
       *
323
       * So, if it's cache-hot, it's about 140.  If it's cache cold, it's
324
       * about 460.  We expect to mostly be cache hot, so pick something more
325
       * in that direction.
326
       */
327
      latency = 200;
328
      break;
329
 
330
   case SHADER_OPCODE_GEN7_SCRATCH_READ:
331
      /* Testing a load from offset 0, that had been previously written:
332
       *
333
       * send(8) g114<1>UW g0<8,8,1>F data (0, 0, 0) mlen 1 rlen 1 { align1 WE_normal 1Q };
334
       * mov(8)  null      g114<8,8,1>F { align1 WE_normal 1Q };
335
       *
336
       * The cycles spent seemed to be grouped around 40-50 (as low as 38),
337
       * then around 140.  Presumably this is cache hit vs miss.
338
       */
339
      latency = 50;
340
      break;
341
 
342
   case SHADER_OPCODE_UNTYPED_ATOMIC:
343
   case SHADER_OPCODE_TYPED_ATOMIC:
344
      /* Test code:
345
       *   mov(8)    g112<1>ud       0x00000000ud       { align1 WE_all 1Q };
346
       *   mov(1)    g112.7<1>ud     g1.7<0,1,0>ud      { align1 WE_all };
347
       *   mov(8)    g113<1>ud       0x00000000ud       { align1 WE_normal 1Q };
348
       *   send(8)   g4<1>ud         g112<8,8,1>ud
349
       *             data (38, 5, 6) mlen 2 rlen 1      { align1 WE_normal 1Q };
350
       *
351
       * Running it 100 times as fragment shader on a 128x128 quad
352
       * gives an average latency of 13867 cycles per atomic op,
353
       * standard deviation 3%.  Note that this is a rather
354
       * pessimistic estimate, the actual latency in cases with few
355
       * collisions between threads and favorable pipelining has been
356
       * seen to be reduced by a factor of 100.
357
       */
358
      latency = 14000;
359
      break;
360
 
361
   case SHADER_OPCODE_UNTYPED_SURFACE_READ:
362
   case SHADER_OPCODE_UNTYPED_SURFACE_WRITE:
363
   case SHADER_OPCODE_TYPED_SURFACE_READ:
364
   case SHADER_OPCODE_TYPED_SURFACE_WRITE:
365
      /* Test code:
366
       *   mov(8)    g112<1>UD       0x00000000UD       { align1 WE_all 1Q };
367
       *   mov(1)    g112.7<1>UD     g1.7<0,1,0>UD      { align1 WE_all };
368
       *   mov(8)    g113<1>UD       0x00000000UD       { align1 WE_normal 1Q };
369
       *   send(8)   g4<1>UD         g112<8,8,1>UD
370
       *             data (38, 6, 5) mlen 2 rlen 1      { align1 WE_normal 1Q };
371
       *   .
372
       *   . [repeats 8 times]
373
       *   .
374
       *   mov(8)    g112<1>UD       0x00000000UD       { align1 WE_all 1Q };
375
       *   mov(1)    g112.7<1>UD     g1.7<0,1,0>UD      { align1 WE_all };
376
       *   mov(8)    g113<1>UD       0x00000000UD       { align1 WE_normal 1Q };
377
       *   send(8)   g4<1>UD         g112<8,8,1>UD
378
       *             data (38, 6, 5) mlen 2 rlen 1      { align1 WE_normal 1Q };
379
       *
380
       * Running it 100 times as fragment shader on a 128x128 quad
381
       * gives an average latency of 583 cycles per surface read,
382
       * standard deviation 0.9%.
383
       */
384
      latency = is_haswell ? 300 : 600;
385
      break;
386
 
387
   default:
388
      /* 2 cycles:
389
       * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
390
       *
391
       * 16 cycles:
392
       * mul(8) g4<1>F g2<0,1,0>F      0.5F            { align1 WE_normal 1Q };
393
       * mov(8) null   g4<8,8,1>F                      { align1 WE_normal 1Q };
394
       */
395
      latency = 14;
396
      break;
397
   }
398
}
399
 
400
class instruction_scheduler {
401
public:
402
   instruction_scheduler(backend_visitor *v, int grf_count,
403
                         instruction_scheduler_mode mode)
404
   {
405
      this->bv = v;
406
      this->mem_ctx = ralloc_context(NULL);
407
      this->grf_count = grf_count;
408
      this->instructions.make_empty();
409
      this->instructions_to_schedule = 0;
410
      this->post_reg_alloc = (mode == SCHEDULE_POST);
411
      this->mode = mode;
412
      this->time = 0;
413
      if (!post_reg_alloc) {
414
         this->remaining_grf_uses = rzalloc_array(mem_ctx, int, grf_count);
415
         this->grf_active = rzalloc_array(mem_ctx, bool, grf_count);
416
      } else {
417
         this->remaining_grf_uses = NULL;
418
         this->grf_active = NULL;
419
      }
420
   }
421
 
422
   ~instruction_scheduler()
423
   {
424
      ralloc_free(this->mem_ctx);
425
   }
426
   void add_barrier_deps(schedule_node *n);
427
   void add_dep(schedule_node *before, schedule_node *after, int latency);
428
   void add_dep(schedule_node *before, schedule_node *after);
429
 
430
   void run(cfg_t *cfg);
431
   void add_insts_from_block(bblock_t *block);
432
   void compute_delay(schedule_node *node);
433
   virtual void calculate_deps() = 0;
434
   virtual schedule_node *choose_instruction_to_schedule() = 0;
435
 
436
   /**
437
    * Returns how many cycles it takes the instruction to issue.
438
    *
439
    * Instructions in gen hardware are handled one simd4 vector at a time,
440
    * with 1 cycle per vector dispatched.  Thus SIMD8 pixel shaders take 2
441
    * cycles to dispatch and SIMD16 (compressed) instructions take 4.
442
    */
443
   virtual int issue_time(backend_instruction *inst) = 0;
444
 
445
   virtual void count_remaining_grf_uses(backend_instruction *inst) = 0;
446
   virtual void update_register_pressure(backend_instruction *inst) = 0;
447
   virtual int get_register_pressure_benefit(backend_instruction *inst) = 0;
448
 
449
   void schedule_instructions(bblock_t *block);
450
 
451
   void *mem_ctx;
452
 
453
   bool post_reg_alloc;
454
   int instructions_to_schedule;
455
   int grf_count;
456
   int time;
457
   exec_list instructions;
458
   backend_visitor *bv;
459
 
460
   instruction_scheduler_mode mode;
461
 
462
   /**
463
    * Number of instructions left to schedule that reference each vgrf.
464
    *
465
    * Used so that we can prefer scheduling instructions that will end the
466
    * live intervals of multiple variables, to reduce register pressure.
467
    */
468
   int *remaining_grf_uses;
469
 
470
   /**
471
    * Tracks whether each VGRF has had an instruction scheduled that uses it.
472
    *
473
    * This is used to estimate whether scheduling a new instruction will
474
    * increase register pressure.
475
    */
476
   bool *grf_active;
477
};
478
 
479
class fs_instruction_scheduler : public instruction_scheduler
480
{
481
public:
482
   fs_instruction_scheduler(fs_visitor *v, int grf_count,
483
                            instruction_scheduler_mode mode);
484
   void calculate_deps();
485
   bool is_compressed(fs_inst *inst);
486
   schedule_node *choose_instruction_to_schedule();
487
   int issue_time(backend_instruction *inst);
488
   fs_visitor *v;
489
 
490
   void count_remaining_grf_uses(backend_instruction *inst);
491
   void update_register_pressure(backend_instruction *inst);
492
   int get_register_pressure_benefit(backend_instruction *inst);
493
};
494
 
495
fs_instruction_scheduler::fs_instruction_scheduler(fs_visitor *v,
496
                                                   int grf_count,
497
                                                   instruction_scheduler_mode mode)
498
   : instruction_scheduler(v, grf_count, mode),
499
     v(v)
500
{
501
}
502
 
503
void
504
fs_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be)
505
{
506
   fs_inst *inst = (fs_inst *)be;
507
 
508
   if (!remaining_grf_uses)
509
      return;
510
 
511
   if (inst->dst.file == GRF)
512
      remaining_grf_uses[inst->dst.reg]++;
513
 
514
   for (int i = 0; i < inst->sources; i++) {
515
      if (inst->src[i].file != GRF)
516
         continue;
517
 
518
      remaining_grf_uses[inst->src[i].reg]++;
519
   }
520
}
521
 
522
void
523
fs_instruction_scheduler::update_register_pressure(backend_instruction *be)
524
{
525
   fs_inst *inst = (fs_inst *)be;
526
 
527
   if (!remaining_grf_uses)
528
      return;
529
 
530
   if (inst->dst.file == GRF) {
531
      remaining_grf_uses[inst->dst.reg]--;
532
      grf_active[inst->dst.reg] = true;
533
   }
534
 
535
   for (int i = 0; i < inst->sources; i++) {
536
      if (inst->src[i].file == GRF) {
537
         remaining_grf_uses[inst->src[i].reg]--;
538
         grf_active[inst->src[i].reg] = true;
539
      }
540
   }
541
}
542
 
543
int
544
fs_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
545
{
546
   fs_inst *inst = (fs_inst *)be;
547
   int benefit = 0;
548
 
549
   if (inst->dst.file == GRF) {
550
      if (remaining_grf_uses[inst->dst.reg] == 1)
551
         benefit += v->alloc.sizes[inst->dst.reg];
552
      if (!grf_active[inst->dst.reg])
553
         benefit -= v->alloc.sizes[inst->dst.reg];
554
   }
555
 
556
   for (int i = 0; i < inst->sources; i++) {
557
      if (inst->src[i].file != GRF)
558
         continue;
559
 
560
      if (remaining_grf_uses[inst->src[i].reg] == 1)
561
         benefit += v->alloc.sizes[inst->src[i].reg];
562
      if (!grf_active[inst->src[i].reg])
563
         benefit -= v->alloc.sizes[inst->src[i].reg];
564
   }
565
 
566
   return benefit;
567
}
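
/* Worked example of the benefit metric above, with hypothetical sizes: an
 * instruction whose two GRF sources (sizes 2 and 4) are each at their last
 * remaining use, and whose GRF destination (size 4) is not yet active, scores
 * +2 + 4 - 4 = +2, i.e. scheduling it should free two registers on net.
 */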
568
 
569
class vec4_instruction_scheduler : public instruction_scheduler
570
{
571
public:
572
   vec4_instruction_scheduler(vec4_visitor *v, int grf_count);
573
   void calculate_deps();
574
   schedule_node *choose_instruction_to_schedule();
575
   int issue_time(backend_instruction *inst);
576
   vec4_visitor *v;
577
 
578
   void count_remaining_grf_uses(backend_instruction *inst);
579
   void update_register_pressure(backend_instruction *inst);
580
   int get_register_pressure_benefit(backend_instruction *inst);
581
};
582
 
583
vec4_instruction_scheduler::vec4_instruction_scheduler(vec4_visitor *v,
584
                                                       int grf_count)
585
   : instruction_scheduler(v, grf_count, SCHEDULE_POST),
586
     v(v)
587
{
588
}
589
 
590
void
591
vec4_instruction_scheduler::count_remaining_grf_uses(backend_instruction *be)
592
{
593
}
594
 
595
void
596
vec4_instruction_scheduler::update_register_pressure(backend_instruction *be)
597
{
598
}
599
 
600
int
601
vec4_instruction_scheduler::get_register_pressure_benefit(backend_instruction *be)
602
{
603
   return 0;
604
}
605
 
606
schedule_node::schedule_node(backend_instruction *inst,
607
                             instruction_scheduler *sched)
608
{
609
   const struct brw_device_info *devinfo = sched->bv->devinfo;
610
 
611
   this->inst = inst;
612
   this->child_array_size = 0;
613
   this->children = NULL;
614
   this->child_latency = NULL;
615
   this->child_count = 0;
616
   this->parent_count = 0;
617
   this->unblocked_time = 0;
618
   this->cand_generation = 0;
619
   this->delay = 0;
620
 
621
   /* We can't measure Gen6 timings directly but expect them to be much
622
    * closer to Gen7 than Gen4.
623
    */
624
   if (!sched->post_reg_alloc)
625
      this->latency = 1;
626
   else if (devinfo->gen >= 6)
627
      set_latency_gen7(devinfo->is_haswell);
628
   else
629
      set_latency_gen4();
630
}
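
/* Note that before register allocation every node's latency is forced to 1
 * above, so the pre-RA scheduling modes effectively ignore the latency model
 * and optimize for register pressure (with dependency-chain depth as a
 * tie-breaker) instead.
 */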
631
 
632
void
633
instruction_scheduler::add_insts_from_block(bblock_t *block)
634
{
635
   /* Removing the last instruction from a basic block removes the block as
636
    * well, so put a NOP at the end to keep it alive.
637
    */
638
   if (!block->end()->is_control_flow()) {
639
      backend_instruction *nop = new(mem_ctx) backend_instruction();
640
      nop->opcode = BRW_OPCODE_NOP;
641
      block->end()->insert_after(block, nop);
642
   }
643
 
644
   foreach_inst_in_block_safe(backend_instruction, inst, block) {
645
      if (inst->opcode == BRW_OPCODE_NOP || inst->is_control_flow())
646
         continue;
647
 
648
      schedule_node *n = new(mem_ctx) schedule_node(inst, this);
649
 
650
      this->instructions_to_schedule++;
651
 
652
      inst->remove(block);
653
      instructions.push_tail(n);
654
   }
655
}
656
 
657
/** Recursive computation of the delay member of a node. */
658
void
659
instruction_scheduler::compute_delay(schedule_node *n)
660
{
661
   if (!n->child_count) {
662
      n->delay = issue_time(n->inst);
663
   } else {
664
      for (int i = 0; i < n->child_count; i++) {
665
         if (!n->children[i]->delay)
666
            compute_delay(n->children[i]);
667
         n->delay = MAX2(n->delay, n->latency + n->children[i]->delay);
668
      }
669
   }
670
}
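
/* The delay computed here is the critical-path length, in modeled cycles,
 * from this node to the end of its dependency chain within the block.
 * choose_instruction_to_schedule() uses it as a tie-breaker so that long
 * dependency chains get started early.
 */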
671
 
672
/**
673
 * Add a dependency between two instruction nodes.
674
 *
675
 * The @after node will be scheduled after @before.  We will try to
676
 * schedule it @latency cycles after @before, but no guarantees there.
677
 */
678
void
679
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after,
680
                               int latency)
681
{
682
   if (!before || !after)
683
      return;
684
 
685
   assert(before != after);
686
 
687
   for (int i = 0; i < before->child_count; i++) {
688
      if (before->children[i] == after) {
689
         before->child_latency[i] = MAX2(before->child_latency[i], latency);
690
         return;
691
      }
692
   }
693
 
694
   if (before->child_array_size <= before->child_count) {
695
      if (before->child_array_size < 16)
696
         before->child_array_size = 16;
697
      else
698
         before->child_array_size *= 2;
699
 
700
      before->children = reralloc(mem_ctx, before->children,
701
                                  schedule_node *,
702
                                  before->child_array_size);
703
      before->child_latency = reralloc(mem_ctx, before->child_latency,
704
                                       int, before->child_array_size);
705
   }
706
 
707
   before->children[before->child_count] = after;
708
   before->child_latency[before->child_count] = latency;
709
   before->child_count++;
710
   after->parent_count++;
711
}
712
 
713
void
714
instruction_scheduler::add_dep(schedule_node *before, schedule_node *after)
715
{
716
   if (!before)
717
      return;
718
 
719
   add_dep(before, after, before->latency);
720
}
721
 
722
/**
723
 * Sometimes we really want this node to execute after everything that
724
 * was before it and before everything that followed it.  This adds
725
 * the deps to do so.
726
 */
727
void
728
instruction_scheduler::add_barrier_deps(schedule_node *n)
729
{
730
   schedule_node *prev = (schedule_node *)n->prev;
731
   schedule_node *next = (schedule_node *)n->next;
732
 
733
   if (prev) {
734
      while (!prev->is_head_sentinel()) {
735
         add_dep(prev, n, 0);
736
         prev = (schedule_node *)prev->prev;
737
      }
738
   }
739
 
740
   if (next) {
741
      while (!next->is_tail_sentinel()) {
742
         add_dep(n, next, 0);
743
         next = (schedule_node *)next->next;
744
      }
745
   }
746
}
747
 
748
/* instruction scheduling needs to be aware of when an MRF write
749
 * actually writes 2 MRFs.
750
 */
751
bool
752
fs_instruction_scheduler::is_compressed(fs_inst *inst)
753
{
754
   return inst->exec_size == 16;
755
}
756
 
757
void
758
fs_instruction_scheduler::calculate_deps()
759
{
760
   /* Pre-register-allocation, this tracks the last write per VGRF offset.
761
    * After register allocation, reg_offsets are gone and we track individual
762
    * GRF registers.
763
    */
764
   schedule_node *last_grf_write[grf_count * 16];
765
   schedule_node *last_mrf_write[BRW_MAX_MRF];
766
   schedule_node *last_conditional_mod[2] = { NULL, NULL };
767
   schedule_node *last_accumulator_write = NULL;
768
   /* Fixed HW registers are assumed to be separate from the virtual
769
    * GRFs, so they can be tracked separately.  We don't really write
770
    * to fixed GRFs much, so don't bother tracking them on a more
771
    * granular level.
772
    */
773
   schedule_node *last_fixed_grf_write = NULL;
774
   int reg_width = v->dispatch_width / 8;
775
 
776
   /* The last instruction always needs to still be the last
777
    * instruction.  Either it's flow control (IF, ELSE, ENDIF, DO,
778
    * WHILE) and scheduling other things after it would disturb the
779
    * basic block, or it's FB_WRITE and we should do a better job at
780
    * dead code elimination anyway.
781
    */
782
   schedule_node *last = (schedule_node *)instructions.get_tail();
783
   add_barrier_deps(last);
784
 
785
   memset(last_grf_write, 0, sizeof(last_grf_write));
786
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
787
 
788
   /* top-to-bottom dependencies: RAW and WAW. */
789
   foreach_in_list(schedule_node, n, &instructions) {
790
      fs_inst *inst = (fs_inst *)n->inst;
791
 
792
      if (inst->opcode == FS_OPCODE_PLACEHOLDER_HALT ||
793
         inst->has_side_effects())
794
         add_barrier_deps(n);
795
 
796
      /* read-after-write deps. */
797
      for (int i = 0; i < inst->sources; i++) {
798
         if (inst->src[i].file == GRF) {
799
            if (post_reg_alloc) {
800
               for (int r = 0; r < inst->regs_read(i); r++)
801
                  add_dep(last_grf_write[inst->src[i].reg + r], n);
802
            } else {
803
               for (int r = 0; r < inst->regs_read(i); r++) {
804
                  add_dep(last_grf_write[inst->src[i].reg * 16 + inst->src[i].reg_offset + r], n);
805
               }
806
            }
807
         } else if (inst->src[i].file == HW_REG &&
808
                    (inst->src[i].fixed_hw_reg.file ==
809
                     BRW_GENERAL_REGISTER_FILE)) {
810
            if (post_reg_alloc) {
811
               int size = reg_width;
812
               if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
813
                  size = 1;
814
               for (int r = 0; r < size; r++)
815
                  add_dep(last_grf_write[inst->src[i].fixed_hw_reg.nr + r], n);
816
            } else {
817
               add_dep(last_fixed_grf_write, n);
818
            }
819
         } else if (inst->src[i].is_accumulator()) {
820
            add_dep(last_accumulator_write, n);
821
         } else if (inst->src[i].file != BAD_FILE &&
822
                    inst->src[i].file != IMM &&
823
                    inst->src[i].file != UNIFORM &&
824
                    (inst->src[i].file != HW_REG ||
825
                     inst->src[i].fixed_hw_reg.file != IMM)) {
826
            assert(inst->src[i].file != MRF);
827
            add_barrier_deps(n);
828
         }
829
      }
830
 
831
      if (inst->base_mrf != -1) {
832
         for (int i = 0; i < inst->mlen; i++) {
833
            /* It looks like the MRF regs are released in the send
834
             * instruction once it's sent, not when the result comes
835
             * back.
836
             */
837
            add_dep(last_mrf_write[inst->base_mrf + i], n);
838
         }
839
      }
840
 
841
      if (inst->reads_flag()) {
842
         add_dep(last_conditional_mod[inst->flag_subreg], n);
843
      }
844
 
845
      if (inst->reads_accumulator_implicitly()) {
846
         add_dep(last_accumulator_write, n);
847
      }
848
 
849
      /* write-after-write deps. */
850
      if (inst->dst.file == GRF) {
851
         if (post_reg_alloc) {
852
            for (int r = 0; r < inst->regs_written; r++) {
853
               add_dep(last_grf_write[inst->dst.reg + r], n);
854
               last_grf_write[inst->dst.reg + r] = n;
855
            }
856
         } else {
857
            for (int r = 0; r < inst->regs_written; r++) {
858
               add_dep(last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r], n);
859
               last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r] = n;
860
            }
861
         }
862
      } else if (inst->dst.file == MRF) {
863
         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
864
 
865
         add_dep(last_mrf_write[reg], n);
866
         last_mrf_write[reg] = n;
867
         if (is_compressed(inst)) {
868
            if (inst->dst.reg & BRW_MRF_COMPR4)
869
               reg += 4;
870
            else
871
               reg++;
872
            add_dep(last_mrf_write[reg], n);
873
            last_mrf_write[reg] = n;
874
         }
875
      } else if (inst->dst.file == HW_REG &&
876
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
877
         if (post_reg_alloc) {
878
            for (int r = 0; r < reg_width; r++)
879
               last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
880
         } else {
881
            last_fixed_grf_write = n;
882
         }
883
      } else if (inst->dst.is_accumulator()) {
884
         add_dep(last_accumulator_write, n);
885
         last_accumulator_write = n;
886
      } else if (inst->dst.file != BAD_FILE &&
887
                 !inst->dst.is_null()) {
888
         add_barrier_deps(n);
889
      }
890
 
891
      if (inst->mlen > 0 && inst->base_mrf != -1) {
892
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
893
            add_dep(last_mrf_write[inst->base_mrf + i], n);
894
            last_mrf_write[inst->base_mrf + i] = n;
895
         }
896
      }
897
 
898
      if (inst->writes_flag()) {
899
         add_dep(last_conditional_mod[inst->flag_subreg], n, 0);
900
         last_conditional_mod[inst->flag_subreg] = n;
901
      }
902
 
903
      if (inst->writes_accumulator_implicitly(v->devinfo) &&
904
          !inst->dst.is_accumulator()) {
905
         add_dep(last_accumulator_write, n);
906
         last_accumulator_write = n;
907
      }
908
   }
909
 
910
   /* bottom-to-top dependencies: WAR */
911
   memset(last_grf_write, 0, sizeof(last_grf_write));
912
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
913
   memset(last_conditional_mod, 0, sizeof(last_conditional_mod));
914
   last_accumulator_write = NULL;
915
   last_fixed_grf_write = NULL;
916
 
917
   exec_node *node;
918
   exec_node *prev;
919
   for (node = instructions.get_tail(), prev = node->prev;
920
        !node->is_head_sentinel();
921
        node = prev, prev = node->prev) {
922
      schedule_node *n = (schedule_node *)node;
923
      fs_inst *inst = (fs_inst *)n->inst;
924
 
925
      /* write-after-read deps. */
926
      for (int i = 0; i < inst->sources; i++) {
927
         if (inst->src[i].file == GRF) {
928
            if (post_reg_alloc) {
929
               for (int r = 0; r < inst->regs_read(i); r++)
930
                  add_dep(n, last_grf_write[inst->src[i].reg + r]);
931
            } else {
932
               for (int r = 0; r < inst->regs_read(i); r++) {
933
                  add_dep(n, last_grf_write[inst->src[i].reg * 16 + inst->src[i].reg_offset + r]);
934
               }
935
            }
936
         } else if (inst->src[i].file == HW_REG &&
937
                    (inst->src[i].fixed_hw_reg.file ==
938
                     BRW_GENERAL_REGISTER_FILE)) {
939
            if (post_reg_alloc) {
940
               int size = reg_width;
941
               if (inst->src[i].fixed_hw_reg.vstride == BRW_VERTICAL_STRIDE_0)
942
                  size = 1;
943
               for (int r = 0; r < size; r++)
944
                  add_dep(n, last_grf_write[inst->src[i].fixed_hw_reg.nr + r]);
945
            } else {
946
               add_dep(n, last_fixed_grf_write);
947
            }
948
         } else if (inst->src[i].is_accumulator()) {
949
            add_dep(n, last_accumulator_write);
950
         } else if (inst->src[i].file != BAD_FILE &&
951
                    inst->src[i].file != IMM &&
952
                    inst->src[i].file != UNIFORM &&
953
                    (inst->src[i].file != HW_REG ||
954
                     inst->src[i].fixed_hw_reg.file != IMM)) {
955
            assert(inst->src[i].file != MRF);
956
            add_barrier_deps(n);
957
         }
958
      }
959
 
960
      if (inst->base_mrf != -1) {
961
         for (int i = 0; i < inst->mlen; i++) {
962
            /* It looks like the MRF regs are released in the send
963
             * instruction once it's sent, not when the result comes
964
             * back.
965
             */
966
            add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
967
         }
968
      }
969
 
970
      if (inst->reads_flag()) {
971
         add_dep(n, last_conditional_mod[inst->flag_subreg]);
972
      }
973
 
974
      if (inst->reads_accumulator_implicitly()) {
975
         add_dep(n, last_accumulator_write);
976
      }
977
 
978
      /* Update the things this instruction wrote, so earlier reads
979
       * can mark this as WAR dependency.
980
       */
981
      if (inst->dst.file == GRF) {
982
         if (post_reg_alloc) {
983
            for (int r = 0; r < inst->regs_written; r++)
984
               last_grf_write[inst->dst.reg + r] = n;
985
         } else {
986
            for (int r = 0; r < inst->regs_written; r++) {
987
               last_grf_write[inst->dst.reg * 16 + inst->dst.reg_offset + r] = n;
988
            }
989
         }
990
      } else if (inst->dst.file == MRF) {
991
         int reg = inst->dst.reg & ~BRW_MRF_COMPR4;
992
 
993
         last_mrf_write[reg] = n;
994
 
995
         if (is_compressed(inst)) {
996
            if (inst->dst.reg & BRW_MRF_COMPR4)
997
               reg += 4;
998
            else
999
               reg++;
1000
 
1001
            last_mrf_write[reg] = n;
1002
         }
1003
      } else if (inst->dst.file == HW_REG &&
1004
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
1005
         if (post_reg_alloc) {
1006
            for (int r = 0; r < reg_width; r++)
1007
               last_grf_write[inst->dst.fixed_hw_reg.nr + r] = n;
1008
         } else {
1009
            last_fixed_grf_write = n;
1010
         }
1011
      } else if (inst->dst.is_accumulator()) {
1012
         last_accumulator_write = n;
1013
      } else if (inst->dst.file != BAD_FILE &&
1014
                 !inst->dst.is_null()) {
1015
         add_barrier_deps(n);
1016
      }
1017
 
1018
      if (inst->mlen > 0 && inst->base_mrf != -1) {
1019
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
1020
            last_mrf_write[inst->base_mrf + i] = n;
1021
         }
1022
      }
1023
 
1024
      if (inst->writes_flag()) {
1025
         last_conditional_mod[inst->flag_subreg] = n;
1026
      }
1027
 
1028
      if (inst->writes_accumulator_implicitly(v->devinfo)) {
1029
         last_accumulator_write = n;
1030
      }
1031
   }
1032
}
1033
 
1034
void
1035
vec4_instruction_scheduler::calculate_deps()
1036
{
1037
   schedule_node *last_grf_write[grf_count];
1038
   schedule_node *last_mrf_write[BRW_MAX_MRF];
1039
   schedule_node *last_conditional_mod = NULL;
1040
   schedule_node *last_accumulator_write = NULL;
1041
   /* Fixed HW registers are assumed to be separate from the virtual
1042
    * GRFs, so they can be tracked separately.  We don't really write
1043
    * to fixed GRFs much, so don't bother tracking them on a more
1044
    * granular level.
1045
    */
1046
   schedule_node *last_fixed_grf_write = NULL;
1047
 
1048
   /* The last instruction always needs to still be the last instruction.
1049
    * Either it's flow control (IF, ELSE, ENDIF, DO, WHILE) and scheduling
1050
    * other things after it would disturb the basic block, or it's the EOT
1051
    * URB_WRITE and we should do a better job at dead code eliminating
1052
    * anything that could have been scheduled after it.
1053
    */
1054
   schedule_node *last = (schedule_node *)instructions.get_tail();
1055
   add_barrier_deps(last);
1056
 
1057
   memset(last_grf_write, 0, sizeof(last_grf_write));
1058
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
1059
 
1060
   /* top-to-bottom dependencies: RAW and WAW. */
1061
   foreach_in_list(schedule_node, n, &instructions) {
1062
      vec4_instruction *inst = (vec4_instruction *)n->inst;
1063
 
1064
      if (inst->has_side_effects())
1065
         add_barrier_deps(n);
1066
 
1067
      /* read-after-write deps. */
1068
      for (int i = 0; i < 3; i++) {
1069
         if (inst->src[i].file == GRF) {
1070
            for (unsigned j = 0; j < inst->regs_read(i); ++j)
1071
               add_dep(last_grf_write[inst->src[i].reg + j], n);
1072
         } else if (inst->src[i].file == HW_REG &&
1073
                    (inst->src[i].fixed_hw_reg.file ==
1074
                     BRW_GENERAL_REGISTER_FILE)) {
1075
            add_dep(last_fixed_grf_write, n);
1076
         } else if (inst->src[i].is_accumulator()) {
1077
            assert(last_accumulator_write);
1078
            add_dep(last_accumulator_write, n);
1079
         } else if (inst->src[i].file != BAD_FILE &&
1080
                    inst->src[i].file != IMM &&
1081
                    inst->src[i].file != UNIFORM &&
1082
                    (inst->src[i].file != HW_REG ||
1083
                     inst->src[i].fixed_hw_reg.file != IMM)) {
1084
            /* No reads from MRF, and ATTR is already translated away */
1085
            assert(inst->src[i].file != MRF &&
1086
                   inst->src[i].file != ATTR);
1087
            add_barrier_deps(n);
1088
         }
1089
      }
1090
 
1091
      if (!inst->is_send_from_grf()) {
1092
         for (int i = 0; i < inst->mlen; i++) {
1093
            /* It looks like the MRF regs are released in the send
1094
             * instruction once it's sent, not when the result comes
1095
             * back.
1096
             */
1097
            add_dep(last_mrf_write[inst->base_mrf + i], n);
1098
         }
1099
      }
1100
 
1101
      if (inst->reads_flag()) {
1102
         assert(last_conditional_mod);
1103
         add_dep(last_conditional_mod, n);
1104
      }
1105
 
1106
      if (inst->reads_accumulator_implicitly()) {
1107
         assert(last_accumulator_write);
1108
         add_dep(last_accumulator_write, n);
1109
      }
1110
 
1111
      /* write-after-write deps. */
1112
      if (inst->dst.file == GRF) {
1113
         for (unsigned j = 0; j < inst->regs_written; ++j) {
1114
            add_dep(last_grf_write[inst->dst.reg + j], n);
1115
            last_grf_write[inst->dst.reg + j] = n;
1116
         }
1117
      } else if (inst->dst.file == MRF) {
1118
         add_dep(last_mrf_write[inst->dst.reg], n);
1119
         last_mrf_write[inst->dst.reg] = n;
1120
      } else if (inst->dst.file == HW_REG &&
1121
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
1122
         last_fixed_grf_write = n;
1123
      } else if (inst->dst.is_accumulator()) {
1124
         add_dep(last_accumulator_write, n);
1125
         last_accumulator_write = n;
1126
      } else if (inst->dst.file != BAD_FILE &&
1127
                 !inst->dst.is_null()) {
1128
         add_barrier_deps(n);
1129
      }
1130
 
1131
      if (inst->mlen > 0 && !inst->is_send_from_grf()) {
1132
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
1133
            add_dep(last_mrf_write[inst->base_mrf + i], n);
1134
            last_mrf_write[inst->base_mrf + i] = n;
1135
         }
1136
      }
1137
 
1138
      if (inst->writes_flag()) {
1139
         add_dep(last_conditional_mod, n, 0);
1140
         last_conditional_mod = n;
1141
      }
1142
 
1143
      if (inst->writes_accumulator_implicitly(v->devinfo) &&
1144
          !inst->dst.is_accumulator()) {
1145
         add_dep(last_accumulator_write, n);
1146
         last_accumulator_write = n;
1147
      }
1148
   }
1149
 
1150
   /* bottom-to-top dependencies: WAR */
1151
   memset(last_grf_write, 0, sizeof(last_grf_write));
1152
   memset(last_mrf_write, 0, sizeof(last_mrf_write));
1153
   last_conditional_mod = NULL;
1154
   last_accumulator_write = NULL;
1155
   last_fixed_grf_write = NULL;
1156
 
1157
   exec_node *node;
1158
   exec_node *prev;
1159
   for (node = instructions.get_tail(), prev = node->prev;
1160
        !node->is_head_sentinel();
1161
        node = prev, prev = node->prev) {
1162
      schedule_node *n = (schedule_node *)node;
1163
      vec4_instruction *inst = (vec4_instruction *)n->inst;
1164
 
1165
      /* write-after-read deps. */
1166
      for (int i = 0; i < 3; i++) {
1167
         if (inst->src[i].file == GRF) {
1168
            for (unsigned j = 0; j < inst->regs_read(i); ++j)
1169
               add_dep(n, last_grf_write[inst->src[i].reg + j]);
1170
         } else if (inst->src[i].file == HW_REG &&
1171
                    (inst->src[i].fixed_hw_reg.file ==
1172
                     BRW_GENERAL_REGISTER_FILE)) {
1173
            add_dep(n, last_fixed_grf_write);
1174
         } else if (inst->src[i].is_accumulator()) {
1175
            add_dep(n, last_accumulator_write);
1176
         } else if (inst->src[i].file != BAD_FILE &&
1177
                    inst->src[i].file != IMM &&
1178
                    inst->src[i].file != UNIFORM &&
1179
                    (inst->src[i].file != HW_REG ||
1180
                     inst->src[i].fixed_hw_reg.file != IMM)) {
1181
            assert(inst->src[i].file != MRF &&
1182
                   inst->src[i].file != ATTR);
1183
            add_barrier_deps(n);
1184
         }
1185
      }
1186
 
1187
      if (!inst->is_send_from_grf()) {
1188
         for (int i = 0; i < inst->mlen; i++) {
1189
            /* It looks like the MRF regs are released in the send
1190
             * instruction once it's sent, not when the result comes
1191
             * back.
1192
             */
1193
            add_dep(n, last_mrf_write[inst->base_mrf + i], 2);
1194
         }
1195
      }
1196
 
1197
      if (inst->reads_flag()) {
1198
         add_dep(n, last_conditional_mod);
1199
      }
1200
 
1201
      if (inst->reads_accumulator_implicitly()) {
1202
         add_dep(n, last_accumulator_write);
1203
      }
1204
 
1205
      /* Update the things this instruction wrote, so earlier reads
1206
       * can mark this as WAR dependency.
1207
       */
1208
      if (inst->dst.file == GRF) {
1209
         for (unsigned j = 0; j < inst->regs_written; ++j)
1210
            last_grf_write[inst->dst.reg + j] = n;
1211
      } else if (inst->dst.file == MRF) {
1212
         last_mrf_write[inst->dst.reg] = n;
1213
      } else if (inst->dst.file == HW_REG &&
1214
                 inst->dst.fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
1215
         last_fixed_grf_write = n;
1216
      } else if (inst->dst.is_accumulator()) {
1217
         last_accumulator_write = n;
1218
      } else if (inst->dst.file != BAD_FILE &&
1219
                 !inst->dst.is_null()) {
1220
         add_barrier_deps(n);
1221
      }
1222
 
1223
      if (inst->mlen > 0 && !inst->is_send_from_grf()) {
1224
         for (int i = 0; i < v->implied_mrf_writes(inst); i++) {
1225
            last_mrf_write[inst->base_mrf + i] = n;
1226
         }
1227
      }
1228
 
1229
      if (inst->writes_flag()) {
1230
         last_conditional_mod = n;
1231
      }
1232
 
1233
      if (inst->writes_accumulator_implicitly(v->devinfo)) {
1234
         last_accumulator_write = n;
1235
      }
1236
   }
1237
}
1238
 
1239
schedule_node *
1240
fs_instruction_scheduler::choose_instruction_to_schedule()
1241
{
1242
   schedule_node *chosen = NULL;
1243
 
1244
   if (mode == SCHEDULE_PRE || mode == SCHEDULE_POST) {
1245
      int chosen_time = 0;
1246
 
1247
      /* Of the instructions ready to execute or the closest to
1248
       * being ready, choose the oldest one.
1249
       */
1250
      foreach_in_list(schedule_node, n, &instructions) {
1251
         if (!chosen || n->unblocked_time < chosen_time) {
1252
            chosen = n;
1253
            chosen_time = n->unblocked_time;
1254
         }
1255
      }
1256
   } else {
1257
      /* Before register allocation, we don't care about the latencies of
1258
       * instructions.  All we care about is reducing live intervals of
1259
       * variables so that we can avoid register spilling, or get SIMD16
1260
       * shaders which naturally do a better job of hiding instruction
1261
       * latency.
1262
       */
1263
      foreach_in_list(schedule_node, n, &instructions) {
1264
         fs_inst *inst = (fs_inst *)n->inst;
1265
 
1266
         if (!chosen) {
1267
            chosen = n;
1268
            continue;
1269
         }
1270
 
1271
         /* Most important: If we can definitely reduce register pressure, do
1272
          * so immediately.
1273
          */
1274
         int register_pressure_benefit = get_register_pressure_benefit(n->inst);
1275
         int chosen_register_pressure_benefit =
1276
            get_register_pressure_benefit(chosen->inst);
1277
 
1278
         if (register_pressure_benefit > 0 &&
1279
             register_pressure_benefit > chosen_register_pressure_benefit) {
1280
            chosen = n;
1281
            continue;
1282
         } else if (chosen_register_pressure_benefit > 0 &&
1283
                    (register_pressure_benefit <
1284
                     chosen_register_pressure_benefit)) {
1285
            continue;
1286
         }
1287
 
1288
         if (mode == SCHEDULE_PRE_LIFO) {
1289
            /* Prefer instructions that recently became available for
1290
             * scheduling.  These are the things that are most likely to
1291
             * (eventually) make a variable dead and reduce register pressure.
1292
             * Typical register pressure estimates don't work for us because
1293
             * most of our pressure comes from texturing, where no single
1294
             * instruction to schedule will make a vec4 value dead.
1295
             */
1296
            if (n->cand_generation > chosen->cand_generation) {
1297
               chosen = n;
1298
               continue;
1299
            } else if (n->cand_generation < chosen->cand_generation) {
1300
               continue;
1301
            }
1302
 
1303
            /* On MRF-using chips, prefer non-SEND instructions.  If we don't
1304
             * do this, then because we prefer instructions that just became
1305
             * candidates, we'll end up in a pattern of scheduling a SEND,
1306
             * then the MRFs for the next SEND, then the next SEND, then the
1307
             * MRFs, etc., without ever consuming the results of a send.
1308
             */
1309
            if (v->devinfo->gen < 7) {
1310
               fs_inst *chosen_inst = (fs_inst *)chosen->inst;
1311
 
1312
               /* We use regs_written > 1 as our test for the kind of send
1313
                * instruction to avoid -- only sends generate many regs, and a
1314
                * single-result send is probably actually reducing register
1315
                * pressure.
1316
                */
1317
               if (inst->regs_written <= inst->dst.width / 8 &&
1318
                   chosen_inst->regs_written > chosen_inst->dst.width / 8) {
1319
                  chosen = n;
1320
                  continue;
1321
               } else if (inst->regs_written > chosen_inst->regs_written) {
1322
                  continue;
1323
               }
1324
            }
1325
         }
1326
 
1327
         /* For instructions pushed on the cands list at the same time, prefer
1328
          * the one with the highest delay to the end of the program.  This is
1329
          * most likely to have its values able to be consumed first (such as
1330
          * for a large tree of lowered ubo loads, which appear reversed in
1331
          * the instruction stream with respect to when they can be consumed).
1332
          */
1333
         if (n->delay > chosen->delay) {
1334
            chosen = n;
1335
            continue;
1336
         } else if (n->delay < chosen->delay) {
1337
            continue;
1338
         }
1339
 
1340
         /* If all other metrics are equal, we prefer the first instruction in
1341
          * the list (program execution).
1342
          */
1343
      }
1344
   }
1345
 
1346
   return chosen;
1347
}
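
/* To summarize the pre-RA branch above: candidates are compared by (1) larger
 * register-pressure benefit, then in SCHEDULE_PRE_LIFO by (2) newer
 * cand_generation and, on MRF-using (pre-Gen7) hardware, (3) a preference for
 * non-SEND instructions over multi-register SENDs, then by (4) larger delay to
 * the end of the program, and finally by (5) original program order.
 */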
1348
 
1349
schedule_node *
1350
vec4_instruction_scheduler::choose_instruction_to_schedule()
1351
{
1352
   schedule_node *chosen = NULL;
1353
   int chosen_time = 0;
1354
 
1355
   /* Of the instructions ready to execute or the closest to being ready,
1356
    * choose the oldest one.
1357
    */
1358
   foreach_in_list(schedule_node, n, &instructions) {
1359
      if (!chosen || n->unblocked_time < chosen_time) {
1360
         chosen = n;
1361
         chosen_time = n->unblocked_time;
1362
      }
1363
   }
1364
 
1365
   return chosen;
1366
}
1367
 
1368
int
1369
fs_instruction_scheduler::issue_time(backend_instruction *inst)
1370
{
1371
   if (is_compressed((fs_inst *)inst))
1372
      return 4;
1373
   else
1374
      return 2;
1375
}
1376
 
1377
int
1378
vec4_instruction_scheduler::issue_time(backend_instruction *inst)
1379
{
1380
   /* We always execute as two vec4s in parallel. */
1381
   return 2;
1382
}
1383
 
1384
void
1385
instruction_scheduler::schedule_instructions(bblock_t *block)
1386
{
1387
   const struct brw_device_info *devinfo = bv->devinfo;
1388
   backend_instruction *inst = block->end();
1389
   time = 0;
1390
 
1391
   /* Remove non-DAG heads from the list. */
1392
   foreach_in_list_safe(schedule_node, n, &instructions) {
1393
      if (n->parent_count != 0)
1394
         n->remove();
1395
   }
1396
 
1397
   unsigned cand_generation = 1;
1398
   while (!instructions.is_empty()) {
1399
      schedule_node *chosen = choose_instruction_to_schedule();
1400
 
1401
      /* Schedule this instruction. */
1402
      assert(chosen);
1403
      chosen->remove();
1404
      inst->insert_before(block, chosen->inst);
1405
      instructions_to_schedule--;
1406
      update_register_pressure(chosen->inst);
1407
 
1408
      /* Update the clock for how soon an instruction could start after the
1409
       * chosen one.
1410
       */
1411
      time += issue_time(chosen->inst);
1412
 
1413
      /* If we expected a delay for scheduling, then bump the clock to reflect
1414
       * that as well.  In reality, the hardware will switch to another
1415
       * hyperthread and may not return to dispatching our thread for a while
1416
       * even after we're unblocked.
1417
       */
1418
      time = MAX2(time, chosen->unblocked_time);
1419
 
1420
      if (debug) {
1421
         fprintf(stderr, "clock %4d, scheduled: ", time);
1422
         bv->dump_instruction(chosen->inst);
1423
      }
1424
 
1425
      /* Now that we've scheduled a new instruction, some of its
1426
       * children can be promoted to the list of instructions ready to
1427
       * be scheduled.  Update the children's unblocked time for this
1428
       * DAG edge as we do so.
1429
       */
1430
      for (int i = chosen->child_count - 1; i >= 0; i--) {
1431
         schedule_node *child = chosen->children[i];
1432
 
1433
         child->unblocked_time = MAX2(child->unblocked_time,
1434
                                      time + chosen->child_latency[i]);
1435
 
1436
         if (debug) {
1437
            fprintf(stderr, "\tchild %d, %d parents: ", i, child->parent_count);
1438
            bv->dump_instruction(child->inst);
1439
         }
1440
 
1441
         child->cand_generation = cand_generation;
1442
         child->parent_count--;
1443
         if (child->parent_count == 0) {
1444
            if (debug) {
1445
               fprintf(stderr, "\t\tnow available\n");
1446
            }
1447
            instructions.push_head(child);
1448
         }
1449
      }
1450
      cand_generation++;
1451
 
1452
      /* Shared resource: the mathbox.  There's one mathbox per EU on Gen6+
1453
       * but it's more limited pre-gen6, so if we send something off to it then
1454
       * the next math instruction isn't going to make progress until the first
1455
       * is done.
1456
       */
1457
      if (devinfo->gen < 6 && chosen->inst->is_math()) {
1458
         foreach_in_list(schedule_node, n, &instructions) {
1459
            if (n->inst->is_math())
1460
               n->unblocked_time = MAX2(n->unblocked_time,
1461
                                        time + chosen->latency);
1462
         }
1463
      }
1464
   }
1465
 
1466
   if (block->end()->opcode == BRW_OPCODE_NOP)
1467
      block->end()->remove(block);
1468
   assert(instructions_to_schedule == 0);
1469
}
1470
 
1471
void
1472
instruction_scheduler::run(cfg_t *cfg)
1473
{
1474
   if (debug) {
1475
      fprintf(stderr, "\nInstructions before scheduling (reg_alloc %d)\n",
1476
              post_reg_alloc);
1477
      bv->dump_instructions();
1478
   }
1479
 
1480
   /* Populate the remaining GRF uses array to improve the pre-regalloc
1481
    * scheduling.
1482
    */
1483
   if (remaining_grf_uses) {
1484
      foreach_block_and_inst(block, backend_instruction, inst, cfg) {
1485
         count_remaining_grf_uses(inst);
1486
      }
1487
   }
1488
 
1489
   foreach_block(block, cfg) {
1490
      if (block->end_ip - block->start_ip <= 1)
1491
         continue;
1492
 
1493
      add_insts_from_block(block);
1494
 
1495
      calculate_deps();
1496
 
1497
      foreach_in_list(schedule_node, n, &instructions) {
1498
         compute_delay(n);
1499
      }
1500
 
1501
      schedule_instructions(block);
1502
   }
1503
 
1504
   if (debug) {
1505
      fprintf(stderr, "\nInstructions after scheduling (reg_alloc %d)\n",
1506
              post_reg_alloc);
1507
      bv->dump_instructions();
1508
   }
1509
}
1510
 
1511
void
1512
fs_visitor::schedule_instructions(instruction_scheduler_mode mode)
1513
{
1514
   int grf_count;
1515
   if (mode == SCHEDULE_POST)
1516
      grf_count = grf_used;
1517
   else
1518
      grf_count = alloc.count;
1519
 
1520
   fs_instruction_scheduler sched(this, grf_count, mode);
1521
   sched.run(cfg);
1522
 
1523
   if (unlikely(debug_enabled) && mode == SCHEDULE_POST) {
1524
      fprintf(stderr, "%s%d estimated execution time: %d cycles\n",
1525
              stage_abbrev, dispatch_width, sched.time);
1526
   }
1527
 
1528
   invalidate_live_intervals();
1529
}
1530
 
1531
void
1532
vec4_visitor::opt_schedule_instructions()
1533
{
1534
   vec4_instruction_scheduler sched(this, prog_data->total_grf);
1535
   sched.run(cfg);
1536
 
1537
   if (unlikely(debug_enabled)) {
1538
      fprintf(stderr, "%s estimated execution time: %d cycles\n",
1539
              stage_abbrev, sched.time);
1540
   }
1541
 
1542
   invalidate_live_intervals();
1543
}