/**************************************************************************
 *
 * Copyright 2006 VMware, Inc.
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
 * IN NO EVENT SHALL VMWARE AND/OR ITS SUPPLIERS BE LIABLE FOR
 * ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
 * TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
 * SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/

#include "intel_batchbuffer.h"
#include "intel_buffer_objects.h"
#include "intel_reg.h"
#include "intel_bufmgr.h"
#include "intel_buffers.h"
#include "intel_fbo.h"
#include "brw_context.h"

#include <xf86drm.h>
#include <i915_drm.h>

static void
intel_batchbuffer_reset(struct brw_context *brw);

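/* One-time per-context setup: allocate the workaround BO that PIPE_CONTROL
 * writes target on gen6+, and a CPU shadow of the batch on non-LLC
 * platforms, where the batch BO is not kept mapped.
 */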
void
intel_batchbuffer_init(struct brw_context *brw)
{
   intel_batchbuffer_reset(brw);

   if (brw->gen >= 6) {
      /* We can't just use brw_state_batch to get a chunk of space for
       * the gen6 workaround because it involves actually writing to
       * the buffer, and the kernel doesn't let us write to the batch.
       */
      brw->batch.workaround_bo = drm_intel_bo_alloc(brw->bufmgr,
                                                    "pipe_control workaround",
                                                    4096, 4096);
   }

   if (!brw->has_llc) {
      brw->batch.cpu_map = malloc(BATCH_SZ);
      brw->batch.map = brw->batch.cpu_map;
   }
}

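/* Start a fresh batch: keep a reference to the previous batch BO in
 * last_bo, allocate a new one (mapped directly on LLC platforms), and
 * reset the bookkeeping fields.
 */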
static void
intel_batchbuffer_reset(struct brw_context *brw)
{
   if (brw->batch.last_bo != NULL) {
      drm_intel_bo_unreference(brw->batch.last_bo);
      brw->batch.last_bo = NULL;
   }
   brw->batch.last_bo = brw->batch.bo;

   brw_render_cache_set_clear(brw);

   brw->batch.bo = drm_intel_bo_alloc(brw->bufmgr, "batchbuffer",
                                      BATCH_SZ, 4096);
   if (brw->has_llc) {
      drm_intel_bo_map(brw->batch.bo, true);
      brw->batch.map = brw->batch.bo->virtual;
   }

   brw->batch.reserved_space = BATCH_RESERVED;
   brw->batch.state_batch_offset = brw->batch.bo->size;
   brw->batch.used = 0;
   brw->batch.needs_sol_reset = false;
   brw->batch.pipe_controls_since_last_cs_stall = 0;

   /* We don't know what ring the new batch will be sent to until we see the
    * first BEGIN_BATCH or BEGIN_BATCH_BLT.  Mark it as unknown.
    */
   brw->batch.ring = UNKNOWN_RING;
}

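/* Snapshot how much batch space and how many relocations are in use, so
 * that a partially-built command sequence can be rolled back with
 * intel_batchbuffer_reset_to_saved().
 */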
void
intel_batchbuffer_save_state(struct brw_context *brw)
{
   brw->batch.saved.used = brw->batch.used;
   brw->batch.saved.reloc_count =
      drm_intel_gem_bo_get_reloc_count(brw->batch.bo);
}

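/* Roll the batch back to the last intel_batchbuffer_save_state() snapshot,
 * discarding any commands and relocations emitted since then.
 */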
void
intel_batchbuffer_reset_to_saved(struct brw_context *brw)
{
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, brw->batch.saved.reloc_count);

   brw->batch.used = brw->batch.saved.used;
   if (brw->batch.used == 0)
      brw->batch.ring = UNKNOWN_RING;
}

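/* Context teardown: release the CPU shadow and drop our references to the
 * batch, last-batch, and workaround BOs.
 */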
void
intel_batchbuffer_free(struct brw_context *brw)
{
   free(brw->batch.cpu_map);
   drm_intel_bo_unreference(brw->batch.last_bo);
   drm_intel_bo_unreference(brw->batch.bo);
   drm_intel_bo_unreference(brw->batch.workaround_bo);
}

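/* Decode and print the batch to stderr (for INTEL_DEBUG=batch), falling
 * back to the local copy of the commands if the batch BO cannot be mapped.
 */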
static void
do_batch_dump(struct brw_context *brw)
{
   struct drm_intel_decode *decode;
   struct intel_batchbuffer *batch = &brw->batch;
   int ret;

   decode = drm_intel_decode_context_alloc(brw->intelScreen->deviceID);
   if (!decode)
      return;

   ret = drm_intel_bo_map(batch->bo, false);
   if (ret == 0) {
      drm_intel_decode_set_batch_pointer(decode,
                                         batch->bo->virtual,
                                         batch->bo->offset64,
                                         batch->used);
   } else {
      fprintf(stderr,
              "WARNING: failed to map batchbuffer (%s), "
              "dumping uploaded data instead.\n", strerror(ret));

      drm_intel_decode_set_batch_pointer(decode,
                                         batch->map,
                                         batch->bo->offset64,
                                         batch->used);
   }

   drm_intel_decode_set_output_file(decode, stderr);
   drm_intel_decode(decode);

   drm_intel_decode_context_free(decode);

   if (ret == 0) {
      drm_intel_bo_unmap(batch->bo);

      brw_debug_batch(brw);
   }
}

void
intel_batchbuffer_emit_render_ring_prelude(struct brw_context *brw)
{
   /* We may need to enable and snapshot OA counters. */
   brw_perf_monitor_new_batch(brw);
}

/**
 * Called when starting a new batch buffer.
 */
static void
brw_new_batch(struct brw_context *brw)
{
   /* Create a new batchbuffer and reset the associated state: */
   drm_intel_gem_bo_clear_relocs(brw->batch.bo, 0);
   intel_batchbuffer_reset(brw);

   /* If the kernel supports hardware contexts, then most hardware state is
    * preserved between batches; we only need to re-emit state that is required
    * to be in every batch.  Otherwise we need to re-emit all the state that
    * would otherwise be stored in the context (which for all intents and
    * purposes means everything).
    */
   if (brw->hw_ctx == NULL)
      brw->ctx.NewDriverState |= BRW_NEW_CONTEXT;

   brw->ctx.NewDriverState |= BRW_NEW_BATCH;

   brw->state_batch_count = 0;

   brw->ib.type = -1;

   /* We need to periodically reap the shader time results, because rollover
    * happens every few seconds.  We also want to see results every once in a
    * while, because many programs won't cleanly destroy our context, so the
    * end-of-run printout may not happen.
    */
   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      brw_collect_and_report_shader_time(brw);

   if (INTEL_DEBUG & DEBUG_PERFMON)
      brw_dump_perf_monitors(brw);
}

/**
 * Called from intel_batchbuffer_flush before emitting MI_BATCHBUFFER_END and
 * sending it off.
 *
 * This function can emit state (say, to preserve registers that aren't saved
 * between batches).  All of this state MUST fit in the reserved space at the
 * end of the batchbuffer.  If you add more GPU state, increase the reserved
 * space by updating the BATCH_RESERVED macro.
 */
static void
brw_finish_batch(struct brw_context *brw)
{
   /* Capture the closing pipeline statistics register values necessary to
    * support query objects (in the non-hardware context world).
    */
   brw_emit_query_end(brw);

   /* We may also need to snapshot and disable OA counters. */
   if (brw->batch.ring == RENDER_RING)
      brw_perf_monitor_finish_batch(brw);

   /* Mark that the current program cache BO has been used by the GPU.
    * It will be reallocated if we need to put new programs in for the
    * next batch.
    */
   brw->cache.bo_used_by_gpu = true;
}

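/* Rate-limit rendering against frames the GPU has actually finished: wait
 * on an earlier frame's batch when a swap is pending, otherwise fall back
 * to the kernel's DRM_I915_GEM_THROTTLE ioctl.
 */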
static void
throttle(struct brw_context *brw)
{
   /* Wait for the swapbuffers before the one we just emitted, so we
    * don't get too many swaps outstanding for apps that are GPU-heavy
    * but not CPU-heavy.
    *
    * We're using intelDRI2Flush (called from the loader before
    * swapbuffer) and glFlush (for front buffer rendering) as the
    * indicator that a frame is done and then throttle when we get
    * here as we prepare to render the next frame.  At this point the
    * round trips for swap/copy and getting new buffers are done and
    * we'll spend less time waiting on the GPU.
    *
    * Unfortunately, we don't have a handle to the batch containing
    * the swap, and getting our hands on that doesn't seem worth it,
    * so we just use the first batch we emitted after the last swap.
    */
   if (brw->need_swap_throttle && brw->throttle_batch[0]) {
      if (brw->throttle_batch[1]) {
         if (!brw->disable_throttling)
            drm_intel_bo_wait_rendering(brw->throttle_batch[1]);
         drm_intel_bo_unreference(brw->throttle_batch[1]);
      }
      brw->throttle_batch[1] = brw->throttle_batch[0];
      brw->throttle_batch[0] = NULL;
      brw->need_swap_throttle = false;
      /* Throttling here is more precise than the throttle ioctl, so skip it */
      brw->need_flush_throttle = false;
   }

   if (brw->need_flush_throttle) {
      __DRIscreen *psp = brw->intelScreen->driScrnPriv;
      drmCommandNone(psp->fd, DRM_I915_GEM_THROTTLE);
      brw->need_flush_throttle = false;
   }
}

/* TODO: Push this whole function into bufmgr.
 */
static int
do_flush_locked(struct brw_context *brw)
{
   struct intel_batchbuffer *batch = &brw->batch;
   int ret = 0;

   if (brw->has_llc) {
      drm_intel_bo_unmap(batch->bo);
   } else {
      ret = drm_intel_bo_subdata(batch->bo, 0, 4*batch->used, batch->map);
      if (ret == 0 && batch->state_batch_offset != batch->bo->size) {
         ret = drm_intel_bo_subdata(batch->bo,
                                    batch->state_batch_offset,
                                    batch->bo->size - batch->state_batch_offset,
                                    (char *)batch->map + batch->state_batch_offset);
      }
   }

   if (!brw->intelScreen->no_hw) {
      int flags;

      if (brw->gen >= 6 && batch->ring == BLT_RING) {
         flags = I915_EXEC_BLT;
      } else {
         flags = I915_EXEC_RENDER;
      }
      if (batch->needs_sol_reset)
         flags |= I915_EXEC_GEN7_SOL_RESET;

      if (ret == 0) {
         if (unlikely(INTEL_DEBUG & DEBUG_AUB))
            brw_annotate_aub(brw);

         if (brw->hw_ctx == NULL || batch->ring != RENDER_RING) {
            ret = drm_intel_bo_mrb_exec(batch->bo, 4 * batch->used, NULL, 0, 0,
                                        flags);
         } else {
            ret = drm_intel_gem_bo_context_exec(batch->bo, brw->hw_ctx,
                                                4 * batch->used, flags);
         }
      }

      throttle(brw);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH))
      do_batch_dump(brw);

   if (ret != 0) {
      fprintf(stderr, "intel_do_flush_locked failed: %s\n", strerror(-ret));
      exit(1);
   }

   return ret;
}

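/* Finish and submit the current batch, then start a new one.  The file and
 * line arguments identify the caller (normally supplied by the
 * intel_batchbuffer_flush() wrapper macro) in INTEL_DEBUG=batch statistics.
 */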
int
_intel_batchbuffer_flush(struct brw_context *brw,
                         const char *file, int line)
{
   int ret;

   if (brw->batch.used == 0)
      return 0;

   if (brw->throttle_batch[0] == NULL) {
      brw->throttle_batch[0] = brw->batch.bo;
      drm_intel_bo_reference(brw->throttle_batch[0]);
   }

   if (unlikely(INTEL_DEBUG & DEBUG_BATCH)) {
      int bytes_for_commands = 4 * brw->batch.used;
      int bytes_for_state = brw->batch.bo->size - brw->batch.state_batch_offset;
      int total_bytes = bytes_for_commands + bytes_for_state;
      fprintf(stderr, "%s:%d: Batchbuffer flush with %4db (pkt) + "
              "%4db (state) = %4db (%0.1f%%)\n", file, line,
              bytes_for_commands, bytes_for_state,
              total_bytes,
              100.0f * total_bytes / BATCH_SZ);
   }

   brw->batch.reserved_space = 0;

   brw_finish_batch(brw);

   /* Mark the end of the buffer. */
   intel_batchbuffer_emit_dword(brw, MI_BATCH_BUFFER_END);
   if (brw->batch.used & 1) {
      /* Round batchbuffer usage to 2 DWORDs. */
      intel_batchbuffer_emit_dword(brw, MI_NOOP);
   }

   intel_upload_finish(brw);

   /* Check that we didn't just wrap our batchbuffer at a bad time. */
   assert(!brw->no_batch_wrap);

   ret = do_flush_locked(brw);

   if (unlikely(INTEL_DEBUG & DEBUG_SYNC)) {
      fprintf(stderr, "waiting for idle\n");
      drm_intel_bo_wait_rendering(brw->batch.bo);
   }

   /* Start a new batch buffer. */
   brw_new_batch(brw);

   return ret;
}


/* This is the only way buffers get added to the validate list.
 */
bool
intel_batchbuffer_emit_reloc(struct brw_context *brw,
                             drm_intel_bo *buffer,
                             uint32_t read_domains, uint32_t write_domain,
                             uint32_t delta)
{
   int ret;

   ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
                                 buffer, delta,
                                 read_domains, write_domain);
   assert(ret == 0);
   (void)ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel.
    */
   intel_batchbuffer_emit_dword(brw, buffer->offset64 + delta);

   return true;
}

bool
intel_batchbuffer_emit_reloc64(struct brw_context *brw,
                               drm_intel_bo *buffer,
                               uint32_t read_domains, uint32_t write_domain,
                               uint32_t delta)
{
   int ret = drm_intel_bo_emit_reloc(brw->batch.bo, 4*brw->batch.used,
                                     buffer, delta,
                                     read_domains, write_domain);
   assert(ret == 0);
   (void) ret;

   /* Using the old buffer offset, write in what the right data would be, in
    * case the buffer doesn't move and we can short-circuit the relocation
    * processing in the kernel.
    */
   uint64_t offset = buffer->offset64 + delta;
   intel_batchbuffer_emit_dword(brw, offset);
   intel_batchbuffer_emit_dword(brw, offset >> 32);

   return true;
}


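/* Copy a pre-built block of commands into the batch.  The byte count must
 * be a whole number of DWords; space (and any needed ring switch) is
 * reserved first.
 */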
void
intel_batchbuffer_data(struct brw_context *brw,
                       const void *data, GLuint bytes, enum brw_gpu_ring ring)
{
   assert((bytes & 3) == 0);
   intel_batchbuffer_require_space(brw, bytes, ring);
   memcpy(brw->batch.map + brw->batch.used, data, bytes);
   brw->batch.used += bytes >> 2;
}

/**
 * According to the latest documentation, any PIPE_CONTROL with the
 * "Command Streamer Stall" bit set must also have another bit set,
 * with five different options:
 *
 *  - Render Target Cache Flush
 *  - Depth Cache Flush
 *  - Stall at Pixel Scoreboard
 *  - Post-Sync Operation
 *  - Depth Stall
 *
 * I chose "Stall at Pixel Scoreboard" since we've used it effectively
 * in the past, but the choice is fairly arbitrary.
 */
static void
gen8_add_cs_stall_workaround_bits(uint32_t *flags)
{
   uint32_t wa_bits = PIPE_CONTROL_RENDER_TARGET_FLUSH |
                      PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                      PIPE_CONTROL_WRITE_IMMEDIATE |
                      PIPE_CONTROL_WRITE_DEPTH_COUNT |
                      PIPE_CONTROL_WRITE_TIMESTAMP |
                      PIPE_CONTROL_STALL_AT_SCOREBOARD |
                      PIPE_CONTROL_DEPTH_STALL;

   /* If we're doing a CS stall, and don't already have one of the
    * workaround bits set, add "Stall at Pixel Scoreboard."
    */
   if ((*flags & PIPE_CONTROL_CS_STALL) != 0 && (*flags & wa_bits) == 0)
      *flags |= PIPE_CONTROL_STALL_AT_SCOREBOARD;
}

/* Implement the WaCsStallAtEveryFourthPipecontrol workaround on IVB, BYT:
 *
 * "Every 4th PIPE_CONTROL command, not counting the PIPE_CONTROL with
 *  only read-cache-invalidate bit(s) set, must have a CS_STALL bit set."
 *
 * Note that the kernel does CS stalls between batches, so we only need
 * to count them within a batch.
 */
static uint32_t
gen7_cs_stall_every_four_pipe_controls(struct brw_context *brw, uint32_t flags)
{
   if (brw->gen == 7 && !brw->is_haswell) {
      if (flags & PIPE_CONTROL_CS_STALL) {
         /* If we're doing a CS stall, reset the counter and carry on. */
         brw->batch.pipe_controls_since_last_cs_stall = 0;
         return 0;
      }

      /* If this is the fourth pipe control without a CS stall, do one now. */
      if (++brw->batch.pipe_controls_since_last_cs_stall == 4) {
         brw->batch.pipe_controls_since_last_cs_stall = 0;
         return PIPE_CONTROL_CS_STALL;
      }
   }
   return 0;
}

/**
 * Emit a PIPE_CONTROL with various flushing flags.
 *
 * The caller is responsible for deciding what flags are appropriate for the
 * given generation.
 */
void
brw_emit_pipe_control_flush(struct brw_context *brw, uint32_t flags)
{
   if (brw->gen >= 8) {
      gen8_add_cs_stall_workaround_bits(&flags);

      BEGIN_BATCH(6);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
      OUT_BATCH(flags);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else if (brw->gen >= 6) {
      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);

      BEGIN_BATCH(5);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
      OUT_BATCH(flags);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(4);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   }
}

/**
 * Emit a PIPE_CONTROL that writes to a buffer object.
 *
 * \p flags should contain one of the following items:
 *  - PIPE_CONTROL_WRITE_IMMEDIATE
 *  - PIPE_CONTROL_WRITE_TIMESTAMP
 *  - PIPE_CONTROL_WRITE_DEPTH_COUNT
 */
void
brw_emit_pipe_control_write(struct brw_context *brw, uint32_t flags,
                            drm_intel_bo *bo, uint32_t offset,
                            uint32_t imm_lower, uint32_t imm_upper)
{
   if (brw->gen >= 8) {
      gen8_add_cs_stall_workaround_bits(&flags);

      BEGIN_BATCH(6);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (6 - 2));
      OUT_BATCH(flags);
      OUT_RELOC64(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                  offset);
      OUT_BATCH(imm_lower);
      OUT_BATCH(imm_upper);
      ADVANCE_BATCH();
   } else if (brw->gen >= 6) {
      flags |= gen7_cs_stall_every_four_pipe_controls(brw, flags);

      /* PPGTT/GGTT is selected by DW2 bit 2 on Sandybridge, but DW1 bit 24
       * on later platforms.  We always use PPGTT on Gen7+.
       */
      unsigned gen6_gtt = brw->gen == 6 ? PIPE_CONTROL_GLOBAL_GTT_WRITE : 0;

      BEGIN_BATCH(5);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | (5 - 2));
      OUT_BATCH(flags);
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                gen6_gtt | offset);
      OUT_BATCH(imm_lower);
      OUT_BATCH(imm_upper);
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(4);
      OUT_BATCH(_3DSTATE_PIPE_CONTROL | flags | (4 - 2));
      OUT_RELOC(bo, I915_GEM_DOMAIN_INSTRUCTION, I915_GEM_DOMAIN_INSTRUCTION,
                PIPE_CONTROL_GLOBAL_GTT_WRITE | offset);
      OUT_BATCH(imm_lower);
      OUT_BATCH(imm_upper);
      ADVANCE_BATCH();
   }
}

/**
 * Restriction [DevSNB, DevIVB]:
 *
 * Prior to changing Depth/Stencil Buffer state (i.e. any combination of
 * 3DSTATE_DEPTH_BUFFER, 3DSTATE_CLEAR_PARAMS, 3DSTATE_STENCIL_BUFFER,
 * 3DSTATE_HIER_DEPTH_BUFFER) SW must first issue a pipelined depth stall
 * (PIPE_CONTROL with Depth Stall bit set), followed by a pipelined depth
 * cache flush (PIPE_CONTROL with Depth Flush Bit set), followed by
 * another pipelined depth stall (PIPE_CONTROL with Depth Stall bit set),
 * unless SW can otherwise guarantee that the pipeline from WM onwards is
 * already flushed (e.g., via a preceding MI_FLUSH).
 */
void
intel_emit_depth_stall_flushes(struct brw_context *brw)
{
   assert(brw->gen >= 6 && brw->gen <= 9);

   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_CACHE_FLUSH);
   brw_emit_pipe_control_flush(brw, PIPE_CONTROL_DEPTH_STALL);
}

/**
 * From the Ivybridge PRM, Volume 2 Part 1, Section 3.2 (VS Stage Input):
 * "A PIPE_CONTROL with Post-Sync Operation set to 1h and a depth
 *  stall needs to be sent just prior to any 3DSTATE_VS, 3DSTATE_URB_VS,
 *  3DSTATE_CONSTANT_VS, 3DSTATE_BINDING_TABLE_POINTER_VS,
 *  3DSTATE_SAMPLER_STATE_POINTER_VS command.  Only one PIPE_CONTROL needs
 *  to be sent before any combination of VS associated 3DSTATE."
 */
void
gen7_emit_vs_workaround_flush(struct brw_context *brw)
{
   assert(brw->gen == 7);
   brw_emit_pipe_control_write(brw,
                               PIPE_CONTROL_WRITE_IMMEDIATE
                               | PIPE_CONTROL_DEPTH_STALL,
                               brw->batch.workaround_bo, 0,
                               0, 0);
}


/**
 * Emit a PIPE_CONTROL command for gen7 with the CS Stall bit set.
 */
void
gen7_emit_cs_stall_flush(struct brw_context *brw)
{
   brw_emit_pipe_control_write(brw,
                               PIPE_CONTROL_CS_STALL
                               | PIPE_CONTROL_WRITE_IMMEDIATE,
                               brw->batch.workaround_bo, 0,
                               0, 0);
}


/**
 * Emits a PIPE_CONTROL with a non-zero post-sync operation, for
 * implementing two workarounds on gen6.  From section 1.4.7.1
 * "PIPE_CONTROL" of the Sandy Bridge PRM volume 2 part 1:
 *
 *   [DevSNB-C+{W/A}] Before any depth stall flush (including those
 *   produced by non-pipelined state commands), software needs to first
 *   send a PIPE_CONTROL with no bits set except Post-Sync Operation !=
 *   0.
 *
 *   [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache Flush Enable
 *   =1, a PIPE_CONTROL with any non-zero post-sync-op is required.
 *
 * And the workaround for these two requires this workaround first:
 *
 *   [Dev-SNB{W/A}]: Pipe-control with CS-stall bit set must be sent
 *   BEFORE the pipe-control with a post-sync op and no write-cache
 *   flushes.
 *
 * And this last workaround is tricky because of the requirements on
 * that bit.  From section 1.4.7.2.3 "Stall" of the Sandy Bridge PRM
 * volume 2 part 1:
 *
 *     "1 of the following must also be set:
 *      - Render Target Cache Flush Enable ([12] of DW1)
 *      - Depth Cache Flush Enable ([0] of DW1)
 *      - Stall at Pixel Scoreboard ([1] of DW1)
 *      - Depth Stall ([13] of DW1)
 *      - Post-Sync Operation ([13] of DW1)
 *      - Notify Enable ([8] of DW1)"
 *
 * The cache flushes require the workaround flush that triggered this
 * one, so we can't use it.  Depth stall would trigger the same.
 * Post-sync nonzero is what triggered this second workaround, so we
 * can't use that one either.  Notify enable is IRQs, which aren't
 * really our business.  That leaves only stall at scoreboard.
 */
void
intel_emit_post_sync_nonzero_flush(struct brw_context *brw)
{
   brw_emit_pipe_control_flush(brw,
                               PIPE_CONTROL_CS_STALL |
                               PIPE_CONTROL_STALL_AT_SCOREBOARD);

   brw_emit_pipe_control_write(brw, PIPE_CONTROL_WRITE_IMMEDIATE,
                               brw->batch.workaround_bo, 0, 0, 0);
}

/* Emit a pipelined flush to either flush render and texture cache for
 * reading from a FBO-drawn texture, or flush so that frontbuffer
 * render appears on the screen in DRI1.
 *
 * This is also used for the always_flush_cache driconf debug option.
 */
void
intel_batchbuffer_emit_mi_flush(struct brw_context *brw)
{
   if (brw->batch.ring == BLT_RING && brw->gen >= 6) {
      BEGIN_BATCH_BLT(4);
      OUT_BATCH(MI_FLUSH_DW);
      OUT_BATCH(0);
      OUT_BATCH(0);
      OUT_BATCH(0);
      ADVANCE_BATCH();
   } else {
      int flags = PIPE_CONTROL_NO_WRITE | PIPE_CONTROL_RENDER_TARGET_FLUSH;
      if (brw->gen >= 6) {
         if (brw->gen == 9) {
            /* Hardware workaround: SKL
             *
             * Emit Pipe Control with all bits set to zero before emitting
             * a Pipe Control with VF Cache Invalidate set.
             */
            brw_emit_pipe_control_flush(brw, 0);
         }

         flags |= PIPE_CONTROL_INSTRUCTION_INVALIDATE |
                  PIPE_CONTROL_DEPTH_CACHE_FLUSH |
                  PIPE_CONTROL_VF_CACHE_INVALIDATE |
                  PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE |
                  PIPE_CONTROL_CS_STALL;

         if (brw->gen == 6) {
            /* Hardware workaround: SNB B-Spec says:
             *
             *   [Dev-SNB{W/A}]: Before a PIPE_CONTROL with Write Cache
             *   Flush Enable =1, a PIPE_CONTROL with any non-zero
             *   post-sync-op is required.
             */
            intel_emit_post_sync_nonzero_flush(brw);
         }
      }
      brw_emit_pipe_control_flush(brw, flags);
   }

   brw_render_cache_set_clear(brw);
}

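/* Load `size` consecutive 32-bit registers from a buffer object.  Each
 * MI_LOAD_REGISTER_MEM loads a single register, so one command is emitted
 * per DWord.
 */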
static void
load_sized_register_mem(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset,
                        int size)
{
   int i;

   /* MI_LOAD_REGISTER_MEM only exists on Gen7+. */
   assert(brw->gen >= 7);

   if (brw->gen >= 8) {
      BEGIN_BATCH(4 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (4 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC64(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   } else {
      BEGIN_BATCH(3 * size);
      for (i = 0; i < size; i++) {
         OUT_BATCH(GEN7_MI_LOAD_REGISTER_MEM | (3 - 2));
         OUT_BATCH(reg + i * 4);
         OUT_RELOC(bo, read_domains, write_domain, offset + i * 4);
      }
      ADVANCE_BATCH();
   }
}

void
brw_load_register_mem(struct brw_context *brw,
                      uint32_t reg,
                      drm_intel_bo *bo,
                      uint32_t read_domains, uint32_t write_domain,
                      uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 1);
}

void
brw_load_register_mem64(struct brw_context *brw,
                        uint32_t reg,
                        drm_intel_bo *bo,
                        uint32_t read_domains, uint32_t write_domain,
                        uint32_t offset)
{
   load_sized_register_mem(brw, reg, bo, read_domains, write_domain, offset, 2);
}