WebSVN – Kolibri OS – Blame – /drivers/video/drm/i915/intel_lrc.c

Rev	Author	Line No.	Line
5354	serge	1	/*
		2	* Copyright © 2014 Intel Corporation
		3	*
		4	* Permission is hereby granted, free of charge, to any person obtaining a
		5	* copy of this software and associated documentation files (the "Software"),
		6	* to deal in the Software without restriction, including without limitation
		7	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
		8	* and/or sell copies of the Software, and to permit persons to whom the
		9	* Software is furnished to do so, subject to the following conditions:
		10	*
		11	* The above copyright notice and this permission notice (including the next
		12	* paragraph) shall be included in all copies or substantial portions of the
		13	* Software.
		14	*
		15	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
		16	* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
		17	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
		18	* THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
		19	* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
		20	* FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
		21	* IN THE SOFTWARE.
		22	*
		23	* Authors:
		24	* Ben Widawsky
		25	* Michel Thierry
		26	* Thomas Daniel
		27	* Oscar Mateo
		28	*
		29	*/
		30
		31	/**
		32	* DOC: Logical Rings, Logical Ring Contexts and Execlists
		33	*
		34	* Motivation:
		35	* GEN8 brings an expansion of the HW contexts: "Logical Ring Contexts".
		36	* These expanded contexts enable a number of new abilities, especially
		37	* "Execlists" (also implemented in this file).
		38	*
		39	* One of the main differences with the legacy HW contexts is that logical
		40	* ring contexts incorporate many more things to the context's state, like
		41	* PDPs or ringbuffer control registers:
		42	*
		43	* The reason why PDPs are included in the context is straightforward: as
		44	* PPGTTs (per-process GTTs) are actually per-context, having the PDPs
		45	* contained there means you don't need to do a ppgtt->switch_mm yourself,
		46	* instead, the GPU will do it for you on the context switch.
		47	*
		48	* But, what about the ringbuffer control registers (head, tail, etc..)?
		49	* shouldn't we just need a set of those per engine command streamer? This is
		50	* where the name "Logical Rings" starts to make sense: by virtualizing the
		51	* rings, the engine cs shifts to a new "ring buffer" with every context
		52	* switch. When you want to submit a workload to the GPU you: A) choose your
		53	* context, B) find its appropriate virtualized ring, C) write commands to it
		54	* and then, finally, D) tell the GPU to switch to that context.
		55	*
		56	* Instead of the legacy MI_SET_CONTEXT, the way you tell the GPU to switch
		57	* to a context is via a context execution list, ergo "Execlists".
		58	*
		59	* LRC implementation:
		60	* Regarding the creation of contexts, we have:
		61	*
		62	* - One global default context.
		63	* - One local default context for each opened fd.
		64	* - One local extra context for each context create ioctl call.
		65	*
		66	* Now that ringbuffers belong per-context (and not per-engine, like before)
		67	* and that contexts are uniquely tied to a given engine (and not reusable,
		68	* like before) we need:
		69	*
		70	* - One ringbuffer per-engine inside each context.
		71	* - One backing object per-engine inside each context.
		72	*
		73	* The global default context starts its life with these new objects fully
		74	* allocated and populated. The local default context for each opened fd is
		75	* more complex, because we don't know at creation time which engine is going
		76	* to use them. To handle this, we have implemented a deferred creation of LR
		77	* contexts:
		78	*
		79	* The local context starts its life as a hollow or blank holder, that only
		80	* gets populated for a given engine once we receive an execbuffer. If later
		81	* on we receive another execbuffer ioctl for the same context but a different
		82	* engine, we allocate/populate a new ringbuffer and context backing object and
		83	* so on.
		84	*
		85	* Finally, regarding local contexts created using the ioctl call: as they are
		86	* only allowed with the render ring, we can allocate & populate them right
		87	* away (no need to defer anything, at least for now).
		88	*
		89	* Execlists implementation:
		90	* Execlists are the new method by which, on gen8+ hardware, workloads are
		91	* submitted for execution (as opposed to the legacy, ringbuffer-based, method).
		92	* This method works as follows:
		93	*
		94	* When a request is committed, its commands (the BB start and any leading or
		95	* trailing commands, like the seqno breadcrumbs) are placed in the ringbuffer
		96	* for the appropriate context. The tail pointer in the hardware context is not
		97	* updated at this time, but instead, kept by the driver in the ringbuffer
		98	* structure. A structure representing this request is added to a request queue
		99	* for the appropriate engine: this structure contains a copy of the context's
		100	* tail after the request was written to the ring buffer and a pointer to the
		101	* context itself.
		102	*
		103	* If the engine's request queue was empty before the request was added, the
		104	* queue is processed immediately. Otherwise the queue will be processed during
		105	* a context switch interrupt. In any case, elements on the queue will get sent
		106	* (in pairs) to the GPU's ExecLists Submit Port (ELSP, for short) with a
		107	* globally unique 20-bits submission ID.
		108	*
		109	* When execution of a request completes, the GPU updates the context status
		110	* buffer with a context complete event and generates a context switch interrupt.
		111	* During the interrupt handling, the driver examines the events in the buffer:
		112	* for each context complete event, if the announced ID matches that on the head
		113	* of the request queue, then that request is retired and removed from the queue.
		114	*
		115	* After processing, if any requests were retired and the queue is not empty
		116	* then a new execution list can be submitted. The two requests at the front of
		117	* the queue are next to be submitted but since a context may not occur twice in
		118	* an execution list, if subsequent requests have the same ID as the first then
		119	* the two requests must be combined. This is done simply by discarding requests
		120	* at the head of the queue until either only one request is left (in which case
		121	* we use a NULL second context) or the first two requests have unique IDs.
		122	*
		123	* By always executing the first two requests in the queue the driver ensures
		124	* that the GPU is kept as busy as possible. In the case where a single context
		125	* completes but a second context is still executing, the request for this second
		126	* context will be at the head of the queue when we remove the first one. This
		127	* request will then be resubmitted along with a new request for a different context,
		128	* which will cause the hardware to continue executing the second request and queue
		129	* the new request (the GPU detects the condition of a context getting preempted
		130	* with the same context and optimizes the context switch flow by not doing
		131	* preemption, but just sampling the new tail pointer).
		132	*
		133	*/
		134
		135	#include
		136	#include
6937	serge	137	#include "i915_drv.h"
6084	serge	138	#include "intel_mocs.h"
5354	serge	139
		140	#define GEN9_LR_CONTEXT_RENDER_SIZE (22 * PAGE_SIZE)
		141	#define GEN8_LR_CONTEXT_RENDER_SIZE (20 * PAGE_SIZE)
		142	#define GEN8_LR_CONTEXT_OTHER_SIZE (2 * PAGE_SIZE)
		143
		144	#define RING_EXECLIST_QFULL (1 << 0x2)
		145	#define RING_EXECLIST1_VALID (1 << 0x3)
		146	#define RING_EXECLIST0_VALID (1 << 0x4)
		147	#define RING_EXECLIST_ACTIVE_STATUS (3 << 0xE)
		148	#define RING_EXECLIST1_ACTIVE (1 << 0x11)
		149	#define RING_EXECLIST0_ACTIVE (1 << 0x12)
		150
		151	#define GEN8_CTX_STATUS_IDLE_ACTIVE (1 << 0)
		152	#define GEN8_CTX_STATUS_PREEMPTED (1 << 1)
		153	#define GEN8_CTX_STATUS_ELEMENT_SWITCH (1 << 2)
		154	#define GEN8_CTX_STATUS_ACTIVE_IDLE (1 << 3)
		155	#define GEN8_CTX_STATUS_COMPLETE (1 << 4)
		156	#define GEN8_CTX_STATUS_LITE_RESTORE (1 << 15)
		157
		158	#define CTX_LRI_HEADER_0 0x01
		159	#define CTX_CONTEXT_CONTROL 0x02
		160	#define CTX_RING_HEAD 0x04
		161	#define CTX_RING_TAIL 0x06
		162	#define CTX_RING_BUFFER_START 0x08
		163	#define CTX_RING_BUFFER_CONTROL 0x0a
		164	#define CTX_BB_HEAD_U 0x0c
		165	#define CTX_BB_HEAD_L 0x0e
		166	#define CTX_BB_STATE 0x10
		167	#define CTX_SECOND_BB_HEAD_U 0x12
		168	#define CTX_SECOND_BB_HEAD_L 0x14
		169	#define CTX_SECOND_BB_STATE 0x16
		170	#define CTX_BB_PER_CTX_PTR 0x18
		171	#define CTX_RCS_INDIRECT_CTX 0x1a
		172	#define CTX_RCS_INDIRECT_CTX_OFFSET 0x1c
		173	#define CTX_LRI_HEADER_1 0x21
		174	#define CTX_CTX_TIMESTAMP 0x22
		175	#define CTX_PDP3_UDW 0x24
		176	#define CTX_PDP3_LDW 0x26
		177	#define CTX_PDP2_UDW 0x28
		178	#define CTX_PDP2_LDW 0x2a
		179	#define CTX_PDP1_UDW 0x2c
		180	#define CTX_PDP1_LDW 0x2e
		181	#define CTX_PDP0_UDW 0x30
		182	#define CTX_PDP0_LDW 0x32
		183	#define CTX_LRI_HEADER_2 0x41
		184	#define CTX_R_PWR_CLK_STATE 0x42
		185	#define CTX_GPGPU_CSR_BASE_ADDRESS 0x44
		186
		187	#define GEN8_CTX_VALID (1<<0)
		188	#define GEN8_CTX_FORCE_PD_RESTORE (1<<1)
		189	#define GEN8_CTX_FORCE_RESTORE (1<<2)
		190	#define GEN8_CTX_L3LLC_COHERENT (1<<5)
		191	#define GEN8_CTX_PRIVILEGE (1<<8)
6084	serge	192
/*
 * ASSIGN_CTX_REG - fill one (register offset, value) pair of the context
 * register state: slot @pos receives the MMIO offset of @reg and slot
 * @pos + 1 receives @val.
 */
#define ASSIGN_CTX_REG(reg_state, pos, reg, val) do { \
	(reg_state)[(pos)+0] = i915_mmio_reg_offset(reg); \
	(reg_state)[(pos)+1] = (val); \
} while (0)

/*
 * ASSIGN_CTX_PDP - store the address returned by
 * i915_page_dir_dma_addr(@ppgtt, @n) into the value slots of the
 * CTX_PDP<n>_UDW / CTX_PDP<n>_LDW pairs. @n is token-pasted into the
 * index names, so it must be a literal digit for which those names exist.
 * Every argument is parenthesized so that expression arguments expand
 * correctly.
 */
#define ASSIGN_CTX_PDP(ppgtt, reg_state, n) do { \
	const u64 _addr = i915_page_dir_dma_addr((ppgtt), (n)); \
	(reg_state)[CTX_PDP ## n ## _UDW + 1] = upper_32_bits(_addr); \
	(reg_state)[CTX_PDP ## n ## _LDW + 1] = lower_32_bits(_addr); \
} while (0)

/*
 * ASSIGN_CTX_PML4 - store px_dma() of @ppgtt's pml4 into the value slots
 * of the CTX_PDP0_UDW / CTX_PDP0_LDW pairs.
 */
#define ASSIGN_CTX_PML4(ppgtt, reg_state) do { \
	(reg_state)[CTX_PDP0_UDW + 1] = upper_32_bits(px_dma(&(ppgtt)->pml4)); \
	(reg_state)[CTX_PDP0_LDW + 1] = lower_32_bits(px_dma(&(ppgtt)->pml4)); \
} while (0)
6084	serge	208
/* Addressing-mode values placed in the context descriptor. */
enum {
	ADVANCED_CONTEXT	= 0,
	LEGACY_32B_CONTEXT	= 1,
	ADVANCED_AD_CONTEXT	= 2,
	LEGACY_64B_CONTEXT	= 3,
};

/* Bit position of the addressing mode within the descriptor. */
#define GEN8_CTX_ADDRESSING_MODE_SHIFT 3

/* 64b legacy mode when full 48-bit PPGTT is in use, 32b legacy otherwise. */
#define GEN8_CTX_ADDRESSING_MODE(dev) \
	(USES_FULL_48BIT_PPGTT(dev) ? LEGACY_64B_CONTEXT : LEGACY_32B_CONTEXT)

/* Fault handling modes. */
enum {
	FAULT_AND_HANG		= 0,
	FAULT_AND_HALT		= 1,	/* Debug only */
	FAULT_AND_STREAM	= 2,
	FAULT_AND_CONTINUE	= 3,	/* Unsupported */
};

/* The execlists context ID occupies the upper dword of the descriptor. */
#define GEN8_CTX_ID_SHIFT 32
#define CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT 0x17

/* Defined later in this file; needed by code that precedes them. */
static int intel_lr_context_pin(struct drm_i915_gem_request *rq);
static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring,
		struct drm_i915_gem_object *default_ctx_obj);
5354	serge	231
6084	serge	232
5354	serge	233	/**
		234	* intel_sanitize_enable_execlists() - sanitize i915.enable_execlists
		235	* @dev: DRM device.
		236	* @enable_execlists: value of i915.enable_execlists module parameter.
		237	*
		238	* Only certain platforms support Execlists (the prerequisites being
6084	serge	239	* support for Logical Ring Contexts and Aliasing PPGTT or better).
5354	serge	240	*
		241	* Return: 1 if Execlists is supported and has to be enabled.
		242	*/
		243	int intel_sanitize_enable_execlists(struct drm_device *dev, int enable_execlists)
		244	{
		245	WARN_ON(i915.enable_ppgtt == -1);
		246
6084	serge	247	/* On platforms with execlist available, vGPU will only
		248	* support execlist mode, no ring buffer mode.
		249	*/
		250	if (HAS_LOGICAL_RING_CONTEXTS(dev) && intel_vgpu_active(dev))
		251	return 1;
		252
5354	serge	253	if (INTEL_INFO(dev)->gen >= 9)
		254	return 1;
		255
		256	if (enable_execlists == 0)
		257	return 0;
		258
		259	if (HAS_LOGICAL_RING_CONTEXTS(dev) && USES_PPGTT(dev) &&
		260	i915.use_mmio_flip >= 0)
		261	return 1;
		262
		263	return 0;
		264	}
		265
		266	/**
		267	* intel_execlists_ctx_id() - get the Execlists Context ID
		268	* @ctx_obj: Logical Ring Context backing object.
		269	*
		270	* Do not confuse with ctx->id! Unfortunately we have a name overload
		271	* here: the old context ID we pass to userspace as a handler so that
		272	* they can refer to a context, and the new context ID we pass to the
		273	* ELSP so that the GPU can inform us of the context status via
		274	* interrupts.
		275	*
		276	* Return: 20-bits globally unique context ID.
		277	*/
		278	u32 intel_execlists_ctx_id(struct drm_i915_gem_object *ctx_obj)
		279	{
6084	serge	280	u32 lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
		281	LRC_PPHWSP_PN * PAGE_SIZE;
5354	serge	282
		283	/* LRCA is required to be 4K aligned so the more significant 20 bits
		284	* are globally unique */
		285	return lrca >> 12;
		286	}
		287
6084	serge	288	static bool disable_lite_restore_wa(struct intel_engine_cs *ring)
5354	serge	289	{
6084	serge	290	struct drm_device *dev = ring->dev;
		291
6937	serge	292	return (IS_SKL_REVID(dev, 0, SKL_REVID_B0) \|\|
		293	IS_BXT_REVID(dev, 0, BXT_REVID_A1)) &&
6084	serge	294	(ring->id == VCS \|\| ring->id == VCS2);
		295	}
		296
		297	uint64_t intel_lr_context_descriptor(struct intel_context *ctx,
		298	struct intel_engine_cs *ring)
		299	{
		300	struct drm_i915_gem_object *ctx_obj = ctx->engine[ring->id].state;
5354	serge	301	uint64_t desc;
6084	serge	302	uint64_t lrca = i915_gem_obj_ggtt_offset(ctx_obj) +
		303	LRC_PPHWSP_PN * PAGE_SIZE;
5354	serge	304
		305	WARN_ON(lrca & 0xFFFFFFFF00000FFFULL);
		306
		307	desc = GEN8_CTX_VALID;
6084	serge	308	desc \|= GEN8_CTX_ADDRESSING_MODE(dev) << GEN8_CTX_ADDRESSING_MODE_SHIFT;
		309	if (IS_GEN8(ctx_obj->base.dev))
		310	desc \|= GEN8_CTX_L3LLC_COHERENT;
5354	serge	311	desc \|= GEN8_CTX_PRIVILEGE;
		312	desc \|= lrca;
		313	desc \|= (u64)intel_execlists_ctx_id(ctx_obj) << GEN8_CTX_ID_SHIFT;
		314
		315	/* TODO: WaDisableLiteRestore when we start using semaphore
		316	* signalling between Command Streamers */
		317	/* desc \|= GEN8_CTX_FORCE_RESTORE; */
		318
6084	serge	319	/* WaEnableForceRestoreInCtxtDescForVCS:skl */
		320	/* WaEnableForceRestoreInCtxtDescForVCS:bxt */
		321	if (disable_lite_restore_wa(ring))
		322	desc \|= GEN8_CTX_FORCE_RESTORE;
		323
5354	serge	324	return desc;
		325	}
		326
6084	serge	327	static void execlists_elsp_write(struct drm_i915_gem_request *rq0,
		328	struct drm_i915_gem_request *rq1)
5354	serge	329	{
6084	serge	330
		331	struct intel_engine_cs *ring = rq0->ring;
5354	serge	332	struct drm_device *dev = ring->dev;
		333	struct drm_i915_private *dev_priv = dev->dev_private;
6084	serge	334	uint64_t desc[2];
5354	serge	335
6084	serge	336	if (rq1) {
		337	desc[1] = intel_lr_context_descriptor(rq1->ctx, rq1->ring);
		338	rq1->elsp_submitted++;
5354	serge	339	} else {
6084	serge	340	desc[1] = 0;
5354	serge	341	}
		342
6084	serge	343	desc[0] = intel_lr_context_descriptor(rq0->ctx, rq0->ring);
		344	rq0->elsp_submitted++;
5354	serge	345
6084	serge	346	/* You must always write both descriptors in the order below. */
		347	spin_lock(&dev_priv->uncore.lock);
		348	intel_uncore_forcewake_get__locked(dev_priv, FORCEWAKE_ALL);
		349	I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[1]));
		350	I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[1]));
5354	serge	351
6084	serge	352	I915_WRITE_FW(RING_ELSP(ring), upper_32_bits(desc[0]));
		353	/* The context is automatically loaded after the following */
		354	I915_WRITE_FW(RING_ELSP(ring), lower_32_bits(desc[0]));
5354	serge	355
6084	serge	356	/* ELSP is a wo register, use another nearby reg for posting */
		357	POSTING_READ_FW(RING_EXECLIST_STATUS_LO(ring));
		358	intel_uncore_forcewake_put__locked(dev_priv, FORCEWAKE_ALL);
		359	spin_unlock(&dev_priv->uncore.lock);
5354	serge	360	}
		361
6084	serge	362	static int execlists_update_context(struct drm_i915_gem_request *rq)
5354	serge	363	{
6084	serge	364	struct intel_engine_cs *ring = rq->ring;
		365	struct i915_hw_ppgtt *ppgtt = rq->ctx->ppgtt;
		366	struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state;
		367	struct drm_i915_gem_object *rb_obj = rq->ringbuf->obj;
5354	serge	368	struct page *page;
		369	uint32_t *reg_state;
		370
6084	serge	371	BUG_ON(!ctx_obj);
		372	WARN_ON(!i915_gem_obj_is_pinned(ctx_obj));
		373	WARN_ON(!i915_gem_obj_is_pinned(rb_obj));
		374
6937	serge	375	page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN);
5354	serge	376	reg_state = kmap_atomic(page);
		377
6084	serge	378	reg_state[CTX_RING_TAIL+1] = rq->tail;
		379	reg_state[CTX_RING_BUFFER_START+1] = i915_gem_obj_ggtt_offset(rb_obj);
5354	serge	380
6084	serge	381	if (ppgtt && !USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
		382	/* True 32b PPGTT with dynamic page allocation: update PDP
		383	* registers and point the unallocated PDPs to scratch page.
		384	* PML4 is allocated during ppgtt init, so this is not needed
		385	* in 48-bit mode.
		386	*/
		387	ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
		388	ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
		389	ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
		390	ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
		391	}
		392
5354	serge	393	kunmap_atomic(reg_state);
		394
		395	return 0;
		396	}
		397
6084	serge	398	static void execlists_submit_requests(struct drm_i915_gem_request *rq0,
		399	struct drm_i915_gem_request *rq1)
5354	serge	400	{
6084	serge	401	execlists_update_context(rq0);
5354	serge	402
6084	serge	403	if (rq1)
		404	execlists_update_context(rq1);
5354	serge	405
6084	serge	406	execlists_elsp_write(rq0, rq1);
5354	serge	407	}
		408
		409	static void execlists_context_unqueue(struct intel_engine_cs *ring)
		410	{
6084	serge	411	struct drm_i915_gem_request req0 = NULL, req1 = NULL;
		412	struct drm_i915_gem_request cursor = NULL, tmp = NULL;
5354	serge	413
		414	assert_spin_locked(&ring->execlist_lock);
		415
6084	serge	416	/*
		417	* If irqs are not active generate a warning as batches that finish
		418	* without the irqs may get lost and a GPU Hang may occur.
		419	*/
		420	WARN_ON(!intel_irqs_enabled(ring->dev->dev_private));
		421
5354	serge	422	if (list_empty(&ring->execlist_queue))
		423	return;
		424
		425	/* Try to read in pairs */
		426	list_for_each_entry_safe(cursor, tmp, &ring->execlist_queue,
		427	execlist_link) {
		428	if (!req0) {
		429	req0 = cursor;
		430	} else if (req0->ctx == cursor->ctx) {
		431	/* Same ctx: ignore first request, as second request
		432	* will update tail past first request's workload */
		433	cursor->elsp_submitted = req0->elsp_submitted;
		434	list_del(&req0->execlist_link);
		435	list_add_tail(&req0->execlist_link,
		436	&ring->execlist_retired_req_list);
		437	req0 = cursor;
		438	} else {
		439	req1 = cursor;
		440	break;
		441	}
		442	}
		443
6084	serge	444	if (IS_GEN8(ring->dev) \|\| IS_GEN9(ring->dev)) {
		445	/*
		446	* WaIdleLiteRestore: make sure we never cause a lite
		447	* restore with HEAD==TAIL
		448	*/
		449	if (req0->elsp_submitted) {
		450	/*
		451	* Apply the wa NOOPS to prevent ring:HEAD == req:TAIL
		452	* as we resubmit the request. See gen8_emit_request()
		453	* for where we prepare the padding after the end of the
		454	* request.
		455	*/
		456	struct intel_ringbuffer *ringbuf;
		457
		458	ringbuf = req0->ctx->engine[ring->id].ringbuf;
		459	req0->tail += 8;
		460	req0->tail &= ringbuf->size - 1;
		461	}
		462	}
		463
5354	serge	464	WARN_ON(req1 && req1->elsp_submitted);
		465
6084	serge	466	execlists_submit_requests(req0, req1);
5354	serge	467	}
		468
		469	static bool execlists_check_remove_request(struct intel_engine_cs *ring,
		470	u32 request_id)
		471	{
6084	serge	472	struct drm_i915_gem_request *head_req;
5354	serge	473
		474	assert_spin_locked(&ring->execlist_lock);
		475
		476	head_req = list_first_entry_or_null(&ring->execlist_queue,
6084	serge	477	struct drm_i915_gem_request,
5354	serge	478	execlist_link);
		479
		480	if (head_req != NULL) {
		481	struct drm_i915_gem_object *ctx_obj =
		482	head_req->ctx->engine[ring->id].state;
		483	if (intel_execlists_ctx_id(ctx_obj) == request_id) {
		484	WARN(head_req->elsp_submitted == 0,
		485	"Never submitted head request\n");
		486
		487	if (--head_req->elsp_submitted <= 0) {
		488	list_del(&head_req->execlist_link);
		489	list_add_tail(&head_req->execlist_link,
		490	&ring->execlist_retired_req_list);
		491	return true;
		492	}
		493	}
		494	}
		495
		496	return false;
		497	}
		498
		499	/**
6084	serge	500	* intel_lrc_irq_handler() - handle Context Switch interrupts
5354	serge	501	* @ring: Engine Command Streamer to handle.
		502	*
		503	* Check the unread Context Status Buffers and manage the submission of new
		504	* contexts to the ELSP accordingly.
		505	*/
6084	serge	506	void intel_lrc_irq_handler(struct intel_engine_cs *ring)
5354	serge	507	{
		508	struct drm_i915_private *dev_priv = ring->dev->dev_private;
		509	u32 status_pointer;
		510	u8 read_pointer;
		511	u8 write_pointer;
6084	serge	512	u32 status = 0;
5354	serge	513	u32 status_id;
		514	u32 submit_contexts = 0;
		515
		516	status_pointer = I915_READ(RING_CONTEXT_STATUS_PTR(ring));
		517
		518	read_pointer = ring->next_context_status_buffer;
6084	serge	519	write_pointer = status_pointer & GEN8_CSB_PTR_MASK;
5354	serge	520	if (read_pointer > write_pointer)
6084	serge	521	write_pointer += GEN8_CSB_ENTRIES;
5354	serge	522
		523	spin_lock(&ring->execlist_lock);
		524
		525	while (read_pointer < write_pointer) {
		526	read_pointer++;
6084	serge	527	status = I915_READ(RING_CONTEXT_STATUS_BUF_LO(ring, read_pointer % GEN8_CSB_ENTRIES));
		528	status_id = I915_READ(RING_CONTEXT_STATUS_BUF_HI(ring, read_pointer % GEN8_CSB_ENTRIES));
5354	serge	529
6084	serge	530	if (status & GEN8_CTX_STATUS_IDLE_ACTIVE)
		531	continue;
		532
5354	serge	533	if (status & GEN8_CTX_STATUS_PREEMPTED) {
		534	if (status & GEN8_CTX_STATUS_LITE_RESTORE) {
		535	if (execlists_check_remove_request(ring, status_id))
		536	WARN(1, "Lite Restored request removed from queue\n");
		537	} else
		538	WARN(1, "Preemption without Lite Restore\n");
		539	}
		540
		541	if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) \|\|
		542	(status & GEN8_CTX_STATUS_ELEMENT_SWITCH)) {
		543	if (execlists_check_remove_request(ring, status_id))
		544	submit_contexts++;
		545	}
		546	}
		547
6084	serge	548	if (disable_lite_restore_wa(ring)) {
		549	/* Prevent a ctx to preempt itself */
		550	if ((status & GEN8_CTX_STATUS_ACTIVE_IDLE) &&
		551	(submit_contexts != 0))
		552	execlists_context_unqueue(ring);
		553	} else if (submit_contexts != 0) {
5354	serge	554	execlists_context_unqueue(ring);
6084	serge	555	}
5354	serge	556
		557	spin_unlock(&ring->execlist_lock);
		558
		559	WARN(submit_contexts > 2, "More than two context complete events?\n");
6084	serge	560	ring->next_context_status_buffer = write_pointer % GEN8_CSB_ENTRIES;
5354	serge	561
		562	I915_WRITE(RING_CONTEXT_STATUS_PTR(ring),
6084	serge	563	_MASKED_FIELD(GEN8_CSB_PTR_MASK << 8,
		564	((u32)ring->next_context_status_buffer &
		565	GEN8_CSB_PTR_MASK) << 8));
5354	serge	566	}
		567
6084	serge	568	static int execlists_context_queue(struct drm_i915_gem_request *request)
5354	serge	569	{
6084	serge	570	struct intel_engine_cs *ring = request->ring;
		571	struct drm_i915_gem_request *cursor;
5354	serge	572	int num_elements = 0;
		573
6084	serge	574	if (request->ctx != ring->default_context)
		575	intel_lr_context_pin(request);
5354	serge	576
6084	serge	577	i915_gem_request_reference(request);
5354	serge	578
6084	serge	579	spin_lock_irq(&ring->execlist_lock);
5354	serge	580
		581	list_for_each_entry(cursor, &ring->execlist_queue, execlist_link)
		582	if (++num_elements > 2)
		583	break;
		584
		585	if (num_elements > 2) {
6084	serge	586	struct drm_i915_gem_request *tail_req;
5354	serge	587
		588	tail_req = list_last_entry(&ring->execlist_queue,
6084	serge	589	struct drm_i915_gem_request,
5354	serge	590	execlist_link);
		591
6084	serge	592	if (request->ctx == tail_req->ctx) {
5354	serge	593	WARN(tail_req->elsp_submitted != 0,
		594	"More than 2 already-submitted reqs queued\n");
		595	list_del(&tail_req->execlist_link);
		596	list_add_tail(&tail_req->execlist_link,
		597	&ring->execlist_retired_req_list);
		598	}
		599	}
		600
6084	serge	601	list_add_tail(&request->execlist_link, &ring->execlist_queue);
5354	serge	602	if (num_elements == 0)
		603	execlists_context_unqueue(ring);
		604
6084	serge	605	spin_unlock_irq(&ring->execlist_lock);
5354	serge	606
		607	return 0;
		608	}
		609
6084	serge	610	static int logical_ring_invalidate_all_caches(struct drm_i915_gem_request *req)
5354	serge	611	{
6084	serge	612	struct intel_engine_cs *ring = req->ring;
5354	serge	613	uint32_t flush_domains;
		614	int ret;
		615
		616	flush_domains = 0;
		617	if (ring->gpu_caches_dirty)
		618	flush_domains = I915_GEM_GPU_DOMAINS;
		619
6084	serge	620	ret = ring->emit_flush(req, I915_GEM_GPU_DOMAINS, flush_domains);
5354	serge	621	if (ret)
		622	return ret;
		623
		624	ring->gpu_caches_dirty = false;
		625	return 0;
		626	}
		627
6084	serge	628	static int execlists_move_to_gpu(struct drm_i915_gem_request *req,
5354	serge	629	struct list_head *vmas)
		630	{
6084	serge	631	const unsigned other_rings = ~intel_ring_flag(req->ring);
5354	serge	632	struct i915_vma *vma;
		633	uint32_t flush_domains = 0;
		634	bool flush_chipset = false;
		635	int ret;
		636
		637	list_for_each_entry(vma, vmas, exec_list) {
		638	struct drm_i915_gem_object *obj = vma->obj;
		639
6084	serge	640	if (obj->active & other_rings) {
		641	ret = i915_gem_object_sync(obj, req->ring, &req);
		642	if (ret)
		643	return ret;
		644	}
5354	serge	645
		646	if (obj->base.write_domain & I915_GEM_DOMAIN_CPU)
		647	flush_chipset \|= i915_gem_clflush_object(obj, false);
		648
		649	flush_domains \|= obj->base.write_domain;
		650	}
		651
		652	if (flush_domains & I915_GEM_DOMAIN_GTT)
		653	wmb();
		654
		655	/* Unconditionally invalidate gpu caches and ensure that we do flush
		656	* any residual writes from the previous batch.
		657	*/
6084	serge	658	return logical_ring_invalidate_all_caches(req);
5354	serge	659	}
		660
6084	serge	661	int intel_logical_ring_alloc_request_extras(struct drm_i915_gem_request *request)
		662	{
		663	int ret;
		664
		665	request->ringbuf = request->ctx->engine[request->ring->id].ringbuf;
		666
		667	if (request->ctx != request->ring->default_context) {
		668	ret = intel_lr_context_pin(request);
		669	if (ret)
		670	return ret;
		671	}
		672
		673	return 0;
		674	}
		675
		676	static int logical_ring_wait_for_space(struct drm_i915_gem_request *req,
		677	int bytes)
		678	{
		679	struct intel_ringbuffer *ringbuf = req->ringbuf;
		680	struct intel_engine_cs *ring = req->ring;
		681	struct drm_i915_gem_request *target;
		682	unsigned space;
		683	int ret;
		684
		685	if (intel_ring_space(ringbuf) >= bytes)
		686	return 0;
		687
		688	/* The whole point of reserving space is to not wait! */
		689	WARN_ON(ringbuf->reserved_in_use);
		690
		691	list_for_each_entry(target, &ring->request_list, list) {
		692	/*
		693	* The request queue is per-engine, so can contain requests
		694	* from multiple ringbuffers. Here, we must ignore any that
		695	* aren't from the ringbuffer we're considering.
		696	*/
		697	if (target->ringbuf != ringbuf)
		698	continue;
		699
		700	/* Would completion of this request free enough space? */
		701	space = __intel_ring_space(target->postfix, ringbuf->tail,
		702	ringbuf->size);
		703	if (space >= bytes)
		704	break;
		705	}
		706
		707	if (WARN_ON(&target->list == &ring->request_list))
		708	return -ENOSPC;
		709
		710	ret = i915_wait_request(target);
		711	if (ret)
		712	return ret;
		713
		714	ringbuf->space = space;
		715	return 0;
		716	}
		717
		718	/*
		719	* intel_logical_ring_advance_and_submit() - advance the tail and submit the workload
		720	* @request: Request to advance the logical ringbuffer of.
		721	*
		722	* The tail is updated in our logical ringbuffer struct, not in the actual context. What
		723	* really happens during submission is that the context and current tail will be placed
		724	* on a queue waiting for the ELSP to be ready to accept a new context submission. At that
		725	* point, the tail inside the context is updated and the ELSP written to.
		726	*/
		727	static void
		728	intel_logical_ring_advance_and_submit(struct drm_i915_gem_request *request)
		729	{
		730	struct intel_engine_cs *ring = request->ring;
		731	struct drm_i915_private *dev_priv = request->i915;
		732
		733	intel_logical_ring_advance(request->ringbuf);
		734
		735	request->tail = request->ringbuf->tail;
		736
		737	if (intel_ring_stopped(ring))
		738	return;
		739
		740	if (dev_priv->guc.execbuf_client)
		741	i915_guc_submit(dev_priv->guc.execbuf_client, request);
		742	else
		743	execlists_context_queue(request);
		744	}
		745
		746	static void __wrap_ring_buffer(struct intel_ringbuffer *ringbuf)
		747	{
		748	uint32_t __iomem *virt;
		749	int rem = ringbuf->size - ringbuf->tail;
		750
		751	virt = ringbuf->virtual_start + ringbuf->tail;
		752	rem /= 4;
		753	while (rem--)
		754	iowrite32(MI_NOOP, virt++);
		755
		756	ringbuf->tail = 0;
		757	intel_ring_update_space(ringbuf);
		758	}
		759
		760	static int logical_ring_prepare(struct drm_i915_gem_request *req, int bytes)
		761	{
		762	struct intel_ringbuffer *ringbuf = req->ringbuf;
		763	int remain_usable = ringbuf->effective_size - ringbuf->tail;
		764	int remain_actual = ringbuf->size - ringbuf->tail;
		765	int ret, total_bytes, wait_bytes = 0;
		766	bool need_wrap = false;
		767
		768	if (ringbuf->reserved_in_use)
		769	total_bytes = bytes;
		770	else
		771	total_bytes = bytes + ringbuf->reserved_size;
		772
		773	if (unlikely(bytes > remain_usable)) {
		774	/*
		775	* Not enough space for the basic request. So need to flush
		776	* out the remainder and then wait for base + reserved.
		777	*/
		778	wait_bytes = remain_actual + total_bytes;
		779	need_wrap = true;
		780	} else {
		781	if (unlikely(total_bytes > remain_usable)) {
		782	/*
		783	* The base request will fit but the reserved space
6660	serge	784	* falls off the end. So don't need an immediate wrap
		785	* and only need to effectively wait for the reserved
		786	* size space from the start of ringbuffer.
6084	serge	787	*/
		788	wait_bytes = remain_actual + ringbuf->reserved_size;
		789	} else if (total_bytes > ringbuf->space) {
		790	/* No wrapping required, just waiting. */
		791	wait_bytes = total_bytes;
		792	}
		793	}
		794
		795	if (wait_bytes) {
		796	ret = logical_ring_wait_for_space(req, wait_bytes);
		797	if (unlikely(ret))
		798	return ret;
		799
		800	if (need_wrap)
		801	__wrap_ring_buffer(ringbuf);
		802	}
		803
		804	return 0;
		805	}
		806
5354	serge	807	/**
6084	serge	808	* intel_logical_ring_begin() - prepare the logical ringbuffer to accept some commands
		809	*
		810	* @req: The request to start some new work for
		811	* @num_dwords: number of DWORDs that we plan to write to the ringbuffer.
		812	*
		813	* The ringbuffer might not be ready to accept the commands right away (maybe it needs to
		814	* be wrapped, or wait a bit for the tail to be updated). This function takes care of that
		815	* and also preallocates a request (every workload submission is still mediated through
		816	* requests, same as it did with legacy ringbuffer submission).
		817	*
		818	* Return: non-zero if the ringbuffer is not ready to be written to.
		819	*/
		820	int intel_logical_ring_begin(struct drm_i915_gem_request *req, int num_dwords)
		821	{
		822	struct drm_i915_private *dev_priv;
		823	int ret;
		824
		825	WARN_ON(req == NULL);
		826	dev_priv = req->ring->dev->dev_private;
		827
		828	ret = i915_gem_check_wedge(&dev_priv->gpu_error,
		829	dev_priv->mm.interruptible);
		830	if (ret)
		831	return ret;
		832
		833	ret = logical_ring_prepare(req, num_dwords * sizeof(uint32_t));
		834	if (ret)
		835	return ret;
		836
		837	req->ringbuf->space -= num_dwords * sizeof(uint32_t);
		838	return 0;
		839	}
		840
		841	int intel_logical_ring_reserve_space(struct drm_i915_gem_request *request)
		842	{
		843	/*
		844	* The first call merely notes the reserve request and is common for
		845	* all back ends. The subsequent localised _begin() call actually
		846	* ensures that the reservation is available. Without the begin, if
		847	* the request creator immediately submitted the request without
		848	* adding any commands to it then there might not actually be
		849	* sufficient room for the submission commands.
		850	*/
		851	intel_ring_reserved_space_reserve(request->ringbuf, MIN_SPACE_FOR_ADD_REQUEST);
		852
		853	return intel_logical_ring_begin(request, 0);
		854	}
		855
		856	/**
5354	serge	857	* execlists_submission() - submit a batchbuffer for execution, Execlists style
		858	* @dev: DRM device.
		859	* @file: DRM file.
		860	* @ring: Engine Command Streamer to submit to.
		861	* @ctx: Context to employ for this submission.
		862	* @args: execbuffer call arguments.
		863	* @vmas: list of vmas.
		864	* @batch_obj: the batchbuffer to submit.
		865	* @exec_start: batchbuffer start virtual address pointer.
6084	serge	866	* @dispatch_flags: translated execbuffer call flags.
5354	serge	867	*
		868	* This is the evil twin version of i915_gem_ringbuffer_submission. It abstracts
		869	* away the submission details of the execbuffer ioctl call.
		870	*
		871	* Return: non-zero if the submission fails.
		872	*/
6084	serge	873	int intel_execlists_submission(struct i915_execbuffer_params *params,
5354	serge	874	struct drm_i915_gem_execbuffer2 *args,
6084	serge	875	struct list_head *vmas)
5354	serge	876	{
6084	serge	877	struct drm_device *dev = params->dev;
		878	struct intel_engine_cs *ring = params->ring;
5354	serge	879	struct drm_i915_private *dev_priv = dev->dev_private;
6084	serge	880	struct intel_ringbuffer *ringbuf = params->ctx->engine[ring->id].ringbuf;
		881	u64 exec_start;
5354	serge	882	int instp_mode;
		883	u32 instp_mask;
		884	int ret;
		885
		886	instp_mode = args->flags & I915_EXEC_CONSTANTS_MASK;
		887	instp_mask = I915_EXEC_CONSTANTS_MASK;
		888	switch (instp_mode) {
		889	case I915_EXEC_CONSTANTS_REL_GENERAL:
		890	case I915_EXEC_CONSTANTS_ABSOLUTE:
		891	case I915_EXEC_CONSTANTS_REL_SURFACE:
		892	if (instp_mode != 0 && ring != &dev_priv->ring[RCS]) {
		893	DRM_DEBUG("non-0 rel constants mode on non-RCS\n");
		894	return -EINVAL;
		895	}
		896
		897	if (instp_mode != dev_priv->relative_constants_mode) {
		898	if (instp_mode == I915_EXEC_CONSTANTS_REL_SURFACE) {
		899	DRM_DEBUG("rel surface constants mode invalid on gen5+\n");
		900	return -EINVAL;
		901	}
		902
		903	/* The HW changed the meaning on this bit on gen6 */
		904	instp_mask &= ~I915_EXEC_CONSTANTS_REL_SURFACE;
		905	}
		906	break;
		907	default:
		908	DRM_DEBUG("execbuf with unknown constants: %d\n", instp_mode);
		909	return -EINVAL;
		910	}
		911
		912	if (args->flags & I915_EXEC_GEN7_SOL_RESET) {
		913	DRM_DEBUG("sol reset is gen7 only\n");
		914	return -EINVAL;
		915	}
		916
6084	serge	917	ret = execlists_move_to_gpu(params->request, vmas);
5354	serge	918	if (ret)
		919	return ret;
		920
		921	if (ring == &dev_priv->ring[RCS] &&
		922	instp_mode != dev_priv->relative_constants_mode) {
6084	serge	923	ret = intel_logical_ring_begin(params->request, 4);
5354	serge	924	if (ret)
		925	return ret;
		926
		927	intel_logical_ring_emit(ringbuf, MI_NOOP);
		928	intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(1));
6937	serge	929	intel_logical_ring_emit_reg(ringbuf, INSTPM);
5354	serge	930	intel_logical_ring_emit(ringbuf, instp_mask << 16 \| instp_mode);
		931	intel_logical_ring_advance(ringbuf);
		932
		933	dev_priv->relative_constants_mode = instp_mode;
		934	}
		935
6084	serge	936	exec_start = params->batch_obj_vm_offset +
		937	args->batch_start_offset;
		938
		939	ret = ring->emit_bb_start(params->request, exec_start, params->dispatch_flags);
5354	serge	940	if (ret)
		941	return ret;
		942
6084	serge	943	trace_i915_gem_ring_dispatch(params->request, params->dispatch_flags);
5354	serge	944
6084	serge	945	i915_gem_execbuffer_move_to_active(vmas, params->request);
		946	i915_gem_execbuffer_retire_commands(params);
		947
5354	serge	948	return 0;
		949	}
		950
		951	void intel_execlists_retire_requests(struct intel_engine_cs *ring)
		952	{
6084	serge	953	struct drm_i915_gem_request req, tmp;
5354	serge	954	struct list_head retired_list;
		955
		956	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
		957	if (list_empty(&ring->execlist_retired_req_list))
		958	return;
		959
		960	INIT_LIST_HEAD(&retired_list);
6084	serge	961	spin_lock_irq(&ring->execlist_lock);
5354	serge	962	list_replace_init(&ring->execlist_retired_req_list, &retired_list);
6084	serge	963	spin_unlock_irq(&ring->execlist_lock);
5354	serge	964
		965	list_for_each_entry_safe(req, tmp, &retired_list, execlist_link) {
		966	struct intel_context *ctx = req->ctx;
		967	struct drm_i915_gem_object *ctx_obj =
		968	ctx->engine[ring->id].state;
		969
		970	if (ctx_obj && (ctx != ring->default_context))
6084	serge	971	intel_lr_context_unpin(req);
5354	serge	972	list_del(&req->execlist_link);
6084	serge	973	i915_gem_request_unreference(req);
5354	serge	974	}
		975	}
		976
		977	void intel_logical_ring_stop(struct intel_engine_cs *ring)
		978	{
		979	struct drm_i915_private *dev_priv = ring->dev->dev_private;
		980	int ret;
		981
		982	if (!intel_ring_initialized(ring))
		983	return;
		984
		985	ret = intel_ring_idle(ring);
		986	if (ret && !i915_reset_in_progress(&to_i915(ring->dev)->gpu_error))
		987	DRM_ERROR("failed to quiesce %s whilst cleaning up: %d\n",
		988	ring->name, ret);
		989
		990	/* TODO: Is this correct with Execlists enabled? */
		991	I915_WRITE_MODE(ring, _MASKED_BIT_ENABLE(STOP_RING));
		992	if (wait_for_atomic((I915_READ_MODE(ring) & MODE_IDLE) != 0, 1000)) {
		993	DRM_ERROR("%s :timed out trying to stop ring\n", ring->name);
		994	return;
		995	}
		996	I915_WRITE_MODE(ring, _MASKED_BIT_DISABLE(STOP_RING));
		997	}
		998
6084	serge	999	int logical_ring_flush_all_caches(struct drm_i915_gem_request *req)
5354	serge	1000	{
6084	serge	1001	struct intel_engine_cs *ring = req->ring;
5354	serge	1002	int ret;
		1003
		1004	if (!ring->gpu_caches_dirty)
		1005	return 0;
		1006
6084	serge	1007	ret = ring->emit_flush(req, 0, I915_GEM_GPU_DOMAINS);
5354	serge	1008	if (ret)
		1009	return ret;
		1010
		1011	ring->gpu_caches_dirty = false;
		1012	return 0;
		1013	}
		1014
6084	serge	1015	static int intel_lr_context_do_pin(struct intel_engine_cs *ring,
		1016	struct drm_i915_gem_object *ctx_obj,
		1017	struct intel_ringbuffer *ringbuf)
5354	serge	1018	{
6084	serge	1019	struct drm_device *dev = ring->dev;
		1020	struct drm_i915_private *dev_priv = dev->dev_private;
		1021	int ret = 0;
5354	serge	1022
6084	serge	1023	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
		1024	ret = i915_gem_obj_ggtt_pin(ctx_obj, GEN8_LR_CONTEXT_ALIGN,
		1025	PIN_OFFSET_BIAS \| GUC_WOPCM_TOP);
		1026	if (ret)
		1027	return ret;
5354	serge	1028
6084	serge	1029	ret = intel_pin_and_map_ringbuffer_obj(ring->dev, ringbuf);
		1030	if (ret)
		1031	goto unpin_ctx_obj;
5354	serge	1032
6084	serge	1033	ctx_obj->dirty = true;
		1034
		1035	/* Invalidate GuC TLB. */
		1036	if (i915.enable_guc_submission)
		1037	I915_WRITE(GEN8_GTCR, GEN8_GTCR_INVALIDATE);
		1038
		1039	return ret;
		1040
		1041	unpin_ctx_obj:
		1042	i915_gem_object_ggtt_unpin(ctx_obj);
		1043
		1044	return ret;
5354	serge	1045	}
		1046
6084	serge	1047	static int intel_lr_context_pin(struct drm_i915_gem_request *rq)
5354	serge	1048	{
		1049	int ret = 0;
6084	serge	1050	struct intel_engine_cs *ring = rq->ring;
		1051	struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state;
		1052	struct intel_ringbuffer *ringbuf = rq->ringbuf;
5354	serge	1053
6084	serge	1054	if (rq->ctx->engine[ring->id].pin_count++ == 0) {
		1055	ret = intel_lr_context_do_pin(ring, ctx_obj, ringbuf);
5354	serge	1056	if (ret)
6084	serge	1057	goto reset_pin_count;
5354	serge	1058	}
		1059	return ret;
		1060
6084	serge	1061	reset_pin_count:
		1062	rq->ctx->engine[ring->id].pin_count = 0;
5354	serge	1063	return ret;
		1064	}
		1065
6084	serge	1066	void intel_lr_context_unpin(struct drm_i915_gem_request *rq)
5354	serge	1067	{
6084	serge	1068	struct intel_engine_cs *ring = rq->ring;
		1069	struct drm_i915_gem_object *ctx_obj = rq->ctx->engine[ring->id].state;
		1070	struct intel_ringbuffer *ringbuf = rq->ringbuf;
5354	serge	1071
		1072	if (ctx_obj) {
		1073	WARN_ON(!mutex_is_locked(&ring->dev->struct_mutex));
6084	serge	1074	if (--rq->ctx->engine[ring->id].pin_count == 0) {
5354	serge	1075	intel_unpin_ringbuffer_obj(ringbuf);
		1076	i915_gem_object_ggtt_unpin(ctx_obj);
		1077	}
		1078	}
		1079	}
		1080
6084	serge	1081	static int intel_logical_ring_workarounds_emit(struct drm_i915_gem_request *req)
5354	serge	1082	{
6084	serge	1083	int ret, i;
		1084	struct intel_engine_cs *ring = req->ring;
		1085	struct intel_ringbuffer *ringbuf = req->ringbuf;
		1086	struct drm_device *dev = ring->dev;
		1087	struct drm_i915_private *dev_priv = dev->dev_private;
		1088	struct i915_workarounds *w = &dev_priv->workarounds;
5354	serge	1089
6084	serge	1090	if (WARN_ON_ONCE(w->count == 0))
5354	serge	1091	return 0;
		1092
6084	serge	1093	ring->gpu_caches_dirty = true;
		1094	ret = logical_ring_flush_all_caches(req);
		1095	if (ret)
		1096	return ret;
5354	serge	1097
6084	serge	1098	ret = intel_logical_ring_begin(req, w->count * 2 + 2);
		1099	if (ret)
		1100	return ret;
5354	serge	1101
6084	serge	1102	intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(w->count));
		1103	for (i = 0; i < w->count; i++) {
6937	serge	1104	intel_logical_ring_emit_reg(ringbuf, w->reg[i].addr);
6084	serge	1105	intel_logical_ring_emit(ringbuf, w->reg[i].value);
		1106	}
		1107	intel_logical_ring_emit(ringbuf, MI_NOOP);
5354	serge	1108
6084	serge	1109	intel_logical_ring_advance(ringbuf);
5354	serge	1110
6084	serge	1111	ring->gpu_caches_dirty = true;
		1112	ret = logical_ring_flush_all_caches(req);
		1113	if (ret)
		1114	return ret;
5354	serge	1115
6084	serge	1116	return 0;
5354	serge	1117	}
		1118
/*
 * wa_ctx_emit() - store one dword into a workaround batch page.
 * @batch: base of the page, indexed in dwords
 * @index: lvalue holding the next free dword; post-incremented by the macro
 * @cmd:   value to store
 *
 * NOTE: this macro contains a hidden "return -ENOSPC" that leaves the
 * *calling function* (after a WARN) once the index reaches the end of the
 * page, so it may only be used inside functions returning int.
 */
#define wa_ctx_emit(batch, index, cmd)					\
	do {								\
		int __index = (index)++;				\
		if (WARN_ON(__index >= (PAGE_SIZE / sizeof(uint32_t)))) { \
			return -ENOSPC;					\
		}							\
		(batch)[__index] = (cmd);				\
	} while (0)

/* Same as wa_ctx_emit(), but stores the mmio offset of register @reg. */
#define wa_ctx_emit_reg(batch, index, reg) \
	wa_ctx_emit((batch), (index), i915_mmio_reg_offset(reg))
6084	serge	1130
		1131	/*
		1132	* In this WA we need to set GEN8_L3SQCREG4[21:21] and reset it after
		1133	* PIPE_CONTROL instruction. This is required for the flush to happen correctly
		1134	* but there is a slight complication as this is applied in WA batch where the
		1135	* values are only initialized once so we cannot take register value at the
		1136	* beginning and reuse it further; hence we save its value to memory, upload a
		1137	* constant value with bit21 set and then we restore it back with the saved value.
		1138	* To simplify the WA, a constant value is formed by using the default value
		1139	* of this register. This shouldn't be a problem because we are only modifying
		1140	* it for a short period and this batch in non-premptible. We can ofcourse
		1141	* use additional instructions that read the actual value of the register
		1142	* at that time and set our bit of interest but it makes the WA complicated.
		1143	*
		1144	* This WA is also required for Gen9 so extracting as a function avoids
		1145	* code duplication.
		1146	*/
		1147	static inline int gen8_emit_flush_coherentl3_wa(struct intel_engine_cs *ring,
		1148	uint32_t *const batch,
		1149	uint32_t index)
5354	serge	1150	{
6084	serge	1151	uint32_t l3sqc4_flush = (0x40400000 \| GEN8_LQSC_FLUSH_COHERENT_LINES);
5354	serge	1152
6084	serge	1153	/*
		1154	* WaDisableLSQCROPERFforOCL:skl
		1155	* This WA is implemented in skl_init_clock_gating() but since
		1156	* this batch updates GEN8_L3SQCREG4 with default value we need to
		1157	* set this bit here to retain the WA during flush.
		1158	*/
6937	serge	1159	if (IS_SKL_REVID(ring->dev, 0, SKL_REVID_E0))
6084	serge	1160	l3sqc4_flush \|= GEN8_LQSC_RO_PERF_DIS;
5354	serge	1161
6084	serge	1162	wa_ctx_emit(batch, index, (MI_STORE_REGISTER_MEM_GEN8 \|
		1163	MI_SRM_LRM_GLOBAL_GTT));
6937	serge	1164	wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4);
6084	serge	1165	wa_ctx_emit(batch, index, ring->scratch.gtt_offset + 256);
		1166	wa_ctx_emit(batch, index, 0);
5354	serge	1167
6084	serge	1168	wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1));
6937	serge	1169	wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4);
6084	serge	1170	wa_ctx_emit(batch, index, l3sqc4_flush);
5354	serge	1171
6084	serge	1172	wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6));
		1173	wa_ctx_emit(batch, index, (PIPE_CONTROL_CS_STALL \|
		1174	PIPE_CONTROL_DC_FLUSH_ENABLE));
		1175	wa_ctx_emit(batch, index, 0);
		1176	wa_ctx_emit(batch, index, 0);
		1177	wa_ctx_emit(batch, index, 0);
		1178	wa_ctx_emit(batch, index, 0);
5354	serge	1179
6084	serge	1180	wa_ctx_emit(batch, index, (MI_LOAD_REGISTER_MEM_GEN8 \|
		1181	MI_SRM_LRM_GLOBAL_GTT));
6937	serge	1182	wa_ctx_emit_reg(batch, index, GEN8_L3SQCREG4);
6084	serge	1183	wa_ctx_emit(batch, index, ring->scratch.gtt_offset + 256);
		1184	wa_ctx_emit(batch, index, 0);
5354	serge	1185
6084	serge	1186	return index;
		1187	}
5354	serge	1188
6084	serge	1189	static inline uint32_t wa_ctx_start(struct i915_wa_ctx_bb *wa_ctx,
		1190	uint32_t offset,
		1191	uint32_t start_alignment)
		1192	{
		1193	return wa_ctx->offset = ALIGN(offset, start_alignment);
		1194	}
		1195
		1196	static inline int wa_ctx_end(struct i915_wa_ctx_bb *wa_ctx,
		1197	uint32_t offset,
		1198	uint32_t size_alignment)
		1199	{
		1200	wa_ctx->size = offset - wa_ctx->offset;
		1201
		1202	WARN(wa_ctx->size % size_alignment,
		1203	"wa_ctx_bb failed sanity checks: size %d is not aligned to %d\n",
		1204	wa_ctx->size, size_alignment);
5354	serge	1205	return 0;
		1206	}
		1207
6084	serge	1208	/**
		1209	* gen8_init_indirectctx_bb() - initialize indirect ctx batch with WA
		1210	*
		1211	* @ring: only applicable for RCS
		1212	* @wa_ctx: structure representing wa_ctx
		1213	* offset: specifies start of the batch, should be cache-aligned. This is updated
		1214	* with the offset value received as input.
		1215	* size: size of the batch in DWORDS but HW expects in terms of cachelines
		1216	* @batch: page in which WA are loaded
		1217	* @offset: This field specifies the start of the batch, it should be
		1218	* cache-aligned otherwise it is adjusted accordingly.
		1219	* Typically we only have one indirect_ctx and per_ctx batch buffer which are
		1220	* initialized at the beginning and shared across all contexts but this field
		1221	* helps us to have multiple batches at different offsets and select them based
		1222	* on a criteria. At the moment this batch always start at the beginning of the page
		1223	* and at this point we don't have multiple wa_ctx batch buffers.
		1224	*
		1225	* The number of WA applied are not known at the beginning; we use this field
		1226	* to return the no of DWORDS written.
		1227	*
		1228	* It is to be noted that this batch does not contain MI_BATCH_BUFFER_END
		1229	* so it adds NOOPs as padding to make it cacheline aligned.
		1230	* MI_BATCH_BUFFER_END will be added to perctx batch and both of them together
		1231	* makes a complete batch buffer.
		1232	*
		1233	* Return: non-zero if we exceed the PAGE_SIZE limit.
		1234	*/
		1235
		1236	static int gen8_init_indirectctx_bb(struct intel_engine_cs *ring,
		1237	struct i915_wa_ctx_bb *wa_ctx,
		1238	uint32_t *const batch,
		1239	uint32_t *offset)
5354	serge	1240	{
6084	serge	1241	uint32_t scratch_addr;
		1242	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
5354	serge	1243
6084	serge	1244	/* WaDisableCtxRestoreArbitration:bdw,chv */
		1245	wa_ctx_emit(batch, index, MI_ARB_ON_OFF \| MI_ARB_DISABLE);
5354	serge	1246
6084	serge	1247	/* WaFlushCoherentL3CacheLinesAtContextSwitch:bdw */
		1248	if (IS_BROADWELL(ring->dev)) {
		1249	int rc = gen8_emit_flush_coherentl3_wa(ring, batch, index);
		1250	if (rc < 0)
		1251	return rc;
		1252	index = rc;
		1253	}
5354	serge	1254
6084	serge	1255	/* WaClearSlmSpaceAtContextSwitch:bdw,chv */
		1256	/* Actual scratch location is at 128 bytes offset */
		1257	scratch_addr = ring->scratch.gtt_offset + 2*CACHELINE_BYTES;
5354	serge	1258
6084	serge	1259	wa_ctx_emit(batch, index, GFX_OP_PIPE_CONTROL(6));
		1260	wa_ctx_emit(batch, index, (PIPE_CONTROL_FLUSH_L3 \|
		1261	PIPE_CONTROL_GLOBAL_GTT_IVB \|
		1262	PIPE_CONTROL_CS_STALL \|
		1263	PIPE_CONTROL_QW_WRITE));
		1264	wa_ctx_emit(batch, index, scratch_addr);
		1265	wa_ctx_emit(batch, index, 0);
		1266	wa_ctx_emit(batch, index, 0);
		1267	wa_ctx_emit(batch, index, 0);
5354	serge	1268
6084	serge	1269	/* Pad to end of cacheline */
		1270	while (index % CACHELINE_DWORDS)
		1271	wa_ctx_emit(batch, index, MI_NOOP);
5354	serge	1272
6084	serge	1273	/*
		1274	* MI_BATCH_BUFFER_END is not required in Indirect ctx BB because
		1275	* execution depends on the length specified in terms of cache lines
		1276	* in the register CTX_RCS_INDIRECT_CTX
		1277	*/
5354	serge	1278
6084	serge	1279	return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS);
		1280	}
5354	serge	1281
6084	serge	1282	/**
		1283	* gen8_init_perctx_bb() - initialize per ctx batch with WA
		1284	*
		1285	* @ring: only applicable for RCS
		1286	* @wa_ctx: structure representing wa_ctx
		1287	* offset: specifies start of the batch, should be cache-aligned.
		1288	* size: size of the batch in DWORDS but HW expects in terms of cachelines
		1289	* @batch: page in which WA are loaded
		1290	* @offset: This field specifies the start of this batch.
		1291	* This batch is started immediately after indirect_ctx batch. Since we ensure
		1292	* that indirect_ctx ends on a cacheline this batch is aligned automatically.
		1293	*
		1294	* The number of DWORDS written are returned using this field.
		1295	*
		1296	* This batch is terminated with MI_BATCH_BUFFER_END and so we need not add padding
		1297	* to align it with cacheline as padding after MI_BATCH_BUFFER_END is redundant.
		1298	*/
		1299	static int gen8_init_perctx_bb(struct intel_engine_cs *ring,
		1300	struct i915_wa_ctx_bb *wa_ctx,
		1301	uint32_t *const batch,
		1302	uint32_t *offset)
		1303	{
		1304	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
		1305
		1306	/* WaDisableCtxRestoreArbitration:bdw,chv */
		1307	wa_ctx_emit(batch, index, MI_ARB_ON_OFF \| MI_ARB_ENABLE);
		1308
		1309	wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END);
		1310
		1311	return wa_ctx_end(wa_ctx, *offset = index, 1);
5354	serge	1312	}
		1313
6084	serge	1314	static int gen9_init_indirectctx_bb(struct intel_engine_cs *ring,
		1315	struct i915_wa_ctx_bb *wa_ctx,
		1316	uint32_t *const batch,
		1317	uint32_t *offset)
5354	serge	1318	{
6084	serge	1319	int ret;
		1320	struct drm_device *dev = ring->dev;
		1321	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
5354	serge	1322
6084	serge	1323	/* WaDisableCtxRestoreArbitration:skl,bxt */
6937	serge	1324	if (IS_SKL_REVID(dev, 0, SKL_REVID_D0) \|\|
		1325	IS_BXT_REVID(dev, 0, BXT_REVID_A1))
6084	serge	1326	wa_ctx_emit(batch, index, MI_ARB_ON_OFF \| MI_ARB_DISABLE);
5354	serge	1327
6084	serge	1328	/* WaFlushCoherentL3CacheLinesAtContextSwitch:skl,bxt */
		1329	ret = gen8_emit_flush_coherentl3_wa(ring, batch, index);
		1330	if (ret < 0)
		1331	return ret;
		1332	index = ret;
5354	serge	1333
6084	serge	1334	/* Pad to end of cacheline */
		1335	while (index % CACHELINE_DWORDS)
		1336	wa_ctx_emit(batch, index, MI_NOOP);
5354	serge	1337
6084	serge	1338	return wa_ctx_end(wa_ctx, *offset = index, CACHELINE_DWORDS);
5354	serge	1339	}
		1340
6084	serge	1341	static int gen9_init_perctx_bb(struct intel_engine_cs *ring,
		1342	struct i915_wa_ctx_bb *wa_ctx,
		1343	uint32_t *const batch,
		1344	uint32_t *offset)
5354	serge	1345	{
6084	serge	1346	struct drm_device *dev = ring->dev;
		1347	uint32_t index = wa_ctx_start(wa_ctx, *offset, CACHELINE_DWORDS);
5354	serge	1348
6084	serge	1349	/* WaSetDisablePixMaskCammingAndRhwoInCommonSliceChicken:skl,bxt */
6937	serge	1350	if (IS_SKL_REVID(dev, 0, SKL_REVID_B0) \|\|
		1351	IS_BXT_REVID(dev, 0, BXT_REVID_A1)) {
6084	serge	1352	wa_ctx_emit(batch, index, MI_LOAD_REGISTER_IMM(1));
6937	serge	1353	wa_ctx_emit_reg(batch, index, GEN9_SLICE_COMMON_ECO_CHICKEN0);
6084	serge	1354	wa_ctx_emit(batch, index,
		1355	_MASKED_BIT_ENABLE(DISABLE_PIXEL_MASK_CAMMING));
		1356	wa_ctx_emit(batch, index, MI_NOOP);
5354	serge	1357	}
		1358
6084	serge	1359	/* WaDisableCtxRestoreArbitration:skl,bxt */
6937	serge	1360	if (IS_SKL_REVID(dev, 0, SKL_REVID_D0) \|\|
		1361	IS_BXT_REVID(dev, 0, BXT_REVID_A1))
6084	serge	1362	wa_ctx_emit(batch, index, MI_ARB_ON_OFF \| MI_ARB_ENABLE);
5354	serge	1363
6084	serge	1364	wa_ctx_emit(batch, index, MI_BATCH_BUFFER_END);
		1365
		1366	return wa_ctx_end(wa_ctx, *offset = index, 1);
5354	serge	1367	}
		1368
6084	serge	1369	static int lrc_setup_wa_ctx_obj(struct intel_engine_cs *ring, u32 size)
5354	serge	1370	{
		1371	int ret;
		1372
6084	serge	1373	ring->wa_ctx.obj = i915_gem_alloc_object(ring->dev, PAGE_ALIGN(size));
		1374	if (!ring->wa_ctx.obj) {
		1375	DRM_DEBUG_DRIVER("alloc LRC WA ctx backing obj failed.\n");
		1376	return -ENOMEM;
		1377	}
5354	serge	1378
6084	serge	1379	ret = i915_gem_obj_ggtt_pin(ring->wa_ctx.obj, PAGE_SIZE, 0);
		1380	if (ret) {
		1381	DRM_DEBUG_DRIVER("pin LRC WA ctx backing obj failed: %d\n",
		1382	ret);
		1383	drm_gem_object_unreference(&ring->wa_ctx.obj->base);
5354	serge	1384	return ret;
6084	serge	1385	}
5354	serge	1386
		1387	return 0;
		1388	}
		1389
6084	serge	1390	static void lrc_destroy_wa_ctx_obj(struct intel_engine_cs *ring)
5354	serge	1391	{
6084	serge	1392	if (ring->wa_ctx.obj) {
		1393	i915_gem_object_ggtt_unpin(ring->wa_ctx.obj);
		1394	drm_gem_object_unreference(&ring->wa_ctx.obj->base);
		1395	ring->wa_ctx.obj = NULL;
		1396	}
		1397	}
5354	serge	1398
6084	serge	1399	static int intel_init_workaround_bb(struct intel_engine_cs *ring)
		1400	{
		1401	int ret;
		1402	uint32_t *batch;
		1403	uint32_t offset;
		1404	struct page *page;
		1405	struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx;
		1406
		1407	WARN_ON(ring->id != RCS);
		1408
		1409	/* update this when WA for higher Gen are added */
		1410	if (INTEL_INFO(ring->dev)->gen > 9) {
		1411	DRM_ERROR("WA batch buffer is not initialized for Gen%d\n",
		1412	INTEL_INFO(ring->dev)->gen);
5354	serge	1413	return 0;
6084	serge	1414	}
5354	serge	1415
6084	serge	1416	/* some WA perform writes to scratch page, ensure it is valid */
		1417	if (ring->scratch.obj == NULL) {
		1418	DRM_ERROR("scratch page not allocated for %s\n", ring->name);
		1419	return -EINVAL;
		1420	}
5354	serge	1421
6084	serge	1422	ret = lrc_setup_wa_ctx_obj(ring, PAGE_SIZE);
		1423	if (ret) {
		1424	DRM_DEBUG_DRIVER("Failed to setup context WA page: %d\n", ret);
5354	serge	1425	return ret;
6084	serge	1426	}
5354	serge	1427
6937	serge	1428	page = i915_gem_object_get_dirty_page(wa_ctx->obj, 0);
6084	serge	1429	batch = kmap_atomic(page);
		1430	offset = 0;
		1431
		1432	if (INTEL_INFO(ring->dev)->gen == 8) {
		1433	ret = gen8_init_indirectctx_bb(ring,
		1434	&wa_ctx->indirect_ctx,
		1435	batch,
		1436	&offset);
		1437	if (ret)
		1438	goto out;
		1439
		1440	ret = gen8_init_perctx_bb(ring,
		1441	&wa_ctx->per_ctx,
		1442	batch,
		1443	&offset);
		1444	if (ret)
		1445	goto out;
		1446	} else if (INTEL_INFO(ring->dev)->gen == 9) {
		1447	ret = gen9_init_indirectctx_bb(ring,
		1448	&wa_ctx->indirect_ctx,
		1449	batch,
		1450	&offset);
		1451	if (ret)
		1452	goto out;
		1453
		1454	ret = gen9_init_perctx_bb(ring,
		1455	&wa_ctx->per_ctx,
		1456	batch,
		1457	&offset);
		1458	if (ret)
		1459	goto out;
5354	serge	1460	}
		1461
6084	serge	1462	out:
		1463	kunmap_atomic(batch);
5354	serge	1464	if (ret)
6084	serge	1465	lrc_destroy_wa_ctx_obj(ring);
5354	serge	1466
6084	serge	1467	return ret;
5354	serge	1468	}
		1469
		1470	static int gen8_init_common_ring(struct intel_engine_cs *ring)
		1471	{
		1472	struct drm_device *dev = ring->dev;
		1473	struct drm_i915_private *dev_priv = dev->dev_private;
6084	serge	1474	u8 next_context_status_buffer_hw;
5354	serge	1475
6084	serge	1476	lrc_setup_hardware_status_page(ring,
		1477	ring->default_context->engine[ring->id].state);
		1478
5354	serge	1479	I915_WRITE_IMR(ring, ~(ring->irq_enable_mask \| ring->irq_keep_mask));
		1480	I915_WRITE(RING_HWSTAM(ring->mmio_base), 0xffffffff);
		1481
		1482	I915_WRITE(RING_MODE_GEN7(ring),
		1483	_MASKED_BIT_DISABLE(GFX_REPLAY_MODE) \|
		1484	_MASKED_BIT_ENABLE(GFX_RUN_LIST_ENABLE));
		1485	POSTING_READ(RING_MODE_GEN7(ring));
6084	serge	1486
		1487	/*
		1488	* Instead of resetting the Context Status Buffer (CSB) read pointer to
		1489	* zero, we need to read the write pointer from hardware and use its
		1490	* value because "this register is power context save restored".
		1491	* Effectively, these states have been observed:
		1492	*
		1493	* \| Suspend-to-idle (freeze) \| Suspend-to-RAM (mem) \|
		1494	* BDW \| CSB regs not reset \| CSB regs reset \|
		1495	* CHT \| CSB regs not reset \| CSB regs not reset \|
		1496	*/
		1497	next_context_status_buffer_hw = (I915_READ(RING_CONTEXT_STATUS_PTR(ring))
		1498	& GEN8_CSB_PTR_MASK);
		1499
		1500	/*
		1501	* When the CSB registers are reset (also after power-up / gpu reset),
		1502	* CSB write pointer is set to all 1's, which is not valid, use '5' in
		1503	* this special case, so the first element read is CSB[0].
		1504	*/
		1505	if (next_context_status_buffer_hw == GEN8_CSB_PTR_MASK)
		1506	next_context_status_buffer_hw = (GEN8_CSB_ENTRIES - 1);
		1507
		1508	ring->next_context_status_buffer = next_context_status_buffer_hw;
5354	serge	1509	DRM_DEBUG_DRIVER("Execlists enabled for %s\n", ring->name);
		1510
		1511	memset(&ring->hangcheck, 0, sizeof(ring->hangcheck));
		1512
		1513	return 0;
		1514	}
		1515
		1516	static int gen8_init_render_ring(struct intel_engine_cs *ring)
		1517	{
		1518	struct drm_device *dev = ring->dev;
		1519	struct drm_i915_private *dev_priv = dev->dev_private;
		1520	int ret;
		1521
		1522	ret = gen8_init_common_ring(ring);
		1523	if (ret)
		1524	return ret;
		1525
		1526	/* We need to disable the AsyncFlip performance optimisations in order
		1527	* to use MI_WAIT_FOR_EVENT within the CS. It should already be
		1528	* programmed to '1' on all products.
		1529	*
		1530	* WaDisableAsyncFlipPerfMode:snb,ivb,hsw,vlv,bdw,chv
		1531	*/
		1532	I915_WRITE(MI_MODE, _MASKED_BIT_ENABLE(ASYNC_FLIP_PERF_DISABLE));
		1533
6084	serge	1534	I915_WRITE(INSTPM, _MASKED_BIT_ENABLE(INSTPM_FORCE_ORDERING));
		1535
		1536	return init_workarounds_ring(ring);
		1537	}
		1538
		1539	static int gen9_init_render_ring(struct intel_engine_cs *ring)
		1540	{
		1541	int ret;
		1542
		1543	ret = gen8_init_common_ring(ring);
5354	serge	1544	if (ret)
		1545	return ret;
		1546
		1547	return init_workarounds_ring(ring);
		1548	}
		1549
6084	serge	1550	static int intel_logical_ring_emit_pdps(struct drm_i915_gem_request *req)
5354	serge	1551	{
6084	serge	1552	struct i915_hw_ppgtt *ppgtt = req->ctx->ppgtt;
		1553	struct intel_engine_cs *ring = req->ring;
		1554	struct intel_ringbuffer *ringbuf = req->ringbuf;
		1555	const int num_lri_cmds = GEN8_LEGACY_PDPES * 2;
		1556	int i, ret;
		1557
		1558	ret = intel_logical_ring_begin(req, num_lri_cmds * 2 + 2);
		1559	if (ret)
		1560	return ret;
		1561
		1562	intel_logical_ring_emit(ringbuf, MI_LOAD_REGISTER_IMM(num_lri_cmds));
		1563	for (i = GEN8_LEGACY_PDPES - 1; i >= 0; i--) {
		1564	const dma_addr_t pd_daddr = i915_page_dir_dma_addr(ppgtt, i);
		1565
6937	serge	1566	intel_logical_ring_emit_reg(ringbuf, GEN8_RING_PDP_UDW(ring, i));
6084	serge	1567	intel_logical_ring_emit(ringbuf, upper_32_bits(pd_daddr));
6937	serge	1568	intel_logical_ring_emit_reg(ringbuf, GEN8_RING_PDP_LDW(ring, i));
6084	serge	1569	intel_logical_ring_emit(ringbuf, lower_32_bits(pd_daddr));
		1570	}
		1571
		1572	intel_logical_ring_emit(ringbuf, MI_NOOP);
		1573	intel_logical_ring_advance(ringbuf);
		1574
		1575	return 0;
		1576	}
		1577
		1578	static int gen8_emit_bb_start(struct drm_i915_gem_request *req,
		1579	u64 offset, unsigned dispatch_flags)
		1580	{
		1581	struct intel_ringbuffer *ringbuf = req->ringbuf;
		1582	bool ppgtt = !(dispatch_flags & I915_DISPATCH_SECURE);
5354	serge	1583	int ret;
		1584
6084	serge	1585	/* Don't rely in hw updating PDPs, specially in lite-restore.
		1586	* Ideally, we should set Force PD Restore in ctx descriptor,
		1587	* but we can't. Force Restore would be a second option, but
		1588	* it is unsafe in case of lite-restore (because the ctx is
		1589	* not idle). PML4 is allocated during ppgtt init so this is
		1590	* not needed in 48-bit.*/
		1591	if (req->ctx->ppgtt &&
		1592	(intel_ring_flag(req->ring) & req->ctx->ppgtt->pd_dirty_rings)) {
		1593	if (!USES_FULL_48BIT_PPGTT(req->i915) &&
		1594	!intel_vgpu_active(req->i915->dev)) {
		1595	ret = intel_logical_ring_emit_pdps(req);
		1596	if (ret)
		1597	return ret;
		1598	}
		1599
		1600	req->ctx->ppgtt->pd_dirty_rings &= ~intel_ring_flag(req->ring);
		1601	}
		1602
		1603	ret = intel_logical_ring_begin(req, 4);
5354	serge	1604	if (ret)
		1605	return ret;
		1606
		1607	/* FIXME(BDW): Address space and security selectors. */
6084	serge	1608	intel_logical_ring_emit(ringbuf, MI_BATCH_BUFFER_START_GEN8 \|
		1609	(ppgtt<<8) \|
		1610	(dispatch_flags & I915_DISPATCH_RS ?
		1611	MI_BATCH_RESOURCE_STREAMER : 0));
5354	serge	1612	intel_logical_ring_emit(ringbuf, lower_32_bits(offset));
		1613	intel_logical_ring_emit(ringbuf, upper_32_bits(offset));
		1614	intel_logical_ring_emit(ringbuf, MI_NOOP);
		1615	intel_logical_ring_advance(ringbuf);
		1616
		1617	return 0;
		1618	}
		1619
		1620	static bool gen8_logical_ring_get_irq(struct intel_engine_cs *ring)
		1621	{
		1622	struct drm_device *dev = ring->dev;
		1623	struct drm_i915_private *dev_priv = dev->dev_private;
		1624	unsigned long flags;
		1625
		1626	if (WARN_ON(!intel_irqs_enabled(dev_priv)))
		1627	return false;
		1628
		1629	spin_lock_irqsave(&dev_priv->irq_lock, flags);
		1630	if (ring->irq_refcount++ == 0) {
		1631	I915_WRITE_IMR(ring, ~(ring->irq_enable_mask \| ring->irq_keep_mask));
		1632	POSTING_READ(RING_IMR(ring->mmio_base));
		1633	}
		1634	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
		1635
		1636	return true;
		1637	}
		1638
		1639	static void gen8_logical_ring_put_irq(struct intel_engine_cs *ring)
		1640	{
		1641	struct drm_device *dev = ring->dev;
		1642	struct drm_i915_private *dev_priv = dev->dev_private;
		1643	unsigned long flags;
		1644
		1645	spin_lock_irqsave(&dev_priv->irq_lock, flags);
		1646	if (--ring->irq_refcount == 0) {
		1647	I915_WRITE_IMR(ring, ~ring->irq_keep_mask);
		1648	POSTING_READ(RING_IMR(ring->mmio_base));
		1649	}
		1650	spin_unlock_irqrestore(&dev_priv->irq_lock, flags);
		1651	}
		1652
6084	serge	1653	static int gen8_emit_flush(struct drm_i915_gem_request *request,
5354	serge	1654	u32 invalidate_domains,
		1655	u32 unused)
		1656	{
6084	serge	1657	struct intel_ringbuffer *ringbuf = request->ringbuf;
5354	serge	1658	struct intel_engine_cs *ring = ringbuf->ring;
		1659	struct drm_device *dev = ring->dev;
		1660	struct drm_i915_private *dev_priv = dev->dev_private;
		1661	uint32_t cmd;
		1662	int ret;
		1663
6084	serge	1664	ret = intel_logical_ring_begin(request, 4);
5354	serge	1665	if (ret)
		1666	return ret;
		1667
		1668	cmd = MI_FLUSH_DW + 1;
		1669
6084	serge	1670	/* We always require a command barrier so that subsequent
		1671	* commands, such as breadcrumb interrupts, are strictly ordered
		1672	* wrt the contents of the write cache being flushed to memory
		1673	* (and thus being coherent from the CPU).
		1674	*/
		1675	cmd \|= MI_FLUSH_DW_STORE_INDEX \| MI_FLUSH_DW_OP_STOREDW;
		1676
		1677	if (invalidate_domains & I915_GEM_GPU_DOMAINS) {
		1678	cmd \|= MI_INVALIDATE_TLB;
		1679	if (ring == &dev_priv->ring[VCS])
		1680	cmd \|= MI_INVALIDATE_BSD;
5354	serge	1681	}
		1682
		1683	intel_logical_ring_emit(ringbuf, cmd);
		1684	intel_logical_ring_emit(ringbuf,
		1685	I915_GEM_HWS_SCRATCH_ADDR \|
		1686	MI_FLUSH_DW_USE_GTT);
		1687	intel_logical_ring_emit(ringbuf, 0); /* upper addr */
		1688	intel_logical_ring_emit(ringbuf, 0); /* value */
		1689	intel_logical_ring_advance(ringbuf);
		1690
		1691	return 0;
		1692	}
		1693
6084	serge	1694	static int gen8_emit_flush_render(struct drm_i915_gem_request *request,
5354	serge	1695	u32 invalidate_domains,
		1696	u32 flush_domains)
		1697	{
6084	serge	1698	struct intel_ringbuffer *ringbuf = request->ringbuf;
5354	serge	1699	struct intel_engine_cs *ring = ringbuf->ring;
		1700	u32 scratch_addr = ring->scratch.gtt_offset + 2 * CACHELINE_BYTES;
6084	serge	1701	bool vf_flush_wa;
5354	serge	1702	u32 flags = 0;
		1703	int ret;
		1704
		1705	flags \|= PIPE_CONTROL_CS_STALL;
		1706
		1707	if (flush_domains) {
		1708	flags \|= PIPE_CONTROL_RENDER_TARGET_CACHE_FLUSH;
		1709	flags \|= PIPE_CONTROL_DEPTH_CACHE_FLUSH;
6320	serge	1710	flags \|= PIPE_CONTROL_DC_FLUSH_ENABLE;
6084	serge	1711	flags \|= PIPE_CONTROL_FLUSH_ENABLE;
5354	serge	1712	}
		1713
		1714	if (invalidate_domains) {
		1715	flags \|= PIPE_CONTROL_TLB_INVALIDATE;
		1716	flags \|= PIPE_CONTROL_INSTRUCTION_CACHE_INVALIDATE;
		1717	flags \|= PIPE_CONTROL_TEXTURE_CACHE_INVALIDATE;
		1718	flags \|= PIPE_CONTROL_VF_CACHE_INVALIDATE;
		1719	flags \|= PIPE_CONTROL_CONST_CACHE_INVALIDATE;
		1720	flags \|= PIPE_CONTROL_STATE_CACHE_INVALIDATE;
		1721	flags \|= PIPE_CONTROL_QW_WRITE;
		1722	flags \|= PIPE_CONTROL_GLOBAL_GTT_IVB;
		1723	}
		1724
6084	serge	1725	/*
		1726	* On GEN9+ Before VF_CACHE_INVALIDATE we need to emit a NULL pipe
		1727	* control.
		1728	*/
		1729	vf_flush_wa = INTEL_INFO(ring->dev)->gen >= 9 &&
		1730	flags & PIPE_CONTROL_VF_CACHE_INVALIDATE;
		1731
		1732	ret = intel_logical_ring_begin(request, vf_flush_wa ? 12 : 6);
5354	serge	1733	if (ret)
		1734	return ret;
		1735
6084	serge	1736	if (vf_flush_wa) {
		1737	intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6));
		1738	intel_logical_ring_emit(ringbuf, 0);
		1739	intel_logical_ring_emit(ringbuf, 0);
		1740	intel_logical_ring_emit(ringbuf, 0);
		1741	intel_logical_ring_emit(ringbuf, 0);
		1742	intel_logical_ring_emit(ringbuf, 0);
		1743	}
		1744
5354	serge	1745	intel_logical_ring_emit(ringbuf, GFX_OP_PIPE_CONTROL(6));
		1746	intel_logical_ring_emit(ringbuf, flags);
		1747	intel_logical_ring_emit(ringbuf, scratch_addr);
		1748	intel_logical_ring_emit(ringbuf, 0);
		1749	intel_logical_ring_emit(ringbuf, 0);
		1750	intel_logical_ring_emit(ringbuf, 0);
		1751	intel_logical_ring_advance(ringbuf);
		1752
		1753	return 0;
		1754	}
		1755
		1756	static u32 gen8_get_seqno(struct intel_engine_cs *ring, bool lazy_coherency)
		1757	{
		1758	return intel_read_status_page(ring, I915_GEM_HWS_INDEX);
		1759	}
		1760
		1761	static void gen8_set_seqno(struct intel_engine_cs *ring, u32 seqno)
		1762	{
		1763	intel_write_status_page(ring, I915_GEM_HWS_INDEX, seqno);
		1764	}
		1765
6084	serge	1766	static u32 bxt_a_get_seqno(struct intel_engine_cs *ring, bool lazy_coherency)
5354	serge	1767	{
6084	serge	1768
		1769	/*
		1770	* On BXT A steppings there is a HW coherency issue whereby the
		1771	* MI_STORE_DATA_IMM storing the completed request's seqno
		1772	* occasionally doesn't invalidate the CPU cache. Work around this by
		1773	* clflushing the corresponding cacheline whenever the caller wants
		1774	* the coherency to be guaranteed. Note that this cacheline is known
		1775	* to be clean at this point, since we only write it in
		1776	* bxt_a_set_seqno(), where we also do a clflush after the write. So
		1777	* this clflush in practice becomes an invalidate operation.
		1778	*/
		1779
		1780	if (!lazy_coherency)
		1781	intel_flush_status_page(ring, I915_GEM_HWS_INDEX);
		1782
		1783	return intel_read_status_page(ring, I915_GEM_HWS_INDEX);
		1784	}
		1785
		1786	static void bxt_a_set_seqno(struct intel_engine_cs *ring, u32 seqno)
		1787	{
		1788	intel_write_status_page(ring, I915_GEM_HWS_INDEX, seqno);
		1789
		1790	/* See bxt_a_get_seqno() explaining the reason for the clflush. */
		1791	intel_flush_status_page(ring, I915_GEM_HWS_INDEX);
		1792	}
		1793
		1794	static int gen8_emit_request(struct drm_i915_gem_request *request)
		1795	{
		1796	struct intel_ringbuffer *ringbuf = request->ringbuf;
5354	serge	1797	struct intel_engine_cs *ring = ringbuf->ring;
		1798	u32 cmd;
		1799	int ret;
		1800
6084	serge	1801	/*
		1802	* Reserve space for 2 NOOPs at the end of each request to be
		1803	* used as a workaround for not being allowed to do lite
		1804	* restore with HEAD==TAIL (WaIdleLiteRestore).
		1805	*/
		1806	ret = intel_logical_ring_begin(request, 8);
5354	serge	1807	if (ret)
		1808	return ret;
		1809
6084	serge	1810	cmd = MI_STORE_DWORD_IMM_GEN4;
5354	serge	1811	cmd \|= MI_GLOBAL_GTT;
		1812
		1813	intel_logical_ring_emit(ringbuf, cmd);
		1814	intel_logical_ring_emit(ringbuf,
		1815	(ring->status_page.gfx_addr +
		1816	(I915_GEM_HWS_INDEX << MI_STORE_DWORD_INDEX_SHIFT)));
		1817	intel_logical_ring_emit(ringbuf, 0);
6084	serge	1818	intel_logical_ring_emit(ringbuf, i915_gem_request_get_seqno(request));
5354	serge	1819	intel_logical_ring_emit(ringbuf, MI_USER_INTERRUPT);
		1820	intel_logical_ring_emit(ringbuf, MI_NOOP);
6084	serge	1821	intel_logical_ring_advance_and_submit(request);
5354	serge	1822
6084	serge	1823	/*
		1824	* Here we add two extra NOOPs as padding to avoid
		1825	* lite restore of a context with HEAD==TAIL.
		1826	*/
		1827	intel_logical_ring_emit(ringbuf, MI_NOOP);
		1828	intel_logical_ring_emit(ringbuf, MI_NOOP);
		1829	intel_logical_ring_advance(ringbuf);
		1830
5354	serge	1831	return 0;
		1832	}
		1833
6084	serge	1834	static int intel_lr_context_render_state_init(struct drm_i915_gem_request *req)
		1835	{
		1836	struct render_state so;
		1837	int ret;
		1838
		1839	ret = i915_gem_render_state_prepare(req->ring, &so);
		1840	if (ret)
		1841	return ret;
		1842
		1843	if (so.rodata == NULL)
		1844	return 0;
		1845
		1846	ret = req->ring->emit_bb_start(req, so.ggtt_offset,
		1847	I915_DISPATCH_SECURE);
		1848	if (ret)
		1849	goto out;
		1850
		1851	ret = req->ring->emit_bb_start(req,
		1852	(so.ggtt_offset + so.aux_batch_offset),
		1853	I915_DISPATCH_SECURE);
		1854	if (ret)
		1855	goto out;
		1856
		1857	i915_vma_move_to_active(i915_gem_obj_to_ggtt(so.obj), req);
		1858
		1859	out:
		1860	i915_gem_render_state_fini(&so);
		1861	return ret;
		1862	}
		1863
		/**
		 * gen8_init_rcs_context() - per-context initialisation for the render engine
		 * @req: request used to emit the initialisation commands.
		 *
		 * Emits the workarounds (a failure here is returned), programs the MOCS
		 * (a failure is only logged) and finally runs the render-state init.
		 *
		 * Return: 0 on success or a negative error code.
		 */
		static int gen8_init_rcs_context(struct drm_i915_gem_request *req)
		{
			int err;

			err = intel_logical_ring_workarounds_emit(req);
			if (err)
				return err;

			/*
			 * Failing to program the MOCS is non-fatal. The system will not
			 * run at peak performance. So generate an error and carry on.
			 */
			if (intel_rcs_context_init_mocs(req))
				DRM_ERROR("MOCS failed to program: expect performance issues.\n");

			return intel_lr_context_render_state_init(req);
		}
		1882
5354	serge	1883	/**
		1884	* intel_logical_ring_cleanup() - deallocate the Engine Command Streamer
		1885	*
		1886	* @ring: Engine Command Streamer.
		1887	*
		1888	*/
		1889	void intel_logical_ring_cleanup(struct intel_engine_cs *ring)
		1890	{
		1891	struct drm_i915_private *dev_priv;
		1892
		1893	if (!intel_ring_initialized(ring))
		1894	return;
		1895
		1896	dev_priv = ring->dev->dev_private;
		1897
6937	serge	1898	if (ring->buffer) {
5354	serge	1899	intel_logical_ring_stop(ring);
		1900	WARN_ON((I915_READ_MODE(ring) & MODE_IDLE) == 0);
6937	serge	1901	}
5354	serge	1902
		1903	if (ring->cleanup)
		1904	ring->cleanup(ring);
		1905
		1906	i915_cmd_parser_fini_ring(ring);
6084	serge	1907	i915_gem_batch_pool_fini(&ring->batch_pool);
5354	serge	1908
		1909	if (ring->status_page.obj) {
		1910	kunmap(sg_page(ring->status_page.obj->pages->sgl));
		1911	ring->status_page.obj = NULL;
		1912	}
6084	serge	1913
		1914	lrc_destroy_wa_ctx_obj(ring);
6937	serge	1915	ring->dev = NULL;
5354	serge	1916	}
		1917
		1918	static int logical_ring_init(struct drm_device dev, struct intel_engine_cs ring)
		1919	{
		1920	int ret;
		1921
		1922	/* Intentionally left blank. */
		1923	ring->buffer = NULL;
		1924
		1925	ring->dev = dev;
		1926	INIT_LIST_HEAD(&ring->active_list);
		1927	INIT_LIST_HEAD(&ring->request_list);
6084	serge	1928	i915_gem_batch_pool_init(dev, &ring->batch_pool);
5354	serge	1929	init_waitqueue_head(&ring->irq_queue);
		1930
6937	serge	1931	INIT_LIST_HEAD(&ring->buffers);
5354	serge	1932	INIT_LIST_HEAD(&ring->execlist_queue);
		1933	INIT_LIST_HEAD(&ring->execlist_retired_req_list);
		1934	spin_lock_init(&ring->execlist_lock);
		1935
		1936	ret = i915_cmd_parser_init_ring(ring);
		1937	if (ret)
6937	serge	1938	goto error;
5354	serge	1939
6084	serge	1940	ret = intel_lr_context_deferred_alloc(ring->default_context, ring);
		1941	if (ret)
6937	serge	1942	goto error;
6084	serge	1943
		1944	/* As this is the default context, always pin it */
		1945	ret = intel_lr_context_do_pin(
		1946	ring,
		1947	ring->default_context->engine[ring->id].state,
		1948	ring->default_context->engine[ring->id].ringbuf);
		1949	if (ret) {
		1950	DRM_ERROR(
		1951	"Failed to pin and map ringbuffer %s: %d\n",
		1952	ring->name, ret);
6937	serge	1953	goto error;
5354	serge	1954	}
		1955
6937	serge	1956	return 0;
		1957
		1958	error:
		1959	intel_logical_ring_cleanup(ring);
5354	serge	1960	return ret;
		1961	}
		1962
		1963	static int logical_render_ring_init(struct drm_device *dev)
		1964	{
		1965	struct drm_i915_private *dev_priv = dev->dev_private;
		1966	struct intel_engine_cs *ring = &dev_priv->ring[RCS];
6084	serge	1967	int ret;
5354	serge	1968
		1969	ring->name = "render ring";
		1970	ring->id = RCS;
		1971	ring->mmio_base = RENDER_RING_BASE;
		1972	ring->irq_enable_mask =
		1973	GT_RENDER_USER_INTERRUPT << GEN8_RCS_IRQ_SHIFT;
		1974	ring->irq_keep_mask =
		1975	GT_CONTEXT_SWITCH_INTERRUPT << GEN8_RCS_IRQ_SHIFT;
		1976	if (HAS_L3_DPF(dev))
		1977	ring->irq_keep_mask \|= GT_RENDER_L3_PARITY_ERROR_INTERRUPT;
		1978
6084	serge	1979	if (INTEL_INFO(dev)->gen >= 9)
		1980	ring->init_hw = gen9_init_render_ring;
		1981	else
		1982	ring->init_hw = gen8_init_render_ring;
		1983	ring->init_context = gen8_init_rcs_context;
5354	serge	1984	ring->cleanup = intel_fini_pipe_control;
6937	serge	1985	if (IS_BXT_REVID(dev, 0, BXT_REVID_A1)) {
6084	serge	1986	ring->get_seqno = bxt_a_get_seqno;
		1987	ring->set_seqno = bxt_a_set_seqno;
		1988	} else {
		1989	ring->get_seqno = gen8_get_seqno;
		1990	ring->set_seqno = gen8_set_seqno;
		1991	}
5354	serge	1992	ring->emit_request = gen8_emit_request;
		1993	ring->emit_flush = gen8_emit_flush_render;
		1994	ring->irq_get = gen8_logical_ring_get_irq;
		1995	ring->irq_put = gen8_logical_ring_put_irq;
		1996	ring->emit_bb_start = gen8_emit_bb_start;
		1997
6084	serge	1998	ring->dev = dev;
		1999
		2000	ret = intel_init_pipe_control(ring);
		2001	if (ret)
		2002	return ret;
		2003
		2004	ret = intel_init_workaround_bb(ring);
		2005	if (ret) {
		2006	/*
		2007	* We continue even if we fail to initialize WA batch
		2008	* because we only expect rare glitches but nothing
		2009	* critical to prevent us from using GPU
		2010	*/
		2011	DRM_ERROR("WA batch buffer initialization failed: %d\n",
		2012	ret);
		2013	}
		2014
		2015	ret = logical_ring_init(dev, ring);
		2016	if (ret) {
		2017	lrc_destroy_wa_ctx_obj(ring);
		2018	}
		2019
		2020	return ret;
5354	serge	2021	}
		2022
		2023	static int logical_bsd_ring_init(struct drm_device *dev)
		2024	{
		2025	struct drm_i915_private *dev_priv = dev->dev_private;
		2026	struct intel_engine_cs *ring = &dev_priv->ring[VCS];
		2027
		2028	ring->name = "bsd ring";
		2029	ring->id = VCS;
		2030	ring->mmio_base = GEN6_BSD_RING_BASE;
		2031	ring->irq_enable_mask =
		2032	GT_RENDER_USER_INTERRUPT << GEN8_VCS1_IRQ_SHIFT;
		2033	ring->irq_keep_mask =
		2034	GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS1_IRQ_SHIFT;
		2035
6084	serge	2036	ring->init_hw = gen8_init_common_ring;
6937	serge	2037	if (IS_BXT_REVID(dev, 0, BXT_REVID_A1)) {
6084	serge	2038	ring->get_seqno = bxt_a_get_seqno;
		2039	ring->set_seqno = bxt_a_set_seqno;
		2040	} else {
		2041	ring->get_seqno = gen8_get_seqno;
		2042	ring->set_seqno = gen8_set_seqno;
		2043	}
5354	serge	2044	ring->emit_request = gen8_emit_request;
		2045	ring->emit_flush = gen8_emit_flush;
		2046	ring->irq_get = gen8_logical_ring_get_irq;
		2047	ring->irq_put = gen8_logical_ring_put_irq;
		2048	ring->emit_bb_start = gen8_emit_bb_start;
		2049
		2050	return logical_ring_init(dev, ring);
		2051	}
		2052
		2053	static int logical_bsd2_ring_init(struct drm_device *dev)
		2054	{
		2055	struct drm_i915_private *dev_priv = dev->dev_private;
		2056	struct intel_engine_cs *ring = &dev_priv->ring[VCS2];
		2057
		2058	ring->name = "bds2 ring";
		2059	ring->id = VCS2;
		2060	ring->mmio_base = GEN8_BSD2_RING_BASE;
		2061	ring->irq_enable_mask =
		2062	GT_RENDER_USER_INTERRUPT << GEN8_VCS2_IRQ_SHIFT;
		2063	ring->irq_keep_mask =
		2064	GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VCS2_IRQ_SHIFT;
		2065
6084	serge	2066	ring->init_hw = gen8_init_common_ring;
5354	serge	2067	ring->get_seqno = gen8_get_seqno;
		2068	ring->set_seqno = gen8_set_seqno;
		2069	ring->emit_request = gen8_emit_request;
		2070	ring->emit_flush = gen8_emit_flush;
		2071	ring->irq_get = gen8_logical_ring_get_irq;
		2072	ring->irq_put = gen8_logical_ring_put_irq;
		2073	ring->emit_bb_start = gen8_emit_bb_start;
		2074
		2075	return logical_ring_init(dev, ring);
		2076	}
		2077
		2078	static int logical_blt_ring_init(struct drm_device *dev)
		2079	{
		2080	struct drm_i915_private *dev_priv = dev->dev_private;
		2081	struct intel_engine_cs *ring = &dev_priv->ring[BCS];
		2082
		2083	ring->name = "blitter ring";
		2084	ring->id = BCS;
		2085	ring->mmio_base = BLT_RING_BASE;
		2086	ring->irq_enable_mask =
		2087	GT_RENDER_USER_INTERRUPT << GEN8_BCS_IRQ_SHIFT;
		2088	ring->irq_keep_mask =
		2089	GT_CONTEXT_SWITCH_INTERRUPT << GEN8_BCS_IRQ_SHIFT;
		2090
6084	serge	2091	ring->init_hw = gen8_init_common_ring;
6937	serge	2092	if (IS_BXT_REVID(dev, 0, BXT_REVID_A1)) {
6084	serge	2093	ring->get_seqno = bxt_a_get_seqno;
		2094	ring->set_seqno = bxt_a_set_seqno;
		2095	} else {
		2096	ring->get_seqno = gen8_get_seqno;
		2097	ring->set_seqno = gen8_set_seqno;
		2098	}
5354	serge	2099	ring->emit_request = gen8_emit_request;
		2100	ring->emit_flush = gen8_emit_flush;
		2101	ring->irq_get = gen8_logical_ring_get_irq;
		2102	ring->irq_put = gen8_logical_ring_put_irq;
		2103	ring->emit_bb_start = gen8_emit_bb_start;
		2104
		2105	return logical_ring_init(dev, ring);
		2106	}
		2107
		2108	static int logical_vebox_ring_init(struct drm_device *dev)
		2109	{
		2110	struct drm_i915_private *dev_priv = dev->dev_private;
		2111	struct intel_engine_cs *ring = &dev_priv->ring[VECS];
		2112
		2113	ring->name = "video enhancement ring";
		2114	ring->id = VECS;
		2115	ring->mmio_base = VEBOX_RING_BASE;
		2116	ring->irq_enable_mask =
		2117	GT_RENDER_USER_INTERRUPT << GEN8_VECS_IRQ_SHIFT;
		2118	ring->irq_keep_mask =
		2119	GT_CONTEXT_SWITCH_INTERRUPT << GEN8_VECS_IRQ_SHIFT;
		2120
6084	serge	2121	ring->init_hw = gen8_init_common_ring;
6937	serge	2122	if (IS_BXT_REVID(dev, 0, BXT_REVID_A1)) {
6084	serge	2123	ring->get_seqno = bxt_a_get_seqno;
		2124	ring->set_seqno = bxt_a_set_seqno;
		2125	} else {
		2126	ring->get_seqno = gen8_get_seqno;
		2127	ring->set_seqno = gen8_set_seqno;
		2128	}
5354	serge	2129	ring->emit_request = gen8_emit_request;
		2130	ring->emit_flush = gen8_emit_flush;
		2131	ring->irq_get = gen8_logical_ring_get_irq;
		2132	ring->irq_put = gen8_logical_ring_put_irq;
		2133	ring->emit_bb_start = gen8_emit_bb_start;
		2134
		2135	return logical_ring_init(dev, ring);
		2136	}
		2137
		2138	/**
		2139	* intel_logical_rings_init() - allocate, populate and init the Engine Command Streamers
		2140	* @dev: DRM device.
		2141	*
		2142	* This function inits the engines for an Execlists submission style (the equivalent in the
		2143	* legacy ringbuffer submission world would be i915_gem_init_rings). It does it only for
		2144	* those engines that are present in the hardware.
		2145	*
		2146	* Return: non-zero if the initialization failed.
		2147	*/
		2148	int intel_logical_rings_init(struct drm_device *dev)
		2149	{
		2150	struct drm_i915_private *dev_priv = dev->dev_private;
		2151	int ret;
		2152
		2153	ret = logical_render_ring_init(dev);
		2154	if (ret)
		2155	return ret;
		2156
		2157	if (HAS_BSD(dev)) {
		2158	ret = logical_bsd_ring_init(dev);
		2159	if (ret)
		2160	goto cleanup_render_ring;
		2161	}
		2162
		2163	if (HAS_BLT(dev)) {
		2164	ret = logical_blt_ring_init(dev);
		2165	if (ret)
		2166	goto cleanup_bsd_ring;
		2167	}
		2168
		2169	if (HAS_VEBOX(dev)) {
		2170	ret = logical_vebox_ring_init(dev);
		2171	if (ret)
		2172	goto cleanup_blt_ring;
		2173	}
		2174
		2175	if (HAS_BSD2(dev)) {
		2176	ret = logical_bsd2_ring_init(dev);
		2177	if (ret)
		2178	goto cleanup_vebox_ring;
		2179	}
		2180
		2181	return 0;
		2182
		2183	cleanup_vebox_ring:
		2184	intel_logical_ring_cleanup(&dev_priv->ring[VECS]);
		2185	cleanup_blt_ring:
		2186	intel_logical_ring_cleanup(&dev_priv->ring[BCS]);
		2187	cleanup_bsd_ring:
		2188	intel_logical_ring_cleanup(&dev_priv->ring[VCS]);
		2189	cleanup_render_ring:
		2190	intel_logical_ring_cleanup(&dev_priv->ring[RCS]);
		2191
		2192	return ret;
		2193	}
		2194
6084	serge	2195	static u32
		2196	make_rpcs(struct drm_device *dev)
5354	serge	2197	{
6084	serge	2198	u32 rpcs = 0;
5354	serge	2199
6084	serge	2200	/*
		2201	* No explicit RPCS request is needed to ensure full
		2202	* slice/subslice/EU enablement prior to Gen9.
		2203	*/
		2204	if (INTEL_INFO(dev)->gen < 9)
5354	serge	2205	return 0;
		2206
6084	serge	2207	/*
		2208	* Starting in Gen9, render power gating can leave
		2209	* slice/subslice/EU in a partially enabled state. We
		2210	* must make an explicit request through RPCS for full
		2211	* enablement.
		2212	*/
		2213	if (INTEL_INFO(dev)->has_slice_pg) {
		2214	rpcs \|= GEN8_RPCS_S_CNT_ENABLE;
		2215	rpcs \|= INTEL_INFO(dev)->slice_total <<
		2216	GEN8_RPCS_S_CNT_SHIFT;
		2217	rpcs \|= GEN8_RPCS_ENABLE;
		2218	}
5354	serge	2219
6084	serge	2220	if (INTEL_INFO(dev)->has_subslice_pg) {
		2221	rpcs \|= GEN8_RPCS_SS_CNT_ENABLE;
		2222	rpcs \|= INTEL_INFO(dev)->subslice_per_slice <<
		2223	GEN8_RPCS_SS_CNT_SHIFT;
		2224	rpcs \|= GEN8_RPCS_ENABLE;
		2225	}
5354	serge	2226
6084	serge	2227	if (INTEL_INFO(dev)->has_eu_pg) {
		2228	rpcs \|= INTEL_INFO(dev)->eu_per_subslice <<
		2229	GEN8_RPCS_EU_MIN_SHIFT;
		2230	rpcs \|= INTEL_INFO(dev)->eu_per_subslice <<
		2231	GEN8_RPCS_EU_MAX_SHIFT;
		2232	rpcs \|= GEN8_RPCS_ENABLE;
		2233	}
		2234
		2235	return rpcs;
5354	serge	2236	}
		2237
		2238	static int
		2239	populate_lr_context(struct intel_context ctx, struct drm_i915_gem_object ctx_obj,
		2240	struct intel_engine_cs ring, struct intel_ringbuffer ringbuf)
		2241	{
		2242	struct drm_device *dev = ring->dev;
		2243	struct drm_i915_private *dev_priv = dev->dev_private;
		2244	struct i915_hw_ppgtt *ppgtt = ctx->ppgtt;
		2245	struct page *page;
		2246	uint32_t *reg_state;
		2247	int ret;
		2248
		2249	if (!ppgtt)
		2250	ppgtt = dev_priv->mm.aliasing_ppgtt;
		2251
		2252	ret = i915_gem_object_set_to_cpu_domain(ctx_obj, true);
		2253	if (ret) {
		2254	DRM_DEBUG_DRIVER("Could not set to CPU domain\n");
		2255	return ret;
		2256	}
		2257
		2258	ret = i915_gem_object_get_pages(ctx_obj);
		2259	if (ret) {
		2260	DRM_DEBUG_DRIVER("Could not get object pages\n");
		2261	return ret;
		2262	}
		2263
		2264	i915_gem_object_pin_pages(ctx_obj);
		2265
		2266	/* The second page of the context object contains some fields which must
		2267	* be set up prior to the first execution. */
6937	serge	2268	page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN);
5354	serge	2269	reg_state = kmap_atomic(page);
		2270
		2271	/* A context is actually a big batch buffer with several MI_LOAD_REGISTER_IMM
		2272	* commands followed by (reg, value) pairs. The values we are setting here are
		2273	* only for the first context restore: on a subsequent save, the GPU will
		2274	* recreate this batchbuffer with new values (including all the missing
		2275	* MI_LOAD_REGISTER_IMM commands that we are not initializing here). */
6937	serge	2276	reg_state[CTX_LRI_HEADER_0] =
		2277	MI_LOAD_REGISTER_IMM(ring->id == RCS ? 14 : 11) \| MI_LRI_FORCE_POSTED;
		2278	ASSIGN_CTX_REG(reg_state, CTX_CONTEXT_CONTROL, RING_CONTEXT_CONTROL(ring),
6084	serge	2279	_MASKED_BIT_ENABLE(CTX_CTRL_INHIBIT_SYN_CTX_SWITCH \|
		2280	CTX_CTRL_ENGINE_CTX_RESTORE_INHIBIT \|
6937	serge	2281	CTX_CTRL_RS_CTX_ENABLE));
		2282	ASSIGN_CTX_REG(reg_state, CTX_RING_HEAD, RING_HEAD(ring->mmio_base), 0);
		2283	ASSIGN_CTX_REG(reg_state, CTX_RING_TAIL, RING_TAIL(ring->mmio_base), 0);
5354	serge	2284	/* Ring buffer start address is not known until the buffer is pinned.
		2285	* It is written to the context image in execlists_update_context()
		2286	*/
6937	serge	2287	ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_START, RING_START(ring->mmio_base), 0);
		2288	ASSIGN_CTX_REG(reg_state, CTX_RING_BUFFER_CONTROL, RING_CTL(ring->mmio_base),
		2289	((ringbuf->size - PAGE_SIZE) & RING_NR_PAGES) \| RING_VALID);
		2290	ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_U, RING_BBADDR_UDW(ring->mmio_base), 0);
		2291	ASSIGN_CTX_REG(reg_state, CTX_BB_HEAD_L, RING_BBADDR(ring->mmio_base), 0);
		2292	ASSIGN_CTX_REG(reg_state, CTX_BB_STATE, RING_BBSTATE(ring->mmio_base),
		2293	RING_BB_PPGTT);
		2294	ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_HEAD_U, RING_SBBADDR_UDW(ring->mmio_base), 0);
		2295	ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_HEAD_L, RING_SBBADDR(ring->mmio_base), 0);
		2296	ASSIGN_CTX_REG(reg_state, CTX_SECOND_BB_STATE, RING_SBBSTATE(ring->mmio_base), 0);
5354	serge	2297	if (ring->id == RCS) {
6937	serge	2298	ASSIGN_CTX_REG(reg_state, CTX_BB_PER_CTX_PTR, RING_BB_PER_CTX_PTR(ring->mmio_base), 0);
		2299	ASSIGN_CTX_REG(reg_state, CTX_RCS_INDIRECT_CTX, RING_INDIRECT_CTX(ring->mmio_base), 0);
		2300	ASSIGN_CTX_REG(reg_state, CTX_RCS_INDIRECT_CTX_OFFSET, RING_INDIRECT_CTX_OFFSET(ring->mmio_base), 0);
6084	serge	2301	if (ring->wa_ctx.obj) {
		2302	struct i915_ctx_workarounds *wa_ctx = &ring->wa_ctx;
		2303	uint32_t ggtt_offset = i915_gem_obj_ggtt_offset(wa_ctx->obj);
		2304
		2305	reg_state[CTX_RCS_INDIRECT_CTX+1] =
		2306	(ggtt_offset + wa_ctx->indirect_ctx.offset * sizeof(uint32_t)) \|
		2307	(wa_ctx->indirect_ctx.size / CACHELINE_DWORDS);
		2308
		2309	reg_state[CTX_RCS_INDIRECT_CTX_OFFSET+1] =
		2310	CTX_RCS_INDIRECT_CTX_OFFSET_DEFAULT << 6;
		2311
		2312	reg_state[CTX_BB_PER_CTX_PTR+1] =
		2313	(ggtt_offset + wa_ctx->per_ctx.offset * sizeof(uint32_t)) \|
		2314	0x01;
		2315	}
5354	serge	2316	}
6937	serge	2317	reg_state[CTX_LRI_HEADER_1] = MI_LOAD_REGISTER_IMM(9) \| MI_LRI_FORCE_POSTED;
		2318	ASSIGN_CTX_REG(reg_state, CTX_CTX_TIMESTAMP, RING_CTX_TIMESTAMP(ring->mmio_base), 0);
		2319	/* PDP values well be assigned later if needed */
		2320	ASSIGN_CTX_REG(reg_state, CTX_PDP3_UDW, GEN8_RING_PDP_UDW(ring, 3), 0);
		2321	ASSIGN_CTX_REG(reg_state, CTX_PDP3_LDW, GEN8_RING_PDP_LDW(ring, 3), 0);
		2322	ASSIGN_CTX_REG(reg_state, CTX_PDP2_UDW, GEN8_RING_PDP_UDW(ring, 2), 0);
		2323	ASSIGN_CTX_REG(reg_state, CTX_PDP2_LDW, GEN8_RING_PDP_LDW(ring, 2), 0);
		2324	ASSIGN_CTX_REG(reg_state, CTX_PDP1_UDW, GEN8_RING_PDP_UDW(ring, 1), 0);
		2325	ASSIGN_CTX_REG(reg_state, CTX_PDP1_LDW, GEN8_RING_PDP_LDW(ring, 1), 0);
		2326	ASSIGN_CTX_REG(reg_state, CTX_PDP0_UDW, GEN8_RING_PDP_UDW(ring, 0), 0);
		2327	ASSIGN_CTX_REG(reg_state, CTX_PDP0_LDW, GEN8_RING_PDP_LDW(ring, 0), 0);
6084	serge	2328
		2329	if (USES_FULL_48BIT_PPGTT(ppgtt->base.dev)) {
		2330	/* 64b PPGTT (48bit canonical)
		2331	* PDP0_DESCRIPTOR contains the base address to PML4 and
		2332	* other PDP Descriptors are ignored.
		2333	*/
		2334	ASSIGN_CTX_PML4(ppgtt, reg_state);
		2335	} else {
		2336	/* 32b PPGTT
		2337	* PDP*_DESCRIPTOR contains the base address of space supported.
		2338	* With dynamic page allocation, PDPs may not be allocated at
		2339	* this point. Point the unallocated PDPs to the scratch page
		2340	*/
		2341	ASSIGN_CTX_PDP(ppgtt, reg_state, 3);
		2342	ASSIGN_CTX_PDP(ppgtt, reg_state, 2);
		2343	ASSIGN_CTX_PDP(ppgtt, reg_state, 1);
		2344	ASSIGN_CTX_PDP(ppgtt, reg_state, 0);
		2345	}
		2346
5354	serge	2347	if (ring->id == RCS) {
		2348	reg_state[CTX_LRI_HEADER_2] = MI_LOAD_REGISTER_IMM(1);
6937	serge	2349	ASSIGN_CTX_REG(reg_state, CTX_R_PWR_CLK_STATE, GEN8_R_PWR_CLK_STATE,
		2350	make_rpcs(dev));
5354	serge	2351	}
		2352
		2353	kunmap_atomic(reg_state);
6088	serge	2354	i915_gem_object_unpin_pages(ctx_obj);
5354	serge	2355
		2356	return 0;
		2357	}
		2358
		2359	/**
		2360	* intel_lr_context_free() - free the LRC specific bits of a context
		2361	* @ctx: the LR context to free.
		2362	*
		2363	* The real context freeing is done in i915_gem_context_free: this only
		2364	* takes care of the bits that are LRC related: the per-engine backing
		2365	* objects and the logical ringbuffer.
		2366	*/
		2367	void intel_lr_context_free(struct intel_context *ctx)
		2368	{
		2369	int i;
		2370
		2371	for (i = 0; i < I915_NUM_RINGS; i++) {
		2372	struct drm_i915_gem_object *ctx_obj = ctx->engine[i].state;
		2373
		2374	if (ctx_obj) {
		2375	struct intel_ringbuffer *ringbuf =
		2376	ctx->engine[i].ringbuf;
		2377	struct intel_engine_cs *ring = ringbuf->ring;
		2378
		2379	if (ctx == ring->default_context) {
		2380	intel_unpin_ringbuffer_obj(ringbuf);
		2381	i915_gem_object_ggtt_unpin(ctx_obj);
		2382	}
6084	serge	2383	WARN_ON(ctx->engine[ring->id].pin_count);
		2384	intel_ringbuffer_free(ringbuf);
5354	serge	2385	drm_gem_object_unreference(&ctx_obj->base);
		2386	}
		2387	}
		2388	}
		2389
		2390	static uint32_t get_lr_context_size(struct intel_engine_cs *ring)
		2391	{
		2392	int ret = 0;
		2393
		2394	WARN_ON(INTEL_INFO(ring->dev)->gen < 8);
		2395
		2396	switch (ring->id) {
		2397	case RCS:
		2398	if (INTEL_INFO(ring->dev)->gen >= 9)
		2399	ret = GEN9_LR_CONTEXT_RENDER_SIZE;
		2400	else
		2401	ret = GEN8_LR_CONTEXT_RENDER_SIZE;
		2402	break;
		2403	case VCS:
		2404	case BCS:
		2405	case VECS:
		2406	case VCS2:
		2407	ret = GEN8_LR_CONTEXT_OTHER_SIZE;
		2408	break;
		2409	}
		2410
		2411	return ret;
		2412	}
		2413
		2414	static void lrc_setup_hardware_status_page(struct intel_engine_cs *ring,
		2415	struct drm_i915_gem_object *default_ctx_obj)
		2416	{
		2417	struct drm_i915_private *dev_priv = ring->dev->dev_private;
6084	serge	2418	struct page *page;
5354	serge	2419
6084	serge	2420	/* The HWSP is part of the default context object in LRC mode. */
		2421	ring->status_page.gfx_addr = i915_gem_obj_ggtt_offset(default_ctx_obj)
		2422	+ LRC_PPHWSP_PN * PAGE_SIZE;
		2423	page = i915_gem_object_get_page(default_ctx_obj, LRC_PPHWSP_PN);
		2424	ring->status_page.page_addr = kmap(page);
5354	serge	2425	ring->status_page.obj = default_ctx_obj;
		2426
		2427	I915_WRITE(RING_HWS_PGA(ring->mmio_base),
		2428	(u32)ring->status_page.gfx_addr);
		2429	POSTING_READ(RING_HWS_PGA(ring->mmio_base));
		2430	}
		2431
		2432	/**
6084	serge	2433	* intel_lr_context_deferred_alloc() - create the LRC specific bits of a context
5354	serge	2434	* @ctx: LR context to create.
		2435	* @ring: engine to be used with the context.
		2436	*
		2437	* This function can be called more than once, with different engines, if we plan
		2438	* to use the context with them. The context backing objects and the ringbuffers
		2439	* (specially the ringbuffer backing objects) suck a lot of memory up, and that's why
		2440	* the creation is a deferred call: it's better to make sure first that we need to use
		2441	* a given ring with the context.
		2442	*
		2443	* Return: non-zero on error.
		2444	*/
6084	serge	2445
		2446	int intel_lr_context_deferred_alloc(struct intel_context *ctx,
5354	serge	2447	struct intel_engine_cs *ring)
		2448	{
		2449	struct drm_device *dev = ring->dev;
		2450	struct drm_i915_gem_object *ctx_obj;
		2451	uint32_t context_size;
		2452	struct intel_ringbuffer *ringbuf;
		2453	int ret;
		2454
		2455	WARN_ON(ctx->legacy_hw_ctx.rcs_state != NULL);
6084	serge	2456	WARN_ON(ctx->engine[ring->id].state);
5354	serge	2457
		2458	context_size = round_up(get_lr_context_size(ring), 4096);
		2459
6084	serge	2460	/* One extra page as the sharing data between driver and GuC */
		2461	context_size += PAGE_SIZE * LRC_PPHWSP_PN;
5354	serge	2462
6084	serge	2463	ctx_obj = i915_gem_alloc_object(dev, context_size);
		2464	if (!ctx_obj) {
		2465	DRM_DEBUG_DRIVER("Alloc LRC backing obj failed.\n");
		2466	return -ENOMEM;
5354	serge	2467	}
		2468
6084	serge	2469	ringbuf = intel_engine_create_ringbuffer(ring, 4 * PAGE_SIZE);
		2470	if (IS_ERR(ringbuf)) {
		2471	ret = PTR_ERR(ringbuf);
		2472	goto error_deref_obj;
5354	serge	2473	}
		2474
		2475	ret = populate_lr_context(ctx, ctx_obj, ring, ringbuf);
		2476	if (ret) {
		2477	DRM_DEBUG_DRIVER("Failed to populate LRC: %d\n", ret);
6084	serge	2478	goto error_ringbuf;
5354	serge	2479	}
		2480
		2481	ctx->engine[ring->id].ringbuf = ringbuf;
		2482	ctx->engine[ring->id].state = ctx_obj;
		2483
6084	serge	2484	if (ctx != ring->default_context && ring->init_context) {
		2485	struct drm_i915_gem_request *req;
5354	serge	2486
6084	serge	2487	ret = i915_gem_request_alloc(ring,
		2488	ctx, &req);
		2489	if (ret) {
		2490	DRM_ERROR("ring create req: %d\n",
		2491	ret);
		2492	goto error_ringbuf;
5354	serge	2493	}
		2494
6084	serge	2495	ret = ring->init_context(req);
5354	serge	2496	if (ret) {
6084	serge	2497	DRM_ERROR("ring init context: %d\n",
		2498	ret);
		2499	i915_gem_request_cancel(req);
		2500	goto error_ringbuf;
5354	serge	2501	}
6084	serge	2502	i915_add_request_no_flush(req);
5354	serge	2503	}
		2504	return 0;
		2505
6084	serge	2506	error_ringbuf:
		2507	intel_ringbuffer_free(ringbuf);
		2508	error_deref_obj:
5354	serge	2509	drm_gem_object_unreference(&ctx_obj->base);
6084	serge	2510	ctx->engine[ring->id].ringbuf = NULL;
		2511	ctx->engine[ring->id].state = NULL;
5354	serge	2512	return ret;
		2513	}
6084	serge	2514
		2515	void intel_lr_context_reset(struct drm_device *dev,
		2516	struct intel_context *ctx)
		2517	{
		2518	struct drm_i915_private *dev_priv = dev->dev_private;
		2519	struct intel_engine_cs *ring;
		2520	int i;
		2521
		2522	for_each_ring(ring, dev_priv, i) {
		2523	struct drm_i915_gem_object *ctx_obj =
		2524	ctx->engine[ring->id].state;
		2525	struct intel_ringbuffer *ringbuf =
		2526	ctx->engine[ring->id].ringbuf;
		2527	uint32_t *reg_state;
		2528	struct page *page;
		2529
		2530	if (!ctx_obj)
		2531	continue;
		2532
		2533	if (i915_gem_object_get_pages(ctx_obj)) {
		2534	WARN(1, "Failed get_pages for context obj\n");
		2535	continue;
		2536	}
6937	serge	2537	page = i915_gem_object_get_dirty_page(ctx_obj, LRC_STATE_PN);
6084	serge	2538	reg_state = kmap_atomic(page);
		2539
		2540	reg_state[CTX_RING_HEAD+1] = 0;
		2541	reg_state[CTX_RING_TAIL+1] = 0;
		2542
		2543	kunmap_atomic(reg_state);
		2544
		2545	ringbuf->head = 0;
		2546	ringbuf->tail = 0;
		2547	}
		2548	}

Subversion Repositories Kolibri OS

(root)/drivers/video/drm/i915/intel_lrc.c – Rev 6937